## Imports

In [1]:
import pathlib

import numpy as np
import pandas as pd
import tensorflow as tf

## Check GPU

In [2]:
# Ask TensorFlow which physical GPU devices it can see.
tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [3]:
# Report whether this TensorFlow build has CUDA support and GPU support (shown as a tuple).
tf.test.is_built_with_cuda(), tf.test.is_built_with_gpu_support()

(True, True)

## Constants & Paths

In [20]:
# Seed passed to the row shuffle that precedes the train/val/test split.
RANDOM_STATE = 32
# Working directory at the moment this cell is executed.
NOTEBOOK_ROOT_PATH = pathlib.Path.cwd()
# CSV with the e-mail data; a relative path, so it resolves against the current working directory.
DATAFILE_PATH = "../emails_mit_Metadaten/emails_mit_Metadaten.csv"

## Loading of the Transformed Dataset

In [18]:
# Read the CSV and discard the "Unnamed: 0" column in one chained step
# (presumably a row index that was saved along with the data - confirm against the export code).
dataframe = pd.read_csv(DATAFILE_PATH).drop(columns="Unnamed: 0")
dataframe.head()

Unnamed: 0,metadata,content,spam
0,From 12a1mailbot1@web.de Thu Aug 22 13:17:22 ...,"<!DOCTYPE HTML PUBLIC ""-//W3C//DTD HTML 4.0 Tr...",False
1,From ilug-admin@linux.ie Thu Aug 22 13:27:39 ...,1) Fight The Risk of Cancer!\nhttp://www.adcli...,False
2,From sabrina@mx3.1premio.com Thu Aug 22 14:44...,1) Fight The Risk of Cancer!\nhttp://www.adcli...,False
3,From wsup@playful.com Thu Aug 22 16:17:00 200...,##############################################...,False
4,From social-admin@linux.ie Thu Aug 22 16:37:3...,I thought you might like these:\n1) Slim Down ...,False


## Splitting into Train, Validation and Test Sets

In [21]:
# Shuffle every row once with a fixed seed, then cut the result 60% / 20% / 20%
# into train / validation / test.
# Positional slicing is used instead of np.split: np.split sends a DataFrame through
# DataFrame.swapaxes, which pandas has deprecated (FutureWarning on pandas >= 2.1).
shuffled_df = dataframe.sample(frac=1, random_state=RANDOM_STATE)
train_end = int(0.6 * len(shuffled_df))
val_end = int(0.8 * len(shuffled_df))

train_df = shuffled_df.iloc[:train_end]
val_df = shuffled_df.iloc[train_end:val_end]
test_df = shuffled_df.iloc[val_end:]

# The three parts must cover every row exactly once.
assert len(train_df) + len(val_df) + len(test_df) == len(dataframe)

## Transform to tf.data Datasets

In [23]:
def df_to_dataset(df, shuffle=True, batch_size=32):
    """Turn a DataFrame into a batched ``tf.data.Dataset`` of (features, label) pairs.

    Args:
        df: DataFrame containing a ``'spam'`` column; that column becomes the label
            and every remaining column becomes an entry in the feature dict.
        shuffle: When true, shuffle with a buffer as large as the number of rows
            before batching.
        batch_size: Number of rows per batch.

    Returns:
        The batched dataset built from ``df``.
    """
    # Work on a copy so that pop() does not remove the label column from the caller's frame.
    features = df.copy()
    target = features.pop('spam')
    dataset = tf.data.Dataset.from_tensor_slices((dict(features), target))
    if not shuffle:
        return dataset.batch(batch_size)
    return dataset.shuffle(buffer_size=len(features)).batch(batch_size)

# One batch size shared by all three pipelines.
batch_size = 32

# Only the training split is shuffled; validation and test keep their row order.
train_ds = df_to_dataset(train_df, shuffle=True, batch_size=batch_size)
val_ds, test_ds = (
    df_to_dataset(split_df, shuffle=False, batch_size=batch_size)
    for split_df in (val_df, test_df)
)