# Setup

# Transformer Model with Metadata
## Notebook Setup

In [1]:
# !pip install -r requirements.txt

## Import of Libraries

In [1]:
import pathlib
import numpy as np
import pandas as pd
import tensorflow as tf
import autokeras as ak
import transformers as ts

## Project Constants and Paths
### Constants

In [2]:
# File and folder names; the paths cell resolves them against the notebook directory.
INPUT_FILE_NAME: str = "emails_mit_Metadaten.csv"
OUTPUT_FOLDER_NAME: str = "output_model"

# NOTE(review): not referenced by any cell visible in this notebook - confirm it is still needed.
BASE_MODEL_NAME: str = "distilbert-base-uncased"

# Seed used when shuffling the dataframe before the train/test split.
RANDOM_STATE: int = 43

### Paths

In [3]:
# Every project path is anchored at the kernel's current working directory.
NOTEBOOK_ROOT_PATH = pathlib.Path.cwd()

# Location of the input CSV and of the folder the trained model is saved to.
INPUT_FILE_PATH = NOTEBOOK_ROOT_PATH / INPUT_FILE_NAME
OUTPUT_FOLDER_PATH = NOTEBOOK_ROOT_PATH / OUTPUT_FOLDER_NAME

## Import of File to Dataframe (Pandas)

In [4]:
# Read through the resolved INPUT_FILE_PATH (defined in the paths cell) rather than
# the bare file name, so the load does not silently depend on a relative lookup.
dataframe = pd.read_csv(INPUT_FILE_PATH)

# "Unnamed: 0" is the header-less first CSV column - presumably a row index written
# when the file was exported (TODO confirm); it carries no information for the model.
dataframe = dataframe.drop(columns="Unnamed: 0")
dataframe.head()

Unnamed: 0,metadata,content,spam
0,From 12a1mailbot1@web.de Thu Aug 22 13:17:22 ...,"<!DOCTYPE HTML PUBLIC ""-//W3C//DTD HTML 4.0 Tr...",False
1,From ilug-admin@linux.ie Thu Aug 22 13:27:39 ...,1) Fight The Risk of Cancer!\nhttp://www.adcli...,False
2,From sabrina@mx3.1premio.com Thu Aug 22 14:44...,1) Fight The Risk of Cancer!\nhttp://www.adcli...,False
3,From wsup@playful.com Thu Aug 22 16:17:00 200...,##############################################...,False
4,From social-admin@linux.ie Thu Aug 22 16:37:3...,I thought you might like these:\n1) Slim Down ...,False


### Encode the Class as an Integer

In [5]:
def encode_spam(boolean: bool) -> int:
    """Map a spam flag to its integer class label.

    Args:
        boolean: The flag to encode; it is evaluated for truthiness.

    Returns:
        1 if ``boolean`` is truthy, otherwise 0.
    """
    return int(bool(boolean))


# Store the integer class next to the original `spam` flag (1 = truthy, 0 = falsy).
dataframe["label"] = dataframe["spam"].apply(encode_spam)
dataframe.head()

Unnamed: 0,metadata,content,spam,label
0,From 12a1mailbot1@web.de Thu Aug 22 13:17:22 ...,"<!DOCTYPE HTML PUBLIC ""-//W3C//DTD HTML 4.0 Tr...",False,0
1,From ilug-admin@linux.ie Thu Aug 22 13:27:39 ...,1) Fight The Risk of Cancer!\nhttp://www.adcli...,False,0
2,From sabrina@mx3.1premio.com Thu Aug 22 14:44...,1) Fight The Risk of Cancer!\nhttp://www.adcli...,False,0
3,From wsup@playful.com Thu Aug 22 16:17:00 200...,##############################################...,False,0
4,From social-admin@linux.ie Thu Aug 22 16:37:3...,I thought you might like these:\n1) Slim Down ...,False,0


# Transformer Model - Generation

# Definition of the AutoKeras Model
## Input Objects

In [7]:
# One independent text input node per e-mail field: metadata first, then content.
meta_txt_input, content_txt_input = (ak.TextInput() for _ in range(2))

## Processing Block

In [8]:
# Each input gets its own transformer-type text block; the blocks are built and
# applied in the same order as before (metadata first, then content).
meta_txt_output, content_txt_output = (
    ak.TextBlock(block_type="transformer")(text_input)
    for text_input in (meta_txt_input, content_txt_input)
)

## Merge the Output of the Text Blocks

In [9]:
# Join the outputs of the metadata and content text blocks into one node.
merged_output = ak.Merge()([meta_txt_output, content_txt_output])
# Attach the classification head to the merged node. No arguments are passed,
# so the head is configured entirely by the library defaults.
classification_head = ak.ClassificationHead()(merged_output)

## Combination of the Model

In [10]:
# Assemble the searchable model: two text inputs -> one classification output.
model = ak.AutoModel(
    inputs=[meta_txt_input, content_txt_input],
    outputs=[classification_head],
    # Start a fresh search instead of reloading an earlier project of the same name.
    overwrite=True,
    # Upper bound on the number of candidate models the search tries.
    max_trials=10,
    # Seed the search so repeated runs of the notebook are comparable.
    seed=RANDOM_STATE,
)

# Preparation of the Dataset

In [6]:
# Shuffle all rows once with a fixed seed, then cut positionally:
# the first 80% of the shuffled rows are for training, the remainder for testing.
# Plain `iloc` slices replace `np.split`, which for DataFrames relies on the
# deprecated `DataFrame.swapaxes` and warns on recent pandas versions.
shuffled_df = dataframe.sample(frac=1, random_state=RANDOM_STATE)
train_size = int(0.8 * len(shuffled_df))
train_df = shuffled_df.iloc[:train_size]
test_df = shuffled_df.iloc[train_size:]

# Training of the Model

In [None]:
model.fit(
    # One array per model input, in the order the inputs were given to AutoModel.
    [np.array(train_df.metadata), np.array(train_df.content)],
    # Train on the integer-encoded `label` column (created from `spam` above),
    # passed as a numpy array like the inputs.
    [np.array(train_df.label)],
    # Split the training data and use the last 15% as validation data.
    validation_split=0.15,
    epochs=5,
)

Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089

Search: Running Trial #1

Value             |Best Value So Far |Hyperparameter
5000              |?                 |text_block_1/max_tokens
64                |?                 |text_block_1/text_to_int_sequence_1/output_sequence_length
none              |?                 |text_block_1/transformer_1/pretraining
128               |?                 |text_block_1/transformer_1/embedding_dim
8                 |?                 |text_block_1/transformer_1/num_heads
2048              |?                 |text_block_1/transformer_1/dense_dim
0                 |?                 |text_block_1/transformer_1/dropout
flatten           |?                 |text_block_1/spatial_reduction_1/reduction_type
False             |?                 |text_block_1/dense_block_1/use_batchnorm
2               

KeyboardInterrupt: KeyboardInterrupt: interrupted by user

In [None]:
# Use AutoModel's public export API to obtain the best Keras model of the search
# instead of reaching into the tuner attribute.
best_model = model.export_model()
# Persist the exported model below the configured output folder.
best_model.save(OUTPUT_FOLDER_PATH / 'transformer-metadata.tf')

INFO:tensorflow:Assets written to: /data/notebook_files/output_model/model-2.tf/assets


INFO:tensorflow:Assets written to: /data/notebook_files/output_model/model-2.tf/assets


# Evaluation

In [7]:
# Reload the saved model from disk so evaluation also works in a fresh kernel.
# AutoKeras-exported models can contain AutoKeras-specific layers; passing
# ak.CUSTOM_OBJECTS lets Keras resolve them while deserializing.
best_model = tf.keras.models.load_model(
    OUTPUT_FOLDER_PATH / 'transformer-metadata.tf',
    custom_objects=ak.CUSTOM_OBJECTS,
)

In [8]:
# Score the reloaded model on the held-out rows, using the integer-encoded
# `label` column as the target.
# The displayed result is a list; presumably [loss, accuracy] - TODO confirm
# against best_model.metrics_names.
best_model.evaluate(
    [np.array(test_df.metadata), np.array(test_df.content)],
    np.array(test_df.label),
    verbose=0,
)

[9.686202247394249e-06, 1.0]