# Transformer Model with Metadata
## Notebook Setup

In [1]:
# Optional one-time setup: uncomment to install the packages listed in requirements.txt.
# `%pip` (rather than `!pip`) installs into the environment of the running kernel.
# %pip install -r requirements.txt

## Import of Libraries

In [1]:
import pathlib
import numpy as np
import pandas as pd
import tensorflow as tf
import autokeras as ak

## Project Constants and Paths
### Constants

In [2]:
# Seed shared by the dataset shuffle and the AutoKeras search.
RANDOM_STATE = 43

# Names of the input CSV and the output folder; the Paths cell joins them
# onto the notebook's working directory.
INPUT_FILE_NAME = "emails_mit_Metadaten.csv"
OUTPUT_FOLDER_NAME = "with meta"

# NOTE(review): not referenced by any other cell in this notebook - confirm it is still needed.
BASE_MODEL_NAME = "distilbert-base-uncased"

### Paths

In [3]:
# All paths are derived from the current working directory, so the notebook
# carries no absolute, machine-specific location.
NOTEBOOK_ROOT_PATH = pathlib.Path.cwd()
INPUT_FILE_PATH = NOTEBOOK_ROOT_PATH / INPUT_FILE_NAME
OUTPUT_FOLDER_PATH = NOTEBOOK_ROOT_PATH / OUTPUT_FOLDER_NAME

# Create the output folder up front so later cells can write into it directly.
OUTPUT_FOLDER_PATH.mkdir(parents=True, exist_ok=True)

INPUT_FILE_PATH

WindowsPath('D:/NLP-22-23/Transformer/emails_mit_Metadaten.csv')

## Import of File to Dataframe (Pandas)

In [5]:
# Load through INPUT_FILE_PATH so the read uses the path built in the Paths
# cell instead of a bare file name.
# "Unnamed: 0" is presumably an index column that was saved with the CSV; it
# carries no information for the model and is dropped.
dataframe = pd.read_csv(INPUT_FILE_PATH).drop(columns="Unnamed: 0")
dataframe.head()

Unnamed: 0,metadata,content,spam
0,From 12a1mailbot1@web.de Thu Aug 22 13:17:22 ...,"<!DOCTYPE HTML PUBLIC ""-//W3C//DTD HTML 4.0 Tr...",False
1,From ilug-admin@linux.ie Thu Aug 22 13:27:39 ...,1) Fight The Risk of Cancer!\r\nhttp://www.adc...,False
2,From sabrina@mx3.1premio.com Thu Aug 22 14:44...,1) Fight The Risk of Cancer!\r\nhttp://www.adc...,False
3,From wsup@playful.com Thu Aug 22 16:17:00 200...,##############################################...,False
4,From social-admin@linux.ie Thu Aug 22 16:37:3...,I thought you might like these:\r\n1) Slim Dow...,False


### Encode the Class as an Integer

In [6]:
def encode_spam(boolean: bool) -> int:
    """Map a spam flag to its integer class label.

    Args:
        boolean: Spam flag of a single e-mail; any truthy value counts as spam.

    Returns:
        1 if ``boolean`` is truthy, otherwise 0.
    """
    return 1 if boolean else 0


# Add the integer class column. The encoder is passed directly; wrapping it in
# a lambda only added an extra call per row.
dataframe["label"] = dataframe["spam"].apply(encode_spam)
dataframe.head()

Unnamed: 0,metadata,content,spam,label
0,From 12a1mailbot1@web.de Thu Aug 22 13:17:22 ...,"<!DOCTYPE HTML PUBLIC ""-//W3C//DTD HTML 4.0 Tr...",False,0
1,From ilug-admin@linux.ie Thu Aug 22 13:27:39 ...,1) Fight The Risk of Cancer!\r\nhttp://www.adc...,False,0
2,From sabrina@mx3.1premio.com Thu Aug 22 14:44...,1) Fight The Risk of Cancer!\r\nhttp://www.adc...,False,0
3,From wsup@playful.com Thu Aug 22 16:17:00 200...,##############################################...,False,0
4,From social-admin@linux.ie Thu Aug 22 16:37:3...,I thought you might like these:\r\n1) Slim Dow...,False,0


# Definition of the AutoKeras Model
## Input Objects

In [7]:
# Two separate text input nodes: one fed with the `metadata` column and one
# with the `content` column (see the fit cell).
meta_txt_input, content_txt_input = ak.TextInput(), ak.TextInput()

## Processing Block

In [8]:
# Each input node gets its own transformer-type TextBlock; the two branches
# stay independent until they are merged in a later cell.
meta_txt_output, content_txt_output = (
    ak.TextBlock(block_type="transformer")(input_node)
    for input_node in (meta_txt_input, content_txt_input)
)

## Metrics

In [9]:
# Metrics attached to the classification head. Keep this order: the evaluation
# cell reads the results by position (accuracy, precision, recall after loss).
model_metrics = [
    metric_cls()
    for metric_cls in (
        tf.keras.metrics.BinaryAccuracy,
        tf.keras.metrics.Precision,
        tf.keras.metrics.Recall,
    )
]

## Merge the Output of the Text Blocks

In [10]:
# Join the metadata and content branches into a single node ...
merged_output = ak.Merge()([meta_txt_output, content_txt_output])

# ... and put a two-class classification head with the chosen metrics on top.
head_block = ak.ClassificationHead(num_classes=2, metrics=model_metrics)
classification_head = head_block(merged_output)

## Combination of the Model

In [11]:
# Assemble the searchable model from the two text inputs and the single head.
model = ak.AutoModel(
    inputs=[meta_txt_input, content_txt_input],
    outputs=[classification_head],
    # Trials are ranked by accuracy on the validation split.
    objective="val_binary_accuracy",
    # Upper bound on the number of candidate models the search tries.
    max_trials=10,
    # Start a fresh search instead of resuming an existing project of the same name.
    overwrite=True,
    # Fixed seed so the search is repeatable.
    seed=RANDOM_STATE,
)

## Callbacks

In [12]:
# Callbacks passed to model.fit().
model_callbacks = [
    # Append the per-epoch metrics to a CSV log file.
    tf.keras.callbacks.CSVLogger("meta.log", separator=",", append=True),
    # Stop early on the validation metric, i.e. the same quantity the search
    # objective uses. Watching the training "binary_accuracy" would not react
    # to overfitting. mode="max" states explicitly that higher is better.
    tf.keras.callbacks.EarlyStopping(
        monitor="val_binary_accuracy", mode="max", patience=3
    ),
    # Write TensorBoard event files.
    tf.keras.callbacks.TensorBoard(log_dir="./logs-meta"),
    # No explicit History callback: Keras applies one to every fit() itself,
    # and fit() returns it.
]

# Preparation of the Dataset

In [13]:
# Shuffle all rows once with the fixed seed, then cut positionally: the first
# TRAIN_FRACTION of the shuffled rows is the training set, the rest the test set.
# .iloc slicing replaces np.split on a DataFrame, which goes through
# DataFrame.swapaxes and emits a deprecation warning on recent pandas releases.
TRAIN_FRACTION = 0.8
shuffled_df = dataframe.sample(frac=1, random_state=RANDOM_STATE)
split_index = int(TRAIN_FRACTION * len(shuffled_df))
train_df = shuffled_df.iloc[:split_index]
test_df = shuffled_df.iloc[split_index:]

# Training of the Model

In [14]:
# Run the architecture search and training.
# Inputs are given in the same order as the AutoModel inputs: metadata, content.
# The target is the integer `label` column produced by encode_spam; the raw
# boolean `spam` column was passed before, which left the encoded column unused.
history = model.fit(
    [np.array(train_df.metadata), np.array(train_df.content)],
    [train_df.label],
    # Split the training data and use the last 15% as validation data.
    validation_split=0.15,
    epochs=5,
    callbacks=model_callbacks,
)

Trial 10 Complete [00h 00m 17s]
val_binary_accuracy: 1.0

Best val_binary_accuracy So Far: 1.0
Total elapsed time: 00h 02m 50s
INFO:tensorflow:Oracle triggered exit
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
INFO:tensorflow:Assets written to: .\auto_model\best_model\assets


## Save the best Model

In [15]:
# export_model() is the public AutoKeras call that returns the best Keras model
# found by the search; it replaces reaching into the internal `tuner` attribute.
best_model = model.export_model()

# Persist the model inside the output folder.
best_model.save(OUTPUT_FOLDER_PATH / "transformer-metadata.tf")

INFO:tensorflow:Assets written to: D:\NLP-22-23\Transformer\with meta\transformer-metadata.tf\assets


## Plot the Structure of the Best Model

In [6]:
# Draw the layer graph of the best model (with tensor shapes) into the output
# folder. plot_model requires pydot and graphviz to be installed.
dot_img_file = OUTPUT_FOLDER_PATH.joinpath("model.png")
tf.keras.utils.plot_model(best_model, show_shapes=True, to_file=dot_img_file)

You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model to work.


# Evaluation

In [4]:
# Reload the model from disk so the evaluation runs on the saved artefact.
# ak.CUSTOM_OBJECTS lets Keras resolve the AutoKeras-specific layers, as the
# AutoKeras export documentation prescribes for load_model.
best_model = tf.keras.models.load_model(
    OUTPUT_FOLDER_PATH / "transformer-metadata.tf",
    custom_objects=ak.CUSTOM_OBJECTS,
)

In [17]:
# Score the held-out split. The target is the integer `label` column (the
# encoded class) rather than the raw boolean `spam` column.
# `results` is read by position in the next cell: loss first, then the metrics.
# NOTE(review): the recorded run reports 1.0 for every metric on the test split;
# verify that the metadata text does not leak the class before trusting it.
results = best_model.evaluate(
    [np.array(test_df.metadata), np.array(test_df.content)],
    test_df.label,
    verbose=0,
)

In [18]:
# Unpack in the order evaluate() returned the values: loss, then the three
# metrics in the order they were attached to the classification head.
loss, accuracy, precision, recall = results[:4]
print(f'''
--- Model Performance ---
loss = {loss}
accuracy = {accuracy}
precision = {precision}
recall = {recall}
''')


--- Model Performance ---
loss = 0.0006802839343436062
accuracy = 1.0
precision = 1.0
recall = 1.0

