# Setup

# Transformer Model with Metadata
## Notebook Setup

In [None]:
# !pip install -r requirements.txt

## Import of Libraries

In [None]:
import pathlib
import numpy as np
import pandas as pd
import tensorflow as tf
import autokeras as ak
import transformers as ts

## Project Constants and Paths
### Constants

In [None]:
# Seed passed to DataFrame.sample so the shuffle before the train/test split is reproducible.
RANDOM_STATE = 43
# Name of the CSV file that is loaded into the dataframe.
INPUT_FILE_NAME = "emails_mit_Metadaten.csv"
# NOTE(review): not referenced anywhere in the visible notebook - confirm whether it is still needed.
BASE_MODEL_NAME = "distilbert-base-uncased"
# Name of the folder (below the notebook's working directory) the trained model is saved to.
OUTPUT_FOLDER_NAME = "output_model"

### Paths

In [None]:
# All paths are anchored at the directory the notebook kernel was started in.
NOTEBOOK_ROOT_PATH = pathlib.Path.cwd()
# Absolute location of the input CSV file.
INPUT_FILE_PATH = NOTEBOOK_ROOT_PATH / INPUT_FILE_NAME
# Absolute location of the folder the exported model is written to.
OUTPUT_FOLDER_PATH = NOTEBOOK_ROOT_PATH / OUTPUT_FOLDER_NAME

## Import of File to Dataframe (Pandas)

In [None]:
# Read the data set through the resolved INPUT_FILE_PATH so the load does not
# depend on the working directory at the moment this cell is executed.
dataframe = pd.read_csv(INPUT_FILE_PATH)
# Remove the "Unnamed: 0" column (presumably a leftover index column written
# by an earlier CSV export - confirm against the data source).
dataframe = dataframe.drop(columns="Unnamed: 0")
dataframe.head()

### Encode the Class as an Integer

In [None]:
def encode_spam(boolean: bool) -> int:
    """Map a spam flag to an integer class label.

    Args:
        boolean: Spam flag; it is evaluated by truthiness.

    Returns:
        1 if ``boolean`` is truthy, otherwise 0.
    """
    return 1 if boolean else 0


# Store the integer-encoded spam flag in a new "label" column.
dataframe["label"] = dataframe["spam"].apply(encode_spam)
dataframe.head()

# Transformer Model Generation

# Definition of the AutoKeras Model
## Input Objects

In [None]:
# One text input node per text source: the first is fed with the "metadata"
# column, the second with the "content" column when the model is fitted.
meta_txt_input = ak.TextInput()
content_txt_input = ak.TextInput()

## Processing Block

In [None]:
# Each input is processed by its own TextBlock configured with
# block_type="transformer"; the two branches do not share a block instance.
meta_txt_output = ak.TextBlock(block_type="transformer")(meta_txt_input)
content_txt_output = ak.TextBlock(block_type="transformer")(content_txt_input)

## Merge the Output of the Text Blocks

In [None]:
# Combine the outputs of both text branches into a single node ...
merged_output = ak.Merge()([meta_txt_output, content_txt_output])
# ... and put a classification head on top of the merged representation.
classification_head = ak.ClassificationHead()(merged_output)

## Combination of the Model

In [None]:
# Assemble the search graph: two text inputs (metadata first, content second)
# feeding a single classification output.
model = ak.AutoModel(
    inputs=[meta_txt_input, content_txt_input],
    outputs=[classification_head],
    overwrite=True,  # per the AutoKeras docs: start fresh instead of reusing an existing project
    max_trials=10,  # per the AutoKeras docs: upper bound on candidate models tried
)

# Preparation of the Dataset

In [None]:
# Shuffle all rows reproducibly, then cut at the 80% mark: the first part is
# the training set, the remainder is the test set.
# Positional slicing with .iloc is used instead of np.split(), because
# np.split() on a DataFrame relies on DataFrame.swapaxes, which is deprecated
# in recent pandas versions.
shuffled_df = dataframe.sample(frac=1, random_state=RANDOM_STATE)
split_index = int(0.8 * len(dataframe))
train_df = shuffled_df.iloc[:split_index]
test_df = shuffled_df.iloc[split_index:]

# Training of the Model

In [None]:
# Inputs are passed in the same order as the AutoModel inputs:
# metadata first, content second.
train_inputs = [np.array(train_df.metadata), np.array(train_df.content)]
train_targets = [train_df.spam]

model.fit(
    train_inputs,
    train_targets,
    # Split the training data and use the last 15% as validation data.
    validation_split=0.15,
    epochs=5,
)

In [None]:
# Export the best Keras model found by the search through AutoKeras' public
# API instead of reaching into the tuner object.
best_model = model.export_model()
# Make sure the target folder exists before the model is written into it.
OUTPUT_FOLDER_PATH.mkdir(parents=True, exist_ok=True)
best_model.save(OUTPUT_FOLDER_PATH / 'transformer-metadata.tf')

# Evaluation

In [None]:
# Reload the saved model. AutoKeras models contain custom layers, so the
# AutoKeras custom objects are passed to Keras for deserialization.
best_model = tf.keras.models.load_model(
    OUTPUT_FOLDER_PATH / 'transformer-metadata.tf',
    custom_objects=ak.CUSTOM_OBJECTS,
)

In [None]:
# Score the reloaded model on the held-out rows; inputs are ordered
# metadata first, content second, matching the training call.
best_model.evaluate(
    x=[np.array(test_df.metadata), np.array(test_df.content)],
    y=test_df.spam,
    verbose=0,
)