### Load and Preprocess Raw Data

In [79]:
import json 
import pandas as pd

files = ['legit','phish']

# Create an empty list to store the dictionaries
dict_list = []
    
for file in files:
    
    with open(f'{file}_preprocessed_json.json') as fp:
        data = json.load(fp)
    fp.close()

    # Iterate over each item in the JSON data
    for item in data:
        if item != {}:
            if "1" not in item["rejected-for"]:
                # Extract the subject and header values
                subject = item['header']['Subject']
                body = item['body']

                # Create a dictionary with subject and header
                dict_item = {'text': subject+" "+body, 'labels': file}
                
                # Append the dictionary to the list
                dict_list.append(dict_item)

# Create a DataFrame from the list of dictionaries
df = pd.DataFrame(dict_list)

In [80]:
df.labels.value_counts()

legit    3730
phish     496
Name: labels, dtype: int64

In [81]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from helper_prabowo_ml import clean_html, remove_links, non_ascii, lower, email_address, removeStopWords, punct, remove_
import re


df = df.reset_index()

# PREPROCESS THE DATA
def preproc(df, colname):
  df[colname] = df[colname].apply(func=clean_html)
  df[colname] = df[colname].apply(func=remove_links)
  df[colname] = df[colname].apply(func=non_ascii)
  df[colname] = df[colname].apply(func=lower)
  df[colname] = df[colname].apply(func=email_address)
  # df[colname] = df[colname].apply(func=removeStopWords)
  df[colname] = df[colname].apply(func=punct)
  df[colname] = df[colname].apply(func=remove_)
  return(df)

df_clean = preproc(df, 'text')
df_clean.drop('index', axis=1, inplace=True)
df_clean['num_words'] = df_clean['text'].apply(lambda x: len(x.split()))

# Convert 'labels' column to categorical data type
df_clean['labels'] = pd.Categorical(df_clean['labels'])

df_clean['labels'] = df_clean['labels'].cat.codes
encoded_dict = {'legit':0, 'phish':1} 

### Load Model and Tokenizer

In [83]:
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, TFBertModel 

df_train, df_test = train_test_split(df_clean, test_size=0.3, random_state=42,
                                     stratify=df_clean['labels'])

tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
bert = TFBertModel.from_pretrained('bert-base-cased')

max_len = 70

X_train = tokenizer(
    text=df_train['text'].tolist(),
    add_special_tokens=True,
    max_length=max_len,
    truncation=True,
    padding=True,
    return_tensors='tf',
    return_token_type_ids=False,
    return_attention_mask=True,
    verbose=True
)

X_test = tokenizer(
    text=df_test['text'].tolist(),
    add_special_tokens=True,
    max_length=max_len,
    truncation=True,
    padding=True,
    return_tensors='tf',
    return_token_type_ids=False,
    return_attention_mask=True,
    verbose=True
)

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Some layers from the model checkpoint at bert-base-cased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-cased.
If your task is similar to the task the model of the che

### Model Fitting

In [86]:

import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy,BinaryAccuracy
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Input, Dense

input_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_ids")
input_mask = Input(shape=(max_len,), dtype=tf.int32, name="attention_mask")
# embeddings = dbert_model(input_ids, attention_mask = input_mask)[0]

embeddings = bert(input_ids, attention_mask = input_mask)[0] # 0 = last hidden state, 1 = poller_output
out = tf.keras.layers.GlobalMaxPool1D()(embeddings)
out = Dense(128, activation='relu')(out)
out = tf.keras.layers.Dropout(0.1)(out)
out = Dense(32, activation='relu')(out)

y = Dense(1, activation='sigmoid')(out)


model = tf.keras.Model(inputs=[input_ids, input_mask], outputs=[y])
model.layers[2].trainable = True

optimizer = Adam(
    learning_rate=5e-05, # HF recommendation
    epsilon=1e-08,
    decay=0.01,
    clipnorm=1.0
)

loss = BinaryCrossentropy()
metric = BinaryAccuracy()


model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=metric
)

history = model.fit(
    x = {'input_ids':X_train['input_ids'], 'attention_mask':X_train['attention_mask']},
    y = df_train['labels'],
    validation_data = ({
        'input_ids':X_test['input_ids'], 'attention_mask':X_test['attention_mask']},
                       df_test['labels']),
    epochs=1,
    batch_size=32
)



In [94]:
from sklearn.metrics import classification_report

predicted = model.predict({'input_ids': X_test['input_ids'], 'attention_mask': X_test['attention_mask']})
y_predicted = np.argmax(predicted, axis=1)
print(classification_report(df_test['labels'], y_predicted,zero_division=0))

              precision    recall  f1-score   support

           0       0.88      1.00      0.94      1119
           1       0.00      0.00      0.00       149

    accuracy                           0.88      1268
   macro avg       0.44      0.50      0.47      1268
weighted avg       0.78      0.88      0.83      1268



In [91]:
df_test.labels.value_counts()

0    1119
1     149
Name: labels, dtype: int64

In [95]:
df

Unnamed: 0,text,labels,num_words
0,re i may have a meeting around 3pm i have to g...,0,25
1,r re r re r re r fw tax form dear mesfer thank...,0,1961
2,thank you just got your get well package thank...,0,32
3,r re r re r re r fw tax form dear mesfer thank...,0,1961
4,for rr2 trump invests in the same companies he...,0,776
...,...,...,...
4221,scotiabank important update dear customer scot...,1,243
4222,unauthorized access to your paypal account bod...,1,248
4223,paypal please restore your account dear paypal...,1,310
4224,billing issues dear valued ebay member we rece...,1,203
