### Load and Preprocess Raw Data

In [1]:
import json 
import pandas as pd

files = ['phish','legit']

# Create an empty list to store the dictionaries
dict_list = []
    
for file in files:
    
    with open(f'{file}_preprocessed_json.json') as fp:
        data = json.load(fp)
    fp.close()

    # Iterate over each item in the JSON data
    for item in data:
        if item != {}:
            if "1" not in item["rejected-for"]:
                # Extract the subject and header values
                subject = item['header']['Subject']
                body = item['body']

                # Create a dictionary with subject and header
                dict_item = {'text': subject+" "+body, 'labels': file}
                
                # Append the dictionary to the list
                dict_list.append(dict_item)

# Create a DataFrame from the list of dictionaries
df = pd.DataFrame(dict_list)

"""
Note:

Rejected-for:
0 - Not Rejected (default value)
1 - Missing Subject or Body
2 - Unacceptable Header Size
3 - Unacceptable Body Size
"""

'\nNote:\n\nRejected-for:\n0 - Not Rejected (default value)\n1 - Missing Subject or Body\n2 - Unacceptable Header Size\n3 - Unacceptable Body Size\n'

In [2]:
df.labels.value_counts()

legit    3730
phish     496
Name: labels, dtype: int64

In [3]:
df

Unnamed: 0,text,labels
0,"paypal flagged account dear paypal member, you...",phish
1,uh security alert uh security alert=2c =a0a dg...,phish
2,account review department online banking profi...,phish
3,re:websit hi there ! web site ... on -line- my...,phish
4,you have one new message at capital one. dear ...,phish
...,...,...
4221,democrats hold trump accountable across the co...,legit
4222,"daily political guidance -- wednesday, april 2...",legit
4223,re: organizationchair next tuesday? will get r...,legit
4224,re: dws arizona republic op-ed is now up i'll ...,legit


In [3]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from helper_prabowo_ml import clean_html, remove_links, non_ascii, lower, email_address, removeStopWords, punct, remove_
import re


df = df.reset_index()

# PREPROCESS THE DATA
def preproc(df, colname):
  df[colname] = df[colname].apply(func=clean_html)
  df[colname] = df[colname].apply(func=remove_links)
  df[colname] = df[colname].apply(func=non_ascii)
  df[colname] = df[colname].apply(func=lower)
  df[colname] = df[colname].apply(func=email_address)
  # df[colname] = df[colname].apply(func=removeStopWords)
  df[colname] = df[colname].apply(func=punct)
  df[colname] = df[colname].apply(func=remove_)
  return(df)

df_clean = preproc(df, 'text')
df_clean.drop('index', axis=1, inplace=True)
df_clean['num_words'] = df_clean['text'].apply(lambda x: len(x.split()))

# Convert 'labels' column to categorical data type
df_clean['labels'] = pd.Categorical(df_clean['labels'])

df_clean['labels'] = df_clean['labels'].cat.codes
encoded_dict = {'legit':0, 'phish':1} 

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\VENUS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [58]:
# df_clean['labels'] = df_clean['labels'].apply(lambda x: 1-x)
# df_clean

In [50]:
# import pandas as pd

# # Assuming your DataFrame is named 'df'
# random_samples = df_clean.groupby('labels').apply(lambda x: x.sample(n=450, random_state=42)).reset_index(drop=True)

In [40]:
# import pandas as pd

# # Assuming your DataFrame is named 'df' and the label you want to drop from is '0'
# label_to_drop = 1
# num_samples_to_drop = 400

# # Filter the DataFrame to select the label you want to drop from
# label_df = random_samples[random_samples['labels'] == label_to_drop]

# # Check if the number of samples to drop is greater than the available samples in the label_df
# if num_samples_to_drop >= len(label_df):
#     print(f"Cannot drop {num_samples_to_drop} samples as there are only {len(label_df)} samples in the '{label_to_drop}' label.")
# else:
#     # Randomly select the samples to drop
#     samples_to_drop = label_df.sample(n=num_samples_to_drop, random_state=42)

#     # Drop the selected samples from the original DataFrame
#     random_samples = random_samples.drop(samples_to_drop.index)

#     # Reset the indices of the DataFrame
#     random_samples = random_samples.reset_index(drop=True)


In [51]:
# random_samples.labels.value_counts()

0    450
1    450
Name: labels, dtype: int64

### Load Model and Tokenizer

In [4]:
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, TFBertModel 

df_train, df_test = train_test_split(df_clean, test_size=0.3, random_state=42,
                                     stratify=df_clean['labels'])

tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
bert = TFBertModel.from_pretrained('bert-base-cased')

max_len = 70

X_train = tokenizer(
    text=df_train['text'].tolist(),
    add_special_tokens=True,
    max_length=max_len,
    truncation=True,
    padding=True,
    return_tensors='tf',
    return_token_type_ids=False,
    return_attention_mask=True,
    verbose=True
)

X_test = tokenizer(
    text=df_test['text'].tolist(),
    add_special_tokens=True,
    max_length=max_len,
    truncation=True,
    padding=True,
    return_tensors='tf',
    return_token_type_ids=False,
    return_attention_mask=True,
    verbose=True
)

Some layers from the model checkpoint at bert-base-cased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


### Model Fitting

In [6]:
import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Input, Dense

input_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_ids")
input_mask = Input(shape=(max_len,), dtype=tf.int32, name="attention_mask")
# embeddings = dbert_model(input_ids, attention_mask = input_mask)[0]

embeddings = bert(input_ids, attention_mask = input_mask)[0] # 0 = last hidden state, 1 = poller_output
out = tf.keras.layers.GlobalMaxPool1D()(embeddings)
out = Dense(128, activation='relu')(out)
out = tf.keras.layers.Dropout(0.1)(out)
out = Dense(32, activation='relu')(out)

y = Dense(2, activation='softmax')(out)

model = tf.keras.Model(inputs=[input_ids, input_mask], outputs=y)
model.layers[2].trainable = True

optimizer = Adam(
    learning_rate=5e-05, # HF recommendation
    epsilon=1e-08,
    decay=0.01,
    clipnorm=1.0
)

loss = CategoricalCrossentropy(from_logits=True)
metric = CategoricalAccuracy('balanced_accuracy')

model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=metric
)

history = model.fit(
    x = {'input_ids':X_train['input_ids'], 'attention_mask':X_train['attention_mask']},
    y = to_categorical(df_train['labels']),
    validation_data = ({'input_ids':X_test['input_ids'], 'attention_mask':X_test['attention_mask']},
                        to_categorical(df_test['labels'])),
    epochs=1,
    batch_size=32
)

  output, from_logits = _get_logits(




In [10]:
# Saving the model
model.save("bert_mc.h5")

In [8]:
from transformers import TFBertModel
import tensorflow as tf

# Load the TFBertModel layer
bert = TFBertModel.from_pretrained("bert-base-uncased")

# Load the saved model
model = tf.keras.models.load_model("bert_mc.h5", custom_objects={"TFBertModel": bert})

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.




In [38]:
from sklearn.metrics import classification_report

predicted = model.predict({'input_ids': X_test['input_ids'], 'attention_mask': X_test['attention_mask']})
y_predicted = np.argmax(predicted, axis=1)
print(classification_report(df_test['labels'], y_predicted,zero_division=0))
encoded_dict

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      1119
           1       0.96      0.92      0.94       149

    accuracy                           0.99      1268
   macro avg       0.98      0.96      0.97      1268
weighted avg       0.99      0.99      0.99      1268



{'legit': 0, 'phish': 1}

In [12]:
from sklearn.metrics import classification_report
import textwrap
import random

# Select the index of the sample you want to predict
for _ in range(5):
    sample_index = random.randint(1, 1268)
    # Get the input tensors for the selected sample
    input_ids = X_test['input_ids'][sample_index]
    attention_mask = X_test['attention_mask'][sample_index]

    # Reshape the input tensors to match the expected shape
    input_ids = np.reshape(input_ids, (1, -1))
    attention_mask = np.reshape(attention_mask, (1, -1))

    # Predict the sample
    predicted = model.predict({'input_ids': input_ids, 'attention_mask': attention_mask})
    y_predicted = np.argmax(predicted, axis=1)

    # Get the true label for the selected sample
    true_label = list(df_test['labels'])[sample_index]
    email_text = list(df_test['text'])[sample_index]

    # Print the prediction and true label
    print("Predicted label:", y_predicted[0])
    print("True label:", true_label)
    print(encoded_dict)
    # Set the desired width for each line
    line_width = 100
    formatted_string = textwrap.fill(email_text, line_width)
    print(formatted_string[:300])
    print("\n")
    

Predicted label: 0
True label: 0
{'legit': 0, 'phish': 1}
domain com discovery isn t preparing a bid for scripps networks although it s intrigued deadline
hollywood has posted urgent news discovery isn t preparing a bid for scripps networks although it s
intrigued for all of deadline s headlines follow us deadline on twitter this email was sent to by
deadl


Predicted label: 1
True label: 1
{'legit': 0, 'phish': 1}
webmaster we have upgraded to 500mg email space login into your account to confirm if your account
is still active then update the informations below for confirmation and upgrade full name email
password confirm password note if you have not been upgraded please fill out the above to upgarde to
500m


Predicted label: 0
True label: 0
{'legit': 0, 'phish': 1}
a walk sorry to have been so out of touch and missing seeing you after your kind not a lot has been
going on at home and work that have been very distracting i don t know if you are around this
weekend but i d love to go