In [None]:
# initial GRU model

In [1]:
import pandas as pd
import re
import sklearn as sk
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report

# file management
import os
import sys

# NLP
import nltk
from nltk.corpus import stopwords
nltk.download('wordnet')
nltk.download('stopwords')
import gensim
import gensim.downloader as gdownload

# deep learning
import tensorflow as tf
import keras
from keras import layers
print(tf.config.list_physical_devices('GPU')) # check if gpu is detected
from keras import backend as K
import gc
from sklearn.model_selection import KFold

# visualization

import matplotlib.pyplot as plt
from IPython.display import display
# tensorboard
%load_ext tensorboard

# performance
import multiprocessing

[nltk_data] Downloading package wordnet to /home/ashkan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/ashkan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
2023-08-23 11:22:07.862055: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-08-23 11:22:07.910157: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


2023-08-23 11:22:09.781266: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-08-23 11:22:09.836807: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-08-23 11:22:09.836867: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.


In [2]:
df = pd.read_csv('./PhishingDataset_HFES2020.csv', encoding='windows-1252')
df = df.replace('ham', 'Ham') # replace ham with Ham
df = df[df['Email_type'] != 'Attention_check'] # remove Attention_check s
display(df.head())
display(df['Email_type'].unique())

Unnamed: 0,Email_ID,Sender,Subject,Email,Email_type
0,1,noreply@powerballs.com,You Have Won!,<p>*********PLEASE DO NOT RESPOND TO THIS EMAI...,Phishing
1,2,noreply@paypalceo.com,PayPal Breach,<p>********* RESPONES TO THIS EMAIL WILL NOT B...,Phishing
2,3,support@credit.chase.com,URGENT: Fraudulent activity detected,"<p>Hello,&nbsp;</p><p>&nbsp;</p><p>We are writ...",Phishing
3,4,mary@yahoo.com,Donations needed for Mark,"<p>Hello,</p><p>&nbsp;</p><p>I&#39;m contactin...",Phishing
4,5,support@security.amazon.com,Your Amazon Account,<p><strong>The account number associated with ...,Phishing


array(['Phishing', 'Ham'], dtype=object)

In [3]:
# preprocessin funcs
# preprocessing functions

def multiple_replace(arr, replace, source):
    for item in arr:
        source = re.sub(item, replace, source)

    return source

def text_preprocessing(tweets_list, embedding, maxlen):
    set_stopwords = set(stopwords.words('english'))

    processed_tweets = []
    for i, txt in enumerate(tweets_list):        
        # replace stuff
        txt = re.sub(r'\b\S*[\x80-\xFF]\S*\b', ' ', txt) # any words with non-ascii characters
        txt = re.sub(r'((http|https)\:\/\/)?[a-zA-Z0-9\.\/\?\:@\-_=#]+\.([a-zA-Z]){2,6}([a-zA-Z0-9\.\&\/\?\:@\-_=#])*', ' url ', txt) # urls
        txt = re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,7}\b', ' email ', txt) # emails
        txt = re.sub(r'<.*?>', '', txt) # remove html tags completely
        txt = re.sub(r'&.*?;', ' ', txt) # remove HTML entities
        txt = re.sub(r'#', ' ', txt) # hastags --> just remove the tag
        txt = re.sub(r'\b\d+\b', ' num ', txt) # numbers
        txt = re.sub(r'[^\w\s]', r' \g<0> ', txt) # punctuation
        
        # lowercase
        txt = txt.lower()

        # https://saturncloud.io/blog/reshaping-text-data-for-lstm-models-in-keras-a-comprehensive-guide/

        # split
        # nltk handles all punctuation as features
        word_arr = re.split(f'\s+', txt) # returns list of words
    
        # remove stopwords and drop empty strings
        word_arr = [word for word in word_arr if word not in set_stopwords and len(word) != 0]
        
        # lemmatize
        lemmatizer = nltk.stem.WordNetLemmatizer()
        word_arr = [lemmatizer.lemmatize(word) for word in word_arr]

        if i % 10000 == 0: # log the processed message in specified intervals
            print(f"Processed text #{i}:", word_arr)
            print("---------------------------")

        processed_tweets.append(word_arr)
    
    # tokenize (I ditched the old tokenizer)
    print("tokenizing...")
    embedding_length = len(embedding)
    # convert each word to its index. if it doesn't exist, set it to the last index. I don't care that it ruins one word's meaning
    tokenized = [[embedding.key_to_index[word] if word in embedding else (embedding_length - 1) for word in split_sentence] for split_sentence in processed_tweets]

    # add padding and convert to numpy array
    print('padding sequences...')
    tokenized = np.asarray(keras.preprocessing.sequence.pad_sequences(
            tokenized,
            padding = 'post',
            maxlen = maxlen,
    ))

    # DEBUG
    print(tokenized)
    print('feature vector shape:', tokenized.shape)

    return tokenized

# preprocess annotations for initial binary classification
def annotation_preprocessing(annotation_list):
    # set all "threat" to 1, the rest to 0
    return np.asarray([1 if x == "Phishing" else 0 for x in annotation_list])

def train_valid_test_split(ds, train_ratio, valid_ratio, batch_size, k = None):
    train_ratio = 0.8
    valid_ratio = 0.1
    init_len = len(ds)
    num_train = np.floor(init_len * train_ratio)
    num_valid = np.floor(init_len * valid_ratio)

    train_ds = ds.take(num_train).batch(batch_size).prefetch(tf.data.AUTOTUNE)
    valid_ds = ds.skip(num_train).take(num_valid).batch(batch_size).prefetch(tf.data.AUTOTUNE)
    test_ds = ds.skip(num_train).skip(num_valid).batch(batch_size).prefetch(tf.data.AUTOTUNE)
    
    print(f'train ds has {num_train} items in {len(train_ds)} batches.')
    print(f'valid ds has {num_valid} items in {len(valid_ds)} batches.')
    print(f'test ds has {init_len - num_train - num_valid} items in {len(test_ds)} batches.')

    return (train_ds, valid_ds, test_ds)

def train_and_evaluate(model, train_ds, test_ds, epochs, optimizer, loss, valid_ds = None):
    model.compile(
        loss = loss,
        optimizer = optimizer,
        metrics = ['acc'],
    )

    print(model.summary())
    history = model.fit(
        train_ds,
        validation_data = valid_ds, # ignored if None
        epochs = epochs,
    )

    if valid_ds != None:
        # plot losses over time --> shown after training
        plt.plot(history.history['acc'])
        plt.plot(history.history['val_acc'])
        plt.title('Accuracy')
        plt.xlabel('epoch')
        plt.xlabel('accuracy')
        plt.legend(['train','val'], loc='upper left')
        plt.grid()
        plt.ylim(0.5, 1)
        plt.show()

        plt.plot(history.history['loss'])
        plt.plot(history.history['val_loss'])
        plt.title('Loss')
        plt.xlabel('epoch')
        plt.xlabel('loss')
        plt.legend(['train','val'], loc='upper left')
        plt.grid()
        plt.show()

    # evaluate
    return model.evaluate(test_ds)[1]

def tokenize_types(unique_types):
    index_to_type = {index: value for index, value in zip(range(len(unique_types)), unique_types)}
    type_to_index = {value: index for index, value in index_to_type.items()}
    return (index_to_type, type_to_index)

def types_preprocessing(typescol):
    # notes: 
    # I'll just use the first vulnerability type 
    # If the type is "all", I'll drop it from the input

    processed = []
    for string in typescol:
        # print(string)
        string = string.lower() # lowercase
        string = re.sub(r"[,\[\]']", " ", string) # replace [, ], and ' with space
        strlist = string.split() # split list by space
        if 'all' in strlist: strlist.remove('all') # remove the "all" type
        one_type = strlist[0] if len(strlist) > 0 else "empty" # select just the first type

        processed.append(one_type)

    index_to_type, type_to_index = tokenize_types(set(processed))
    # print(type_to_index.items()) # DEBUG

    for i in range(len(processed)):
        processed[i] = type_to_index[processed[i]] # convert each type to index

    return (index_to_type, type_to_index, np.asarray(processed))

def shuffle(nparr, random_state = 23):
    rng = np.random.RandomState(random_state) # reset the seed
    return rng.permutation(nparr)

In [4]:
# # get the average length of emails
# tweetlengths = df['Email'].apply(lambda x: len(x.split()))
# print(np.average(tweetlengths))
# # I'll use 100 words

In [5]:
# gdownload.info()

In [6]:
# actual preprocessing
# embedding = gdownload.load('glove-wiki-gigaword-100') # pretrained embedding --> much cleaner than twitter stuff
embedding = gdownload.load('glove-twitter-200') # pretrained embedding 
# embedding = gdownload.load('glove-twitter-50') # pretrained embedding 
# embedding = gdownload.load('word2vec-google-news-300')
original_texts = np.asarray(df['Email']) # I'll use this to check the preprocessing process

max_words = 80
email_bodies = text_preprocessing(df['Email'], embedding, maxlen = max_words) 
annotation_labels = annotation_preprocessing(df['Email_type'])
# index_to_type, type_to_index, type_labels = types_preprocessing(df['type'])

# shuffle the data

###### vvvv SEED IS HERE vvvv ######
# seed = 183
# seed = 89
# seed = 11
# seed = 42
seed = 30
###### ^^^^ SEED IS HERE ^^^^ ######

original_texts = shuffle(original_texts, random_state = seed) # debug
email_bodies = shuffle(email_bodies, random_state = seed)
annotation_labels = shuffle(annotation_labels, random_state = seed)
# type_labels = shuffle(type_labels)

# reduce data for faster training # REMOVE LATER
ratio_keep = 1 
original_texts = original_texts[:int(len(original_texts) * ratio_keep)] # debug
email_bodies = email_bodies[:int(len(email_bodies) * ratio_keep)]
annotation_labels = annotation_labels[:int(len(annotation_labels) * ratio_keep)]
# type_labels = type_labels[:int(len(type_labels) * ratio_keep)]

# DEBUG
def print_list(title, list):
    print(title)
    for i, x in enumerate(list):
        print(f'<<{i}>>', x)
    print("------------------------------------------")

sample_length = 10
print_list("original data:", original_texts[:sample_length])
tokenized_input_sample = [[index for index in x] for x in email_bodies][:sample_length]
print_list("split input: ", [[embedding.index_to_key[index] for index in example] for example in tokenized_input_sample])
print_list("tokenized input: ", tokenized_input_sample)
print("labels: ", annotation_labels[:sample_length])


Processed text #0: ['*', '*', '*', '*', '*', '*', '*', '*', '*', 'please', 'respond', 'email', '*', '*', '*', '*', '*', '*', '*', '*', '*', 'record', 'show', 'entered', 'win', 'state', 'powerball', 'jackpot', 'num', '/', 'num', '/', 'num', '.', 'receiving', 'email', 'listed', 'one', 'winner', '.', 'claim', 'prize', 'please', 'visit', 'site', 'fill', 'information', 'needed', 'collect', '.', 'must', 'process', 'information', 'within', 'week', 'time', 'may', 'lose', 'winning', '.', 'congratulation', '!', 'collect', 'earnings', '!', 'please', 'click', 'prompt', 'response', 'regarding', 'matter', 'appreciated', '.', 'sincerely', ',', 'powerball', 'team']
---------------------------
tokenizing...
padding sequences...
[[   42    42    42 ...     0     0     0]
 [   42    42    42 ...     0     0     0]
 [  996     4  3145 ...     0     0     0]
 ...
 [  589 31134     4 ...     0     0     0]
 [    1  3709     1 ...  2471  2311     1]
 [  589     4 11532 ...     0     0     0]]
feature vector 

In [7]:
# model training funcs + k-fold

def pretrained_embedding(embedding):
    # note: embedding is declared in the previous cell
    
    vocab_size = len(embedding)
    embedding_vector_size = len(embedding[embedding.index_to_key[0]])

    # create embedding matrix
    embedding_matrix = np.zeros((vocab_size, embedding_vector_size))
    # iterate through embedding and copy word vectors as weights
    for i in range(vocab_size):
        embedding_matrix[i, :] = embedding[embedding.index_to_key[i]]

    embedding_layer = layers.Embedding(input_dim = vocab_size, output_dim = embedding_vector_size, trainable = False)
    embedding_layer.build((None,)) # I have no idea why I should do this
    embedding_layer.set_weights([embedding_matrix]) # square brackets are because some layers take multiple types of weights
    
    return embedding_layer

def build_model():
  model = keras.Sequential([
    layers.Input(shape = (max_words,)),
    pretrained_embedding(embedding),
    # layers.GRU(MAX_TWEET_WORDS, return_sequences=True), # not a difference
    layers.BatchNormalization(),
    layers.Bidirectional(
        layers.GRU(
            max_words,
            dropout = 0.2,
        ),
    ),

    layers.Dense(32, activation = 'relu'),
    layers.Dropout(0.7),
    layers.Dense(8, activation = 'relu'),
    layers.Dropout(0.7),
    layers.Dense(1, activation = 'sigmoid'),
  ])

  return model

def kfold(ds, epochs, batch_size, k):
  loss = keras.losses.BinaryCrossentropy()
  optimizer = keras.optimizers.Adam(learning_rate = 0.001)
  autotune = tf.data.AUTOTUNE

  if k == None:
    # normal stuff
    model = build_model()

    train_ds, valid_ds, test_ds = train_valid_test_split(ds, 0.6, 0.2, batch_size)
    train_and_evaluate(
      model,
      train_ds = train_ds,
      valid_ds = valid_ds,
      test_ds = test_ds,
      epochs = epochs,
      loss = loss,
      optimizer = optimizer,
    )

  else:
    accuracies = []
    for i in range(k):
      print(f'fold {i}')
      
      model = build_model()
      num_total = len(ds)
      num_test = np.floor(num_total / k)
      num_train = num_total - num_test

      test_range = [np.floor((i) * num_test), np.floor((i + 1) * num_test)]
      train_ds_p1 = ds.take(test_range[0])
      train_ds_p2 = ds.skip(test_range[1])
      train_ds = train_ds_p1.concatenate(train_ds_p2).batch(batch_size).prefetch(autotune)
      print(f'train dataset range: {test_range[0]} - {test_range[1]}')
      test_ds = ds.skip(np.floor((i) * num_test)).take(num_test).batch(batch_size).prefetch(autotune)
      print(f'test dataset range: {test_range[0]} - {test_range[1]}')

      print(f'train ds has {num_train} items in {len(train_ds)} batches.')
      print(f'test ds has {num_test} items in {len(test_ds)} batches.')
      
      accuracy = train_and_evaluate(
        model,
        train_ds,
        test_ds,
        epochs = epochs,
        loss = loss,
        optimizer = optimizer,
      )
      
      print("accuracy: ", accuracy)
      accuracies.append(accuracy)

    print(f"average accuracy: {np.average(accuracies)}")

In [8]:
model1_ds = tf.data.Dataset.from_tensor_slices((email_bodies, annotation_labels))
kfold(ds = model1_ds, epochs = 200, batch_size = 32, k = None)
# note: I have serious memory leak problems with k-fold.
# I'll use the following seeds to verify the average accuracy:
# 183, 89, 11, 42, 30

2023-08-23 11:23:43.419201: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-08-23 11:23:43.419492: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-08-23 11:23:43.419522: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-08-23 11:23:44.160769: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-08-23 11:23:44.160856: I tensorflow/compile

train ds has 191.0 items in 6 batches.
valid ds has 23.0 items in 1 batches.
test ds has 25.0 items in 1 batches.
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 80, 200)           238702800 
                                                                 
 batch_normalization (Batch  (None, 80, 200)           800       
 Normalization)                                                  
                                                                 
 bidirectional (Bidirection  (None, 160)               135360    
 al)                                                             
                                                                 
 dense (Dense)               (None, 32)                5152      
                                                                 
 dropout (Dropout)           (None, 32)                0         
        

2023-08-23 11:23:50.929452: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:432] Loaded cuDNN version 8600
2023-08-23 11:23:51.090742: I tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:606] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.
2023-08-23 11:23:51.096246: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x7fc37402d250 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2023-08-23 11:23:51.096288: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA GeForce RTX 3060 Laptop GPU, Compute Capability 8.6
2023-08-23 11:23:51.101613: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:255] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2023-08-23 11:23:51.205027: I tensorflow/tsl/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory
2023-08-23 11:

KeyboardInterrupt: 