In [None]:
# includes separate classifier (bottom of notebook)

In [1]:
import pandas as pd
import re
import sklearn as sk
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report

# file management
import os
import sys

# NLP
import nltk
from nltk.corpus import stopwords
nltk.download('wordnet')
nltk.download('stopwords')
import gensim
import gensim.downloader as gdownload

# deep learning
import tensorflow as tf
import keras
from keras import layers
print(tf.config.list_physical_devices('GPU')) # check if gpu is detected
from keras import backend as K
import gc
from sklearn.model_selection import KFold

# visualization
import datetime
import matplotlib.pyplot as plt
from IPython.display import display
# tensorboard
%load_ext tensorboard

# performance
import multiprocessing

[nltk_data] Downloading package wordnet to /home/ashkan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/ashkan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
2023-08-23 10:05:10.188065: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-08-23 10:05:10.241397: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


2023-08-23 10:05:11.751700: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-08-23 10:05:11.810923: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-08-23 10:05:11.810981: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.


In [2]:
df = pd.read_csv('./PhishingDataset_HFES2020.csv', encoding='windows-1252')
df = df.replace('ham', 'Ham') # replace ham with Ham
df = df[df['Email_type'] != 'Attention_check'] # remove Attention_check s
display(df.head())
display(df['Email_type'].unique())

Unnamed: 0,Email_ID,Sender,Subject,Email,Email_type
0,1,noreply@powerballs.com,You Have Won!,<p>*********PLEASE DO NOT RESPOND TO THIS EMAI...,Phishing
1,2,noreply@paypalceo.com,PayPal Breach,<p>********* RESPONES TO THIS EMAIL WILL NOT B...,Phishing
2,3,support@credit.chase.com,URGENT: Fraudulent activity detected,"<p>Hello,&nbsp;</p><p>&nbsp;</p><p>We are writ...",Phishing
3,4,mary@yahoo.com,Donations needed for Mark,"<p>Hello,</p><p>&nbsp;</p><p>I&#39;m contactin...",Phishing
4,5,support@security.amazon.com,Your Amazon Account,<p><strong>The account number associated with ...,Phishing


array(['Phishing', 'Ham'], dtype=object)

In [3]:
# preprocessin funcs
# preprocessing functions

log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

def multiple_replace(arr, replace, source):
    for item in arr:
        source = re.sub(item, replace, source)

    return source

def preprocess_txt(txt):
    set_stopwords = set(stopwords.words('english'))
    
    # replace stuff
    txt = re.sub(r'\b\S*[\x80-\xFF]\S*\b', ' ', txt) # any words with non-ascii characters
    txt = re.sub(r'((http|https)\:\/\/)?[a-zA-Z0-9\.\/\?\:@\-_=#]+\.([a-zA-Z]){2,6}([a-zA-Z0-9\.\&\/\?\:@\-_=#])*', ' url ', txt) # urls
    txt = re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,7}\b', ' email ', txt) # emails
    txt = re.sub(r'<.*?>', ' ', txt) # remove html tags completely
    txt = re.sub(r'&.*?;', ' ', txt) # remove HTML entities
    txt = re.sub(r'#', ' ', txt) # hastags --> just remove the tag
    txt = re.sub(r'\b\d+\b', ' num ', txt) # numbers
    txt = re.sub(r'[^\w\s]', r' \g<0> ', txt) # punctuation
    
    # lowercase
    txt = txt.lower()

    # https://saturncloud.io/blog/reshaping-text-data-for-lstm-models-in-keras-a-comprehensive-guide/

    # split
    # nltk handles all punctuation as features
    word_arr = re.split(f'\s+', txt) # returns list of words

    # remove stopwords and drop empty strings
    word_arr = [word for word in word_arr if word not in set_stopwords and len(word) != 0]
    
    # lemmatize
    lemmatizer = nltk.stem.WordNetLemmatizer()
    word_arr = [lemmatizer.lemmatize(word) for word in word_arr]

    return word_arr

def preprocess_txt_list(txt_list, embedding, sequence_length):

    processed_tweets = []
    for i, txt in enumerate(txt_list):

        word_list = preprocess_txt(txt)
        processed_tweets.append(word_list)
                                    
        if i % 10000 == 0: # log the processed message in specified intervals
            print(f"Processed text #{i}:", word_list)
            print("---------------------------")        
    
    # tokenize (I ditched the old tokenizer)
    print("tokenizing...")
    embedding_length = len(embedding)
    # convert each word to its index. if it doesn't exist, set it to the last index. I don't care that it ruins one word's meaning
    tokenized = [[embedding.key_to_index[word] if word in embedding else (embedding_length - 1) for word in split_sentence] for split_sentence in processed_tweets]

    # add padding and convert to numpy array
    print('padding sequences...')
    tokenized = np.asarray(keras.preprocessing.sequence.pad_sequences(
            tokenized,
            padding = 'post',
            maxlen = sequence_length,
    ))

    # DEBUG
    print(tokenized)
    print('feature vector shape:', tokenized.shape)

    return tokenized

# preprocess annotations for initial binary classification
def preprocess_annotations(annotation_list):
    # set all "threat" to 1, the rest to 0
    return np.asarray([1 if x == "Phishing" else 0 for x in annotation_list])

def train_valid_test_split(ds, train_ratio, valid_ratio, batch_size):
    train_ratio = 0.8
    valid_ratio = 0.1
    init_len = len(ds)
    num_train = np.floor(init_len * train_ratio)
    num_valid = np.floor(init_len * valid_ratio)

    train_ds = ds.take(num_train).batch(batch_size).prefetch(tf.data.AUTOTUNE)
    valid_ds = ds.skip(num_train).take(num_valid).batch(batch_size).prefetch(tf.data.AUTOTUNE)
    test_ds = ds.skip(num_train).skip(num_valid).batch(batch_size).prefetch(tf.data.AUTOTUNE)
    
    print(f'train ds has {num_train} items in {len(train_ds)} batches.')
    print(f'valid ds has {num_valid} items in {len(valid_ds)} batches.')
    print(f'test ds has {init_len - num_train - num_valid} items in {len(test_ds)} batches.')

    return (train_ds, valid_ds, test_ds)

def shuffle(nparr, random_state = 23):
    rng = np.random.RandomState(random_state) # reset the seed
    return rng.permutation(nparr)

def train_and_evaluate(model, train_ds, test_ds, epochs, 
                        optimizer = keras.optimizers.Adam(learning_rate = 0.001),
                        loss = keras.losses.BinaryCrossentropy(),
                        valid_ds = None):
    model.compile(
        loss = loss,
        optimizer = optimizer,
        metrics = ['acc'],
    )

    print(model.summary())
    history = model.fit(
        train_ds,
        validation_data = valid_ds, # ignored if None
        epochs = epochs,
        callbacks = [tensorboard_callback],
    )

    if valid_ds != None:
        # plot losses over time --> shown after training
        plt.plot(history.history['acc'])
        plt.plot(history.history['val_acc'])
        plt.title('Accuracy')
        plt.xlabel('epoch')
        plt.xlabel('accuracy')
        plt.legend(['train','val'], loc='upper left')
        plt.grid()
        plt.ylim(0.5, 1)
        plt.show()

        plt.plot(history.history['loss'])
        plt.plot(history.history['val_loss'])
        plt.title('Loss')
        plt.xlabel('epoch')
        plt.xlabel('loss')
        plt.legend(['train','val'], loc='upper left')
        plt.grid()
        plt.show()

    # evaluate
    return (model.evaluate(test_ds)[1], model)

# model training funcs + k-fold

def pretrained_embedding(embedding):
    # note: embedding is declared in the previous cell
    
    vocab_size = len(embedding)
    embedding_vector_size = len(embedding[embedding.index_to_key[0]])

    # create embedding matrix
    embedding_matrix = np.zeros((vocab_size, embedding_vector_size))
    # iterate through embedding and copy word vectors as weights
    for i in range(vocab_size):
        embedding_matrix[i, :] = embedding[embedding.index_to_key[i]]

    embedding_layer = layers.Embedding(input_dim = vocab_size, output_dim = embedding_vector_size, trainable = False)
    embedding_layer.build((None,)) # I have no idea why I should do this
    embedding_layer.set_weights([embedding_matrix]) # square brackets are because some layers take multiple types of weights
    
    return embedding_layer

def build_model(sequence_length, embedding):
  model = keras.Sequential([
    layers.Input(shape = (sequence_length,)),
    pretrained_embedding(embedding),
    # layers.GRU(MAX_TWEET_WORDS, return_sequences=True), # not a difference
    layers.BatchNormalization(),
    layers.Bidirectional(
        layers.GRU(
            sequence_length,
            dropout = 0.2,
        ),
    ),

    layers.Dense(32, activation = 'relu'),
    layers.Dropout(0.7),
    layers.Dense(8, activation = 'relu'),
    layers.Dropout(0.7),
    layers.Dense(1, activation = 'sigmoid'),
  ])

  return model

def kfold(ds, epochs, batch_size, k, sequence_length, embedding):
  loss = keras.losses.BinaryCrossentropy()
  optimizer = keras.optimizers.Adam(learning_rate = 0.001)
  autotune = tf.data.AUTOTUNE

  if k == None:
    # normal stuff
    model = build_model(sequence_length, embedding)

    train_ds, valid_ds, test_ds = train_valid_test_split(ds, 0.6, 0.2, batch_size)
    train_and_evaluate(
      model,
      train_ds = train_ds,
      valid_ds = valid_ds,
      test_ds = test_ds,
      epochs = epochs,
      loss = loss,
      optimizer = optimizer,
    )

  else:
    accuracies = []
    for i in range(k):
      print(f'fold {i}')
      
      model = build_model(sequence_length, embedding)
      num_total = len(ds)
      num_test = np.floor(num_total / k)
      num_train = num_total - num_test

      test_range = [np.floor((i) * num_test), np.floor((i + 1) * num_test)]
      train_ds_p1 = ds.take(test_range[0])
      train_ds_p2 = ds.skip(test_range[1])
      train_ds = train_ds_p1.concatenate(train_ds_p2).batch(batch_size).prefetch(autotune)
      print(f'train dataset range: {test_range[0]} - {test_range[1]}')
      test_ds = ds.skip(np.floor((i) * num_test)).take(num_test).batch(batch_size).prefetch(autotune)
      print(f'test dataset range: {test_range[0]} - {test_range[1]}')

      print(f'train ds has {num_train} items in {len(train_ds)} batches.')
      print(f'test ds has {num_test} items in {len(test_ds)} batches.')
      
      accuracy = train_and_evaluate(
        model,
        train_ds,
        test_ds,
        epochs = epochs,
        loss = loss,
        optimizer = optimizer,
      )[0]
      
      print("accuracy: ", accuracy)
      accuracies.append(accuracy)

    print(f"average accuracy: {np.average(accuracies)}")

2023-08-23 10:05:12.158683: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-08-23 10:05:12.158846: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-08-23 10:05:12.158888: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-08-23 10:05:12.849159: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-08-23 10:05:12.849267: I tensorflow/compile

In [4]:
# # get the average length of emails
# tweetlengths = df['Email'].apply(lambda x: len(x.split()))
# print(np.average(tweetlengths))

In [5]:
# gdownload.info()

In [6]:
# actual preprocessing
# embedding = gdownload.load('glove-wiki-gigaword-100') # pretrained embedding --> much cleaner than twitter stuff
embedding = gdownload.load('glove-twitter-200') # pretrained embedding 
# embedding = gdownload.load('glove-twitter-50') # pretrained embedding 
# embedding = gdownload.load('word2vec-google-news-300')
original_texts = np.asarray(df['Email']) # I'll use this to check the preprocessing process

email_bodies = preprocess_txt_list(df['Email'], embedding, sequence_length = 80) 
annotation_labels = preprocess_annotations(df['Email_type'])

# shuffle the data

###### vvvv SEED IS HERE vvvv ######
# seed = 183
# seed = 89
# seed = 11
# seed = 42
seed = 30
###### ^^^^ SEED IS HERE ^^^^ ######

original_texts = shuffle(original_texts, random_state = seed) # debug
email_bodies = shuffle(email_bodies, random_state = seed)
annotation_labels = shuffle(annotation_labels, random_state = seed)
# type_labels = shuffle(type_labels)

# reduce data for faster training # REMOVE LATER
ratio_keep = 1 
original_texts = original_texts[:int(len(original_texts) * ratio_keep)] # debug
email_bodies = email_bodies[:int(len(email_bodies) * ratio_keep)]
annotation_labels = annotation_labels[:int(len(annotation_labels) * ratio_keep)]
# type_labels = type_labels[:int(len(type_labels) * ratio_keep)]

# DEBUG
def print_list(title, list):
    print(title)
    for i, x in enumerate(list):
        print(f'<<{i}>>', x)
    print("------------------------------------------")

sample_length = 10
print_list("original data:", original_texts[:sample_length])
tokenized_input_sample = [[index for index in x] for x in email_bodies][:sample_length]
print_list("split input: ", [[embedding.index_to_key[index] for index in example] for example in tokenized_input_sample])
print_list("tokenized input: ", tokenized_input_sample)
print("labels: ", annotation_labels[:sample_length])


Processed text #0: ['*', '*', '*', '*', '*', '*', '*', '*', '*', 'please', 'respond', 'email', '*', '*', '*', '*', '*', '*', '*', '*', '*', 'record', 'show', 'entered', 'win', 'state', 'powerball', 'jackpot', 'num', '/', 'num', '/', 'num', '.', 'receiving', 'email', 'listed', 'one', 'winner', '.', 'claim', 'prize', 'please', 'visit', 'site', 'fill', 'information', 'needed', 'collect', '.', 'must', 'process', 'information', 'within', 'week', 'time', 'may', 'lose', 'winning', '.', 'congratulation', '!', 'collect', 'earnings', '!', 'please', 'click', 'prompt', 'response', 'regarding', 'matter', 'appreciated', '.', 'sincerely', ',', 'powerball', 'team']
---------------------------
tokenizing...
padding sequences...
[[   42    42    42 ...     0     0     0]
 [   42    42    42 ...     0     0     0]
 [  996     4  3145 ...     0     0     0]
 ...
 [  589 31134     4 ...     0     0     0]
 [ 3709     1   416 ...  2471  2311     1]
 [  589     4 11532 ...     0     0     0]]
feature vector 

In [8]:
model1_ds = tf.data.Dataset.from_tensor_slices((email_bodies, annotation_labels))
kfold(
  ds = model1_ds,
  epochs = 200,
  batch_size = 32, 
  k = None, # anything other than non is broken
  sequence_length = len(email_bodies[0]),
  embedding = embedding
)
# note: I have serious memory leak problems with k-fold.
# I'll use the following seeds to verify the average accuracy:
# 183, 89, 11, 42, 30

train ds has 191.0 items in 6 batches.
valid ds has 23.0 items in 1 batches.
test ds has 25.0 items in 1 batches.
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 80, 200)           238702800 
                                                                 
 batch_normalization (Batch  (None, 80, 200)           800       
 Normalization)                                                  
                                                                 
 bidirectional (Bidirection  (None, 160)               135360    
 al)                                                             
                                                                 
 dense (Dense)               (None, 32)                5152      
                                                                 
 dropout (Dropout)           (None, 32)                0         
        

2023-08-23 10:08:52.983983: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:432] Loaded cuDNN version 8600
2023-08-23 10:08:53.129905: I tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:606] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.
2023-08-23 10:08:53.134115: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x7fc298184510 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2023-08-23 10:08:53.134146: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA GeForce RTX 3060 Laptop GPU, Compute Capability 8.6
2023-08-23 10:08:53.139077: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:255] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2023-08-23 10:08:53.241930: I tensorflow/tsl/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory
2023-08-23 10:



2023-08-23 10:09:06.852222: W tensorflow/tsl/framework/bfc_allocator.cc:485] Allocator (mklcpu) ran out of memory trying to allocate 53.35GiB (rounded to 57288672000)requested by op OneHot
If the cause is memory fragmentation maybe the environment variable 'TF_GPU_ALLOCATOR=cuda_malloc_async' will improve the situation. 
Current allocation summary follows.
Current allocation summary follows.
2023-08-23 10:09:06.852429: I tensorflow/tsl/framework/bfc_allocator.cc:1039] BFCAllocator dump for mklcpu
2023-08-23 10:09:06.852439: I tensorflow/tsl/framework/bfc_allocator.cc:1046] Bin (256): 	Total Chunks: 0, Chunks in use: 0. 0B allocated for chunks. 0B in use in bin. 0B client-requested in use in bin.
2023-08-23 10:09:06.852443: I tensorflow/tsl/framework/bfc_allocator.cc:1046] Bin (512): 	Total Chunks: 0, Chunks in use: 0. 0B allocated for chunks. 0B in use in bin. 0B client-requested in use in bin.
2023-08-23 10:09:06.852448: I tensorflow/tsl/framework/bfc_allocator.cc:1046] Bin (1024): 	T

ResourceExhaustedError: {{function_node __wrapped__OneHot_device_/job:localhost/replica:0/task:0/device:CPU:0}} OOM when allocating tensor with shape[238702800,30] and type double on /job:localhost/replica:0/task:0/device:CPU:0 by allocator mklcpu [Op:OneHot] name: 

In [7]:
# train model for live classification

# live_model_ds = tf.data.Dataset.from_tensor_slices((email_bodies, annotation_labels))
# live_model_train, _, live_model_test = train_valid_test_split(live_model_ds, 0.8, 0, batch_size = 32)

# sequence_length = len(email_bodies[0])
# live_model = build_model(sequence_length, embedding)
# _, live_model = train_and_evaluate(live_model, live_model_train, live_model_test, epochs = 100)

train ds has 191.0 items in 6 batches.
valid ds has 23.0 items in 1 batches.
test ds has 25.0 items in 1 batches.
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 80, 200)           238702800 
                                                                 
 batch_normalization (Batch  (None, 80, 200)           800       
 Normalization)                                                  
                                                                 
 bidirectional (Bidirection  (None, 160)               135360    
 al)                                                             
                                                                 
 dense (Dense)               (None, 32)                5152      
                                                                 
 dropout (Dropout)           (None, 32)                0         
        

2023-08-23 10:03:31.910959: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:432] Loaded cuDNN version 8600
2023-08-23 10:03:32.139663: I tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:606] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.
2023-08-23 10:03:32.147404: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x7f53ec0217f0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2023-08-23 10:03:32.147456: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA GeForce RTX 3060 Laptop GPU, Compute Capability 8.6
2023-08-23 10:03:32.174796: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:255] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2023-08-23 10:03:32.392614: I tensorflow/tsl/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory
2023-08-23 10:

Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 7

In [42]:
# live classification
input = ""
word_arr = preprocess_txt_list([input], embedding, sequence_length)

prediction = live_model.predict(word_arr)
print(prediction)
prediction = 1 if prediction > 0.5 else 0

prediction_to_class = {
  0: "Ham",
  1: "Phishing"
}

print(prediction_to_class[prediction])

Processed text #0: ['.', '.', '.']
---------------------------
tokenizing...
padding sequences...
[[1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0]]
feature vector shape: (1, 80)
[[0.9895074]]
Phishing
