In [62]:
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Bidirectional, LSTM, Dense, Input, Dropout, Flatten, Embedding, GRU
from tensorflow.keras import regularizers
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix, roc_auc_score
from sklearn.model_selection import train_test_split
import plotly.express as px
import plotly.graph_objects as go
import re

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('snowball_data')
lemmatizer = WordNetLemmatizer()
stemmer = SnowballStemmer("english")
stop_words = list(stopwords.words('english'))


emojis = {':)': 'smile', ':-)': 'smile', ';d': 'wink', ':-E': 'vampire', ':(': 'sad',
          ':-(': 'sad', ':-<': 'sad', ':P': 'raspberry', ':O': 'surprised',
          ':-@': 'shocked', ':@': 'shocked',':-$': 'confused', ':\\': 'annoyed',
          ':#': 'mute', ':X': 'mute', ':^)': 'smile', ':-&': 'confused', '$_$': 'greedy',
          '@@': 'eyeroll', ':-!': 'confused', ':-D': 'smile', ':-0': 'yell', 'O.o': 'confused',
          '<(-_-)>': 'robot', 'd[-_-]b': 'dj', ":'-)": 'sadsmile', ';)': 'wink',
          ';-)': 'wink', 'O:-)': 'angel','O*-)': 'angel','(:-D': 'gossip', '=^.^=': 'cat'}

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package snowball_data to /root/nltk_data...
[nltk_data]   Package snowball_data is already up-to-date!


In [11]:
df= pd.read_csv('spam.csv', encoding='ISO-8859-1')

df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [12]:
print("Percentage of missing values: ")
display(100 * df.isnull().sum() / df.shape[0])

Percentage of missing values: 


Unnamed: 0,0
v1,0.0
v2,0.0
Unnamed: 2,99.102656
Unnamed: 3,99.784637
Unnamed: 4,99.892319


In [13]:
df[df["Unnamed: 4"].notna()]

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
281,ham,\Wen u miss someone,the person is definitely special for u..... B...,why to miss them,"just Keep-in-touch\"" gdeve.."""
1038,ham,"Edison has rightly said, \A fool can ask more ...",GN,GE,"GNT:-)"""
2255,ham,I just lov this line: \Hurt me with the truth,I don't mind,i wil tolerat.bcs ur my someone..... But,"Never comfort me with a lie\"" gud ni8 and swe..."
3525,ham,\HEY BABE! FAR 2 SPUN-OUT 2 SPK AT DA MO... DE...,HAD A COOL NYTHO,TX 4 FONIN HON,"CALL 2MWEN IM BK FRMCLOUD 9! J X\"""""
4668,ham,"When I was born, GOD said, \Oh No! Another IDI...",GOD said,"\""OH No! COMPETITION\"". Who knew","one day these two will become FREINDS FOREVER!"""
5048,ham,"Edison has rightly said, \A fool can ask more ...",GN,GE,"GNT:-)"""


In [14]:
df['Message'] = df['v2'].fillna('') + ' ' + df['Unnamed: 2'].fillna('') + ' ' + df['Unnamed: 3'].fillna('') + ' ' + df['Unnamed: 4'].fillna('')

In [15]:
df[df["Unnamed: 4"].notna()]["Message"][281]

'\\Wen u miss someone  the person is definitely special for u..... But if the person is so special  why to miss them  just Keep-in-touch\\" gdeve.."'

In [16]:
df['spam']=df['v1'].apply(lambda x: 1 if x=='spam' else 0)
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",,,,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,,,,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,,,,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,,"Nah I don't think he goes to usf, he lives aro...",0


In [17]:
df.drop(["v1", "v2", "Unnamed: 2","Unnamed: 3","Unnamed: 4"],axis=1,inplace=True)

In [18]:
urlPattern        = r"((http://)[^ ]*|(https://)[^ ]*|( www\.)[^ ]*)"
email_pattern     = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
userPattern       = '@[^\s]+'
alphaPattern      = "[^a-zA-Z0-9]"
sequencePattern   = r"(.)\1\1+"
seqReplacePattern = r"\1\1"

def preprocess(message, stem_or_lem=None) :
    message = message.lower()
    # Replace all URls with 'URL'
    message = re.sub(urlPattern,' URL ',message)
    # Replace all emojis by 'EMOJIemotion'
    for emoji in emojis.keys():
        message = message.replace(emoji, " EMOJI" + emojis[emoji] + " ")

    message = message.replace(" @ ", " at ")
    message = re.sub(email_pattern,' EMAIL ', message)
    # Replace @USERNAME by 'USER'.
    message = re.sub(userPattern,' USER ', message)
    # Replace all non alphabets.
    message = re.sub(alphaPattern, " ", message)
    # Replace 3 or more consecutive letters by 2 letter.
    message = re.sub(sequencePattern, seqReplacePattern, message)

    message_word = ''
    for word in message.split():
        # Checking if the word is a stopword.
        #if word not in stopwordlist:
        if len(word)>1 and word not in stop_words:
            # Lemmatizing or stemming the word.
            if stem_or_lem == "lem":
                word = lemmatizer.lemmatize(word)
            elif stem_or_lem == "stem":
                word = stemmer.stem(word)
            message_word += word+' '

    return message

In [19]:
df["Message_preprocessed"] = df["Message"].apply(lambda x : preprocess(x, None))
df["Message_preprocessed_lem"] = df["Message"].apply(lambda x : preprocess(x, "lem"))
df["Message_preprocessed_stem"] = df["Message"].apply(lambda x : preprocess(x, "stem"))

In [20]:
X_train, X_test, y_train, y_test = train_test_split(df['Message'],df['spam'], stratify=df['spam'], random_state=24)
X_train.head()

Unnamed: 0,Message
579,Arngd marriage is while u r walkin unfortuntly...
4172,Ok... But they said i've got wisdom teeth hidd...
4758,Thanks 4 your continued support Your question ...
3764,Someone U know has asked our dating service 2 ...
4799,its cool but tyler had to take off so we're go...


Using [Bert](https://www.analyticsvidhya.com/blog/2021/12/text-classification-using-bert-and-tensorflow/) (transfer learning)

In [None]:
bert_preprocess = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3")
bert_encoder = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4") # Bert layers
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text') # input layer (shape is a single string)
preprocessed_text = bert_preprocess(text_input) # tokenizes the text
outputs = bert_encoder(preprocessed_text) # generates contextualized embeddings for each token in the input sequence.
l = Dropout(0.1, name="dropout")(outputs['pooled_output']) # Dropout randomly sets 10% of the input units to 0 during training to help prevent overfitting.
l = Dense(1, activation='sigmoid', name="output")(l) # classify the text into either 0 or 1 category. It's the trainable part.

In [None]:
model_bert = tf.keras.Model(inputs=[text_input], outputs=[l]) # Build the model
model_bert.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
model_bert.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 text (InputLayer)              [(None,)]            0           []                               
                                                                                                  
 keras_layer (KerasLayer)       {'input_word_ids':   0           ['text[0][0]']                   
                                (None, 128),                                                      
                                 'input_type_ids':                                                
                                (None, 128),                                                      
                                 'input_mask': (Non                                               
                                e, 128)}                                                      

In [None]:
history = model_bert.fit(X_train, y_train, epochs=20, batch_size=32, validation_data=(X_test, y_test)) # 136 mmin

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [30]:
def evaluate_model(model, history, X_test, y_true) :
    y_predicted = model.predict(X_test)
    # Convert predictions to class labels
    y_pred = (y_predicted > 0.5).astype(int)


    accuracy = accuracy_score(y_true, y_pred)
    f1_macro = f1_score(y_true, y_pred, average='macro')
    f1_micro = f1_score(y_true, y_pred, average='micro')
    report = classification_report(y_true, y_pred)
    conf_matrix = confusion_matrix(y_true, y_pred)

    print("Accuracy:", accuracy)
    print("Macro F1 Score:", f1_macro)
    print("Micro F1 Score:", f1_micro)
    print("\nClassification Report:\n", report)
    print("Confusion Matrix:\n", conf_matrix)
    px.line(history.history, y=["val_loss","loss"]).show()

In [None]:
evaluate_model(model_bert, history, X_test, y_test)

Accuracy: 0.9784637473079684
Macro F1 Score: 0.9516923290331529
Micro F1 Score: 0.9784637473079684

Classification Report:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99      1206
           1       0.96      0.87      0.92       187

    accuracy                           0.98      1393
   macro avg       0.97      0.93      0.95      1393
weighted avg       0.98      0.98      0.98      1393

Confusion Matrix:
 [[1200    6]
 [  24  163]]


In [None]:
model_bert.save('Bert.keras')

Custom model

In [22]:
VOCAB_SIZE = 5000
MAX_SEQUENCE_LENGTH = 100

def get_train_val(df, X_name, BATCH_SIZE = 100) :
    df_copy = df.copy()
    X = df_copy[X_name]
    # Tokenize
    tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=VOCAB_SIZE)
    tokenizer.fit_on_texts(X)
    df_copy['Tokenized_Message'] = tokenizer.texts_to_sequences(X)
    # remove empty rows
    df_copy = df_copy[df_copy["Tokenized_Message"].apply(lambda x: len(x) > 0)]
    # Pad all tokenized messages to a uniform length of 100.
    input_tok_pad = tf.keras.preprocessing.sequence.pad_sequences(df_copy.Tokenized_Message, padding="post", maxlen=MAX_SEQUENCE_LENGTH)

    X_train, X_val, y_train, y_val = train_test_split(input_tok_pad, df_copy.spam, stratify=df_copy.spam, random_state=24)

    train = tf.data.Dataset.from_tensor_slices((X_train, y_train))
    val = tf.data.Dataset.from_tensor_slices((X_val, y_val))

    train = train.shuffle(len(train)).batch(BATCH_SIZE)
    val = val.batch(BATCH_SIZE)
    return train, val, y_val, tokenizer

In [23]:
train_0, val_0, y_val_0, tokenizer_0 = get_train_val(df, "Message")
train_1, val_1, y_val_1, tokenizer_1 = get_train_val(df, "Message_preprocessed")
train_2, val_2, y_val_2, tokenizer_2 = get_train_val(df, "Message_preprocessed_lem")
train_3, val_3, y_val_3, tokenizer_3 = get_train_val(df, "Message_preprocessed_stem")

In [58]:
def get_bi_GRU_model() :
    model = Sequential([
        Embedding(input_dim=VOCAB_SIZE, output_dim=128),
        Bidirectional(GRU(64, return_sequences=True)),
        Bidirectional(GRU(32)),
        Dense(16, activation='relu'),
        Dense(1, activation='sigmoid')
    ])
    return model

In [57]:
def get_fc_model():

    model = Sequential([
        Embedding(input_dim=VOCAB_SIZE, output_dim=128),
        Flatten(),
        Dense(64, activation='relu'),
        Dense(32, activation='relu'),
        Dense(16, activation='relu'),
        Dense(1, activation='sigmoid')
    ])

    return model

In [59]:
def get_bidirLSTM_model():
    model = Sequential([
        Embedding(input_dim=VOCAB_SIZE, output_dim=128),
        Bidirectional(LSTM(64, return_sequences=True)),
        Bidirectional(LSTM(32)),
        Dense(16, activation='relu'),
        Dense(1, activation='sigmoid')
    ])
    return model

In [42]:
model_bi_gru = get_bi_GRU_model()
model_bi_gru.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001), loss='binary_crossentropy', metrics=['accuracy'])
history = model_bi_gru.fit(train_0, validation_data=val_0, epochs=20) # 28 sec / T4 google colab

Epoch 1/20
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 33ms/step - accuracy: 0.8611 - loss: 0.5939 - val_accuracy: 0.8657 - val_loss: 0.3952
Epoch 2/20
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 22ms/step - accuracy: 0.8622 - loss: 0.3905 - val_accuracy: 0.8657 - val_loss: 0.3664
Epoch 3/20
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 53ms/step - accuracy: 0.8645 - loss: 0.3520 - val_accuracy: 0.8657 - val_loss: 0.2903
Epoch 4/20
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 46ms/step - accuracy: 0.8767 - loss: 0.2404 - val_accuracy: 0.9504 - val_loss: 0.1763
Epoch 5/20
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 40ms/step - accuracy: 0.9649 - loss: 0.1493 - val_accuracy: 0.9605 - val_loss: 0.1476
Epoch 6/20
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 26ms/step - accuracy: 0.9748 - loss: 0.1202 - val_accuracy: 0.9684 - val_loss: 0.1248
Epoch 7/20
[1m42/42[0m [32m━━━━

In [43]:
evaluate_model(model_bi_gru, history, val_0, y_val_0)

[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 27ms/step
Accuracy: 0.9849137931034483
Macro F1 Score: 0.9667373293598369
Micro F1 Score: 0.9849137931034483

Classification Report:
               precision    recall  f1-score   support

           0       0.99      1.00      0.99      1205
           1       0.97      0.91      0.94       187

    accuracy                           0.98      1392
   macro avg       0.98      0.96      0.97      1392
weighted avg       0.98      0.98      0.98      1392

Confusion Matrix:
 [[1200    5]
 [  16  171]]


In [44]:
model_bi_gru.save('Custom_model1.keras')

In [60]:
models = {"Bidirectional Gated Recurrent Unit (GRU) model" : get_bi_GRU_model, "Fully Connected Neural Network layer" : get_fc_model, "Bidirectional Long Short-Term Memory (LSTM) network" : get_bidirLSTM_model}
Set_names = ["Raw Data", "Processed Data", "Processed Data with Lemmatization", "Processed Data with Stemming"]
train_sets = [train_0, train_1, train_2, train_3]
val_sets = [val_0, val_1, val_2, val_3]
y_val_sets = [y_val_0, y_val_1, y_val_2, y_val_3]

In [61]:
histories = {}
trained_models  = {}

for model_name, model in models.items() :
    histories[model_name] = {}
    trained_models [model_name] = {}
    for i in range(4) :
        print()
        print()
        model_ = model()
        set_name = Set_names[i]
        train_set = train_sets[i]
        val_set = val_sets[i]
        y_val_set = y_val_sets[i]
        print(model_name, set_name, "train :")
        model_.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001), loss='binary_crossentropy', metrics=['accuracy'])
        history = model_.fit(train_set, validation_data=val_set, epochs=20)
        print()
        print(model_name, set_name, "scores :")
        evaluate_model(model_, history, val_set, y_val_set)
        histories[model_name][set_name] = history
        trained_models [model_name][set_name] = model_




Bidirectional Gated Recurrent Unit (GRU) model Raw Data train :
Epoch 1/20
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 34ms/step - accuracy: 0.6545 - loss: 0.6728 - val_accuracy: 0.8657 - val_loss: 0.5502
Epoch 2/20
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 25ms/step - accuracy: 0.8712 - loss: 0.4967 - val_accuracy: 0.8657 - val_loss: 0.3906
Epoch 3/20
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 32ms/step - accuracy: 0.8661 - loss: 0.3747 - val_accuracy: 0.8657 - val_loss: 0.3322
Epoch 4/20
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 23ms/step - accuracy: 0.8611 - loss: 0.3124 - val_accuracy: 0.8721 - val_loss: 0.2273
Epoch 5/20
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 21ms/step - accuracy: 0.9067 - loss: 0.2168 - val_accuracy: 0.9576 - val_loss: 0.1494
Epoch 6/20
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 21ms/step - accuracy: 0.9732 - loss: 0.1202 - val_accu



Bidirectional Gated Recurrent Unit (GRU) model Processed Data train :
Epoch 1/20
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 34ms/step - accuracy: 0.8506 - loss: 0.6416 - val_accuracy: 0.8664 - val_loss: 0.4808
Epoch 2/20
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 21ms/step - accuracy: 0.8620 - loss: 0.4391 - val_accuracy: 0.8664 - val_loss: 0.3601
Epoch 3/20
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 21ms/step - accuracy: 0.8671 - loss: 0.3484 - val_accuracy: 0.8664 - val_loss: 0.2981
Epoch 4/20
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 22ms/step - accuracy: 0.8730 - loss: 0.2628 - val_accuracy: 0.9167 - val_loss: 0.1893
Epoch 5/20
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 22ms/step - accuracy: 0.9407 - loss: 0.1890 - val_accuracy: 0.9605 - val_loss: 0.1406
Epoch 6/20
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 21ms/step - accuracy: 0.9685 - loss: 0.1407 - va



Bidirectional Gated Recurrent Unit (GRU) model Processed Data with Lemmatization train :
Epoch 1/20
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 33ms/step - accuracy: 0.8471 - loss: 0.6295 - val_accuracy: 0.8664 - val_loss: 0.4404
Epoch 2/20
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 21ms/step - accuracy: 0.8710 - loss: 0.3998 - val_accuracy: 0.8664 - val_loss: 0.3597
Epoch 3/20
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 21ms/step - accuracy: 0.8695 - loss: 0.3399 - val_accuracy: 0.8664 - val_loss: 0.2895
Epoch 4/20
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 21ms/step - accuracy: 0.8716 - loss: 0.2519 - val_accuracy: 0.9375 - val_loss: 0.1787
Epoch 5/20
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 21ms/step - accuracy: 0.9534 - loss: 0.1645 - val_accuracy: 0.9677 - val_loss: 0.1265
Epoch 6/20
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 22ms/step - accuracy: 0.9741 



Bidirectional Gated Recurrent Unit (GRU) model Processed Data with Stemming train :
Epoch 1/20
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 35ms/step - accuracy: 0.6795 - loss: 0.6631 - val_accuracy: 0.8664 - val_loss: 0.5172
Epoch 2/20
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 27ms/step - accuracy: 0.8662 - loss: 0.4633 - val_accuracy: 0.8664 - val_loss: 0.3616
Epoch 3/20
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 32ms/step - accuracy: 0.8696 - loss: 0.3429 - val_accuracy: 0.8664 - val_loss: 0.2941
Epoch 4/20
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 23ms/step - accuracy: 0.8757 - loss: 0.2583 - val_accuracy: 0.9210 - val_loss: 0.1911
Epoch 5/20
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 22ms/step - accuracy: 0.9470 - loss: 0.1751 - val_accuracy: 0.9648 - val_loss: 0.1271
Epoch 6/20
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 21ms/step - accuracy: 0.9745 - lo



Fully Connected Neural Network layer Raw Data train :
Epoch 1/20
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 32ms/step - accuracy: 0.8668 - loss: 0.6014 - val_accuracy: 0.8657 - val_loss: 0.3677
Epoch 2/20
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.8671 - loss: 0.3614 - val_accuracy: 0.8657 - val_loss: 0.3324
Epoch 3/20
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.8717 - loss: 0.3168 - val_accuracy: 0.8657 - val_loss: 0.2754
Epoch 4/20
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.8708 - loss: 0.2616 - val_accuracy: 0.8657 - val_loss: 0.2325
Epoch 5/20
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.8799 - loss: 0.2289 - val_accuracy: 0.9468 - val_loss: 0.1858
Epoch 6/20
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.9586 - loss: 0.1850 - val_accuracy: 0.9648 - 



Fully Connected Neural Network layer Processed Data train :
Epoch 1/20
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 28ms/step - accuracy: 0.7353 - loss: 0.6437 - val_accuracy: 0.8657 - val_loss: 0.3737
Epoch 2/20
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.8582 - loss: 0.3801 - val_accuracy: 0.8664 - val_loss: 0.3367
Epoch 3/20
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.8677 - loss: 0.3318 - val_accuracy: 0.8664 - val_loss: 0.2812
Epoch 4/20
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.8699 - loss: 0.2808 - val_accuracy: 0.8664 - val_loss: 0.2345
Epoch 5/20
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.8726 - loss: 0.2412 - val_accuracy: 0.9353 - val_loss: 0.1864
Epoch 6/20
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.9505 - loss: 0.1932 - val_accuracy: 0.9



Fully Connected Neural Network layer Processed Data with Lemmatization train :
Epoch 1/20
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 32ms/step - accuracy: 0.7967 - loss: 0.5504 - val_accuracy: 0.8664 - val_loss: 0.3565
Epoch 2/20
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.8728 - loss: 0.3360 - val_accuracy: 0.8664 - val_loss: 0.3023
Epoch 3/20
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.8662 - loss: 0.3013 - val_accuracy: 0.8664 - val_loss: 0.2429
Epoch 4/20
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.8843 - loss: 0.2415 - val_accuracy: 0.9526 - val_loss: 0.1938
Epoch 5/20
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.9465 - loss: 0.2088 - val_accuracy: 0.9555 - val_loss: 0.1604
Epoch 6/20
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.9623 - loss: 0.1725 



Fully Connected Neural Network layer Processed Data with Stemming train :
Epoch 1/20
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 38ms/step - accuracy: 0.8538 - loss: 0.5304 - val_accuracy: 0.8664 - val_loss: 0.3520
Epoch 2/20
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.8775 - loss: 0.3303 - val_accuracy: 0.8664 - val_loss: 0.3035
Epoch 3/20
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.8672 - loss: 0.3001 - val_accuracy: 0.8664 - val_loss: 0.2398
Epoch 4/20
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.8662 - loss: 0.2645 - val_accuracy: 0.9253 - val_loss: 0.1871
Epoch 5/20
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.9521 - loss: 0.1956 - val_accuracy: 0.9691 - val_loss: 0.1356
Epoch 6/20
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.9698 - loss: 0.1437 - val



Bidirectional Long Short-Term Memory (LSTM) network Raw Data train :
Epoch 1/20
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 48ms/step - accuracy: 0.7973 - loss: 0.6697 - val_accuracy: 0.8657 - val_loss: 0.5429
Epoch 2/20
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 22ms/step - accuracy: 0.8567 - loss: 0.4927 - val_accuracy: 0.8657 - val_loss: 0.3900
Epoch 3/20
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 22ms/step - accuracy: 0.8648 - loss: 0.3806 - val_accuracy: 0.8657 - val_loss: 0.3372
Epoch 4/20
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 22ms/step - accuracy: 0.8675 - loss: 0.3236 - val_accuracy: 0.8657 - val_loss: 0.2805
Epoch 5/20
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 22ms/step - accuracy: 0.8737 - loss: 0.2590 - val_accuracy: 0.9397 - val_loss: 0.2179
Epoch 6/20
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 23ms/step - accuracy: 0.9600 - loss: 0.2015 - val



Bidirectional Long Short-Term Memory (LSTM) network Processed Data train :
Epoch 1/20
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 36ms/step - accuracy: 0.6750 - loss: 0.6782 - val_accuracy: 0.8664 - val_loss: 0.5583
Epoch 2/20
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 22ms/step - accuracy: 0.8662 - loss: 0.5000 - val_accuracy: 0.8664 - val_loss: 0.3757
Epoch 3/20
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 22ms/step - accuracy: 0.8558 - loss: 0.3772 - val_accuracy: 0.8664 - val_loss: 0.3131
Epoch 4/20
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 23ms/step - accuracy: 0.8620 - loss: 0.3134 - val_accuracy: 0.8664 - val_loss: 0.2603
Epoch 5/20
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 22ms/step - accuracy: 0.8695 - loss: 0.2639 - val_accuracy: 0.9476 - val_loss: 0.1884
Epoch 6/20
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 23ms/step - accuracy: 0.9661 - loss: 0.1725



Bidirectional Long Short-Term Memory (LSTM) network Processed Data with Lemmatization train :
Epoch 1/20
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 36ms/step - accuracy: 0.8163 - loss: 0.6230 - val_accuracy: 0.8664 - val_loss: 0.3830
Epoch 2/20
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 29ms/step - accuracy: 0.8634 - loss: 0.3667 - val_accuracy: 0.8664 - val_loss: 0.3192
Epoch 3/20
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 30ms/step - accuracy: 0.8639 - loss: 0.3216 - val_accuracy: 0.8664 - val_loss: 0.2733
Epoch 4/20
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 23ms/step - accuracy: 0.8706 - loss: 0.2698 - val_accuracy: 0.8664 - val_loss: 0.2206
Epoch 5/20
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 22ms/step - accuracy: 0.8943 - loss: 0.2301 - val_accuracy: 0.9634 - val_loss: 0.1592
Epoch 6/20
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 22ms/step - accuracy: 0.



Bidirectional Long Short-Term Memory (LSTM) network Processed Data with Stemming train :
Epoch 1/20
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 36ms/step - accuracy: 0.8535 - loss: 0.5735 - val_accuracy: 0.8664 - val_loss: 0.3547
Epoch 2/20
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 22ms/step - accuracy: 0.8716 - loss: 0.3421 - val_accuracy: 0.8664 - val_loss: 0.3049
Epoch 3/20
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 22ms/step - accuracy: 0.8642 - loss: 0.3053 - val_accuracy: 0.8664 - val_loss: 0.2395
Epoch 4/20
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 22ms/step - accuracy: 0.8771 - loss: 0.2532 - val_accuracy: 0.9583 - val_loss: 0.1648
Epoch 5/20
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 22ms/step - accuracy: 0.9643 - loss: 0.1656 - val_accuracy: 0.9749 - val_loss: 0.1132
Epoch 6/20
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 25ms/step - accuracy: 0.9764 

In [69]:
# Initialize results storage
results = {
    "Accuracy": [],
    "AUC": [],
    "F1": [],
    "Model": [],
    "Dataset": []
}

# Loop through models and datasets
for model_name, model_dict in trained_models.items():
    for set_name, model in model_dict.items():
        # Use the corresponding validation set and labels
        val_set = val_sets[Set_names.index(set_name)]
        y_val_set = y_val_sets[Set_names.index(set_name)]

        # Make predictions
        y_predicted = model.predict(val_set)
        y_pred = (y_predicted > 0.5).astype(int)

        # Calculate metrics
        accuracy = accuracy_score(y_val_set, y_pred)
        auc = roc_auc_score(y_val_set, y_predicted)
        f1 = f1_score(y_val_set, y_pred)

        # Store results
        results["Accuracy"].append(accuracy)
        results["AUC"].append(auc)
        results["F1"].append(f1)
        results["Model"].append(model_name)
        results["Dataset"].append(set_name)

[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step


In [70]:
# Create the visualizations
fig1 = go.Figure()

# Accuracy plot
for model in trained_models.keys():
    fig1.add_trace(go.Bar(
        x=[set_name for set_name in Set_names],
        y=[results["Accuracy"][i] for i, m in enumerate(results["Model"]) if m == model],
        name=model,
        text=[f"{results['Accuracy'][i]:.4f}" for i, m in enumerate(results["Model"]) if m == model],  # Rounded text
        textposition='outside'  # Place text above the bars
    ))

fig1.update_layout(
    title="Comparison of Accuracy Across Models and Datasets",
    xaxis_title="Dataset",
    yaxis_title="Accuracy",
    barmode='group',
    yaxis=dict(range=[0.90, 1])  # Y-axis range set from 0.95 to 1
)

fig2 = go.Figure()

# AUC plot
for model in trained_models.keys():
    fig2.add_trace(go.Bar(
        x=[set_name for set_name in Set_names],
        y=[results["AUC"][i] for i, m in enumerate(results["Model"]) if m == model],
        name=model,
        text=[f"{results['AUC'][i]:.4f}" for i, m in enumerate(results["Model"]) if m == model],  # Rounded text
        textposition='outside'
    ))

fig2.update_layout(
    title="Comparison of AUC Across Models and Datasets",
    xaxis_title="Dataset",
    yaxis_title="AUC",
    barmode='group',
    yaxis=dict(range=[0.90, 1])  # Y-axis range set from 0.95 to 1
)

fig3 = go.Figure()

# F1 plot
for model in trained_models.keys():
    fig3.add_trace(go.Bar(
        x=[set_name for set_name in Set_names],
        y=[results["F1"][i] for i, m in enumerate(results["Model"]) if m == model],
        name=model,
        text=[f"{results['F1'][i]:.4f}" for i, m in enumerate(results["Model"]) if m == model],  # Rounded text
        textposition='outside'
    ))

fig3.update_layout(
    title="Comparison of F1 Score Across Models and Datasets",
    xaxis_title="Dataset",
    yaxis_title="F1 Score",
    barmode='group',
    yaxis=dict(range=[0.90, 1])  # Y-axis range set from 0.95 to 1
)

# Show figures
fig1.show()
fig2.show()
fig3.show()