In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, GlobalAveragePooling1D

In [None]:
# Colab-only helper: mounts the user's Google Drive so the notebook can read
# files under /content/drive. Running this outside Colab will fail on import.
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# List the top level of the mounted Drive to confirm the dataset file is present.
!ls /content/drive/My\ Drive/


'Colab Notebooks'   Resume.pdf		  'WhatsApp Image 2024-10-22 at 23.38.51_5f8e5dd9.jpg'
'Google Earth'	    spam_Emails_data.csv


In [None]:
# Load the raw e-mail export from the mounted Drive folder.
# NOTE(review): the printed columns come back as a single 'label,text' field,
# which suggests the file is comma-delimited rather than tab-delimited. The
# later parsing cells rely on this one-column layout, so confirm before
# changing `sep`.
df = pd.read_csv(
    '/content/drive/My Drive/spam_Emails_data.csv',
    sep='\t',
    header=0,
)
print(df.columns)

Index(['label,text'], dtype='object')


In [None]:
# (rows, columns) of the raw frame as loaded.
df.shape

(834590, 1)

In [None]:
# Peek at the first ten raw rows to see how the file was parsed.
df.head(n=10)

Unnamed: 0,"label,text"
0,"Spam,""viiiiiiagraaaa"
1,only for the ones that want to make her scream .
2,prodigy scrawny crow define upgrade spongy bal...
3,caan ' t do . . .
4,"\nHam,got ice thought look az original message..."
5,easy and imperceptible to take .
6,take just a candy and become ready for 36 hour...
7,? this is most modern and safe way not to cove...
8,? only 15 minutes to wait
9,? fda approved


In [None]:
# Split the raw 'label,text' field into a label and a message body.
#
# Only a "Spam," / "Ham," prefix at the very START of the field (optionally
# preceded by whitespace such as a stray newline) is treated as the label.
# A substring test anywhere in the field would mislabel messages whose body
# merely mentions "Spam"/"Ham", and a global replace of "Spam,"/"Ham," would
# also delete those characters from inside the body (e.g. "Graham," -> "Gra").
# Missing values are converted to strings first (NaN becomes the text 'nan').
raw_field = df['label,text'].astype(str)

# (?s) lets '.' match newlines so the entire remainder of the field is captured.
# Group 0 -> label, group 1 -> body; both are NaN when there is no leading label.
label_and_body = raw_field.str.extract(r'(?s)^\s*(Spam|Ham),(.*)')

# Rows without a leading label keep NaN here.
df['label'] = label_and_body[0]

# For labelled rows keep everything after the prefix; for unlabelled rows keep
# the whole field. Surrounding whitespace is trimmed in both cases.
df['text'] = label_and_body[1].fillna(raw_field).str.strip()

# Display the cleaned DataFrame
df.head(10)

Unnamed: 0,"label,text",label,text
0,"Spam,""viiiiiiagraaaa",Spam,"""viiiiiiagraaaa"
1,only for the ones that want to make her scream .,,only for the ones that want to make her scream .
2,prodigy scrawny crow define upgrade spongy bal...,,prodigy scrawny crow define upgrade spongy bal...
3,caan ' t do . . .,,caan ' t do . . .
4,"\nHam,got ice thought look az original message...",Spam,got ice thought look az original message ice o...
5,easy and imperceptible to take .,,easy and imperceptible to take .
6,take just a candy and become ready for 36 hour...,,take just a candy and become ready for 36 hour...
7,? this is most modern and safe way not to cove...,,? this is most modern and safe way not to cove...
8,? only 15 minutes to wait,,? only 15 minutes to wait
9,? fda approved,,? fda approved


In [None]:
# Keep only the rows for which a label was recovered.
df_cleaned = df[df['label'].notna()]

In [None]:
# Inspect the first ten labelled rows.
df_cleaned.head(n=10)

Unnamed: 0,"label,text",label,text
0,"Spam,""viiiiiiagraaaa",Spam,"""viiiiiiagraaaa"
4,"\nHam,got ice thought look az original message...",Spam,got ice thought look az original message ice o...
12,"\nHam,author jra date escapenumber escapenumbe...",Spam,author jra date escapenumber escapenumber esca...
17,"Ham,this is the version that we created earlie...",Ham,this is the version that we created earlier th...
18,"Ham,pulp writing printing paper escapenumber e...",Ham,pulp writing printing paper escapenumber escap...
19,"Ham, stefan metze metzmacher writes stefan met...",Ham,stefan metze metzmacher writes stefan metze me...
20,"Ham,""hey there - - life sounds horribly busy ....",Ham,"""hey there - - life sounds horribly busy . i f..."
69,"Ham,you'd think a firewall would catch all my ...",Ham,you'd think a firewall would catch all my emai...
70,"Spam,luckyday lottery international internatio...",Spam,luckyday lottery international international p...
71,"Spam, ti pvc jgjrkwhr j p tc bxff cou s roxly ...",Spam,ti pvc jgjrkwhr j p tc bxff cou s roxly exgxj ...


In [None]:
# (rows, columns) remaining after dropping unlabelled rows.
df_cleaned.shape

(147719, 3)

In [None]:
# Class balance: number of rows per label.
label_column = df_cleaned['label']
group_counts = label_column.value_counts()
print(group_counts)

label
Ham     74537
Spam    73182
Name: count, dtype: int64


In [None]:
# Show just the parsed columns. Selecting by name (rather than by position)
# keeps this view correct even if columns are added or reordered.
df_cleaned[['label', 'text']]

Unnamed: 0,label,text
0,Spam,"""viiiiiiagraaaa"
4,Spam,got ice thought look az original message ice o...
12,Spam,author jra date escapenumber escapenumber esca...
17,Ham,this is the version that we created earlier th...
18,Ham,pulp writing printing paper escapenumber escap...
...,...,...
834585,Ham,on escapenumber escapenumber escapenumber rob ...
834586,Spam,we have everything you need escapelong cialesc...
834587,Ham,hi quick question say i have a date variable i...
834588,Spam,thank you for your loan request which we recie...


In [None]:
# Work on an independent copy so adding a column does not touch the frame
# this one was filtered from.
df_cleaned = df_cleaned.copy()

# Binary target: Ham -> 0, Spam -> 1.
label_to_id = {'Ham': 0, 'Spam': 1}
df_cleaned['label_encoded'] = df_cleaned['label'].map(label_to_id)

msg_label = df_cleaned['label_encoded'].to_numpy()

# 80/20 split with a fixed seed so the partition is repeatable.
train_msg, test_msg, train_labels, test_labels = train_test_split(
    df_cleaned['text'],
    msg_label,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Text-vectorisation settings.
vocab_size = 500        # passed to the tokenizer as num_words
max_len = 50            # every sequence is padded/truncated to this length
oov_tok = '<OOV>'       # placeholder token for out-of-vocabulary words
padding_type = 'post'   # pad at the end of short sequences
trunc_type = 'post'     # cut from the end of long sequences

# Fit the word index on the training messages only.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
train_texts = train_msg.astype(str)
test_texts = test_msg.astype(str)
tokenizer.fit_on_texts(train_texts)

# Shared padding options for both splits.
pad_options = dict(maxlen=max_len, padding=padding_type, truncating=trunc_type)

# Convert texts to integer sequences, then to fixed-length arrays.
train_seq = tokenizer.texts_to_sequences(train_texts)
train_pad = pad_sequences(train_seq, **pad_options)

test_seq = tokenizer.texts_to_sequences(test_texts)
test_pad = pad_sequences(test_seq, **pad_options)

In [None]:
# Small bag-of-embeddings classifier: embed tokens, average over the sequence,
# pass through one hidden layer, and emit a single sigmoid output.
embedding_dim = 16  # size of each token embedding vector

model = Sequential([
    Embedding(vocab_size, embedding_dim),
    GlobalAveragePooling1D(),
    Dense(24, activation='relu'),
    Dense(1, activation='sigmoid'),
])

model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Stop once validation loss has not improved for 3 epochs and roll the model
# back to the best epoch's weights; without restore_best_weights the model
# keeps the weights from the last (worse) epochs.
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
    verbose=1,
)

# Validate on a slice held out from the TRAINING data rather than on the test
# set, so early stopping does not tune against the data used for the final
# evaluation. validation_split takes the last 10% of the provided arrays.
history = model.fit(
    train_pad,
    train_labels,
    epochs=10,
    validation_split=0.1,
    callbacks=[early_stop],
    verbose=2,
)

Epoch 1/10
3693/3693 - 9s - 2ms/step - accuracy: 0.8286 - loss: 0.3888 - val_accuracy: 0.8462 - val_loss: 0.3509
Epoch 2/10
3693/3693 - 6s - 2ms/step - accuracy: 0.8524 - loss: 0.3430 - val_accuracy: 0.8532 - val_loss: 0.3420
Epoch 3/10
3693/3693 - 11s - 3ms/step - accuracy: 0.8562 - loss: 0.3347 - val_accuracy: 0.8577 - val_loss: 0.3256
Epoch 4/10
3693/3693 - 6s - 2ms/step - accuracy: 0.8581 - loss: 0.3285 - val_accuracy: 0.8568 - val_loss: 0.3251
Epoch 5/10
3693/3693 - 10s - 3ms/step - accuracy: 0.8609 - loss: 0.3227 - val_accuracy: 0.8588 - val_loss: 0.3233
Epoch 6/10
3693/3693 - 10s - 3ms/step - accuracy: 0.8629 - loss: 0.3186 - val_accuracy: 0.8659 - val_loss: 0.3156
Epoch 7/10
3693/3693 - 12s - 3ms/step - accuracy: 0.8650 - loss: 0.3141 - val_accuracy: 0.8630 - val_loss: 0.3197
Epoch 8/10
3693/3693 - 10s - 3ms/step - accuracy: 0.8678 - loss: 0.3106 - val_accuracy: 0.8604 - val_loss: 0.3304
Epoch 9/10
3693/3693 - 9s - 2ms/step - accuracy: 0.8699 - loss: 0.3071 - val_accuracy: 0.86

In [None]:
# Score the trained model on the held-out test split; evaluate() returns the
# loss followed by the compiled metrics (accuracy here).
loss, accuracy = model.evaluate(x=test_pad, y=test_labels)
print(f"Model loss: {loss}, Model accuracy: {accuracy}")

[1m924/924[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.8657 - loss: 0.3234
Model loss: 0.3177762031555176, Model accuracy: 0.8666734099388123


In [None]:
def predict_spam(predict_msg):
    """Classify a single message and print the verdict.

    The message is tokenised with the notebook-level ``tokenizer``, padded or
    truncated using ``max_len``/``padding_type``/``trunc_type``, and scored by
    the notebook-level ``model``. A score above 0.5 prints "Spam Email",
    anything else prints "Not Spam Email".

    Args:
        predict_msg: The raw message text to classify.

    Returns:
        None. The result is printed.
    """
    token_ids = tokenizer.texts_to_sequences([predict_msg])
    model_input = pad_sequences(
        token_ids,
        maxlen=max_len,
        padding=padding_type,
        truncating=trunc_type,
    )
    # One input row, one output unit -> take the single scalar score.
    score = model.predict(model_input)[0][0]
    verdict = "Spam Email" if score > 0.5 else "Not Spam Email"
    print(verdict)

In [None]:
# Example: subscription/charge notice.
predict_msg = (
    "Thanks for your subscription to Ringtone UK your mobile will be charged £5/month Please confirm by replying YES or NO. If you reply NO you will not be charged"
)
predict_spam(predict_msg)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 78ms/step
Spam Email


In [None]:
# Example: prize / gift-card lure.
predict_msg = (
    "Congratulations! You've won a $1000 Walmart gift card. Click here to claim your prize."
)
predict_spam(predict_msg)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
Spam Email


In [None]:
# Example: ordinary work e-mail.
predict_msg = (
    "Hi John, can we reschedule our meeting to 3 PM tomorrow? Let me know if that works for you."
)
predict_spam(predict_msg)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
Not Spam Email


In [None]:
# Example: cash-prize message asking for bank details.
predict_msg = (
    "You are selected for a $500 cash prize. Reply with your bank details to receive the amount instantly."
)
predict_spam(predict_msg)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
Spam Email


In [None]:
# Example: short gift-card lure.
predict_msg = (
    "Congratulations! You have won a $1000 gift card!"
)
predict_spam(predict_msg)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
Spam Email
