In [264]:
import pandas as pd
import re
import pickle
import json
import tensorflow as tf
import numpy as np
import random

from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [265]:
df = pd.read_csv(r"D:\PTITHCM\Mon hoc\HK7(2024-2025)\IoT_And_Application\Final\EmailClassifier\datasets\spam.csv", encoding='latin-1')
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [266]:
# drop unnecessary columns
df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], inplace=True)
# check data after dropping
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [267]:
# rename column for suitable name
df = df.rename(columns={'v1':'Labels', 'v2':'Messages'})
df.head()

Unnamed: 0,Labels,Messages
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [268]:
# check missing value
df.isnull().sum()

Labels      0
Messages    0
dtype: int64

In [269]:
# check duplicate rows
df.duplicated(subset='Messages').sum()

403

In [270]:
# remove duplicate rows, keep the first row
df.drop_duplicates(subset='Messages', keep='first', inplace=True)

In [271]:
# check duplicate rows again
df.duplicated().sum()

0

In [272]:
# set spam = 0, ham = 1
df.loc[df['Labels'] == 'spam', 'Labels',] = 0
df.loc[df['Labels'] == 'ham', 'Labels',] = 1

In [273]:
# separate X (features) and Y (labels)
X = df['Messages']
Y = df['Labels']

In [274]:
def preprocess_text(text):
    # remove all non-ASCII characters (retain only standard English characters)
    text = re.sub(r'[^\x00-\x7F]+', '', text)
    # convert to lowercase for convenient calculating
    text = text.lower()
    # strip leading and trailing whitespace
    text = text.strip()

    return text

In [275]:
# apply the preprocessing function to the message column
X = X.apply(preprocess_text)

In [276]:
# save the processed data to a new CSV file
df.to_csv(r"D:\PTITHCM\Mon hoc\HK7(2024-2025)\IoT_And_Application\Final\EmailClassifier\datasets\results.csv", encoding="latin-1", index=False, columns=['Labels', 'Messages'])

In [277]:
# feature extraction using TF-IDF
feature_extraction = TfidfVectorizer(min_df=1, stop_words='english', lowercase=True)

In [278]:
# split the data into training and testing sets (80% train, 20% test)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=3)

In [279]:
# apply TF-IDF vectorization to the training and testing data
X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

In [280]:
# convert target labels to integer format
Y_train = Y_train.astype('int')
Y_test = Y_test.astype('int')

In [281]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import load_model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout

In [282]:
# initialize the Tokenizer and fit it on the training data
tokenizer = Tokenizer(num_words=500)
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

In [283]:
with open(r"D:\PTITHCM\Mon hoc\HK7(2024-2025)\IoT_And_Application\Final\EmailClassifier\models\tokenizer.pkl", 'wb') as f:
    pickle.dump(tokenizer, f)

In [284]:
# pad sequences to ensure they all have the same length
max_len = max([len(x.split()) for x in X_train]) # Find the maximum text length in the training data 
X_train_padded = pad_sequences(X_train_seq, maxlen=max_len, padding='post')
X_test_padded = pad_sequences(X_test_seq, maxlen=max_len, padding='post')

with open(r"D:\PTITHCM\Mon hoc\HK7(2024-2025)\IoT_And_Application\Final\EmailClassifier\models\config.json", 'w') as f:
    json.dump({'max_len': max_len}, f)

In [285]:
SEED = 42
tf.random.set_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

In [286]:
# build the CNN model
cnnModel = Sequential([
    # embedding layer
    Embedding(input_dim=800, output_dim=128),
    # convolutional layer
    Conv1D(128, 3, activation='relu'),
    # another convolutional layer
    Conv1D(128, 3, activation='relu'),
    # pooling layer
    GlobalMaxPooling1D(),
    # fully connected layer
    Dense(64, activation='relu'),
    # dropout for regularization
    Dropout(0.5),
    # additional dense layer
    Dense(32, activation='relu'),
    Dropout(0.5),
    # output layer for binary classification
    Dense(1, activation='sigmoid') # use for binary prediction
])

In [287]:
# compile the model
cnnModel.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [288]:
# count the number of ham and spam emails
category_counts = df['Labels'].value_counts()

# Print the number of ham and spam emails
print(f"Number of ham emails: {category_counts[1]}")
print(f"Number of spam emails: {category_counts[0]}")

Number of ham emails: 4516
Number of spam emails: 769


In [289]:
# train the CNN model
cnnModel.fit(X_train_padded, Y_train, epochs=20, batch_size=64, validation_data=(X_test_padded, Y_test))

Epoch 1/20


[1m67/67[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 41ms/step - accuracy: 0.8117 - loss: 0.5277 - val_accuracy: 0.8515 - val_loss: 0.2765
Epoch 2/20
[1m67/67[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 35ms/step - accuracy: 0.8683 - loss: 0.2396 - val_accuracy: 0.9669 - val_loss: 0.1088
Epoch 3/20
[1m67/67[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 32ms/step - accuracy: 0.9707 - loss: 0.1045 - val_accuracy: 0.9716 - val_loss: 0.1237
Epoch 4/20
[1m67/67[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 32ms/step - accuracy: 0.9849 - loss: 0.0599 - val_accuracy: 0.9745 - val_loss: 0.1368
Epoch 5/20
[1m67/67[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 34ms/step - accuracy: 0.9891 - loss: 0.0495 - val_accuracy: 0.9735 - val_loss: 0.1410
Epoch 6/20
[1m67/67[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 32ms/step - accuracy: 0.9937 - loss: 0.0306 - val_accuracy: 0.9631 - val_loss: 0.1759
Epoch 7/20
[1m67/67[0m [32m━━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x118f14d1f40>

In [290]:
# evaluate the model on test data
loss, accuracy = cnnModel.evaluate(X_test_padded, Y_test, batch_size=64)
print(f"Loss on test data: {loss}")
print(f"Accuracy on test data: {accuracy}")

[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.9707 - loss: 0.5084
Loss on test data: 0.4404020607471466
Accuracy on test data: 0.9744560122489929


In [291]:
# make predictions on the test set
y_prediction = (cnnModel.predict(X_test_padded) > 0.5).astype(int)
# details report
print(classification_report(Y_test, y_prediction, target_names=['spam', 'ham']))

[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step
              precision    recall  f1-score   support

        spam       0.95      0.88      0.91       157
         ham       0.98      0.99      0.99       900

    accuracy                           0.97      1057
   macro avg       0.96      0.94      0.95      1057
weighted avg       0.97      0.97      0.97      1057



In [292]:
# save the trained model
cnnModel.save(r"D:\PTITHCM\Mon hoc\HK7(2024-2025)\IoT_And_Application\Final\EmailClassifier\models\cnn_spam_classifier.keras")
print("Save successfully")

Save successfully


In [293]:
# load the saved model
loadedCnnModel = load_model(r"D:\PTITHCM\Mon hoc\HK7(2024-2025)\IoT_And_Application\Final\EmailClassifier\models\cnn_spam_classifier.keras")

In [294]:
# test the model with new messages
new_message =["Congratulations! You've won a $1000 gift card. Claim it now by clicking this link.", 
              "Hello, Mr Linh, you have received a wonderful chance up to 1 billion dollar by clicking on this link.", 
              "Please confirm your attendance to the meeting.", 
              "Win a $1000 award now!", 
              "Hey James, remember to come to the party tomorrow!",
              "If you want to receive a gift up to $10000, click on this link www.reward.org/gift", 
              "Please confirm your attendance for tomorrow's meeting"]
new_message_seq = tokenizer.texts_to_sequences(new_message)
new_message_padded = pad_sequences(new_message_seq, maxlen=max_len, padding='post')

In [295]:
# make predictions on the new messages
cnnPrediction = loadedCnnModel.predict(new_message_padded)
print((cnnPrediction > 0.5).astype('int'))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 104ms/step
[[0]
 [0]
 [1]
 [0]
 [1]
 [0]
 [1]]
