In [159]:
import pandas as pd
import re
import pickle
import json

from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [160]:
df = pd.read_csv(r"D:\PTITHCM\Mon hoc\HK7(2024-2025)\IoT_And_Application\Final\EmailSpamClassifier_Web\datasets\spam.csv", encoding='latin-1')
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [161]:
# drop unnecessary columns
df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], inplace=True)
# check data after dropping
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [162]:
# rename column for suitable name
df = df.rename(columns={'v1':'Labels', 'v2':'Messages'})
df.head()

Unnamed: 0,Labels,Messages
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [163]:
# check missing value
df.isnull().sum()

Labels      0
Messages    0
dtype: int64

In [164]:
# check duplicate rows
df.duplicated(subset='Messages').sum()

403

In [165]:
# remove duplicate rows, keep the first row
df.drop_duplicates(subset='Messages', keep='first', inplace=True)

In [166]:
# check duplicate rows again
df.duplicated().sum()

0

In [167]:
# set spam = 0, ham = 1
df.loc[df['Labels'] == 'spam', 'Labels',] = 0
df.loc[df['Labels'] == 'ham', 'Labels',] = 1

In [168]:
# separate X (features) and Y (labels)
X = df['Messages']
Y = df['Labels']

In [169]:
def preprocess_text(text):
    # remove all non-ASCII characters (retain only standard English characters)
    text = re.sub(r'[^\x00-\x7F]+', '', text)
    # convert to lowercase for convenient calculating
    text = text.lower()
    # strip leading and trailing whitespace
    text = text.strip()

    return text

In [170]:
# apply the preprocessing function to the message column
df['Messages'] = df['Messages'].apply(preprocess_text)

In [171]:
# save the processed data to a new CSV file
df.to_csv(r"D:\PTITHCM\Mon hoc\HK7(2024-2025)\IoT_And_Application\Final\EmailSpamClassifier_Web\datasets\results.csv", encoding="latin-1", index=False, columns=['Labels', 'Messages'])

In [172]:
# feature extraction using TF-IDF
feature_extraction = TfidfVectorizer(min_df=1, stop_words='english', lowercase=True)

In [173]:
# split the data into training and testing sets (80% train, 20% test)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=3)

In [174]:
# apply TF-IDF vectorization to the training and testing data
X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

In [175]:
# convert target labels to integer format
Y_train = Y_train.astype('int')
Y_test = Y_test.astype('int')

In [176]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import load_model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout

In [177]:
# initialize the Tokenizer and fit it on the training data
tokenizer = Tokenizer(num_words=500)
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

In [178]:
with open(r"D:\PTITHCM\Mon hoc\HK7(2024-2025)\IoT_And_Application\Final\EmailSpamClassifier_Web\models\tokenizer.pkl", 'wb') as f:
    pickle.dump(tokenizer, f)

In [179]:
# pad sequences to ensure they all have the same length
max_len = max([len(x.split()) for x in X_train]) # Find the maximum text length in the training data 
X_train_padded = pad_sequences(X_train_seq, maxlen=max_len, padding='post')
X_test_padded = pad_sequences(X_test_seq, maxlen=max_len, padding='post')

with open(r"D:\PTITHCM\Mon hoc\HK7(2024-2025)\IoT_And_Application\Final\EmailSpamClassifier_Web\models\config.json", 'w') as f:
    json.dump({'max_len': max_len}, f)

In [180]:
# build the CNN model
cnnModel = Sequential([
    # embedding layer
    Embedding(input_dim=800, output_dim=128),
    # convolutional layer
    Conv1D(128, 3, activation='relu'),
    # another convolutional layer
    Conv1D(128, 3, activation='relu'),
    # pooling layer
    GlobalMaxPooling1D(),
    # fully connected layer
    Dense(64, activation='relu'),
    # dropout for regularization
    Dropout(0.5),
    # additional dense layer
    Dense(32, activation='relu'),
    Dropout(0.5),
    # output layer for binary classification
    Dense(1, activation='sigmoid') # use for binary prediction
])

In [181]:
# compile the model
cnnModel.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [182]:
# count the number of ham and spam emails
category_counts = df['Labels'].value_counts()

# Print the number of ham and spam emails
print(f"Number of ham emails: {category_counts[1]}")
print(f"Number of spam emails: {category_counts[0]}")

Number of ham emails: 4516
Number of spam emails: 653


In [183]:
# train the CNN model
cnnModel.fit(X_train_padded, Y_train, epochs=20, batch_size=64, validation_data=(X_test_padded, Y_test))

Epoch 1/20
[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 29ms/step - accuracy: 0.8555 - loss: 0.4770 - val_accuracy: 0.8646 - val_loss: 0.3073
Epoch 2/20
[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 27ms/step - accuracy: 0.8872 - loss: 0.2368 - val_accuracy: 0.9681 - val_loss: 0.1244
Epoch 3/20
[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 28ms/step - accuracy: 0.9732 - loss: 0.0762 - val_accuracy: 0.9729 - val_loss: 0.0987
Epoch 4/20
[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 26ms/step - accuracy: 0.9866 - loss: 0.0427 - val_accuracy: 0.9778 - val_loss: 0.1305
Epoch 5/20
[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 28ms/step - accuracy: 0.9936 - loss: 0.0315 - val_accuracy: 0.9768 - val_loss: 0.1469
Epoch 6/20
[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 26ms/step - accuracy: 0.9972 - loss: 0.0164 - val_accuracy: 0.9729 - val_loss: 0.1554
Epoch 7/20
[1m65/65[0m [32m━━━━

<keras.src.callbacks.history.History at 0x140b2fd08f0>

In [184]:
# evaluate the model on test data
loss, accuracy = cnnModel.evaluate(X_test_padded, Y_test, batch_size=64)
print(f"Loss on test data: {loss}")
print(f"Accuracy on test data: {accuracy}")

[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.9694 - loss: 0.4635
Loss on test data: 0.4507347643375397
Accuracy on test data: 0.9709864854812622


In [185]:
# make predictions on the test set
y_prediction = (cnnModel.predict(X_test_padded) > 0.5).astype(int)
# details report
print(classification_report(Y_test, y_prediction, target_names=['spam', 'ham']))

[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step
              precision    recall  f1-score   support

        spam       0.89      0.89      0.89       140
         ham       0.98      0.98      0.98       894

    accuracy                           0.97      1034
   macro avg       0.94      0.94      0.94      1034
weighted avg       0.97      0.97      0.97      1034



In [186]:
# save the trained model
cnnModel.save(r"D:\PTITHCM\Mon hoc\HK7(2024-2025)\IoT_And_Application\Final\EmailSpamClassifier_Web\models\cnn_spam_classifier.keras")
print("Save successfully")

Save successfully


In [187]:
# load the saved model
loadedCnnModel = load_model(r"D:\PTITHCM\Mon hoc\HK7(2024-2025)\IoT_And_Application\Final\EmailSpamClassifier_Web\models\cnn_spam_classifier.keras")

In [188]:
# test the model with new messages
new_message =["Congratulations! You've won a $1000 gift card. Claim it now by clicking this link.", 
              "Hello, Mr Linh, you have received a wonderful chance up to 1 billion dollar by clicking on this link.", 
              "Please confirm your attendance to the meeting.", 
              "Win a $1000 award now!", 
              "Hey James, remember to come to the party tomorrow!",
              "If you want to receive a gift up to $10000, click on this link www.reward.org/gift", 
              "Please confirm your attendance for tomorrow's meeting"]
new_message_seq = tokenizer.texts_to_sequences(new_message)
new_message_padded = pad_sequences(new_message_seq, maxlen=max_len, padding='post')

In [189]:
# make predictions on the new messages
cnnPrediction = loadedCnnModel.predict(new_message_padded)
print("Predict by CNN: ", (cnnPrediction > 0.5).astype('int'))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 87ms/step
Predict by CNN:  [[0]
 [0]
 [1]
 [0]
 [1]
 [0]
 [1]]
