In [1]:
import pandas as pd
import numpy as np
from  sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
print("Hello World")

Hello World


In [3]:
# Read in the input data
# Data gathered from this link: https://www.kaggle.com/datasets/uciml/sms-spam-collection-dataset/data

df = pd.read_csv('data/spam.csv', encoding='latin1')
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [4]:
# Get rid of columns without any data in them
bad_columns = ['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4']
df.drop(bad_columns, axis=1, inplace=True)
df.columns = ['Category', 'Message']

# Check for missing values
df.isna().sum()

Category    0
Message     0
dtype: int64

In [5]:
# Drop duplicated rows
df = df.drop_duplicates(keep='first')
df.duplicated().sum()

0

In [6]:
# Get the number of each category
num_spam = df[df['Category'] == 'spam'].shape[0]
num_non_spam = df[df['Category'] == 'ham'].shape[0]
print("Spam emails: ", num_spam, f" Percent of total: {100 * num_spam / (num_spam + num_non_spam):.1f}%")
print("Non-spam emails: ", num_non_spam, f" Percent of total: {100 * num_non_spam / (num_spam + num_non_spam):.1f}%")

Spam emails:  653  Percent of total: 12.6%
Non-spam emails:  4516  Percent of total: 87.4%


In [7]:
# Convert the category column to a column of 1's denoting spam, 0's denoting not spam.

mapping = {'ham': 0, 'spam': 1}
df['spam'] = df['Category'].map(mapping)
df = df.drop('Category', axis=1)
df.head()

Unnamed: 0,Message,spam
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


In [8]:
# Split the data into a training set and test set
x_train, x_test, y_train, y_test = train_test_split(df['Message'], df['spam'], test_size=0.2)

In [9]:
# Feature extraction
vectorizer = TfidfVectorizer()
x_train_tfidf = vectorizer.fit_transform(x_train)
x_test_tfidf = vectorizer.transform(x_test)

x_train_dense = x_train_tfidf.toarray()
x_test_dense = x_test_tfidf.toarray()

In [17]:
# Build the model
import tensorflow as tf
import keras
from keras import layers
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.regularizers import l1_l2

# Using a simpler, statically trained model is fine here. The classification does not have any internal state so using more complex architectures like RNN's is not necessary.

model = keras.Sequential()
# model.add(Dense(64, activation='relu', kernel_regularizer=l1_l2(l1=0.03, l2=0.00)))
# model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.fit(x_train_dense, y_train, epochs=20, batch_size=1, validation_split=0.2)

Epoch 1/20
[1m3308/3308[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 1ms/step - accuracy: 0.8892 - loss: 0.3163 - val_accuracy: 0.9879 - val_loss: 0.0608
Epoch 2/20
[1m3308/3308[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 1ms/step - accuracy: 0.9830 - loss: 0.0647 - val_accuracy: 0.9867 - val_loss: 0.0412
Epoch 3/20
[1m3308/3308[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 1ms/step - accuracy: 0.9937 - loss: 0.0208 - val_accuracy: 0.9867 - val_loss: 0.0406
Epoch 4/20
[1m3308/3308[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 1ms/step - accuracy: 0.9947 - loss: 0.0142 - val_accuracy: 0.9867 - val_loss: 0.0454
Epoch 5/20
[1m3308/3308[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 1ms/step - accuracy: 0.9960 - loss: 0.0091 - val_accuracy: 0.9879 - val_loss: 0.0498
Epoch 6/20
[1m3308/3308[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 1ms/step - accuracy: 0.9973 - loss: 0.0063 - val_accuracy: 0.9855 - val_loss: 0.0590
Epoch 7/20
[1m3

<keras.src.callbacks.history.History at 0x791868323dd0>

In [18]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Make predictions on the test data
y_pred_probability = model.predict(x_test_dense)
y_pred = (y_pred_probability > 0.3).astype(int)

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 803us/step
Accuracy: 0.9835589941972921
Precision: 0.968
Recall: 0.9029850746268657
F1-score: 0.9343629343629344


In [32]:
def classify_email(email_text):
    email_tfidf = vectorizer.transform([email_text])
    print("Transformed email shape:", email_tfidf.shape)
    prediction = model.predict(email_tfidf)[0]
    print(prediction)
    return "Spam" if prediction >= 0.4 else "Not Spam"

# Example usage
new_email = "yur sfh sdthshfgsf ddf"
result = classify_email(new_email)
print("Email classification:", result)
model.summary()

Transformed email shape: (1, 7611)


2024-05-31 20:35:45.328192: W tensorflow/core/framework/op_kernel.cc:1839] OP_REQUIRES failed at ragged_gather_op.cc:77 : INVALID_ARGUMENT: indices[0] = 0 is not in [0, 0)
2024-05-31 20:35:45.328212: W tensorflow/core/framework/op_kernel.cc:1839] OP_REQUIRES failed at ragged_gather_op.cc:77 : INVALID_ARGUMENT: indices[0] = 0 is not in [0, 0)
2024-05-31 20:35:45.328255: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: INVALID_ARGUMENT: indices[0] = 0 is not in [0, 0)
	 [[{{node RaggedGather_1/RaggedGather}}]]


InvalidArgumentError: {{function_node __wrapped__IteratorGetNext_output_types_1_device_/job:localhost/replica:0/task:0/device:CPU:0}} indices[0] = 0 is not in [0, 0)
	 [[{{node RaggedGather_1/RaggedGather}}]] [Op:IteratorGetNext] name: 

In [23]:
 # Save the model using the pickle library
import pickle

with open('model.pkl', 'wb') as file:
    pickle.dump(model, file)

In [25]:
with open('vectorizer.pkl', 'wb') as file:
    pickle.dump(vectorizer, file)