In [1]:
import pandas as pd
import numpy as np
from  sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Smoke-test cell: emits a fixed greeting.
print('Hello World')

In [3]:
# Load the SMS spam collection into a DataFrame.
# Source: https://www.kaggle.com/datasets/uciml/sms-spam-collection-dataset/data
# The file is decoded as latin1 (passed explicitly below).

df = pd.read_csv(
    'data/spam.csv',
    encoding='latin1',
)
# Preview the first rows
df.head(n=5)

In [4]:
# Remove the placeholder columns that carry no data, then give the remaining
# two columns readable names.
bad_columns = ['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4']
df = df.drop(columns=bad_columns)
df.columns = ['Category', 'Message']

# Per-column count of missing values
df.isnull().sum()

In [5]:
# Remove repeated rows; drop_duplicates keeps the first occurrence by default
df = df.drop_duplicates()
# Number of duplicated rows still present after the clean-up
df.duplicated().sum()

In [6]:
# Count the rows in each category and report each one's share of the two combined
num_spam = int((df['Category'] == 'spam').sum())
num_non_spam = int((df['Category'] == 'ham').sum())
num_total = num_spam + num_non_spam
print("Spam emails: ", num_spam, f" Percent of total: {100 * num_spam / num_total:.1f}%")
print("Non-spam emails: ", num_non_spam, f" Percent of total: {100 * num_non_spam / num_total:.1f}%")

In [7]:
# Replace the text label with a numeric target column: 1 means spam, 0 means not spam.

mapping = {'ham': 0, 'spam': 1}
df = (
    df
    .assign(spam=df['Category'].map(mapping))
    .drop(columns='Category')
)
df.head()

In [8]:
# Split the data into a training set (80%) and a test set (20%).
# random_state pins the shuffle so every Restart & Run All produces the same split,
# which keeps the metrics computed later comparable between runs.
# stratify keeps the spam / not-spam ratio the same in both subsets, guarding against
# a skewed test set when one class is rare.
x_train, x_test, y_train, y_test = train_test_split(
    df['Message'],
    df['spam'],
    test_size=0.2,
    random_state=42,
    stratify=df['spam'],
)

In [9]:
# Turn the raw message text into TF-IDF feature vectors.
vectorizer = TfidfVectorizer()
# Fit on the training messages only; the test messages are transformed with the
# already-fitted vectorizer.
x_train_tfidf = vectorizer.fit_transform(x_train)
x_test_tfidf = vectorizer.transform(x_test)

# Dense copies of both sparse matrices
x_train_dense, x_test_dense = (
    sparse_matrix.toarray() for sparse_matrix in (x_train_tfidf, x_test_tfidf)
)

In [None]:
# Build the model
import tensorflow as tf
import keras
from keras import layers
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.regularizers import l1_l2

# Seed the Python, NumPy and backend random generators so that weight initialisation,
# dropout and batch shuffling are repeatable across kernel restarts.
keras.utils.set_random_seed(42)

# A simple feed-forward network is sufficient here: each message is classified
# independently from its TF-IDF vector, so there is no sequential state that would
# call for a more complex architecture such as an RNN.

# All layers come from the same `keras` package as `Sequential`, so the model never
# mixes objects from two different Keras installations.
model = keras.Sequential()
# Disabled alternative first layer with L1 regularisation:
# model.add(Dense(64, activation='relu', kernel_regularizer=l1_l2(l1=0.03, l2=0.00)))
# model.add(Dropout(0.5))
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dropout(0.5))
# Single sigmoid unit: the output is the predicted probability of the positive (spam) class.
model.add(layers.Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# batch_size=32 instead of 1: a batch size of 1 performs one optimiser step per sample,
# which makes every epoch far slower and the gradient estimates very noisy.
# validation_split holds out the last 20% of the training data for per-epoch validation.
model.fit(x_train_dense, y_train, epochs=20, batch_size=32, validation_split=0.2)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Score the held-out test messages, then turn the predicted probabilities into
# 0/1 labels using a 0.3 cut-off.
y_pred_probability = model.predict(x_test_dense)
y_pred = (y_pred_probability > 0.3).astype(int)

# Compute the four evaluation metrics against the true test labels
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

# Report the metrics, one per line
for label, value in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1-score:", f1),
):
    print(label, value)

In [None]:
def classify_email(email_text, threshold=0.3):
    """Classify a single message as "Spam" or "Not Spam" with the trained model.

    Args:
        email_text: Raw message text. It is vectorised with the already-fitted
            module-level ``vectorizer``.
        threshold: Probability above which the message is labelled spam.
            Defaults to 0.3, the same cut-off the test-set evaluation applies,
            so the metrics reported there describe this function's decisions.

    Returns:
        "Spam" if the predicted spam probability is greater than ``threshold``,
        otherwise "Not Spam".
    """
    # The model is fitted on dense arrays, so densify the sparse TF-IDF row
    # before predicting to feed it the same kind of input.
    email_dense = vectorizer.transform([email_text]).toarray()
    # predict returns one row per input with a single sigmoid output; pull that
    # value out as a plain float so the comparison below is scalar.
    prediction = float(model.predict(email_dense)[0][0])
    print(prediction)
    return "Spam" if prediction > threshold else "Not Spam"

# Demo: run one hand-written message through the classifier and show the verdict
new_email = "Get rich quick! Text GP to this number for more information"
result = classify_email(email_text=new_email)
print("Email classification:", result)