# Imports

In [156]:
import pandas as pd
import string
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
import nltk
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Anthony\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Anthony\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Import and clean the dataset


In [157]:
# Load the e-mail corpus (decoded with the 'latin' codec) and preview the first rows
df = pd.read_csv(
    "spam_dataset.csv",
    encoding="latin",
)
df.head()


Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0


In [158]:
# Clean column names: rename the numeric label column, then keep only the
# label and the raw text (the two columns used by the following cells)
df = df.rename(columns={'label_num': 'label_encoded'})[['label_encoded', 'text']]

df.head()

Unnamed: 0,label_encoded,text
0,0,Subject: enron methanol ; meter # : 988291\r\n...
1,0,"Subject: hpl nom for january 9 , 2001\r\n( see..."
2,0,"Subject: neon retreat\r\nho ho ho , we ' re ar..."
3,1,"Subject: photoshop , windows , office . cheap ..."
4,0,Subject: re : indian springs\r\nthis deal is t...


In [159]:
# Count the tokens in every raw e-mail; the counts are stored in a new column
# that will be used for comparison later
word_counts = [len(nltk.word_tokenize(email_text)) for email_text in df['text']]

df['number_words'] = word_counts
df.sample(3)

Unnamed: 0,label_encoded,text,number_words
375,0,Subject: re : request for application report l...,1334
50,1,Subject: notification of bequest\r\nfrom : dr ...,331
2328,0,Subject: econnect vpn\r\nyou have been approve...,64


# Data Preprocessing

In [160]:
# Stemming algorithm
ps = PorterStemmer()

# English stopwords, materialised once as a set so the per-token membership
# test inside transforming_text does not rebuild the list on every call
stopwords_set = set(stopwords.words('english'))

# 15sec runtime
def transforming_text(text):
    """Normalise raw text into a space-separated string of stemmed tokens.

    The text is lowercased and tokenized with ``nltk.word_tokenize``. Tokens
    are kept only if they are purely alphanumeric and are not English
    stopwords; each surviving token is reduced to its Porter stem.

    Parameters
    ----------
    text : str
        Raw text to preprocess.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces (an empty string when no
        token survives the filters).
    """
    # Convert to lowercase and tokenize
    tokens = nltk.word_tokenize(text.lower())

    # str.isalnum() is False for any token containing a punctuation character,
    # so it already removes punctuation tokens; no separate check against
    # string.punctuation is needed.
    stemmed_tokens = [
        ps.stem(token)
        for token in tokens
        if token.isalnum() and token not in stopwords_set
    ]

    return " ".join(stemmed_tokens)

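# Slower implementations of the same algorithm, left for testing purposes.
# Both call stopwords.words('english') for every token instead of using the precomputed stopwords_set.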
# 2min 30sec runtime
# def transforming_text(text):
#     text = text.lower()
#     text = nltk.word_tokenize(text)
    
#     temp=[]
#     for i in text:
#         if i.isalnum():
#             temp.append(i)
#     text=temp[:]
#     temp.clear()
    
#     for i in text:
#         if i not in stopwords.words('english') and i not in string.punctuation:
#             temp.append(i)
    
#     text=temp[:]
#     temp.clear()
    
#     for i in text:
#         temp.append(ps.stem(i))
        
#     return " ".join(temp)

# 2min 15sec runtime
# def transforming_text(text):
#     # Convert to lowercase and tokenize
#     tokens = nltk.word_tokenize(text.lower())

#     # Remove non-alphanumeric characters
#     tokens = [token for token in tokens if token.isalnum()]

#     # Remove stopwords and punctuation
#     tokens = [token for token in tokens if token not in stopwords.words('english') and token not in string.punctuation]

#     # Perform stemming
#     stemmed_tokens = [ps.stem(token) for token in tokens]

#     return " ".join(stemmed_tokens)


In [161]:
# Preprocess every e-mail and store the result in a new column.
# Expect roughly 15 seconds of runtime, depending on your machine.
df['tranformed text'] = df['text'].map(transforming_text)

In [162]:
# Vectorize text
cv = CountVectorizer()  # created here but not used by any cell shown in this notebook
tfid = TfidfVectorizer(max_features=3000)

# Fit the TF-IDF vectorizer on the preprocessed corpus, then convert the
# result to a dense array for the model
tfidf_matrix = tfid.fit_transform(df['tranformed text'])
X = tfidf_matrix.toarray()


# Run Models


## Train

In [163]:
# Train
y = df['label_encoded'].values

# Hold out 20% of the samples; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2,
)

# Logistic regression with an L2 penalty, fitted on the training split only
lr = LogisticRegression(
    penalty='l2',
    C=1,
    solver='liblinear',
    max_iter=50,
)
lr.fit(X_train, y_train)

## Test
Modify `custom_email`, or use one of the default emails, to check whether it is classified as spam.

In [164]:
# Test
custom_spam_email = "Congratulations! You've won a prize. Claim it now."
custom_real_email = "Hi John, I hope you're doing well. I wanted to follow up on our meeting last week and discuss the next steps for the project. Please let me know when you're available for a call. Looking forward to hearing from you. Best regards, Anthony"

custom_email = custom_spam_email
# custom_email = custom_real_email
# custom_email = ""

# Run the e-mail through the same preprocessing that was applied to the training corpus
preprocessed_text = transforming_text(custom_email)

# Transform the preprocessed text using the same (already fitted) TfidfVectorizer
custom_text_vectorized = tfid.transform([preprocessed_text]).toarray()

# Binary spam prediction; predict() returns an array with one label per input row,
# so for this single e-mail the label is prediction[0]
prediction = lr.predict(custom_text_vectorized)

# Predict the probability of the text being spam
# (row 0 = the single e-mail, column 1 = the class labelled 1, i.e. spam)
probability_scores = lr.predict_proba(custom_text_vectorized)
spam_probability = probability_scores[0, 1]

print(f"Spam Probability: {spam_probability:.2%}")

# predict() is equivalent to thresholding the spam probability at 50%:
# above 50% -> spam (1), otherwise -> real (0).
# Index the scalar label explicitly instead of relying on the truth value of a
# one-element array comparison.
if prediction[0] == 1:
    print("This is a spam message!")
else:
    print("This is a real message.")


Spam Probability: 67.06%
This is a spam message!


# Comparison

In [165]:
# TBD, need multiple models to compare