In [None]:
# Use Bag-of-Words and Word2Vec for spam email classification
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import tqdm

In [None]:
# Prepare dataset
import os

USE_KAGGLE = False
if USE_KAGGLE:
    !pip install kaggle

    download_dir = "data"
    os.makedirs(download_dir, exist_ok=True)

    # requires kaggle.json in ~/.kaggle
    os.system(f'kaggle datasets download -d purusinghvi/email-spam-classification-dataset -p {download_dir} --unzip')

    csv_path = os.path.join(download_dir, "combined_data.csv")
    if os.path.exists(csv_path):
        print(f"Downloaded and extracted: {csv_path}")
    else:
        print("Download failed or file not found.")

else:
    extract_folder = "data"

    # Ensure extraction folder exists
    os.makedirs(extract_folder, exist_ok=True)

    # Define file paths
    zip_path = "data/email-spam-classification-dataset.zip"
    extract_folder = "data"

    # Check if the dataset is already downloaded
    if not os.path.exists(zip_path):
        print("Downloading dataset...")
        !curl -L -o "{extract_folder}/email-spam-classification-dataset.zip" https://www.kaggle.com/api/v1/datasets/download/purusinghvi/email-spam-classification-dataset
    else:
        print("Dataset already downloaded.")

    # Check if the dataset is already extracted
    if not os.path.exists(os.path.join(extract_folder, "email.csv")):
        print("Extracting dataset...")
        !unzip -o {zip_path} -d {extract_folder}
    else:
        print("Dataset already extracted.")

In [None]:
# Load the extracted dataset; later cells read its "text" and "label" columns.
df = pd.read_csv("data/combined_data.csv")

In [None]:
# Report dataset size and class balance (label 0 = non-spam, label 1 = spam).
print("Total number of emails:", df.shape[0])
print("Number of non-spam emails:", int((df["label"] == 0).sum()))
print("Number of spam emails:", int((df["label"] == 1).sum()))

In [None]:
# Raw email texts (X) and their labels (y), taken out of the DataFrame as arrays.
X, y = df["text"].values, df["label"].values

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the emails for testing; the fixed random_state keeps the
# split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=87,
)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words with the vectorizer's default settings (no max_features cap).
# NOTE: the following cell re-creates count_vectorizer, X_train_bow and
# X_test_bow with max_features=1000, so the values computed here are
# overwritten before any later cell uses them.
count_vectorizer = CountVectorizer()
# Fit the vocabulary on the training split only, then reuse it for the test split.
X_train_bow = count_vectorizer.fit_transform(X_train)
X_test_bow = count_vectorizer.transform(X_test)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words limited to 1000 features (max_features keeps the most frequent
# terms of the fitted corpus). These are the matrices used by the classifier below.
count_vectorizer = CountVectorizer(max_features=1000)
# Fit the vocabulary on the training split only, then reuse it for the test split.
X_train_bow = count_vectorizer.fit_transform(X_train)
X_test_bow = count_vectorizer.transform(X_test)

# Sanity check: one row per email, one column per vocabulary term.
print("shape of X_train_bow:", X_train_bow.shape)
print("shape of X_test_bow:", X_test_bow.shape)

In [None]:
# Show the word counts of the first 100 training emails over the first 200
# vocabulary columns as a heat-map.
# Slice the sparse matrix BEFORE densifying: converting the whole training
# matrix with .toarray() just to display a 100x200 window wastes memory.
plt.imshow(X_train_bow[:100, :200].toarray())
plt.xlabel("Word")
plt.ylabel("Email")
plt.colorbar();  # trailing ";" hides the Colorbar object repr in the cell output

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Train a logistic-regression classifier on the bag-of-words counts.
# The iteration limit is set to the number of training samples.
lr_bow = LogisticRegression(max_iter=len(y_train)).fit(X_train_bow, y_train)
lr_bow

In [None]:
# Accuracy on the data the model was fitted on.
y_pred_train_bow = lr_bow.predict(X_train_bow)
print(f"Train accuracy with BoW: {accuracy_score(y_train, y_pred_train_bow)}")

# Accuracy on the held-out split. The two lines are labelled differently so
# the printed numbers cannot be confused with each other.
y_pred_test_bow = lr_bow.predict(X_test_bow)
print(f"Test accuracy with BoW: {accuracy_score(y_test, y_pred_test_bow)}")

# Word2Vec

In [None]:
# use gensim to load pre-trained Word2Vec model
# NOTE(review): Word2Vec is imported but not referenced in the visible cells.
from gensim.models import Word2Vec
import gensim.downloader as api
# `wv` is used below both for lookups (wv[word]) and membership tests (word in wv).
# NOTE(review): presumably this downloads and caches a large model file on the
# first call -- confirm before running on a metered connection.
wv = api.load("word2vec-google-news-300")

In [None]:
# get embedding of a word
# Print the shape of a single word vector, then plot its raw components
# (x axis: component index, y axis: component value).
print(wv["computer"].shape)
plt.plot(wv["computer"])

In [None]:
# show that man - woman = king - queen
# queen = king + woman - man
# w_1, w_2, w_3 = 'man', 'woman', 'king'
# similarly, apple - banana = red - ?
w_1, w_2, w_3 = 'apple', 'banana', 'red'

# NOTE: the names below follow the man/woman/king template from the comment
# above; they hold the vectors of w_1, w_2 and w_3, whatever those words are.
man, woman, king = wv[w_1], wv[w_2], wv[w_3]
# Analogy vector: w_3 + w_2 - w_1
tmp = king + woman - man

# find the most similar word to tmp
# NOTE(review): the query is a raw vector, so the input words themselves are
# presumably not excluded from the results -- confirm against gensim docs.
wv.most_similar(positive=[tmp])

# Try other calculations by yourself!

# Given one word, find the most similar email

In [None]:
import re
import textwrap

N = 5000 # number of emails to use

# Build one vector per email: the (unnormalized) sum of the Word2Vec vectors
# of every word of the email that the model knows.
X_w2v = np.zeros((N, wv.vector_size))
for i in tqdm.tqdm(range(N)):
    # Lowercase the text and keep only runs of ASCII letters; digits,
    # punctuation and "_" act as separators and are dropped.
    tokens = re.findall(r"[a-zA-Z]+", X[i].lower())
    for token in tokens:
        # Words missing from the Word2Vec vocabulary contribute nothing.
        if token not in wv:
            continue
        X_w2v[i] += wv[token]

In [None]:
# normalize X_w2v: scale every email vector to unit L2 norm
w2v_norms = np.linalg.norm(X_w2v, axis=1, keepdims=True)
# Emails without any known word have an all-zero vector (norm 0). Leave those
# rows at zero instead of dividing by zero and cleaning up the NaNs afterwards.
X_w2v_normalized = np.divide(
    X_w2v,
    w2v_norms,
    out=np.zeros_like(X_w2v),
    where=w2v_norms > 0,
)

In [None]:
# use BoW to find the most similar email
from sklearn.feature_extraction.text import CountVectorizer
# NOTE: this rebinds count_vectorizer (previously the 1000-feature vectorizer
# fitted on the training split) to a new one fitted on the first N emails.
count_vectorizer = CountVectorizer()
X_bow = count_vectorizer.fit_transform(X[:N]).toarray()

# Scale every row to unit L2 norm so a dot product equals cosine similarity.
bow_norms = np.linalg.norm(X_bow, axis=1, keepdims=True)
# Rows with no counted token have norm 0: keep them at zero rather than
# producing NaNs. Writing straight into `out` also avoids the extra full-size
# copy that a separate nan_to_num pass would allocate.
X_bow_normalized = np.divide(
    X_bow,
    bow_norms,
    out=np.zeros(X_bow.shape, dtype=float),
    where=bow_norms > 0,
)

In [None]:
# given one topic, find the most similar email
topic = "romantic"

# --- Word2Vec: cosine similarity between the topic vector and each email vector
topic_vec = wv[topic]
topic_vec_normalized = topic_vec / np.linalg.norm(topic_vec)
similarity = np.dot(X_w2v_normalized, topic_vec_normalized)
most_similar_email = X[np.argmax(similarity)]
print("Found by Word2Vec:")
print(textwrap.fill(most_similar_email, 100))
print("Number of times the topic word appears in the email:", most_similar_email.lower().count(topic))


# given one topic, find the most similar email
# --- BoW: the topic is vectorized with the same vocabulary as the emails
topic_vec = count_vectorizer.transform([topic]).toarray()
topic_norm = np.linalg.norm(topic_vec)
print("\nFound by BoW:")
if topic_norm == 0:
    # The topic word is not in the BoW vocabulary. Normalizing would divide by
    # zero, every similarity would be NaN, and argmax would silently return
    # index 0 -- i.e. an arbitrary email would be reported as the best match.
    print(f"The topic word {topic!r} is not in the BoW vocabulary; no matching email.")
else:
    topic_vec_normalized = topic_vec / topic_norm
    similarity = np.dot(X_bow_normalized, topic_vec_normalized.T).flatten()
    most_similar_email = X[np.argmax(similarity)]
    print(textwrap.fill(most_similar_email, 100))
    print("Number of times the topic word appears in the email:", most_similar_email.lower().count(topic))