In [None]:
# Run this cell to suppress all FutureWarnings.
import warnings
# Only the FutureWarning category is ignored; this filter does not touch other warning categories.
warnings.filterwarnings("ignore", category=FutureWarning)

# More readable exceptions.
# Order matters: install the iwut package, load its IPython extension, then switch it on.
%pip install --quiet iwut
%load_ext iwut
%wut on

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
# Global seaborn look: white grid background, short color codes enabled, fonts scaled up 1.5x.
sns.set(style="whitegrid", color_codes=True, font_scale=1.5)

In [None]:
import zipfile

# Unpack every member of the data archive into the current working directory.
with zipfile.ZipFile('spam_ham_data.zip') as archive:
    archive.extractall()

In [None]:
# Labeled training data: load it, then lowercase the email text as the first
# step of text processing.
original_training_data = pd.read_csv('train.csv')
original_training_data['email'] = original_training_data['email'].str.lower()

# Test data: load it and apply the same lowercasing.
test = pd.read_csv('test.csv')
test['email'] = test['email'].str.lower()

# Preview the first rows of the training data.
original_training_data.head()

In [None]:
# Show per-column null counts, replace every missing value with an empty
# string, then show the counts again to confirm nothing is left.
print('Before imputation:', original_training_data.isnull().sum(), sep='\n')
original_training_data = original_training_data.fillna('')
print('------------', 'After imputation:', original_training_data.isnull().sum(), sep='\n')

In [None]:
# Hold out 10% of the labeled data for validation (a 90/10 train-validation split).
# The fixed random_state makes the split reproducible across runs.
from sklearn.model_selection import train_test_split

train, val = train_test_split(
    original_training_data,
    test_size=0.1,
    random_state=42,
)

In [None]:
def words_in_texts(words, texts):
    """
    Build a 0/1 indicator matrix marking which words occur in which texts.

    Each word is matched as a literal, case-sensitive substring (not as a
    regular expression), so words containing characters such as `$`, `(`
    or `.` are searched for verbatim. Missing texts (None/NaN) count as
    containing no words.

    Args:
        words (list): Words to find.
        texts (Series): Strings to search in. Other sequences of strings
            are accepted and converted to a Series.

    Returns:
        A 2D NumPy array of 0s and 1s with shape (n, d) where
        n is the number of texts, and d is the number of words.
        Entry (i, j) is 1 if words[j] occurs in the i-th text, else 0.
    """
    texts = pd.Series(texts)

    # Preallocate so the result is always 2D with shape (n, d), even when
    # `words` is empty (d == 0).
    indicator_array = np.zeros((len(texts), len(words)), dtype=int)

    for col, word in enumerate(words):
        # regex=False: treat the word literally; na=False: a missing text is a non-match.
        found = texts.str.contains(word, regex=False, na=False)
        indicator_array[:, col] = found.astype(int).to_numpy()

    return indicator_array

In [None]:
from IPython.display import display, Markdown

# Tiny example frame: two word-indicator columns plus a label column.
df = pd.DataFrame({
    'word_1': [1, 0, 1, 0],
    'word_2': [0, 1, 0, 1],
    'type': ['spam', 'ham', 'ham', 'ham']
})

# Show the explanatory note followed by the original (wide) frame.
display(
    Markdown("> Our original `DataFrame` has a `type` column and some columns corresponding to words. You can think of each row as a sentence, and the value of 1 or 0 indicates the number of occurrences of the word in this sentence."),
    df,
)

# Show the note about `melt` followed by the melted (long) frame, keyed on `type`.
display(
    Markdown("> `melt` will turn columns into entries in a variable column. Notice how `word_1` and `word_2` become entries in `variable`; their values are stored in the `value` column."),
    df.melt("type"),
)

In [None]:
# Reset to a fresh 0..n-1 index. We must do this in order to preserve the
# ordering of emails to labels for words_in_texts.
train = train.reset_index(drop=True)

# Split the email text by label.
spam_emails = train.loc[train['spam'] == 1, 'email']
ham_emails = train.loc[train['spam'] == 0, 'email']

# Words/phrases to compare between spam and ham (you can customize these).
words = ['for you', "html", "body", "name", "please", "opportunity", "sir", "signup"]

# Calculate word presence using the words_in_texts function:
# one row per email, one 0/1 column per entry in `words`.
spam_word_presence = words_in_texts(words, spam_emails)
ham_word_presence = words_in_texts(words, ham_emails)

# Column means of 0/1 indicators = proportion of emails containing each word.
spam_proportions = np.mean(spam_word_presence, axis=0)
ham_proportions = np.mean(ham_word_presence, axis=0)

# Long-format DataFrame for plotting: the word list appears twice, first
# paired with the spam proportions and then with the ham proportions.
proportion_data = pd.DataFrame({
    'Words': words * 2,
    'Proportion': np.concatenate((spam_proportions, ham_proportions)),
    'Class': ['Spam'] * len(words) + ['Ham'] * len(words)
})

# Plot the grouped bar chart. Create exactly one figure here so the cell
# does not also emit an empty figure.
plt.figure(figsize=(8, 6))
sns.barplot(x='Words', y='Proportion', hue='Class', data=proportion_data)
plt.title('Frequency of Words in Spam/Ham Emails')
plt.xlabel('Words')
plt.ylabel('Proportion of Emails')

plt.tight_layout()
plt.show()

In [None]:
# Candidate word list (kept for experimentation; the features below use `some_words`).
words = [
    'free', 'for you', 'win', 'increase', 'special', 'click',
    'guarantee', 'html', 'you', 'body', 'name', 'please',
    'opportunity', 'sir', 'message', 'signup', 'store',
]

# Words actually used as features for the model.
some_words = [
    'drug',
    'bank',
    'prescription',
    'memo',
    'private',
]

# Feature matrix: one 0/1 column per entry of `some_words`; labels come from the `spam` column.
X_train = words_in_texts(some_words, train["email"])
Y_train = np.array(train["spam"])

# Peek at the first five feature rows and labels.
(X_train[:5], Y_train[:5])

In [None]:
from sklearn.linear_model import LogisticRegression

# fit() returns the fitted estimator, so construction and training are chained.
my_model = LogisticRegression().fit(X_train, Y_train)

# Mean accuracy of the fitted model on the same data it was trained on.
training_accuracy = my_model.score(X_train, Y_train)
print("Training Accuracy: ", training_accuracy)

In [None]:
# Compute precision, recall, and false-positive rate of the classifier on the training data.
Y_train_hat = my_model.predict(X_train)

# Confusion-matrix counts. np.count_nonzero is vectorized and returns plain
# Python ints, so the `{TP=}` debug output below prints bare numbers rather
# than NumPy scalar reprs.
TP = np.count_nonzero((Y_train == 1) & (Y_train_hat == 1))
TN = np.count_nonzero((Y_train == 0) & (Y_train_hat == 0))
FP = np.count_nonzero((Y_train == 0) & (Y_train_hat == 1))
FN = np.count_nonzero((Y_train == 1) & (Y_train_hat == 0))

# Each ratio is undefined (0/0) when its denominator is zero, e.g. precision
# when the model never predicts spam; report NaN in that case instead of raising.
logistic_predictor_precision = TP / (TP + FP) if (TP + FP) else float("nan")
logistic_predictor_recall = TP / (TP + FN) if (TP + FN) else float("nan")
logistic_predictor_fpr = FP / (FP + TN) if (FP + TN) else float("nan")

print(f"{TP=}, {TN=}, {FP=}, {FN=}")
print(f"{logistic_predictor_precision=:.2f}, {logistic_predictor_recall=:.2f}, {logistic_predictor_fpr=:.2f}")