In [None]:
import warnings

# Suppress FutureWarning messages for the rest of the session so they do not
# clutter cell output.
warnings.filterwarnings(action="ignore", category=FutureWarning)

In [None]:
import numpy as np
import pandas as pd
import sys

import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
# Notebook-wide seaborn look: white grid background, short color codes
# enabled, and fonts scaled up by 1.5x.
sns.set(style="whitegrid", color_codes=True, font_scale=1.5)

from datetime import datetime
from IPython.display import display, HTML

In [None]:
import zipfile

# Unpack every member of the archive into the current working directory; the
# context manager closes the archive once extraction is done.
with zipfile.ZipFile('spam_ham_data.zip') as archive:
    archive.extractall()

In [None]:
# Read both CSV files and lowercase the `email` column right away, as the
# first step of text processing.
original_training_data = pd.read_csv('train.csv').assign(
    email=lambda frame: frame['email'].str.lower()
)
test = pd.read_csv('test.csv').assign(
    email=lambda frame: frame['email'].str.lower()
)

# Preview the first rows of the training data.
original_training_data.head()

In [None]:
# Show per-column missing-value counts, replace every missing value in the
# training data with an empty string, then show the counts again to confirm
# nothing is left missing.
print('Before imputation:')
print(original_training_data.isna().sum())

original_training_data = original_training_data.fillna(value='')

print('------------')
print('After imputation:')
print(original_training_data.isna().sum())

In [None]:
# This creates a 90/10 train-validation split on our labeled data.
from sklearn.model_selection import train_test_split

train, val = train_test_split(original_training_data, test_size = 0.1, random_state = 42)

# The split shuffles rows but keeps their original index labels. Reset BOTH
# frames to a fresh 0..n-1 index so that positional feature matrices (such as
# the output of words_in_texts) line up row-for-row with the labels and with
# any column later assigned from these frames. Previously only `train` was
# reset, which left `val` prone to silent index misalignment.
train = train.reset_index(drop = True)
val = val.reset_index(drop = True)

In [None]:
from projB2_utils import words_in_texts

# Quick sanity check of the helper on a toy input: three query words against
# two short texts. The result is displayed as the cell output.
demo_words = ['hello', 'bye', 'world']
demo_texts = pd.Series(['hello', 'hello worldhello'])
words_in_texts(demo_words, demo_texts)

In [None]:
# Words used as features for the baseline model.
some_words = [
    'drug',
    'bank',
    'prescription',
    'memo',
    'private',
]

# Labels come straight from the `spam` column; features are whatever
# words_in_texts produces for the chosen words over the training emails.
Y_train = np.array(train['spam'])
X_train = words_in_texts(some_words, train['email'])

# Peek at the first five feature rows and their labels.
(X_train[:5], Y_train[:5])

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression with default settings on the baseline features.
# fit() returns the estimator itself, so construction and fitting can chain.
simple_model = LogisticRegression().fit(X_train, Y_train)

# Mean accuracy measured on the same data the model was fitted on.
training_accuracy = simple_model.score(X_train, Y_train)
print("Training Accuracy: ", training_accuracy)

In [None]:
# Work on a copy so the helper column used for plotting is not added to `train`.
train_graph = train.copy()

# Number of whitespace-separated tokens in each email.
train_graph['word_count'] = [len(body.split()) for body in train_graph['email']]

# Violin plot comparing the word-count distribution of the two classes.
fig, ax = plt.subplots(figsize=(10, 6))
sns.violinplot(x='spam', y='word_count', data=train_graph, ax=ax)
ax.set_title('Word Count Distribution in Spam and Ham Emails')
ax.set_xlabel('Ham = 0, Spam = 1')
ax.set_ylabel('Word Count')
plt.show()

In [None]:
'''The violin plot above shows the distribution of word counts for emails labeled ham (0) and spam (1) in the training set. Both classes have similar median values, but ham emails reach a greater maximum word count. Spam emails also seem to have a slightly greater spread, while ham emails are more densely concentrated around the 150-word mark.'''

In [None]:
# import libraries
# You may use any of these to create your features.
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_curve, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA
import re
from collections import Counter

In [None]:
words = ["free", "for you", "win", "increase", "special", "click", "guarantee", 
         "html", "please", "body", "name", "opportunity", "sir", "signup"]

X = pd.DataFrame(words_in_texts(words, train["email"]))
Y = train["spam"]

for i in range(len(words)):
    X.rename(columns={X.columns[i]: words[i]}, inplace=True)

X["Email Length"] = (train["email"].str.len() < 120) == (train['email'].str.len() > 27000).astype(int)
X["Subject Length"] = (train["subject"].str.len() < 20) == (train["subject"].str.len() > 70).astype(int)

X_train = np.array(X)
Y_train = np.array(Y)

model = LogisticRegression(fit_intercept = True, penalty = "l2")
model.fit(X_train, Y_train)

In [None]:
train_predictions = model.predict(X_train)

# Calculate the accuracy of the model on the testing set
training_accuracy = np.mean(train_predictions == train["spam"])

# Print your testing accuracy
training_accuracy

In [None]:
X_t = pd.DataFrame(words_in_texts(words, test["email"]))

for i in range(len(words)):
    X_t.rename(columns={X_t.columns[i]: words[i]}, inplace=True)

X_t["Email Length"] = (test["email"].str.len() < 120) == (test['email'].str.len() > 30000).astype(int)
X_t["Subject Length"] = (test["subject"].str.len() < 19) == (test["subject"].str.len() > 50).astype(int)

X_test = np.array(X_t)

test_predictions = model.predict(X_test)

In [None]:
# Predictions on the test set are stored in a 1-dimensional array called test_predictions.

# Pair each test id with its predicted class, with Id as the first column.
submission_df = pd.DataFrame(
    {"Id": test['id'], "Class": test_predictions},
    columns=['Id', 'Class'],
)

# Put the current date and time in the file name so runs at different times
# write to different files.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
filename = f"submission_{timestamp}.csv"
submission_df.to_csv(filename, index=False)

In [None]:
'''I found that the choice of words affected my model's performance the most, so I set out to reuse the graph we implemented in Q3 of Proj B1 to find good candidate words that distinguish spam emails from ham emails. While working on my model, I also had the idea to integrate the email and subject word counts; I found out later that simply using the raw numerical values was not going to work, and that it was more feasible to quantize the numerical values into boolean values. I was surprised that certain words I thought would be indicative of spam emails turned out to appear just as frequently in ham emails, which made me more careful in selecting features.'''

In [None]:
# Predicted probability of the second class in model.classes_ for each
# training email (that is spam = 1 when the labels are 0/1).
Y_prob = model.predict_proba(X_train)[:, 1]

# False/true positive rates at each candidate decision threshold.
fpr, tpr, thresholds = roc_curve(Y_train, Y_prob)

# thresholds[0] is a sentinel meaning "predict nothing as positive" (np.inf in
# recent scikit-learn, max score + 1 in older releases). It is not a real
# probability, so it is left out of the plot to keep the x-axis meaningful.
fig, ax = plt.subplots()
ax.plot(thresholds[1:], tpr[1:], label="True Positive Rate")
ax.plot(thresholds[1:], fpr[1:], label="False Positive Rate")
ax.set_xlabel("Threshold")
ax.set_ylabel("TPR/FPR")
ax.set_title("TPR and FPR vs. Decision Threshold (Training Set)")
ax.legend();