# PS5 Naive Bayes spam filter

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## 1.
### 1.1

In [15]:
# Load the Ling-Spam e-mail corpus; the file is tab-separated and pandas
# infers the bz2 compression from the file extension.
spam = pd.read_csv("./data/lingspam-emails.csv.bz2", sep="\t")

In [27]:
from textwrap import wrap


def _print_wrapped_messages(messages, limit=3):
    """Print the first ``limit`` messages, line-wrapped for readability.

    Parameters
    ----------
    messages : 2-D array-like
        One row per e-mail; the message text is the first element of each
        row (the shape produced by ``frame[['message']].values``).
    limit : int, default 3
        Maximum number of messages to print.
    """
    for row in messages[:limit]:
        # `wrap` only accepts text, so coerce each entry with str() first.
        print("\n".join(wrap(str(row[0]))), "\n")


# The `spam` column is boolean, so it can be used directly as a row mask.
is_spam = spam[spam['spam']]
not_spam = spam[~spam['spam']]

# Kept as (n, 1) arrays: each row is a one-element array holding the text.
spam_messages = is_spam[['message']].values
not_spam_messages = not_spam[['message']].values

print("non spam messages:")
_print_wrapped_messages(not_spam_messages)

print("spam messages:")
_print_wrapped_messages(spam_messages)

non spam messages:
Subject: re : 2 . 882 s - > np np  > date : sun , 15 dec 91 02 : 25 :
02 est > from : michael < mmorse @ vm1 . yorku . ca > > subject : re :
2 . 864 queries > > wlodek zadrozny asks if there is " anything
interesting " to be said > about the construction " s > np np " . . .
second , > and very much related : might we consider the construction
to be a form > of what has been discussed on this list of late as
reduplication ? the > logical sense of " john mcnamara the name " is
tautologous and thus , at > that level , indistinguishable from " well
, well now , what have we here ? " . to say that ' john mcnamara the
name ' is tautologous is to give support to those who say that a
logic-based semantics is irrelevant to natural language . in what
sense is it tautologous ? it supplies the value of an attribute
followed by the attribute of which it is the value . if in fact the
value of the name-attribute for the relevant entity were ' chaim
shmendrik ' , ' john mcnamara the

### 1.2

#### 1.

In [34]:
# Mapping from each probability used in this notebook to the Python variable
# that will hold it.
variable_naming_schemes = [
    ("Pr(S = 1)", "pr_spam_1"),
    ("Pr(S = 0)", "pr_spam_0"),
    ("Pr(w = 1)", "pr_word_1"),
    ("Pr(w = 0)", "pr_word_0"),
    ("Pr(w = 1|S = 1)", "pr_word_1_given_spam"),
    ("Pr(w = 0|S = 1)", "pr_word_0_given_spam"),
    ("Pr(w = 1|S = 0)", "pr_word_1_given_non_spam"),
    ("Pr(w = 0|S = 0)", "pr_word_0_given_non_spam"),
]

# Render a two-column table: header, rule, then one left-aligned row per entry.
table_header = f"{'Probability':<30} {'Variable Name':<30}"
table_rule = "-" * 60
table_rows = [
    f"{probability:<30} {variable:<30}"
    for probability, variable in variable_naming_schemes
]
print("\n".join([table_header, table_rule, *table_rows]))

Probability                    Variable Name                 
------------------------------------------------------------
Pr(S = 1)                      pr_spam_1                     
Pr(S = 0)                      pr_spam_0                     
Pr(w = 1)                      pr_word_1                     
Pr(w = 0)                      pr_word_0                     
Pr(w = 1|S = 1)                pr_word_1_given_spam          
Pr(w = 0|S = 1)                pr_word_0_given_spam          
Pr(w = 1|S = 0)                pr_word_1_given_non_spam      
Pr(w = 0|S = 0)                pr_word_0_given_non_spam      


#### 2.

In [37]:
# Target vector: the `spam` label column of the corpus.
y = spam["spam"]
y

0       False
1       False
2       False
3       False
4       False
        ...  
2888     True
2889     True
2890     True
2891     True
2892     True
Name: spam, Length: 2893, dtype: bool

In [52]:
def find_majority_class_bool(vector):
    """Return True when strictly more than half of ``vector`` is truthy.

    Parameters
    ----------
    vector : sized iterable of bool
        Class labels (list, NumPy array, pandas Series, ...).

    Returns
    -------
    bool
        ``True`` if the ``True`` class is the strict majority, otherwise
        ``False``. An exact tie and an empty input both yield ``False``.
    """
    # Coerce to a built-in bool: with NumPy input the comparison yields
    # numpy.bool_, for which identity checks such as `result is False` fail.
    return bool(sum(vector) > len(vector) / 2)
majority_class = find_majority_class_bool(y)
# Test truthiness rather than identity (`is False`): an identity check is
# wrong for bool-like values such as numpy.bool_ and would mislabel the class.
majority_class_text = "spam" if majority_class else "not-spam"
print(f"The majority class is: {majority_class_text}")

The majority class is: not-spam


In [53]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Baseline ("naive") classifier: predict the majority class for every message.
predictions = np.full_like(y, fill_value=majority_class)
conf_matrix = confusion_matrix(y, predictions)

# Accuracy = correctly classified messages (main diagonal) / all messages.
n_correct = conf_matrix[0, 0] + conf_matrix[1, 1]
n_total = conf_matrix.sum()
accuracy_naive = n_correct / n_total

print("Confusion Matrix:\n", conf_matrix)
print(f"Accuracy of the naive estimator: {accuracy_naive * 100:.2f}%")

Confusion Matrix:
 [[2412    0]
 [ 481    0]]
Accuracy of the naive estimator: 83.37%


#### 3.

In [85]:
# Feature vector: does the lower-cased message contain the substring "million"?
# The pattern is a plain literal, so regex matching is not needed.
x = spam["message"].str.lower().str.contains("million", regex=False)
x

0       False
1       False
2       False
3       False
4       False
        ...  
2888    False
2889     True
2890    False
2891     True
2892     True
Name: message, Length: 2893, dtype: bool

#### 4.

In [32]:
# Priors: the share of messages in each class (mean of a boolean indicator).
spam_labels = spam["spam"]
pr_spam_1 = spam_labels.eq(True).mean()
pr_spam_0 = spam_labels.eq(False).mean()
print(
    f"Prior: probability of spam: {pr_spam_1*100:.2f}%, "
    f"probability of non-spam: {pr_spam_0*100:.2f}%"
)

Prior: probability of spam: 16.63%, probability of non-spam: 83.37%


### 1.3
#### 1.

In [61]:
# Marginal probability of the word, Pr(w): the normalizer in Bayes' theorem.
pr_word_1 = x.mean()
pr_word_0 = 1 - pr_word_1
print(
    "Normalizer: "
    f"probability of detecting target word: {pr_word_1*100:.2f}% , "
    f"probability of not detecting target word: {pr_word_0*100:.2f}%"
)

Normalizer: probability of detecting target word: 4.84% , probability of not detecting target word: 95.16%


#### 2.

In [62]:
# Attach the word indicator as a new column; `assign` returns a new frame,
# so the original `spam` frame is left untouched.
spam_with_x = spam.assign(contains_word_million=x)
spam_with_x.head(3)

Unnamed: 0,spam,files,message,contains_word_million
0,False,3-1msg1.txt,Subject: re : 2 . 882 s - > np np > date : su...,False
1,False,3-1msg2.txt,Subject: s - > np + np the discussion of s - ...,False
2,False,3-1msg3.txt,Subject: 2 . 882 s - > np np . . . for me it ...,False


In [70]:
# Likelihoods Pr(w|S): average the word indicator within each class.
in_spam = spam_with_x["spam"].eq(True)
in_non_spam = spam_with_x["spam"].eq(False)

pr_word_1_given_spam = spam_with_x.loc[in_spam, "contains_word_million"].mean()
pr_word_0_given_spam = 1 - pr_word_1_given_spam

pr_word_1_given_non_spam = spam_with_x.loc[in_non_spam, "contains_word_million"].mean()
pr_word_0_given_non_spam = 1 - pr_word_1_given_non_spam

print(
    f"Pr(w = 1|S = 1): {pr_word_1_given_spam*100:.2f}% , "
    f"Pr(w = 0|S = 1): {pr_word_0_given_spam*100:.2f}%"
)
print(
    f"Pr(w = 1|S = 0): {pr_word_1_given_non_spam*100:.2f}% , "
    f"Pr(w = 0|S = 0): {pr_word_0_given_non_spam*100:.2f}%"
)

Pr(w = 1|S = 1): 24.12% , Pr(w = 0|S = 1): 75.88%
Pr(w = 1|S = 0): 1.00% , Pr(w = 0|S = 0): 99.00%


#### 3.

In [68]:
# Bayes' theorem: Pr(S = 1|w) = Pr(w|S = 1) * Pr(S = 1) / Pr(w)
joint_spam_word_1 = pr_word_1_given_spam * pr_spam_1
joint_spam_word_0 = pr_word_0_given_spam * pr_spam_1
pr_spam_given_word_1 = joint_spam_word_1 / pr_word_1
pr_spam_given_word_0 = joint_spam_word_0 / pr_word_0
print(
    f" Pr(S = 1|w = 1): {pr_spam_given_word_1*100:.2f}% , "
    f"Pr(S = 1|w = 0): {pr_spam_given_word_0*100:.2f}%"
)

 Pr(S = 1|w = 1): 82.86% , Pr(S = 1|w = 0): 13.26%


In [72]:
# Bayes' theorem: Pr(S = 0|w) = Pr(w|S = 0) * Pr(S = 0) / Pr(w)
joint_non_spam_word_1 = pr_word_1_given_non_spam * pr_spam_0
joint_non_spam_word_0 = pr_word_0_given_non_spam * pr_spam_0
pr_non_spam_given_word_1 = joint_non_spam_word_1 / pr_word_1
pr_non_spam_given_word_0 = joint_non_spam_word_0 / pr_word_0
print(
    f" Pr(S = 0|w = 1): {pr_non_spam_given_word_1*100:.2f}% , "
    f"Pr(S = 0|w = 0): {pr_non_spam_given_word_0*100:.2f}%"
)

 Pr(S = 0|w = 1): 17.14% , Pr(S = 0|w = 0): 86.74%


#### 4.

In [78]:
# Sanity check: for a fixed value of w, the two posteriors should sum to 1.
total_given_word_1 = pr_spam_given_word_1 + pr_non_spam_given_word_1
total_given_word_0 = pr_spam_given_word_0 + pr_non_spam_given_word_0
print(f"Pr(S = 1|w = 1) + Pr(S = 0|w = 1) = {total_given_word_1:.2f}")
print(f"Pr(S = 1|w = 0) + Pr(S = 0|w = 0) = {total_given_word_0:.2f}")

Pr(S = 1|w = 1) + Pr(S = 0|w = 1) = 1.00
Pr(S = 1|w = 0) + Pr(S = 0|w = 0) = 1.00


Pr(S = 1|w = 1) and Pr(S = 0|w = 1) add up to 1 because they are conditioned on the same event, namely "the target word is found in the message", so they occupy the same region of the Venn diagram. Within that region a message is either spam or not spam, so whatever is not Pr(S = 1|w = 1) is Pr(S = 0|w = 1). The same argument applies to the pair conditioned on w = 0.

### 1.4
#### 1

In [88]:
# Decision rule: predict spam when the posterior Pr(S = 1|w) exceeds 0.5.
# There are only two possible posteriors (word present / word absent), so the
# whole column is classified in one vectorized step instead of a row loop.
word_present = spam_with_x['contains_word_million'].eq(True)  # missing values count as "absent"
predictions = pd.Series(
    np.where(
        word_present.to_numpy(),
        pr_spam_given_word_1 > 0.5,
        pr_spam_given_word_0 > 0.5,
    ),
    index=spam_with_x.index,
    dtype='bool',
)
predictions

0       False
1       False
2       False
3       False
4       False
        ...  
2888    False
2889     True
2890    False
2891     True
2892     True
Length: 2893, dtype: bool

In [92]:
conf_matrix = confusion_matrix(y, predictions)

# scikit-learn layout: rows are true labels, columns are predicted labels,
# with labels in sorted order (False before True).
TN = conf_matrix[0, 0]
FP = conf_matrix[0, 1]
FN = conf_matrix[1, 0]
TP = conf_matrix[1, 1]

n_messages = conf_matrix.sum()
accuracy_bays_theorem = (TN + TP) / n_messages
# Precision: share of predicted-spam messages that really are spam.
precision_bays_theorem = TP / (TP + FP)
# Recall: share of actual spam messages that were flagged.
recall_bays_theorem = TP / (TP + FN)

print("Confusion Matrix:\n", conf_matrix)
print(f"Accuracy of the bayes theorem estimator: {accuracy_bays_theorem * 100:.2f}%")
print(f"Precision of the bayes theorem estimator: {precision_bays_theorem:.2f}")
print(f"Recall of the bayes theorem estimator: {recall_bays_theorem:.2f}")

Confusion Matrix:
 [[2388   24]
 [ 365  116]]
Accuracy of the bayes theorem estimator: 86.55%
Precision of the bayes theorem estimator: 0.83
Recall of the bayes theorem estimator: 0.24


#### 3.
In the steps outlined above, computing the Bayes' theorem probabilities from the data and fixing the 0.5 threshold constitute the model's training. A trained model is something from which we can make predictions, and our predictions come entirely from those quantities: for each message we take the posterior that matches whether the word was found or not and compare it with the threshold.