In [26]:
#import packages
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [27]:
#reads data in from the downloaded file using pandas read
df = pd.read_csv("C:\\Users\\Brandon FP\\NbSpam\\spamCSV.csv", encoding='latin-1')
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [28]:
#remove unnecessary rows of data using the drop function on our data frame
#Drop the unnamed columns because they include nothing of value
df = df.drop(columns = ['Unnamed: 2', 'Unnamed: 3', "Unnamed: 4"], axis = 1)
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [29]:
#check our data to see if we have an ample amount using describe
df.groupby(["v1"]).describe()

Unnamed: 0_level_0,v2,v2,v2,v2
Unnamed: 0_level_1,count,unique,top,freq
v1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,653,Please call our customer service representativ...,4


In [30]:
#rename columns for ease of use. Change "v1" to "class" and "v2" to "message"
df = df.rename(columns={"v1": "class", "v2": "message"})
df.head()

Unnamed: 0,class,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [31]:
#add a spam column. 0 if it is not spam, 1 if it is spam using our class column to check
df["spam"] = df["class"].apply(lambda x: 1 if x == "spam" else 0)
df.head()

Unnamed: 0,class,message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [32]:
#create training data/testing data split using sklearn. 80/20 split
x_train, x_test, y_train, y_test = train_test_split(df.message, df.spam, test_size = 0.2)

In [33]:
#Describe our testing data
x_train.describe()

count                       4457
unique                      4182
top       Sorry, I'll call later
freq                          23
Name: message, dtype: object

In [34]:
#find word count and store as numbers. Counts how many times a word shows up
cv = CountVectorizer()
x_train_count = cv.fit_transform(x_train.values)
x_train_count.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [35]:
#Find the prior probabilities for the spam/ham messages
def prior_probability(y_train):
    #adds up all the 1's for spam and divides it by how many messages there are
    pS = y_train.sum() / float(len(y_train))
    
    #takes how many spam messages there are and subtracts from the total messages, then divides by total messages to find 
    # the amount of not spam (ham) messages
    pH = (len(y_train) - y_train.sum()) / float(len(y_train))
    
    return pS, pH

In [36]:
#Find the likelyhood probabilities of each word in the message
def likelihood_probability(x_train_count, y_train):
    '''take the sum of the word occurances for messages labled spam and adds 1 to the count of each word, 
        (so we arent left later on to multiply by zero)- we then divide that sum by the toal number of words in a spam message
        plus a smoothing term''' 
    p_word_given_spam = (x_train_count[y_train == 1].sum(axis=0) + 1) / (y_train.sum() + 1 * x_train_count.shape[1])
    
    #same as above except for all messages labeled not spam
    p_word_given_not_spam = (x_train_count[y_train == 0].sum(axis=0) + 1) / ((len(y_train) - y_train.sum()) + 1 * x_train_count.shape[1])
    
    return p_word_given_spam, p_word_given_not_spam

In [37]:
#predict if it is spam or ham
def predict(x_test, p_spam, p_not_spam, p_word_given_spam, p_word_given_not_spam):
    #makes an array which we will return later
    predictions = []

    #loops through each message in the set (test or train)
    for message in x_test:
        #sets two variables equal to the prior probability of it being spam or not spam
        spam_prob = p_spam
        not_spam_prob = p_not_spam

        #loops through each word in the message. Splits it up by the spaces 
        for word in message.split():
            #if the word is in the spam prob dictionary, it updates the spam probability by multiplying
            if np.isin(word, p_word_given_spam):
                spam_prob *= p_word_given_spam[word]
            #same as for spam just for not spam probability
            if np.isin(word, p_word_given_not_spam):
                not_spam_prob *= p_word_given_not_spam[word]

        # Append to the array based on the probabilities
        if spam_prob > not_spam_prob:
            predictions.append(1)
        else:
            predictions.append(0)

    return predictions

In [38]:
# Calculate prior probabilities
p_spam, p_not_spam = prior_probability(y_train)

# Calculate likelihood probabilities
p_word_given_spam, p_word_given_not_spam = likelihood_probability(x_train_count, y_train)

# Make predictions
predictions = predict(x_test, p_spam, p_not_spam, p_word_given_spam, p_word_given_not_spam)

In [39]:
# Evaluate the model
# Evaluate the model
accuracy = accuracy_score(y_test, predictions)
conf_matrix = confusion_matrix(y_test, predictions)
class_report = classification_report(y_test, predictions, zero_division=1)

In [40]:
#print the accuracy of the training data
print("Accuracy:", accuracy)
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", class_report)

Accuracy: 0.8663677130044843
Confusion Matrix:
 [[966   0]
 [149   0]]
Classification Report:
               precision    recall  f1-score   support

           0       0.87      1.00      0.93       966
           1       1.00      0.00      0.00       149

    accuracy                           0.87      1115
   macro avg       0.93      0.50      0.46      1115
weighted avg       0.88      0.87      0.80      1115



In [41]:
# Make predictions on test values
predictions = predict(x_test, p_spam, p_not_spam, p_word_given_spam, p_word_given_not_spam)

In [42]:
# Evaluate the model
accuracy = accuracy_score(y_test, predictions)
conf_matrix = confusion_matrix(y_test, predictions)
class_report = classification_report(y_test, predictions, zero_division=1)

# Print the results on testing data
print("Accuracy:", accuracy)
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", class_report)

Accuracy: 0.8663677130044843
Confusion Matrix:
 [[966   0]
 [149   0]]
Classification Report:
               precision    recall  f1-score   support

           0       0.87      1.00      0.93       966
           1       1.00      0.00      0.00       149

    accuracy                           0.87      1115
   macro avg       0.93      0.50      0.46      1115
weighted avg       0.88      0.87      0.80      1115

