In [37]:
import pandas as pd
import matplotlib.pyplot as plt
import nltk
import numpy as np
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.utils.multiclass import unique_labels

# First Method

In [23]:
data = pd.read_csv("spam.csv")

In [24]:
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


As we can see there is a lot of null values so we're going to drop them since they are not useful for this project.

In [25]:
data.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis = 1, inplace = True)

In [26]:
data.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


We're now going to rename the v1 and v2 columns since the column names do not mean anything.

In [27]:
data.rename(columns = {
    'v1': 'spam',
    'v2': 'message'}, inplace = True)

In [28]:
data.head()

Unnamed: 0,spam,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


We're now going to change spam values to True, since they do represent spam messages and ham values to false, since they do not represent spam messages.

In [29]:
#data['spam'].map({'ham': False, 'spam': True})
data['spam'] = data['spam'].map({'ham': False, 'spam': True})

In [30]:
data.head()

Unnamed: 0,spam,message
0,False,"Go until jurong point, crazy.. Available only ..."
1,False,Ok lar... Joking wif u oni...
2,True,Free entry in 2 a wkly comp to win FA Cup fina...
3,False,U dun say so early hor... U c already then say...
4,False,"Nah I don't think he goes to usf, he lives aro..."


Just checking the distribution of True and False values in the data

In [31]:
data['spam'].value_counts(normalize = True)

spam
False    0.865937
True     0.134063
Name: proportion, dtype: float64

For ideal performance our data would have 50% spam and 50% real messages but this is the dataset we have to work with so let's move on.

**Splitting the dataset into training and testing data**

In [34]:
y = pd.DataFrame(data['spam'])
x = pd.DataFrame(data['message'])
X_train, X_test, y_train, y_test = train_test_split(data['message'], data['spam'], test_size = .2)

In [36]:
y_train.value_counts(normalize = True)

spam
False    0.871214
True     0.128786
Name: proportion, dtype: float64

The ratio is very similar

The message will be split and the count of each word in the message will be tracked with CountVerctorizer inbuilt function. The vectorized count is then converted into an array.

In [38]:
vectorizer = CountVectorizer(ngram_range = (1,2)).fit(X_train)
X_train_vectorized = vectorizer.transform(X_train)
X_train_vectorized.toarray().shape

(4457, 42941)

## The multinomial Naive Bayes classifier is suitable for classification with discrete features (e.g. word caounts for text classification).

In [39]:
model = MultinomialNB(alpha = 0.1)
model.fit(X_train_vectorized, y_train)

The alpha value in this case is basically a smoothing parameter. What this means is that we're essentially setting the floor for our initial value for probability guess. Since words like congratulations might only be found in spam messages, we don't want our model to immediately associate that word with a spam message. We want the model to find other factors and build a stronger case as to why it is predicting a spam message.

In [40]:
predictions = model.predict(vectorizer.transform(X_test))
print("Accuracy:", 100 * sum(predictions == y_test) / len(predictions), '%')

Accuracy: 98.83408071748879 %


Accuracy based on the predictions made using the test data set.

**Testing with non-spam messages**

In [41]:
model.predict(vectorizer.transform(
    [
        "Thank you, ABC. Can you also share your LinkedIn profile? As you're a good Python programmer I'd like to see what work environments you've been in and what kind of projects you've worked on so far.",
        "Hello everybody, we have just opened up a brand new Data Scientist position at ABC Company, if you're interested make sure to head over to the company website and apply now!",
        "Dear X, Congratulations! You have been selected as a Data Scientist at ABC Company. We are really happy to have you here!"
    ]))

array([False, False, False])

As we can see all of the predictions on the example messages I wrote are False meaning that the model is working as intended.

**Testing with spam messages**

In [42]:
model.predict(vectorizer.transform(
    [
        "congratulations, you became today's lucky winner",
        "1-month unlimited calls offer Activate now",
        "Free money!! click here"
    ]))

array([ True,  True,  True])

The model recognizes these spam messages as spam.

# Second Method

The first method worked great but I wanted to challenge myself a little bit more and not use CountVectorizer. Instead I will be creating my own (probably less efficient) version.

In [53]:
data = pd.read_csv('spam.csv')
data.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis = 1, inplace = True)
data.rename(columns = {
    'v1': 'spam_tf',
    'v2': 'message_input'}, inplace = True)
data['spam_tf'] = data['spam_tf'].map({'ham': False, 'spam': True})
data.dropna(inplace = True)
data.head(3)

Unnamed: 0,spam_tf,message_input
0,False,"Go until jurong point, crazy.. Available only ..."
1,False,Ok lar... Joking wif u oni...
2,True,Free entry in 2 a wkly comp to win FA Cup fina...


I'll be creating a series of columns where each word is vectorized so I cannot use spam or message as those could be words in the messages I'll be processing. This is why the column names are different for this example.

In [54]:
# Data cleaning steps taken:
# Set it to string dtype
data['message_input'] = data['message_input'].astype(str)

# Remove punctuation
data['message_input'] = data['message_input'].str.replace(r'\W', ' ', regex=True)

# Remove extra spaces
data['message_input'] = data['message_input'].str.replace(r'\s+', ' ', regex=True).str.strip()

data['message_input'] = data['message_input'].str.lower()
#training_set.head(3)

# Split on every word
data['message_input'] = data['message_input'].str.split()

# Embeded loop creates the column names
vocabulary = []
for sms in data['message_input']:
    for word in sms:
        vocabulary.append(word)
        
vocabulary = list(set(vocabulary))

word_counts_per_sms = {unique_word: [0] * len(data['message_input'])
                      for unique_word in vocabulary}

# Fills in the columns with appropriate data
for index, sms in enumerate(data['message_input']):
    for word in sms:
        word_counts_per_sms[word][index] += 1
        
word_counts = pd.DataFrame(word_counts_per_sms)
word_counts.head()

Unnamed: 0,la,aftr,married,msn,via,pie,smokin,letters,gsex,09058091854,...,resume,sender,leading,87121,poboxox36504w45wq,60,logon,09077818151,responding,m39m51
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


**Concatinating all of the word data and the original data for later use.**

In [55]:
data_clean = pd.concat([data, word_counts], axis = 1)
data_clean.head()

Unnamed: 0,spam_tf,message_input,la,aftr,married,msn,via,pie,smokin,letters,...,resume,sender,leading,87121,poboxox36504w45wq,60,logon,09077818151,responding,m39m51
0,False,"[go, until, jurong, point, crazy, available, o...",1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,False,"[ok, lar, joking, wif, u, oni]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,True,"[free, entry, in, 2, a, wkly, comp, to, win, f...",0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,False,"[u, dun, say, so, early, hor, u, c, already, t...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,False,"[nah, i, don, t, think, he, goes, to, usf, he,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [57]:
# Isolating spam and ham messages first using masking
spam_messages = data_clean[data_clean['spam_tf'] == True]
nspam_messages = data_clean[data_clean['spam_tf'] == False]

# P(Spam) and P(NSpam)
p_spam = len(spam_messages) / len(data_clean)
p_nspam = len(nspam_messages) / len(data_clean)

# N_Spam
n_words_per_spam_message = spam_messages['message_input'].apply(len)
n_spam = n_words_per_spam_message.sum()

# N_NSpam
n_words_per_nspam_message = nspam_messages['message_input'].apply(len)
n_nspam = n_words_per_nspam_message.sum()

# N_Vocabulary
n_vocabulary = len(vocabulary)

# Smoothing factor
alpha = 1

# Initiate parameters
parameters_spam = {unique_word: 0 for unique_word in vocabulary}
parameters_nspam = {unique_word: 0 for unique_word in vocabulary}

# Calculate parameters
for word in vocabulary:
    n_word_given_spam = spam_messages[word].sum()  # spam_messages already defined
    p_word_given_spam = (n_word_given_spam + alpha) / (n_spam + alpha * n_vocabulary)
    parameters_spam[word] = p_word_given_spam
    
    n_word_given_nspam = nspam_messages[word].sum()  # NSpam_messages already defined
    p_word_given_nspam = (n_word_given_nspam + alpha) / (n_nspam + alpha * n_vocabulary)
    parameters_nspam[word] = p_word_given_nspam

Basically what is happening above is I am first creating two different dataframes. One with spam data and one with not spam data. These two dataframes were created using masking. I then calculate the probability of spam messages and not spam messages in the whole dataset. Next, I counted the number of words per spam message and non-spam message separetely to pass a parameter. I then calculated the length of the vocabulary using a len() function and also set the smoothing factor. 

The for loop at the bottom calculates the parameters that are going to be used. It essentialy takes the number of words given that it is either spam or not spam and calculates its probability. Alpha becomes very important here. it ensures that all of the 0s are not automatically assumed to have a probability of 0.

In [59]:
nspam_messages.shape

(4825, 8672)

In [60]:
import re

def classify(message_input):
    '''
    message: a string
    '''
    
    message_input = re.sub('\W', ' ', message_input)
    message_input = message_input.lower().split()
    
    p_spam_given_message = p_spam
    p_nspam_given_message = p_nspam
    
    for word in message_input:
        if word in parameters_spam:
            p_spam_given_message *= parameters_spam[word]
            
        if word in parameters_nspam:
            p_nspam_given_message *= parameters_nspam[word]
            
    print('P(Spam|message):', p_spam_given_message)
    print('P(NSpam|message):', p_nspam_given_message)
    
    if p_nspam_given_message > p_spam_given_message:
        print('Label: Not Spam')
    elif p_nspam_given_message < p_spam_given_message:
        print('Label: Spam')
    else:
        print("Equal probabilities please have a human classify this")

This function removes all punctuation from the input message, converts it into lower case, and splits every word. This is done to match our training data. It then calculates the probability using precomputed parameters for each word. Then in the for loop, it multiplies the probability with every word in the input message and calculates the probabilities and spits out its classification.

In [61]:
classify('Click here for your reward!')

P(Spam|message): 2.5910204075339075e-16
P(NSpam|message): 2.019112996047724e-17
Label: Spam


In [62]:
classify('Winner!! This is the secret code to unlock your money: C3421.')

P(Spam|message): 2.217625934299856e-25
P(NSpam|message): 6.381611721617781e-28
Label: Spam


In [63]:
classify("Sounds good, see you there!")

P(Spam|message): 2.1813087641172994e-17
P(NSpam|message): 6.443853566566209e-14
Label: Not Spam


In [64]:
p_spam

0.13406317300789664

In [65]:
classify("I love Ginger.")

P(Spam|message): 1.195059468520307e-07
P(NSpam|message): 8.095405440398258e-05
Label: Not Spam


In [66]:
classify("Your package was not delivered. Click this link!")

P(Spam|message): 9.798729532899553e-28
P(NSpam|message): 3.221900580259293e-28
Label: Spam


In [67]:
classify("Your package was not delivered.")

P(Spam|message): 5.4857308763844595e-18
P(NSpam|message): 4.2521441168058316e-17
Label: Not Spam
