In [241]:
%reset

Once deleted, variables cannot be recovered. Proceed (y/[n])? y


# Imports

In [242]:
import pandas as pd
from sklearn.model_selection import train_test_split
import re
import string
pd.options.mode.chained_assignment = None  # default='warn'
from nltk.corpus import stopwords
import nltk

In [243]:
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Dataset recovery

In [244]:
dataset = pd.read_csv("H:/Documents/Cours/ING3/NLP/TD2/SMSSpamCollection", sep='\t', header=None, names=['Type', 'message_content'])
dataset.head()

Unnamed: 0,Type,message_content
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


# Dataset spliting

In [245]:
# Split dataset into training set and test set
X_train, X_test, y_train, y_test= train_test_split(dataset["message_content"], dataset["Type"], test_size=0.3,random_state=109)

# Data preprocessing

In [246]:
# After cleaning
X_train = X_train.str.replace('\W', ' ')
X_train = X_train.str.replace('\d+', ' ')
X_train = X_train.str.lower()
X_train = X_train.str.translate(str.maketrans('','',string.punctuation))
X_train.head(3)

2954    urgent  your mobile was awarded a      bonus c...
209     you please give us connection today itself bef...
2078    hey hun onbus goin   meet him  he wants  go ou...
Name: message_content, dtype: object

# Vocabulary preparation

In [247]:
X_train = X_train.str.split()

vocabulary = []
for message in X_train:
    for word in message:
        vocabulary.append(word)

vocabulary = list(set(vocabulary))

# Word frequency calculation

In [248]:
words_frequency_message = {unique_word: [0] * len(X_train) for unique_word in vocabulary}

for index, message in enumerate(X_train):
    for word in message:
        words_frequency_message[word][index] += 1

In [249]:
words_frequency = pd.DataFrame(words_frequency_message, index=X_train.index)
words_frequency.head()

Unnamed: 0,pokkiri,contacted,vital,ppm,askin,jungle,ag,pale,brekkie,energy,...,theres,posted,dollar,stars,basically,behave,cantdo,catching,checkmate,dungerees
2954,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
209,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2078,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3085,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1824,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [250]:
full_train = pd.concat([X_train, y_train, words_frequency], axis=1)
full_train.head()

Unnamed: 0,message_content,Type,pokkiri,contacted,vital,ppm,askin,jungle,ag,pale,...,theres,posted,dollar,stars,basically,behave,cantdo,catching,checkmate,dungerees
2954,"[urgent, your, mobile, was, awarded, a, bonus,...",spam,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
209,"[you, please, give, us, connection, today, its...",ham,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2078,"[hey, hun, onbus, goin, meet, him, he, wants, ...",ham,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3085,"[ok, lor, i, ned, go, toa, payoh, a, while, re...",ham,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1824,"[same, as, u, dun, wan, y, u, dun, like, me, a...",ham,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Naive Bayes values

In [251]:
spam_messages = full_train[full_train['Type'] == 'spam']
ham_messages = full_train[full_train['Type'] == 'ham']

p_spam = len(spam_messages) / len(full_train)
p_ham = len(ham_messages) / len(full_train)

n_words_per_spam_message = spam_messages['message_content'].apply(len)
n_spam = n_words_per_spam_message.sum()

n_words_per_ham_message = ham_messages['message_content'].apply(len)
n_ham = n_words_per_ham_message.sum()

n_vocabulary = len(vocabulary)

alpha = 1

In [252]:
parameters_spam = {unique_word:0 for unique_word in vocabulary}
parameters_ham = {unique_word:0 for unique_word in vocabulary}

for word in vocabulary:
    n_word_given_spam = spam_messages[word].sum()
    p_word_given_spam = (n_word_given_spam + alpha) / (n_spam + alpha*n_vocabulary)
    parameters_spam[word] = p_word_given_spam

    n_word_given_ham = ham_messages[word].sum()
    p_word_given_ham = (n_word_given_ham + alpha) / (n_ham + alpha*n_vocabulary)
    parameters_ham[word] = p_word_given_ham

# Classificator

In [253]:
def classify_test_set(message):
    message = re.sub('\W', ' ', message)
    message = message.lower().split()

    p_spam_given_message = p_spam
    p_ham_given_message = p_ham

    for word in message:
        if word in parameters_spam:
            p_spam_given_message *= parameters_spam[word]

        if word in parameters_ham:
            p_ham_given_message *= parameters_ham[word]

    if p_ham_given_message > p_spam_given_message:
        return 'ham'
    elif p_spam_given_message > p_ham_given_message:
        return 'spam'
    else:
        return 'needs verification'

In [254]:
test_set = X_test.apply(classify_test_set)
test_set_clean = pd.concat([y_test, X_test, test_set], axis=1)
test_set_clean.columns = ['Label', 'SMS', 'prediction']

test_set_clean.head()

Unnamed: 0,Label,SMS,prediction
3368,ham,Hey what are you doing. Y no reply pa..,ham
3261,ham,I'm always looking for an excuse to be in the ...,ham
4216,ham,No dear i was sleeping :-P,ham
1407,spam,"URGENT, IMPORTANT INFORMATION FOR O2 USER. TOD...",spam
1766,ham,Hi this is yijue... It's regarding the 3230 te...,ham


# Classificator test

In [255]:
true_positive = 0
false_negative = 0
true_negative = 0
false_positive = 0
for i in test_set_clean.iterrows() :
    row = i[1]
    if row["Label"] == "ham" and row["prediction"] == "ham":
        true_positive += 1
    elif row["Label"] == "spam" and row["prediction"] == "spam":
        true_negative += 1
    elif row["Label"] == "ham" and row["prediction"] == "spam":
        false_negative += 1
    else :
        false_positive +=1
print("ham that was predicted ham (TP) :", true_positive)
print("spam that was predicted spam (TN) :", true_negative)
print("ham that was predicted spam (FN) :", false_negative)
print("spam that was predicted ham (FP) :", false_positive, end="\n\n")
print("Total correct predictions :", true_positive+false_negative)
print("Total incorrect predictions :", true_negative+false_positive)
print("Accuracy :", (true_positive+true_negative)/len(test_set_clean))
print("Precision :", true_positive/(true_positive+false_positive))
print("Recall :", true_positive/(true_positive+false_negative))
print("F1 score :", (2*true_positive)/(2*true_positive+false_positive+false_negative))

ham that was predicted ham (TP) : 1439
spam that was predicted spam (TN) : 203
ham that was predicted spam (FN) : 8
spam that was predicted ham (FP) : 22

Total correct predictions : 1447
Total incorrect predictions : 225
Accuracy : 0.9820574162679426
Precision : 0.9849418206707734
Recall : 0.9944713199723566
F1 score : 0.9896836313617606
