# Machine learning assignment week 5
### Import libraries

In [1]:
import numpy as np
import pandas as pd
import os

### Read the file

In [2]:
# Test for the organisation of the data set
words = "Jacques a dit".split(" ")
print(words)
df = pd.DataFrame(data=[[' '.join(words[1:]), words[0]]],\
                  columns=["message", "Target"])

df.head()

['Jacques', 'a', 'dit']


Unnamed: 0,message,Target
0,a dit,Jacques


In [3]:
data = pd.DataFrame(columns=["Message", "Target"])

counter = 0
with open("messages.txt") as messages_file:
    for line in messages_file:
        words = line.split("\t")
        data.loc[counter] = [words[1], words[0]]
        counter += 1
        
print(data.info())
print(data.head())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5000 entries, 0 to 4999
Data columns (total 2 columns):
Message    5000 non-null object
Target     5000 non-null object
dtypes: object(2)
memory usage: 117.2+ KB
None
                                             Message Target
0                   Yup i've finished c ü there...\n    ham
1             Remember to ask alex about his pizza\n    ham
2                     No da..today also i forgot..\n    ham
3  Ola would get back to you maybe not today but ...    ham
4  Fwiw the reason I'm only around when it's time...    ham


### Separate between train and test set

In [4]:
train_proportion = 0.8


train_data = data[1 - int(data.shape[0] * train_proportion):].reset_index(drop=True)
test_data = data[:1-int(data.shape[0] * train_proportion)].reset_index(drop=True)
print("Train : ", train_data.info())
print(train_data.head())
print("Test : ", test_data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3999 entries, 0 to 3998
Data columns (total 2 columns):
Message    3999 non-null object
Target     3999 non-null object
dtypes: object(2)
memory usage: 62.6+ KB
Train :  None
                                             Message Target
0  Dorothy@kiefer.com (Bank of Granite issues Str...   spam
1  says the  &lt;#&gt;  year old with a man and m...    ham
2                       I will come to ur home now\n    ham
3  Free any day but i finish at 6 on mon n thurs....    ham
4                        Will you be here for food\n    ham
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1001 entries, 0 to 1000
Data columns (total 2 columns):
Message    1001 non-null object
Target     1001 non-null object
dtypes: object(2)
memory usage: 15.7+ KB
Test :  None


### Make dictionnary

In [5]:
from collections import Counter

def make_dictionnary(data, most_commons):
    """
        Data is a pandas DataFrame generated earlier
        most_common represents the number of most common words we want to take
    """
    # Generate a list of words
    word_list = []
    for i in range(data.shape[0]):
        message = data.at[i, "Message"]
        # To add : remove punctuation from message
        words = message.split(" ")
        
        word_list += words
        
    word_dic = Counter(word_list)
    for item in list(word_dic):
        if item.isalpha() == False or len(item) == 1:
            del word_dic[item]
            
    return word_dic.most_common(most_commons)

In [6]:
dic_size = 3000

words_dic = make_dictionnary(train_data, dic_size)

### Parse the data sets
#### Turn Spam and ham into 0s and 1s

In [7]:
def parse_target(data):
    data["Target"] = pd.Categorical(data["Target"]).codes
    return data
    
print(parse_target(train_data).head())

                                             Message  Target
0  Dorothy@kiefer.com (Bank of Granite issues Str...       1
1  says the  &lt;#&gt;  year old with a man and m...       0
2                       I will come to ur home now\n       0
3  Free any day but i finish at 6 on mon n thurs....       0
4                        Will you be here for food\n       0


#### Turn each message into a vector by using the dictionnary

In [8]:
def extract_features(data, words_dic):
    feature_matrix = np.zeros((data.shape[0], len(words_dic)))
    
    messageID = 0
    for line in data["Message"]:
        words = line.split(" ")
        for word in words:
            for i, d in enumerate(words_dic):
                if d[0] == word:
                    feature_matrix[messageID, i] += 1
        messageID += 1

In [9]:
# Apply the function
train_features = extract_features(train_data, words_dic)

#### Apply Naive Bayes

For this part, we will use the Bayes formula.
* For each message, we write p(spam) the probability that this message is a spam, and p(x0) the probability that the word 0 is in that message.
* p(spam | x0) = (p(x0 | spam) * p(spam)) / p(x0).
* We know p(x0) and p(spam) trivially by counting how many instances of each are in our training set, and we can find p(x0 | spam) by looking for each word at how often they appear in spams.

We thus have 3 steps to train a naive Bayes classifier for our spam filter :
* Find p(X), for each word, the probability it is in a message.
* Find p(spam), for each message, the probability it is a spam.
* Find p(X | spam) : for each word, the probability it is in a spam.

##### Find P(spam)

In [10]:
p_spam = train_data["Target"][train_data["Target"] == 1].count() / train_data.shape[0]
print(p_spam)

0.13703425856464116


##### Find p(X)
* For each word, find the probability that it is x amount of time in any given message

In [22]:
# Generate our p_X.
# It is a matrix with each word as a line and the amount of time it appears
# in a message as a column

def count_nb_instances_for_each_word(data, words_dic, max_amount=5):
    # Variables
    retour = np.zeros((len(words_dic), max_amount))
    
    # Loop through each word of the dictionnary
    for i, dic_word in enumerate(words_dic):
        # Loop through each message
        for message in data["Message"]:
            instance_count = 0
            words = message.split(" ")
            
            # Loop tjrough each word of the message
            for word in words:
                if word == dic_word[0]:
                    instance_count += 1
            if instance_count >= max_amount:
                instance_count = max_amount - 1
            retour[i, instance_count] += 1
    
    # Turn the matrice into probabi p
    retour /= np.sum(retour[0, :])
    
    return retour

Apply the above function to find the number of instances of each word in all messages in training_data

In [23]:
p_X = count_nb_instances_for_each_word(train_data, words_dic)

print(p_X.shape)

(3000, 5)


In [24]:
print(p_X[0, :])

[0.70667667 0.22455614 0.05151288 0.01250313 0.00475119]


In [25]:
p_X_spam = count_nb_instances_for_each_word(\
            train_data[train_data["Target"] == 1], words_dic)

print(p_X_spam.shape)
print(p_X_spam[0, :])

(3000, 5)
[0.4379562  0.3850365  0.12591241 0.04562044 0.00547445]


In [21]:
np.sum(p_X[0, :])
print((p_X/ np.sum(p_X[0, :]))[0, :])

[0.70667667 0.22455614 0.05151288 0.01250313 0.00475119]


Il suffit maintenant d'appliquer la formule grâce à np.apply_formula.plz()

In [28]:
epsilon = 0.00000001
# We now have all the elements to give to each word a probability
p_spam_X = np.divide(p_X_spam * p_spam, p_X + epsilon)
print(p_spam_X.shape)
print(p_spam_X[0, :])

(3000, 5)
[0.08492569 0.23496658 0.33495139 0.4999996  0.1578944 ]
