# Machine learning assignment week 5
### Import libraries

In [1]:
import numpy as np
import pandas as pd
import os

### Read the file

In [2]:
# Test for the organisation of the data set
words = "Jacques a dit".split(" ")
print(words)
df = pd.DataFrame(data=[[' '.join(words[1:]), words[0]]],\
                  columns=["message", "Target"])

df.head()

['Jacques', 'a', 'dit']


Unnamed: 0,message,Target
0,a dit,Jacques


In [3]:
data = pd.DataFrame(columns=["Message", "Target"])

counter = 0
with open("messages.txt") as messages_file:
    for line in messages_file:
        words = line.split("\t")
        data.loc[counter] = [words[1], words[0]]
        counter += 1
        
print(data.info())
print(data.head())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5000 entries, 0 to 4999
Data columns (total 2 columns):
Message    5000 non-null object
Target     5000 non-null object
dtypes: object(2)
memory usage: 117.2+ KB
None
                                             Message Target
0                   Yup i've finished c ü there...\n    ham
1             Remember to ask alex about his pizza\n    ham
2                     No da..today also i forgot..\n    ham
3  Ola would get back to you maybe not today but ...    ham
4  Fwiw the reason I'm only around when it's time...    ham


### Separate between train and test set

In [4]:
train_proportion = 0.8


train_data = data[1 - int(data.shape[0] * train_proportion):].reset_index(drop=True)
test_data = data[:1-int(data.shape[0] * train_proportion)].reset_index(drop=True)
print("Train : ", train_data.info())
print(train_data.head())
print("Test : ", test_data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3999 entries, 0 to 3998
Data columns (total 2 columns):
Message    3999 non-null object
Target     3999 non-null object
dtypes: object(2)
memory usage: 62.6+ KB
Train :  None
                                             Message Target
0  Dorothy@kiefer.com (Bank of Granite issues Str...   spam
1  says the  &lt;#&gt;  year old with a man and m...    ham
2                       I will come to ur home now\n    ham
3  Free any day but i finish at 6 on mon n thurs....    ham
4                        Will you be here for food\n    ham
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1001 entries, 0 to 1000
Data columns (total 2 columns):
Message    1001 non-null object
Target     1001 non-null object
dtypes: object(2)
memory usage: 15.7+ KB
Test :  None


### Make dictionnary

In [5]:
from collections import Counter

def make_dictionnary(data, most_commons):
    """
        Data is a pandas DataFrame generated earlier
        most_common represents the number of most common words we want to take
    """
    # Generate a list of words
    word_list = []
    for i in range(data.shape[0]):
        message = data.at[i, "Message"]
        # To add : remove punctuation from message
        words = message.split(" ")
        
        word_list += words
        
    word_dic = Counter(word_list)
    for item in list(word_dic):
        if item.isalpha() == False or len(item) == 1:
            del word_dic[item]
            
    return word_dic.most_common(most_commons)

In [6]:
dic_size = 3000

words_dic = make_dictionnary(train_data, dic_size)

### Parse the data sets
#### Turn Spam and ham into 0s and 1s

In [7]:
def parse_target(data):
    data["Target"] = pd.Categorical(data["Target"]).codes
    return data
    
print(parse_target(train_data).head())
parse_target(test_data)

                                             Message  Target
0  Dorothy@kiefer.com (Bank of Granite issues Str...       1
1  says the  &lt;#&gt;  year old with a man and m...       0
2                       I will come to ur home now\n       0
3  Free any day but i finish at 6 on mon n thurs....       0
4                        Will you be here for food\n       0


Unnamed: 0,Message,Target
0,Yup i've finished c ü there...\n,0
1,Remember to ask alex about his pizza\n,0
2,No da..today also i forgot..\n,0
3,Ola would get back to you maybe not today but ...,0
4,Fwiw the reason I'm only around when it's time...,0
5,"Hello, my boytoy! I made it home and my consta...",0
6,Congrats kano..whr s the treat maga?\n,0
7,Who u talking about?\n,0
8,Yup...\n,0
9,Ok...\n,0


#### Turn each message into a vector by using the dictionnary

In [8]:
def extract_features(data, words_dic, max_amount=5):
    feature_matrix = np.zeros((data.shape[0], len(words_dic)), dtype=int)
    
    messageID = 0
    for line in data["Message"]:
        words = line.split(" ")
        for word in words:
            for i, d in enumerate(words_dic):
                if d[0] == word:
                    feature_matrix[messageID, i] += 1
                    if feature_matrix[messageID, i] >= max_amount:
                        feature_matrix[messageID, i] = max_amount - 1
        messageID += 1
        
    return feature_matrix

In [9]:
# Apply the function
train_features = extract_features(train_data, words_dic)
test_features = extract_features(test_data, words_dic)

print(train_features.shape)
print(test_features.shape)

(3999, 3000)
(1001, 3000)


#### Apply Naive Bayes

For this part, we will use the Bayes formula.
* For each message, we write p(spam) the probability that this message is a spam, and p(x0) the probability that the word 0 is in that message.
* p(spam | x0) = (p(x0 | spam) * p(spam)) / p(x0).
* We know p(x0) and p(spam) trivially by counting how many instances of each are in our training set, and we can find p(x0 | spam) by looking for each word at how often they appear in spams.

We thus have 3 steps to train a naive Bayes classifier for our spam filter :
* Find p(X), for each word, the probability it is in a message.
* Find p(spam), for each message, the probability it is a spam.
* Find p(X | spam) : for each word, the probability it is in a spam.

##### Find P(spam)

In [10]:
p_spam = train_data["Target"][train_data["Target"] == 1].count() / train_data.shape[0]
print(p_spam)

0.13703425856464116


##### Find p(X)
* For each word, find the probability that it is x amount of time in any given message

In [11]:
# Generate our p_X.
# It is a matrix with each word as a line and the amount of time it appears
# in a message as a column
def count_nb_instances_for_one_word(column):
    """
    
    """
    return_value = np.zeros(5)
    for number in column:
        return_value[number] += 1
        
    return return_value

# This function should have run using extract features
def count_nb_instances_for_each_word(feature_matrix, max_amount=5):
    
    # Loop through each column of the messages
    retour = np.apply_along_axis(count_nb_instances_for_one_word, axis=0, arr=feature_matrix)
    
    retour = np.transpose(retour)
    
    # Turn the matrice into probabi p
    retour /= np.sum(retour[0, :])
    
    return retour

Apply the above function to find the number of instances of each word in all messages in training_data

In [12]:
p_X = count_nb_instances_for_each_word(train_features)

print(p_X.shape)

(3000, 5)


In [13]:
print(p_X[0, :])

[0.70667667 0.22455614 0.05151288 0.01250313 0.00475119]


In [14]:
print(train_features[train_data[train_data["Target"] == 1].index])

[[0 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]
 [0 0 2 ... 0 0 0]
 ...
 [1 0 0 ... 0 0 0]
 [3 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]]


In [15]:
p_X_spam = count_nb_instances_for_each_word(\
            train_features[train_data[train_data["Target"] == 1].index])

print(p_X_spam.shape)
print(p_X_spam[0, :])

(3000, 5)
[0.4379562  0.3850365  0.12591241 0.04562044 0.00547445]


All we need to do now is to apply the formula thanks to np.apply_formula.please().no_really_I_dont_know_how_to_do_it(test_data)

In [16]:
""" This was our first version
epsilon = 0.00000001
# We now have all the elements to give to each word a probability
p_spam_X = np.divide(p_X_spam * p_spam, p_X + epsilon)
print(p_spam_X.shape)
print(p_spam_X[0, :])
"""
p_spam_X = p_X_spam * p_spam

p_ham = (1 - p_spam)

p_X_ham = count_nb_instances_for_each_word(\
            train_features[train_data[train_data["Target"] == 0].index])

p_ham_X = p_X_ham * p_ham

In [17]:
print(p_spam_X.shape)

(3000, 5)


### We make the function to combine the probabilities of the words
* By using the argmax

In [18]:
def one_message_argmax(message, p_spam_X, p_ham_X):
    proba_spam = 1
    proba_ham = 1
    
    for i, word in enumerate(message):
        if word > 0:
            proba_spam *= p_spam_X[i, word]
            proba_ham *= p_ham_X[i, word]
            
    if proba_spam > proba_ham:
        return 1
    else:
        return 0

def full_argmax(feature_matrix, p_spam_X, p_ham_X):
    """
    feature_matrix is a numpy matrix of shape (nb_examples, 3000)
    """
    return np.apply_along_axis(one_message_argmax, 1, feature_matrix, p_spam_X, p_ham_X)

In [19]:
# Apply the functions
results = full_argmax(test_features, p_spam_X, p_ham_X)

print(results.shape)
print(results[:4])

(1001,)
[0 0 0 0]


### Try our model on the test_data
* For each word, we have found the probability that its presence indicates the message is a spam.

In [20]:
print(test_data.head())

                                             Message  Target
0                   Yup i've finished c ü there...\n       0
1             Remember to ask alex about his pizza\n       0
2                     No da..today also i forgot..\n       0
3  Ola would get back to you maybe not today but ...       0
4  Fwiw the reason I'm only around when it's time...       0


In [21]:
test_data = parse_target(test_data)
print(test_data.head())

                                             Message  Target
0                   Yup i've finished c ü there...\n       0
1             Remember to ask alex about his pizza\n       0
2                     No da..today also i forgot..\n       0
3  Ola would get back to you maybe not today but ...       0
4  Fwiw the reason I'm only around when it's time...       0


In [22]:
"""
# Loop through test_features, and find the mean of probabilities
score_probas = np.zeros(test_data.shape[0])
for i, line in enumerate(test_features):

    word_count = 0
    for j, apparition in enumerate(line):
        if apparition > 0:
            # + 1 because multiple iteration of the same word
            # is already taken in consideration in p_spam_X
            word_count += 1
            if apparition >= p_spam_X.shape[1]:
                apparition = p_spam_X.shape[1] - 1
                
            score_probas[i] += p_spam_X[j, int(apparition)]
            
    score_probas[i] /= word_count
    
print(score_probas.shape)
print(score_probas[0])"""

'\n# Loop through test_features, and find the mean of probabilities\nscore_probas = np.zeros(test_data.shape[0])\nfor i, line in enumerate(test_features):\n\n    word_count = 0\n    for j, apparition in enumerate(line):\n        if apparition > 0:\n            # + 1 because multiple iteration of the same word\n            # is already taken in consideration in p_spam_X\n            word_count += 1\n            if apparition >= p_spam_X.shape[1]:\n                apparition = p_spam_X.shape[1] - 1\n                \n            score_probas[i] += p_spam_X[j, int(apparition)]\n            \n    score_probas[i] /= word_count\n    \nprint(score_probas.shape)\nprint(score_probas[0])'

In [23]:
print(results)

[0 0 0 ... 0 0 0]


In [24]:
test_data['Result'] = results

In [25]:
print(test_data.head())
print(test_data.info())

                                             Message  Target  Result
0                   Yup i've finished c ü there...\n       0       0
1             Remember to ask alex about his pizza\n       0       0
2                     No da..today also i forgot..\n       0       0
3  Ola would get back to you maybe not today but ...       0       0
4  Fwiw the reason I'm only around when it's time...       0       0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1001 entries, 0 to 1000
Data columns (total 3 columns):
Message    1001 non-null object
Target     1001 non-null int8
Result     1001 non-null int64
dtypes: int64(1), int8(1), object(1)
memory usage: 16.7+ KB
None


In [26]:
print(test_data[test_data['Target'] == 1])

                                               Message  Target  Result
14   U have won a nokia 6230 plus a free digital ca...       1       1
25   FREE entry into our £250 weekly comp just send...       1       1
48   Text82228>> Get more ringtones, logos and game...       1       1
56   FreeMSG You have been awarded a FREE mini DIGI...       1       1
58   This message is brought to you by GMW Ltd. and...       1       0
74   Congrats 2 mobile 3G Videophones R yours. call...       1       1
75   Your next amazing xxx PICSFREE1 video will be ...       1       0
81   U are subscribed to the best Mobile Content Se...       1       0
83   3 FREE TAROT TEXTS! Find out about your love l...       1       0
90   Join the UK's horniest Dogging service and u c...       1       0
96   Sunshine Quiz Wkly Q! Win a top Sony DVD playe...       1       1
112  Knock Knock Txt whose there to 80082 to enter ...       1       1
120  <Forwarded from 21870000>Hi - this is your Mai...       1       1
126  F

### Confusion matrix

In [27]:
def confusion_matrix(data):
    """
    data is DataFrame containing the columns Target and Result
    """
    confusion_matrix = np.zeros((2, 2))
    
    confusion_matrix[0, 0] = data\
    [data['Target'] == 1][data['Result'] == 1.0]['Target'].count()
    
    confusion_matrix[1, 1] = data\
    [data['Target'] == 0][data['Result'] == 0.0]['Target'].count()
    
    confusion_matrix[1, 0] = data\
    [data['Target'] == 1][data['Result'] == 0.0]['Target'].count()
    
    confusion_matrix[0, 1] = data\
    [data['Target'] == 0][data['Result'] == 1.0]['Target'].count()
    
    print("Accuracy : ",\
          (confusion_matrix[0, 0] + confusion_matrix[1, 1])/data.shape[0])
    print("Precision :",\
    (confusion_matrix[0, 0])/np.sum(confusion_matrix[0, :]))
    print("Recall :",\
    (confusion_matrix[0, 0])/np.sum(confusion_matrix[:, 0]))
    
    return confusion_matrix

confusion_matrix = confusion_matrix(test_data)

print(confusion_matrix)

Accuracy :  0.9500499500499501
Precision : 0.9868421052631579
Recall : 0.6048387096774194
[[ 75.   1.]
 [ 49. 876.]]


  
  # This is added back by InteractiveShellApp.init_path()
  
