In [None]:
# Mount Google Drive at /content/drive so files stored there are reachable
# from this notebook (the google.colab module is only available inside Colab).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
cd /content/drive/MyDrive/Github/tp_inf3236_groupe17

/content/drive/MyDrive/Github/tp_inf3236_groupe17


In [None]:
import os    
import string
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Download the NLTK 'stopwords' corpus; stopwords.words('english') needs it
# to be present locally before it can be read.
import nltk
nltk.download('stopwords')
  

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

# 1.  Loading the Data

In [None]:
def load_data(dataset_dir="dataset"):
    """Read every ham and spam email into one labelled array.

    Parameters
    ----------
    dataset_dir : str, optional
        Folder containing the ``ham`` and ``spam`` sub-folders. Defaults to
        ``"dataset"`` relative to the current working directory.

    Returns
    -------
    numpy.ndarray
        2-D array of ``[text, label]`` rows: all ham emails first, then all
        spam emails, each group ordered by file name.

    Raises
    ------
    OSError
        If one of the class folders or one of its files cannot be read.
    """
    print("Loading data...")

    data = []
    for label in ("ham", "spam"):
        folder = os.path.join(dataset_dir, label)
        # os.listdir() returns names in arbitrary order; sorting keeps the row
        # order stable so anything downstream that depends on it is reproducible.
        for file_name in sorted(os.listdir(folder)):
            # The context manager closes each handle; the dataset holds
            # thousands of files, so leaked handles add up quickly.
            with open(os.path.join(folder, file_name), "r", encoding='ISO-8859-1') as f:
                data.append([f.read(), label])

    data = np.array(data)

    print("flag 1: loaded data")
    return data


# 2.  Data Pre-processing

In [None]:
def preprocess_data(data, stop_words=None):
    """Normalise the text column of ``data`` in place.

    Each record's text has punctuation stripped, is lowercased, and has its
    stopwords removed; the remaining words are re-joined with single spaces.

    Parameters
    ----------
    data : sequence of [text, label] records
        Records whose first element is the email text. The records are
        modified in place.
    stop_words : iterable of str, optional
        Lowercase words to drop. When omitted, the NLTK English stopword list
        is used.

    Returns
    -------
    The same ``data`` object that was passed in, with cleaned text.
    """
    print("Preprocessing data...")

    if stop_words is None:
        stop_words = stopwords.words('english')
    # A set gives constant-time membership tests for every word of every email.
    stop_words = set(stop_words)
    strip_punctuation = str.maketrans("", "", string.punctuation)

    for record in data:
        # Lowercase BEFORE the stopword test, otherwise capitalised stopwords
        # such as "The" slip through and are kept as "the".
        tokens = record[0].translate(strip_punctuation).lower().split()
        # Joining without a leading space keeps the result no longer than the
        # original text, so a fixed-width numpy string array cannot truncate it.
        record[0] = " ".join(token for token in tokens if token not in stop_words)

    print("flag 2: preprocessed data")
    return data

# 3.  Splitting the Data into Training and Testing Sets

In [None]:
def split_data(data, test_size=0.27, random_state=42):
    """Split the labelled emails into training and test sets.

    Parameters
    ----------
    data : numpy.ndarray
        2-D array whose column 0 holds the email text and column 1 the label.
    test_size : float, optional
        Passed to ``train_test_split`` as ``test_size``. Defaults to 0.27.
    random_state : int, optional
        Passed to ``train_test_split`` as ``random_state``. Defaults to 42.

    Returns
    -------
    tuple
        ``(training_data, test_data, training_labels, test_labels)``.
    """
    print("Splitting data...")

    features = data[:, 0]   # array containing all email text bodies
    labels = data[:, 1]     # array containing all corresponding labels

    training_data, test_data, training_labels, test_labels = train_test_split(
        features, labels, test_size=test_size, random_state=random_state
    )

    print("flag 3: splitted data")
    return training_data, test_data, training_labels, test_labels


# 4.  The KNN Algorithm
get_count() function

In [None]:
def get_count(text):
    """Return a dict mapping each whitespace-separated token of ``text`` to
    the number of times it occurs."""
    tally = {}
    for token in text.split():
        # .get() supplies 0 the first time a token is seen.
        tally[token] = tally.get(token, 0) + 1
    return tally


# 5.  euclidean_difference() function

In [None]:
def euclidean_difference(test_WordCounts, training_WordCounts):
    """Return the Euclidean distance between two word-count dictionaries.

    A word missing from one dictionary is treated as having a count of 0
    there, so the distance is the square root of the summed squared count
    differences over the union of both vocabularies.

    Neither argument is modified. This matters because callers may reuse the
    same training dictionary for many comparisons; deleting shared words from
    it would corrupt every later distance.

    Parameters
    ----------
    test_WordCounts : dict
        Word -> count mapping of the first email.
    training_WordCounts : dict
        Word -> count mapping of the second email.

    Returns
    -------
    float
        The Euclidean distance (0.0 when both mappings are equal or empty).
    """
    total = 0

    # Words in the test email: difference against the training count,
    # or against 0 when the training email lacks the word.
    for word, count in test_WordCounts.items():
        diff = count - training_WordCounts.get(word, 0)
        total += diff * diff

    # Words that appear only in the training email.
    for word, count in training_WordCounts.items():
        if word not in test_WordCounts:
            total += count * count

    return total**0.5


# 6.  get_class() function

In [None]:
def get_class(selected_Kvalues):
    """Majority vote over the nearest neighbours.

    Each neighbour is a sequence whose first element is its label. A label
    equal to "spam" counts as a spam vote; any other label counts as a ham
    vote. Returns "spam" only when spam votes strictly outnumber ham votes,
    so ties and an empty neighbour list both yield "ham".
    """
    is_spam_vote = [neighbour[0] == "spam" for neighbour in selected_Kvalues]
    spam_votes = sum(1 for vote in is_spam_vote if vote)
    ham_votes = len(is_spam_vote) - spam_votes

    return "spam" if spam_votes > ham_votes else "ham"

# 7.  knn_classifier() function

In [None]:
def knn_classifier(training_data, training_labels, test_data, K, tsize):
    """Classify every test email by a K-nearest-neighbours vote.

    Parameters
    ----------
    training_data : sequence of str
        Texts of the training emails.
    training_labels : sequence
        Label of each training email, aligned with ``training_data``.
    test_data : sequence of str
        Texts of the emails to classify.
    K : int
        Number of nearest neighbours that vote. Values larger than the
        training set are capped at its size instead of raising IndexError;
        negative values are treated as 0.
    tsize : int
        Total shown in the "i/tsize done!" progress messages; it does not
        affect how many emails are classified.

    Returns
    -------
    list
        One predicted label per element of ``test_data``, in order.

    Notes
    -----
    The per-email training word counts are built once and reused for every
    test email, so ``euclidean_difference`` must not modify its arguments.
    """
    print("Running KNN Classifier...")

    # Word counts for every training email, computed once up front.
    training_WordCounts = [get_count(training_text) for training_text in training_data]

    # Cap K so that a small training set cannot cause an out-of-range lookup.
    neighbour_count = max(0, min(K, len(training_WordCounts)))

    result = []
    for counter, test_text in enumerate(test_data, start=1):
        test_WordCounts = get_count(test_text)  # word counts for test email

        # [label, distance] for every training email
        similarity = [
            [training_labels[index], euclidean_difference(test_WordCounts, train_counts)]
            for index, train_counts in enumerate(training_WordCounts)
        ]

        # Ascending by distance; the sort is stable, so ties keep training order.
        similarity.sort(key=lambda pair: pair[1])

        # Vote among the nearest neighbours
        result.append(get_class(similarity[:neighbour_count]))

        print(str(counter) + "/" + str(tsize) + " done!")

    return result


# 8.  main() function

In [None]:
def main(K):
    """Run the whole spam/ham pipeline and print an accuracy report.

    Loads and preprocesses the dataset, splits it into training and test
    sets, classifies the test emails with ``knn_classifier`` using ``K``
    neighbours, and prints the sizes, the accuracy, and the number of
    correctly and wrongly classified emails.

    Parameters
    ----------
    K : int
        Number of nearest neighbours used for each prediction.
    """
    data = load_data()
    data = preprocess_data(data)
    training_data, test_data, training_labels, test_labels = split_data(data)

    # sample size of test emails to be tested. Use len(test_data) to test all test_data
    tsize = len(test_data)
    expected = test_labels[:tsize]
    result = knn_classifier(training_data, training_labels, test_data[:tsize], K, tsize)
    accuracy = accuracy_score(expected, result)

    # Count matches directly: truncating accuracy * tsize can be off by one
    # because of floating-point rounding, and the two counts might then not
    # add up to tsize.
    num_correct = sum(1 for truth, guess in zip(expected, result) if truth == guess)
    num_wrong = tsize - num_correct

    print("training data size\t: " + str(len(training_data)))
    print("test data size\t\t: " + str(len(test_data)))
    print("K value\t\t\t\t: " + str(K))
    print("Samples tested\t\t: " + str(tsize))
    print("% accuracy\t\t\t: " + str(accuracy * 100))
    print("Number correct\t\t: " + str(num_correct))
    print("Number wrong\t\t: " + str(num_wrong))


# Run the full pipeline with K = 11 neighbours; an odd K cannot produce a
# tied vote between the two classes.
main(11)

Loading data...
flag 1: loaded data
Preprocessing data...
flag 2: preprocessed data
Splitting data...
flag 3: splitted data
Running KNN Classifier...
1/1593 done!
2/1593 done!
3/1593 done!
4/1593 done!
5/1593 done!
6/1593 done!
7/1593 done!
8/1593 done!
9/1593 done!
10/1593 done!
11/1593 done!
12/1593 done!
13/1593 done!
14/1593 done!
15/1593 done!
16/1593 done!
17/1593 done!
18/1593 done!
19/1593 done!
20/1593 done!
21/1593 done!
22/1593 done!
23/1593 done!
24/1593 done!
25/1593 done!
26/1593 done!
27/1593 done!
28/1593 done!
29/1593 done!
30/1593 done!
31/1593 done!
32/1593 done!
33/1593 done!
34/1593 done!
35/1593 done!
36/1593 done!
37/1593 done!
38/1593 done!
39/1593 done!
40/1593 done!
41/1593 done!
42/1593 done!
43/1593 done!
44/1593 done!
45/1593 done!
46/1593 done!
47/1593 done!
48/1593 done!
49/1593 done!
50/1593 done!
51/1593 done!
52/1593 done!
53/1593 done!
54/1593 done!
55/1593 done!
56/1593 done!
57/1593 done!
58/1593 done!
59/1593 done!
60/1593 done!
61/1593 done!
62/15