In [172]:
import os
import string
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np
import pandas as pd
from math import sqrt
from collections import Counter 


In [173]:
class KNN:
    def __init__(self, K, random):
        self.K = K
        self.random = random
    
    def load_data(self):
        df = pd.read_csv("./spam.csv", encoding='latin-1')
        return df

    def clean_data(self, df):
        clean = df.drop(['Unnamed: 2'], axis=1)
        clean = clean.drop(['Unnamed: 3'], axis = 1)
        clean = clean.drop(['Unnamed: 4'], axis = 1)
        clean.rename(columns = {'v1' : 'Category', 'v2' : 'Message'}, inplace=True)
        clean["Message"] = clean["Message"].str.replace('[^a-zA-Z]',' ', regex = True)
        clean["Message"] = clean["Message"].str.lower()
        return clean
    
    def split_data(self, clean):
        train_data, test_data = train_test_split(clean, test_size=0.25, random_state = self.random)
        return train_data, test_data

    def Euclidean_diff(self, train_message, test_dict):
        diff = 0 
        train_dict = Counter(train_message)
        for word in test_dict:
            if word in train_dict:
                diff += (test_dict[word]-train_dict[word]) ** 2
                del train_dict[word]
            else:
                diff += test_dict[word] ** 2
        for word in train_dict:
            diff += train_dict[word] ** 2
        return sqrt(diff)

    def spam_or_ham(self, Kvalues):
        spam_num = 0
        ham_num = 0
        for category in Kvalues:
            if category[0] == "spam":
                spam_num += 1
            else:
                ham_num += 1
        if spam_num > ham_num:
            return "spam"
        else:
            return "ham"

    def KNN_classifier(self, train_data, test_data, K):
        res = []
        for test_row in test_data.itertuples():
            sim = []
            test_dict = Counter(test_row.Message)
            for training_rows in train_data.itertuples():
                diff = self.Euclidean_diff(training_rows.Message, test_dict)
                sim.append([training_rows.Category, diff])
            sim = sorted(sim, key = lambda i: i[1])
            Kvalues = [sim[i] for i in range(K)]
            res.append(self.spam_or_ham(Kvalues))
        return res

    def main(self):
        df = self.load_data()
        clean = self.clean_data(df)
        train_data, test_data = self.split_data(clean)
        train_data["Message"] = train_data["Message"].str.split()
        res = self.KNN_classifier(train_data, test_data, self.K)
        # Order test result_actual value
        spam_spam = 0
        spam_ham = 0
        ham_ham = 0
        ham_spam = 0
        count = 0
        for test_row in test_data.itertuples():
            if test_row.Category == res[count] and test_row.Category == "spam":
                spam_spam += 1
            elif test_row.Category != res[count] and test_row.Category == "spam":
                spam_ham += 1
            elif test_row.Category == res[count] and test_row.Category == "ham":
                ham_ham += 1
            else:
                ham_spam += 1
        print("Spam matched by spam = " + str(spam_spam))
        print("Ham matched by ham = " + str(ham_ham))
        print("Ham matched by spam = " + str(spam_ham))
        print("Spam matched by ham = "+str(ham_spam))
        accuracy = (spam_spam + ham_ham)/(spam_spam + ham_ham + spam_ham + ham_spam)
        accuracy *= 100
        accuracy = round(accuracy, 2)
        print("Accuracy: "+str(accuracy)+"%")



        


In [174]:
instance = KNN(11, 21)
instance.main()

Spam matched by spam = 0
Ham matched by ham = 1212
Ham matched by spam = 181
Spam matched by ham = 0
Accuracy: 87.01%
