In [12]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

In [14]:
spam_email_data = pd.read_csv("./spam.csv", encoding='latin-1')
spam_email_data = spam_email_data[['v1', 'v2']]

In [None]:
def clean_text(text):
    text = text.lower() 
    text = re.sub(r'\S+@\S+', 'emailaddress', text) 
    text = re.sub(r'http\S+|www\S+', 'url', text)  
    text = re.sub(r'\d+', 'number', text)  
    text = re.sub(r'\W', ' ', text)  
    text = re.sub(r'\s+', ' ', text)  
    return text

In [16]:
X = spam_email_data['v2'] 
Y=  spam_email_data['v1']
spam_email_data['v2'] = spam_email_data['v2'].apply(clean_text)

X_train, X_test, y_train, y_test = train_test_split(X,Y,test_size=0.3,random_state=42)

vectorize = TfidfVectorizer(stop_words='english', max_features=3000)
X_train_vec = vectorize.fit_transform(X_train)
X_test_vec = vectorize.transform(X_test)

X_train_vec = normalize(X_train_vec)
X_test_vec = normalize(X_test_vec)



In [17]:

def cosine_distance(a, b):
    return 1 - np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

def knn_predict_weighted(X_train, y_train, X_test, k=5, epsilon=1e-5):
   
    y_pred = []
    for i in range(X_test.shape[0]):
        # calculating the distance between X_test and X_train
        distances = [cosine_distance(X_test[i], X_train[j]) for j in range(X_train.shape[0])]
        
        #finding k nearest entry
        k_indices = np.argsort(distances)[:k]
        k_labels = [y_train[j] for j in k_indices]
        k_distances = [distances[j] for j in k_indices]
        
        #calculating weights
        weights = [1 / (d + epsilon) for d in k_distances]
        
        #suming weights based on classes
        class_weight_sum = {}
        for label, w in zip(k_labels, weights):
            if label in class_weight_sum:
                class_weight_sum[label] += w
            else:
                class_weight_sum[label] = w
        
        #choosing a class with maximum weight 
        pred_label = max(class_weight_sum, key=class_weight_sum.get)
        y_pred.append(pred_label)
        
    return np.array(y_pred)

In [None]:
y_pred = knn_predict_weighted(X_train_vec.toarray(), y_train.values, X_test_vec.toarray(), k=5)
from sklearn.metrics import accuracy_score
print("Accuracy:", accuracy_score(y_test, y_pred))

  return 1 - np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))


Accuracy: 0.9545454545454546


In [20]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[1415   38]
 [  38  181]]


In [23]:
new_email = "hello elnaz,it is sara,call me please"
new_email_vec = vectorize.transform([new_email])
new_email_vec = normalize(new_email_vec)  
new_email_vec = new_email_vec.toarray()  
print(knn_predict_weighted(X_train_vec.toarray(), y_train.values, new_email_vec, k=5))

['ham']


  return 1 - np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
