In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix


In [3]:
# Load the e-mail dataset from 'emails.csv' (path is relative to the
# notebook's working directory) into a DataFrame.
df = pd.read_csv('emails.csv')
# Bare last expression: the notebook renders the frame as this cell's output.
# Judging by the recorded output, each row is one e-mail, the columns are
# words with integer values, and `Prediction` is the label column.
df

Unnamed: 0,Email No.,the,to,ect,and,for,of,a,you,hou,...,connevey,jay,valued,lay,infrastructure,military,allowing,ff,dry,Prediction
0,Email 1,0,0,1,0,0,0,2,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Email 2,8,13,24,6,6,2,102,1,27,...,0,0,0,0,0,0,0,1,0,0
2,Email 3,0,0,1,0,0,0,8,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Email 4,0,5,22,0,5,1,51,2,10,...,0,0,0,0,0,0,0,0,0,0
4,Email 5,7,6,17,1,5,2,57,0,9,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5167,Email 5168,2,2,2,3,0,0,32,0,0,...,0,0,0,0,0,0,0,0,0,0
5168,Email 5169,35,27,11,2,6,5,151,4,3,...,0,0,0,0,0,0,0,1,0,0
5169,Email 5170,0,0,1,1,0,0,11,0,0,...,0,0,0,0,0,0,0,0,0,1
5170,Email 5171,2,7,1,0,2,1,28,2,0,...,0,0,0,0,0,0,0,1,0,1


In [4]:
# Print a structural summary of the frame: index range, column count,
# dtype breakdown and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5172 entries, 0 to 5171
Columns: 3002 entries, Email No. to Prediction
dtypes: int64(3001), object(1)
memory usage: 118.5+ MB


In [5]:
# Count missing values per column.
# NOTE(review): with ~3000 columns the displayed Series is truncated, so only
# the first and last few columns are visibly confirmed to be zero; a single
# total such as `df.isnull().sum().sum()` would verify every column at once.
df.isnull().sum()

Email No.     0
the           0
to            0
ect           0
and           0
             ..
military      0
allowing      0
ff            0
dry           0
Prediction    0
Length: 3002, dtype: int64

In [8]:
# Feature matrix: every column except the label and the e-mail identifier.
X = df.drop(columns=['Prediction', 'Email No.'])
# Target vector: the `Prediction` label column.
y = df['Prediction']


In [9]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# shuffle (and therefore every score below) reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [11]:
# k-nearest-neighbours classifier with k = 5, trained on the training split.
# fit() returns the estimator itself, so `knn` is the fitted model.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
# Predicted labels for the held-out test rows.
y_pred_knn = knn.predict(X_test)

In [12]:
# Support vector classifier with a linear kernel, trained on the training split.
# fit() returns the estimator itself, so `svm` is the fitted model.
svm = SVC(kernel='linear').fit(X_train, y_train)
# Predicted labels for the held-out test rows.
y_pred_svm = svm.predict(X_test)

In [14]:
# Score both classifiers against the held-out labels with the same four
# metrics. Each metric column holds [KNN score, SVM score], matching the
# order of the "Model" column.
metrics = {
    "Model": ["KNN", "SVM"],
    **{
        metric_name: [score_fn(y_test, y_hat) for y_hat in (y_pred_knn, y_pred_svm)]
        for metric_name, score_fn in (
            ("Accuracy", accuracy_score),
            ("Precision", precision_score),
            ("Recall", recall_score),
            ("F1 Score", f1_score),
        )
    },
}
# One row per model; print() emits the table as plain text.
results = pd.DataFrame(metrics)
print(results)

  Model  Accuracy  Precision    Recall  F1 Score
0   KNN  0.862802   0.725146  0.837838  0.777429
1   SVM  0.959420   0.920530  0.939189  0.929766


In [15]:
# Confusion matrix for KNN: rows are the true labels, columns the predicted
# labels, so the layout is [[TN, FP], [FN, TP]].
cm_knn = confusion_matrix(y_test, y_pred_knn)
print("Confusion Matrix (KNN):\n", cm_knn)


Confusion Matrix (KNN):
 [[645  94]
 [ 48 248]]


In [None]:
# True Negatives (TN) = 645: These are non-spam emails correctly identified as non-spam.
# False Positives (FP) = 94: These are non-spam emails incorrectly classified as spam.
# False Negatives (FN) = 48: These are spam emails incorrectly classified as non-spam.
# True Positives (TP) = 248: These are spam emails correctly identified as spam.

In [16]:
# Confusion matrix for SVM: rows are the true labels, columns the predicted
# labels, so the layout is [[TN, FP], [FN, TP]].
cm_svm = confusion_matrix(y_test, y_pred_svm)
print("Confusion Matrix (SVM):\n", cm_svm)

Confusion Matrix (SVM):
 [[715  24]
 [ 18 278]]


In [None]:
# True Negatives (TN) = 715: Non-spam emails correctly classified as non-spam.
# False Positives (FP) = 24: Non-spam emails incorrectly classified as spam.
# False Negatives (FN) = 18: Spam emails incorrectly classified as non-spam.
# True Positives (TP) = 278: Spam emails correctly classified as spam.

In [None]:
# KNN produces more false positives (94 vs. 24) and more false negatives (48 vs. 18) than SVM, so it misclassifies more emails overall (142 vs. 42 of the 1,035 test emails).
# SVM performs better, with fewer misclassifications in both categories.