Importing Packages

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

Loading dataset

In [2]:
df = pd.read_csv('emails.csv')
df.head()

Unnamed: 0,Email No.,the,to,ect,and,for,of,a,you,hou,...,connevey,jay,valued,lay,infrastructure,military,allowing,ff,dry,Prediction
0,Email 1,0,0,1,0,0,0,2,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Email 2,8,13,24,6,6,2,102,1,27,...,0,0,0,0,0,0,0,1,0,0
2,Email 3,0,0,1,0,0,0,8,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Email 4,0,5,22,0,5,1,51,2,10,...,0,0,0,0,0,0,0,0,0,0
4,Email 5,7,6,17,1,5,2,57,0,9,...,0,0,0,0,0,0,0,1,0,0


Data Preparation/Preprocessing

In [3]:
X = df.drop(columns=['Email No.'])
y = df['Email No.']

Splitting of data into training and testing datasets

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

Training the KNN & SVM models

In [5]:
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
knn_pred = knn.predict(X_test)

svm = SVC(kernel='linear')
svm.fit(X_train, y_train)
svm_pred = svm.predict(X_test)

Model Performance Evaluation

In [9]:
# KNN model evaluation
knn_accuracy = accuracy_score(y_test, knn_pred)
knn_train_score = knn.score(X_train, y_train)
knn_test_score = knn.score(X_test, y_test)
knn_confusion_matrix = confusion_matrix(y_test, knn_pred)
knn_classification_report = classification_report(y_test, knn_pred)
print("K-Nearest Neighbors:")
print("Training Score = ", knn_train_score)
print("Testing Score = ", knn_test_score)
print("Accuracy = ", knn_accuracy)
print("Predicted Values = ", knn_pred)
print("Confusion Matrix = ", knn_confusion_matrix)
print("Classification Report = ", knn_classification_report)

K-Nearest Neighbors:
Training Score =  0.1723756906077348
Testing Score =  0.0
Accuracy =  0.0
Predicted Values =  ['Email 2721' 'Email 1928' 'Email 147' ... 'Email 1098' 'Email 2936'
 'Email 1910']
Confusion Matrix =  [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
Classification Report =                precision    recall  f1-score   support

     Email 1       0.00      0.00      0.00       0.0
    Email 10       0.00      0.00      0.00       0.0
   Email 100       0.00      0.00      0.00       1.0
  Email 1000       0.00      0.00      0.00       1.0
  Email 1001       0.00      0.00      0.00       0.0
  Email 1002       0.00      0.00      0.00       1.0
  Email 1003       0.00      0.00      0.00       1.0
  Email 1005       0.00      0.00      0.00       0.0
  Email 1008       0.00      0.00      0.00       0.0
  Email 1009       0.00      0.00      0.00       0.0
   Email 101       0.00      0.00      0.

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [7]:
# SVM model evaluation
svm_accuracy = accuracy_score(y_test, svm_pred)
svm_train_score = svm.score(X_train, y_train)
svm_test_score = svm.score(X_test, y_test)
svm_confusion_matrix = confusion_matrix(y_test, svm_pred)
svm_classification_report = classification_report(y_test, svm_pred)
print("Support Vector Machine:")
print("Training Score = ", svm_train_score)
print("Testing Score = ", svm_test_score)
print("Accuracy = ", svm_accuracy)
print("Predicted Values = ", svm_pred)
print("Confusion Matrix = ", svm_confusion_matrix)
print("Classification Report = ", svm_classification_report)

Support Vector Machine:
Training Score =  0.9077348066298343
Testing Score =  0.0
Accuracy =  0.0
Predicted Values =  ['Email 4638' 'Email 1990' 'Email 1656' ... 'Email 1657' 'Email 3946'
 'Email 4832']
Confusion Matrix =  [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
Classification Report =                precision    recall  f1-score   support

     Email 1       0.00      0.00      0.00       0.0
   Email 100       0.00      0.00      0.00       1.0
  Email 1000       0.00      0.00      0.00       1.0
  Email 1002       0.00      0.00      0.00       1.0
  Email 1003       0.00      0.00      0.00       1.0
  Email 1005       0.00      0.00      0.00       0.0
   Email 101       0.00      0.00      0.00       1.0
  Email 1011       0.00      0.00      0.00       1.0
  Email 1016       0.00      0.00      0.00       0.0
  Email 1019       0.00      0.00      0.00       1.0
  Email 1021       0.00      0.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
