In [8]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [9]:
def load_dataset(directory):
    texts = []
    labels = []
    for filename in os.listdir(directory):
        if filename.endswith(".txt"):
            filepath = os.path.join(directory, filename)
            with open(filepath, 'r', encoding='utf-8', errors='ignore') as file:
                text = file.read()
                texts.append(text)
                content = file.read()
                labels.append(directory.split('/')[-1])
    return pd.DataFrame({'content': texts, 'label': labels})

In [10]:
spam_df = load_dataset('C:/Users/user/Documents/enron6/spam')
ham_df = load_dataset('C:/Users/user/Documents/enron6/ham')

In [11]:
# Concatenate spam and ham datasets
enron_spam_df = pd.concat([spam_df, ham_df], ignore_index=True)

# Explore the dataset
print(enron_spam_df.head())

                                             content label
0  Subject: advs\ngreetings ,\ni am benedicta lin...  spam
1  Subject: whats new in summer ? bawled\ncarolyn...  spam
2  Subject: \nh $ ello\ndea 54 r home owner ,\nwe...  spam
3  Subject: : ) ) you can not save the world by q...  spam
4  Subject: need software ? click here .\ntop qua...  spam


In [12]:
X = enron_spam_df['content']
y = enron_spam_df['label']

In [13]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

knn_classifier = KNeighborsClassifier()

knn_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=1000, stop_words='english')),
    ('knn', knn_classifier)
])

In [14]:
param_grid = {
    'knn__n_neighbors': [3, 5, 7], 
    'knn__weights': ['uniform', 'distance']
}

grid_search = GridSearchCV(knn_pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

grid_search.fit(X_train, y_train)

print("Best Hyperparameters:", grid_search.best_params_)

best_knn_model = grid_search.best_estimator_


Best Hyperparameters: {'knn__n_neighbors': 7, 'knn__weights': 'distance'}


In [15]:
# Model Prediction
y_pred = best_knn_model.predict(X_test)

# Model Evaluation
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy:.2f}")
print("\nConfusion Matrix:")
print(conf_matrix)
print("\nClassification Report:")
print(classification_rep)

Accuracy: 0.97

Confusion Matrix:
[[278  20]
 [ 10 892]]

Classification Report:
              precision    recall  f1-score   support

         ham       0.97      0.93      0.95       298
        spam       0.98      0.99      0.98       902

    accuracy                           0.97      1200
   macro avg       0.97      0.96      0.97      1200
weighted avg       0.97      0.97      0.97      1200

