In [1]:
import os
from collections import Counter
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [2]:
# Function to load and preprocess emails
def load_emails(folder):
    files = os.listdir(folder)
    emails = [os.path.join(folder, file) for file in files]
    words = []

    for email in emails:
        with open(email, encoding='latin-1') as f:
            blob = f.read()
            words += blob.split(" ")

    words = [word for word in words if word.isalpha()]
    return emails, words

In [3]:
# Function to extract features and labels
def extract_features_labels(emails, word_dict):
    features = []
    labels = []
    
    for email in emails:
        with open(email, encoding='latin-1') as f:
            blob = f.read().split(" ")
            data = [blob.count(word[0]) for word in word_dict]
            features.append(data)
            
            if 'spam' in email:
                labels.append(1)
            if 'ham' in email:
                labels.append(0)
                
    return np.array(features), np.array(labels)

In [4]:
# Load emails and preprocess
folder = 'training emails/'
emails, words = load_emails(folder)
print("Number of emails:", len(emails))

Number of emails: 5172


In [5]:
# Create word dictionary
word_dict = Counter(words)
del word_dict[""]
word_dict = word_dict.most_common(3000)

In [6]:
# Extract features and labels
features, labels = extract_features_labels(emails, word_dict)
print("Features shape:", features.shape)
print("Labels shape:", labels.shape)

Features shape: (5172, 3000)
Labels shape: (5172,)


In [7]:
# Split data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=9)
print("Training data shape:", x_train.shape, y_train.shape)
print("Testing data shape:", x_test.shape, y_test.shape)

Training data shape: (4137, 3000) (4137,)
Testing data shape: (1035, 3000) (1035,)


In [8]:
# Train and test Naive Bayes Classifier
classifier_nb = MultinomialNB()
classifier_nb.fit(x_train, y_train)
y_pred_nb = classifier_nb.predict(x_test)
accuracy_nb = accuracy_score(y_pred_nb, y_test)
print("Naive Bayes Classifier Accuracy:", accuracy_nb)

Naive Bayes Classifier Accuracy: 0.9439613526570049


In [9]:
# Predict using trained classifier
new_email = """You have unlocked a new offer.
Deposit money in order to claim the prize. Hurry up before the stock runs out!"""
sample = []
for i in word_dict:
    sample.append(new_email.split(" ").count(i[0]))

sample = np.array(sample)
prediction = classifier_nb.predict(sample.reshape(1, 3000))
print("Prediction for the new email:", "spam" if prediction[0] == 1 else "ham")

Prediction for the new email: spam


In [10]:
# Hyperparameter tuning for Decision Tree Classifier
param_grid_dt = {'max_depth': [None, 10, 20, 30, 50],
                 'min_samples_split': [2, 5, 10],
                 'min_samples_leaf': [1, 2, 4]}
classifier_dt = GridSearchCV(DecisionTreeClassifier(), param_grid_dt, cv=5)
classifier_dt.fit(x_train, y_train)
best_classifier_dt = classifier_dt.best_estimator_

In [11]:
# Hyperparameter tuning for Random Forest Classifier
param_dist_rf = {'n_estimators': [50, 100, 200],
                 'max_features': ['sqrt', 'log2', None],
                 'max_depth': [None, 10, 20, 30],
                 'min_samples_split': [2, 5, 10],
                 'min_samples_leaf': [1, 2, 4]}

n_iter_search = 10  # You can adjust this value based on computational resources

classifier_rf = RandomizedSearchCV(RandomForestClassifier(), param_distributions=param_dist_rf, n_iter=n_iter_search, cv=5, n_jobs=-1)
classifier_rf.fit(x_train, y_train)
best_classifier_rf = classifier_rf.best_estimator_

In [12]:
# Hyperparameter tuning for SVM Classifier
param_grid_svm = {'C': [0.1, 1, 10, 100],
                  'gamma': [1, 0.1, 0.01, 0.001],
                  'kernel': ['linear', 'rbf']}
classifier_svm = GridSearchCV(SVC(), param_grid_svm, cv=5)
classifier_svm.fit(x_train, y_train)
best_classifier_svm = classifier_svm.best_estimator_

In [13]:
# Hyperparameter tuning for Naive Bayes Classifier
param_grid_nb = {'alpha': [0.1, 0.5, 1.0, 1.5, 2.0]}
classifier_nb = GridSearchCV(MultinomialNB(), param_grid_nb, cv=5)
classifier_nb.fit(x_train, y_train)
best_classifier_nb = classifier_nb.best_estimator_

In [14]:
# Evaluate best classifiers
y_pred_best_nb = best_classifier_nb.predict(x_test)
accuracy_best_nb = accuracy_score(y_pred_best_nb, y_test)

y_pred_best_dt = best_classifier_dt.predict(x_test)
accuracy_best_dt = accuracy_score(y_pred_best_dt, y_test)

y_pred_best_rf = best_classifier_rf.predict(x_test)
accuracy_best_rf = accuracy_score(y_pred_best_rf, y_test)

y_pred_best_svm = best_classifier_svm.predict(x_test)
accuracy_best_svm = accuracy_score(y_pred_best_svm, y_test)

In [15]:
# Print accuracies for best classifiers
print("Best Naive Bayes Accuracy:", accuracy_best_nb)
print("Best Decision Tree Accuracy:", accuracy_best_dt)
print("Best Random Forest Accuracy:", accuracy_best_rf)
print("Best SVM Accuracy:", accuracy_best_svm)

Best Naive Bayes Accuracy: 0.9594202898550724
Best Decision Tree Accuracy: 0.9323671497584541
Best Random Forest Accuracy: 0.9507246376811594
Best SVM Accuracy: 0.970048309178744


In [16]:
# Define base classifiers
base_classifiers = [('naive_bayes', best_classifier_nb),
                    ('decision_tree', best_classifier_dt),
                    ('random_forest', best_classifier_rf),
                    ('svm', best_classifier_svm)]

In [17]:
# Create bagging ensemble model with VotingClassifier as the base estimator
bagging_model = BaggingClassifier(VotingClassifier(base_classifiers), n_estimators=10, random_state=42)

In [18]:
# Fit the bagging ensemble model on the training data
bagging_model.fit(x_train, y_train)

In [19]:
# Predict using the bagging ensemble model
y_pred_bagging = bagging_model.predict(x_test)

In [20]:
# Calculate accuracy of the bagging ensemble model
accuracy_bagging = accuracy_score(y_pred_bagging, y_test)
print("Bagging Model Accuracy:", accuracy_bagging)

Bagging Model Accuracy: 0.9710144927536232
