In [2]:
import os
import re
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

DATA_PATH = r"C:\Users\techsupport1\OneDrive - Exafluence, Inc\Desktop\Msc\Sem1\ML\mini project\data"

FOLDERS = {
    "easy_ham": 0,  # Not spam
    "hard_ham": 0,  # Not spam
    "spam_2": 1     # Spam
}

In [3]:
def load_emails(base_path, folder_name, label):
    emails = []
    folder_path = os.path.join(base_path, folder_name, folder_name)  
    if not os.path.exists(folder_path):
        print(f"Skipping: {folder_path} (Not Found)")
        return emails

    for filename in os.listdir(folder_path):
        file_path = os.path.join(folder_path, filename)
        
        if os.path.isfile(file_path):  
            with open(file_path, 'r', encoding='latin-1') as file:
                emails.append((file.read(), label))   
    return emails
all_emails = []
for folder, label in FOLDERS.items():
    all_emails.extend(load_emails(DATA_PATH, folder, label))

df = pd.DataFrame(all_emails, columns=["email", "label"])
df

Unnamed: 0,email,label
0,From exmh-workers-admin@redhat.com Thu Aug 22...,0
1,From Steve_Burt@cursor-system.com Thu Aug 22 ...,0
2,From timc@2ubh.com Thu Aug 22 13:52:59 2002\n...,0
3,From irregulars-admin@tb.tf Thu Aug 22 14:23:...,0
4,From exmh-users-admin@redhat.com Thu Aug 22 1...,0
...,...,...
4193,From tba@insiq.us Wed Dec 4 11:46:34 2002\nR...,1
4194,Return-Path: <raye@yahoo.lv>\nReceived: from u...,1
4195,From cweqx@dialix.oz.au Tue Aug 6 11:03:54 2...,1
4196,From ilug-admin@linux.ie Wed Dec 4 11:52:36 ...,1


In [4]:

def clean_text(text):
    text = text.lower()  # Converting to lowercase
    text = re.sub(r'<[^>]+>', '', text)  #HTML tags removing
    text = re.sub(r'\W+', ' ', text)  # Removing special characters
    text = re.sub(r'\d+', '', text)  # Removing numbers
    return text

df["email"] = df["email"].apply(clean_text)

In [15]:
X_train, X_test, y_train, y_test = train_test_split(df["email"], df["label"], test_size=0.2)

In [34]:
vectorizer = TfidfVectorizer(stop_words="english", max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

print("Data Preprocessing Completed!")

Data Preprocessing Completed!


In [17]:
df.head()

Unnamed: 0,email,label
0,from exmh workers admin redhat com thu aug ...,0
1,from steve_burt cursor system com thu aug ...,0
2,from timc ubh com thu aug return path del...,0
3,from irregulars admin tb tf thu aug retur...,0
4,from exmh users admin redhat com thu aug ...,0


In [24]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Train the model
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train_tfidf, y_train)

y_pred = nb_classifier.predict(X_test_tfidf)

In [33]:
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.4f}")

print("\n Classification Report:\n", classification_report(y_test, y_pred))

print("\n Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Model Accuracy: 0.9714

 Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.97      0.98       541
           1       0.95      0.97      0.96       299

    accuracy                           0.97       840
   macro avg       0.97      0.97      0.97       840
weighted avg       0.97      0.97      0.97       840


 Confusion Matrix:
 [[526  15]
 [  9 290]]


In [32]:
nb_classifier = MultinomialNB(alpha=0.5)  # here experimented with different values (0.1, 0.5, 1.0)
nb_classifier.fit(X_train_tfidf, y_train)


In [37]:
from sklearn.svm import SVC
svm_classifier = SVC(kernel='linear')
svm_classifier.fit(X_train_tfidf, y_train)
y_pred_svm = svm_classifier.predict(X_test_tfidf)
print("SVM Accuracy:", accuracy_score(y_test, y_pred_svm))


SVM Accuracy: 0.9952380952380953


In [39]:
from sklearn.ensemble import RandomForestClassifier
rf_classifier = RandomForestClassifier(n_estimators=100)
rf_classifier.fit(X_train_tfidf, y_train)
y_pred_rf = rf_classifier.predict(X_test_tfidf)
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))


Random Forest Accuracy: 0.9928571428571429


In [40]:
import joblib

# Saving the model
joblib.dump(nb_classifier, "spam_classifier.pkl")

# Saving the TF-IDF vectorizer
joblib.dump(vectorizer, "tfidf_vectorizer.pkl")

print("Model and vectorizer saved successfully!")


Model and vectorizer saved successfully!


In [41]:
# Load the saved model & vectorizer
loaded_model = joblib.load("spam_classifier.pkl")
loaded_vectorizer = joblib.load("tfidf_vectorizer.pkl")

# Function to predict spam/ham
def predict_email(text):
    cleaned_text = clean_text(text)  # Use the same cleaning function
    transformed_text = loaded_vectorizer.transform([cleaned_text])  # Convert to TF-IDF
    prediction = loaded_model.predict(transformed_text)
    return "SPAM" if prediction[0] == 1 else "HAM"

# Test with a sample email
new_email = "Congratulations! You've won a free iPhone. Click here to claim your prize."
print("Prediction:", predict_email(new_email))


Prediction: SPAM


In [43]:
new_email = "Hi How are you?"
print("Prediction:", predict_email(new_email))

Prediction: HAM


In [9]:
pip install flask


Note: you may need to restart the kernel to use updated packages.


In [1]:
import joblib

# Assuming 'model' is your trained classifier
joblib.dump(model, "spam_classifier.pkl")

print("Model saved as spam_classifier.pkl")


NameError: name 'model' is not defined

In [None]:
import joblib

# Assuming 'model' is your trained classifier
joblib.dump(model, "spam_classifier.pkl")

print("Model saved as spam_classifier.pkl")

In [11]:
import os

file_path = os.path.abspath("spam_classifier.pkl")
print("Model saved at:", file_path)


Model saved at: C:\Users\techsupport1\spam_classifier.pkl


In [12]:
import numpy as np
unique, counts = np.unique(y_train, return_counts=True)
print(dict(zip(unique, counts)))


{0: 2221, 1: 1137}


In [5]:
import joblib

model = joblib.load("spam_classifier.pkl")
vectorizer = joblib.load("tfidf_vectorizer.pkl")

sample_email = ["Hi Divya, how are you?"]
sample_vectorized = vectorizer.transform(sample_email)
print(model.predict_proba(sample_vectorized))



[[0.62311369 0.37688631]]
