In [1]:
# Step 1: Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
import joblib


In [11]:
# Step 2: Load your dataset
import os
print(os.getcwd())  # Shows the current working directory
print(os.listdir()) # Lists all files in the directory


path = (r"C:\Users\Admin\Downloads\mail_data.csv")
print(os.path.exists(path))  # Should return True
df = pd.read_csv(path)


# Optional: See the first few rows
print(df)


C:\Users\Admin
['.anaconda', '.android', '.conda', '.condarc', '.continuum', '.gitconfig', '.idlerc', '.ipynb_checkpoints', '.ipython', '.jupyter', '.matplotlib', '.ms-ad', '.node_repl_history', '.nuget', '.thumbnails', '.VirtualBox', '.vscode', 'anaconda3', 'anaconda_projects', 'AppData', 'Application Data', 'battery-report.html', 'Contacts', 'Cookies', 'DemoApp', 'Documents', 'Downloads', 'email classification', 'Favorites', 'Google', 'HelloWorldApp', 'honor 1', 'import nltk.py', 'java', 'Links', 'Local Settings', 'Microsoft', 'Music', 'My Documents', 'NetHood', 'node_modules', 'NTUSER.DAT', 'ntuser.dat.LOG1', 'ntuser.dat.LOG2', 'NTUSER.DAT{2ad838bc-efea-11ee-a54d-000d3a94eaa1}.TM.blf', 'NTUSER.DAT{2ad838bc-efea-11ee-a54d-000d3a94eaa1}.TMContainer00000000000000000001.regtrans-ms', 'NTUSER.DAT{2ad838bc-efea-11ee-a54d-000d3a94eaa1}.TMContainer00000000000000000002.regtrans-ms', 'ntuser.ini', 'OneDrive', 'our project', 'package-lock.json', 'package.json', 'PDFEditor', 'PrintHood', 'pytho

In [12]:
# Step 3: Convert 'Category' from 'ham'/'spam' to 0/1
df['Category'] = df['Category'].map({'ham': 0, 'spam': 1})

# Split features and labels
X = df['Message']  # email text
y = df['Category'] # label: 0 = ham, 1 = spam


In [13]:
# Step 4: Split data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [14]:
# Step 5: Convert text to numeric features using CountVectorizer
vectorizer = CountVectorizer()

# Learn vocabulary from training data and transform it
X_train_vec = vectorizer.fit_transform(X_train)

# Transform testing data (don't fit again!)
X_test_vec = vectorizer.transform(X_test)


In [15]:
# Step 6: Train a Naive Bayes model
model = MultinomialNB()
model.fit(X_train_vec, y_train)


In [16]:
# Step 7: Predict and evaluate
y_pred = model.predict(X_test_vec)

# Print Accuracy and Classification Report
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.9919282511210762
Classification Report:
               precision    recall  f1-score   support

           0       0.99      1.00      1.00       966
           1       1.00      0.94      0.97       149

    accuracy                           0.99      1115
   macro avg       1.00      0.97      0.98      1115
weighted avg       0.99      0.99      0.99      1115



In [17]:
# Step 8: Save the trained model and vectorizer
joblib.dump(model, "spam_classifier_model.pkl")
joblib.dump(vectorizer, "vectorizer.pkl")


['vectorizer.pkl']