In [2]:
# ----------------------------------------
# 🧪 1. Import Required Libraries
# ----------------------------------------
import pandas as pd
import re
import nltk
import sys
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix
import joblib

In [3]:
# ----------------------------------------
# 🧼 2. Load and Inspect Dataset
# ----------------------------------------
df = pd.read_csv('../raw/spam_ham_dataset.csv')  # Replace with correct path if needed
print(df.head(3))                                # Preview first 3 rows
print(df.isnull().sum())                         # Check for missing values
print(df.info())                                 # Dataset information
print(df.describe())                             # Basic stats


   Unnamed: 0 label                                               text  \
0         605   ham  Subject: enron methanol ; meter # : 988291\r\n...   
1        2349   ham  Subject: hpl nom for january 9 , 2001\r\n( see...   
2        3624   ham  Subject: neon retreat\r\nho ho ho , we ' re ar...   

   label_num  
0          0  
1          0  
2          0  
Unnamed: 0    0
label         0
text          0
label_num     0
dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5171 entries, 0 to 5170
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  5171 non-null   int64 
 1   label       5171 non-null   object
 2   text        5171 non-null   object
 3   label_num   5171 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 161.7+ KB
None
        Unnamed: 0    label_num
count  5171.000000  5171.000000
mean   2585.000000     0.289886
std    1492.883452     0.453753
min       0.000000     0.000000
25%

In [4]:
# ----------------------------------------
# 🧹 3. Text Preprocessing & Cleaning
# ----------------------------------------
print(sys.executable)                            # Debug: check Python path
nltk.download("stopwords")                       # Download stopwords

stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

def clean_txt(text):
    text = text.lower()                                           # Lowercase
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)           # Remove links
    text = re.sub(r'\@\w+|\#', '', text)                          # Remove @mentions and hashtags
    text = re.sub(r'[^a-zA-Z\s]', '', text)                       # Remove numbers/punctuation
    words = text.split()
    words = [word for word in words if word not in stop_words]   # Remove stopwords
    return ' '.join(words)

df['clean_text'] = df['text'].apply(clean_txt)
print(df.head(3))                                # Check cleaned text

c:\Users\lenovo\AppData\Local\Programs\Python\Python311\python.exe


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


   Unnamed: 0 label                                               text  \
0         605   ham  Subject: enron methanol ; meter # : 988291\r\n...   
1        2349   ham  Subject: hpl nom for january 9 , 2001\r\n( see...   
2        3624   ham  Subject: neon retreat\r\nho ho ho , we ' re ar...   

   label_num                                         clean_text  
0          0  subject enron methanol meter follow note gave ...  
1          0  subject hpl nom january see attached file hpln...  
2          0  subject neon retreat ho ho ho around wonderful...  


In [5]:
# ----------------------------------------
# 🔢 4. Feature Extraction (TF-IDF)
# ----------------------------------------
tfid = TfidfVectorizer(max_features=5000)
X = tfid.fit_transform(df['clean_text'])         # Features: vectorized text
Y = df['label']                                  # Target: spam or ham

In [6]:
# ----------------------------------------
# ✂️ 5. Train-Test Split
# ----------------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

In [7]:
# ----------------------------------------
# 📊 6. Base Model - Multinomial Naive Bayes
# ----------------------------------------
nb = MultinomialNB()
nb.fit(X_train, y_train)

# Evaluate base model
print("Train Accuracy:", nb.score(X_train, y_train))
print("Test Accuracy:", nb.score(X_test, y_test))

# Predict and evaluate
y_pred = nb.predict(X_test)
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Train Accuracy: 0.9673597678916828
Test Accuracy: 0.9536231884057971
Confusion Matrix:
 [[714  28]
 [ 20 273]]
Classification Report:
               precision    recall  f1-score   support

         ham       0.97      0.96      0.97       742
        spam       0.91      0.93      0.92       293

    accuracy                           0.95      1035
   macro avg       0.94      0.95      0.94      1035
weighted avg       0.95      0.95      0.95      1035



In [8]:
# ----------------------------------------
# 🧠 7. Hyperparameter Tuning with GridSearchCV
# ----------------------------------------
param_grid = {'alpha': [0.1, 0.5, 1.0, 1.5, 2.0]}
grid = GridSearchCV(nb, param_grid, cv=5, scoring='accuracy')

grid.fit(X_train, y_train)

# Results
print("Best Parameters:", grid.best_params_)
print("Best Cross-Validation Accuracy:", grid.best_score_)

# Evaluate best model
best_model = grid.best_estimator_
print("Train Accuracy (Best Model):", best_model.score(X_train, y_train))
print("Test Accuracy (Best Model):", best_model.score(X_test, y_test))

# Predict and evaluate best model
y_pred = best_model.predict(X_test)
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Best Parameters: {'alpha': 0.1}
Best Cross-Validation Accuracy: 0.9610731413817476
Train Accuracy (Best Model): 0.9741295938104448
Test Accuracy (Best Model): 0.9555555555555556
Confusion Matrix:
 [[713  29]
 [ 17 276]]
Classification Report:
               precision    recall  f1-score   support

         ham       0.98      0.96      0.97       742
        spam       0.90      0.94      0.92       293

    accuracy                           0.96      1035
   macro avg       0.94      0.95      0.95      1035
weighted avg       0.96      0.96      0.96      1035



In [9]:
# ----------------------------------------
# 💾 8. Save Model and Vectorizer
# ----------------------------------------
joblib.dump(grid, "gridsearch_naive_bayes.pkl")             # Full grid object (optional)
joblib.dump(best_model, "best_naive_bayes_model.pkl")       # Best model from GridSearch
joblib.dump(tfid, "tfidf_vectorizer.pkl")                   # TF-IDF vectoriz

['tfidf_vectorizer.pkl']

In [10]:
# ----------------------------------------
# 🔁 9. Load and Use Saved Model (Demo)
# ----------------------------------------
model = joblib.load("best_naive_bayes_model.pkl")
vectorizer = joblib.load("tfidf_vectorizer.pkl")

# Predict again using loaded model (just to verify)
y_pred = model.predict(X_test)
print("Test Accuracy (Loaded Model):", model.score(X_test, y_test))

Test Accuracy (Loaded Model): 0.9555555555555556
