In [11]:
import pandas as pd

In [12]:
df = pd.read_csv('../raw/spam_ham_dataset.csv')
df.head(3)

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0


In [13]:
df.isnull().sum()

Unnamed: 0    0
label         0
text          0
label_num     0
dtype: int64

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5171 entries, 0 to 5170
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  5171 non-null   int64 
 1   label       5171 non-null   object
 2   text        5171 non-null   object
 3   label_num   5171 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 161.7+ KB


In [15]:
df.describe()

Unnamed: 0.1,Unnamed: 0,label_num
count,5171.0,5171.0
mean,2585.0,0.289886
std,1492.883452,0.453753
min,0.0,0.0
25%,1292.5,0.0
50%,2585.0,0.0
75%,3877.5,1.0
max,5170.0,1.0


In [16]:
import sys
print(sys.executable)

c:\Users\lenovo\AppData\Local\Programs\Python\Python311\python.exe


In [17]:
import re  #For text pattern matching (like finding URLS)
import nltk #for text processing
from nltk.corpus import stopwords  # Common words like "the", "is", etc.
from nltk.stem import PorterStemmer   # Reduces words to root form (e.g., "running" → "run")

In [18]:
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [19]:
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english')) 

In [20]:
def clean_txt(text):
    text = text.lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)  # Removes links
    text = re.sub(r'\@\w+|\#', '', text)  # Removes Twitter-style @mentions and #hashtags
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Removes numbers, punctuation (!, ?)

    words = text.split()  # Splits sentence into words
    words = [word for word in words if word not in stop_words]  # Removes "the", "and", etc.    

    return ' '.join(words)

In [21]:
df['clean_text'] = df['text'].apply(clean_txt)

In [22]:
df.head(3)

Unnamed: 0.1,Unnamed: 0,label,text,label_num,clean_text
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0,subject enron methanol meter follow note gave ...
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0,subject hpl nom january see attached file hpln...
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0,subject neon retreat ho ho ho around wonderful...


In [23]:
#Feature Extraction (Text to numbers)
from sklearn.feature_extraction.text import TfidfVectorizer
tfid = TfidfVectorizer(max_features=5000)
X = tfid.fit_transform(df['clean_text'])
Y = df['label']

In [24]:
df.head(3)

Unnamed: 0.1,Unnamed: 0,label,text,label_num,clean_text
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0,subject enron methanol meter follow note gave ...
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0,subject hpl nom january see attached file hpln...
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0,subject neon retreat ho ho ho around wonderful...


In [25]:
from sklearn.model_selection import train_test_split

X_train , X_test , y_train , y_test = train_test_split(X , Y , test_size=0.2 , random_state=42)


In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report  , confusion_matrix

nb = MultinomialNB()

nb.fit(X_train , y_train)
# Accuracy on training data
train_accuracy = nb.score(X_train, y_train)

# Accuracy on test data
test_accuracy = nb.score(X_test, y_test)

print("Train Accuracy:", train_accuracy)
print("Test Accuracy:", test_accuracy)


y_pred = nb.predict(X_test)

cm = confusion_matrix(y_test , y_pred)
print('conf mat' , cm)

print("Classification Report:\n", classification_report(y_test, y_pred))


Train Accuracy: 0.9673597678916828
Test Accuracy: 0.9536231884057971
conf mat [[714  28]
 [ 20 273]]
Classification Report:
               precision    recall  f1-score   support

         ham       0.97      0.96      0.97       742
        spam       0.91      0.93      0.92       293

    accuracy                           0.95      1035
   macro avg       0.94      0.95      0.94      1035
weighted avg       0.95      0.95      0.95      1035



In [37]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix

# Step 1: Define the model
nb = MultinomialNB()

# Step 2: Define parameter grid
param_grid = {
    'alpha': [0.1, 0.5, 1.0, 1.5, 2.0]
}

# Step 3: GridSearchCV setup with 5-fold cross-validation
grid = GridSearchCV(nb, param_grid, cv=5, scoring='accuracy')

# Step 4: Fit on training data
grid.fit(X_train, y_train)

# Step 5: Results
print("Best Parameters:", grid.best_params_)
print("Best Cross-Validation Accuracy:", grid.best_score_)

# Step 6: Get best model and evaluate
best_model = grid.best_estimator_

# Accuracy on training data
train_accuracy = best_model.score(X_train, y_train)
print("Train Accuracy:", train_accuracy)

# Accuracy on test data
test_accuracy = best_model.score(X_test, y_test)
print("Test Accuracy:", test_accuracy)

# Step 7: Predict and evaluate
y_pred = best_model.predict(X_test)

cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)

print("Classification Report:\n", classification_report(y_test, y_pred))


Best Parameters: {'alpha': 0.1}
Best Cross-Validation Accuracy: 0.9610731413817476
Train Accuracy: 0.9741295938104448
Test Accuracy: 0.9555555555555556
Confusion Matrix:
 [[713  29]
 [ 17 276]]
Classification Report:
               precision    recall  f1-score   support

         ham       0.98      0.96      0.97       742
        spam       0.90      0.94      0.92       293

    accuracy                           0.96      1035
   macro avg       0.94      0.95      0.95      1035
weighted avg       0.96      0.96      0.96      1035



In [38]:
import joblib

# Save full GridSearchCV object
joblib.dump(grid, "gridsearch_naive_bayes.pkl")


['gridsearch_naive_bayes.pkl']

In [None]:
#saving our model
grid_loaded = joblib.load("gridsearch_naive_bayes.pkl")

# Access best model again
best_model = grid_loaded.best_estimator_

# Predict
y_pred = best_model.predict(X_test)


In [40]:
joblib.dump(grid.best_estimator_, "best_naive_bayes_model.pkl")


['best_naive_bayes_model.pkl']

In [41]:
model = joblib.load("best_naive_bayes_model.pkl")
y_pred = model.predict(X_test)


In [42]:
print("Test Accuracy:", model.score(X_test, y_test))


Test Accuracy: 0.9555555555555556
