In [1]:
import pandas as pd
import re
import nltk
import sys
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix
import joblib

In [3]:
df = pd.read_csv('../raw/spamDetection.csv') 
print(df.head(3))                               
print(df.isnull().sum())                         
print(df.info())                                
print(df.describe())                            


  label                                               text  length  punc
0   ham  Go until jurong point, crazy.. Available only ...     111     9
1   ham                      Ok lar... Joking wif u oni...      29     6
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...     155     6
label     0
text      0
length    0
punc      0
dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   5572 non-null   object
 1   text    5572 non-null   object
 2   length  5572 non-null   int64 
 3   punc    5572 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 174.3+ KB
None
            length         punc
count  5572.000000  5572.000000
mean     80.489950     4.177495
std      59.942907     4.623919
min       2.000000     0.000000
25%      36.000000     2.000000
50%      62.000000     3.000000
75%     122.000000     6.000000
max     910.

In [4]:
df['label'].value_counts()

label
ham     4825
spam     747
Name: count, dtype: int64

In [5]:

#Text Preprocessing & Cleaning

print(sys.executable)                            #  check Python path
nltk.download("stopwords")                       # Download stopwords

stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

def clean_txt(text):
    text = text.lower()                                           # Lowercase
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)           # Remove links
    text = re.sub(r'\@\w+|\#', '', text)                          # Remove @mentions and hashtags
    text = re.sub(r'[^a-zA-Z\s]', '', text)                       # Remove numbers/punctuation
    words = text.split()
    words = [word for word in words if word not in stop_words]   # Remove stopwords
    return ' '.join(words)

df['clean_text'] = df['text'].apply(clean_txt)
print(df.head(3))                                # Check cleaned text

c:\Users\lenovo\AppData\Local\Programs\Python\Python311\python.exe


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


  label                                               text  length  punc  \
0   ham  Go until jurong point, crazy.. Available only ...     111     9   
1   ham                      Ok lar... Joking wif u oni...      29     6   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...     155     6   

                                          clean_text  
0  go jurong point crazy available bugis n great ...  
1                            ok lar joking wif u oni  
2  free entry wkly comp win fa cup final tkts st ...  


In [6]:

#Feature Extraction (TF-IDF)

tfid = TfidfVectorizer(max_features=5000)
X = tfid.fit_transform(df['clean_text'])         # Features: vectorized text
Y = df['label']                                  # Target: spam or ham

In [7]:

# Train-Test Split

X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, stratify=Y, random_state=42 
)

In [8]:

#  Multinomial Naive Bayes

nb = MultinomialNB()
nb.fit(X_train, y_train)

# Evaluate base model
print("Train Accuracy:", nb.score(X_train, y_train))
print("Test Accuracy:", nb.score(X_test, y_test))

# Predict and evaluate
y_pred = nb.predict(X_test)
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Train Accuracy: 0.9811532420910927
Test Accuracy: 0.967713004484305
Confusion Matrix:
 [[966   0]
 [ 36 113]]
Classification Report:
               precision    recall  f1-score   support

         ham       0.96      1.00      0.98       966
        spam       1.00      0.76      0.86       149

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.92      1115
weighted avg       0.97      0.97      0.97      1115



In [9]:

#  Hyperparameter Tuning with GridSearchCV

param_grid = {'alpha': [0.1, 0.5, 1.0, 1.5, 2.0]}
grid = GridSearchCV(nb, param_grid, cv=5, scoring='accuracy')

grid.fit(X_train, y_train)

# Results
print("Best Parameters:", grid.best_params_)
print("Best Cross-Validation Accuracy:", grid.best_score_)

# Evaluate best model
best_model = grid.best_estimator_
print("Train Accuracy (Best Model):", best_model.score(X_train, y_train))
print("Test Accuracy (Best Model):", best_model.score(X_test, y_test))

# Predict and evaluate best model
y_pred = best_model.predict(X_test)
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Best Parameters: {'alpha': 0.1}
Best Cross-Validation Accuracy: 0.9786862144111769
Train Accuracy (Best Model): 0.9934933811981154
Test Accuracy (Best Model): 0.9775784753363229
Confusion Matrix:
 [[958   8]
 [ 17 132]]
Classification Report:
               precision    recall  f1-score   support

         ham       0.98      0.99      0.99       966
        spam       0.94      0.89      0.91       149

    accuracy                           0.98      1115
   macro avg       0.96      0.94      0.95      1115
weighted avg       0.98      0.98      0.98      1115



In [10]:
#  Save Model and Vectorizer

joblib.dump(grid, "gridsearch_naive_bayes.pkl")             # Full grid object (optional)
joblib.dump(best_model, "best_naive_bayes_model.pkl")       # Best model from GridSearch
joblib.dump(tfid, "tfidf_vectorizer.pkl")                   # TF-IDF vectoriz

['tfidf_vectorizer.pkl']

In [11]:

#  Load and Use Saved Model (Demo)

model = joblib.load("best_naive_bayes_model.pkl")
vectorizer = joblib.load("tfidf_vectorizer.pkl")

# Predict again using loaded model (just to verify)
y_pred = model.predict(X_test)
print("Test Accuracy (Loaded Model):", model.score(X_test, y_test))

Test Accuracy (Loaded Model): 0.9775784753363229
