In [40]:
import pandas as pd 
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

import warnings
warnings.filterwarnings("ignore")

In [41]:
df = pd.read_csv(r'D:\Machine_Learning\Naive_Bayes\Datasets\Spam Ham Messages.csv')
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [42]:
df.shape

(5572, 2)

In [43]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [44]:
df.isnull().sum()

Category    0
Message     0
dtype: int64

In [45]:
df.duplicated().sum()

np.int64(415)

In [46]:
df.drop_duplicates(inplace=True)

In [47]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5157 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5157 non-null   object
 1   Message   5157 non-null   object
dtypes: object(2)
memory usage: 120.9+ KB


In [48]:
df['Category'].value_counts()

Category
ham     4516
spam     641
Name: count, dtype: int64

In [49]:
X = df['Message']
y  = df['Category']

In [50]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, train_size = 0.8, random_state = 42)

### **Text vectorization using TF-IDF**
Converts raw text messages into numerical vectors based on Term Frequency-Inverse Document Frequency, highlighting important words while down-weighting common ones. This transforms text into a format suitable for machine learning.

### **Oversampling using SMOTE to handle class imbalance**
SMOTE (Synthetic Minority Over-sampling Technique) creates synthetic examples of the minority class to balance the dataset. It prevents the classifier from being biased toward the majority class.

### **Classification using Naive Bayes**
Multinomial Naive Bayes is a probabilistic algorithm ideal for text data. It assumes features (words) are conditionally independent and is effective with word count or TF-IDF features.

### **Why Use a Pipeline?**

1. Modularity
Each step (vectorizer, SMOTE, classifier) is treated as a reusable and replaceable module.

2. Clean and Consistent
You avoid repetitive code and ensure that the same transformations are applied during training and testing (no data leakage).

3. Integrated Oversampling
Regular sklearn.pipeline.Pipeline does not allow resampling methods like SMOTE, which change both input (X) and output (y).
So you must use imblearn.pipeline.Pipeline, which supports these steps safely.

4. Cross-validation compatibility
You can plug your pipeline directly into cross_val_score, GridSearchCV, etc., and it will apply all steps correctly within each fold.

In [51]:
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE

pipeline = ImbPipeline([
    ('vectorizer', TfidfVectorizer(stop_words='english')),
    ('oversampler', SMOTE(random_state=42)),  # handle class imbalance
    ('classifier', MultinomialNB())
])

pipeline.fit(x_train, y_train)

In [52]:
y_pred = pipeline.predict(x_test)    
y_pred    

array(['ham', 'spam', 'ham', ..., 'ham', 'ham', 'ham'], dtype='<U4')

In [53]:
# Evaluate accuracy
accuracy = accuracy_score(y_test, y_pred)*100
print(f"Accuracy: {accuracy:.2f} %")

Accuracy: 96.03 %


In [54]:
classification_report = classification_report(y_test, y_pred)
print("Classification Report:")
print(classification_report)

Classification Report:
              precision    recall  f1-score   support

         ham       0.99      0.96      0.98       896
        spam       0.78      0.96      0.86       136

    accuracy                           0.96      1032
   macro avg       0.89      0.96      0.92      1032
weighted avg       0.97      0.96      0.96      1032



In [60]:
examples = ["Can you please call me today", "We can make tommrow", "Free entry in 2 a wkly comp to win FA Cup"]
predictions = pipeline.predict(examples)
predictions[:3]

array(['ham', 'ham', 'spam'], dtype='<U4')

In [56]:
import joblib
joblib.dump(pipeline, 'Spam_Ham_Detection_Model.pkl')

['Spam_Ham_Detection_Model.pkl']

In [None]:
Spam_Ham_Detection = joblib.load('Spam_Ham_Detection_Model.pkl')
