In [1]:
import nltk
import numpy as np
import pandas as pd
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,classification_report, confusion_matrix
from sklearn.svm import SVC #Support Vector Classifier

In [2]:

# --- 1. Load the dataset ---
df = pd.read_csv('mail_data.csv')

In [3]:
print(df)

     Category                                            Message
0         ham  Go until jurong point, crazy.. Available only ...
1         ham                      Ok lar... Joking wif u oni...
2        spam  Free entry in 2 a wkly comp to win FA Cup fina...
3         ham  U dun say so early hor... U c already then say...
4         ham  Nah I don't think he goes to usf, he lives aro...
...       ...                                                ...
5567     spam  This is the 2nd time we have tried 2 contact u...
5568      ham               Will ü b going to esplanade fr home?
5569      ham  Pity, * was in mood for that. So...any other s...
5570      ham  The guy did some bitching but I acted like i'd...
5571      ham                         Rofl. Its true to its name

[5572 rows x 2 columns]


In [4]:
data =df.where((pd.notnull(df)),'') 
# --- 2. Handle missing values ---

In [5]:
data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [7]:
data.shape

(5572, 2)

In [8]:
data.loc[data['Category'] == 'spam','Category',] = 0
data.loc[data['Category'] == 'ham', 'Category',] = 1

In [9]:

print("\nDataFrame after label encoding:")
print(data.head())


DataFrame after label encoding:
  Category                                            Message
0        1  Go until jurong point, crazy.. Available only ...
1        1                      Ok lar... Joking wif u oni...
2        0  Free entry in 2 a wkly comp to win FA Cup fina...
3        1  U dun say so early hor... U c already then say...
4        1  Nah I don't think he goes to usf, he lives aro...


In [10]:
# --- DIAGNOSTIC: Check Class Distribution ---
print("\nClass distribution (0=Spam, 1=Ham):")
print(data['Category'].value_counts())


Class distribution (0=Spam, 1=Ham):
Category
1    4825
0     747
Name: count, dtype: int64


In [11]:
X = data['Message']
Y = data['Category']

In [12]:
print(X)

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                 Will ü b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: Message, Length: 5572, dtype: object


In [13]:
print(Y)

0       1
1       1
2       0
3       1
4       1
       ..
5567    0
5568    1
5569    1
5570    1
5571    1
Name: Category, Length: 5572, dtype: object


In [14]:
X_train , X_test, Y_train, Y_test = train_test_split(X,Y, test_size=0.2, random_state =3, stratify=Y)
# stratify=Y ensures that the train/test split has the same proportion of spam/ham as the full dataset.
# This is crucial for imbalanced datasets.

In [15]:
#random_state ensures your data split is consistent.Think as a preset shuffle for a deck of  cards.

#No random_state: You get a different random shuffle (and different piles) every time.
#With random_state (e.g., random_state=3): You get the exact same shuffle (and same piles) every #single time.
#This makes your results reproducible so WE gets the same outcome from the same code.

In [16]:
print(X.shape)
print(X_train.shape)
print(X_test.shape)

(5572,)
(4457,)
(1115,)


In [17]:
print(Y.shape)
print(Y_train.shape)
print(Y_test.shape)

(5572,)
(4457,)
(1115,)


In [18]:
#Advanced Text Preprocessing Function (NLP Core)
lemmatizer = WordNetLemmatizer()
english_stopwords = set(stopwords.words('english'))

def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove URLs (common in spam)
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    # Remove email addresses
    text = re.sub(r'\S*@\S*\s?', '', text)
    # Remove special characters and numbers (keep only letters and spaces)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Tokenize words
    words = text.split()
    # Remove stopwords and lemmatize
    words = [lemmatizer.lemmatize(word) for word in words if word not in english_stopwords]
    # Join words back into a cleaned string
    text = ' '.join(words)
    # Remove extra spaces (if any remain after processing)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [19]:
# Apply the preprocessing function to both training and test messages
print("\nApplying advanced text preprocessing...")
X_train_processed = X_train.apply(preprocess_text)
X_test_processed = X_test.apply(preprocess_text)
print("Preprocessing complete.")

print("\nProcessed X_train sample (first 5 messages):")
print(X_train_processed.head())


Applying advanced text preprocessing...
Preprocessing complete.

Processed X_train sample (first 5 messages):
3501    bank granite issue strongbuy explosive pick me...
617     like v shock leh co telling shuhui like tellin...
475     nice line said broken heart plz dont cum time ...
5535    know thinkin malaria relax child cant handle m...
4747                      orh tot u say still dun believe
Name: Message, dtype: object


In [20]:
# --- 8. Feature Extraction: TF-IDF with N-grams ---
# ngram_range=(1, 3): This means the vectorizer will create features for:
#   - single words (unigrams)
#   - sequences of 2 words (bigrams), e.g., "free money"
#   - sequences of 3 words (trigrams), e.g., "claim your prize"
# max_features: Limits the number of features to the most frequent ones.
#               Often useful to prevent overfitting and manage memory.
feature_extraction = TfidfVectorizer(
    min_df=1,
    ngram_range=(1, 3), # Incorporate unigrams, bigrams, and trigrams
    max_features=5000 # Consider top 5000 most frequent n-grams
)

In [21]:
#Fit the vectorizer on the processed training data and transform it
print("\nFitting TF-IDF vectorizer and transforming data...")
X_train_features = feature_extraction.fit_transform(X_train_processed)

# Transform the *processed* test data using the *same* fitted vectorizer
X_test_features = feature_extraction.transform(X_test_processed)
print("TF-IDF Vectorization complete.")



Fitting TF-IDF vectorizer and transforming data...
TF-IDF Vectorization complete.


In [22]:
#Ensure Y labels are integers ---
Y_train = Y_train.astype('int')
Y_test = Y_test.astype('int')


In [23]:
print("\nX_train_features shape (after TF-IDF):", X_train_features.shape)


X_train_features shape (after TF-IDF): (4457, 5000)


In [24]:
#Train the Support Vector Machine (SVM) Model ---
# SVC with 'linear' kernel is often excellent for text classification.
# C: Regularization parameter. Smaller C means stronger regularization.
# class_weight='balanced': Addresses class imbalance by automatically adjusting weights.
print("\nTraining Support Vector Classifier (SVC) model...")
Model = SVC(kernel='linear', C=1.0, class_weight='balanced', random_state=3, probability=True)
# probability=True allows predict_proba which is useful for probabilities
Model.fit(X_train_features, Y_train)
print("Model training complete.")



Training Support Vector Classifier (SVC) model...
Model training complete.


In [25]:
#Evaluate the model on test data ---
prediction_on_test_data = Model.predict(X_test_features)
accuracy_on_test_data = accuracy_score(Y_test, prediction_on_test_data)

In [26]:
print('\n--- Model Evaluation on Test Data ---')
print('Accuracy on test data:', accuracy_on_test_data)


--- Model Evaluation on Test Data ---
Accuracy on test data: 0.97847533632287


In [27]:
# More detailed classification report
print("\nClassification Report:")
print(classification_report(Y_test, prediction_on_test_data, target_names=['Spam (0)', 'Ham (1)']))



Classification Report:
              precision    recall  f1-score   support

    Spam (0)       0.91      0.93      0.92       149
     Ham (1)       0.99      0.99      0.99       966

    accuracy                           0.98      1115
   macro avg       0.95      0.96      0.95      1115
weighted avg       0.98      0.98      0.98      1115



In [28]:
# Confusion Matrix
cm = confusion_matrix(Y_test, prediction_on_test_data)
print("\nConfusion Matrix:")
print(cm)
# Interpretation of Confusion Matrix:
# [[True Negatives (Spam correctly identified)   False Positives (Ham incorrectly identified as Spam)]
#  [False Negatives (Spam incorrectly identified as Ham) True Positives (Ham correctly identified)]]
# For spam detection, you want to minimize False Negatives (actual spam classified as ham).
# This is recall for the spam class.


Confusion Matrix:
[[138  11]
 [ 13 953]]


In [29]:
# Predict on a new input mail ---
input_your_mail = ["Free money, huge prizes, impossible returns.", "Hey John, can we meet tomorrow for the project discussion?"]

print("\n--- Predicting on New Input Mails ---")
for i, mail in enumerate(input_your_mail):
    print(f"\nMail {i+1}: '{mail}'")

    # Apply the same preprocessing function to the new input mail
    input_mail_processed = preprocess_text(mail)
    print(f"  Processed mail: '{input_mail_processed}'")

    # Transform the processed new input using the *same* feature_extraction object
    input_data_features = feature_extraction.transform([input_mail_processed])

    # Print prediction probabilities (Spam=0, Ham=1)
    prediction_probabilities = Model.predict_proba(input_data_features)
    print(f"  Prediction probabilities (Spam=0, Ham=1): {prediction_probabilities}")

    # Get the predicted class (0 or 1)
    prediction = Model.predict(input_data_features)

    # Interpret and print the prediction
    if(prediction[0] == 1):
        print('  Result: Ham mail')
    else:
        print('  Result: Spam mail')




--- Predicting on New Input Mails ---

Mail 1: 'Free money, huge prizes, impossible returns.'
  Processed mail: 'free money huge prize impossible return'
  Prediction probabilities (Spam=0, Ham=1): [[0.91095054 0.08904946]]
  Result: Spam mail

Mail 2: 'Hey John, can we meet tomorrow for the project discussion?'
  Processed mail: 'hey john meet tomorrow project discussion'
  Prediction probabilities (Spam=0, Ham=1): [[8.92159035e-07 9.99999108e-01]]
  Result: Ham mail
