<a href="https://colab.research.google.com/github/ByteBoss-ai/Email_classification/blob/main/Email_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Setup & Imports

In [41]:
# Import Libraries
import pandas as pd
import numpy as np
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score
from nltk.corpus import stopwords
import nltk

In [None]:
# Download stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# 2. Dataset Overview

In [None]:
# Load dataset
import kagglehub
from kagglehub import KaggleDatasetAdapter

file_path = "combined_data.csv"

df = kagglehub.load_dataset(
  KaggleDatasetAdapter.PANDAS,
    "purusinghvi/email-spam-classification-dataset",
      file_path
)

  df = kagglehub.load_dataset(


Using Colab cache for faster access to the 'email-spam-classification-dataset' dataset.


**EXPORATORY DATA ANALYSIS**

In [None]:
df.shape

(83448, 2)

In [None]:
df.columns.tolist()

['label', 'text']

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 83448 entries, 0 to 83447
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   83448 non-null  int64 
 1   text    83448 non-null  object
dtypes: int64(1), object(1)
memory usage: 1.3+ MB


In [None]:
df.head(8)

Unnamed: 0,label,text
0,1,ounce feather bowl hummingbird opec moment ala...
1,1,wulvob get your medircations online qnb ikud v...
2,0,computer connection from cnn com wednesday es...
3,1,university degree obtain a prosperous future m...
4,0,thanks for all your answers guys i know i shou...
5,0,larry king live at escapenumber escapenumber p...
6,0,michael pobega wrote i'm not sure if it's the ...
7,0,hi i have this error tr sample escapenumber es...


In [None]:
# Replace 'text' and 'label'
TEXT_COL = 'text'
LABEL_COL = 'label'

In [None]:
# Basic sanity checks
print('\nMissing values per column:\n', df.isnull().sum())
print('\nLabel distribution:')
print(df[LABEL_COL].value_counts(dropna=False))


Missing values per column:
 label    0
text     0
dtype: int64

Label distribution:
label
1    43910
0    39538
Name: count, dtype: int64


# 3. Exploratory Look at Raw Text

In [None]:
# Show a few examples of raw emails for each class
print('\n  Sample HAM (label=0): ')
print(df[df[LABEL_COL]==0].sample(3)[TEXT_COL].tolist())
print('\n  Sample SPAM (label=1):')
print(df[df[LABEL_COL]==1].sample(3)[TEXT_COL].tolist())


  Sample HAM (label=0): 
['please see the attached draft of an open season posting for a potential tw\nexpansion . please provide comments to me by friday , october 27 th .\nthanks !\nlindy', "on thursday escapenumber may escapenumber escapenumber escapenumber escapenumber mike mattie wrote quick patch to use the mem sys free wrapper instead of using the platform's free directly i only see an openpgp signature attachment can you respond with the patch attached again or inline i suppose if it's a small change i could make it directly too c", "johan faux writes thanks for your reply experimenting a little further i was able to learn a bit more and find what is the real problem of mine what the real question is i could create my sescapenumber extended class by adding these two lines in my namespace file importclassesfrom matrix matrix importclassesfrom matrix dmatrix you can also do importclassesfrom matrix matrix dmatrix and then creating my new class in another r file library matrix i 

# 4. Data Cleaning & Preprocessing

In [19]:
import html

def clean_text(text, remove_stopwords=True, extra_strip=True):
  if pd.isnull(text):
    return ""
  # unescape html entities
  text = html.unescape(str(text))
  # lowercase
  text = text.lower()
  # remove email addresses and urls (common in spam)
  text = re.sub(r'\S+@\S+', ' ', text) # emails
  text = re.sub(r'http\S+|www\.\S+', ' ', text) # urls
  # remove numbers
  text = re.sub(r'\d+', ' ', text)
  # remove punctuation
  text = text.translate(str.maketrans('', '', string.punctuation))
  # remove extra whitespace
  if extra_strip:
    text = re.sub(r'\s+', ' ', text).strip()
  # remove stopwords
  if remove_stopwords:
    stops = set(stopwords.words('english'))
    tokens = [w for w in text.split() if w not in stops and len(w)>1]
    return ' '.join(tokens)
  return text

In [20]:
# Apply cleaning to new column to keep raw text as well
CLEAN_COL = 'clean_text'
df[CLEAN_COL] = df[TEXT_COL].apply(lambda x: clean_text(x))

In [21]:
# Show before/after for a few rows
sample = df[[TEXT_COL, CLEAN_COL, LABEL_COL]].sample(6)
sample

Unnamed: 0,text,clean_text,label
83338,suraj please don't send me copies of messages ...,suraj please dont send copies messages mailing...,0
61191,the alert is on symescapenumberol cdyvcurr pri...,alert symescapenumberol cdyvcurr price escapen...,1
60175,"john ,\nas discussed , we propose a 3 % adjust...",john discussed propose adjustment base salarie...,0
79772,satcon technology corporation satcon joins gre...,satcon technology corporation satcon joins gre...,0
13458,( see attached file : hplno 201 . xls )\n- hpl...,see attached file hplno xls hplno xls,0
79239,"hey ,\ni just heard\nof this new drg called il...",hey heard new drg called ilis thought might in...,1


# 5. Visualize Token Lengths & Basic Stats

In [22]:
# tokens per cleaned message
df['token_count'] = df[CLEAN_COL].apply(lambda x: len(str(x).split()))

In [23]:
print(df[['token_count', LABEL_COL]].groupby(LABEL_COL).describe())

      token_count                                                          
            count        mean         std  min   25%    50%    75%      max
label                                                                      
0         39538.0  218.034043  670.857576  0.0  58.0  116.0  222.0  92322.0
1         43910.0  121.294967  184.625193  0.0  36.0   67.0  151.0  11827.0


In [24]:
# Show distribution head
print('\nTop 10 longest messages (tokens):')
print(df.sort_values('token_count', ascending=False)[[TEXT_COL, CLEAN_COL, 'token_count']].head(5))


Top 10 longest messages (tokens):
                                                    text  \
52108  commit a snapshot of the generated prototype h...   
51996  enron : a wake - up call\nthe wall street jour...   
475    rival to buy enron , top energy trader , after...   
68692  fall of a power giant : bailout is unlikely if...   
63878  at file home jelmer bzr samba old escapenumber...   

                                              clean_text  token_count  
52108  commit snapshot generated prototype headers so...        92322  
51996  enron wake call wall street journal fall power...        20985  
475    rival buy enron top energy trader financial fa...        16805  
68692  fall power giant bailout unlikely enron goes t...        15381  
63878  file home jelmer bzr samba old escapenumber es...        14054  


In [None]:
# Show distribution head
print('\nTop 10 longest messages (tokens):')
print(df.sort_values('token_count', ascending=False)[[TEXT_COL, CLEAN_COL, 'token_count']].head(5))

In [25]:
# Optionally drop empty cleaned messages
empty_before = (df[CLEAN_COL].str.strip() == '').sum()
print('\nEmpty cleaned messages before drop:', empty_before)


df = df[df[CLEAN_COL].str.strip() != ''].reset_index(drop=True)
empty_after = (df[CLEAN_COL].str.strip() == '').sum()
print('Empty cleaned messages after drop:', empty_after)


Empty cleaned messages before drop: 32
Empty cleaned messages after drop: 0


# 6. Train / Test Split

In [28]:
X = df[CLEAN_COL]
y = df[LABEL_COL]

In [29]:
# stratify to keep label balance
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42, stratify=y)

In [30]:
print('Train size:', X_train.shape[0])
print('Test size :', X_test.shape[0])

Train size: 66732
Test size : 16684


In [31]:
print('\nTrain label distribution:\n', y_train.value_counts(normalize=True))
print('\nTest label distribution:\n', y_test.value_counts(normalize=True))


Train label distribution:
 label
1    0.526104
0    0.473896
Name: proportion, dtype: float64

Test label distribution:
 label
1    0.526133
0    0.473867
Name: proportion, dtype: float64


# 7. Feature Extraction (TF-IDF)

In [32]:
# Vectorizer setup
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1,2))

In [33]:
# Fit on training data and transform
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [34]:
print('TF-IDF shape — train:', X_train_tfidf.shape)
print('TF-IDF shape — test :', X_test_tfidf.shape)

TF-IDF shape — train: (66732, 5000)
TF-IDF shape — test : (16684, 5000)


In [35]:
# Show top features learned (first 40)
feature_names = vectorizer.get_feature_names_out()
print('\nSample TF-IDF features (first 40):\n', feature_names[:40])


Sample TF-IDF features (first 40):
 ['aa' 'ab' 'abbott' 'abc' 'ability' 'able' 'absolutely' 'abuse' 'ac'
 'ac uk' 'academy' 'accept' 'acceptance' 'accepted' 'access' 'according'
 'account' 'accounting' 'accounts' 'acct' 'accuracy' 'accurate'
 'accuweather' 'accuweather com' 'achieve' 'acl' 'acquire' 'acquired'
 'acquisition' 'acrobat' 'acrobat escapenumber' 'across' 'act'
 'act section' 'act statements' 'action' 'actions' 'active' 'activities'
 'activity']


In [37]:
# Show TF-IDF example for a single message
sample_idx = X_train.index[0]
print('\nOriginal message:\n', X_train.iloc[0])


vec = X_train_tfidf[0].toarray().reshape(-1)
nonzero_indices = np.where(vec>0)[0]
print('\nNon-zero TF-IDF features in this message:')
for i in nonzero_indices[:30]:
  print(feature_names[i], '->', round(vec[i], 4))


Original message:
 «canadianpharmacy» offers wide selection escapenumber generic products choose great level service fast delivery personal approach customer security information absolutely cheap prices visit canadianpharmacy site enjoy new saving options «canadianpharmacy» http colorpath hk

Non-zero TF-IDF features in this message:
absolutely -> 0.2006
approach -> 0.1919
canadianpharmacy -> 0.5575
cheap -> 0.1932
choose -> 0.1682
customer -> 0.145
delivery -> 0.1638
enjoy -> 0.1808
escapenumber -> 0.0629
fast -> 0.1626
generic -> 0.165
great -> 0.1326
hk -> 0.1491
http -> 0.069
information -> 0.1125
level -> 0.1618
new -> 0.0941
offers -> 0.1504
options -> 0.1636
personal -> 0.1702
prices -> 0.1392
products -> 0.1424
saving -> 0.216
security -> 0.1614
selection -> 0.2001
service -> 0.1305
site -> 0.1368
visit -> 0.1356
wide -> 0.1766


# 8. Model 1 — Multinomial Naive Bayes

In [46]:
nb = MultinomialNB()

In [47]:
nb.fit(X_train_tfidf, y_train)
nb_pred = nb.predict(X_test_tfidf)
nb_proba = nb.predict_proba(X_test_tfidf)[:,1] # probability of class 1

In [48]:
print('Naive Bayes — Accuracy:', accuracy_score(y_test, nb_pred))
print('\nConfusion Matrix:\n', confusion_matrix(y_test, nb_pred))
print('\nClassification Report:\n', classification_report(y_test, nb_pred))
print('\nROC AUC (NB):', roc_auc_score(y_test, nb_proba))

Naive Bayes — Accuracy: 0.9665547830256533

Confusion Matrix:
 [[7613  293]
 [ 265 8513]]

Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.96      0.96      7906
           1       0.97      0.97      0.97      8778

    accuracy                           0.97     16684
   macro avg       0.97      0.97      0.97     16684
weighted avg       0.97      0.97      0.97     16684


ROC AUC (NB): 0.9933466566054074


# 9. Model 2 — Logistic Regression

In [49]:
lr = LogisticRegression(max_iter=2000)

In [50]:
lr.fit(X_train_tfidf, y_train)
lr_pred = lr.predict(X_test_tfidf)
lr_proba = lr.predict_proba(X_test_tfidf)[:,1]

In [51]:
print('Logistic Regression — Accuracy:', accuracy_score(y_test, lr_pred))
print('\nConfusion Matrix:\n', confusion_matrix(y_test, lr_pred))
print('\nClassification Report:\n', classification_report(y_test, lr_pred))
print('\nROC AUC (LR):', roc_auc_score(y_test, lr_proba))

Logistic Regression — Accuracy: 0.9823183888755694

Confusion Matrix:
 [[7703  203]
 [  92 8686]]

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.97      0.98      7906
           1       0.98      0.99      0.98      8778

    accuracy                           0.98     16684
   macro avg       0.98      0.98      0.98     16684
weighted avg       0.98      0.98      0.98     16684


ROC AUC (LR): 0.9972969804060782


# 10. Compare Models

In [53]:
from sklearn.metrics import precision_recall_fscore_support


results = []
for name, pred, proba in [
('Naive Bayes', nb_pred, nb_proba),
('Logistic Regression', lr_pred, lr_proba)
]:
  acc = accuracy_score(y_test, pred)
  cm = confusion_matrix(y_test, pred)
  report = classification_report(y_test, pred, output_dict=True)
  roc = roc_auc_score(y_test, proba)
  results.append({'model': name, 'accuracy': acc, 'roc_auc': roc, 'precision_spam': report['1']['precision'], 'recall_spam': report['1']['recall'], 'f1_spam': report['1']['f1-score']})


pd.DataFrame(results)

Unnamed: 0,model,accuracy,roc_auc,precision_spam,recall_spam,f1_spam
0,Naive Bayes,0.966555,0.993347,0.966727,0.969811,0.968267
1,Logistic Regression,0.982318,0.997297,0.977163,0.989519,0.983302


# 11. Error Analysis

In [54]:
# show a few false negatives (spam predicted as ham)
import numpy as np

nb_wrong_idx = np.where((y_test.values==1) & (nb_pred==0))[0]
print('Number of spam missed by NB:', len(nb_wrong_idx))


Number of spam missed by NB: 265


In [57]:
# show up to 5 missed spam examples
for i in nb_wrong_idx[:5]:
  print('\n    Message (MISSED by NB)   ')
  print('Raw:', X_test.iloc[i])
  print('Predicted by NB:', nb_pred[i], ' Pred by LR:', lr_pred[i])


    Message (MISSED by NB)   
Raw: file bad
Predicted by NB: 0  Pred by LR: 1

    Message (MISSED by NB)   
Raw: harvard medical coping mpotence learn bouncy nobleleeward afterlife follyeloise boustrophedon gutsyiconoclast squeeze downfallsaguaro leftmost bayouhuckleberry traitorous caloriedusk compile catskillmemphis privy bialystokgrip tektite gratitudewy owly updraftburke whiplash silkymush accumulate ashencab
Predicted by NB: 0  Pred by LR: 1

    Message (MISSED by NB)   
Raw: run axon exe corporate users allowed download order first buyer upgrade older licenses
Predicted by NB: 0  Pred by LR: 1

    Message (MISSED by NB)   
Raw: name terminal port ip address login session complete dozens ports done one fi mkdir var spool printer name also account login name local host touch tree verify delta also use flag architectures discussion limit designate default router changing line escapenumber escapenumber also buy many ansi standards committee password equal previous password one wa

# 12. Sample Predictions

In [58]:
def predict_single(text):
  cleaned = clean_text(text)
  vec = vectorizer.transform([cleaned])
  nb_p = nb.predict(vec)[0]
  lr_p = lr.predict(vec)[0]
  nb_prob = nb.predict_proba(vec)[0,1]
  lr_prob = lr.predict_proba(vec)[0,1]
  return {'raw': text, 'cleaned': cleaned, 'nb_pred': int(nb_p), 'nb_prob_spam': float(nb_prob), 'lr_pred': int(lr_p), 'lr_prob_spam': float(lr_prob)}


examples = [
"Congratulations! You have WON a $1000 Walmart gift card. Click here",
"Hi Ananya, please review the attached minutes from today's meeting and share feedback",
"Limited time OFFER: Get cheap meds, no prescription needed!",
]


for e in examples:
  print('\n', predict_single(e))


 {'raw': 'Congratulations! You have WON a $1000 Walmart gift card. Click here', 'cleaned': 'congratulations walmart gift card click', 'nb_pred': 1, 'nb_prob_spam': 0.9343237044873406, 'lr_pred': 1, 'lr_prob_spam': 0.8502240680659061}

 {'raw': "Hi Ananya, please review the attached minutes from today's meeting and share feedback", 'cleaned': 'hi ananya please review attached minutes todays meeting share feedback', 'nb_pred': 0, 'nb_prob_spam': 0.202655846699817, 'lr_pred': 0, 'lr_prob_spam': 0.04694033249453965}

 {'raw': 'Limited time OFFER: Get cheap meds, no prescription needed!', 'cleaned': 'limited time offer get cheap meds prescription needed', 'nb_pred': 1, 'nb_prob_spam': 0.9987258324398949, 'lr_pred': 1, 'lr_prob_spam': 0.9961794865624835}
