In [2]:
import pandas as pd


In [3]:
df = pd.read_csv('spam_ham_dataset.csv')
df.head(10)


Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0
5,2949,ham,Subject: ehronline web address change\r\nthis ...,0
6,2793,ham,Subject: spring savings certificate - take 30 ...,0
7,4185,spam,Subject: looking for medication ? we ` re the ...,1
8,2641,ham,Subject: noms / actual flow for 2 / 26\r\nwe a...,0
9,1870,ham,"Subject: nominations for oct . 21 - 23 , 2000\...",0


In [4]:
df.isnull().sum()

Unnamed: 0    0
label         0
text          0
label_num     0
dtype: int64

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5171 entries, 0 to 5170
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  5171 non-null   int64 
 1   label       5171 non-null   object
 2   text        5171 non-null   object
 3   label_num   5171 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 161.7+ KB


In [6]:
df.describe()

Unnamed: 0.1,Unnamed: 0,label_num
count,5171.0,5171.0
mean,2585.0,0.289886
std,1492.883452,0.453753
min,0.0,0.0
25%,1292.5,0.0
50%,2585.0,0.0
75%,3877.5,1.0
max,5170.0,1.0


In [7]:
print("Spam vs. Ham counts:\n", df['label'].value_counts())

Spam vs. Ham counts:
 label
ham     3672
spam    1499
Name: count, dtype: int64


In [8]:
import re  # For text pattern matching (like finding URLs)
import nltk  # Natural Language Toolkit (for text processing)
from nltk.corpus import stopwords  # Common words like "the", "is", etc.
from nltk.stem import PorterStemmer  # Reduces words to root form (e.g., "running" → "run")

In [9]:
nltk.download('stopwords')   # Downloads a list of common words to ignore

[nltk_data] Error loading stopwords: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>


False

In [10]:
stemmer = PorterStemmer()  # Helps convert words like "playing" → "play"
stop_words = set(stopwords.words('english')) # Words like "the", "and", "is" (not useful for ML)

In [11]:
def clean_txt(text):
    text = text.lower() # Makes all letters lowercase ("Hello" → "hello")
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)  # Removes links
    text = re.sub(r'\@\w+|\#', '', text)  # Removes Twitter-style @mentions and #hashtags
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Removes numbers, punctuation (!, ?)

    words = text.split()  # Splits sentence into words
    words = [word for word in words if word not in stop_words]  # Removes "the", "and", etc.    

    return ' '.join(words)


In [12]:
df['cleaned_text'] = df['text'].apply(clean_txt)  # Runs clean_text() on every email

In [13]:
df['cleaned_text'].head(3)


0    subject enron methanol meter follow note gave ...
1    subject hpl nom january see attached file hpln...
2    subject neon retreat ho ho ho around wonderful...
Name: cleaned_text, dtype: object

In [14]:
df['text'].head(3)

0    Subject: enron methanol ; meter # : 988291\r\n...
1    Subject: hpl nom for january 9 , 2001\r\n( see...
2    Subject: neon retreat\r\nho ho ho , we ' re ar...
Name: text, dtype: object

In [None]:
#Feature extraction (Text to Numbers)
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(max_features=5000)  # Keep top 5000 words
X = tfidf.fit_transform(df['cleaned_text'])
y = df['label']


In [16]:
# train test split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [19]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Model 1: Naive Bayes (Best for Text)
nb = MultinomialNB()
nb.fit(X_train, y_train)
y_pred = nb.predict(X_test)
print("Naive Bayes Accuracy:", accuracy_score(y_test, y_pred))
# print(classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


# Model 2: Random Forest
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
# print(classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Model 3: SVM (Good for high-dimensional data)
svm = SVC(kernel='linear')
svm.fit(X_train, y_train)
y_pred_svm = svm.predict(X_test)
print("SVM Accuracy:", accuracy_score(y_test, y_pred_svm))
# print(classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Naive Bayes Accuracy: 0.9536231884057971
Confusion Matrix:
 [[714  28]
 [ 20 273]]
Random Forest Accuracy: 0.9797101449275363
Confusion Matrix:
 [[714  28]
 [ 20 273]]
SVM Accuracy: 0.9874396135265701
Confusion Matrix:
 [[714  28]
 [ 20 273]]


In [18]:
from sklearn.model_selection import GridSearchCV

# Example: Tuning Naive Bayes
params = {'alpha': [0.1, 0.5, 1.0, 2.0]}
grid = GridSearchCV(MultinomialNB(), params, cv=5)
grid.fit(X_train, y_train)
print("Best parameters:", grid.best_params_)

Best parameters: {'alpha': 0.1}
