In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

In [2]:
# Source files: an e-mail corpus and an SMS corpus, both read relative to the
# notebook's working directory.
EMAILS_CSV = "datasets/emails.csv"
SMS_CSV = "datasets/spam.csv"

df1 = pd.read_csv(EMAILS_CSV)
df2 = pd.read_csv(SMS_CSV)

In [3]:
df1.head()

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


In [4]:
df2.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5728 entries, 0 to 5727
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    5728 non-null   object
 1   spam    5728 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 89.6+ KB


In [6]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [7]:
# Clean df2: give it the same column names as df1 ("text", "spam") and encode
# the label as an integer (spam -> 1, ham -> 0).
#
# The guard keeps this cell safe to re-run. Without it, a second execution
# finds the columns already renamed and maps the already-numeric labels
# through {"spam": 1, "ham": 0}, which turns every label into NaN; the later
# dropna() would then silently discard all of df2.
if "Category" in df2.columns:
    df2 = df2.rename(columns={"Category": "spam", "Message": "text"})
    df2["spam"] = df2["spam"].map({"spam": 1, "ham": 0})

# Series.map leaves NaN for any label missing from the mapping; fail loudly
# here instead of letting a later dropna() hide the lost rows.
assert df2["spam"].notna().all(), "unexpected label in spam.csv Category column"

# Just to check correctness
print(df1.columns)
print(df2.columns)


Index(['text', 'spam'], dtype='object')
Index(['spam', 'text'], dtype='object')


In [8]:
# Stack both sources into a single frame (concat renumbers the rows), then
# keep only the first row for each distinct message text and discard any row
# that still contains a missing value.
combined_df = (
    pd.concat([df1, df2], ignore_index=True)
    .drop_duplicates(subset="text")
    .dropna()
)

# Verify result
print("✅ Combined shape:", combined_df.shape)


✅ Combined shape: (10852, 2)


In [9]:
combined_df.head(10)

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1
5,"Subject: great nnews hello , welcome to medzo...",1
6,Subject: here ' s a hot play in motion homela...,1
7,Subject: save your money buy getting this thin...,1
8,Subject: undeliverable : home based business f...,1
9,Subject: save your money buy getting this thin...,1


In [10]:
combined_df.isnull().sum()

text    0
spam    0
dtype: int64

In [11]:
# Counts rows that are identical across *all* columns. This is a weaker check
# than the text-only de-duplication done when the frames were combined.
combined_df.duplicated().sum()

np.int64(0)

In [12]:
combined_df.shape

(10852, 2)

In [13]:
# Strip a leading "Subject:" marker (any letter case) together with the
# whitespace that follows it. The pattern is anchored at the start of the
# string, so occurrences later in a message are left alone.
subject_prefix = r"^Subject:\s*"
text_without_subject = combined_df["text"].str.replace(
    subject_prefix, "", case=False, regex=True
)
combined_df["text"] = text_without_subject

# Optional: verify it worked
combined_df["text"].head(10)


0    naturally irresistible your corporate identity...
1    the stock trading gunslinger  fanny is merrill...
2    unbelievable new homes made easy  im wanting t...
3    4 color printing special  request additional i...
4    do not have money , get software cds from here...
5    great nnews  hello , welcome to medzonline sh ...
6    here ' s a hot play in motion  homeland securi...
7    save your money buy getting this thing here  y...
8    undeliverable : home based business for grownu...
9    save your money buy getting this thing here  y...
Name: text, dtype: object

In [14]:

import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Fetch the stop-word corpus (NLTK reports "already up-to-date" when it is
# already installed locally).
nltk.download('stopwords')
# Shared stemmer instance used by clean_text below.
ps = PorterStemmer()
# Kept as a set so the per-token membership test in clean_text is a hash lookup.
stop_words = set(stopwords.words('english'))

# Translation table that deletes every ASCII punctuation character. Built once
# here instead of on every call to clean_text.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def clean_text(text):
    """Normalise a message into a space-joined string of stemmed tokens.

    Steps, in order: lowercase the text, delete runs of digits, split on
    whitespace, drop stop words, strip ASCII punctuation from each surviving
    token, and stem it with the module-level ``ps`` stemmer.

    Stop words are matched against each token both with its inner punctuation
    intact and with all punctuation removed. Matching only after punctuation
    removal lets contracted stop words slip through: "don't" becomes "dont",
    which is not in ``stop_words`` (NLTK's English list contains entries such
    as "don't").

    Args:
        text: The message to clean; must be a ``str``.

    Returns:
        The cleaned message as a single string; empty if no token survives.
    """
    text = text.lower()  # lowercase
    text = re.sub(r'\d+', '', text)  # remove numbers

    stems = []
    for raw_token in text.split():
        bare_token = raw_token.translate(_PUNCT_TABLE)
        if not bare_token:
            # The token consisted of punctuation only (e.g. a lone ",").
            continue
        # Trim surrounding punctuation but keep inner apostrophes, so that
        # "don't," is compared as "don't".
        trimmed_token = raw_token.strip(string.punctuation)
        if trimmed_token in stop_words or bare_token in stop_words:
            continue
        stems.append(ps.stem(bare_token))
    return " ".join(stems)

# Apply cleaning
# The result goes into a new "clean_text" column; the "text" column itself is
# not modified here.
combined_df["clean_text"] = combined_df["text"].apply(clean_text)
combined_df.head()


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\abhyu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,text,spam,clean_text
0,naturally irresistible your corporate identity...,1,natur irresist corpor ident lt realli hard rec...
1,the stock trading gunslinger fanny is merrill...,1,stock trade gunsling fanni merril muzo colza a...
2,unbelievable new homes made easy im wanting t...,1,unbeliev new home made easi im want show homeo...
3,4 color printing special request additional i...,1,color print special request addit inform click...
4,"do not have money , get software cds from here...",1,money get softwar cd softwar compat great grow...


In [15]:
import nltk  # NOTE(review): nltk is already imported in the previous cell; this re-import is redundant

# Download tokenization data safely
nltk.download('punkt')
nltk.download('punkt_tab')  # some new NLTK versions need this extra package

from nltk.tokenize import word_tokenize

# Now tokenize your text
# Tokens are produced from the "text" column, not from "clean_text", so
# punctuation marks and stop words appear as tokens and nothing is stemmed.
combined_df["tokens"] = combined_df["text"].apply(word_tokenize)

# Preview results
combined_df.head()



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\abhyu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\abhyu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Unnamed: 0,text,spam,clean_text,tokens
0,naturally irresistible your corporate identity...,1,natur irresist corpor ident lt realli hard rec...,"[naturally, irresistible, your, corporate, ide..."
1,the stock trading gunslinger fanny is merrill...,1,stock trade gunsling fanni merril muzo colza a...,"[the, stock, trading, gunslinger, fanny, is, m..."
2,unbelievable new homes made easy im wanting t...,1,unbeliev new home made easi im want show homeo...,"[unbelievable, new, homes, made, easy, im, wan..."
3,4 color printing special request additional i...,1,color print special request addit inform click...,"[4, color, printing, special, request, additio..."
4,"do not have money , get software cds from here...",1,money get softwar cd softwar compat great grow...,"[do, not, have, money, ,, get, software, cds, ..."


In [16]:
# Drop the raw "text" column now that "clean_text" and "tokens" exist.
# Rebinding (instead of inplace=True) together with errors="ignore" keeps this
# cell safe to re-run: the in-place drop raised KeyError on a second execution
# because "text" was already gone.
combined_df = combined_df.drop(columns="text", errors="ignore")


In [17]:
combined_df.head()

Unnamed: 0,spam,clean_text,tokens
0,1,natur irresist corpor ident lt realli hard rec...,"[naturally, irresistible, your, corporate, ide..."
1,1,stock trade gunsling fanni merril muzo colza a...,"[the, stock, trading, gunslinger, fanny, is, m..."
2,1,unbeliev new home made easi im want show homeo...,"[unbelievable, new, homes, made, easy, im, wan..."
3,1,color print special request addit inform click...,"[4, color, printing, special, request, additio..."
4,1,money get softwar cd softwar compat great grow...,"[do, not, have, money, ,, get, software, cds, ..."


In [18]:
# Persist the processed frame next to the notebook, without the index column.
# NOTE(review): "tokens" holds Python lists, so CSV stores their string form
# and a reader gets strings back, not lists. Confirm downstream code expects that.
cleaned_csv_path = "cleaned_spam_dataset.csv"
combined_df.to_csv(cleaned_csv_path, index=False)
print("✅ Saved cleaned dataset as cleaned_spam_dataset.csv")

✅ Saved cleaned dataset as cleaned_spam_dataset.csv
