Imports and Setup

In [34]:
import pandas as pd
import urllib.request
import zipfile
import os
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split

# Download necessary NLTK data (stopwords for filtering non-informative words)
nltk.download('stopwords')

print("Libraries imported and NLTK data downloaded.")

# Initialize Stemmer and Stopwords once to save computation time during the loop
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

Libraries imported and NLTK data downloaded.


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Deepanshi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Data Acquisition and Loading

In [35]:
# --- 1. Download the dataset from UCI ---
url = "https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip"
zip_path = "data/sms_spam_collection.zip"

In [36]:
# Download the zip file if it doesn't exist
if not os.path.exists(zip_path):
    urllib.request.urlretrieve(url, zip_path)
    print("Dataset downloaded.")


Dataset downloaded.


In [37]:
# --- 2. Unzip the file ---
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(".")


In [39]:
# --- 3. Load into Pandas ---
# The UCI dataset is tab-separated with no header.
# We manually assign the columns 'label' and 'text'.
df = pd.read_csv('data/SMSSpamCollection', sep='\t', header=None, names=['label', 'text'])


In [22]:
print(f"Dataset shape: {df.shape}")
print(df.head())

Dataset shape: (5572, 2)
  label                                               text
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


Defining Preprocessing Logic

In [40]:
def clean_text(text):
    """
    Preprocessing pipeline to normalize SMS text:
    1. Lowercase: Normalize case sensitivity.
    2. Regex: Remove special characters/numbers (keep only alphabets).
    3. Tokenize: Split string into a list of words.
    4. Remove Stopwords: Filter out common words (e.g., 'the', 'is').
    5. Stemming: Reduce words to their root form (e.g., 'calling' -> 'call').
    """
    if not isinstance(text, str):
        return ""
        
    # 1. Lowercase
    text = text.lower()
    
    # 2. Remove non-alphabetic characters (keep spaces)
    text = re.sub(r'[^a-z\s]', '', text)
    
    # 3. Tokenize
    words = text.split()
    
    # 4. & 5. Remove stopwords and Stem
    cleaned_words = [stemmer.stem(word) for word in words if word not in stop_words]
    
    return " ".join(cleaned_words)

Executing Cleaning and Label Encoding

In [41]:
print(f"Original shape: {df.shape}")



Original shape: (5572, 2)


In [42]:
# --- 1. Drop Duplicates ---
# Critical for SMS datasets as duplicate messages can cause data leakage 
# between train and test sets.
df = df.drop_duplicates(keep='first')
print(f"Shape after dropping duplicates: {df.shape}")


Shape after dropping duplicates: (5169, 2)


In [43]:
# --- 2. Encode Labels ---
# Map string labels to binary integers: Spam = 1, Ham = 0
df['label'] = df['label'].map({'spam': 1, 'ham': 0})



In [27]:
# --- 3. Apply Text Cleaning ---
# Apply the clean_text function to create the feature column
df['clean_text'] = df['text'].apply(clean_text)


In [28]:
# Remove any empty rows created by aggressive cleaning (e.g., messages with only special chars)
df = df[df['clean_text'].str.len() > 0]

print("Preprocessing complete.")
print(df[['text', 'clean_text', 'label']].head())

Preprocessing complete.
                                                text  \
0  Go until jurong point, crazy.. Available only ...   
1                      Ok lar... Joking wif u oni...   
2  Free entry in 2 a wkly comp to win FA Cup fina...   
3  U dun say so early hor... U c already then say...   
4  Nah I don't think he goes to usf, he lives aro...   

                                          clean_text  label  
0  go jurong point crazi avail bugi n great world...      0  
1                              ok lar joke wif u oni      0  
2  free entri wkli comp win fa cup final tkt st m...      1  
3                u dun say earli hor u c alreadi say      0  
4          nah dont think goe usf live around though      0  


Data Splitting

In [29]:
# --- Stratified Split ---
# We use 'stratify' to ensure the ratio of spam-to-ham remains consistent 
# across all three splits (Train, Validation, Test).

# Target Split Sizes: 70% Train, 15% Validation, 15% Test

# 1. Split off the Test set (15% of total)
train_val, test = train_test_split(df, test_size=0.15, random_state=42, stratify=df['label'])

# 2. Split the remaining (85%) into Train and Validation
# To get 15% of the TOTAL original data for validation, we need ~17.65% of the REMAINING data.
# Calculation: 0.15 / 0.85 = 0.17647...
val_size = 0.15 / 0.85
train, val = train_test_split(train_val, test_size=val_size, random_state=42, stratify=train_val['label'])

print(f"Train Size: {len(train)}")
print(f"Val Size:   {len(val)}")
print(f"Test Size:  {len(test)}")

Train Size: 3613
Val Size:   775
Test Size:  775


Save Processed Data

In [30]:
# Save the processed splits to CSV files
# We exclude the index to keep the file clean
train.to_csv('train.csv', index=False)
val.to_csv('validation.csv', index=False)
test.to_csv('test.csv', index=False)

print("Files saved successfully: train.csv, validation.csv, test.csv")

Files saved successfully: train.csv, validation.csv, test.csv


In [31]:
train.head()

Unnamed: 0,label,text,clean_text
4448,0,Please tell me you have some of that special s...,pleas tell special stock talk
3428,0,Haha okay... Today weekend leh...,haha okay today weekend leh
1660,0,"Yeah, where's your class at?",yeah where class
796,0,it's really getting me down just hanging around.,realli get hang around
5408,0,... Are you in the pub?,pub


In [32]:
test.head()

Unnamed: 0,label,text,clean_text
142,0,"Sir, Waiting for your mail.",sir wait mail
5459,0,If you hear a loud scream in about &lt;#&gt; m...,hear loud scream ltgt minut caus gyno shove th...
2059,0,Ugh fuck it I'm resubbing to eve,ugh fuck im resub eve
4357,0,Great. So should i send you my account number.,great send account number
3683,0,"Hello, hello, hi lou sorry it took so long 2 r...",hello hello hi lou sorri took long repli left ...


In [33]:
val.head()

Unnamed: 0,label,text,clean_text
4887,0,Or just do that 6times,time
1992,0,No other Valentines huh? The proof is on your ...,valentin huh proof fb page ugh im glad realli ...
4596,0,Yo sorry was in the shower sup,yo sorri shower sup
5545,0,Hi its in durban are you still on this number,hi durban still number
1652,0,I wan but too early lei... Me outside now wun ...,wan earli lei outsid wun b home earli neva mind
