# Hands-on Assignment: Text Preprocessing on SMS Spam Collection Dataset

In [None]:
import pandas as pd
import numpy as np
import string
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Make sure the NLTK 'stopwords' corpus is available locally; the
# stopwords.words('english') lookup later in this notebook reads from it.
nltk.download('stopwords')

In [None]:
# Read the SMS spam CSV (decoded as latin-1), keep only the two columns that
# carry the label and the text ('v1', 'v2'), and give them descriptive names.
df = (
    pd.read_csv("spam.csv", encoding="latin-1")
    .loc[:, ['v1', 'v2']]
    .rename(columns={'v1': 'label', 'v2': 'message'})
)

# Preview the first rows of the trimmed frame
df.head()

In [None]:
# Check dataset info and class distribution.
# DataFrame.info() writes its report straight to stdout and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
df.info()
print("\nClass distribution:")
print(df['label'].value_counts())

In [None]:
# Normalise the labels and the message text.
# NOTE: 'message' is overwritten in place, so the raw SMS text is not kept.

# Built once up front rather than once per row.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)
_DIGIT_RUN = re.compile(r'\d+')

# Convert labels to binary (ham=0, spam=1); any other label value becomes NaN
df['label_num'] = df['label'].map({'ham': 0, 'spam': 1})

# Lowercase, delete ASCII punctuation, then delete runs of digits.
# The vectorised .str accessors pass missing values through untouched, whereas
# calling x.translate(...) / re.sub(...) per element raises on a NaN message.
df['message'] = (
    df['message']
    .str.lower()
    .str.translate(_PUNCT_TABLE)
    .str.replace(_DIGIT_RUN, '', regex=True)
)

df.head()

In [None]:
# Stopword set and stemmer shared by preprocess_text().
#
# The messages have already had punctuation deleted by the time they are
# tokenised, so a contraction such as "don't" shows up in the text as "dont".
# Any stopword entry that still contains punctuation could therefore never
# match; add a punctuation-free variant of every entry next to the original.
_strip_punct = str.maketrans('', '', string.punctuation)
_base_stopwords = stopwords.words('english')
stop_words = set(_base_stopwords) | {
    word.translate(_strip_punct) for word in _base_stopwords
}
ps = PorterStemmer()

def preprocess_text(text):
    """Return *text* with stopwords dropped and the remaining words stemmed.

    The text is split on whitespace; tokens found in the module-level
    ``stop_words`` collection are discarded, every other token is passed
    through ``ps.stem``, and the results are re-joined with single spaces.
    """
    stemmed_tokens = []
    for token in text.split():
        if token in stop_words:
            # Stopwords are skipped before stemming, so they are never stemmed.
            continue
        stemmed_tokens.append(ps.stem(token))
    return ' '.join(stemmed_tokens)

# Run every message through preprocess_text() and store the result in a new
# 'clean_message' column, then preview the frame.
df['clean_message'] = df['message'].map(preprocess_text)
df.head()

In [None]:
# Side-by-side preview of 'message' and 'clean_message' for the first 10 rows.
# NOTE: 'message' was lowercased and stripped of punctuation/digits in place
# earlier in this notebook, so it is the normalised text, not the raw SMS.
df.head(10)[['message', 'clean_message']]