# Lab 2 — Medium NLP (Download Dataset): SMS Spam (BoW + Logistic Regression)

**Dataset:** UCI SMS Spam Collection (public zip)

**Goal:** Text cleaning → Bag of Words → Train classifier → Evaluate + error analysis.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Sanity check: this line is only reached if every import above succeeded.
print('✅ Ready')

## 1) Download dataset

In [None]:
import os, zipfile, urllib.request

# Remote archive of the UCI SMS Spam Collection and the local file it is saved to.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip"
zip_path = "smsspamcollection.zip"

# Notebook cells get re-run often: reuse a previous download when it is a
# valid zip instead of hitting the network again.
# (zipfile.is_zipfile returns False for a missing or non-zip file.)
if zipfile.is_zipfile(zip_path):
    print("✅ Already downloaded:", zip_path)
else:
    urllib.request.urlretrieve(url, zip_path)
    # Fail here, with a clear message, rather than later with BadZipFile if
    # the server answered with something that is not the archive.
    if not zipfile.is_zipfile(zip_path):
        os.remove(zip_path)
        raise RuntimeError(f"Downloaded file is not a valid zip archive: {url}")
    print("✅ Downloaded:", zip_path)

## 2) Unzip + load into Pandas

In [None]:
import csv

# Unpack the downloaded archive into a local folder.
with zipfile.ZipFile(zip_path, 'r') as z:
    z.extractall('sms_data')

print('Files:', os.listdir('sms_data'))

# The data file has no header row; each line is "<label>\t<message>".
data_path = 'sms_data/SMSSpamCollection'
df = pd.read_csv(
    data_path,
    sep='\t',
    header=None,
    names=['label', 'text'],
    # Messages are raw text, not CSV-quoted fields. With the default quoting,
    # a message containing an unbalanced double quote can swallow the
    # following line(s) into a single record, silently dropping rows.
    quoting=csv.QUOTE_NONE,
)
df.head()

## 3) Encode labels

In [None]:
# Numeric encoding of the target: ham (legitimate) -> 0, spam -> 1.
label_map = {'ham': 0, 'spam': 1}

# A plain .map(label_map) turns every unknown value into NaN without warning,
# and re-running this cell would map the already-encoded 0/1 to NaN as well.
# Reject anything that is neither a known label nor an already-encoded value.
unexpected = set(df['label'].unique()) - set(label_map) - set(label_map.values())
if unexpected:
    raise ValueError(f"Unexpected label values: {unexpected!r}")

# Strings are encoded; values that are already 0/1 pass through unchanged,
# which makes the cell safe to execute more than once.
df['label'] = df['label'].map(lambda value: label_map.get(value, value))
df['label'].value_counts()

## 4) Clean text + train/test split

In [None]:
import re

# Patterns are compiled once because clean_text is applied to every message.
# URLs are only matched at a word boundary so that ordinary words containing
# "www" or "http" in the middle (e.g. "awwww") are not truncated.
_URL_RE = re.compile(r"\bhttp\S+|\bwww\.\S+")
_NON_ALNUM_RE = re.compile(r"[^a-z0-9\s]")
_WHITESPACE_RE = re.compile(r"\s+")


def clean_text(t: str) -> str:
    """Normalize a raw message for bag-of-words vectorization.

    Steps, in order: lowercase, remove URLs (tokens starting with ``http``
    or ``www.``), replace every character that is not ``a-z``, ``0-9`` or
    whitespace with a space, then collapse whitespace runs and trim.

    Args:
        t: Raw message text.

    Returns:
        The cleaned text; an empty string if nothing survives cleaning.
    """
    t = t.lower()
    t = _URL_RE.sub("", t)
    # Replace with a space (not ""), so "free!!!win" stays two tokens.
    t = _NON_ALNUM_RE.sub(" ", t)
    return _WHITESPACE_RE.sub(" ", t).strip()

# Add the normalized version of every message as a new column.
df['clean_text'] = df['text'].map(clean_text)

# Hold out 25% of the messages for testing. Stratifying on the label keeps
# the ham/spam ratio the same in both splits; the fixed seed makes the split
# reproducible.
features = df['clean_text']
targets = df['label']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    targets,
    test_size=0.25,
    random_state=42,
    stratify=targets,
)

## 5) Bag of Words + Logistic Regression

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

# Two-stage pipeline: token counts (bag of words) feeding a logistic
# regression classifier.
pipeline_steps = [
    ('vectorizer', CountVectorizer()),
    ('model', LogisticRegression(max_iter=500)),
]
bow_clf = Pipeline(pipeline_steps)

# The vectorizer's vocabulary is learned from the training split only.
bow_clf.fit(X_train, y_train)
y_pred = bow_clf.predict(X_test)

# Per-class precision / recall / F1 on the held-out split.
report = classification_report(y_test, y_pred)
print(report)

## 6) Error analysis (very important)

In [None]:
# Put the test inputs next to the true and predicted labels.
# Note: the 'text' column holds the cleaned text that was fed to the model.
test_df = pd.DataFrame({'text': X_test, 'y_true': y_test, 'y_pred': y_pred})

# Keep only the rows where the prediction disagrees with the ground truth.
is_misclassified = test_df['y_true'].ne(test_df['y_pred'])
wrong = test_df.loc[is_misclassified]
wrong.head(10)