# Lab 3 — Medium+ NLP: TF-IDF + Linear SVM + GridSearch

**Goal:** Build an industry-style pipeline, tune it, save the best model, and demo custom predictions.

**Dataset:** SMS Spam Collection (reused from Lab 2).

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report

# Reaching this line means every import above resolved without error.
print('✅ Ready')

## 1) Download + load dataset (repeated here so this notebook runs standalone)

In [None]:
import csv
import os
import re
import urllib.request
import zipfile

# Remote archive, local cache file, and the extracted tab-separated data file.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip"
zip_path = "smsspamcollection.zip"
data_path = 'sms_data/SMSSpamCollection'

# Download only when the archive is not cached yet, so re-running the cell
# does not hit the network again. The download goes to a temporary name and
# is renamed afterwards so an interrupted transfer never leaves a truncated
# file at zip_path.
if not os.path.exists(zip_path):
    tmp_path = zip_path + ".part"
    urllib.request.urlretrieve(url, tmp_path)
    os.replace(tmp_path, zip_path)

# Extract only when the data file is missing.
if not os.path.exists(data_path):
    with zipfile.ZipFile(zip_path, 'r') as z:
        z.extractall('sms_data')

# The file is "<label>\t<message>" per line with no header row.
# quoting=csv.QUOTE_NONE: SMS text is free-form and may contain stray double
# quotes; with the default quoting rules the parser treats a leading quote as
# the start of a quoted field and can merge several lines into a single row.
df = pd.read_csv(
    data_path,
    sep='\t',
    header=None,
    names=['label', 'text'],
    quoting=csv.QUOTE_NONE,
)

# Encode the target as integers: ham -> 0, spam -> 1.
df['label'] = df['label'].map({'ham': 0, 'spam': 1})

# Series.map leaves NaN for any value missing from the mapping; fail here
# rather than letting NaN labels reach the split / model.
if df['label'].isna().any():
    raise ValueError("Unexpected label value in dataset (expected 'ham' or 'spam')")

def clean_text(t: str) -> str:
    """Return a normalized copy of message *t*.

    Steps, in order: lowercase the text, drop tokens starting with
    ``http`` or ``www``, replace every character that is not ``a-z``,
    ``0-9`` or whitespace with a space, then collapse whitespace runs to a
    single space and trim both ends.
    """
    lowered = t.lower()
    without_links = re.sub(r"http\S+|www\S+", "", lowered)
    alnum_only = re.sub(r"[^a-z0-9\s]", " ", without_links)
    collapsed = re.sub(r"\s+", " ", alnum_only)
    return collapsed.strip()


# Add a normalized version of every message next to the raw text.
df['clean_text'] = df['text'].apply(clean_text)
print(df.head())

# Hold out 25% for testing; stratify on the label so both splits keep the
# same ham/spam ratio, and fix the seed so the split is repeatable.
features = df['clean_text']
labels = df['label']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.25,
    random_state=42,
    stratify=labels,
)

print('Train size:', len(X_train), 'Test size:', len(X_test))

## 2) TF-IDF + LinearSVC baseline

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC


# Baseline pipeline: default TF-IDF features feeding a linear SVM.
tfidf_svc = Pipeline([
    ('tfidf', TfidfVectorizer()),
    # Seed the classifier so repeated runs give the same baseline, in line
    # with the seeded train/test split (random_state=42).
    ('model', LinearSVC(random_state=42)),
])

# Train on the cleaned training messages, then score on the held-out split.
tfidf_svc.fit(X_train, y_train)
y_pred = tfidf_svc.predict(X_test)

# Per-class precision / recall / F1 for the untuned baseline.
print(classification_report(y_test, y_pred))

## 3) GridSearchCV (fast tuning)

In [None]:
# Search space over the vectorizer only: unigrams vs. unigrams+bigrams,
# crossed with three minimum document-frequency cut-offs (6 candidates).
params = dict(
    tfidf__ngram_range=[(1, 1), (1, 2)],
    tfidf__min_df=[1, 2, 5],
)

# 3-fold cross-validated search over the baseline pipeline, using all cores.
gs = GridSearchCV(
    estimator=tfidf_svc,
    param_grid=params,
    cv=3,
    n_jobs=-1,
    verbose=1,
).fit(X_train, y_train)

print('✅ Best Params:', gs.best_params_)
print('✅ Best CV Score:', gs.best_score_)

## 4) Evaluate best model

In [None]:
# Take the pipeline that the grid search selected.
best_model = gs.best_estimator_

# Score it on the held-out test split and report per-class metrics.
y_pred_best = best_model.predict(X_test)
best_report = classification_report(y_test, y_pred_best)
print(best_report)

## 5) Save model

In [None]:
import joblib

# Persist the tuned pipeline to disk. clean_text() is not part of the
# pipeline, so anything loading this file has to apply it to its input first.
model_path = 'spam_tfidf_model.joblib'
joblib.dump(best_model, model_path)
print(f'✅ Saved: {model_path}')

## 6) Demo predictions

In [None]:
# Hand-written messages to sanity-check the tuned model.
samples = [
    "Congratulations! You've won a free iPhone. Claim now!",
    "Are you coming to college tomorrow?",
    "URGENT: Your bank account is blocked. Verify now!"
]

# The model was fitted on clean_text() output, so inference input must go
# through the same normalization; predicting on the raw strings would feed it
# text in a different form than it was trained on.
cleaned_samples = [clean_text(s) for s in samples]
preds = best_model.predict(cleaned_samples)

# Show the predicted label (0 = ham, 1 = spam) next to the original message.
for s, p in zip(samples, preds):
    print(f"[{p}] {s}")