# 🤖 SMS Spam Classification - Final Pipeline Version

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, f1_score, classification_report
import joblib
import warnings
warnings.filterwarnings("ignore")

## 📥 Load and Prepare Data

In [2]:
url = "https://raw.githubusercontent.com/justmarkham/pycon-2016-tutorial/master/data/sms.tsv"
df = pd.read_table(url, header=0, names=["label", "message"])
df['label'] = df['label'].map({'ham': 0, 'spam': 1})
X = df['message']
y = df['label']

## ✂️ Train-Test Split

In [3]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## 🔍 Compare Multiple Models with Pipeline

In [4]:
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Naive Bayes": MultinomialNB(),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "SVM": LinearSVC()
}

results = []

for name, model in models.items():
    pipe = Pipeline([
        ("tfidf", TfidfVectorizer(max_features=3000)),
        ("clf", model)
    ])
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    results.append({"Model": name, "Accuracy": acc, "F1 Score": f1})

results_df = pd.DataFrame(results).sort_values(by="F1 Score", ascending=False)
results_df

Unnamed: 0,Model,Accuracy,F1 Score
3,SVM,0.98565,0.948052
2,Random Forest,0.98296,0.937294
1,Naive Bayes,0.976682,0.911565
0,Logistic Regression,0.970404,0.885813


In [5]:
best_model_name = results_df.iloc[0]["Model"]
print("✅ Best Model:", best_model_name)

✅ Best Model: SVM


## 🛠️ Hyperparameter Tuning on Best Model

In [6]:
# Re-select best model
best_model = models[best_model_name]

pipeline = Pipeline([
    ("tfidf", TfidfVectorizer(max_features=3000)),
    ("clf", best_model)
])

param_grid = {
    "clf__C": [0.1, 1.0, 10.0] if "Logistic" in best_model_name or "SVM" in best_model_name else [],
    "clf__alpha": [0.5, 1.0] if "Naive" in best_model_name else [],
    "clf__n_estimators": [100, 200] if "Random" in best_model_name else []
}

# Filter out empty params
param_grid = {k: v for k, v in param_grid.items() if v}

if param_grid:
    grid_search = GridSearchCV(pipeline, param_grid, scoring="f1", cv=5)
    grid_search.fit(X_train, y_train)
    final_model = grid_search.best_estimator_
    print("✅ Best parameters found and model trained.")
else:
    pipeline.fit(X_train, y_train)
    final_model = pipeline
    print("ℹ️ No hyperparameters to tune. Model trained with default settings.")

✅ Best parameters found and model trained.


## 💾 Save Final Pipeline

In [7]:
joblib.dump(final_model, "spam_classifier_pipeline.pkl")
print("📁 Model pipeline saved as spam_classifier_pipeline.pkl")

📁 Model pipeline saved as spam_classifier_pipeline.pkl


## ✅ Predict a New Sample Message

In [8]:
sample = ["Congratulations! You've won a free ticket. Click here to claim now!"]
prediction = final_model.predict(sample)
label = "Spam" if prediction[0] == 1 else "Ham"
print(f"🔮 Prediction: {label}")

🔮 Prediction: Spam
