# 🤖 SMS Spam Classification - Final Pipeline Version

In [31]:
import json
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV, train_test_split

# Classifier imports
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score, f1_score

## 📥 Load and Prepare Data

In [3]:
url = "https://raw.githubusercontent.com/justmarkham/pycon-2016-tutorial/master/data/sms.tsv"
df = pd.read_table(url, header=0, names=["label", "message"])
# Preprocess the data
df['label'] = df['label'].map({'ham': 0, 'spam': 1})
X = df['message']
y = df['label']


## ✂️ Train-Test Split

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Load JSON ettings

In [6]:
with open("classification_param_settings.json", "r") as f:
    settings = json.load(f)

## Model Class Map

In [32]:
model_classes = {
    "LogisticRegression": LogisticRegression,
    "MultinomialNB": MultinomialNB,
    "RandomForestClassifier": RandomForestClassifier,
    "GradientBoostingClassifier": GradientBoostingClassifier,
    "AdaBoostClassifier": AdaBoostClassifier,
    "KNeighborsClassifier": KNeighborsClassifier,
    "DecisionTreeClassifier": DecisionTreeClassifier,
    "LinearSVC": LinearSVC
}

## Loop Through Models with GridSearchCV

In [34]:
results = []

vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)


for model_name, config in settings.items():
    
    print(f"\n🚀 Training {model_name}...")

    model_class_name = config["model"]
    model = model_classes[model_class_name]()  # No grid search, use default

    model.fit(X_train_vec, y_train)
    y_pred = model.predict(X_test_vec)

    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    results.append({
        "Model": model_name,
        "Accuracy": acc,
        "F1 Score": f1
    })


🚀 Training Logistic Regression...

🚀 Training MultinomialNB...

🚀 Training Random Forest...

🚀 Training Linear SVC...

🚀 Training Gradient Boosting...

🚀 Training AdaBoost...

🚀 Training KNN...

🚀 Training Decision Tree...


## Display Results

In [36]:
import pandas as pd
results_df = pd.DataFrame(results).sort_values(by="F1 Score", ascending=False)
print("\n📊 Model Performance Summary:")
print(results_df)


📊 Model Performance Summary:
                 Model  Accuracy  F1 Score
3           Linear SVC  0.988341  0.958199
2        Random Forest  0.977578  0.915254
6                  KNN  0.974888  0.904762
4    Gradient Boosting  0.971300  0.891892
0  Logistic Regression  0.965022  0.864111
7        Decision Tree  0.962332  0.862745
1        MultinomialNB  0.960538  0.840580
5             AdaBoost  0.948879  0.786517


## 💾 Save Final Pipeline

In [43]:
import joblib
# Save the best model
max_row = results_df.iloc[0]
best_model_name = max_row['Model']
best_model_class = model_classes[settings[best_model_name]['model']]
best_model = best_model_class()
best_model.fit(X_train_vec, y_train)

In [45]:
joblib.dump(best_model, "best_model.pkl")
joblib.dump(vectorizer, "tfidf_vectorizer.pkl")

['tfidf_vectorizer.pkl']

## ✅ Predict a New Sample Message

In [47]:
loaded_model = joblib.load("best_model.pkl")
loaded_vectorizer = joblib.load("tfidf_vectorizer.pkl")

# Predict on a new message
msg = ["You have WON a R5000 voucher. Call now!"]
msg_vec = loaded_vectorizer.transform(msg)
prediction = loaded_model.predict(msg_vec)

print("Spam" if prediction[0] == 1 else "Ham")

Spam
