In [8]:
import pandas as pd
import zipfile
import io
import requests

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# --------------------------------------------------
# Load SMS Spam Collection (FAIL-SAFE METHOD)
# --------------------------------------------------
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip"

response = requests.get(url)
z = zipfile.ZipFile(io.BytesIO(response.content))

# Read the correct file inside the ZIP
with z.open("SMSSpamCollection") as f:
    df = pd.read_csv(
        f,
        sep="\t",
        header=None,
        names=["label", "message"]
    )

# --------------------------------------------------
# Features and target
# --------------------------------------------------
X = df["message"].astype(str)
y = df["label"].map({"ham": 0, "spam": 1})

# --------------------------------------------------
# Train-test split
# --------------------------------------------------
X_tr, X_te, y_tr, y_te = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

# --------------------------------------------------
# Model: TF-IDF + Naive Bayes
# --------------------------------------------------
model = Pipeline(steps=[
    ("tfidf", TfidfVectorizer(stop_words="english")),
    ("clf", MultinomialNB())
])

model.fit(X_tr, y_tr)

# --------------------------------------------------
# Evaluation
# --------------------------------------------------
y_pred = model.predict(X_te)

print("=== Naive Bayes (Spam Classification) ===")
print("Accuracy:", accuracy_score(y_te, y_pred))
print("\nClassification Report:\n",
      classification_report(y_te, y_pred, target_names=["Non-Spam", "Spam"]))
print("Confusion Matrix:\n", confusion_matrix(y_te, y_pred))

# --------------------------------------------------
# Sample prediction
# --------------------------------------------------
sample_msg = "Congratulations! You have won a free prize. Call now!"
pred = model.predict([sample_msg])[0]

print("\nSample Message:", sample_msg)
print("Prediction:", "Spam" if pred == 1 else "Non-Spam")


=== Naive Bayes (Spam Classification) ===
Accuracy: 0.9705671213208902

Classification Report:
               precision    recall  f1-score   support

    Non-Spam       0.97      1.00      0.98      1206
        Spam       1.00      0.78      0.88       187

    accuracy                           0.97      1393
   macro avg       0.98      0.89      0.93      1393
weighted avg       0.97      0.97      0.97      1393

Confusion Matrix:
 [[1206    0]
 [  41  146]]

Sample Message: Congratulations! You have won a free prize. Call now!
Prediction: Spam


In [9]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

dt_model = Pipeline(steps=[
    ("tfidf", TfidfVectorizer(stop_words="english")),
    ("clf", DecisionTreeClassifier(
        criterion="entropy",
        max_depth=25,          # prevents overfitting
        min_samples_leaf=5,    # prevents tiny-leaf memorization
        random_state=42
    ))
])

dt_model.fit(X_tr, y_tr)

dt_pred = dt_model.predict(X_te)

print("=== Decision Tree (Entropy) — Spam Classification ===")
print("Accuracy:", accuracy_score(y_te, dt_pred))
print("\nClassification Report:\n", classification_report(
    y_te, dt_pred, target_names=["Non-Spam", "Spam"], zero_division=0
))
print("Confusion Matrix:\n", confusion_matrix(y_te, dt_pred))

sample_msg = "Congratulations! You have won a free prize. Call now!"
pred = dt_model.predict([sample_msg])[0]
print("\nSample Message:", sample_msg)
print("Prediction:", "Spam" if pred == 1 else "Non-Spam")

# Optional quick overfitting check (train vs test)
train_acc = accuracy_score(y_tr, dt_model.predict(X_tr))
test_acc  = accuracy_score(y_te, dt_pred)
print("\nTrain Accuracy:", train_acc)
print("Test  Accuracy:", test_acc)


=== Decision Tree (Entropy) — Spam Classification ===
Accuracy: 0.9425699928212491

Classification Report:
               precision    recall  f1-score   support

    Non-Spam       0.96      0.98      0.97      1206
        Spam       0.82      0.73      0.77       187

    accuracy                           0.94      1393
   macro avg       0.89      0.85      0.87      1393
weighted avg       0.94      0.94      0.94      1393

Confusion Matrix:
 [[1177   29]
 [  51  136]]

Sample Message: Congratulations! You have won a free prize. Call now!
Prediction: Non-Spam

Train Accuracy: 0.9734386216798278
Test  Accuracy: 0.9425699928212491


In [10]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

ann_model = Pipeline(steps=[
    ("tfidf", TfidfVectorizer(stop_words="english")),
    ("svd", TruncatedSVD(n_components=200, random_state=42)),  # reduce dims
    ("scaler", StandardScaler()),                               # important for MLP
    ("clf", MLPClassifier(
        hidden_layer_sizes=(64, 32),
        activation="relu",
        max_iter=30,
        random_state=42
    ))
])

ann_model.fit(X_tr, y_tr)

ann_pred = ann_model.predict(X_te)

print("=== ANN (MLP) — Spam Classification ===")
print("Accuracy:", accuracy_score(y_te, ann_pred))
print("\nClassification Report:\n", classification_report(
    y_te, ann_pred, target_names=["Non-Spam", "Spam"], zero_division=0
))
print("Confusion Matrix:\n", confusion_matrix(y_te, ann_pred))

sample_msg = "Congratulations! You have won a free prize. Call now!"
pred = ann_model.predict([sample_msg])[0]
print("\nSample Message:", sample_msg)
print("Prediction:", "Spam" if pred == 1 else "Non-Spam")

# Optional quick overfitting check (train vs test)
train_acc = accuracy_score(y_tr, ann_model.predict(X_tr))
test_acc  = accuracy_score(y_te, ann_pred)
print("\nTrain Accuracy:", train_acc)
print("Test  Accuracy:", test_acc)


=== ANN (MLP) — Spam Classification ===
Accuracy: 0.9806173725771715

Classification Report:
               precision    recall  f1-score   support

    Non-Spam       0.98      0.99      0.99      1206
        Spam       0.95      0.90      0.93       187

    accuracy                           0.98      1393
   macro avg       0.97      0.95      0.96      1393
weighted avg       0.98      0.98      0.98      1393

Confusion Matrix:
 [[1198    8]
 [  19  168]]

Sample Message: Congratulations! You have won a free prize. Call now!
Prediction: Spam

Train Accuracy: 0.9992821249102656
Test  Accuracy: 0.9806173725771715


