In [1]:
import kagglehub

# Download dataset from Kaggle
path = kagglehub.dataset_download("uciml/sms-spam-collection-dataset")

print("‚úÖ Dataset downloaded successfully!")
print("üìÅ Path:", path)




‚úÖ Dataset downloaded successfully!
üìÅ Path: /Users/info/.cache/kagglehub/datasets/uciml/sms-spam-collection-dataset/versions/1


In [2]:
import pandas as pd
import os

# Build path to spam.csv
csv_path = os.path.join(path, "spam.csv")

# Load dataset
df = pd.read_csv(csv_path, encoding="latin-1")

# Display info
print("‚úÖ Dataset loaded successfully!")
print("üìä Shape:", df.shape)
df.head()
# Drop unnecessary columns
df = df[['v1', 'v2']]

# Rename columns for clarity
df.columns = ['label', 'message']

# Display info after cleaning
print("‚úÖ Data cleaned successfully!")
print("üìä Shape after cleaning:", df.shape)
print(df.sample(5))


‚úÖ Dataset loaded successfully!
üìä Shape: (5572, 5)
‚úÖ Data cleaned successfully!
üìä Shape after cleaning: (5572, 2)
     label                                            message
5446   ham  I am back. Good journey! Let me know if you ne...
3820   ham  You are right though. I can't give you the spa...
255    ham  Don't necessarily expect it to be done before ...
3853   ham  oh ya... Got hip hop open. Haha i was thinking...
497    ham  Some of them told accenture is not confirm. Is...


In [3]:
# üìä Step 1: Data Exploration

# Check dataset balance (spam vs ham)
print("üîπ Label distribution:")
print(df['label'].value_counts())

# Check missing values
print("\nüîπ Missing values:")
print(df.isnull().sum())

# Add a new column with message length
df['length'] = df['message'].apply(len)

# Basic statistics
print("\nüîπ Message length statistics:")
print(df['length'].describe())

# Show examples
print("\nüîπ Sample messages:")
print(df.sample(5))
print("‚úÖ Kernel is working correctly!")

üîπ Label distribution:
label
ham     4825
spam     747
Name: count, dtype: int64

üîπ Missing values:
label      0
message    0
dtype: int64

üîπ Message length statistics:
count    5572.000000
mean       80.118808
std        59.690841
min         2.000000
25%        36.000000
50%        61.000000
75%       121.000000
max       910.000000
Name: length, dtype: float64

üîπ Sample messages:
     label                                            message  length
4204  spam  IMPORTANT INFORMATION 4 ORANGE USER 0796XXXXXX...     157
2500   ham               Remember to ask alex about his pizza      36
795    ham   it's really getting me down just hanging around.      48
4150   ham                      √å√è comin to fetch us oredi...      29
5267   ham      Anything lar then √å_ not going home 4 dinner?      45
‚úÖ Kernel is working correctly!


In [4]:
# üßπ Step 2: Text Preprocessing and Feature Extraction

import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

# Download stopwords (run once)
nltk.download('stopwords')

ps = PorterStemmer()
stop_words = set(stopwords.words('english'))

def clean_text(text):
    # Remove special characters and numbers
    text = re.sub('[^a-zA-Z]', ' ', text)
    # Lowercase + split
    text = text.lower().split()
    # Remove stopwords and apply stemming
    text = [ps.stem(word) for word in text if word not in stop_words]
    return ' '.join(text)

# Apply cleaning
df['clean_msg'] = df['message'].apply(clean_text)

# TF-IDF vectorization
vectorizer = TfidfVectorizer(max_features=3000)
X = vectorizer.fit_transform(df['clean_msg']).toarray()

# Encode labels
y = df['label'].map({'ham':0, 'spam':1}).values

print("‚úÖ Text preprocessing and TF-IDF completed!")
print("üìä Feature matrix shape:", X.shape)


[nltk_data] Downloading package stopwords to /Users/info/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


‚úÖ Text preprocessing and TF-IDF completed!
üìä Feature matrix shape: (5572, 3000)


In [5]:
# ü§ñ Step 3: Model Training and Evaluation

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model (Naive Bayes)
model = MultinomialNB()
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

# Evaluation
acc = accuracy_score(y_test, y_pred)
print("‚úÖ Model trained successfully!")
print("üéØ Accuracy:", round(acc * 100, 2), "%\n")

print("üìä Classification Report:")
print(classification_report(y_test, y_pred))

print("üß© Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


‚úÖ Model trained successfully!
üéØ Accuracy: 97.49 %

üìä Classification Report:
              precision    recall  f1-score   support

           0       0.97      1.00      0.99       965
           1       1.00      0.81      0.90       150

    accuracy                           0.97      1115
   macro avg       0.99      0.91      0.94      1115
weighted avg       0.98      0.97      0.97      1115

üß© Confusion Matrix:
[[965   0]
 [ 28 122]]


In [6]:
# üß† Step 4: Test with a new message

def predict_message(text):
    clean = clean_text(text)
    vector = vectorizer.transform([clean]).toarray()
    pred = model.predict(vector)[0]
    label = "üö´ Spam" if pred == 1 else "‚úÖ Ham"
    return label

# Test examples
print(predict_message("Congratulations! You won a $1000 Walmart gift card. Call now!"))
print(predict_message("Hey bro, are we meeting at 7pm tonight?"))


‚úÖ Ham
‚úÖ Ham


In [7]:
# üíæ Step 5: Save model and vectorizer
import pickle

with open("../src/spam_model.pkl", "wb") as f:
    pickle.dump(model, f)

with open("../src/tfidf_vectorizer.pkl", "wb") as f:
    pickle.dump(vectorizer, f)

print("‚úÖ Model and vectorizer saved successfully!")


‚úÖ Model and vectorizer saved successfully!


In [8]:
# (Re)define a slightly stronger cleaner: drop very short tokens (len<3)
def clean_text_v2(text):
    import re
    from nltk.stem.porter import PorterStemmer
    from nltk.corpus import stopwords

    ps = PorterStemmer()
    stop = set(stopwords.words('english'))

    text = re.sub('[^a-zA-Z]', ' ', str(text)).lower().split()
    text = [ps.stem(w) for w in text if w not in stop and len(w) >= 3]
    return ' '.join(text)

# Build v2 column (fast)
df['clean_v2'] = df['clean_msg'].where(df['clean_msg'].notna(), '').apply(clean_text_v2)
df['y'] = df['label'].map({'ham':0, 'spam':1}).astype(int)
print("‚úÖ clean_v2 ready.")


‚úÖ clean_v2 ready.


In [9]:
#Cell 6B ‚Äî Stronger TF-IDF (n-grams, sublinear TF)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Stronger features: unigrams+bigrams, ignore ultra-rare/common terms
tfidf_opt = TfidfVectorizer(
    max_features=4000,
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.90,
    sublinear_tf=True
)
X_opt = tfidf_opt.fit_transform(df['clean_v2'])
y = df['y'].values

Xtr, Xte, ytr, yte = train_test_split(X_opt, y, test_size=0.2, random_state=42, stratify=y)
Xtr.shape, Xte.shape


((4457, 4000), (1115, 4000))

In [10]:
#Cell 6C ‚Äî Try better models & compare
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report, confusion_matrix

models = {
    "NaiveBayes Œ±=0.5": MultinomialNB(alpha=0.5),
    "LogReg (liblinear)": LogisticRegression(max_iter=2000, solver="liblinear", class_weight=None),
    "LinearSVC": LinearSVC()
}

results = []
for name, clf in models.items():
    clf.fit(Xtr, ytr)
    yp = clf.predict(Xte)
    acc = accuracy_score(yte, yp)
    p, r, f1, _ = precision_recall_fscore_support(yte, yp, average='binary')
    results.append((name, acc, p, r, f1))
    print(f"\n=== {name} ===")
    print("Accuracy:", round(acc*100, 2), "%")
    print(classification_report(yte, yp, target_names=['ham','spam']))
    print("Confusion Matrix:\n", confusion_matrix(yte, yp))

# Show sorted summary
results = sorted(results, key=lambda x: x[-1], reverse=True)
print("\nüìä Summary (sorted by F1 for spam):")
for r in results:
    print(f"{r[0]:17s} | Acc {r[1]*100:5.2f}% | P {r[2]:.3f} | R {r[3]:.3f} | F1 {r[4]:.3f}")



=== NaiveBayes Œ±=0.5 ===
Accuracy: 97.58 %
              precision    recall  f1-score   support

         ham       0.97      1.00      0.99       966
        spam       0.98      0.83      0.90       149

    accuracy                           0.98      1115
   macro avg       0.98      0.92      0.94      1115
weighted avg       0.98      0.98      0.97      1115

Confusion Matrix:
 [[964   2]
 [ 25 124]]

=== LogReg (liblinear) ===
Accuracy: 96.5 %
              precision    recall  f1-score   support

         ham       0.96      1.00      0.98       966
        spam       0.97      0.76      0.85       149

    accuracy                           0.97      1115
   macro avg       0.97      0.88      0.92      1115
weighted avg       0.97      0.97      0.96      1115

Confusion Matrix:
 [[963   3]
 [ 36 113]]

=== LinearSVC ===
Accuracy: 98.39 %
              precision    recall  f1-score   support

         ham       0.98      1.00      0.99       966
        spam       0.99   

In [11]:
#Cell 6D ‚Äî Keep best model & save
import pickle

best_name = results[0][0]
best_model = models[best_name]
print("üèÜ Best model:", best_name)

# Save artifacts
with open("../src/spam_model_best.pkl", "wb") as f:
    pickle.dump(best_model, f)
with open("../src/tfidf_best.pkl", "wb") as f:
    pickle.dump(tfidf_opt, f)

print("‚úÖ Saved: ../src/spam_model_best.pkl + ../src/tfidf_best.pkl")


üèÜ Best model: LinearSVC
‚úÖ Saved: ../src/spam_model_best.pkl + ../src/tfidf_best.pkl


In [12]:
#Cell 6E ‚Äî Quick test using best artifacts
def predict_message_v2(text):
    clean = clean_text_v2(text)
    vec = tfidf_opt.transform([clean])
    pred = best_model.predict(vec)[0]
    return "üö´ Spam" if pred == 1 else "‚úÖ Ham"

print(predict_message_v2("WIN a brand new iPhone! Click the link to claim your prize now."))
print(predict_message_v2("Hey Mohamed, the lecture starts at 8:30 tomorrow."))


üö´ Spam
‚úÖ Ham
