Imports and Data Loading

In [10]:
import pandas as pd
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [11]:
# --- 1. Read the train / validation / test splits written by prepare.ipynb ---
# The three CSVs are read in a fixed order and unpacked straight into their frames.
train_df, val_df, test_df = [
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'validation.csv', 'test.csv')
]

In [12]:
# --- 2. Handle missing values ---
# A message that preprocessing reduced to nothing (e.g. it was just "!!!") is
# written to CSV as an empty field, which pandas may load back as NaN.
# Normalise those back to empty strings in every split.
for _split_df in (train_df, val_df, test_df):
    _split_df['clean_text'] = _split_df['clean_text'].fillna('')

In [13]:
# --- 3. Separate each split into its text feature (X) and label target (y) ---
X_train, y_train = train_df['clean_text'], train_df['label']
X_val, y_val = val_df['clean_text'], val_df['label']
X_test, y_test = test_df['clean_text'], test_df['label']

# Quick sanity check on the split sizes.
print(f"Loaded Train: {X_train.shape}, Val: {X_val.shape}, Test: {X_test.shape}")

Loaded Train: (3613,), Val: (775,), Test: (775,)


Helper Functions

In [14]:
def fit_model(model, X_train, y_train):
    """Train `model` in place on the given data and hand it back.

    Args:
        model: Any object exposing ``fit(X, y)`` (here, a pipeline).
        X_train: Training inputs passed straight to ``model.fit``.
        y_train: Training targets passed straight to ``model.fit``.

    Returns:
        The same `model` object that was passed in, after ``fit`` has run.
    """
    # The return value of fit() is deliberately ignored; the caller always
    # gets back the object it supplied.
    model.fit(X_train, y_train)
    return model

def score_model(model, X, y):
    """Score `model` on (X, y) using the model's own ``score`` method.

    Args:
        model: Any fitted object exposing ``score(X, y)``.
        X: Inputs to score on.
        y: Ground-truth targets for `X`.

    Returns:
        Whatever ``model.score(X, y)`` returns (used in this notebook as an
        accuracy figure).
    """
    score = model.score(X, y)
    return score

def evaluate_model(model, X, y, dataset_name="Data"):
    """Print a classification report for `model` on (X, y) and return summary metrics.

    Args:
        model: Fitted object exposing ``predict(X)``.
        X: Inputs to predict on.
        y: Ground-truth labels for `X`.
        dataset_name: Label shown in the printed report header.

    Returns:
        dict with keys 'accuracy', 'precision', 'recall' and 'f1', each
        computed from `y` and the model's predictions with the corresponding
        sklearn.metrics function at its default settings.
    """
    predictions = model.predict(X)

    print(f"--- Evaluation on {dataset_name} ---")
    print(classification_report(y, predictions))

    # Metric name -> scoring function, in the order the keys should appear
    # in the returned dictionary.
    scorers = (
        ('accuracy', accuracy_score),
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1', f1_score),
    )
    return {metric: scorer(y, predictions) for metric, scorer in scorers}

Baseline Model Training

In [15]:
# Each baseline is a Pipeline so TF-IDF vectorization is fitted and applied
# automatically together with the classifier.
def _tfidf_pipeline(classifier):
    """Return a Pipeline: 5000-feature TF-IDF step ('tfidf') followed by `classifier` ('clf')."""
    return Pipeline([
        ('tfidf', TfidfVectorizer(max_features=5000)),
        ('clf', classifier),
    ])


models = {
    "Naive Bayes": _tfidf_pipeline(MultinomialNB()),
    "Logistic Regression": _tfidf_pipeline(LogisticRegression(random_state=42)),
    "Random Forest": _tfidf_pipeline(
        RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
    ),
}



In [16]:
# Validation metrics for every baseline, keyed by model name
val_results = {}

print("Training and validating base models...\n")

for name in models:
    model = models[name]
    print(f"Training {name}...")

    # Fit on the training split (fit_model returns the same pipeline object).
    model = fit_model(model, X_train, y_train)

    # Training accuracy is only a sanity check that the pipeline learned something.
    train_acc = score_model(model, X_train, y_train)
    print(f"  Train Accuracy: {train_acc:.4f}")

    # Full report on the validation split; keep the summary metrics for comparison.
    metrics = evaluate_model(
        model,
        X_val,
        y_val,
        dataset_name=f"{name} (Validation)",
    )
    val_results[name] = metrics
    print("-" * 60)

Training and validating base models...

Training Naive Bayes...
  Train Accuracy: 0.9729
--- Evaluation on Naive Bayes (Validation) ---
              precision    recall  f1-score   support

           0       0.96      1.00      0.98       677
           1       1.00      0.70      0.83        98

    accuracy                           0.96       775
   macro avg       0.98      0.85      0.90       775
weighted avg       0.96      0.96      0.96       775

------------------------------------------------------------
Training Logistic Regression...
  Train Accuracy: 0.9632
--- Evaluation on Logistic Regression (Validation) ---
              precision    recall  f1-score   support

           0       0.96      1.00      0.98       677
           1       0.97      0.68      0.80        98

    accuracy                           0.96       775
   macro avg       0.96      0.84      0.89       775
weighted avg       0.96      0.96      0.95       775

-------------------------------------

Hyperparameter Tuning (Fine-Tuning)

In [17]:
print("--- Fine-Tuning All Models ---\n")

# Search space per model. Keys use the `<step>__<param>` form so they address
# the 'tfidf' and 'clf' steps of each pipeline.
param_grids = {
    "Naive Bayes": dict(
        tfidf__ngram_range=[(1, 1), (1, 2)],  # unigrams only vs. unigrams + bigrams
        clf__alpha=[0.1, 0.5, 1.0],           # smoothing (lower = less smoothing)
    ),
    "Logistic Regression": dict(
        tfidf__ngram_range=[(1, 1), (1, 2)],
        clf__C=[0.1, 1, 10, 100],             # inverse regularization (higher = less regularization)
    ),
    "Random Forest": dict(
        tfidf__ngram_range=[(1, 1)],          # unigrams only, to keep the RF search cheap
        clf__n_estimators=[50, 100],          # number of trees
        clf__max_depth=[None, 10, 20],        # tree depth limit
        clf__min_samples_split=[2, 5],
    ),
}

# Best estimator found for each model, keyed by model name
tuned_models = {}

for name, pipeline in models.items():
    print(f"Tuning {name}...")

    grid = param_grids[name]

    # 3-fold grid search scored on F1, since the classes are imbalanced.
    search = GridSearchCV(
        estimator=pipeline,
        param_grid=grid,
        scoring='f1',
        cv=3,
        n_jobs=-1,
        verbose=1,
    )
    search.fit(X_train, y_train)

    # Keep the winning configuration for this model.
    tuned_models[name] = search.best_estimator_

    print(f"  Best Params: {search.best_params_}")
    print(f"  Best CV F1: {search.best_score_:.4f}")
    print("-" * 40)

--- Fine-Tuning All Models ---

Tuning Naive Bayes...
Fitting 3 folds for each of 6 candidates, totalling 18 fits
  Best Params: {'clf__alpha': 0.1, 'tfidf__ngram_range': (1, 2)}
  Best CV F1: 0.9015
----------------------------------------
Tuning Logistic Regression...
Fitting 3 folds for each of 8 candidates, totalling 24 fits
  Best Params: {'clf__C': 100, 'tfidf__ngram_range': (1, 2)}
  Best CV F1: 0.9030
----------------------------------------
Tuning Random Forest...
Fitting 3 folds for each of 12 candidates, totalling 36 fits
  Best Params: {'clf__max_depth': None, 'clf__min_samples_split': 5, 'clf__n_estimators': 100, 'tfidf__ngram_range': (1, 1)}
  Best CV F1: 0.8538
----------------------------------------


Final Evaluation and Model Selection 

Test the best models from each category on unseen data and select a winner.

In [19]:
print("\n--- Final Evaluation on Test Set (Tuned Models) ---\n")

if not tuned_models:
    raise RuntimeError("No tuned models available; run the hyperparameter tuning cell first.")

# --- 1. Model selection on the validation split ---
# Choosing the winner by test-set F1 would let the test set influence the
# decision, so the reported test score would be optimistically biased.
# The validation split is not used for fitting or grid search in this
# notebook, so it is used here to choose between the tuned models.
val_f1_scores = {
    name: f1_score(y_val, model.predict(X_val))
    for name, model in tuned_models.items()
}
# max() always returns a real key (first one wins on ties), so the lookup
# below cannot fail even if every F1 score is 0.
best_model_name = max(val_f1_scores, key=val_f1_scores.get)

# --- 2. Reporting on the untouched test split ---
results_list = []

for name, model in tuned_models.items():
    # Evaluate the tuned model on the Test set
    metrics = evaluate_model(model, X_test, y_test, dataset_name=name)
    metrics['Model'] = name
    metrics['val_f1'] = val_f1_scores[name]  # the score used for selection
    results_list.append(metrics)

# Display comparative results
results_df = pd.DataFrame(results_list).set_index('Model')
print("\nSummary of Tuned Model Performance:")
print(results_df)

# Test-set F1 of the model chosen on validation.
best_score = results_df.loc[best_model_name, 'f1']

print(f"\nThe Best Overall Model is: {best_model_name}")
print(f"   Selected on Validation F1: {val_f1_scores[best_model_name]:.4f}")
print(f"   With an F1-Score of: {best_score:.4f}")

# Save the best model using joblib
MODEL_PATH = 'best_spam_classifier.pkl'
joblib.dump(tuned_models[best_model_name], MODEL_PATH)
print(f"   Model saved to '{MODEL_PATH}'")


--- Final Evaluation on Test Set (Tuned Models) ---

--- Evaluation on Naive Bayes ---
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       677
           1       0.99      0.87      0.92        98

    accuracy                           0.98       775
   macro avg       0.98      0.93      0.96       775
weighted avg       0.98      0.98      0.98       775

--- Evaluation on Logistic Regression ---
              precision    recall  f1-score   support

           0       0.98      0.99      0.99       677
           1       0.96      0.88      0.91        98

    accuracy                           0.98       775
   macro avg       0.97      0.94      0.95       775
weighted avg       0.98      0.98      0.98       775

--- Evaluation on Random Forest ---
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       677
           1       1.00      0.83      0.91        98

    accuracy    

Summary & Results

In this project, we built a pipeline to classify SMS messages as Spam or Ham (not spam). We explored three algorithms: Multinomial Naive Bayes, Logistic Regression, and Random Forest.

1. Preprocessing Strategy
- Cleaning: We implemented aggressive text cleaning (lowercasing, regex removal of special characters, stopword removal, and Porter Stemming).

- Vectorization: We utilized TfidfVectorizer to convert text into numerical features, testing both unigrams (1,1) and bigrams (1,2).

- Splitting: The data was split with stratification into Train (70%), Validation (15%), and Test (15%) sets.

2. Model Performance (Test Set)

After hyperparameter tuning with GridSearchCV (optimizing for F1-score), the best overall model was Naive Bayes, with an F1-score of 0.9239.

3. Conclusion
- The Winner: Multinomial Naive Bayes achieved the highest overall F1-Score (0.924). It provided the best balance between precision and recall.

- Precision vs. Recall: Random Forest achieved perfect precision (100%), meaning it never classified a legitimate message as spam (0 false positives). However, it had the lowest recall, missing about 17% of actual spam.

- Naive Bayes missed fewer spam messages than Random Forest (recall of 0.87 vs. 0.83) while maintaining extremely high precision (0.99).

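- Tuning Impact: Tuning was highly effective. For example, Naive Bayes improved significantly when moving to alpha=0.1 (less smoothing) with unigram + bigram features, allowing it to trust the data more: its spam recall was 0.70 for the baseline (validation set) and 0.87 after tuning (test set). Because these two figures come from different splits, the comparison is indicative rather than exact.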