In [None]:
# importing necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score, precision_recall_curve, auc
from imblearn.over_sampling import SMOTE
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
import joblib

# Load data: tab-separated file without a header row (label column, then message text)
df = pd.read_csv('SMSSpamCollection', sep='\t', names=['label', 'message'])

# Prepare features (X) and target (y)
X = df['message']
y = df['label']

# Encode labels as integers. LabelEncoder numbers the classes in sorted order,
# so with 'ham'/'spam' labels this yields 'ham' = 0, 'spam' = 1.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Split the RAW text into training and testing sets before any feature
# extraction. Fitting the vectorizer on the full corpus would let the test
# messages influence the vocabulary and IDF weights (data leakage) and make
# the reported metrics optimistic. The split parameters are unchanged, so the
# same rows land in the train and test partitions as before.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.3, random_state=42
)

# Convert text data to TF-IDF features: learn the vocabulary/IDF from the
# training messages only, then apply that fitted transform to the test messages.
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X_train = vectorizer.fit_transform(X_train_text)
X_test = vectorizer.transform(X_test_text)

# Full-corpus feature matrix, kept only so the name stays defined for any later
# code that refers to it. Do not use it for evaluation: it contains both the
# training and the test rows.
X_vectorized = vectorizer.transform(X)

# Apply SMOTE to resample the training data (balance the dataset).
# Only the training split is resampled; the test split keeps its original
# class distribution.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Dictionary of models to evaluate (display name -> unfitted estimator)
models = {
    'Naive Bayes': MultinomialNB(),
    'Random Forest': RandomForestClassifier(random_state=42, class_weight='balanced'),
    'Logistic Regression': LogisticRegression(class_weight='balanced', random_state=42),
    'XGBoost': xgb.XGBClassifier(scale_pos_weight=1, random_state=42)
}

# Function to evaluate models
def evaluate_model(model, X_train, y_train, X_test, y_test):
    """Fit ``model`` on the training data and print its test-set metrics.

    Prints, in order: the classification report, the ROC AUC, the area under
    the precision-recall curve, and an 80-character separator line. The
    scores used for both AUC figures are taken from column index 1 of
    ``model.predict_proba``.

    Args:
        model: Estimator exposing ``fit``, ``predict`` and ``predict_proba``.
        X_train: Training feature matrix passed to ``model.fit``.
        y_train: Training labels passed to ``model.fit``.
        X_test: Test feature matrix to predict on.
        y_test: True labels for ``X_test``.

    Returns:
        The same ``model`` object, now fitted.
    """
    model.fit(X_train, y_train)

    # Hard label predictions plus the score column for class index 1 (spam)
    predicted_labels = model.predict(X_test)
    positive_scores = model.predict_proba(X_test)[:, 1]

    estimator_name = model.__class__.__name__

    print(f"Classification Report for {estimator_name}:")
    print(classification_report(y_test, predicted_labels))

    # Area under the ROC curve
    print(f'ROC AUC for {estimator_name}: {roc_auc_score(y_test, positive_scores):.4f}')

    # Area under the precision-recall curve (recall on the x-axis)
    pr_precision, pr_recall, _thresholds = precision_recall_curve(y_test, positive_scores)
    print(f'Precision-Recall AUC for {estimator_name}: {auc(pr_recall, pr_precision):.4f}')

    separator = "-" * 80
    print(separator)
    return model

# Loop through all models, evaluate each one, and keep the best performer.
# `best_score` remembers the winning score so the current best model does not
# have to re-predict the test set on every iteration.
best_model = None
best_score = None
for model_name, model in models.items():
    trained_model = evaluate_model(model, X_train_resampled, y_train_resampled, X_test, y_test)

    # Selection metric: weighted-average F1 on the test set. It is read as a
    # float from the report dictionary instead of being sliced out of the
    # printed report text, so the comparison is numeric (not a string
    # comparison) and does not depend on the report's layout or rounding.
    report = classification_report(y_test, trained_model.predict(X_test), output_dict=True)
    score = report['weighted avg']['f1-score']

    # Explicit `is None` check: estimator truthiness is not a reliable
    # "no model yet" test (some estimators define __len__).
    # Strict `>` keeps the earlier model when scores tie.
    if best_model is None or score > best_score:
        best_model = trained_model
        best_score = score


print(best_model)
# Now save the best model, vectorizer, and label encoder
joblib.dump(best_model, 'spam_classifier_model.pkl')
joblib.dump(vectorizer, 'vectorizer.pkl')
joblib.dump(label_encoder, 'label_encoder.pkl')


Classification Report for MultinomialNB:
              precision    recall  f1-score   support

           0       1.00      0.97      0.98      1448
           1       0.85      0.97      0.91       224

    accuracy                           0.97      1672
   macro avg       0.92      0.97      0.95      1672
weighted avg       0.98      0.97      0.97      1672

ROC AUC for MultinomialNB: 0.9893
Precision-Recall AUC for MultinomialNB: 0.9755
--------------------------------------------------------------------------------
Classification Report for RandomForestClassifier:
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      1448
           1       1.00      0.92      0.95       224

    accuracy                           0.99      1672
   macro avg       0.99      0.96      0.97      1672
weighted avg       0.99      0.99      0.99      1672

ROC AUC for RandomForestClassifier: 0.9962
Precision-Recall AUC for RandomForestClassifier: 0

['label_encoder.pkl']