In [3]:
# Install MLflow if it's missing
!pip install mlflow -q

import pandas as pd
import numpy as np
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_curve, auc
import mlflow
import mlflow.sklearn

# Make Google Drive reachable from the Colab runtime
from google.colab import drive
drive.mount('/content/drive')

# Read the three dataset splits from the Drive folder
base_path = '/content/drive/My Drive/dvc_storage/'
train = pd.read_csv(f'{base_path}train.csv')
validation = pd.read_csv(f'{base_path}validation.csv')
test = pd.read_csv(f'{base_path}test.csv')

# Discard, in place, every row whose processed_text value is missing
for _frame in (train, validation, test):
    _frame.dropna(subset=['processed_text'], inplace=True)

print("NaNs dropped from datasets.")

# Separate the text feature column from the label column for each split
X_train = train['processed_text']
y_train = train['target']
X_val = validation['processed_text']
y_val = validation['target']
X_test = test['processed_text']
y_test = test['target']

# Define evaluation metrics
def accuracy(preds, y):
    """Return the fraction of predictions that equal the true labels.

    Both inputs are converted to NumPy arrays before comparing, so the
    comparison is element-by-element in positional order. This lets the
    function accept plain lists and pandas Series whose index labels differ
    (for example after rows were dropped), instead of failing or comparing
    by label.

    Args:
        preds: Predicted labels (array-like).
        y: True labels (array-like, same length as ``preds``).

    Returns:
        The mean of the element-wise equality between ``preds`` and ``y``.
    """
    preds = np.asarray(preds)
    y = np.asarray(y)
    return (preds == y).mean()

def precision(preds, y):
    """Return TP / (TP + FP) for binary labels encoded as 0 and 1.

    Inputs are converted to NumPy arrays so the boolean masks are combined
    positionally. Without this, two pandas Series with different index labels
    would be aligned by label by the ``&`` operator and yield wrong counts.

    Args:
        preds: Predicted labels (array-like of 0/1).
        y: True labels (array-like of 0/1, same length as ``preds``).

    Returns:
        The precision as a float, or ``0.0`` when nothing was predicted as 1.
    """
    preds = np.asarray(preds)
    y = np.asarray(y)
    predicted_pos = preds == 1
    TP = (predicted_pos & (y == 1)).sum()
    FP = (predicted_pos & (y == 0)).sum()
    # Guard against division by zero when there are no positive predictions.
    return TP / (TP + FP) if TP + FP > 0 else 0.0

def recall(preds, y):
    """Return TP / (TP + FN) for binary labels encoded as 0 and 1.

    Inputs are converted to NumPy arrays so the boolean masks are combined
    positionally. Without this, two pandas Series with different index labels
    would be aligned by label by the ``&`` operator and yield wrong counts.

    Args:
        preds: Predicted labels (array-like of 0/1).
        y: True labels (array-like of 0/1, same length as ``preds``).

    Returns:
        The recall as a float, or ``0.0`` when ``y`` contains no label 1.
    """
    preds = np.asarray(preds)
    y = np.asarray(y)
    actual_pos = y == 1
    TP = ((preds == 1) & actual_pos).sum()
    FN = ((preds == 0) & actual_pos).sum()
    # Guard against division by zero when there are no actual positives.
    return TP / (TP + FN) if TP + FN > 0 else 0.0

def f1_score(preds, y):
    """Return the harmonic mean of precision and recall.

    Delegates to the ``precision`` and ``recall`` helpers defined in this
    file; returns 0 when their sum is not positive.
    """
    prec_val = precision(preds, y)
    rec_val = recall(preds, y)
    total = prec_val + rec_val
    if total > 0:
        return 2 * prec_val * rec_val / total
    return 0

def AUCPR(probs, y):
    """Return the area under the precision-recall curve.

    The curve is obtained from ``precision_recall_curve(y, probs)`` and
    integrated with ``auc``, using recall as the x-axis and precision as the
    y-axis. The thresholds returned alongside the curve are not used.
    """
    curve_precision, curve_recall, _thresholds = precision_recall_curve(y, probs)
    area = auc(curve_recall, curve_precision)
    return area

# Select the MLflow experiment that the runs below are recorded under
mlflow.set_experiment("Spam_Classification_Models")

# Imports needed to assemble the model pipelines
from sklearn.pipeline import make_pipeline

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Candidate classifiers, keyed by the name used for their MLflow run
_classifiers = {
    "Naive_Bayes": MultinomialNB(),
    "Logistic_Regression": LogisticRegression(max_iter=1000),
    "Random_Forest": RandomForestClassifier(random_state=42, n_jobs=-1),
}

# Each classifier gets its own CountVectorizer in front of it
models = {
    run_label: make_pipeline(CountVectorizer(), classifier)
    for run_label, classifier in _classifiers.items()
}

# Run experiments: one MLflow run per candidate pipeline
for model_name, model_pipeline in models.items():
    with mlflow.start_run(run_name=model_name):
        # Fit on the training split, then predict on the validation split
        model_pipeline.fit(X_train, y_train)
        preds = model_pipeline.predict(X_val)
        # Column 1 of predict_proba is taken as the score for label 1
        # (assumes the fitted classes are ordered [0, 1] -- TODO confirm)
        probs = model_pipeline.predict_proba(X_val)[:, 1]

        # Compute every metric exactly once and reuse the values for both
        # logging and printing, instead of recomputing them each time.
        acc = accuracy(preds, y_val)
        prec = precision(preds, y_val)
        rec = recall(preds, y_val)
        f1 = f1_score(preds, y_val)
        aucpr = AUCPR(probs, y_val)

        # Log parameters, metrics, and model
        mlflow.log_param("model", model_name)
        mlflow.log_metric("Accuracy", acc)
        mlflow.log_metric("Precision", prec)
        mlflow.log_metric("Recall", rec)
        mlflow.log_metric("F1_Score", f1)
        mlflow.log_metric("AUCPR", aucpr)
        mlflow.sklearn.log_model(model_pipeline, model_name)

        # Print metrics
        print(f"Model: {model_name}")
        print(f"Accuracy: {acc:.4f}")
        print(f"Precision: {prec:.4f}")
        print(f"Recall: {rec:.4f}")
        print(f"F1 Score: {f1:.4f}")
        print(f"AUCPR: {aucpr:.4f}\n")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
NaNs dropped from datasets.




Model: Naive_Bayes
Accuracy: 0.9750
Precision: 0.9349
Recall: 0.8778
F1 Score: 0.9054
AUCPR: 0.9414





Model: Logistic_Regression
Accuracy: 0.9636
Precision: 0.9583
Recall: 0.7667
F1 Score: 0.8519
AUCPR: 0.9276





Model: Random_Forest
Accuracy: 0.9652
Precision: 1.0000
Recall: 0.7444
F1 Score: 0.8535
AUCPR: 0.9421

