- Name: Deepanshi
- Roll No.: MDS202416
- Assignment 2

1. Setup & Imports

In [None]:
!pip install python-dotenv

import os
import warnings
import numpy as np
import pandas as pd
import mlflow
import mlflow.sklearn
from mlflow.tracking import MlflowClient
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, 
    f1_score, average_precision_score, confusion_matrix, precision_recall_curve, auc
)
from dotenv import load_dotenv

# Set seeds for complete reproducibility
np.random.seed(42)

#Suppress warning 
warnings.filterwarnings("ignore")

Defaulting to user installation because normal site-packages is not writeable



[notice] A new release of pip is available: 25.3 -> 26.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


2. Configure Local MLflow Database & DVC Remote

In [36]:
# Setting up SQLite to store our experiment tracking locally
# Setting up SQLite to store our experiment tracking locally
DB_URI = "sqlite:///mlflow_track.db"
mlflow.set_tracking_uri(DB_URI)

# Set a unique experiment name
EXP_NAME = "Spam_Classification"
mlflow.set_experiment(EXP_NAME)

print(f"[INFO] MLflow Backend: {mlflow.get_tracking_uri()}")
print(f"[INFO] Active Experiment: {EXP_NAME}")

2026/02/15 21:39:33 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2026/02/15 21:39:33 INFO mlflow.store.db.utils: Updating database tables
2026/02/15 21:39:33 INFO alembic.runtime.migration: Context impl SQLiteImpl.
2026/02/15 21:39:33 INFO alembic.runtime.migration: Will assume non-transactional DDL.
2026/02/15 21:39:33 INFO alembic.runtime.migration: Running upgrade  -> 451aebb31d03, add metric step
2026/02/15 21:39:33 INFO alembic.runtime.migration: Running upgrade 451aebb31d03 -> 90e64c465722, migrate user column to tags
2026/02/15 21:39:33 INFO alembic.runtime.migration: Running upgrade 90e64c465722 -> 181f10493468, allow nulls for metric values
2026/02/15 21:39:33 INFO alembic.runtime.migration: Running upgrade 181f10493468 -> df50e92ffc5e, Add Experiment Tags Table
2026/02/15 21:39:33 INFO alembic.runtime.migration: Running upgrade df50e92ffc5e -> 7ac759974ad8, Update run tags with larger limit
2026/02/15 21:39:33 INFO alembic.runtime.migration: Running 

[INFO] MLflow Backend: sqlite:///mlflow_track.db
[INFO] Active Experiment: Spam_Classification


In [37]:
# Load the hidden credentials from the .env file
load_dotenv()
client_id = os.getenv("GDRIVE_CLIENT_ID", "").strip()
client_secret = os.getenv("GDRIVE_CLIENT_SECRET", "").strip()

if not client_id or not client_secret:
    print("[WARNING] Credentials not found! Check your .env file.")
else:
    # Configure DVC Google Drive Remote
    !dvc remote add -d myremote gdrive://1vZ462bCCmyDfS1Mx8OAqEEczq3V0v_8u
    
    # Force unset old ones to clear any cached broken credentials
    !dvc remote modify --local myremote --unset gdrive_client_id
    !dvc remote modify --local myremote --unset gdrive_client_secret
    
    # Apply the secure credentials
    !dvc remote modify --local myremote gdrive_client_id "{client_id}"
    !dvc remote modify --local myremote gdrive_client_secret "{client_secret}"
    
    print("[INFO] DVC Remote configured securely from .env file.")

Setting 'myremote' as a default remote.


ERROR: configuration error - config file error: remote 'myremote' already exists. Use `-f|--force` to overwrite it.


[INFO] DVC Remote configured securely from .env file.


3. Unified Tuning and Logging Function

In [38]:
def AUCPR(predictions, y_test):
    """Computes the Area Under the Precision-Recall Curve."""
    precision, recall, _ = precision_recall_curve(y_test, predictions)
    return auc(recall, precision)

In [39]:
def optimize_evaluate_and_log(pipeline, param_grid, X_train, y_train, X_val, y_val, X_test, y_test, model_name):
    """
    Runs GridSearchCV to find the best hyperparameters, evaluates the model
    using exact requested print formatting, and logs all data to MLflow.
    """
    print(f"\n[INFO] Running GridSearch Tuning for: {model_name}...")
    
    # 1. Hyperparameter Tuning
    search = GridSearchCV(pipeline, param_grid, cv=3, scoring='average_precision', n_jobs=-1)
    search.fit(X_train, y_train)
    optimized_model = search.best_estimator_
    
    print(f"[INFO] Best Parameters Found: {search.best_params_}\n")

    # 2. MLflow Tracking & Evaluation
    with mlflow.start_run(run_name=model_name):

        # Generate predictions
        y_pred_train = optimized_model.predict(X_train)
        y_pred_val = optimized_model.predict(X_val)
        y_pred_test = optimized_model.predict(X_test)
        
        # Calculate correct training and validation accuracies
        train_accuracy = accuracy_score(y_train, y_pred_train)
        val_accuracy = accuracy_score(y_val, y_pred_val)

        print(f"Training Accuracy: {train_accuracy}")
        print(f"Validation Accuracy: {val_accuracy}")

        # Log parameters
        mlflow.log_param("model_name", model_name)
        for param_key, param_val in search.best_params_.items():
            mlflow.log_param(param_key, param_val)

        # Calculate Test Metrics
        test_accuracy = accuracy_score(y_test, y_pred_test)
        precision = precision_score(y_test, y_pred_test, average='weighted')
        recall = recall_score(y_test, y_pred_test, average='weighted')
        f1 = f1_score(y_test, y_pred_test, average='weighted')
        aucpr_test = AUCPR(y_pred_test, y_test)
        
        # Log Test Metrics to MLflow
        mlflow.log_metric("accuracy", test_accuracy)
        mlflow.log_metric("precision", precision)
        mlflow.log_metric("recall", recall)
        mlflow.log_metric("f1_score", f1)
        mlflow.log_metric("AUCPR", aucpr_test)
        
        # Log Confusion Matrix as JSON
        matrix_json = np.array(confusion_matrix(y_test, y_pred_test)).tolist()
        mlflow.log_dict(matrix_json, "confusion_matrix.json")

        # --- DETAILED TERMINAL OUTPUT ---
        print(f"\nModel ({model_name}):")
        print(f"Accuracy: {test_accuracy}")
        print(f"AUCPR: {aucpr_test}")
        print(f"Confusion Matrix:\n {confusion_matrix(y_test, y_pred_test)} \n")
        
        # Re-calculating standard scores for the percentage printout block
        print_acc = accuracy_score(y_test, y_pred_test) * 100
        print_prec = precision_score(y_test, y_pred_test) * 100
        print_rec = recall_score(y_test, y_pred_test) * 100
        print_f1 = f1_score(y_test, y_pred_test) * 100
        print_aucpr = aucpr_test * 100

        print(f"{model_name}\n\n")
        print(f"On Test Dataset:")
        print(f"Accuracy : {print_acc:.2f}%")
        print(f"Precision : {print_prec:.2f}%")
        print(f"Recall : {print_rec:.2f}%")
        print(f"F1 Score : {print_f1:.2f}%")
        print(f"**** AUCPR : {print_aucpr:.2f}% ****")
        print("\n" + "-" * 50 + "\n")

        # Calculate and Log Validation AUCPR
        aucpr_val = AUCPR(y_pred_val, y_val)
        mlflow.log_metric("Validation AUCPR", aucpr_val)
        print(f"Validation AUCPR: {aucpr_val}")

        # Model Registration 
        input_example = pd.DataFrame(X_test[:1])
        mlflow.sklearn.log_model(optimized_model, "model", input_example=input_example)
        mlflow.register_model(f"runs:/{mlflow.active_run().info.run_id}/model", model_name)
        
    return optimized_model


4. Define Model Architectures

In [None]:
# Define Model Pipelines and Hyperparameter Grids
algorithms_to_test = {
    # --- Support Vector Machine ---
    "SVM_Model": (
        # Pipelines bundle the TF-IDF vectorizer directly with the classifier.
        # This prevents data leakage during cross-validation and ensures the vectorizer 
        # is saved alongside the model weights inside the MLflow artifact.
        Pipeline([("vectorizer", TfidfVectorizer(max_features=4000)), ("clf", SVC(probability=True, random_state=42))]),
        # The 'clf__' prefix tells GridSearchCV to only apply these parameters 
        # to the 'clf' (classifier) step of the pipeline, not the vectorizer.
        {'clf__C': [0.5, 2.0], 'clf__kernel': ['linear', 'rbf']}
    ),

    # --- Logistic Regression ---
    "LogReg_Model": (
        Pipeline([("vectorizer", TfidfVectorizer(max_features=4000)), ("clf", LogisticRegression(random_state=42))]),
        # Tuning regularization strength (C) and the optimization algorithm
        {'clf__C': [0.5, 2.0, 5.0], 'clf__solver': ['lbfgs', 'liblinear']}
    ),

    # --- Random Forest ---
    "RandomForest_Model": (
        Pipeline([("vectorizer", TfidfVectorizer(max_features=4000)), ("clf", RandomForestClassifier(random_state=42))]),
        # Tuning the number of trees (n_estimators) and the maximum depth to prevent overfitting
        {'clf__n_estimators': [50, 150], 'clf__max_depth': [None, 20]}
    )
}

5. Version 1 Data (Seed 21)

In [41]:
#1. Checkout the specific commit based on your diagnostic output
!git checkout 873224d 

# 2. Tell DVC to update the tracking files
!dvc checkout

# 3. Force DVC to pull the heavy CSV files from Google Drive
!dvc pull --force

M	.dvc/config


Note: switching to '873224d'.

You are in 'detached HEAD' state. You can look around, make experimental
changes and commit them, and you can discard any commits you make in this
state without impacting any branches by switching back to a branch.

If you want to create a new branch to retain commits you create, you may
do so (now or later) by using -c with the switch command. Example:

  git switch -c <new-branch-name>

Or undo this operation with:

  git switch -

Turn off this advice by setting config variable advice.detachedHead to false

HEAD is now at 873224d Version 1: Split with seed 21


M       test.csv
M       train.csv
M       validation.csv
Everything is up to date.


In [42]:
# Load Data
df_train = pd.read_csv("train.csv").dropna()
df_val = pd.read_csv("validation.csv").dropna()
df_test = pd.read_csv("test.csv").dropna()

X_train_v1, y_train_v1 = df_train["clean_text"], df_train["target"]
X_val_v1, y_val_v1 = df_val["clean_text"], df_val["target"]
X_test_v1, y_test_v1 = df_test["clean_text"], df_test["target"]

print(f"[SUCCESS] Version 1 Data Loaded! Training Size: {len(X_train_v1)}")

[SUCCESS] Version 1 Data Loaded! Training Size: 3613


In [43]:
print("="*50)
print("STARTING EXPERIMENTS: VERSION 1")
print("="*50)

# Loop through our dictionary to tune and track every model automatically
for name, (pipe, grid) in algorithms_to_test.items():
    optimize_evaluate_and_log(pipe, grid, X_train_v1, y_train_v1, X_val_v1, y_val_v1, X_test_v1, y_test_v1, name)

STARTING EXPERIMENTS: VERSION 1

[INFO] Running GridSearch Tuning for: SVM_Model...
[INFO] Best Parameters Found: {'clf__C': 2.0, 'clf__kernel': 'rbf'}





Training Accuracy: 0.9991696650982563
Validation Accuracy: 0.9754838709677419

Model (SVM_Model):
Accuracy: 0.9741935483870968
AUCPR: 0.909069289005925
Confusion Matrix:
 [[676   1]
 [ 19  79]] 

SVM_Model


On Test Dataset:
Accuracy : 97.42%
Precision : 98.75%
Recall : 80.61%
F1 Score : 88.76%
**** AUCPR : 90.91% ****

--------------------------------------------------

Validation AUCPR: 0.915319289005925


Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 2343.00it/s] 
Successfully registered model 'SVM_Model'.
Created version '1' of model 'SVM_Model'.



[INFO] Running GridSearch Tuning for: LogReg_Model...
[INFO] Best Parameters Found: {'clf__C': 5.0, 'clf__solver': 'liblinear'}





Training Accuracy: 0.9903127594796568
Validation Accuracy: 0.9716129032258064

Model (LogReg_Model):
Accuracy: 0.9754838709677419
AUCPR: 0.913603329025756
Confusion Matrix:
 [[676   1]
 [ 18  80]] 

LogReg_Model


On Test Dataset:
Accuracy : 97.55%
Precision : 98.77%
Recall : 81.63%
F1 Score : 89.39%
**** AUCPR : 91.36% ****

--------------------------------------------------

Validation AUCPR: 0.8983624094799211


Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 1497.89it/s] 
Successfully registered model 'LogReg_Model'.
Created version '1' of model 'LogReg_Model'.



[INFO] Running GridSearch Tuning for: RandomForest_Model...
[INFO] Best Parameters Found: {'clf__max_depth': None, 'clf__n_estimators': 150}





Training Accuracy: 0.9994464433988375
Validation Accuracy: 0.9690322580645161

Model (RandomForest_Model):
Accuracy: 0.9690322580645161
AUCPR: 0.8909128235334881
Confusion Matrix:
 [[676   1]
 [ 23  75]] 

RandomForest_Model


On Test Dataset:
Accuracy : 96.90%
Precision : 98.68%
Recall : 76.53%
F1 Score : 86.21%
**** AUCPR : 89.09% ****

--------------------------------------------------

Validation AUCPR: 0.8930348913759052


Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 1563.12it/s] 
Successfully registered model 'RandomForest_Model'.
Created version '1' of model 'RandomForest_Model'.


Observations:
1. Champion Model: Logistic Regression was the best performing model. It achieved the highest Test AUCPR (91.36%) and the highest F1 Score (89.39%), successfully catching more spam (80 True Positives) than the others.

2. Excellent Spam Prevention: All three models produced only 1 False Positive. In a real-world spam filter, this high precision (98.7%) is crucial because it ensures legitimate messages are not accidentally blocked.

3. Overfitting in Complex Models: Both Random Forest and SVM showed signs of overfitting. They achieved near-perfect accuracy on the training data (99.9%) but struggled more on the unseen test data. Random Forest was the most overfit, resulting in the lowest Test Recall (76.53%).

4. Tuning Success: The GridSearch successfully identified the optimal hyperparameters, favoring strong regularization (C=5.0) for the winning Logistic Regression model to help it generalize better to new data.

In [44]:
!git log --oneline

873224d Version 1: Split with seed 21
34ceb5c Initialize Git and DVC


In [45]:
!git log --all --oneline

1713e5e Version 2: Updated split with seed 77
873224d Version 1: Split with seed 21
34ceb5c Initialize Git and DVC


6. Checkout & Train on Data Version 2 (Seed 77)

In [46]:
# 1. Return to main branch for the updated dataset
!git checkout 1713e5e

# 2. Tell DVC to update the tracking files
!dvc checkout

# 3. Force DVC to pull the NEW heavy CSV files from Google Drive
!dvc pull --force

M	.dvc/config


Previous HEAD position was 873224d Version 1: Split with seed 21
HEAD is now at 1713e5e Version 2: Updated split with seed 77


M       test.csv
M       train.csv
M       validation.csv
Everything is up to date.


In [47]:
# Load New Data
df_train_v2 = pd.read_csv("train.csv").dropna()
df_val_v2 = pd.read_csv("validation.csv").dropna()
df_test_v2 = pd.read_csv("test.csv").dropna()

X_train_v2, y_train_v2 = df_train_v2["clean_text"], df_train_v2["target"]
X_val_v2, y_val_v2 = df_val_v2["clean_text"], df_val_v2["target"]
X_test_v2, y_test_v2 = df_test_v2["clean_text"], df_test_v2["target"]

print(f"[SUCCESS] Version 2 Data Loaded! Training Size: {len(X_train_v2)}")

# %%
print("="*50)
print("STARTING EXPERIMENTS: VERSION 2 (Data Shift)")
print("="*50)

# Re-run the loop on the new data
for name, (pipe, grid) in algorithms_to_test.items():
    optimize_evaluate_and_log(pipe, grid, X_train_v2, y_train_v2, X_val_v2, y_val_v2, X_test_v2, y_test_v2, name)

[SUCCESS] Version 2 Data Loaded! Training Size: 3613
STARTING EXPERIMENTS: VERSION 2 (Data Shift)

[INFO] Running GridSearch Tuning for: SVM_Model...
[INFO] Best Parameters Found: {'clf__C': 2.0, 'clf__kernel': 'rbf'}





Training Accuracy: 0.9994464433988375
Validation Accuracy: 0.983225806451613

Model (SVM_Model):
Accuracy: 0.9819354838709677
AUCPR: 0.9376036866359447
Confusion Matrix:
 [[677   0]
 [ 14  84]] 

SVM_Model


On Test Dataset:
Accuracy : 98.19%
Precision : 100.00%
Recall : 85.71%
F1 Score : 92.31%
**** AUCPR : 93.76% ****

--------------------------------------------------

Validation AUCPR: 0.9407703192511709


Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 2308.00it/s] 
Registered model 'SVM_Model' already exists. Creating a new version of this model...
Created version '2' of model 'SVM_Model'.



[INFO] Running GridSearch Tuning for: LogReg_Model...
[INFO] Best Parameters Found: {'clf__C': 5.0, 'clf__solver': 'liblinear'}





Training Accuracy: 0.9914198726819817
Validation Accuracy: 0.9806451612903225

Model (LogReg_Model):
Accuracy: 0.9806451612903225
AUCPR: 0.9331468071099408
Confusion Matrix:
 [[677   0]
 [ 15  83]] 

LogReg_Model


On Test Dataset:
Accuracy : 98.06%
Precision : 100.00%
Recall : 84.69%
F1 Score : 91.71%
**** AUCPR : 93.31% ****

--------------------------------------------------

Validation AUCPR: 0.9305663132883855


Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 1661.30it/s] 
Registered model 'LogReg_Model' already exists. Creating a new version of this model...
Created version '2' of model 'LogReg_Model'.



[INFO] Running GridSearch Tuning for: RandomForest_Model...
[INFO] Best Parameters Found: {'clf__max_depth': None, 'clf__n_estimators': 150}





Training Accuracy: 0.9994464433988375
Validation Accuracy: 0.9819354838709677

Model (RandomForest_Model):
Accuracy: 0.9716129032258064
AUCPR: 0.8999952735436607
Confusion Matrix:
 [[676   1]
 [ 21  77]] 

RandomForest_Model


On Test Dataset:
Accuracy : 97.16%
Precision : 98.72%
Recall : 78.57%
F1 Score : 87.50%
**** AUCPR : 90.00% ****

--------------------------------------------------

Validation AUCPR: 0.9376036866359447


Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 1177.70it/s] 
Registered model 'RandomForest_Model' already exists. Creating a new version of this model...
Created version '2' of model 'RandomForest_Model'.


Observations:
1. A New Champion Model: The data shift caused a change in leadership! The Support Vector Machine (SVM) outperformed Logistic Regression this time, achieving the highest Test AUCPR (93.76%), highest Recall (85.71%), and the best F1 Score (92.31%). It successfully caught 84 spam messages.

2. Perfect Precision (Zero False Positives): Both the SVM and Logistic Regression models achieved 0 False Positives on the test set, resulting in a perfect 100% Precision score. This is the ultimate goal for a spam filter, meaning absolutely no legitimate messages would be blocked for the user.

3. Robust Hyperparameters: Interestingly, GridSearchCV selected the exact same optimal hyperparameters for all three models as it did in Version 1. This proves that these specific parameter configurations (e.g., C=5.0 for LogReg, C=2.0 for SVM) are highly robust and not just flukes of the first data split.

4. Overall Performance Improvement: Despite using the same hyperparameters, all three models saw a noticeable jump in AUCPR and F1 scores compared to Version 1. This indicates that the Seed 77 data split might contain slightly more balanced or distinguishable text patterns in the training set, making the test set easier to predict.

5. Random Forest Still Overfitting: Random Forest remained the weakest model. It still allowed trees to grow infinitely deep (max_depth: None), causing it to memorize the training data (99.9% accuracy) but struggle on unseen data, resulting in the lowest AUCPR (90.00%) and the only False Positive of the group.

7. Find Champion Model

In [48]:
def find_champion_model(model_list):
    """Queries the MLflow registry to find the absolute best model across all versions."""
    tracker = MlflowClient()
    print("\nCHAMPION MODEL SEARCH (Metric: Test AUCPR)")
    print("-" * 50)
    
    top_overall_aucpr = -1
    champion_details = ""
    
    for m_name in model_list:
        try:
            versions = tracker.search_model_versions(f"name='{m_name}'")
            best_local_aucpr = -1
            
            for v in versions:
                # Fetch the metrics for this specific run
                run_data = tracker.get_run(v.run_id)
                current_aucpr = float(run_data.data.metrics.get("AUCPR", 0))
                
                if current_aucpr > best_local_aucpr:
                    best_local_aucpr = current_aucpr
                    
                # Check if it beats the global record
                if current_aucpr > top_overall_aucpr:
                    top_overall_aucpr = current_aucpr
                    champion_details = f"{m_name} (Version {v.version})"
                    
            print(f"  Best {m_name} Score: {best_local_aucpr:.4f}")
            
        except Exception as e:
            print(f"  [WARNING] Could not evaluate {m_name}")

    print("=" * 50)
    if top_overall_aucpr > 0:
        print(f"CHAMPION: {champion_details} | AUCPR: {top_overall_aucpr:.4f}")
    else:
        print("No models found in the registry.")

# Run the search
find_champion_model(["SVM_Model", "LogReg_Model", "RandomForest_Model"])


CHAMPION MODEL SEARCH (Metric: Test AUCPR)
--------------------------------------------------
  Best SVM_Model Score: 0.9376
  Best LogReg_Model Score: 0.9331
  Best RandomForest_Model Score: 0.9000
CHAMPION: SVM_Model (Version 2) | AUCPR: 0.9376


Summary
1. Pipeline Setup: Built a fully reproducible MLOps pipeline using DVC to manage different data splits (Seed 21 vs. Seed 77) and MLflow to track hyperparameters, metrics, and model versions.

2. Methodology: Trained three algorithms (SVM, Logistic Regression, Random Forest) using TF-IDF pipelines. Tuned them with GridSearchCV, specifically optimizing for AUCPR to handle the imbalanced spam data.

3. Data Shift Impact: Changing the data seed changed the winner. Logistic Regression won on Version 1, but overall performance improved on Version 2, where SVM took the lead. Interestingly, the optimal hyperparameters remained exactly the same across both data versions, proving their robustness.

4. The Champion Model: The overall winner from the MLflow registry was the Support Vector Machine (Version 2) with C=2.0 and kernel='rbf'. It achieved the highest Test AUCPR (93.76%) and a perfect 100% Precision (0 False Positives)—meaning it caught 84 spam texts without blocking a single legitimate message.