# Assignment 2 — MLflow Experiment Tracking & Model Versioning

**Goals (this notebook):**
1. Load the versioned dataset (the currently checked-out `train.csv`/`validation.csv`/`test.csv` under `PROJECT_DIR`).
2. Train and log **three benchmark models** with MLflow, logging AUCPR as the primary selection metric.
3. Show how to query MLflow runs and print AUCPR for each trained model.
4. Demonstrate model registration (optional; only if an MLflow Model Registry is available).

In [1]:
# Optional: install mlflow in notebook environment (prefer terminal)
# !pip install mlflow

import os, joblib, json
import pandas as pd, numpy as np
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import average_precision_score, precision_recall_curve, classification_report
from sklearn.model_selection import train_test_split

import mlflow
import mlflow.sklearn

# Project settings (must match the data-preparation notebook).
# PROJECT_DIR is the directory that holds the three CSV splits.
PROJECT_DIR = '.'
TRAIN_CSV, VAL_CSV, TEST_CSV = (
    os.path.join(PROJECT_DIR, split_file)
    for split_file in ('train.csv', 'validation.csv', 'test.csv')
)

# Name of the MLflow experiment this notebook activates and later queries.
EXPERIMENT_NAME = "Assignment2_SMS_Spam"
mlflow.set_experiment(EXPERIMENT_NAME)

# Show which tracking backend is in use so readers know where runs are stored.
print("MLflow tracking URI:", mlflow.get_tracking_uri())

  from .autonotebook import tqdm as notebook_tqdm
2026/02/16 00:11:29 INFO alembic.runtime.plugins: setup plugin alembic.autogenerate.schemas
2026/02/16 00:11:29 INFO alembic.runtime.plugins: setup plugin alembic.autogenerate.tables
2026/02/16 00:11:29 INFO alembic.runtime.plugins: setup plugin alembic.autogenerate.types
2026/02/16 00:11:29 INFO alembic.runtime.plugins: setup plugin alembic.autogenerate.constraints
2026/02/16 00:11:29 INFO alembic.runtime.plugins: setup plugin alembic.autogenerate.defaults
2026/02/16 00:11:29 INFO alembic.runtime.plugins: setup plugin alembic.autogenerate.comments
2026/02/16 00:11:30 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2026/02/16 00:11:30 INFO mlflow.store.db.utils: Updating database tables
2026/02/16 00:11:30 INFO alembic.runtime.migration: Context impl SQLiteImpl.
2026/02/16 00:11:30 INFO alembic.runtime.migration: Will assume non-transactional DDL.
2026/02/16 00:11:30 INFO alembic.runtime.migration: Running upgrade

MLflow tracking URI: sqlite:///mlflow.db


In [2]:
# Read the train / validation / test splits from the configured CSV paths
# (files created/tracked by DVC; whatever version is checked out in PROJECT_DIR).
train_df, val_df, test_df = (
    pd.read_csv(split_path) for split_path in (TRAIN_CSV, VAL_CSV, TEST_CSV)
)

# Quick sanity check: split sizes and the class balance of the training labels.
print(
    "Shapes -> train, val, test:",
    train_df.shape,
    val_df.shape,
    test_df.shape,
)
print("Train label distribution:", train_df['label'].value_counts().to_dict())

Shapes -> train, val, test: (3901, 2) (836, 2) (837, 2)
Train label distribution: {'ham': 3378, 'spam': 523}


In [3]:
# Helper functions for scoring and AUCPR
from sklearn.preprocessing import LabelEncoder

def get_positive_scores(pipeline, X, positive_label='spam'):
    """Return one continuous score per sample for the positive class.

    The score source is chosen in order of preference:

    1. ``predict_proba``: the probability column of ``positive_label``
       (the last column when the class labels are unknown or do not
       contain ``positive_label``).
    2. ``decision_function``: the signed margin, oriented so that a larger
       value always means "more likely ``positive_label``".
    3. ``predict``: hard labels converted to 0.0 / 1.0.

    Only a *missing* method triggers the next fallback. Errors raised while
    a method is actually scoring propagate to the caller instead of being
    silently swallowed, so a genuine failure cannot quietly degrade the
    reported metric.

    Args:
        pipeline: Fitted estimator or sklearn-style ``Pipeline``.
        X: Samples to score; passed through to the estimator unchanged.
        positive_label: Class label treated as positive (default ``'spam'``).

    Returns:
        Array-like with one score per sample in ``X``.
    """
    # Locate the fitted class labels: on the object itself, or on the last
    # step of a pipeline-like object that exposes ``named_steps``.
    classes = getattr(pipeline, 'classes_', None)
    if classes is None and hasattr(pipeline, 'named_steps'):
        steps = list(pipeline.named_steps.values())
        if steps:
            classes = getattr(steps[-1], 'classes_', None)
    labels = list(classes) if classes is not None else None
    pos_idx = labels.index(positive_label) if labels and positive_label in labels else None

    predict_proba = getattr(pipeline, 'predict_proba', None)
    if callable(predict_proba):
        probs = np.asarray(predict_proba(X))
        # Without a known positive column, assume the positive class is last.
        return probs[:, pos_idx if pos_idx is not None else -1]

    decision_function = getattr(pipeline, 'decision_function', None)
    if callable(decision_function):
        margins = np.asarray(decision_function(X))
        if pos_idx is not None:
            # Binary margins are positive for classes_[1]; flip the sign when
            # the positive label is classes_[0] so higher still means positive.
            if margins.ndim == 1 and len(labels) == 2 and pos_idx == 0:
                return -margins
            # One column per class: pick the positive label's column.
            if margins.ndim == 2 and margins.shape[1] == len(labels):
                return margins[:, pos_idx]
        return margins

    # Last resort: hard label predictions mapped to 1.0 (positive) / 0.0.
    preds = np.asarray(pipeline.predict(X))
    return (preds == positive_label).astype(float)

def compute_aupr(y_true, y_scores, positive_label='spam'):
    """Compute average precision (area under the precision-recall curve).

    Args:
        y_true: Ground-truth labels; any array-like (pandas Series, numpy
            array, list, tuple) whose entries can be compared to
            ``positive_label``.
        y_scores: One score per sample, higher meaning more likely positive.
        positive_label: Label counted as the positive class
            (default ``'spam'``).

    Returns:
        The value returned by ``average_precision_score`` for the binarised
        labels and the given scores.
    """
    # np.asarray makes the comparison element-wise even for plain lists,
    # where ``list == 'spam'`` would otherwise collapse to a single bool.
    y_bin = (np.asarray(y_true) == positive_label).astype(int)
    return average_precision_score(y_bin, y_scores)

In [4]:
def train_and_log_mlflow(run_name, pipeline, X_train, y_train, X_val, y_val, params=None):
    """
    Fit ``pipeline`` on the training split, score it on the validation split,
    and record the result as a single MLflow run.

    Logged to MLflow: the supplied ``params``, the validation AUCPR
    (``aupr_val``), spam precision / recall / F1 when the classification
    report contains a ``'spam'`` entry, and the fitted pipeline as a sklearn
    model under the artifact path ``"model"``.

    Returns a ``(run_id, aupr_val)`` tuple.
    """
    logged_params = params if params else {}

    # Fit first, then derive both the ranking metric and the hard-label report.
    pipeline.fit(X_train, y_train)
    val_scores = get_positive_scores(pipeline, X_val)
    aupr_val = compute_aupr(y_val, val_scores)

    val_predictions = pipeline.predict(X_val)
    val_report = classification_report(y_val, val_predictions, output_dict=True)

    # (MLflow metric name, key inside the per-class report entry)
    spam_metric_keys = (
        ("f1_spam_val", 'f1-score'),
        ("precision_spam_val", 'precision'),
        ("recall_spam_val", 'recall'),
    )

    with mlflow.start_run(run_name=run_name) as active_run:
        mlflow.log_params(logged_params)
        mlflow.log_metric("aupr_val", float(aupr_val))
        # Spam-specific metrics are only logged when the report has that class.
        if 'spam' in val_report:
            spam_stats = val_report['spam']
            for metric_name, report_key in spam_metric_keys:
                mlflow.log_metric(metric_name, float(spam_stats[report_key]))
        # Persist the fitted pipeline as this run's model artifact.
        mlflow.sklearn.log_model(pipeline, artifact_path="model")
        run_id = active_run.info.run_id

    print(f"Logged run {run_id} ({run_name}) -> AUCPR (val) = {aupr_val:.4f}")
    return run_id, aupr_val

In [5]:
# Define 3 benchmarks (you can replace RandomForest with LinearSVC if desired).
# Each pipeline vectorises the raw message text and feeds it to one classifier.
pipelines = {
    "nb_count": Pipeline([('vect', CountVectorizer()), ('clf', MultinomialNB())]),
    "logreg_tfidf": Pipeline([('vect', TfidfVectorizer()), ('clf', LogisticRegression(max_iter=2000))]),
    "rf_tfidf": Pipeline([('vect', TfidfVectorizer()), ('clf', RandomForestClassifier(n_estimators=200, random_state=42))])
}

# Features are the message text; targets are the string labels.
X_train = train_df['message']
y_train = train_df['label']
X_val = val_df['message']
y_val = val_df['label']

# Train every benchmark and log each one as its own MLflow run.
runs = []
for name, pipe in pipelines.items():
    run_id, aupr = train_and_log_mlflow(name, pipe, X_train, y_train, X_val, y_val, params={"pipeline": name})
    runs.append({'name': name, 'run_id': run_id, 'aupr_val': aupr})

# Leaderboard for this execution, best validation AUCPR first.
# (pandas is already imported as `pd` in the notebook's import cell, so the
# redundant mid-cell re-import was removed.)
display(pd.DataFrame(runs).sort_values('aupr_val', ascending=False))

  flavor.save_model(path=local_path, mlflow_model=mlflow_model, **kwargs)


Logged run 1dc4244162f9435a9c0e88d2c5368ffa (nb_count) -> AUCPR (val) = 0.9733


  flavor.save_model(path=local_path, mlflow_model=mlflow_model, **kwargs)


Logged run dfa3e2ff48954ba080bf9e717be66def (logreg_tfidf) -> AUCPR (val) = 0.9856


  flavor.save_model(path=local_path, mlflow_model=mlflow_model, **kwargs)


Logged run 6999b065dbc540f589b615542ef53940 (rf_tfidf) -> AUCPR (val) = 0.9920


Unnamed: 0,name,run_id,aupr_val
2,rf_tfidf,6999b065dbc540f589b615542ef53940,0.992001
1,logreg_tfidf,dfa3e2ff48954ba080bf9e717be66def,0.985605
0,nb_count,1dc4244162f9435a9c0e88d2c5368ffa,0.973329


In [6]:
# Use MlflowClient to list runs for the experiment
from mlflow.tracking import MlflowClient
client = MlflowClient()
exp = client.get_experiment_by_name(EXPERIMENT_NAME)
if exp is None:
    # Fail with a clear message rather than an AttributeError on None below.
    raise RuntimeError(
        f"MLflow experiment {EXPERIMENT_NAME!r} was not found; "
        "run the setup cell that calls mlflow.set_experiment first."
    )
run_infos = client.search_runs(experiment_ids=[exp.experiment_id], filter_string="")

# One row per run: id, display name, and the primary selection metric.
rows = [
    {
        'run_id': r.info.run_id,
        'run_name': r.data.tags.get('mlflow.runName'),
        'aupr_val': r.data.metrics.get('aupr_val'),
    }
    for r in run_infos
]

# Explicit columns keep the frame well-formed (and sortable) even when the
# experiment has no runs yet; an empty frame is handled by the later cells.
df_runs = pd.DataFrame(rows, columns=['run_id', 'run_name', 'aupr_val'])
df_runs = df_runs.sort_values('aupr_val', ascending=False)
display(df_runs)

Unnamed: 0,run_id,run_name,aupr_val
0,6999b065dbc540f589b615542ef53940,rf_tfidf,0.992001
1,dfa3e2ff48954ba080bf9e717be66def,logreg_tfidf,0.985605
2,1dc4244162f9435a9c0e88d2c5368ffa,nb_count,0.973329


In [None]:
# Evaluate each run's logged model on test set. This downloads the model artifact and loads it.
# Loading and scoring are guarded separately so the printed message names the
# step that actually failed; a failing run is skipped, not fatal.

test_X = test_df['message']
test_y = test_df['label']

final_rows = []
for r in df_runs.itertuples():
    run_id = r.run_id
    # model URI: "runs:/<run_id>/model"
    model_uri = f"runs:/{run_id}/model"
    try:
        model = mlflow.sklearn.load_model(model_uri)
    except Exception as e:
        print(f"Could not load model for run {run_id}: {e}")
        continue
    try:
        scores_test = get_positive_scores(model, test_X)
        aupr_test = compute_aupr(test_y, scores_test)
    except Exception as e:
        print(f"Could not score model for run {run_id}: {e}")
        continue
    final_rows.append({'run_id': run_id, 'run_name': r.run_name, 'aupr_test': aupr_test})
    print(f"Run {run_id} ({r.run_name}) -> AUCPR on test: {aupr_test:.4f}")

# Explicit columns keep the sort valid even if no run could be evaluated.
display(
    pd.DataFrame(final_rows, columns=['run_id', 'run_name', 'aupr_test'])
    .sort_values('aupr_test', ascending=False)
)

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]


Downloading artifacts: 100%|██████████| 5/5 [00:00<00:00, 520.06it/s] 


Run 6999b065dbc540f589b615542ef53940 (rf_tfidf) -> AUCPR on test: 0.9673


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]
Downloading artifacts: 100%|██████████| 5/5 [00:00<00:00, 670.51it/s] 


Run dfa3e2ff48954ba080bf9e717be66def (logreg_tfidf) -> AUCPR on test: 0.9667


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]
Downloading artifacts: 100%|██████████| 5/5 [00:00<00:00, 865.13it/s] 


Run 1dc4244162f9435a9c0e88d2c5368ffa (nb_count) -> AUCPR on test: 0.9822


Unnamed: 0,run_id,run_name,aupr_test
2,1dc4244162f9435a9c0e88d2c5368ffa,nb_count,0.982216
0,6999b065dbc540f589b615542ef53940,rf_tfidf,0.967277
1,dfa3e2ff48954ba080bf9e717be66def,logreg_tfidf,0.966741


In [None]:
from mlflow.tracking import MlflowClient
client = MlflowClient()

# Register the model from the first row of df_runs. Both registry calls are
# best-effort: a failure is reported with print() and does not stop the notebook.
if not df_runs.empty:
    best_run_id = df_runs['run_id'].iloc[0]
    model_name = "Assignment2_SMSSpam"
    model_uri = f"runs:/{best_run_id}/model"

    # Step 1: make sure the registered-model entry exists.
    try:
        client.create_registered_model(model_name)
    except Exception as create_err:
        # model may already exist
        print("create_registered_model: skipped or failed:", create_err)

    # Step 2: attach this run's model artifact as a new version of that entry.
    try:
        mv = client.create_model_version(
            name=model_name,
            source=model_uri,
            run_id=best_run_id,
        )
        print("Registered model version:", mv.version)
    except Exception as version_err:
        print("Model registration failed (likely no registry configured). Exception:", version_err)

Registered model version: 1
