### Assignment 2 - Model Version Control (MDS202414)

In [2]:
# !pip install mlflow

In [3]:
import mlflow

### Loading splits

In [4]:
#function to load the data, drop na and convert spam label to 1 and ham label to 0

import pandas as pd

def load_splits(train_path, val_path, test_path, label_col="label", pos_label="spam"):
    """Read the train/validation/test CSVs and split each into features and a 0/1 target.

    Rows containing a missing value in any column are discarded. The target is
    1 where ``label_col`` equals ``pos_label`` and 0 for every other value; the
    features are all remaining columns.

    Args:
        train_path, val_path, test_path: Paths of the three CSV files.
        label_col: Name of the column holding the class label.
        pos_label: Label value that is encoded as 1.

    Returns:
        The tuple ``(X_train, y_train, X_val, y_val, X_test, y_test)``.
    """
    # Read all three files before any cleaning, so an unreadable path fails first.
    raw_frames = [pd.read_csv(path) for path in (train_path, val_path, test_path)]
    clean_frames = [frame.dropna() for frame in raw_frames]

    outputs = []
    for frame in clean_frames:
        target = frame[label_col].eq(pos_label).astype(int)
        features = frame.drop(columns=[label_col])
        outputs += [features, target]

    return tuple(outputs)


In [5]:
X_train, y_train, X_val, y_val, X_test, y_test = load_splits("train_set.csv","validation_set.csv","test_set.csv")

In [11]:
# Scale the features with StandardScaler. with_mean=False disables centering,
# so features are only rescaled; this does not convert counts to proportions.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler(with_mean=False)
# Fit on the training split only, then reuse the fitted scaler for val/test.
# NOTE: the split variables are overwritten with the scaled data, so the
# unscaled frames are no longer available after this cell runs.
X_train = scaler.fit_transform(X_train)
X_val   = scaler.transform(X_val)
X_test  = scaler.transform(X_test)


### ML pipeline

In [6]:
#function to train and get score for model

from sklearn.metrics import accuracy_score, roc_auc_score, average_precision_score

def train_and_eval(model, X_train, y_train, X_val, y_val):
    """Fit ``model`` on the training split and score it on the validation split.

    Accuracy is always reported. ROC-AUC and PR-AUC (average precision) are
    added only when the model exposes continuous scores, taken from
    ``predict_proba`` (column 1) if available, otherwise ``decision_function``.

    Args:
        model: Estimator with ``fit`` and ``predict``; fitted in place.
        X_train, y_train: Training features and targets.
        X_val, y_val: Validation features and targets.

    Returns:
        Dict with key ``"accuracy"`` and, when scores are available,
        ``"roc_auc"`` and ``"pr_auc"``.
    """
    model.fit(X_train, y_train)
    val_pred = model.predict(X_val)

    # Pick the continuous score source; stays None if the model offers neither.
    val_score = None
    if hasattr(model, "predict_proba"):
        val_score = model.predict_proba(X_val)[:, 1]
    elif hasattr(model, "decision_function"):
        val_score = model.decision_function(X_val)

    results = {"accuracy": accuracy_score(y_val, val_pred)}
    if val_score is None:
        return results

    results["roc_auc"] = roc_auc_score(y_val, val_score)
    results["pr_auc"] = average_precision_score(y_val, val_score)
    return results


In [7]:
#function for hyperparameter tuning using the score function used in above function

from itertools import product
import numpy as np

def tune_model(model_class, param_grid, X_train, y_train, X_val, y_val, metric="pr_auc"):
    """Grid-search ``model_class`` over ``param_grid`` using the validation split.

    Every combination of the grid's values is instantiated, fitted and scored
    through ``train_and_eval``; the combination with the highest ``metric``
    is kept (the first one seen wins ties).

    Args:
        model_class: Estimator class, called as ``model_class(**params)``.
        param_grid: Mapping of parameter name -> iterable of candidate values.
            An empty mapping evaluates a single model built with no arguments.
        X_train, y_train: Training features and targets.
        X_val, y_val: Validation features and targets used for selection.
        metric: Key of the ``train_and_eval`` result to maximise.

    Returns:
        ``(best_model, best_params, best_score)``. If no combination is
        evaluated (some parameter has no candidate values) this is
        ``(None, None, -inf)``.

    Raises:
        KeyError: If ``metric`` is not among the metrics computed for a model.
    """
    best_score = -np.inf
    best_params = None
    best_model = None

    keys = list(param_grid)
    # product() over zero iterables yields one empty tuple, so an empty grid
    # evaluates exactly one default-constructed model instead of failing on
    # tuple unpacking as zip(*param_grid.items()) would.
    for combo in product(*(param_grid[key] for key in keys)):
        params = dict(zip(keys, combo))
        model = model_class(**params)

        metrics = train_and_eval(model, X_train, y_train, X_val, y_val)
        if metric not in metrics:
            raise KeyError(
                f"metric {metric!r} was not computed for {model_class.__name__}; "
                f"available metrics: {sorted(metrics)}"
            )
        score = metrics[metric]

        print(f"{model_class.__name__} params={params} metrics={metrics}")

        # Strict '>' keeps the earliest combination when scores tie.
        if score > best_score:
            best_score = score
            best_params = params
            best_model = model

    return best_model, best_params, best_score


In [10]:
#setting up the hyperparameter space for the 3 models

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

# Search space per model: display name -> (estimator class, parameter grid).
# Each grid maps a constructor argument to the list of values to try; a
# single-element list holds that argument fixed across the search.
models = {
    # Regularisation strength C is searched; solver and max_iter are fixed.
    "LogReg": (LogisticRegression, {
        "C": [0.01, 0.1, 1, 10],
        "solver": ["liblinear"],
        "max_iter": [1000]
    }),

    # Only the smoothing parameter alpha is searched.
    "NaiveBayes": (MultinomialNB, {
        "alpha": [0.1, 0.5, 1, 2]
    }),

    # C is searched; max_iter is fixed.
    "LinearSVM": (LinearSVC, {
        "C": [0.01, 0.1, 1, 10],
        "max_iter": [5000]
    })
}


### Function for logging experiment along with evaluation of model on test data

In [12]:
def run_model_and_versioning(model_name, model_class, best_params,
                             X_train, y_train, X_val, y_val,
                             X_test, y_test):
    """Refit a tuned model on train+validation, score it on test and log the run to MLflow.

    A new MLflow run named ``model_name`` is opened. Inside it the model is
    built from ``best_params``, fitted on the stacked training and validation
    data, evaluated on the test split, and its parameters, metrics and fitted
    model (artifact path ``"model"``) are logged.

    Args:
        model_name: Name given to the MLflow run.
        model_class: Estimator class, called as ``model_class(**best_params)``.
        best_params: Dict of constructor arguments; also logged as run params.
        X_train, y_train: Training features and targets.
        X_val, y_val: Validation features and targets, appended to the
            training data for the final fit.
        X_test, y_test: Test features and targets used for the logged metrics.

    Returns:
        ``(run_id, metrics)`` where ``metrics`` always has ``"accuracy"`` and,
        when the model exposes ``predict_proba`` or ``decision_function``,
        also ``"roc_auc"`` and ``"pr_auc"``.
    """
    with mlflow.start_run(run_name=model_name):

        # Combine train + val
        X_train_full = np.vstack([X_train, X_val])
        y_train_full = np.concatenate([y_train, y_val])

        model = model_class(**best_params)
        model.fit(X_train_full, y_train_full)

        y_pred = model.predict(X_test)

        # Continuous scores: class-1 probability if available, otherwise the
        # decision function; None when the model offers neither.
        if hasattr(model, "predict_proba"):
            y_score = model.predict_proba(X_test)[:, 1]
        elif hasattr(model, "decision_function"):
            y_score = model.decision_function(X_test)
        else:
            y_score = None

        test_metrics = {"accuracy": accuracy_score(y_test, y_pred)}
        # Score-based metrics are skipped when no scores exist, matching the
        # validation-time behaviour of train_and_eval, rather than passing
        # None to the metric functions.
        if y_score is not None:
            test_metrics["roc_auc"] = roc_auc_score(y_test, y_score)
            test_metrics["pr_auc"] = average_precision_score(y_test, y_score)

        # Log params
        mlflow.log_params(best_params)

        # Log metrics
        for metric_name, metric_value in test_metrics.items():
            mlflow.log_metric(metric_name, metric_value)

        # Log model
        mlflow.sklearn.log_model(model, "model")

        # Read the run ID while the run is still active.
        run_id = mlflow.active_run().info.run_id

    return run_id, test_metrics


### Executing the runs for each model

In [44]:
# All runs started below are recorded under this MLflow experiment.
mlflow.set_experiment("MLflow Assignment2")

# Per-model summary: tuned parameters and test-set metrics, keyed by model name.
results={}

for name, (model_class, param_grid) in models.items():

    # Select hyperparameters on the validation split.
    # best_model and best_score are not used further in this loop.
    best_model, best_params, best_score =  tune_model(
        model_class, param_grid,
        X_train, y_train, X_val, y_val,
        metric="pr_auc"   # spam -> PR-AUC best
    )

    # Refit with the selected parameters, evaluate on test and log the run.
    run_id, test_metrics = run_model_and_versioning(
        name, model_class, best_params,
        X_train, y_train, X_val, y_val,
        X_test, y_test
    )

    # Register model
    model_uri = f"runs:/{run_id}/model"

    # Every model is registered under the same name, so each call creates a
    # new version of "BenchmarkModels" rather than a separate registered model.
    mlflow.register_model(
        model_uri=model_uri,
        name="BenchmarkModels"
    )

    results[name] = {
        "best_params": best_params,
        "test_metrics": test_metrics
    }


2026/02/15 18:15:16 INFO mlflow.tracking.fluent: Experiment with name 'MLflow Assignment2' does not exist. Creating a new experiment.


LogisticRegression params={'C': 0.01, 'solver': 'liblinear', 'max_iter': 1000} metrics={'accuracy': 0.972488038277512, 'roc_auc': np.float64(0.9609720798408008), 'pr_auc': np.float64(0.9337120656590089)}
LogisticRegression params={'C': 0.1, 'solver': 'liblinear', 'max_iter': 1000} metrics={'accuracy': 0.9712918660287081, 'roc_auc': np.float64(0.9605861424350238), 'pr_auc': np.float64(0.933600426169063)}
LogisticRegression params={'C': 1, 'solver': 'liblinear', 'max_iter': 1000} metrics={'accuracy': 0.972488038277512, 'roc_auc': np.float64(0.9606826267864682), 'pr_auc': np.float64(0.9351292102410739)}
LogisticRegression params={'C': 10, 'solver': 'liblinear', 'max_iter': 1000} metrics={'accuracy': 0.9712918660287081, 'roc_auc': np.float64(0.9622504974974371), 'pr_auc': np.float64(0.9360148391560262)}


  flavor.save_model(path=local_path, mlflow_model=mlflow_model, **kwargs)
Successfully registered model 'BenchmarkModels'.
Created version '1' of model 'BenchmarkModels'.


MultinomialNB params={'alpha': 0.1} metrics={'accuracy': 0.9509569377990431, 'roc_auc': np.float64(0.9631128263884702), 'pr_auc': np.float64(0.848404737673039)}
MultinomialNB params={'alpha': 0.5} metrics={'accuracy': 0.9557416267942583, 'roc_auc': np.float64(0.9645480311162034), 'pr_auc': np.float64(0.8415999163142216)}
MultinomialNB params={'alpha': 1} metrics={'accuracy': 0.9557416267942583, 'roc_auc': np.float64(0.9663631429777484), 'pr_auc': np.float64(0.8445417519805075)}
MultinomialNB params={'alpha': 2} metrics={'accuracy': 0.9509569377990431, 'roc_auc': np.float64(0.968196345655189), 'pr_auc': np.float64(0.8473034540603926)}


  flavor.save_model(path=local_path, mlflow_model=mlflow_model, **kwargs)
Registered model 'BenchmarkModels' already exists. Creating a new version of this model...
Created version '2' of model 'BenchmarkModels'.


LinearSVC params={'C': 0.01, 'max_iter': 5000} metrics={'accuracy': 0.972488038277512, 'roc_auc': np.float64(0.9558222275824639), 'pr_auc': np.float64(0.929371472215542)}




LinearSVC params={'C': 0.1, 'max_iter': 5000} metrics={'accuracy': 0.9712918660287081, 'roc_auc': np.float64(0.9626967376228668), 'pr_auc': np.float64(0.9355868138749771)}




LinearSVC params={'C': 1, 'max_iter': 5000} metrics={'accuracy': 0.965311004784689, 'roc_auc': np.float64(0.9840439003799071), 'pr_auc': np.float64(0.9664385381908788)}
LinearSVC params={'C': 10, 'max_iter': 5000} metrics={'accuracy': 0.9665071770334929, 'roc_auc': np.float64(0.9888801784960501), 'pr_auc': np.float64(0.9743866213819193)}


  flavor.save_model(path=local_path, mlflow_model=mlflow_model, **kwargs)
Registered model 'BenchmarkModels' already exists. Creating a new version of this model...
Created version '3' of model 'BenchmarkModels'.


In [45]:
# Tabulate the test metrics stored in `results` to compare the 3 models.
# No evaluation happens here; the metrics were computed during the runs.
import pandas as pd

# The dict is keyed by model name, so the transpose gives one row per model
# and one column per metric.
df_results = pd.DataFrame({
    name: res["test_metrics"] for name, res in results.items()
}).T

print(df_results)


            accuracy   roc_auc    pr_auc
LogReg      0.976105  0.978362  0.959598
NaiveBayes  0.958184  0.973498  0.875465
LinearSVM   0.972521  0.989101  0.947007


### Checkout and printing AUCPR

In [47]:
import mlflow

# Get experiment by name
# NOTE(review): presumably this returns None for an unknown experiment name,
# in which case the .experiment_id access below would fail - confirm.
experiment = mlflow.get_experiment_by_name("MLflow Assignment2")

# Search all runs in that experiment
runs = mlflow.search_runs(experiment_ids=[experiment.experiment_id])

# Print the run name, run ID and logged "pr_auc" metric of every run.
for _, row in runs.iterrows():
    run_id = row["run_id"]
    # .get falls back to "Unknown" / None when the column is absent.
    model_name = row.get("tags.mlflow.runName", "Unknown")
    pr_auc = row.get("metrics.pr_auc", None)

    print(f"Run: {model_name} | Run ID: {run_id} | AUCPR: {pr_auc}")


Run: LinearSVM | Run ID: d88d4647745648719f1c389a6840a9c6 | AUCPR: 0.947006747962271
Run: NaiveBayes | Run ID: 7d045f5d5c3c47f7a1eac63caa67ce8d | AUCPR: 0.8754648199110368
Run: LogReg | Run ID: 05beeb36e290470bbfefab296d92181f | AUCPR: 0.959597646859366
