## Importing libraries

In [1]:
import os
import json
import mlflow
import numpy as np
import pandas as pd
import scipy.sparse
import mlflow.sklearn
from xgboost import XGBClassifier
from mlflow.tracking import MlflowClient
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

from mlflow.models import infer_signature
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, average_precision_score

import warnings
warnings.filterwarnings('ignore')

## Loading the data

In [2]:

# Loads and preprocesses train, validation, and test datasets

def load_data(train_path, val_path, test_path):
    # Load datasets
    train = pd.read_csv(train_path)
    val = pd.read_csv(val_path)
    test = pd.read_csv(test_path)

    
    # Remove any nan value
    X_train = train["Messages"].fillna("")
    X_val = val["Messages"].fillna("")
    X_test = test["Messages"].fillna("")
    
    y_train, y_val, y_test = train["labels"].values, val["labels"].values, test["labels"].values

    
    vectorizer = TfidfVectorizer(strip_accents='unicode')
    
    # Fit on training data and transform all sets
    X_train_vec = vectorizer.fit_transform(X_train.astype(str))
    X_val_vec = vectorizer.transform(X_val.astype(str))
    X_test_vec = vectorizer.transform(X_test.astype(str))
    
    encoder = LabelEncoder()
    y_train = encoder.fit_transform(y_train)
    y_val = encoder.transform(y_val)
    y_test = encoder.transform(y_test)

    return X_train_vec, X_val_vec, X_test_vec, y_train, y_val, y_test, vectorizer

In [3]:
train_path = r"C:\Users\Keshav\Desktop\DS Course\Sem 4\3. AML\Assignment 2\train.csv"
test_path = r"C:\Users\Keshav\Desktop\DS Course\Sem 4\3. AML\Assignment 2\test.csv"
val_path = r"C:\Users\Keshav\Desktop\DS Course\Sem 4\3. AML\Assignment 2\val.csv"

In [4]:
X_train, X_val, X_test, y_train, y_val, y_test, vectorizer = load_data(train_path, test_path, val_path)

## Defining model dictionary and Metrics

In [5]:
model_dict = {
    "logistic_regression": {
        "model": LogisticRegression,
        "params": {
            "random_state": 42,
            "penalty": "elasticnet",
            "solver": "saga",
            "C": 0.91404735247,
            "l1_ratio": 0.00282185149,
        },
    },

    
    "decision_tree": {
         "model": DecisionTreeClassifier, 
         "params": {}
                      
    },
    
    "xgboost": {
        "model": XGBClassifier,
        "params": {
            "max_depth": 5,
            "learning_rate": 0.0980602457,
            "subsample": 0.585659001039,
        },
    },
}


# Define metrics
metrics = {
    "accuracy": accuracy_score,
    "aucpr": average_precision_score,  # AUCPR for model selection
}


## Tracking experiment run with mlflow

In [6]:

def get_model_config(model_name, model_dict):
    model_config = model_dict.get(model_name)
    if model_config is None:
        raise ValueError(f"Model name '{model_name}' not recognised.")
    return model_config

def train_and_evaluate(model_config, model_params, X_train, y_train, X_val, y_val, metrics):
    model = model_config["model"](**model_params)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    
    return {
        metric_name: metric(y_val, y_pred) for metric_name, metric in metrics.items()
    }

def log_results(model_name, model_params, model_results):
    mlflow.log_params(model_params)
    for metric_name, metric_value in model_results.items():
        mlflow.log_metric(f"{model_name}_{metric_name}", metric_value)

def register_model(model_name, model):
    client = MlflowClient()
    mlflow.sklearn.log_model(model, model_name)
    
    current_run_id = mlflow.active_run().info.run_id
    model_uri = f"runs:/{current_run_id}/{model_name}"
    
    try:
        client.create_registered_model(model_name)
    except Exception:
        print(f"Model {model_name} already exists in the registry.")
    
    model_details = client.create_model_version(model_name, model_uri, current_run_id)
    print(f"=> {model_name} version {model_details.version} has been logged to registry.")
    

def track_model(model_name, X_train, y_train, X_val, y_val, params={}, run_name=None, user_params=True, model_dict=model_dict, metrics=metrics):
    run_name = run_name or model_name
    
    with mlflow.start_run(run_name=run_name):
        print(f"Starting run {run_name}")
        
        model_config = get_model_config(model_name, model_dict)
        print(f"Model name: {model_name}")
        
        model_params = params if user_params else model_config["params"]
        print(f"Model params:{json.dumps(model_params, indent=4)}")
        
        model_results = train_and_evaluate(model_config, model_params, X_train, y_train, X_val, y_val, metrics)
        print(f"Model results:\n{json.dumps(model_results, indent=4)}")
        
        log_results(model_name, model_params, model_results)
        register_model(model_name, model_config["model"](**model_params))
        

In [7]:
track_model("logistic_regression", X_train, y_train, X_val, y_val)

Starting run logistic_regression
Model name: logistic_regression
Model params:{}
Model results:
{
    "accuracy": 0.9605263157894737,
    "aucpr": 0.7241204978047083
}




=> logistic_regression version 1 has been logged to registry.


In [8]:
track_model("logistic_regression", X_train, y_train, X_val, y_val, user_params=False)

Starting run logistic_regression
Model name: logistic_regression
Model params:{
    "random_state": 42,
    "penalty": "elasticnet",
    "solver": "saga",
    "C": 0.91404735247,
    "l1_ratio": 0.00282185149
}
Model results:
{
    "accuracy": 0.9605263157894737,
    "aucpr": 0.7241204978047083
}




Model logistic_regression already exists in the registry.
=> logistic_regression version 2 has been logged to registry.


In [9]:
track_model("xgboost", X_train, y_train, X_val, y_val)

Starting run xgboost
Model name: xgboost
Model params:{}
Model results:
{
    "accuracy": 0.9712918660287081,
    "aucpr": 0.7975334103283287
}




=> xgboost version 1 has been logged to registry.


In [10]:
track_model("xgboost", X_train, y_train, X_val, y_val, user_params=False)

Starting run xgboost
Model name: xgboost
Model params:{
    "max_depth": 5,
    "learning_rate": 0.0980602457,
    "subsample": 0.585659001039
}
Model results:
{
    "accuracy": 0.9641148325358851,
    "aucpr": 0.7482607554559585
}




Model xgboost already exists in the registry.
=> xgboost version 2 has been logged to registry.


In [11]:
track_model("decision_tree", X_train, y_train, X_val, y_val)

Starting run decision_tree
Model name: decision_tree
Model params:{}
Model results:
{
    "accuracy": 0.9569377990430622,
    "aucpr": 0.699710690312194
}




=> decision_tree version 1 has been logged to registry.


In [12]:
track_model("decision_tree", X_train, y_train, X_val, y_val, user_params=False)

Starting run decision_tree
Model name: decision_tree
Model params:{}
Model results:
{
    "accuracy": 0.9605263157894737,
    "aucpr": 0.7242894470218098
}




Model decision_tree already exists in the registry.
=> decision_tree version 2 has been logged to registry.
