# Install and Set Up MLflow

In [1]:
!pip install mlflow scikit-learn pandas openpyxl xgboost
!mkdir -p /content/mlruns  # Create directory for local MLflow tracking


Collecting mlflow
  Downloading mlflow-2.20.3-py3-none-any.whl.metadata (30 kB)
Collecting mlflow-skinny==2.20.3 (from mlflow)
  Downloading mlflow_skinny-2.20.3-py3-none-any.whl.metadata (31 kB)
Collecting alembic!=1.10.0,<2 (from mlflow)
  Downloading alembic-1.15.1-py3-none-any.whl.metadata (7.2 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting gunicorn<24 (from mlflow)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-skinny==2.20.3->mlflow)
  Downloading databricks_sdk-0.44.1-py3-none-any.whl.metadata (38 kB)
Collecting Mako (from alembic!=1.10.0,<2->mlflow)
  Downloading Mako-1.3.9-py3-none-any.whl.metadata (2.9 kB)
Collecting graphql-core<3.3,>=3.1 (from graphene<4->mlflow)
  Downloading graphql_core-3.2.6-py3-none-any.whl.metadata (11 kB)
Colle

In [2]:
import mlflow

# Store runs on the local filesystem (the directory created in the install cell).
TRACKING_URI = "file:///content/mlruns"
EXPERIMENT_NAME = "SMS_Spam_Classification"

mlflow.set_tracking_uri(TRACKING_URI)
# Kept as the last expression of the cell so the returned Experiment is displayed.
mlflow.set_experiment(EXPERIMENT_NAME)


2025/03/05 07:42:14 INFO mlflow.tracking.fluent: Experiment with name 'SMS_Spam_Classification' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///content/mlruns/146386704474532468', creation_time=1741160534779, experiment_id='146386704474532468', last_update_time=1741160534779, lifecycle_stage='active', name='SMS_Spam_Classification', tags={}>

# Load and Preprocess SMS Spam Data

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

# Load the dataset
file_path = "/content/appliedml1.xlsx"  # Adjust based on actual location
df = pd.read_excel(file_path)

# Force canonical column names. This is unconditional and assumes the sheet
# has exactly two columns, in (label, message) order.
df.columns = ["label", "message"]

# Encode labels as integers. LabelEncoder numbers classes in sorted order, so
# with "ham"/"spam" labels this gives ham = 0, spam = 1.
label_encoder = LabelEncoder()
df["label"] = label_encoder.fit_transform(df["label"])

# Hold out 20% of the rows for evaluation (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    df["message"], df["label"], test_size=0.2, random_state=42
)

# Ensure all messages are strings (non-text cells would break the vectorizer)
X_train = X_train.astype(str)
X_test = X_test.astype(str)

# Convert text into TF-IDF vectors. The vocabulary is fitted on the training
# split only and then applied to the test split. This is done exactly once;
# the previously duplicated fit/transform block was redundant work.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)


# Function to Train, Log, and Register a Model

In [8]:
import numpy as np
import mlflow.sklearn
from mlflow.models import infer_signature
from sklearn.metrics import average_precision_score

def train_log_register_model(model, model_name):
    """Train a classifier, log it to MLflow with its AUCPR, and register it.

    Reads the notebook-level globals ``X_train_tfidf``, ``y_train``,
    ``X_test_tfidf`` and ``y_test``; they must be defined before calling.

    Args:
        model: Unfitted estimator exposing ``fit``, ``predict`` and
            ``predict_proba``. It is fitted in place.
        model_name: Name used for the ``model_type`` param, the artifact path
            of the logged model, and the registered model name.

    Returns:
        The ID of the MLflow run created for this training.
    """
    with mlflow.start_run() as run:
        # Train the model
        model.fit(X_train_tfidf, y_train)

        # Score with column 1 of predict_proba (the class encoded as 1)
        # and summarise with average precision (AUCPR).
        y_probs = model.predict_proba(X_test_tfidf)[:, 1]
        aucpr = average_precision_score(y_test, y_probs)

        # Log parameter and metric
        mlflow.log_param("model_type", model_name)
        mlflow.log_metric("AUCPR", aucpr)

        # A single dense row serves as the input example. .toarray() already
        # returns an ndarray, so no extra np.array() copy is needed.
        input_example = X_test_tfidf[0].toarray()

        # The signature only captures dtype and column count (the row
        # dimension is variable), so infer it from the one-row example
        # instead of densifying the entire test matrix.
        signature = infer_signature(input_example, model.predict(input_example))

        # Log the model with input example & signature. log_model returns a
        # ModelInfo object whose .model_uri points at the logged artifact.
        model_info = mlflow.sklearn.log_model(
            model, model_name, input_example=input_example, signature=signature
        )

        # Register model (creates a new version if the name already exists)
        mlflow.register_model(model_uri=model_info.model_uri, name=model_name)

        print(f"Model {model_name}  | Run ID: {run.info.run_id}")

        return run.info.run_id  # Return Run ID for later use


# Function to Load a Model by Run ID and Calculate AUCPR

In [12]:
import mlflow
from sklearn.metrics import average_precision_score

def load_model_and_evaluate(run_id, model_name):
    """Reload a logged model from its run and report its test-set AUCPR.

    Reads the notebook-level globals ``X_test_tfidf`` and ``y_test``.

    Args:
        run_id: ID of the MLflow run the model was logged under.
        model_name: Artifact path the model was logged at within that run.

    Returns:
        The average precision score of the reloaded model on the test split.
    """
    # Resolve the artifact through the run rather than the model registry
    classifier = mlflow.sklearn.load_model(f"runs:/{run_id}/{model_name}")

    # Column 1 of predict_proba is used as the ranking score
    positive_scores = classifier.predict_proba(X_test_tfidf)[:, 1]
    aucpr = average_precision_score(y_test, positive_scores)

    print(f"Loaded Model {model_name} (Run ID: {run_id}) -> AUCPR: {aucpr:.4f}")
    return aucpr



# Train & Register Models

In [13]:
import numpy as np
import mlflow
import mlflow.sklearn
import xgboost as xgb
from mlflow.models import infer_signature
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import average_precision_score

# Candidate classifiers. Each key is passed to train_log_register_model as the
# model name, so it also becomes the artifact path and registered model name.
models = {
    "Logistic_Regression": LogisticRegression(),
    "Random_Forest": RandomForestClassifier(
        n_estimators=100,
        random_state=42,
    ),
    # eval_metric is set explicitly (original note: avoid deprecated params)
    "XGBoost": xgb.XGBClassifier(eval_metric="logloss"),
}

# Run ID of each training run, keyed by model name, for reloading later
run_ids = {}

for model_name in models:
    model = models[model_name]
    run_ids[model_name] = train_log_register_model(model, model_name)



Registered model 'Logistic_Regression' already exists. Creating a new version of this model...
Created version '3' of model 'Logistic_Regression'.


Model Logistic_Regression  | Run ID: 69de25cf8c034ee89e7acf69823c8b2a


Registered model 'Random_Forest' already exists. Creating a new version of this model...
Created version '3' of model 'Random_Forest'.


Model Random_Forest  | Run ID: 24873809378843d6bfc0eef1815ff5d0
Model XGBoost  | Run ID: d3af761c98354f3a86e89621248fc435


Registered model 'XGBoost' already exists. Creating a new version of this model...
Created version '3' of model 'XGBoost'.


# Load and evaluate models

In [14]:

# Reload every model from its own run and print its AUCPR on the test split
for model_name in run_ids:
    run_id = run_ids[model_name]
    load_model_and_evaluate(run_id, model_name)

Loaded Model Logistic_Regression (Run ID: 69de25cf8c034ee89e7acf69823c8b2a) -> AUCPR: 0.9772
Loaded Model Random_Forest (Run ID: 24873809378843d6bfc0eef1815ff5d0) -> AUCPR: 0.9889
Loaded Model XGBoost (Run ID: d3af761c98354f3a86e89621248fc435) -> AUCPR: 0.9565
