# Install and Set Up MLflow

In [1]:
!pip install mlflow scikit-learn pandas openpyxl xgboost
!mkdir -p /content/mlruns  # Create directory for local MLflow tracking


Collecting mlflow
  Downloading mlflow-2.20.3-py3-none-any.whl.metadata (30 kB)
Collecting mlflow-skinny==2.20.3 (from mlflow)
  Downloading mlflow_skinny-2.20.3-py3-none-any.whl.metadata (31 kB)
Collecting alembic!=1.10.0,<2 (from mlflow)
  Downloading alembic-1.15.1-py3-none-any.whl.metadata (7.2 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting gunicorn<24 (from mlflow)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-skinny==2.20.3->mlflow)
  Downloading databricks_sdk-0.44.1-py3-none-any.whl.metadata (38 kB)
Collecting Mako (from alembic!=1.10.0,<2->mlflow)
  Downloading Mako-1.3.9-py3-none-any.whl.metadata (2.9 kB)
Collecting graphql-core<3.3,>=3.1 (from graphene<4->mlflow)
  Downloading graphql_core-3.2.6-py3-none-any.whl.metadata (11 kB)
Colle

In [2]:
import mlflow

# Store run data on the local filesystem, then select the experiment that the
# runs below are recorded under. set_experiment is left as the final
# expression so the notebook displays the Experiment object it returns.
mlflow.set_tracking_uri(uri="file:///content/mlruns")
mlflow.set_experiment(experiment_name="SMS_Spam_Classification")


2025/03/04 22:33:21 INFO mlflow.tracking.fluent: Experiment with name 'SMS_Spam_Classification' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///content/mlruns/516238096360371125', creation_time=1741127601393, experiment_id='516238096360371125', last_update_time=1741127601393, lifecycle_stage='active', name='SMS_Spam_Classification', tags={}>

# Load and Preprocess SMS Spam Data

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

# Load the dataset
file_path = "/content/appliedml1.xlsx"  # Adjust based on actual location
df = pd.read_excel(file_path)

# Give the two columns stable names (raises if the sheet does not have exactly two columns)
df.columns = ["label", "message"]

# Encode labels as integers. LabelEncoder numbers classes in sorted order, so
# this yields ham = 0, spam = 1 -- assuming the raw labels are the strings
# "ham" and "spam" (TODO confirm against the spreadsheet).
label_encoder = LabelEncoder()
df["label"] = label_encoder.fit_transform(df["label"])

# Split dataset (fixed seed so the split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(
    df["message"], df["label"], test_size=0.2, random_state=42
)

# Ensure all messages are strings (TfidfVectorizer expects text input)
X_train = X_train.astype(str)
X_test = X_test.astype(str)

# Convert text into TF-IDF vectors. The vocabulary is fitted on the training
# split only and then applied to the test split. This is done once: the
# original cell repeated this block verbatim, refitting the vectorizer for no
# change in the result.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)


# Define Model Training and Logging Function

In [5]:
import mlflow.sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.metrics import average_precision_score

def train_and_log_model(model, model_name):
    """Fit ``model`` on the TF-IDF training split and record the run in MLflow.

    Reads the notebook-level ``X_train_tfidf``, ``y_train``, ``X_test_tfidf``
    and ``y_test`` variables, so the preprocessing cell must have been run
    first.

    Args:
        model: Unfitted estimator exposing ``fit`` and either
            ``predict_proba`` or ``decision_function``.
        model_name: Human-readable label. Used as the MLflow run name, stored
            as the ``model_type`` param, and used as the artifact path of the
            logged model.

    Returns:
        The average precision (area under the precision-recall curve) of the
        fitted model on the test split.
    """
    # Naming the run makes the three benchmarks distinguishable in the MLflow UI.
    with mlflow.start_run(run_name=model_name):
        model.fit(X_train_tfidf, y_train)

        # Score the test split. Column 1 of predict_proba is the probability of
        # the class encoded as 1; estimators without predict_proba fall back to
        # their decision_function margins, which average_precision_score
        # also accepts as scores.
        if hasattr(model, "predict_proba"):
            y_scores = model.predict_proba(X_test_tfidf)[:, 1]
        else:
            y_scores = model.decision_function(X_test_tfidf)
        aucpr = average_precision_score(y_test, y_scores)

        # Log parameters
        mlflow.log_param("model_type", model_name)

        # Log metric (AUCPR)
        mlflow.log_metric("AUCPR", aucpr)

        # Log the fitted model as a run artifact
        mlflow.sklearn.log_model(model, model_name)

        print(f"{model_name} -> AUCPR: {aucpr:.4f}")

        return aucpr


# Train and Register Three Benchmark Models

In [6]:
# Logistic Regression
lr_model = LogisticRegression()
aucpr_lr = train_and_log_model(lr_model, "Logistic Regression")

# Random Forest -- seeded (same seed as the train/test split) so that
# re-running the notebook reproduces the same forest and the same AUCPR.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
aucpr_rf = train_and_log_model(rf_model, "Random Forest")

# XGBoost -- `use_label_encoder` is no longer passed: the installed XGBoost
# reports it as an unused parameter and emits a warning on every fit.
xgb_model = xgb.XGBClassifier(eval_metric="logloss")
aucpr_xgb = train_and_log_model(xgb_model, "XGBoost")




Logistic Regression -> AUCPR: 0.9772


Parameters: { "use_label_encoder" } are not used.



Random Forest -> AUCPR: 0.9892




XGBoost -> AUCPR: 0.9565


# Retrieve and Print AUCPR for Each Model

In [7]:
import mlflow
import pandas as pd

# Fetch the logged runs for the active experiment, then keep only the
# model label and its AUCPR.
runs = mlflow.search_runs()
results = runs.loc[:, ["params.model_type", "metrics.AUCPR"]]

# One print call; the newline separator produces the same header-then-table output.
print("Model Performance:", results, sep="\n")


Model Performance:
     params.model_type  metrics.AUCPR
0              XGBoost       0.956513
1        Random Forest       0.989221
2  Logistic Regression       0.977175
