In [2]:
# Imports
!pip install mlflow
import pandas as pd
import numpy as np
import joblib
import mlflow
import mlflow.sklearn
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, classification_report
import os

# Ensure models directory exists
os.makedirs("../models", exist_ok=True)

Collecting mlflow
  Downloading mlflow-3.8.1-py3-none-any.whl.metadata (31 kB)
Collecting mlflow-skinny==3.8.1 (from mlflow)
  Downloading mlflow_skinny-3.8.1-py3-none-any.whl.metadata (31 kB)
Collecting mlflow-tracing==3.8.1 (from mlflow)
  Downloading mlflow_tracing-3.8.1-py3-none-any.whl.metadata (19 kB)
Collecting Flask-CORS<7 (from mlflow)
  Downloading flask_cors-6.0.2-py3-none-any.whl.metadata (5.3 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting gunicorn<24 (from mlflow)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting huey<3,>=2.5.0 (from mlflow)
  Downloading huey-2.5.5-py3-none-any.whl.metadata (4.8 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-skinny==3.8.1->mlflow)
  Downloading databricks_sdk-0.76.0-py3-none-any.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━

In [5]:
# Load & Split Data
data_path = "/content/sample_data/data/raw/heart.csv"
df = pd.read_csv(data_path)

# Convert target if not already done in raw file
if 'target' not in df.columns:
    df['target'] = df['num'].apply(lambda x: 1 if x > 0 else 0)
    df = df.drop(columns=['num'])

X = df.drop('target', axis=1)
y = df['target']

# 80-20 Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print(f"Training Shape: {X_train.shape}")
print(f"Testing Shape: {X_test.shape}")

Training Shape: (242, 13)
Testing Shape: (61, 13)


In [6]:
# Define Preprocessing Pipeline
# This must match exactly what we will use in production/inference

# Identify column types
numeric_features = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
categorical_features = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']

# 1. Numeric: Impute missing (median) -> Scale (StandardScaler)
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# 2. Categorical: Impute missing (mode) -> OneHot Encode
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

print("Pipeline constructed successfully.")

Pipeline constructed successfully.


In [7]:
# MLflow Setup
# Set the experiment name
mlflow.set_experiment("Heart_Disease_Experiments")

def eval_metrics(actual, pred, probs):
    accuracy = accuracy_score(actual, pred)
    precision = precision_score(actual, pred)
    recall = recall_score(actual, pred)
    roc_auc = roc_auc_score(actual, probs)
    return accuracy, precision, recall, roc_auc

2026/01/05 10:38:35 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2026/01/05 10:38:35 INFO mlflow.store.db.utils: Updating database tables
2026/01/05 10:38:35 INFO alembic.runtime.migration: Context impl SQLiteImpl.
2026/01/05 10:38:35 INFO alembic.runtime.migration: Will assume non-transactional DDL.
2026/01/05 10:38:35 INFO alembic.runtime.migration: Running upgrade  -> 451aebb31d03, add metric step
2026/01/05 10:38:35 INFO alembic.runtime.migration: Running upgrade 451aebb31d03 -> 90e64c465722, migrate user column to tags
2026/01/05 10:38:35 INFO alembic.runtime.migration: Running upgrade 90e64c465722 -> 181f10493468, allow nulls for metric values
2026/01/05 10:38:35 INFO alembic.runtime.migration: Running upgrade 181f10493468 -> df50e92ffc5e, Add Experiment Tags Table
2026/01/05 10:38:35 INFO alembic.runtime.migration: Running upgrade df50e92ffc5e -> 7ac759974ad8, Update run tags with larger limit
2026/01/05 10:38:35 INFO alembic.runtime.migration: Running 

In [8]:
# Experiment 1 - Logistic Regression
with mlflow.start_run(run_name="Logistic_Regression_Baseline"):
    # Define Model
    lr = LogisticRegression(max_iter=1000, random_state=42)

    # Create Full Pipeline
    model_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                     ('classifier', lr)])

    # Train
    model_pipeline.fit(X_train, y_train)

    # Predict
    y_pred = model_pipeline.predict(X_test)
    y_probs = model_pipeline.predict_proba(X_test)[:, 1]

    # Evaluate
    acc, prec, rec, auc = eval_metrics(y_test, y_pred, y_probs)

    # Log Parameters & Metrics
    mlflow.log_param("model_type", "LogisticRegression")
    mlflow.log_metric("accuracy", acc)
    mlflow.log_metric("precision", prec)
    mlflow.log_metric("recall", rec)
    mlflow.log_metric("roc_auc", auc)

    # Log Model Artifact
    mlflow.sklearn.log_model(model_pipeline, "model")

    print(f"Logistic Regression - Accuracy: {acc:.4f}, AUC: {auc:.4f}")



Logistic Regression - Accuracy: 0.8689, AUC: 0.9102


In [9]:
# Experiment 2 - Random Forest
with mlflow.start_run(run_name="Random_Forest_Default"):
    # Define Model
    rf = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)

    # Create Full Pipeline
    rf_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                  ('classifier', rf)])

    # Train
    rf_pipeline.fit(X_train, y_train)

    # Predict
    y_pred = rf_pipeline.predict(X_test)
    y_probs = rf_pipeline.predict_proba(X_test)[:, 1]

    # Evaluate
    acc, prec, rec, auc = eval_metrics(y_test, y_pred, y_probs)

    # Log Parameters & Metrics
    mlflow.log_param("model_type", "RandomForest")
    mlflow.log_param("n_estimators", 100)
    mlflow.log_param("max_depth", 10)
    mlflow.log_metric("accuracy", acc)
    mlflow.log_metric("precision", prec)
    mlflow.log_metric("recall", rec)
    mlflow.log_metric("roc_auc", auc)

    # Log Model Artifact
    mlflow.sklearn.log_model(rf_pipeline, "model")

    print(f"Random Forest - Accuracy: {acc:.4f}, AUC: {auc:.4f}")



Random Forest - Accuracy: 0.7869, AUC: 0.9042


In [10]:
# Model Selection & Saving
# Assuming Random Forest performed better (check output above)
# We save the Random Forest pipeline as our "Production" model

joblib.dump(rf_pipeline, "../models/model.joblib")
print("Best model saved to ../models/model.joblib")

# Sanity Check: Load and Predict
loaded_model = joblib.load("../models/model.joblib")
sample_pred = loaded_model.predict(X_test.iloc[:5])
print("Prediction on first 5 test samples:", sample_pred)

Best model saved to ../models/model.joblib
Prediction on first 5 test samples: [0 0 0 1 1]
