In [1]:
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd

csv_path = "/content/drive/MyDrive/MLOps_Assignment_1/heart_cleaned.csv"
df = pd.read_csv(csv_path)

df.head()


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,1,145,233,1,2,150,0,2.3,3,0.0,6.0,0
1,67,1,4,160,286,0,2,108,1,1.5,2,3.0,3.0,2
2,67,1,4,120,229,0,2,129,1,2.6,2,2.0,7.0,1
3,37,1,3,130,250,0,0,187,0,3.5,3,0.0,3.0,0
4,41,0,2,130,204,0,2,172,0,1.4,1,0.0,3.0,0


In [3]:
# Convert to binary target
df["target"] = (df["target"] > 0).astype(int)

df["target"].value_counts()

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
0,164
1,139


In [4]:
X = df.drop("target", axis=1)
y = df["target"]

Test-Train Split

In [5]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)


In [6]:
numerical_features = ["age", "trestbps", "chol", "thalach", "oldpeak", "ca"]
categorical_features = [
    "sex", "cp", "fbs", "restecg", "exang", "slope", "thal"
]

print("Numerical:", numerical_features)
print("Categorical:", categorical_features)


Numerical: ['age', 'trestbps', 'chol', 'thalach', 'oldpeak', 'ca']
Categorical: ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'thal']


# Feature Scaling

In [7]:
from sklearn.preprocessing import StandardScaler

numerical_features = ["age", "trestbps", "chol", "thalach", "oldpeak", "ca"]

scaler = StandardScaler()

X_train_scaled = X_train.copy()
X_test_scaled = X_test.copy()

X_train_scaled[numerical_features] = scaler.fit_transform(
    X_train[numerical_features]
)
X_test_scaled[numerical_features] = scaler.transform(
    X_test[numerical_features]
)


# Model 1 : logistic regression

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report

lr = LogisticRegression(max_iter=1000)
lr.fit(X_train_scaled, y_train)

y_pred_lr = lr.predict(X_test_scaled)
y_prob_lr = lr.predict_proba(X_test_scaled)[:, 1]

print("Accuracy:", accuracy_score(y_test, y_pred_lr))
print("ROC-AUC:", roc_auc_score(y_test, y_prob_lr))
print(classification_report(y_test, y_pred_lr))


Accuracy: 0.8688524590163934
ROC-AUC: 0.9512987012987013
              precision    recall  f1-score   support

           0       0.93      0.82      0.87        33
           1       0.81      0.93      0.87        28

    accuracy                           0.87        61
   macro avg       0.87      0.87      0.87        61
weighted avg       0.88      0.87      0.87        61



#Model 2 - Random Forest

In [9]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(
    n_estimators=200,
    random_state=42
)

rf.fit(X_train, y_train)

y_pred_rf = rf.predict(X_test)
y_prob_rf = rf.predict_proba(X_test)[:, 1]

print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print("ROC-AUC:", roc_auc_score(y_test, y_prob_rf))
print(classification_report(y_test, y_pred_rf))


Random Forest Accuracy: 0.9016393442622951
ROC-AUC: 0.9545454545454546
              precision    recall  f1-score   support

           0       0.97      0.85      0.90        33
           1       0.84      0.96      0.90        28

    accuracy                           0.90        61
   macro avg       0.90      0.91      0.90        61
weighted avg       0.91      0.90      0.90        61



# Model Comparison

In [10]:
results = pd.DataFrame({
    "Model": ["Logistic Regression", "Random Forest"],
    "Accuracy": [
        accuracy_score(y_test, y_pred_lr),
        accuracy_score(y_test, y_pred_rf)
    ],
    "ROC-AUC": [
        roc_auc_score(y_test, y_prob_lr),
        roc_auc_score(y_test, y_prob_rf)
    ]
})

results


Unnamed: 0,Model,Accuracy,ROC-AUC
0,Logistic Regression,0.868852,0.951299
1,Random Forest,0.901639,0.954545


# Cross-Validation

In [11]:
from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(
    rf, X, y, cv=5, scoring="roc_auc"
)

print("Cross-validation ROC-AUC scores:", cv_scores)
print("Mean ROC-AUC:", cv_scores.mean())


Cross-validation ROC-AUC scores: [0.89393939 0.95400433 0.89556277 0.89225589 0.86886161]
Mean ROC-AUC: 0.9009247985810486


### Best Model Selection

Based on accuracy, ROC-AUC score, and cross-validation performance,
the Random Forest model was selected as the final model for deployment.


# Save the trained ML model

In [12]:
import joblib

joblib.dump(rf, "random_forest_model.pkl")
joblib.dump(scaler, "scaler.pkl")


['scaler.pkl']

# Step 3 Model tracking with ML flow

In [13]:
!rm -rf mlruns

In [14]:
!pip install -q mlflow

In [15]:
import mlflow
import mlflow.sklearn

In [16]:
import os
os.makedirs("/content/mlruns", exist_ok=True)

In [17]:
mlflow.set_tracking_uri("file:///content/mlruns")
mlflow.set_experiment("Heart Disease Classification")

  return FileStore(store_uri, store_uri)
2025/12/26 10:08:08 INFO mlflow.tracking.fluent: Experiment with name 'Heart Disease Classification' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///content/mlruns/227299683731459122', creation_time=1766743688135, experiment_id='227299683731459122', last_update_time=1766743688135, lifecycle_stage='active', name='Heart Disease Classification', tags={}>

# Log Logistic Regression model

In [18]:
with mlflow.start_run(run_name="Logistic Regression"):
    mlflow.log_param("model_type", "LogisticRegression")
    mlflow.log_metric("accuracy", accuracy_score(y_test, y_pred_lr))
    mlflow.log_metric("roc_auc", roc_auc_score(y_test, y_prob_lr))
    mlflow.sklearn.log_model(lr, "model")




# Log Random forest model

In [19]:
with mlflow.start_run(run_name="Random Forest"):
    mlflow.log_param("model_type", "RandomForest")
    mlflow.log_metric("accuracy", accuracy_score(y_test, y_pred_rf))
    mlflow.log_metric("roc_auc", roc_auc_score(y_test, y_prob_rf))
    mlflow.sklearn.log_model(rf, "model")




# Run verification

In [20]:
import os

os.listdir("mlruns")


['227299683731459122', '.trash']

In [21]:
!find mlruns -type d -name model


In [None]:
!mlflow ui --host 0.0.0.0 --port 5000 &

Backend store URI not provided. Using ./mlruns
Registry store URI not provided. Using backend store URI.
  return FileStore(store_uri, artifact_uri)
  return FileStore(store_uri)
[MLflow] Security middleware enabled with default settings (localhost-only). To allow connections from other hosts, use --host 0.0.0.0 and configure --allowed-hosts and --cors-allowed-origins.
[32mINFO[0m:     Uvicorn running on [1mhttp://0.0.0.0:5000[0m (Press CTRL+C to quit)
[32mINFO[0m:     Started parent process [[36m[1m7303[0m]
[32mINFO[0m:     Started server process [[36m7305[0m]
[32mINFO[0m:     Waiting for application startup.
[32mINFO[0m:     Application startup complete.
[32mINFO[0m:     Started server process [[36m7306[0m]
[32mINFO[0m:     Waiting for application startup.
[32mINFO[0m:     Application startup complete.
[32mINFO[0m:     Started server process [[36m7307[0m]
[32mINFO[0m:     Waiting for application startup.
[32mINFO[0m:     Application startup complete.


In [None]:
import time
from google.colab import output

# Give MLflow a moment to fully start up
time.sleep(5)
output.serve_kernel_port(5000, external_link=True)