### Step 1: Import `mlflow`, Set the Tracking URI, and Set Up the Experiment

In [29]:
# Import MLflow
import mlflow
import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): no category is given, so warnings raised by later cells are hidden as well.
warnings.filterwarnings("ignore")

In [2]:
# Point MLflow at the SQLite database that serves as the tracking store
tracking_uri = "sqlite:///mlflow.db"
mlflow.set_tracking_uri(tracking_uri)

In [3]:
# Select the experiment that the following runs are recorded under
experiment_name = "heart-disease-exp"
mlflow.set_experiment(experiment_name)

2023/02/03 14:09:11 INFO mlflow.tracking.fluent: Experiment with name 'heart-disease-exp' does not exist. Creating a new experiment.


<Experiment: artifact_location='./mlruns/1', creation_time=1675413551436, experiment_id='1', last_update_time=1675413551436, lifecycle_stage='active', name='heart-disease-exp', tags={}>

### Step 2: Print Experiment Details

In [4]:
# Look the experiment up by name and print its key attributes, one per line
experiment = mlflow.get_experiment_by_name("heart-disease-exp")
experiment_summary = {
    "Name": experiment.name,
    "Artifacts Location": experiment.artifact_location,
    "LifeCycle Stage": experiment.lifecycle_stage,
    "Experiment ID": experiment.experiment_id,
}
for label, value in experiment_summary.items():
    print(f"{label} : {value}")

Name : heart-disease-exp
Artifacts Location : ./mlruns/1
LifeCycle Stage : active
Experiment ID : 1


### Step 3: Load Libraries

In [23]:
import pandas as pd
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

from sklearn.metrics import (accuracy_score, confusion_matrix, classification_report,
                             precision_score, recall_score, f1_score,
                             roc_auc_score, cohen_kappa_score)

### Step 4: Load the Dataset

In [9]:
# Read the heart-disease CSV into a DataFrame and preview its first five rows
df = pd.read_csv(filepath_or_buffer="heart.csv")
df.head(n=5)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [10]:
# Summarize the loaded frame: column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    object 
 2   ChestPainType   918 non-null    object 
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 86.2+ KB


In [13]:
# Names of the object-dtype columns; these are the ones passed to the one-hot encoder later on
categorical = list(df.select_dtypes(include="object").columns)

In [17]:
# Preview the indicator columns generated for one categorical feature.
# The result is only displayed; it is not assigned or used by later cells.
pd.get_dummies(df["ChestPainType"])

Unnamed: 0,ASY,ATA,NAP,TA
0,0,1,0,0
1,0,0,1,0
2,0,1,0,0
3,1,0,0,0
4,0,0,1,0
...,...,...,...,...
913,0,0,0,1
914,1,0,0,0
915,1,0,0,0
916,0,1,0,0


### Step 5: Single Model Run

In [20]:
# Train one logistic-regression pipeline inside a single MLflow run and log
# its tags, the preprocessor artifact and the evaluation metrics.
with mlflow.start_run():
    # 1. Log the artifact (preprocessor file)
    mlflow.log_artifact("models/preprocessor.b", artifact_path="preprocessor")

    # 2. Set the tags
    mlflow.set_tag("datascientist", "kb")

    # 3. Separate the data into X and y
    y = df["HeartDisease"]
    X = df.drop("HeartDisease", axis=1)

    # 4. Split the data into train and test
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # 5. Create the preprocessor: OneHotEncoder
    #    handle_unknown="ignore" keeps predict() from raising when the test
    #    split contains a category that never appeared in the training split.
    ohe = OneHotEncoder(handle_unknown="ignore")

    # 6. Create the column transformer
    ct = make_column_transformer((ohe, categorical), remainder="passthrough")

    # 7. Define the model: Logistic Regression
    lr = LogisticRegression(solver='liblinear')

    # 8. Set tag for model (MLflow stores the estimator's string form)
    mlflow.set_tag("Model", lr)

    # 9. Create the pipeline
    pipe = make_pipeline(ct, lr)

    # 10. Fit the model from the pipeline
    pipe.fit(X_train, y_train)

    # 11. Predict on the hold-out set
    y_pred = pipe.predict(X_test)
    # Probability of the positive class; ROC AUC needs a continuous score,
    # not the thresholded class labels in y_pred.
    y_score = pipe.predict_proba(X_test)[:, 1]
    y_test = y_test.values

    # 12. Metrics variables
    accuracy = round(accuracy_score(y_test, y_pred), 2)
    precision = round(precision_score(y_test, y_pred), 2)
    recall = round(recall_score(y_test, y_pred), 2)
    roc_auc = round(roc_auc_score(y_test, y_score), 2)
    cohen_kappa = round(cohen_kappa_score(y_test, y_pred), 2)

    # 13. Log the metrics on the MLflow experiment
    mlflow.log_metric("Accuracy", accuracy)
    mlflow.log_metric("Precision", precision)
    mlflow.log_metric("Recall", recall)
    mlflow.log_metric("ROC_AUC", roc_auc)
    mlflow.log_metric("Cohen_Kappa", cohen_kappa)

### Step 6: Multiple Model Run

In [39]:
# Enable MLflow's scikit-learn autologging for the fits that follow
mlflow.sklearn.autolog()

# Candidate classifiers
lda = LinearDiscriminantAnalysis()
svm = SVC(gamma="scale")
knn = KNeighborsClassifier()
ada = AdaBoostClassifier(random_state=0)
gb = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=0)
rf = RandomForestClassifier(random_state=0)
et = ExtraTreesClassifier(n_estimators=100, random_state=0)
xgbc = XGBClassifier(random_state=0)

# Only gradient boosting and extra trees are run here.
# Full candidate list: [lda, svm, knn, ada, gb, rf, et, xgbc]
models = [gb, et]



In [40]:
# Train every estimator in `models` in its own MLflow run and log the same
# tags, artifact and metrics for each so the runs can be compared.

# The feature/target separation and the train/test split do not depend on the
# model (fixed random_state), so they are computed once instead of per run.
y = df["HeartDisease"]
X = df.drop("HeartDisease", axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
y_test = y_test.values

for index, model in enumerate(models):
    print(f"Model {index} Started Executing...🧑‍💻")
    with mlflow.start_run(run_name=f'Run {model}') as run:
        # 1. Log the artifact (preprocessor file)
        mlflow.log_artifact("models/preprocessor.b", artifact_path="preprocessor")

        # 2. Set the tags
        #    Same author tag as the single-run cell; the estimator itself is
        #    recorded under the "Model" tag below.
        mlflow.set_tag("datascientist", "kb")

        # 3. Create the preprocessor: OneHotEncoder
        #    handle_unknown="ignore" keeps predict() from raising when the test
        #    split contains a category that never appeared in the training split.
        ohe = OneHotEncoder(handle_unknown="ignore")

        # 4. Create the column transformer
        ct = make_column_transformer((ohe, categorical), remainder="passthrough")

        # 5. Set the tag for model (MLflow stores the estimator's string form)
        mlflow.set_tag("Model", model)

        # 6. Create the pipeline
        pipe = make_pipeline(ct, model)

        # 7. Fit the model from the pipeline
        pipe.fit(X_train, y_train)

        # 8. Predict on the hold-out set
        y_pred = pipe.predict(X_test)
        # ROC AUC needs a continuous score rather than thresholded labels.
        # Use the positive-class probability when the estimator offers it,
        # otherwise fall back to the decision function (e.g. a default SVC).
        if hasattr(pipe, "predict_proba"):
            y_score = pipe.predict_proba(X_test)[:, 1]
        else:
            y_score = pipe.decision_function(X_test)

        # 9. Metrics variables
        accuracy = round(accuracy_score(y_test, y_pred), 2)
        precision = round(precision_score(y_test, y_pred), 2)
        recall = round(recall_score(y_test, y_pred), 2)
        roc_auc = round(roc_auc_score(y_test, y_score), 2)
        cohen_kappa = round(cohen_kappa_score(y_test, y_pred), 2)

        # 10. Log the metrics on the MLflow experiment
        mlflow.log_metric("Accuracy", accuracy)
        mlflow.log_metric("Precision", precision)
        mlflow.log_metric("Recall", recall)
        mlflow.log_metric("ROC_AUC", roc_auc)
        mlflow.log_metric("Cohen_Kappa", cohen_kappa)

Model 0 Started Executing...🧑‍💻
Model 1 Started Executing...🧑‍💻


### Step 7: Access the Staging Model

In [41]:
# Load a model from the MLflow model registry by URI and use it to score the
# hold-out features in X_test (defined by the training cells above).
staging_model_uri = "models:///ExtraTreeModel/Staging"
# Alternative registry entry: "models:///GBTree/Staging"
divider = "=" * 50

print("Predict with MLflow Model:")
model = mlflow.sklearn.load_model(staging_model_uri)
print(divider)
print("Model:\n", model)
print(divider)
prediction = model.predict(X_test)
print("Prediction:\n", prediction)
print("Prediction is Done...🕺")

Predict with MLflow Model:
Model:
 Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('onehotencoder',
                                                  OneHotEncoder(),
                                                  ['Sex', 'ChestPainType',
                                                   'RestingECG',
                                                   'ExerciseAngina',
                                                   'ST_Slope'])])),
                ('extratreesclassifier', ExtraTreesClassifier(random_state=0))])
Prediction:
 [1 1 0 1 0 1 1 1 0 1 0 1 1 0 1 0 0 1 1 0 1 0 1 1 1 0 0 1 0 0 1 0 1 0 1 0 1]
Prediction is Done...🕺


In [42]:
# Build the result table from the hold-out features plus the model's predictions.
# `assign` returns a new DataFrame, so X_test is left untouched; wrapping X_test
# in pd.DataFrame(...) does not copy by default, and adding a column to that
# wrapper risks writing into (or warning about) the X_test slice.
# Note: the "HeartDisease" column here holds predicted labels, not ground truth.
df_result = X_test.assign(HeartDisease=prediction)
df_result

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
616,67,F,NAP,115,564,0,LVH,160,N,1.6,Flat,1
541,76,M,NAP,104,113,0,LVH,120,N,3.5,Down,1
684,47,M,NAP,108,243,0,Normal,152,N,0.0,Up,0
444,56,M,ASY,120,100,0,Normal,120,Y,1.5,Flat,1
168,58,M,ASY,135,222,0,Normal,100,N,0.0,Up,0
312,41,M,ASY,125,0,1,Normal,176,N,1.6,Up,1
603,74,M,ASY,155,310,0,Normal,112,Y,1.5,Down,1
531,64,M,ASY,143,306,1,ST,115,Y,1.8,Flat,1
597,55,M,NAP,133,185,0,ST,136,N,0.2,Up,0
762,40,M,ASY,110,167,0,LVH,114,Y,2.0,Flat,1


In [43]:
# (rows, columns) of the prediction table
df_result.shape

(37, 12)