In [13]:
%pip install mlflow
%pip install xgboost

Collecting mlflow
  Downloading mlflow-3.1.0-py3-none-any.whl.metadata (29 kB)
Collecting mlflow-skinny==3.1.0 (from mlflow)
  Downloading mlflow_skinny-3.1.0-py3-none-any.whl.metadata (30 kB)
Collecting Flask<4 (from mlflow)
  Downloading flask-3.1.1-py3-none-any.whl.metadata (3.0 kB)
Collecting alembic!=1.10.0,<2 (from mlflow)
  Downloading alembic-1.16.1-py3-none-any.whl.metadata (7.3 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting matplotlib<4 (from mlflow)
  Downloading matplotlib-3.10.3-cp313-cp313-win_amd64.whl.metadata (11 kB)
Collecting sqlalchemy<3,>=1.4.0 (from mlflow)
  Downloading sqlalchemy-2.0.41-cp313-cp313-win_amd64.whl.metadata (9.8 kB)
Collecting waitress<4 (from mlflow)
  Downloading waitress-3.0.2-py3-none-any.whl.metadata (5.8 kB)
Collecting cloudpickle<4 (from mlflow-skinny==3.1.0->mlflow)
 

## Import The Datasets using Pandas and read_csv function
## Also Drop useless/ non feature columns like names
## Check for missing values with .isnull()

In [6]:
import pandas as pd  
import os

# Load Datasets
data_path_d = os.path.join('..', 'Datasets', 'diabetes.csv')
data_path_h = os.path.join('..', 'Datasets', 'heart_disease.csv')
data_path_p = os.path.join('..', 'Datasets', 'parkinsons.csv')

# Load Datasets
diabetes = pd.read_csv(data_path_d)
heart = pd.read_csv(data_path_h)
parkinsons = pd.read_csv(data_path_p)

# Drop name column as its useless
parkinsons = parkinsons.drop(columns=['name'], axis=1)

# Display first few rows
print("Diabetes Dataset:\n", diabetes.head(), "\n")
print("Heart Disease Dataset:\n", heart.head(), "\n")
print("parkinsons Disease Dataset:\n", parkinsons.head(), "\n")

# Check for missing values
print("Missing Values:\n")
print("Diabetes:\n", diabetes.isnull().sum(), "\n")
print("Heart:\n", heart.isnull().sum(), "\n")
print("parkinsons:\n", parkinsons.isnull().sum(), "\n")


Diabetes Dataset:
    Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1   

Heart Disease Dataset:
    age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  slope  \
0   63    1   0       145   233    1        2      150      0      2.3      2   
1   67    1   3       160   286    0        2      108      1      1.5      1   
2   67    1   3

## Create Scalers to mitigate Bias towards Large values and or way too small values
## Use joblib to dump them in respective folders
## Output before and after scaling
## Create train/test split 

In [3]:
from sklearn.preprocessing import StandardScaler
import joblib
from sklearn.model_selection import train_test_split
import pandas as pd
import os

def preprocess_data(disease_frame, target_feature, scaler_path):
    # Seperate Features to train on and the output of the data
    X = disease_frame.drop(columns=[target_feature])
    Y = disease_frame[target_feature]

    # Use Standard Scaler to scale the features and fix Extreme high and extreme low values and store it in the folder
    scaler = StandardScaler()

    # Values Before Scaling
    print("BEFORE SCALING:")
    print(X.head())
    
    # Apply the Scaler to X
    X_scaled = scaler.fit_transform(X)

    # Display the first few rows of the scaled data
    print("\nAFTER SCALING:")
    X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns)
    print(X_scaled_df.head())

    # Save the scalers to the specified path
    joblib.dump(scaler, scaler_path)

    # Create a train test Split for verifying Model metrics, using random_state = 42 reference to Hitchhiker's Guide to the Galaxy for consistent splits
    X_train, X_test, Y_train, Y_test = train_test_split(X_scaled, Y, test_size = 0.2, random_state = 42)
    
    # Return the train and test data
    return X_train, X_test , Y_train, Y_test




# Main Function Call to store Preprocessed Data in respective variables

X_train_diabetes, X_test_diabetes, y_train_diabetes, y_test_diabetes = preprocess_data(diabetes, "Outcome", os.path.join('..','Trained_Models/Scalers/diabetes_scaler.pkl'))
X_train_heart, X_test_heart, y_train_heart, y_test_heart = preprocess_data(heart, "target", os.path.join('..','Trained_Models/Scalers/heart_scaler.pkl'))
X_train_parkinsons, X_test_parkinsons, y_train_parkinsons, y_test_parkinsons = preprocess_data(parkinsons, "status", os.path.join('..','Trained_Models/Scalers/parkinsons_scaler.pkl'))

print("Preprocessing Completed and saved Scalers to designated folder...")

BEFORE SCALING:
   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  
0                     0.627   50  
1                     0.351   31  
2                     0.672   32  
3                     0.167   21  
4                     2.288   33  

AFTER SCALING:
   Pregnancies   Glucose  BloodPressure  SkinThickness   Insulin       BMI  \
0     0.639947  0.848324       0.149641       0.907270 -0.692891  0.204013   
1    -0.844885 -1.123396      -0.160546       0.530902 -0.692891 -0.684422   
2     1.233880  1.943724      -0.263941      -1.288212 -0.692891 -1.103255   
3    -0.84488

## Initiate MLFLOW for Comparison and Tracking
## Train Models with Multiple Algorthims RFC, LR, KNN, XGBClassifier
## Saved the Models in their Respective folders

In [9]:
# Import all necessary Machine Learning Algorithms for Model Training
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier


# Import metrics for Logging
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Function Definations for Model Training Algorthims

def train_model(X_Train, Y_Train, disease_name):

    # Initialze the Algorithms
    LR_model = LogisticRegression(max_iter = 1000)
    RF_model = RandomForestClassifier(n_estimators = 100, random_state=42)
    KNN_model = KNeighborsClassifier(n_neighbors=5)
    xgb_model = XGBClassifier(random_state=42)

    # Fit the Algorithms with the Training Data
    LR_model.fit(X_Train, Y_Train)
    RF_model.fit(X_Train, Y_Train)
    KNN_model.fit(X_Train, Y_Train)
    xgb_model.fit(X_Train, Y_Train)

    # Store the Models in respective folders
    joblib.dump(LR_model, os.path.join('..', 'Trained_Models', disease_name, f'{disease_name}_LR_model.pkl'))
    joblib.dump(RF_model, os.path.join('..', 'Trained_Models', disease_name, f'{disease_name}_RF_model.pkl'))
    joblib.dump(KNN_model, os.path.join('..', 'Trained_Models', disease_name, f'{disease_name}_KNN_model.pkl'))
    joblib.dump(xgb_model, os.path.join('..', 'Trained_Models', disease_name, f'{disease_name}_XGB_model.pkl'))
    print(f"Models for {disease_name} trained and saved successfully.")



# Main Function call to Train the models
train_model(X_train_diabetes, y_train_diabetes, "Diabetes_Models")
train_model(X_train_heart, y_train_heart, "Heart_Models")
train_model(X_train_parkinsons, y_train_parkinsons, "Parkinsons_Models")

Models for Diabetes_Models trained and saved successfully.
Models for Heart_Models trained and saved successfully.
Models for Parkinsons_Models trained and saved successfully.


## Add MLFLOW LOGGING
## Run via mlflow ui --backend-store-uri "file:///E:/Github Projects/MlOps MDP/MLOPS/Jupyter Notebooks/Mlflow"

In [None]:
# Import Mlflow libraries for Logging
import mlflow
import mlflow.sklearn
import mlflow.xgboost

# Import Machine Learning Algorithms 
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

# Import Metrics from sklearn
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from mlflow.models.signature import infer_signature

def train_model_with_mlflow(X_Train, Y_Train, X_Test, Y_Test, disease_name):
    # Create experiment 
    mlflow.set_experiment(disease_name)
    
    # Create directories if they don't exist
    os.makedirs(os.path.join('..', 'Trained_Models', disease_name), exist_ok=True)
    
    # Dictionary of models
    models = {
        "LogisticRegression": LogisticRegression(max_iter=1000),
        "RandomForest": RandomForestClassifier(n_estimators=100, random_state=42),
        "KNN": KNeighborsClassifier(n_neighbors=5),
        "XGBoost": XGBClassifier(random_state=42)
    }
    
    # Create input example for model signature
    input_example = X_Train[:5]
    
    for model_name, model in models.items():
        # Start a new run for each model
        with mlflow.start_run(run_name=model_name):
            # Train model
            model.fit(X_Train, Y_Train)
            
            # Make predictions
            y_pred = model.predict(X_Test)
            
            # Infer model signature
            signature = infer_signature(X_Train, y_pred)
            
            # Calculate metrics
            metrics = {
                "accuracy": accuracy_score(Y_Test, y_pred),
                "precision": precision_score(Y_Test, y_pred, zero_division=0),
                "recall": recall_score(Y_Test, y_pred, zero_division=0),
                "f1_score": f1_score(Y_Test, y_pred, zero_division=0)
            }
            
            # Print results
            print(f"\n{disease_name} - {model_name} Results:")
            for metric_name, value in metrics.items():
                print(f"{metric_name}: {value:.4f}")
            
            # Log metrics to MLflow
            mlflow.log_metrics(metrics)
            
            # Log model parameters
            mlflow.log_params(model.get_params())
            
            # Log the model with signature and input example
            if isinstance(model, XGBClassifier):
                mlflow.xgboost.log_model(
                    model, 
                    name=model_name,
                    signature=signature,
                    input_example=input_example
                )
            else:
                mlflow.sklearn.log_model(
                    model, 
                    name=model_name,
                    signature=signature,
                    input_example=input_example
                )
            
            # Save model locally
            joblib.dump(model, os.path.join('..', 'Trained_Models', disease_name, f'{model_name}_model.pkl'))

# Set MLflow tracking URI and create experiments
mlflow_dir = os.path.abspath("Mlflow")  # Converts to something like E:/Your/Path/Mlflow
os.makedirs(mlflow_dir, exist_ok=True) 
mlflow.set_tracking_uri(f"file:///{mlflow_dir.replace(os.sep, '/')}")

# Train and log models
train_model_with_mlflow(
    X_train_diabetes, y_train_diabetes, 
    X_test_diabetes, y_test_diabetes, 
    "Diabetes_Experiment"
)

train_model_with_mlflow(
    X_train_heart, y_train_heart, 
    X_test_heart, y_test_heart, 
    "Heart_Disease_Experiment"
)

train_model_with_mlflow(
    X_train_parkinsons, y_train_parkinsons, 
    X_test_parkinsons, y_test_parkinsons,
    "Parkinsons_Experiment"
)

2025/06/11 19:02:54 INFO mlflow.tracking.fluent: Experiment with name 'Diabetes_Experiment' does not exist. Creating a new experiment.



Diabetes_Experiment - LogisticRegression Results:
accuracy: 0.7532
precision: 0.6491
recall: 0.6727
f1_score: 0.6607

Diabetes_Experiment - RandomForest Results:
accuracy: 0.7273
precision: 0.6182
recall: 0.6182
f1_score: 0.6182

Diabetes_Experiment - KNN Results:
accuracy: 0.6883
precision: 0.5745
recall: 0.4909
f1_score: 0.5294

Diabetes_Experiment - XGBoost Results:
accuracy: 0.7208
precision: 0.5909
recall: 0.7091
f1_score: 0.6446


  self.get_booster().save_model(fname)
2025/06/11 19:03:07 INFO mlflow.tracking.fluent: Experiment with name 'Heart_Disease_Experiment' does not exist. Creating a new experiment.



Heart_Disease_Experiment - LogisticRegression Results:
accuracy: 0.8852
precision: 0.8788
recall: 0.9062
f1_score: 0.8923

Heart_Disease_Experiment - RandomForest Results:
accuracy: 0.8689
precision: 0.9000
recall: 0.8438
f1_score: 0.8710

Heart_Disease_Experiment - KNN Results:
accuracy: 0.9180
precision: 0.9355
recall: 0.9062
f1_score: 0.9206

Heart_Disease_Experiment - XGBoost Results:
accuracy: 0.8689
precision: 0.8750
recall: 0.8750
f1_score: 0.8750


  self.get_booster().save_model(fname)
2025/06/11 19:03:21 INFO mlflow.tracking.fluent: Experiment with name 'Parkinsons_Experiment' does not exist. Creating a new experiment.



Parkinsons_Experiment - LogisticRegression Results:
accuracy: 0.8974
precision: 0.8889
recall: 1.0000
f1_score: 0.9412

Parkinsons_Experiment - RandomForest Results:
accuracy: 0.9487
precision: 0.9412
recall: 1.0000
f1_score: 0.9697

Parkinsons_Experiment - KNN Results:
accuracy: 0.9487
precision: 0.9412
recall: 1.0000
f1_score: 0.9697

Parkinsons_Experiment - XGBoost Results:
accuracy: 0.9487
precision: 0.9412
recall: 1.0000
f1_score: 0.9697


  self.get_booster().save_model(fname)
