In [11]:
%pip install mlflow
%pip install xgboost
%pip install imbalanced-learn


Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Collecting imbalanced-learn
  Downloading imbalanced_learn-0.13.0-py3-none-any.whl.metadata (8.8 kB)
Collecting sklearn-compat<1,>=0.1 (from imbalanced-learn)
  Downloading sklearn_compat-0.1.3-py3-none-any.whl.metadata (18 kB)
Downloading imbalanced_learn-0.13.0-py3-none-any.whl (238 kB)
Downloading sklearn_compat-0.1.3-py3-none-any.whl (18 kB)
Installing collected packages: sklearn-compat, imbalanced-learn

   -------------------- ------------------- 1/2 [imbalanced-learn]
   -------------------- ------------------- 1/2 [imbalanced-learn]
   ---------------------------------------- 2/2 [imbalanced-learn]

Successfully installed imbalanced-learn-0.13.0 sklearn-compat-0.1.3
Note: you may need to restart the kernel to use updated packages.


## Import The Datasets using Pandas and read_csv function
## Also Drop useless/ non feature columns like names
## Check for missing values with .isnull()

In [2]:
import pandas as pd  
import os

# Load Datasets
data_path_d = os.path.join('..', 'Datasets', 'diabetes.csv')
data_path_h = os.path.join('..', 'Datasets', 'heart_disease.csv')
data_path_p = os.path.join('..', 'Datasets', 'parkinsons.csv')

# Load Datasets
diabetes = pd.read_csv(data_path_d)
heart = pd.read_csv(data_path_h)
parkinsons = pd.read_csv(data_path_p)

# Drop name column as its useless
parkinsons = parkinsons.drop(columns=['name'], axis=1)

# Display first few rows
print("Diabetes Dataset:\n", diabetes.head(), "\n")
print("Heart Disease Dataset:\n", heart.head(), "\n")
print("parkinsons Disease Dataset:\n", parkinsons.head(), "\n")

# Check for missing values
print("Missing Values:\n")
print("Diabetes:\n", diabetes.isnull().sum(), "\n")
print("Heart:\n", heart.isnull().sum(), "\n")
print("parkinsons:\n", parkinsons.isnull().sum(), "\n")


Diabetes Dataset:
    Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1   

Heart Disease Dataset:
    age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  slope  \
0   63    1   0       145   233    1        2      150      0      2.3      2   
1   67    1   3       160   286    0        2      108      1      1.5      1   
2   67    1   3

## Create Scalers to mitigate Bias towards Large values and or way too small values
## Use joblib to dump them in respective folders
## Output before and after scaling
## Create train/test split 

In [3]:
from sklearn.preprocessing import StandardScaler
import joblib
from sklearn.model_selection import train_test_split
import pandas as pd
import os

def preprocess_data(disease_frame, target_feature, scaler_path):
    # Seperate Features to train on and the output of the data
    X = disease_frame.drop(columns=[target_feature])
    Y = disease_frame[target_feature]

    # Use Standard Scaler to scale the features and fix Extreme high and extreme low values and store it in the folder
    scaler = StandardScaler()

    # Values Before Scaling
    print("BEFORE SCALING:")
    print(X.head())
    
    # Apply the Scaler to X
    X_scaled = scaler.fit_transform(X)

    # Display the first few rows of the scaled data
    print("\nAFTER SCALING:")
    X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns)
    print(X_scaled_df.head())

    # Save the scalers to the specified path
    joblib.dump(scaler, scaler_path)

    # Create a train test Split for verifying Model metrics, using random_state = 42 reference to Hitchhiker's Guide to the Galaxy for consistent splits
    X_train, X_test, Y_train, Y_test = train_test_split(X_scaled, Y, test_size = 0.2, random_state = 42)
    
    # Return the train and test data
    return X_train, X_test , Y_train, Y_test




# Main Function Call to store Preprocessed Data in respective variables

X_train_diabetes, X_test_diabetes, y_train_diabetes, y_test_diabetes = preprocess_data(diabetes, "Outcome", os.path.join('..','Trained_Models/Scalers/diabetes_scaler.pkl'))
X_train_heart, X_test_heart, y_train_heart, y_test_heart = preprocess_data(heart, "target", os.path.join('..','Trained_Models/Scalers/heart_scaler.pkl'))
X_train_parkinsons, X_test_parkinsons, y_train_parkinsons, y_test_parkinsons = preprocess_data(parkinsons, "status", os.path.join('..','Trained_Models/Scalers/parkinsons_scaler.pkl'))

print("Preprocessing Completed and saved Scalers to designated folder...")

BEFORE SCALING:
   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  
0                     0.627   50  
1                     0.351   31  
2                     0.672   32  
3                     0.167   21  
4                     2.288   33  

AFTER SCALING:
   Pregnancies   Glucose  BloodPressure  SkinThickness   Insulin       BMI  \
0     0.639947  0.848324       0.149641       0.907270 -0.692891  0.204013   
1    -0.844885 -1.123396      -0.160546       0.530902 -0.692891 -0.684422   
2     1.233880  1.943724      -0.263941      -1.288212 -0.692891 -1.103255   
3    -0.84488

## Initiate MLFLOW for Comparison and Tracking
## Train Models with Multiple Algorthims RFC, LR, KNN, XGBClassifier
## Saved the Models in their Respective folders

## Add MLFLOW LOGGING
## Modify below command as needed
## Run via mlflow ui --backend-store-uri "file:///E:/Github Projects/MlOps MDP/MLOPS/Jupyter Notebooks/Mlflow"

In [4]:
# Import Mlflow libraries for Logging
import mlflow
import mlflow.sklearn
import mlflow.xgboost
import pandas as pd
from mlflow.data.pandas_dataset import PandasDataset
from mlflow.data.sources import LocalArtifactDatasetSource


# Import Machine Learning Algorithms 
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

# Import Metrics from sklearn
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from mlflow.models.signature import infer_signature

def train_model_with_mlflow(X_Train, Y_Train, X_Test, Y_Test, disease_name, dataset):
    # Create experiment 
    mlflow.set_experiment(disease_name)
    
    # Create directories if they don't exist
    os.makedirs(os.path.join('..', 'Trained_Models', disease_name), exist_ok=True)
    
    # Dictionary of models
    models = {
        "LogisticRegression": LogisticRegression(max_iter=1000),
        "RandomForest": RandomForestClassifier(n_estimators=100, random_state=42),
        "KNN": KNeighborsClassifier(n_neighbors=5),
        "XGBoost": XGBClassifier(random_state=42)
    }
    
    # Create input example for model signature
    input_example = X_Train[:5]
    
    for model_name, model in models.items():
        # Start a new run for each model
        with mlflow.start_run(run_name=model_name):
            # Train model
            model.fit(X_Train, Y_Train)
            
            # Make predictions
            y_pred = model.predict(X_Test)
            
            # Infer model signature
            signature = infer_signature(X_Train, y_pred)
            
            # Calculate metrics
            metrics = {
                "accuracy": accuracy_score(Y_Test, y_pred),
                "precision": precision_score(Y_Test, y_pred, zero_division=0),
                "recall": recall_score(Y_Test, y_pred, zero_division=0),
                "f1_score": f1_score(Y_Test, y_pred, zero_division=0)
            }
            
            # Print results
            print(f"\n{disease_name} - {model_name} Results:")
            for metric_name, value in metrics.items():
                print(f"{metric_name}: {value:.4f}")
            
            # Log metrics to MLflow
            mlflow.log_metrics(metrics)
            
            # Test dataset import
            mlflow.log_input(dataset, context="training")

            # Log model parameters
            mlflow.log_params(model.get_params())
            
            # Log the model with signature and input example
            if isinstance(model, XGBClassifier):
                mlflow.xgboost.log_model(
                    model, 
                    name=model_name,
                    signature=signature,
                    input_example=input_example
                )
            else:
                mlflow.sklearn.log_model(
                    model, 
                    name=model_name,
                    signature=signature,
                    input_example=input_example
                )
            
            # Save model locally
            joblib.dump(model, os.path.join('..', 'Trained_Models', disease_name, f'{model_name}_model.pkl'))

# Set MLflow tracking URI and create experiments
mlflow_dir = os.path.abspath("Mlflow")  # Converts to something like E:/Your/Path/Mlflow
os.makedirs(mlflow_dir, exist_ok=True) 
mlflow.set_tracking_uri(f"file:///{mlflow_dir.replace(os.sep, '/')}")

# pre-load datasets for ml-flow
diabetes_mlflow: PandasDataset = mlflow.data.from_pandas(diabetes, source=LocalArtifactDatasetSource(data_path_d))
heart_mlflow: PandasDataset = mlflow.data.from_pandas(diabetes, source= LocalArtifactDatasetSource(data_path_h))
parkinsons_mlflow: PandasDataset = mlflow.data.from_pandas(diabetes, source= LocalArtifactDatasetSource(data_path_p))

In [None]:
# Train and log models
train_model_with_mlflow(X_train_diabetes, y_train_diabetes,X_test_diabetes, y_test_diabetes,"Diabetes_Experiment", diabetes_mlflow)

In [None]:
train_model_with_mlflow(X_train_heart,y_train_heart,X_test_heart,y_test_heart,"Heart_Disease_Experiment", heart_mlflow)

In [None]:
train_model_with_mlflow(X_train_parkinsons, y_train_parkinsons,X_test_parkinsons, y_test_parkinsons,"Parkinsons_Experiment", parkinsons_mlflow)

In [None]:
print(mlflow.__version__)

## Diabetes has sub-par accuracy mertics try to tune it

In [5]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from mlflow.models.signature import infer_signature
import mlflow
import mlflow.sklearn
import pandas as pd
from mlflow.data.pandas_dataset import PandasDataset
from mlflow.data.sources import LocalArtifactDatasetSource
from imblearn.over_sampling import SMOTE
import os

# Set Custom MLflow tracking URI and create experiments
mlflow_dir = os.path.abspath("Mlflow")  # Converts to something like E:/Your/Path/Mlflow
os.makedirs(mlflow_dir, exist_ok=True) 
mlflow.set_tracking_uri(f"file:///{mlflow_dir.replace(os.sep, '/')}")

# Set Dataset tracking
diabetes_mlflow: PandasDataset = mlflow.data.from_pandas(diabetes, source=LocalArtifactDatasetSource(data_path_d))
heart_mlflow: PandasDataset = mlflow.data.from_pandas(diabetes, source= LocalArtifactDatasetSource(data_path_h))
parkinsons_mlflow: PandasDataset = mlflow.data.from_pandas(diabetes, source= LocalArtifactDatasetSource(data_path_p))

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from mlflow.models.signature import infer_signature
import mlflow
import mlflow.sklearn
import pandas as pd
from mlflow.data.pandas_dataset import PandasDataset
from mlflow.data.sources import LocalArtifactDatasetSource
from imblearn.over_sampling import SMOTE
import os

def Hyperparameter_tuning(X_train, X_test, y_train, y_test, dataset):
    mlflow.set_experiment("HyperParameter-Tuning: Diabetes")

    with mlflow.start_run(run_name="LogisticTuning (Elk)"):
        param_grid = {
            'penalty': ['elasticnet'],
            'l1_ratio': [0.1, 0.5, 0.9],
            'solver': ['saga'],
            'class_weight' : ['balanced'],
            'C': [0.01, 0.1, 1, 10]
        }

        grid = GridSearchCV(LogisticRegression(), param_grid, scoring='f1', cv=5)
        grid.fit(X_train, y_train)

        best_model = grid.best_estimator_
        y_pred = best_model.predict(X_test)

        # Infer model signature
        signature = infer_signature(X_train, y_pred)

        # Load Dataset info
        mlflow.log_input(dataset, context="training")

        # Metrics
        metrics = {
            "accuracy": accuracy_score(y_test, y_pred),
            "precision": precision_score(y_test, y_pred, zero_division=0),
            "recall": recall_score(y_test, y_pred, zero_division=0),
            "f1_score": f1_score(y_test, y_pred, zero_division=0)
        }

        mlflow.log_metrics(metrics)
        mlflow.log_params(best_model.get_params())

        # Optional: log a sample of test data as input example
        input_example = X_test.iloc[:5] if hasattr(X_test, "iloc") else X_test[:5]

        # Log the model
        mlflow.sklearn.log_model(
            best_model,
            name="LogisticRegression",
            signature=signature,
            input_example=input_example
        )

        # Print results
        print(f"Diabetes - Logistic Regression: Parameter-Tuning")
        for metric_name, value in metrics.items():
            print(f"{metric_name}: {value:.4f}")

# Function call
Hyperparameter_tuning(X_train_diabetes, X_test_diabetes, y_train_diabetes, y_test_diabetes, diabetes_mlflow)

Accuracy: 0.7597402597402597
Precision: 0.6307692307692307
Recall: 0.7592592592592593
F1 Score: 0.6890756302521008


In [11]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from mlflow.models.signature import infer_signature
import mlflow
import mlflow.sklearn
import pandas as pd
from mlflow.data.pandas_dataset import PandasDataset
from mlflow.data.sources import LocalArtifactDatasetSource
from imblearn.over_sampling import SMOTE
import os

def SMOTE_Logging(X_train, X_test, y_test, dataset):
    mlflow.set_experiment("HyperParameter-Tuning: Diabetes")

    with mlflow.start_run(run_name="LogisticTuning (SMOTE)"):
        # Load your diabetes dataset
        df = pd.read_csv(data_path_d)  # Replace with your actual path or DataFrame
        X = df.drop(columns=["Outcome"])
        y = df["Outcome"]

        # Train-test split
        X_train_smote, X_test_smote, y_train_smote, y_test_smote = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

        # Apply SMOTE
        smote = SMOTE(random_state=42)
        X_train_resampled, y_train_resampled = smote.fit_resample(X_train_smote, y_train_smote)

        # Train Logistic Regression
        model = LogisticRegression(solver='liblinear')
        model.fit(X_train_resampled, y_train_resampled)

        # Predict
        y_pred = model.predict(X_test_smote)

        # Evaluate
        print("Accuracy:", accuracy_score(y_test_smote, y_pred))
        print("Precision:", precision_score(y_test_smote, y_pred))
        print("Recall:", recall_score(y_test_smote, y_pred))
        print("F1 Score:", f1_score(y_test_smote, y_pred))

        # Infer model signature
        signature = infer_signature(X_train, y_pred)

        # Load Dataset info
        mlflow.log_input(dataset, context="training")

        # Metrics
        metrics = {
            "accuracy": accuracy_score(y_test_smote, y_pred),
            "precision": precision_score(y_test_smote, y_pred),
            "recall": recall_score(y_test_smote, y_pred),
            "f1_score": f1_score(y_test_smote, y_pred)
        }

        mlflow.log_metrics(metrics)

        # Optional: log a sample of test data as input example
        input_example = X_test.iloc[:5] if hasattr(X_test, "iloc") else X_test[:5]

        # Log the model
        mlflow.sklearn.log_model(
            model,
            name="LogisticRegression",
            signature=signature,
            input_example=input_example
        )

    



SMOTE_Logging(X_train_diabetes, X_test_diabetes, y_test_diabetes, diabetes_mlflow)


    

Accuracy: 0.7597402597402597
Precision: 0.6307692307692307
Recall: 0.7592592592592593
F1 Score: 0.6890756302521008




## Logistic Regression not yielding good results even after multiple tuning stages
## Moving on to XGBOOST as its a non-linear model

In [5]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from mlflow.models.signature import infer_signature
import mlflow
import mlflow.sklearn
import pandas as pd
from mlflow.data.pandas_dataset import PandasDataset
from mlflow.data.sources import LocalArtifactDatasetSource
from imblearn.over_sampling import SMOTE
import os

def SMOTE_Logging(X_train, X_test, y_test, dataset):
    mlflow.set_experiment("HyperParameter-Tuning: Diabetes")

    with mlflow.start_run(run_name="XGBoost(SMOTE)"):
        # Load your diabetes dataset
        df = pd.read_csv(data_path_d)  # Replace with your actual path or DataFrame
        X = df.drop(columns=["Outcome"])
        y = df["Outcome"]

        # Train-test split
        X_train_smote, X_test_smote, y_train_smote, y_test_smote = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

        # Apply SMOTE
        smote = SMOTE(random_state=42)
        X_train_resampled, y_train_resampled = smote.fit_resample(X_train_smote, y_train_smote)

        # Train using XGB boost
        model = XGBClassifier(n_estimators=100, max_depth=3, learning_rate=0.1, use_label_encoder=False, eval_metric='logloss')
        
        model.fit(X_train_resampled, y_train_resampled)

        # Predict
        y_pred = model.predict(X_test_smote)

        # Evaluate
        print("Accuracy:", accuracy_score(y_test_smote, y_pred))
        print("Precision:", precision_score(y_test_smote, y_pred))
        print("Recall:", recall_score(y_test_smote, y_pred))
        print("F1 Score:", f1_score(y_test_smote, y_pred))

        # Infer model signature
        signature = infer_signature(X_train, y_pred)

        # Load Dataset info
        mlflow.log_input(dataset, context="training")

        # Metrics
        metrics = {
            "accuracy": accuracy_score(y_test_smote, y_pred),
            "precision": precision_score(y_test_smote, y_pred),
            "recall": recall_score(y_test_smote, y_pred),
            "f1_score": f1_score(y_test_smote, y_pred)
        }

        mlflow.log_metrics(metrics)

        # Optional: log a sample of test data as input example
        input_example = X_test_smote.iloc[:5] if hasattr(X_test, "iloc") else X_test_smote[:5]

        # Log the model
        mlflow.sklearn.log_model(
            model,
            name="XGBoost (SMOTE)",
            signature=signature,
            input_example=input_example
        )

SMOTE_Logging(X_train_diabetes, X_test_diabetes, y_test_diabetes, diabetes_mlflow)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Accuracy: 0.7727272727272727
Precision: 0.6461538461538462
Recall: 0.7777777777777778
F1 Score: 0.7058823529411765


In [None]:
def SMOTE_Logging(X_train, X_test, y_test, dataset):
    mlflow.set_experiment("HyperParameter-Tuning: Diabetes")

    with mlflow.start_run(run_name="XGBoost(SMOTE + Tuning + 5 Cross)"):
        # Load your diabetes dataset
        df = pd.read_csv(data_path_d)
        X = df.drop(columns=["Outcome"])
        y = df["Outcome"]

        # Train-test split
        X_train_smote, X_test_smote, y_train_smote, y_test_smote = train_test_split(
            X, y, test_size=0.2, stratify=y, random_state=42
        )

        # Find and apply scalers 
        scaler = joblib.load(os.path.join('..','Trained_Models/Scalers/diabetes_scaler.pkl'))
        X_train_smote = scaler.transform(X_train_smote)
        X_test_smote = scaler.transform(X_test_smote)

        # Apply SMOTE
        smote = SMOTE(random_state=42)
        X_train_resampled, y_train_resampled = smote.fit_resample(X_train_smote, y_train_smote)

        # Define parameter grid for XGBoost
        param_grid = {
            'n_estimators': [100, 200],
            'max_depth': [3, 5],
            'learning_rate': [0.01, 0.1],
            'subsample': [0.8, 1.0],
            'colsample_bytree': [0.8, 1.0],
            'min_child_weight': [1, 3],
            'gamma': [0, 0.1],
            'scale_pos_weight': [1, 2]
        }

        # Initialize XGBoost with base parameters
        xgb_model = XGBClassifier(
            use_label_encoder=False,
            eval_metric='logloss',
            random_state=42
        )

        # Perform Grid Search
        grid_search = GridSearchCV(
            estimator=xgb_model,
            param_grid=param_grid,
            scoring='accuracy',
            cv=5,
            n_jobs=-1,
            verbose=1
        )

        # Fit Grid Search
        grid_search.fit(X_train_resampled, y_train_resampled)

        # Get best model
        best_model = grid_search.best_estimator_
        
        # Print best parameters
        print("\nBest parameters found:")
        print(grid_search.best_params_)

        # Predict with best model
        y_pred = best_model.predict(X_test_smote)

        # Evaluate
        print("\nModel Performance:")
        print("Accuracy:", accuracy_score(y_test_smote, y_pred))
        print("Precision:", precision_score(y_test_smote, y_pred))
        print("Recall:", recall_score(y_test_smote, y_pred))
        print("F1 Score:", f1_score(y_test_smote, y_pred))

        # Infer model signature
        signature = infer_signature(X_train, y_pred)

        # Log Dataset info
        mlflow.log_input(dataset, context="training")

        # Log metrics
        metrics = {
            "accuracy": accuracy_score(y_test_smote, y_pred),
            "precision": precision_score(y_test_smote, y_pred),
            "recall": recall_score(y_test_smote, y_pred),
            "f1_score": f1_score(y_test_smote, y_pred)
        }

        # Log all results to MLflow
        mlflow.log_metrics(metrics)
        mlflow.log_params(grid_search.best_params_)
        
        # Log cross-validation results
        for i, score in enumerate(grid_search.cv_results_['mean_test_score']):
            mlflow.log_metric(f"cv_score_{i}", score)

        # Log best model with example
        input_example = X_test_smote.iloc[:5] if hasattr(X_test_smote, "iloc") else X_test_smote[:5]
        
        mlflow.xgboost.log_model(
            best_model,
            name="XGBoost (SMOTE + Tuned)",
            signature=signature,
            input_example=input_example
        )

# Function call
SMOTE_Logging(X_train_diabetes, X_test_diabetes, y_test_diabetes, diabetes_mlflow)

Fitting 5 folds for each of 256 candidates, totalling 1280 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



Best parameters found:
{'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'min_child_weight': 1, 'n_estimators': 200, 'scale_pos_weight': 2, 'subsample': 0.8}

Model Performance:
Accuracy: 0.7727272727272727
Precision: 0.6507936507936508
Recall: 0.7592592592592593
F1 Score: 0.7008547008547008


  self.get_booster().save_model(fname)


## PIMA-DIABETES DATSET WONT GO BEYOND 77% so stopping my work here, But previous experiments yielded Boost in Re-call, Precision, f1-score so its a success

## Testing Parkinsons

In [7]:
def SMOTE_Logging_Parkinsons(X_train, X_test, y_test, dataset):
    mlflow.set_experiment("Parkinsons_Experiment")

    with mlflow.start_run(run_name="XGBoost(SMOTE + Tuning + 5 Cross)"):
        # Load your parkinsons dataset
        df = pd.read_csv(data_path_p)
        df = df.drop(columns=['name'], axis=1)
        X = df.drop(columns=["status"])
        y = df["status"]

        # Train-test split
        X_train_smote, X_test_smote, y_train_smote, y_test_smote = train_test_split(
            X, y, test_size=0.2, stratify=y, random_state=42
        )

        # Find and apply scalers 
        scaler = joblib.load(os.path.join('..','Trained_Models/Scalers/parkinsons_scaler.pkl'))
        X_train_smote = scaler.transform(X_train_smote)
        X_test_smote = scaler.transform(X_test_smote)

        # Apply SMOTE
        smote = SMOTE(random_state=42)
        X_train_resampled, y_train_resampled = smote.fit_resample(X_train_smote, y_train_smote)

        # Define parameter grid for XGBoost
        param_grid = {
            'n_estimators': [100, 200],
            'max_depth': [3, 5],
            'learning_rate': [0.01, 0.1],
            'subsample': [0.8, 1.0],
            'colsample_bytree': [0.8, 1.0],
            'min_child_weight': [1, 3],
            'gamma': [0, 0.1],
            'scale_pos_weight': [1, 2]
        }

        # Initialize XGBoost with base parameters
        xgb_model = XGBClassifier(
            use_label_encoder=False,
            eval_metric='logloss',
            random_state=42
        )

        # Perform Grid Search
        grid_search = GridSearchCV(
            estimator=xgb_model,
            param_grid=param_grid,
            scoring='accuracy',
            cv=5,
            n_jobs=-1,
            verbose=1
        )

        # Fit Grid Search
        grid_search.fit(X_train_resampled, y_train_resampled)

        # Get best model
        best_model = grid_search.best_estimator_
        
        # Print best parameters
        print("\nBest parameters found:")
        print(grid_search.best_params_)

        # Predict with best model
        y_pred = best_model.predict(X_test_smote)

        # Evaluate
        print("\nModel Performance:")
        print("Accuracy:", accuracy_score(y_test_smote, y_pred))
        print("Precision:", precision_score(y_test_smote, y_pred))
        print("Recall:", recall_score(y_test_smote, y_pred))
        print("F1 Score:", f1_score(y_test_smote, y_pred))

        # Ensure X_test_smote is a DataFrame for signature and input_example
        if not isinstance(X_test_smote, pd.DataFrame):
            X_test_smote_df = pd.DataFrame(X_test_smote, columns=X.columns)
        else:
            X_test_smote_df = X_test_smote

        # Infer model signature
        signature = infer_signature(X_test_smote_df, y_pred)

        # Log Dataset info
        mlflow.log_input(dataset, context="training")

        # Log metrics
        metrics = {
            "accuracy": accuracy_score(y_test_smote, y_pred),
            "precision": precision_score(y_test_smote, y_pred),
            "recall": recall_score(y_test_smote, y_pred),
            "f1_score": f1_score(y_test_smote, y_pred)
        }

        # Log all results to MLflow
        mlflow.log_metrics(metrics)
        mlflow.log_params(grid_search.best_params_)
        
        # Log cross-validation results
        for i, score in enumerate(grid_search.cv_results_['mean_test_score']):
            mlflow.log_metric(f"cv_score_{i}", score)

        # Log best model with example
        input_example = X_test_smote_df.iloc[:5]
        
        mlflow.xgboost.log_model(
            best_model,
            name="XGBoost (SMOTE + Tuned)",
            signature=signature,
            input_example=input_example
        )

# Function call
SMOTE_Logging_Parkinsons(X_train_parkinsons, X_test_parkinsons, y_test_parkinsons, parkinsons_mlflow)

[WinError 2] The system cannot find the file specified
  File "c:\Users\Bhand\anaconda3\envs\prediction\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
        "wmic CPU Get NumberOfCores /Format:csv".split(),
        capture_output=True,
        text=True,
    )
  File "c:\Users\Bhand\anaconda3\envs\prediction\Lib\subprocess.py", line 556, in run
    with Popen(*popenargs, **kwargs) as process:
         ~~~~~^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Bhand\anaconda3\envs\prediction\Lib\subprocess.py", line 1038, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
    ~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
                        pass_fds, cwd, env,
                        ^^^^^^^^^^^^^^^^^^^
    ...<5 lines>...
                        gid, gids, uid, umask,
                        ^^^^^^^^^^^^^^^^^^^^^^
                        start_new_session, process_group)
   

Fitting 5 folds for each of 256 candidates, totalling 1280 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



Best parameters found:
{'colsample_bytree': 1.0, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'min_child_weight': 3, 'n_estimators': 200, 'scale_pos_weight': 2, 'subsample': 0.8}

Model Performance:
Accuracy: 0.9230769230769231
Precision: 0.9333333333333333
Recall: 0.9655172413793104
F1 Score: 0.9491525423728814


  self.get_booster().save_model(fname)


## Verified Parkinsons Model Moving to Heart_disease

In [6]:
def SMOTE_Logging_Heart(dataset):
    mlflow.set_experiment("Heart_Disease_Experiment")

    with mlflow.start_run(run_name="XGBoost(SMOTE + Scaler + Tuning + 5 Cross)"):
        # Load your parkinsons dataset
        df = pd.read_csv(data_path_h)
        X = df.drop(columns=["target"])
        y = df["target"]

        # Train-test split
        X_train_smote, X_test_smote, y_train_smote, y_test_smote = train_test_split(
            X, y, test_size=0.2, stratify=y, random_state=42
        )
        
        # Find and apply scalers 
        scaler = joblib.load(os.path.join('..','Trained_Models/Scalers/heart_scaler.pkl'))
        X_train_smote = scaler.transform(X_train_smote)
        X_test_smote = scaler.transform(X_test_smote)

        # Apply SMOTE
        smote = SMOTE(random_state=42)
        X_train_resampled, y_train_resampled = smote.fit_resample(X_train_smote, y_train_smote)

        # Define parameter grid for XGBoost
        param_grid = {
            'n_estimators': [100, 200],
            'max_depth': [3, 5],
            'learning_rate': [0.01, 0.1],
            'subsample': [0.8, 1.0],
            'colsample_bytree': [0.8, 1.0],
            'min_child_weight': [1, 3],
            'gamma': [0, 0.1],
            'scale_pos_weight': [1, 2]
        }

        # Initialize XGBoost with base parameters
        xgb_model = XGBClassifier(
            use_label_encoder=False,
            eval_metric='logloss',
            random_state=42
        )

        # Perform Grid Search
        grid_search = GridSearchCV(
            estimator=xgb_model,
            param_grid=param_grid,
            scoring='accuracy',
            cv=5,
            n_jobs=-1,
            verbose=1
        )

        # Fit Grid Search
        grid_search.fit(X_train_resampled, y_train_resampled)

        # Get best model
        best_model = grid_search.best_estimator_
        
        # Print best parameters
        print("\nBest parameters found:")
        print(grid_search.best_params_)

        # Predict with best model
        y_pred = best_model.predict(X_test_smote)

        # Evaluate
        print("\nModel Performance:")
        print("Accuracy:", accuracy_score(y_test_smote, y_pred))
        print("Precision:", precision_score(y_test_smote, y_pred))
        print("Recall:", recall_score(y_test_smote, y_pred))
        print("F1 Score:", f1_score(y_test_smote, y_pred))

        # Ensure X_test_smote is a DataFrame for signature and input_example
        if not isinstance(X_test_smote, pd.DataFrame):
            X_test_smote_df = pd.DataFrame(X_test_smote, columns=X.columns)
        else:
            X_test_smote_df = X_test_smote

        # Infer model signature
        signature = infer_signature(X_test_smote_df, y_pred)

        # Log Dataset info
        mlflow.log_input(dataset, context="training")

        # Log metrics
        metrics = {
            "accuracy": accuracy_score(y_test_smote, y_pred),
            "precision": precision_score(y_test_smote, y_pred),
            "recall": recall_score(y_test_smote, y_pred),
            "f1_score": f1_score(y_test_smote, y_pred)
        }

        # Log all results to MLflow
        mlflow.log_metrics(metrics)
        mlflow.log_params(grid_search.best_params_)
        
        # Log cross-validation results
        for i, score in enumerate(grid_search.cv_results_['mean_test_score']):
            mlflow.log_metric(f"cv_score_{i}", score)

        # Log best model with example
        input_example = X_test_smote_df.iloc[:5]
        
        mlflow.xgboost.log_model(
            best_model,
            name="XGBoost (SMOTE + Tuned)",
            signature=signature,
            input_example=input_example
        )

# Function call
SMOTE_Logging_Heart(parkinsons_mlflow)

Fitting 5 folds for each of 256 candidates, totalling 1280 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



Best parameters found:
{'colsample_bytree': 1.0, 'gamma': 0.1, 'learning_rate': 0.1, 'max_depth': 3, 'min_child_weight': 3, 'n_estimators': 100, 'scale_pos_weight': 1, 'subsample': 0.8}

Model Performance:
Accuracy: 0.8688524590163934
Precision: 0.8333333333333334
Recall: 0.8928571428571429
F1 Score: 0.8620689655172413


  self.get_booster().save_model(fname)
