In [1]:
import os
import pickle
import pandas as pd  # For data manipulation and analysis
import numpy as np  # For numerical computations
from sklearn.model_selection import train_test_split  # For splitting the dataset
from sklearn.preprocessing import StandardScaler, LabelEncoder  # For scaling and encoding

In [2]:
def dump_pickle(obj, filename: str):
    """Dump an object to a pickle file."""
    with open(filename, 'wb') as f_out:
        return pickle.dump(obj, f_out)

def read_dataframe(filename: str):
    """Read a DataFrame from a csv."""
    df = pd.read_csv(filename)
    return df

def preprocess(df: pd.DataFrame, ss: StandardScaler, le: LabelEncoder):
    """Preprocess the DataFrame."""
    # Drop rows with missing values
    df.dropna(inplace=True)

    # Drop unnecessary columns
    df.drop(columns=['id'], inplace=True)

    # Encoding categorical features
    categorical_columns = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
    for col in categorical_columns:
        df[col] = le.fit_transform(df[col])

    # Scaling numerical features
    numerical_columns = ['age', 'avg_glucose_level', 'bmi']
    df[numerical_columns] = ss.fit_transform(df[numerical_columns])

    return df, ss, le

def run_data_prep(
    input_file: str,
    output_dir: str,
    test_size: float = 0.2,
    random_state: int = 42
):
    """Main function to run data preparation."""
    # Read the dataset
    df = read_dataframe(input_file)

    # Initialize StandardScaler and LabelEncoder
    ss = StandardScaler()
    le = LabelEncoder()

    # Preprocess the DataFrame
    df, ss, le = preprocess(df, ss, le)

    # Split the dataset into training and testing sets
    X = df.drop(columns=['stroke'])
    y = df['stroke']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    # Save the preprocessed data
    dump_pickle((X_train, y_train), os.path.join(output_dir, 'train.pkl'))
    dump_pickle((X_test, y_test), os.path.join(output_dir, 'test.pkl'))
    dump_pickle(ss, os.path.join(output_dir, 'scaler.pkl'))
    dump_pickle(le, os.path.join(output_dir, 'label_encoder.pkl'))

run_data_prep(input_file='./data/healthcare-dataset-stroke-data.csv', output_dir='../models')

Train

In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import mlflow
import os
import pickle

In [4]:
mlflow.set_tracking_uri("http://experiment-tracking:5000")
mlflow.set_experiment("random-forest-train")

def load_pickle(filename: str):
    """Load an object from a pickle file."""
    with open(filename, 'rb') as f_in:
        return pickle.load(f_in)

def run_train(data_path: str):
    """Main function to run training."""
    mlflow.sklearn.autolog()
    # Load the preprocessed data
    X_train, y_train = load_pickle(os.path.join(data_path, 'train.pkl'))
    X_test, y_test = load_pickle(os.path.join(data_path, 'test.pkl'))

    with mlflow.start_run():
        
        # Initialize the Random Forest Classifier
        rf_classifier = RandomForestClassifier(random_state=42)

        # Train the model
        rf_classifier.fit(X_train, y_train)

        # Make predictions on the test set
        y_pred = rf_classifier.predict(X_test)

        # Evaluate the model
        accuracy = accuracy_score(y_test, y_pred)
        mlflow.log_metric("accuracy", accuracy)

run_train(data_path='../models')



🏃 View run funny-shrike-785 at: http://experiment-tracking:5000/#/experiments/3/runs/52659204dc6e4e81b85f5e8f57999874
🧪 View experiment at: http://experiment-tracking:5000/#/experiments/3


HPO

In [5]:
import os
import pickle
import mlflow
import optuna

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
mlflow.set_tracking_uri("http://experiment-tracking:5000")
mlflow.set_experiment("random-forest-hyperopt")


def load_pickle(filename: str):
    """Load an object from a pickle file."""
    with open(filename, 'rb') as f_in:
        return pickle.load(f_in)

def run_optimization(data_path: str, num_trials: int):
    """Main function to run hyperparameter optimization."""
    # Load the preprocessed data
    X_train, y_train = load_pickle(os.path.join(data_path, 'train.pkl'))
    X_test, y_test = load_pickle(os.path.join(data_path, 'test.pkl'))

    # Disable autologging to avoid conflicts with Optuna
    mlflow.sklearn.autolog(disable=True)

    def objective(trial):
        # Define the hyperparameters to tune
        params = {
                'n_estimators': trial.suggest_int('n_estimators', 10, 50, 1),
                'max_depth': trial.suggest_int('max_depth', 1, 20, 1),
                'min_samples_split': trial.suggest_int('min_samples_split', 2, 10, 1),
                'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 4, 1),
                'random_state': 42,
                'n_jobs': -1
            }
        
        with mlflow.start_run():
            mlflow.log_params(params)
            # Create the model with the suggested hyperparameters
            rf = RandomForestClassifier(**params)
            rf.fit(X_train, y_train)
            y_pred = rf.predict(X_test)

            accuracy = accuracy_score(y_test, y_pred)
            mlflow.log_metric("accuracy", accuracy)
        
        return accuracy

    # Create a study and optimize the objective function
    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=num_trials)

run_optimization(data_path='../models', num_trials=10)

[I 2025-05-06 12:34:57,895] A new study created in memory with name: no-name-210cfd78-625d-456a-a297-83aeff95ab17
  'n_estimators': trial.suggest_int('n_estimators', 10, 50, 1),
  'max_depth': trial.suggest_int('max_depth', 1, 20, 1),
  'min_samples_split': trial.suggest_int('min_samples_split', 2, 10, 1),
  'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 4, 1),
[I 2025-05-06 12:34:58,060] Trial 0 finished with value: 0.9460285132382892 and parameters: {'n_estimators': 21, 'max_depth': 1, 'min_samples_split': 4, 'min_samples_leaf': 3}. Best is trial 0 with value: 0.9460285132382892.
  'n_estimators': trial.suggest_int('n_estimators', 10, 50, 1),
  'max_depth': trial.suggest_int('max_depth', 1, 20, 1),
  'min_samples_split': trial.suggest_int('min_samples_split', 2, 10, 1),
  'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 4, 1),
[I 2025-05-06 12:34:58,227] Trial 1 finished with value: 0.9460285132382892 and parameters: {'n_estimators': 26, 'max_depth': 16, 'mi

🏃 View run silent-sow-259 at: http://experiment-tracking:5000/#/experiments/1/runs/b0d3ff9c3a9340daaf0619eb31e6d430
🧪 View experiment at: http://experiment-tracking:5000/#/experiments/1
🏃 View run awesome-ray-98 at: http://experiment-tracking:5000/#/experiments/1/runs/f9e164d0a1c043f69b592ec64f125768
🧪 View experiment at: http://experiment-tracking:5000/#/experiments/1


  'n_estimators': trial.suggest_int('n_estimators', 10, 50, 1),
  'max_depth': trial.suggest_int('max_depth', 1, 20, 1),
  'min_samples_split': trial.suggest_int('min_samples_split', 2, 10, 1),
  'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 4, 1),
[I 2025-05-06 12:34:58,440] Trial 2 finished with value: 0.9460285132382892 and parameters: {'n_estimators': 45, 'max_depth': 1, 'min_samples_split': 7, 'min_samples_leaf': 3}. Best is trial 0 with value: 0.9460285132382892.


🏃 View run unruly-dolphin-760 at: http://experiment-tracking:5000/#/experiments/1/runs/7b2208e68908403badcd810febd6f157
🧪 View experiment at: http://experiment-tracking:5000/#/experiments/1


  'n_estimators': trial.suggest_int('n_estimators', 10, 50, 1),
  'max_depth': trial.suggest_int('max_depth', 1, 20, 1),
  'min_samples_split': trial.suggest_int('min_samples_split', 2, 10, 1),
  'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 4, 1),
[I 2025-05-06 12:34:58,649] Trial 3 finished with value: 0.9460285132382892 and parameters: {'n_estimators': 38, 'max_depth': 5, 'min_samples_split': 4, 'min_samples_leaf': 3}. Best is trial 0 with value: 0.9460285132382892.
  'n_estimators': trial.suggest_int('n_estimators', 10, 50, 1),
  'max_depth': trial.suggest_int('max_depth', 1, 20, 1),
  'min_samples_split': trial.suggest_int('min_samples_split', 2, 10, 1),
  'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 4, 1),
[I 2025-05-06 12:34:58,765] Trial 4 finished with value: 0.9460285132382892 and parameters: {'n_estimators': 10, 'max_depth': 16, 'min_samples_split': 10, 'min_samples_leaf': 4}. Best is trial 0 with value: 0.9460285132382892.


🏃 View run gentle-stork-426 at: http://experiment-tracking:5000/#/experiments/1/runs/29cc5d5e1f0f426586c0c10a1cf53298
🧪 View experiment at: http://experiment-tracking:5000/#/experiments/1
🏃 View run spiffy-frog-634 at: http://experiment-tracking:5000/#/experiments/1/runs/c1c8985a32cf44fa9668a91a6611810d
🧪 View experiment at: http://experiment-tracking:5000/#/experiments/1


  'n_estimators': trial.suggest_int('n_estimators', 10, 50, 1),
  'max_depth': trial.suggest_int('max_depth', 1, 20, 1),
  'min_samples_split': trial.suggest_int('min_samples_split', 2, 10, 1),
  'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 4, 1),
[I 2025-05-06 12:34:58,947] Trial 5 finished with value: 0.9460285132382892 and parameters: {'n_estimators': 34, 'max_depth': 6, 'min_samples_split': 8, 'min_samples_leaf': 4}. Best is trial 0 with value: 0.9460285132382892.
  'n_estimators': trial.suggest_int('n_estimators', 10, 50, 1),
  'max_depth': trial.suggest_int('max_depth', 1, 20, 1),
  'min_samples_split': trial.suggest_int('min_samples_split', 2, 10, 1),
  'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 4, 1),


🏃 View run omniscient-foal-888 at: http://experiment-tracking:5000/#/experiments/1/runs/da5aeeac151945cea680a3cc4e7b171f
🧪 View experiment at: http://experiment-tracking:5000/#/experiments/1


[I 2025-05-06 12:34:59,166] Trial 6 finished with value: 0.9460285132382892 and parameters: {'n_estimators': 47, 'max_depth': 7, 'min_samples_split': 9, 'min_samples_leaf': 2}. Best is trial 0 with value: 0.9460285132382892.


🏃 View run illustrious-hawk-785 at: http://experiment-tracking:5000/#/experiments/1/runs/3e787f14cb7643d5a7e768daf935bb9b
🧪 View experiment at: http://experiment-tracking:5000/#/experiments/1


  'n_estimators': trial.suggest_int('n_estimators', 10, 50, 1),
  'max_depth': trial.suggest_int('max_depth', 1, 20, 1),
  'min_samples_split': trial.suggest_int('min_samples_split', 2, 10, 1),
  'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 4, 1),
[I 2025-05-06 12:34:59,378] Trial 7 finished with value: 0.9460285132382892 and parameters: {'n_estimators': 41, 'max_depth': 15, 'min_samples_split': 10, 'min_samples_leaf': 3}. Best is trial 0 with value: 0.9460285132382892.


🏃 View run handsome-eel-915 at: http://experiment-tracking:5000/#/experiments/1/runs/f0d43cb49d20486da532ece1bb917a61
🧪 View experiment at: http://experiment-tracking:5000/#/experiments/1


  'n_estimators': trial.suggest_int('n_estimators', 10, 50, 1),
  'max_depth': trial.suggest_int('max_depth', 1, 20, 1),
  'min_samples_split': trial.suggest_int('min_samples_split', 2, 10, 1),
  'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 4, 1),
[I 2025-05-06 12:34:59,580] Trial 8 finished with value: 0.9460285132382892 and parameters: {'n_estimators': 42, 'max_depth': 20, 'min_samples_split': 2, 'min_samples_leaf': 2}. Best is trial 0 with value: 0.9460285132382892.
  'n_estimators': trial.suggest_int('n_estimators', 10, 50, 1),
  'max_depth': trial.suggest_int('max_depth', 1, 20, 1),
  'min_samples_split': trial.suggest_int('min_samples_split', 2, 10, 1),
  'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 4, 1),
[I 2025-05-06 12:34:59,756] Trial 9 finished with value: 0.9460285132382892 and parameters: {'n_estimators': 31, 'max_depth': 1, 'min_samples_split': 7, 'min_samples_leaf': 4}. Best is trial 0 with value: 0.9460285132382892.


🏃 View run gentle-midge-993 at: http://experiment-tracking:5000/#/experiments/1/runs/a89a330dafbe4955872716d29ceef435
🧪 View experiment at: http://experiment-tracking:5000/#/experiments/1
🏃 View run suave-auk-404 at: http://experiment-tracking:5000/#/experiments/1/runs/ff11f60cf0b54260ad9fb37a3643bad2
🧪 View experiment at: http://experiment-tracking:5000/#/experiments/1


Register

In [7]:
import pandas as pd

from sklearn.feature_extraction import DictVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error

from sklearn.pipeline import make_pipeline

import mlflow
from mlflow.tracking import MlflowClient
from mlflow.entities import ViewType

In [8]:
HPO_EXPERIMENT_NAME = "random-forest-hyperopt"
EXPERIMENT_NAME = "random-forest-best-models"
RF_PARAMS = ['max_depth', 'n_estimators', 'min_samples_split', 'min_samples_leaf', 'random_state', 'n_jobs']

mlflow.set_tracking_uri("http://experiment-tracking:5000")
mlflow.set_experiment(EXPERIMENT_NAME)

def train_and_log_model(params):
    # Load preprocessed training and validation data
    X_train, y_train = load_pickle(os.path.join('../models', 'train.pkl'))
    X_test, y_test = load_pickle(os.path.join('../models', 'test.pkl'))

    with mlflow.start_run():
        for param in RF_PARAMS:
            params[param] = int(params[param])

        # Train the Random Forest Classifier
        rf_classifier = RandomForestClassifier(**params)
        rf_classifier.fit(X_train, y_train)
        y_pred = rf_classifier.predict(X_test)

        # Calculate accuracy
        accuracy = accuracy_score(y_test, y_pred)
        mlflow.log_metric('accuracy', accuracy)

        # Log the model
        mlflow.sklearn.log_model(rf_classifier, artifact_path="model")


def run_register_model(data_path: str, top_n: int):
    client = MlflowClient()

    # Retrieve the top_n model runs and log the models
    experiment = client.get_experiment_by_name(HPO_EXPERIMENT_NAME)
    runs = client.search_runs(
        experiment_ids=experiment.experiment_id,
        run_view_type=ViewType.ACTIVE_ONLY,
        max_results=top_n,
        order_by=["metrics.accuracy DESC"]
    )
    for run in runs:
        train_and_log_model(params=run.data.params)

    # Select the model with the highest test accuracy
    experiment = client.get_experiment_by_name(EXPERIMENT_NAME)
    best_run = client.search_runs(
        experiment_ids=experiment.experiment_id,
        run_view_type=ViewType.ACTIVE_ONLY,
        max_results=top_n,
        order_by=["metrics.accuracy DESC"]
    )[0]

    # Register the best model
    run_id = best_run.info.run_id
    model_uri = f"runs:/{run_id}/model"
    mlflow.register_model(model_uri, name="rf-best-model")


run_register_model("../models/", 5)



🏃 View run funny-stoat-849 at: http://experiment-tracking:5000/#/experiments/2/runs/e991236d03614a9f8954f054f3c8401b
🧪 View experiment at: http://experiment-tracking:5000/#/experiments/2




🏃 View run legendary-crab-351 at: http://experiment-tracking:5000/#/experiments/2/runs/eb558cb070ad47948fb3a31f8eea4cb8
🧪 View experiment at: http://experiment-tracking:5000/#/experiments/2




🏃 View run omniscient-stoat-903 at: http://experiment-tracking:5000/#/experiments/2/runs/c4e1453ff44142c48156c40fd60252fa
🧪 View experiment at: http://experiment-tracking:5000/#/experiments/2




🏃 View run orderly-shoat-163 at: http://experiment-tracking:5000/#/experiments/2/runs/2007d2c6e53c4a51a6eb26b755730ec2
🧪 View experiment at: http://experiment-tracking:5000/#/experiments/2


Successfully registered model 'rf-best-model'.
2025/05/06 12:35:19 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: rf-best-model, version 1


🏃 View run enchanting-perch-947 at: http://experiment-tracking:5000/#/experiments/2/runs/718d7eca7c7749f3b6032911d3ba6323
🧪 View experiment at: http://experiment-tracking:5000/#/experiments/2


Created version '1' of model 'rf-best-model'.
