# Preprocess data

In [None]:
import os
import pickle
import pandas as pd  # For data manipulation and analysis
import numpy as np  # For numerical computations
from sklearn.model_selection import train_test_split  # For splitting the dataset
from sklearn.preprocessing import StandardScaler, LabelEncoder  # For scaling and encoding

In [None]:
def dump_pickle(obj, filename: str):
    """Serialize ``obj`` with pickle and write it to ``filename``.

    The target is opened in binary write mode, so an existing file at that
    path is overwritten. Returns ``None``.
    """
    with open(filename, mode='wb') as sink:
        pickle.dump(obj, sink)
    return None

def read_dataframe(filename: str):
    """Load the CSV file at ``filename`` into a pandas DataFrame.

    Parsing is delegated to ``pandas.read_csv`` with its default options.
    """
    return pd.read_csv(filename)

def preprocess(df: pd.DataFrame, ss: StandardScaler, le: LabelEncoder):
    """Clean, encode and scale the raw DataFrame.

    Steps, in order: drop rows containing any missing value, drop the ``id``
    column, label-encode the categorical columns, and standard-scale the
    numerical columns.

    The caller's ``df`` is left untouched; all work happens on a new frame.

    Args:
        df: Raw data. Must contain ``id`` plus the categorical and numerical
            columns listed in the body.
        ss: Scaler that is fitted on the numerical columns.
        le: Encoder that is fitted on each categorical column in turn.

    Returns:
        A ``(df, ss, le)`` tuple: the processed frame, the fitted scaler and
        the encoder.

    Note:
        The same ``le`` instance is refit for every categorical column, so on
        return it only holds the classes of the last column
        (``smoking_status``). It cannot be used to encode or decode the other
        columns.
    """
    # dropna()/drop() without inplace return new frames, so the argument the
    # caller passed in is not modified as a side effect.
    df = df.dropna().drop(columns=['id'])

    # Encoding categorical features
    categorical_columns = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
    for col in categorical_columns:
        # Each fit_transform call discards what the previous fit learned.
        df[col] = le.fit_transform(df[col])

    # Scaling numerical features
    numerical_columns = ['age', 'avg_glucose_level', 'bmi']
    df[numerical_columns] = ss.fit_transform(df[numerical_columns])

    return df, ss, le

def run_data_prep(
    input_file: str,
    output_dir: str,
    test_size: float = 0.2,
    random_state: int = 42
):
    """Run the data-preparation stage end to end.

    Reads the CSV at ``input_file``, preprocesses it, splits it into train and
    test sets, and pickles the results into ``output_dir`` (created if it does
    not exist).

    Args:
        input_file: Path of the raw CSV file.
        output_dir: Directory that receives ``train.pkl``, ``test.pkl``,
            ``scaler.pkl`` and ``label_encoder.pkl``.
        test_size: Forwarded to ``train_test_split``.
        random_state: Forwarded to ``train_test_split``.

    NOTE(review): the scaler and encoder are fitted inside ``preprocess`` on
    the full dataset, i.e. before the train/test split below.
    """
    os.makedirs(output_dir, exist_ok=True)

    raw = read_dataframe(input_file)
    frame, scaler, encoder = preprocess(raw, StandardScaler(), LabelEncoder())

    # 'stroke' is the prediction target; every other column is a feature.
    target = frame['stroke']
    features = frame.drop(columns=['stroke'])
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=test_size, random_state=random_state
    )

    # File name -> object to pickle, written in this order.
    artifacts = {
        'train.pkl': (X_train, y_train),
        'test.pkl': (X_test, y_test),
        'scaler.pkl': scaler,
        'label_encoder.pkl': encoder,
    }
    for file_name, payload in artifacts.items():
        dump_pickle(payload, os.path.join(output_dir, file_name))

# Execute the preprocessing stage: read the stroke CSV and write the pickled
# train/test splits, scaler and label encoder into ./models.
run_data_prep(input_file='./data/healthcare-dataset-stroke-data.csv', output_dir='./models')

# Training

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import mlflow
import os
import pickle

In [None]:
# Send runs to the MLflow tracking server and group them under the
# "random-forest-train" experiment.
# NOTE(review): "experiment-tracking" is presumably a service hostname on the
# local container network — confirm it resolves where this notebook runs.
mlflow.set_tracking_uri("http://experiment-tracking:5000")
mlflow.set_experiment("random-forest-train")

def load_pickle(filename: str):
    """Read ``filename`` in binary mode and return the unpickled object.

    Unpickling can execute arbitrary code, so only load files that this
    pipeline wrote itself.
    """
    with open(filename, mode='rb') as source:
        loaded = pickle.load(source)
    return loaded

def run_train(data_path: str):
    """Train a baseline random forest and record it in MLflow.

    Enables scikit-learn autologging, loads ``train.pkl`` and ``test.pkl``
    from ``data_path``, fits a ``RandomForestClassifier(random_state=42)``
    inside a single MLflow run, and logs the test-set accuracy under the
    metric name ``accuracy``.

    Args:
        data_path: Directory containing the pickled train/test splits.
    """
    mlflow.sklearn.autolog()

    train_file = os.path.join(data_path, 'train.pkl')
    test_file = os.path.join(data_path, 'test.pkl')
    X_train, y_train = load_pickle(train_file)
    X_test, y_test = load_pickle(test_file)

    with mlflow.start_run():
        model = RandomForestClassifier(random_state=42)
        model.fit(X_train, y_train)

        # Score the fitted model on the held-out split.
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        mlflow.log_metric("accuracy", accuracy)

# Execute the training stage on the splits stored in ./models.
run_train(data_path='./models')

# Hyperparameter optimization (HPO)

In [None]:
import os
import pickle
import mlflow
import optuna

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [None]:
# Send runs to the MLflow tracking server and group every optimization trial
# under the "random-forest-hyperopt" experiment.
mlflow.set_tracking_uri("http://experiment-tracking:5000")
mlflow.set_experiment("random-forest-hyperopt")


def load_pickle(filename: str):
    """Return the object stored in the pickle file ``filename``.

    The file is opened read-only in binary mode. Pickle data can run
    arbitrary code when loaded, so use this only on trusted files.
    """
    with open(filename, 'rb') as stream:
        return pickle.Unpickler(stream).load()

def run_optimization(data_path: str, num_trials: int):
    """Tune random-forest hyperparameters with Optuna, logging trials to MLflow.

    Loads ``train.pkl`` and ``test.pkl`` from ``data_path`` and runs
    ``num_trials`` Optuna trials that maximize test-set accuracy. Every trial
    is recorded as its own MLflow run containing the sampled parameters and
    the resulting ``accuracy`` metric.

    Args:
        data_path: Directory containing the pickled train/test splits.
        num_trials: Number of Optuna trials to run.
    """
    # Load the preprocessed data
    X_train, y_train = load_pickle(os.path.join(data_path, 'train.pkl'))
    X_test, y_test = load_pickle(os.path.join(data_path, 'test.pkl'))

    # Disable autologging to avoid conflicts with Optuna
    mlflow.sklearn.autolog(disable=True)

    def objective(trial):
        """Train one forest with sampled hyperparameters; return its accuracy."""
        # The step size is left at its default of 1. Passing it positionally
        # (as a fourth argument) is deprecated in recent Optuna releases,
        # where `step` is keyword-only.
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 10, 50),
            'max_depth': trial.suggest_int('max_depth', 1, 20),
            'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
            'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 4),
            'random_state': 42,
            'n_jobs': -1,
        }

        with mlflow.start_run():
            mlflow.log_params(params)
            # Create the model with the suggested hyperparameters
            rf = RandomForestClassifier(**params)
            rf.fit(X_train, y_train)
            y_pred = rf.predict(X_test)

            accuracy = accuracy_score(y_test, y_pred)
            mlflow.log_metric("accuracy", accuracy)

        return accuracy

    # Create a study and optimize the objective function
    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=num_trials)

# Execute the HPO stage: 10 Optuna trials on the splits stored in ./models.
run_optimization(data_path='./models', num_trials=10)

# Register model

In [None]:
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

import mlflow
from mlflow.tracking import MlflowClient
from mlflow.entities import ViewType

In [None]:
# Experiment whose runs (written by the HPO stage) are searched for candidates.
HPO_EXPERIMENT_NAME = "random-forest-hyperopt"
# Experiment that receives the retrained candidate models.
EXPERIMENT_NAME = "random-forest-best-models"
# Hyperparameter names that train_and_log_model casts to int before training.
RF_PARAMS = ['max_depth', 'n_estimators', 'min_samples_split', 'min_samples_leaf', 'random_state', 'n_jobs']

# Runs started from here on are logged under EXPERIMENT_NAME on the tracking
# server.
mlflow.set_tracking_uri("http://experiment-tracking:5000")
mlflow.set_experiment(EXPERIMENT_NAME)

def train_and_log_model(params, data_path: str = './models'):
    """Retrain a random forest from logged hyperparameters and log it to MLflow.

    Args:
        params: Mapping of hyperparameter name to value, as stored on an
            MLflow run (values may be strings). Must contain every key listed
            in ``RF_PARAMS``. The mapping is not modified.
        data_path: Directory containing ``train.pkl`` and ``test.pkl``.

    Side effects:
        Starts one MLflow run in the active experiment and logs the
        hyperparameters, the test ``accuracy`` metric, and the fitted model
        under the artifact path ``model``.

    Raises:
        KeyError: If ``params`` lacks one of the ``RF_PARAMS`` keys.
    """
    # Both splits come from the same directory.
    X_train, y_train = load_pickle(os.path.join(data_path, 'train.pkl'))
    X_test, y_test = load_pickle(os.path.join(data_path, 'test.pkl'))

    # Build a fresh dict so the caller's mapping (e.g. run.data.params) stays
    # as it was, and only known forest parameters reach the estimator.
    rf_params = {name: int(params[name]) for name in RF_PARAMS}

    with mlflow.start_run():
        # Record the hyperparameters so the run is self-describing.
        mlflow.log_params(rf_params)

        # Train the Random Forest Classifier
        rf_classifier = RandomForestClassifier(**rf_params)
        rf_classifier.fit(X_train, y_train)
        y_pred = rf_classifier.predict(X_test)

        # Calculate accuracy
        accuracy = accuracy_score(y_test, y_pred)
        mlflow.log_metric('accuracy', accuracy)

        # Log the model
        mlflow.sklearn.log_model(rf_classifier, artifact_path="model")


def _require_experiment(client, name):
    """Return the MLflow experiment called ``name`` or raise ValueError."""
    experiment = client.get_experiment_by_name(name)
    if experiment is None:
        raise ValueError(f"MLflow experiment {name!r} does not exist")
    return experiment


def run_register_model(data_path: str, top_n: int):
    """Retrain the best HPO candidates and register the most accurate model.

    Takes the ``top_n`` runs with the highest ``accuracy`` from the
    ``HPO_EXPERIMENT_NAME`` experiment, retrains each via
    ``train_and_log_model`` (logged under ``EXPERIMENT_NAME``), then registers
    the model of the most accurate run in that experiment as
    ``rf-best-model``.

    Args:
        data_path: Directory containing ``train.pkl`` and ``test.pkl``.
        top_n: Number of HPO runs to retrain.

    Raises:
        ValueError: If either experiment is missing, or if the best-models
            experiment contains no runs to register.
    """
    client = MlflowClient()

    # Retrieve the top_n model runs and log the models
    hpo_experiment = _require_experiment(client, HPO_EXPERIMENT_NAME)
    runs = client.search_runs(
        experiment_ids=[hpo_experiment.experiment_id],
        run_view_type=ViewType.ACTIVE_ONLY,
        max_results=top_n,
        order_by=["metrics.accuracy DESC"],
    )
    for run in runs:
        train_and_log_model(params=run.data.params, data_path=data_path)

    # Select the model with the highest test accuracy; only the top row is
    # needed, so ask the server for a single result.
    experiment = _require_experiment(client, EXPERIMENT_NAME)
    best_runs = client.search_runs(
        experiment_ids=[experiment.experiment_id],
        run_view_type=ViewType.ACTIVE_ONLY,
        max_results=1,
        order_by=["metrics.accuracy DESC"],
    )
    if not best_runs:
        raise ValueError(
            f"No runs found in experiment {EXPERIMENT_NAME!r}; nothing to register"
        )

    # Register the best model
    run_id = best_runs[0].info.run_id
    model_uri = f"runs:/{run_id}/model"
    mlflow.register_model(model_uri, name="rf-best-model")


# Execute the registration stage: retrain the 5 best HPO runs and register
# the most accurate resulting model as "rf-best-model".
run_register_model("./models/", 5)