In [1]:
import mlflow
import mlflow.sklearn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
import os
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, ConfusionMatrixDisplay

# Load the Iris dataset and hold out 20% of the samples for evaluation.
# The fixed seed keeps the split identical across notebook re-runs.
iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(
    iris.data,
    iris.target,
    test_size=0.2,
    random_state=42,
)

# Five RandomForestClassifier hyperparameter combinations, one per MLflow run.
experiment_configs = [
    dict(n_estimators=10, max_depth=2, min_samples_split=2),
    dict(n_estimators=50, max_depth=5, min_samples_split=5),
    dict(n_estimators=100, max_depth=None, min_samples_split=2),
    dict(n_estimators=200, max_depth=10, min_samples_split=10),
    dict(n_estimators=150, max_depth=7, min_samples_split=4),
]

# All runs started below are grouped under this MLflow experiment.
mlflow.set_experiment("FloraAI_Iris_Classification")

def train_and_log(params, run_name):
    """Train a random forest on the Iris split and record the run in MLflow.

    Inside a single MLflow run this fits a ``RandomForestClassifier`` on the
    module-level ``X_train``/``y_train``, evaluates it on ``X_test``/``y_test``,
    and logs the hyperparameters, the evaluation metrics, a confusion-matrix
    plot, a feature-importance plot, and the fitted model.

    Args:
        params: Keyword arguments forwarded to ``RandomForestClassifier``
            (``random_state=42`` is always added, so it must not be in
            ``params``). The same dict is logged as the run's parameters.
        run_name: Name given to the MLflow run; also used in the confusion
            matrix title and in the completion message.

    Returns:
        None. Results are recorded in MLflow and a summary line is printed.
    """
    with mlflow.start_run(run_name=run_name):
        # perf_counter is monotonic, so the measured duration cannot be
        # skewed by system clock adjustments the way time.time() can.
        start_time = time.perf_counter()

        # Train Model
        rf = RandomForestClassifier(**params, random_state=42)
        rf.fit(X_train, y_train)

        training_time = time.perf_counter() - start_time
        y_pred = rf.predict(X_test)

        # Calculate Metrics (weighted averages account for class support).
        metrics = {
            "accuracy": accuracy_score(y_test, y_pred),
            "precision": precision_score(y_test, y_pred, average='weighted'),
            "recall": recall_score(y_test, y_pred, average='weighted'),
            "f1_score": f1_score(y_test, y_pred, average='weighted'),
            "training_time": training_time
        }

        # Log Parameters and Metrics
        mlflow.log_params(params)
        mlflow.log_metrics(metrics)

        # Artifact 1: Confusion Matrix
        # log_figure stores the plot directly as a run artifact, so no image
        # file is left behind in (or overwritten in) the working directory.
        fig, ax = plt.subplots(figsize=(6, 6))
        try:
            ConfusionMatrixDisplay.from_estimator(rf, X_test, y_test, ax=ax)
            ax.set_title(f"Confusion Matrix - {run_name}")
            mlflow.log_figure(fig, "confusion_matrix.png")
        finally:
            # Close this specific figure even if plotting/logging fails.
            plt.close(fig)

        # Artifact 2: Feature Importance
        fig, ax = plt.subplots()
        try:
            importances = rf.feature_importances_
            # Ascending order puts the most important feature at the top
            # of the horizontal bar chart.
            indices = np.argsort(importances)
            positions = range(len(indices))
            ax.set_title('Feature Importances')
            ax.barh(positions, importances[indices], align='center')
            ax.set_yticks(positions)
            ax.set_yticklabels([iris.feature_names[i] for i in indices])
            mlflow.log_figure(fig, "feature_importance.png")
        finally:
            plt.close(fig)

        # Log Model
        mlflow.sklearn.log_model(rf, "model")

        print(f"Finished {run_name}: Accuracy = {metrics['accuracy']}")

# Launch one tracked run per hyperparameter set; run names are numbered from 1.
for run_number, config in enumerate(experiment_configs, start=1):
    run_label = f"Experiment_Run_{run_number}"
    train_and_log(config, run_label)

ModuleNotFoundError: No module named 'mlflow'