# MLflow Example: Tracking Two Model Training Runs

In [9]:
# === Model Training and Experimentation ===

# Import necessary libraries
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_digits


In [10]:

# Load dataset
data = load_digits()
X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, test_size=0.2, random_state=42)

# Define a function to train a model and return its accuracy
def train_model(n_estimators, max_depth):
    # Train a RandomForestClassifier
    model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=42)
    model.fit(X_train, y_train)

    # Make predictions
    predictions = model.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)

    print(f"Model with n_estimators={n_estimators}, max_depth={max_depth} achieved accuracy={accuracy:.4f}")
    return model, accuracy

In [11]:

# Train models with different hyperparameter configurations
print("Training Model 1...")
model_1, acc_1 = train_model(n_estimators=20, max_depth=5)

Training Model 1...
Model with n_estimators=20, max_depth=5 achieved accuracy=0.9389


In [12]:
print("\nTraining Model 2...")
model_2, acc_2 = train_model(n_estimators=100, max_depth=10)



Training Model 2...
Model with n_estimators=100, max_depth=10 achieved accuracy=0.9722


Now adding MLFlow

In [13]:
import mlflow
import mlflow.sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_digits

In [14]:
# get this code from dagshub
import dagshub
dagshub.init(repo_owner='Bardakor', repo_name='mlflow-with-daghub', mlflow=True)

In [15]:
data = load_digits()
X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, test_size=0.2, random_state=42)

In [16]:
# Set MLflow experiment name. This will get created if it doesn't exist
import datetime
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
experiment_name = f"RandomForestExperiment_{timestamp}"
mlflow.set_experiment(experiment_name)
print(f"Using experiment: {experiment_name}")


2025/06/29 14:54:56 INFO mlflow.tracking.fluent: Experiment with name 'RandomForestExperiment_20250629_145455' does not exist. Creating a new experiment.


Using experiment: RandomForestExperiment_20250629_145455


In [17]:
def train_and_log_model(n_estimators, max_depth):
    with mlflow.start_run():
        # Train a RandomForestClassifier
        model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=42)
        model.fit(X_train, y_train)

        # Make predictions
        predictions = model.predict(X_test)
        accuracy = accuracy_score(y_test, predictions)

        # Log parameters and metrics (skip model logging due to DagHub compatibility)
        mlflow.log_param("n_estimators", n_estimators)
        mlflow.log_param("max_depth", max_depth)
        mlflow.log_metric("accuracy", accuracy)
        
        # Save model locally for Docker
        import os
        import joblib
        os.makedirs("models", exist_ok=True)
        joblib.dump(model, f"models/model_estimators_{n_estimators}_depth_{max_depth}.pkl")

        print(f"Logged RandomForest model with n_estimators={n_estimators}, max_depth={max_depth}, accuracy={accuracy:.4f}")
        return model, accuracy

In [18]:
# Train and log Model 1
print("Training and Logging Model 1...")
train_and_log_model(n_estimators=20, max_depth=5)

Training and Logging Model 1...
Logged RandomForest model with n_estimators=20, max_depth=5, accuracy=0.9389
🏃 View run caring-slug-232 at: https://dagshub.com/Bardakor/mlflow-with-daghub.mlflow/#/experiments/2/runs/e6f0a2e8cd7f4afc88baa107fc7d3288
🧪 View experiment at: https://dagshub.com/Bardakor/mlflow-with-daghub.mlflow/#/experiments/2


(RandomForestClassifier(max_depth=5, n_estimators=20, random_state=42),
 0.9388888888888889)

In [19]:
# Train and log Model 2
print("\nTraining and Logging Model 2...")
train_and_log_model(n_estimators=100, max_depth=10)


Training and Logging Model 2...
Logged RandomForest model with n_estimators=100, max_depth=10, accuracy=0.9722
🏃 View run whimsical-penguin-72 at: https://dagshub.com/Bardakor/mlflow-with-daghub.mlflow/#/experiments/2/runs/aaba7231740848f1ba2ffb8ab8d7335d
🧪 View experiment at: https://dagshub.com/Bardakor/mlflow-with-daghub.mlflow/#/experiments/2


(RandomForestClassifier(max_depth=10, random_state=42), 0.9722222222222222)

In [20]:
# Instructions to visualize results
print("\nTo view the results, run the following command in your terminal:")
print("mlflow ui")
print("Then navigate to http://127.0.0.1:5000 to explore the experiment results.")


To view the results, run the following command in your terminal:
mlflow ui
Then navigate to http://127.0.0.1:5000 to explore the experiment results.


In [21]:
# New cell - Save best model for Docker
import joblib
import os

# Determine which model performed better and save as best_model.pkl
# You'll need to capture the models and accuracies from the previous cells
os.makedirs("models", exist_ok=True)

# Assuming model from cell with n_estimators=100, max_depth=10 performs better
# Load the better performing model and save as best_model.pkl
best_model = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)
best_model.fit(X_train, y_train)
joblib.dump(best_model, "models/best_model.pkl")

print("Best model saved for Docker deployment!")

Best model saved for Docker deployment!
