In [11]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
import mlflow
from mlflow.models import infer_signature
from sklearn.metrics import classification_report,accuracy_score

In [2]:
# Load the Iris dataset and assemble one DataFrame: a column per feature
# (named after iris.feature_names) followed by the class label in 'target'.
iris = load_iris()
data = pd.DataFrame(iris.data, columns=iris.feature_names).assign(target=iris.target)


In [3]:
# Preview the first five rows to sanity-check the column layout.
data.head(n=5)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [4]:
# Report how many null entries each column contains.
missing_per_column = data.isnull().sum()
print("\nMissing values:\n", missing_per_column)

# Separate the feature columns (everything except the last column) from the label.
feature_frame = data.iloc[:, :-1]
y = data['target']

# Standardise the features only; the label column is left untouched.
# NOTE(review): the scaler is fitted on every row before the train/test split
# that follows, so test-set statistics influence the scaling. Consider fitting
# on the training portion only (e.g. via a Pipeline) -- confirm intent.
scaler = StandardScaler()
X = scaler.fit(feature_frame).transform(feature_frame)



Missing values:
 sepal length (cm)    0
sepal width (cm)     0
petal length (cm)    0
petal width (cm)     0
target               0
dtype: int64


In [5]:
# Hold out 20% of the samples for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Point MLflow at the local tracking server.
# The URI must not carry leading whitespace: how a string such as
# " http://..." is parsed depends on the Python/urllib version, and the stray
# space also ends up in the run/experiment URLs that MLflow prints.
mlflow.set_tracking_uri(uri="http://127.0.0.1:5000")

2024/12/28 14:13:19 INFO mlflow.tracking.fluent: Experiment with name 'Basic ML project' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/359615374808457462', creation_time=1735413199073, experiment_id='359615374808457462', last_update_time=1735413199073, lifecycle_stage='active', name='Basic ML project', tags={}>

In [None]:
# Candidate hyper-parameter values explored by the grid search (3 x 3 x 3 = 27 combinations).
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
}
model = RandomForestClassifier(random_state=42)

# Exhaustive 5-fold cross-validated search scored by accuracy, using all CPU cores.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)

# Keep the estimator refitted with the winning combination and report that combination.
best_model = grid_search.best_estimator_
print("\nBest parameters found by GridSearchCV:", grid_search.best_params_)

Fitting 5 folds for each of 27 candidates, totalling 135 fits

Best parameters found by GridSearchCV: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}


In [16]:
# Select the experiment that subsequent runs are recorded under
# (the captured log below shows MLflow creating it when it does not exist yet).
mlflow.set_experiment(experiment_name="ML project")

2024/12/28 14:40:30 INFO mlflow.tracking.fluent: Experiment with name 'ML project' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/802956528055571557', creation_time=1735414830297, experiment_id='802956528055571557', last_update_time=1735414830297, lifecycle_stage='active', name='ML project', tags={}>

In [26]:
# Run the hyper-parameter search inside a single MLflow run so that the search
# space, the selected parameters, the accuracy metrics and the fitted model are
# all recorded together.
with mlflow.start_run():
    model = RandomForestClassifier(random_state=42)
    param_grid = {
        'n_estimators': [50, 100, 150],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5, 10]
    }

    # Log the grid search parameter grid as a string (since it's nested)
    mlflow.log_param("param_grid", str(param_grid))

    # Perform GridSearchCV
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='accuracy', verbose=2, n_jobs=-1)
    grid_search.fit(X_train, y_train)

    print("\nBest parameters found by GridSearchCV:", grid_search.best_params_)
    best_model = grid_search.best_estimator_

    # Log the best parameters
    mlflow.log_params(grid_search.best_params_)

    # Evaluate on the training set
    y_train_pred = best_model.predict(X_train)
    train_accuracy = accuracy_score(y_train, y_train_pred)
    print("\nTraining Accuracy:", train_accuracy)

    # Evaluate on the test set
    y_test_pred = best_model.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_test_pred)
    print("Test Accuracy:", test_accuracy)

    # Log the accuracy as metrics
    mlflow.log_metrics({"train_accuracy": train_accuracy, "test_accuracy": test_accuracy})

    # Set custom tags for the experiment
    mlflow.set_tags({"exp": "basic ML experiment tracking"})

    # Infer the signature from the model's own predictions rather than from the
    # ground-truth labels, so the recorded output schema describes what
    # `predict` actually returns. `infer_signature` comes from the notebook's
    # top-level imports; no local re-import is needed.
    signature = infer_signature(X_train, y_train_pred)

    # Log and register the model. Only a handful of rows are stored as the
    # input example -- it is meant as a small illustrative payload, not a copy
    # of the whole test set.
    mlflow.sklearn.log_model(
        sk_model=best_model,
        artifact_path="iris",
        signature=signature,
        input_example=X_test[:5],
        registered_model_name="RandomForest"
    )

Fitting 5 folds for each of 27 candidates, totalling 135 fits

Best parameters found by GridSearchCV: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}

Training Accuracy: 0.975
Test Accuracy: 1.0


Successfully registered model 'RandomForest'.
2024/12/28 14:48:00 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: RandomForest, version 1


🏃 View run vaunted-calf-865 at:  http://127.0.0.1:5000/#/experiments/802956528055571557/runs/838d609496554cfe9e363f6ec41e8471
🧪 View experiment at:  http://127.0.0.1:5000/#/experiments/802956528055571557


Created version '1' of model 'RandomForest'.


In [25]:
# Explicitly close any run that is still active, marking it as finished.
# NOTE(review): the tracked run above is opened with `with mlflow.start_run():`,
# which presumably ends the run on exit, so this call is likely only needed
# after an interrupted cell -- confirm before relying on it.
mlflow.end_run(status="FINISHED")