In [1]:
import mlflow
import os
import pandas as pd

from mlflow.tracking import MlflowClient
from mlflow.models import infer_signature
from mlflow.store.artifact.artifact_repository_registry import get_artifact_repository

from sklearn.datasets import fetch_california_housing
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

In [2]:
os.getenv("MLFLOW_TRACKING_URI", "No env")

'http://mlflow-service:5000'

In [3]:
mlflow.get_registry_uri()

'http://mlflow-service:5000'

In [4]:
exp_id = mlflow.create_experiment(name="Alex Trigolos")
# mlflow.delete_experiment(experiment_id=exp_id)
exp_id

'921231301415440617'

In [5]:
mlflow.search_experiments(
    filter_string="name = 'Alex Trigolos'"
)

[<Experiment: artifact_location='s3://testmlops/mlflow/921231301415440617', creation_time=1727898920227, experiment_id='921231301415440617', last_update_time=1727898920227, lifecycle_stage='active', name='Alex Trigolos', tags={}>]

In [6]:
data = pd.read_csv('https://raw.githubusercontent.com/edaehn/python_tutorials/main/titanic/train.csv', index_col=0)

prepare_data = data.fillna({"Embarked" : "S"})
prepare_data['Age'] = prepare_data.groupby(['Sex', 'Pclass'])['Age'].transform(lambda x: x.fillna(x.median()))
prepare_data = pd.get_dummies(prepare_data, columns=["Sex", "Pclass", "Embarked"])
prepared_data = prepare_data.drop(["Name", "Ticket", "Cabin"], axis=1)
X = prepared_data.drop("Survived", axis=1)
y = prepared_data["Survived"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5)

In [7]:
MODELS = {
    "RandomForestClassifier": {
        "model": RandomForestClassifier(),
        "params": {
            "n_estimators": range(50, 500, 50),
            "max_depth": [None, 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 30],
            "min_samples_split": [1, 2, 3, 5, 7, 8, 10]
        }
    },
    "DecisionTreeClassifier": {
        "model": DecisionTreeClassifier(),
        "params": {
            "max_depth": [None, 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 30],
            "min_samples_split": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
        }
    },
    "GradientBoostingClassifier": {
        "model": GradientBoostingClassifier(),
        "params": {
            "n_estimators": range(50, 500, 50),
            "learning_rate": [0.001, 0.005, 0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.7, 0.8, 1],
            "max_depth": [1, 3, 5, 7, 9, 11, 13, 15, 17, 19]
        }
    }
}

In [None]:
with mlflow.start_run(run_name="ursatap", experiment_id = exp_id, description = "parent") as parent_run:
    for model_name in MODELS.keys():
        with mlflow.start_run(run_name=model_name, experiment_id=exp_id, nested=True) as child_run:
            model = MODELS[model_name]["model"]
            param_grid = MODELS[model_name]["params"]
        
            grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring='accuracy', cv=5, verbose=1)
            grid_search.fit(X_train, y_train)

            best_model = grid_search.best_estimator_
            prediction = best_model.predict(X_val)

            eval_df = X_val.copy()
            eval_df["target"] = y_val

            signature = infer_signature(X_test, prediction)
            model_info = mlflow.sklearn.log_model(best_model, model_name, signature=signature, registered_model_name=f"sk-learn-{model_name}-model")
            mlflow.evaluate(
                model=model_info.model_uri,
                data=eval_df,
                targets="target",
                model_type="classifier",
                evaluators=["default"],
            )

Fitting 5 folds for each of 1071 candidates, totalling 5355 fits


765 fits failed out of a total of 5355.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
765 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/conda/lib/python3.11/site-packages/sklearn/base.py", line 1144, in wrapper
    estimator._validate_params()
  File "/opt/conda/lib/python3.11/site-packages/sklearn/base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "/opt/conda/lib/python3.11/site-packages/sklearn/utils/_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameterError(
s

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

2024/10/02 20:20:52 INFO mlflow.models.evaluation.default_evaluator: Computing model predictions.
2024/10/02 20:20:52 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.
2024/10/02 20:20:52 INFO mlflow.models.evaluation.default_evaluator: Testing metrics on first row...
2024/10/02 20:20:54 INFO mlflow.tracking._tracking_service.client: 🏃 View run RandomForestClassifier at: http://mlflow-service:5000/#/experiments/921231301415440617/runs/34f5300a9f62492c8e7283520a290578.
2024/10/02 20:20:54 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://mlflow-service:5000/#/experiments/921231301415440617.


Fitting 5 folds for each of 170 candidates, totalling 850 fits


85 fits failed out of a total of 850.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
85 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/conda/lib/python3.11/site-packages/sklearn/base.py", line 1144, in wrapper
    estimator._validate_params()
  File "/opt/conda/lib/python3.11/site-packages/sklearn/base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "/opt/conda/lib/python3.11/site-packages/sklearn/utils/_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameterError(
skle

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

2024/10/02 20:21:00 INFO mlflow.models.evaluation.default_evaluator: Computing model predictions.
2024/10/02 20:21:00 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.
2024/10/02 20:21:01 INFO mlflow.models.evaluation.default_evaluator: Testing metrics on first row...
2024/10/02 20:21:02 INFO mlflow.tracking._tracking_service.client: 🏃 View run DecisionTreeClassifier at: http://mlflow-service:5000/#/experiments/921231301415440617/runs/9c2e269192644a4aba186f276b09249d.
2024/10/02 20:21:02 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://mlflow-service:5000/#/experiments/921231301415440617.


Fitting 5 folds for each of 1080 candidates, totalling 5400 fits
