In [1]:
import mlflow
import logging
import pandas as pd
from mlflow.tracking import MlflowClient
from mlflow.entities import ViewType
from pathlib import Path
from sklearn.metrics import f1_score, accuracy_score, recall_score, classification_report
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [2]:
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')


In [11]:
client = MlflowClient()
mlflow.set_tracking_uri('http://127.0.0.1:5000')

In [12]:
all_runs = mlflow.search_runs(search_all_experiments=True)

In [13]:
latest_exp_id = all_runs.experiment_id
latest_exp_name = all_runs

In [15]:
latest_exp_name

Unnamed: 0,run_id,experiment_id,status,artifact_uri,start_time,end_time


In [16]:
experiments = client.search_experiments(view_type=mlflow.entities.ViewType.ALL)
experiments

[<Experiment: artifact_location='file:C:/Users/tomas/Documents/Projects/Diabetes_project/artifacts/0', creation_time=1752086389728, experiment_id='0', last_update_time=1752086389728, lifecycle_stage='active', name='Default', tags={}>]

In [17]:
client.search_experiments(view_type=ViewType.ACTIVE_ONLY)

[<Experiment: artifact_location='file:C:/Users/tomas/Documents/Projects/Diabetes_project/artifacts/0', creation_time=1752086389728, experiment_id='0', last_update_time=1752086389728, lifecycle_stage='active', name='Default', tags={}>]

In [None]:
print(mlflow.get_tracking_uri())

http://127.0.0.1:5000


In [48]:
experiments = client.search_experiments(view_type=mlflow.entities.ViewType.ALL)
experiments_sorted = sorted(experiments, key=lambda x: x.creation_time, reverse=False)
latest_experiment = experiments_sorted
latest_experiment[-1]

<Experiment: artifact_location='mlflow-artifacts:/8', creation_time=1751823938865, experiment_id='8', last_update_time=1751823938865, lifecycle_stage='active', name='classification_experiment_v4', tags={}>

In [49]:
order = ['metrics.recall DESC',	'metrics.f1_macro DESC', 'metrics.accuracy DESC']

top_n = 4
runs = client.search_runs(
        experiment_ids=latest_experiment[-1].experiment_id,
        run_view_type=ViewType.ACTIVE_ONLY,
        max_results=top_n,
        order_by=order
    )

In [75]:
best_run_id = runs[0].info.run_id
best_run_name = runs[0].info.run_name
best_metrics = runs[0].data.metrics
best_params = runs[0].data.params
best_model = runs[0].data.tags['Model']


In [63]:
runs[0]

<Run: data=<RunData: metrics={'accuracy': 0.7385052034058657,
 'f1_macro': 0.6428998009326592,
 'recall': 0.7234448795577519}, params={'bagging_temperature': '0.5747841229769519',
 'border_count': '48',
 'depth': '9',
 'iterations': '500',
 'l2_leaf_reg': '0.2246009819377022',
 'learning_rate': '0.1785197460396047',
 'loss_function': 'Logloss',
 'random_seed': '42',
 'random_strength': '0.17099029185236003',
 'verbose': '0'}, tags={'Model': 'Catboost',
 'mlflow.runName': 'lyrical-wren-154',
 'mlflow.source.git.commit': 'fbe4afb8c6626d7aba190e51f6dfe8614c236887',
 'mlflow.source.name': '.\\Model\\train.py',
 'mlflow.source.type': 'LOCAL',
 'mlflow.user': 'David'}>, info=<RunInfo: artifact_uri='mlflow-artifacts:/8/f33d92e405604e4fb4b56a82a9e4dff0/artifacts', end_time=1751823949393, experiment_id='8', lifecycle_stage='active', run_id='f33d92e405604e4fb4b56a82a9e4dff0', run_name='lyrical-wren-154', start_time=1751823943042, status='FINISHED', user_id='David'>, inputs=<RunInputs: dataset_in

In [71]:
best_metrics

{'f1_macro': 0.6428998009326592,
 'accuracy': 0.7385052034058657,
 'recall': 0.7234448795577519}

In [97]:
best_params

{'bagging_temperature': '0.5747841229769519',
 'border_count': '48',
 'depth': '9',
 'iterations': '500',
 'l2_leaf_reg': '0.2246009819377022',
 'learning_rate': '0.1785197460396047',
 'loss_function': 'Logloss',
 'random_seed': '42',
 'random_strength': '0.17099029185236003',
 'verbose': '0'}

In [98]:
best_model

'Catboost'

In [99]:
def load_parquet(prefix: str):
    input_dir = Path.cwd().parent / "Data"

    X = pd.read_parquet(input_dir / f"X_{prefix}.parquet")
    logging.info(f"Loaded X from {input_dir / f'X_{prefix}.parquet'}")

    y = pd.read_parquet(input_dir / f"y_{prefix}.parquet").squeeze()
    logging.info(f"Loaded y from {input_dir / f'y_{prefix}_y.parquet'}")

    return X, y

In [106]:
def dict_change_dtypes(params):
    parsed_params = {}
    for k, v in params.items():
        try:
            parsed_val = eval(v)  # pozor, eval může být nebezpečný u nedůvěryhodných vstupů
        except:
            parsed_val = v

        parsed_params[k] = parsed_val
        
    return parsed_params

In [120]:
def catboost_train(params):
    X_train, y_train = load_parquet(prefix = 'train')
    X_test, y_test = load_parquet(prefix = 'test')

    categorical_features_indices = [X_train.columns.get_loc(col) for col in ['Age', 'GenHlth', 'Education', 'Income']]

    params = dict_change_dtypes(params)
    with mlflow.start_run() as run:
        mlflow.set_tag('Model', 'Catboost')
        mlflow.set_tag("Stage", "Final_model")
        mlflow.log_params(params)

        model = CatBoostClassifier(
            **params,
            cat_features=categorical_features_indices,
            early_stopping_rounds=50,
            eval_metric='TotalF1'
        )
        model.fit(X_train, 
                  y_train, 
                  eval_set=(X_test, y_test), 
                  use_best_model=True, 
                  verbose=0)

        y_pred = model.predict(X_test)  
        # Logging important metrics
        score = f1_score(y_test, y_pred, average='macro')
        mlflow.log_metric('f1_macro', score)
        print(classification_report(y_test, y_pred))    
        accuracy = accuracy_score(y_test, y_pred)
        mlflow.log_metric('accuracy', accuracy) 
        recall = recall_score(y_test, y_pred, average='macro')
        mlflow.log_metric('recall', recall)

        mlflow.catboost.log_model(model, artifact_path="model")

        run_id = run.info.run_id
        model_uri = f"runs:/{run_id}/model"
        mlflow.register_model(model_uri, name="final-catboost-model")
        logging.info(f'Best Catboost model has been logged into {model_uri}')



In [118]:
mlflow.set_experiment('classification_experiment_v4')

<Experiment: artifact_location='mlflow-artifacts:/8', creation_time=1751823938865, experiment_id='8', last_update_time=1751823938865, lifecycle_stage='active', name='classification_experiment_v4', tags={}>

In [119]:
catboost_train(best_params)

2025-07-06 20:54:27,685 - INFO - Loaded X from c:\Users\tomas\Documents\Projects\Diabetes_project\Data\X_train.parquet
2025-07-06 20:54:27,687 - INFO - Loaded y from c:\Users\tomas\Documents\Projects\Diabetes_project\Data\y_train_y.parquet
2025-07-06 20:54:27,692 - INFO - Loaded X from c:\Users\tomas\Documents\Projects\Diabetes_project\Data\X_test.parquet
2025-07-06 20:54:27,696 - INFO - Loaded y from c:\Users\tomas\Documents\Projects\Diabetes_project\Data\y_test_y.parquet


              precision    recall  f1-score   support

           0       0.93      0.75      0.83    213703
           1       0.34      0.70      0.46     39977

    accuracy                           0.74    253680
   macro avg       0.64      0.72      0.64    253680
weighted avg       0.84      0.74      0.77    253680

🏃 View run spiffy-snipe-633 at: http://127.0.0.1:5000/#/experiments/8/runs/aca2f3d42a7b48fea239f57a6e8ec630
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/8


In [25]:
mlflow.set_tracking_uri("http://127.0.0.1:5000")

In [26]:
client = MlflowClient()
experiments = client.search_experiments()
partial_name = 'classification_experiment'
matching_experiments = sorted(
    [exp for exp in experiments if partial_name.lower() in exp.name.lower()],
    key=lambda x: x.creation_time,
    reverse=True
)
if matching_experiments:
    mlflow.set_experiment(matching_experiments[0].name)
    logging.info(f'Experiment "{matching_experiments[0].name}" selected by partial match.')
else:
    logging.warning(f'No experiment matched partial name "{partial_name}".')

2025-07-08 15:35:19,085 - INFO - Experiment "classification_experiment_v9" selected by partial match.


In [None]:
register_models = client.search_registered_models()

In [30]:
model_name = 'final-xgboost-model'
version = client.get_latest_versions(model_name)

  version = client.get_latest_versions(model_name)


In [45]:
versions = client.search_model_versions(f"name='{model_name}'")
version = versions[0].version

In [47]:
version_id = versions[0].run_id
version_id 

'8b475ef68f1c4493abbaf224814d5aa4'

In [None]:
client.set_model_version_tag(model_name, version, "validated_by", "QA")
client.set_model_version_tag(model_name, version, "stage", "production")   


In [49]:
client.set_registered_model_alias(name=model_name,alias="Champion", version=version)

In [51]:
client.set_model_version_tag(
    name=model_name,
    version=version,
    key="created_by",
    value="David"
)