In [3]:
import os

import joblib
import mlflow
import mlflow.sklearn
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [4]:
# Route every run started below to this experiment; MLflow creates it on first use.
EXPERIMENT_NAME = "breast_cancer_classification/random_forest/baseline"
mlflow.set_experiment(EXPERIMENT_NAME)

2025/10/22 19:18:33 INFO mlflow.tracking.fluent: Experiment with name 'breast_cancer_classification/random_forest/baseline' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///Users/dongnd/MLOps/mlops_tutorial_02/mlruns/726829507557326048', creation_time=1761135513518, experiment_id='726829507557326048', last_update_time=1761135513518, lifecycle_stage='active', name='breast_cancer_classification/random_forest/baseline', tags={}>

In [5]:
# Load the breast-cancer dataset and separate the feature matrix from the labels.
data = load_breast_cancer()
X, y = data.data, data.target

# Hold out 20% of the samples for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the scaler on the training split only, then apply the same transform to both
# splits so that no statistics from the test split influence the scaling.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [10]:
# Train a random-forest classifier and record the whole experiment in a single
# MLflow run: descriptive tags, hyper-parameters, test-set metrics, the fitted
# model, and the fitted scaler that produced the model's input features.
with mlflow.start_run(run_name='random_forest_tuning_01'):
    # Run metadata, logged as one batch rather than one call per tag.
    mlflow.set_tags({
        'owner': 'ds_team',
        'model_type': 'RandomForestClassifier',
        'dataset': 'breast_cancer',
        'dataset_version': '1.0',
        'environment': 'development',
    })

    # Hyper-parameters are logged before training so a failed fit still leaves
    # a record of what was attempted.
    params = {
        'n_estimators': 500,
        'max_depth': 8,
        'random_state': 42
    }
    mlflow.log_params(params)
    mlflow.log_param('scaler', 'StandardScaler')

    rf = RandomForestClassifier(**params)
    rf.fit(X_train, y_train)

    # Evaluate on the held-out split.
    y_pred = rf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    mlflow.log_metrics(
        {
            'accuracy': accuracy,
            'f1_score': f1,
            'precision': precision,
            'recall': recall
        }
    )

    # The forest was fitted on the scaled matrix, so the example rows are taken
    # from that same matrix. Supplying an input example lets MLflow store it
    # (and the signature it infers from it) with the model, documenting the
    # expected input layout for whoever loads the artifact.
    # NOTE(review): newer MLflow releases prefer `name=` over `artifact_path=`
    # for log_model; confirm against the installed version before switching.
    mlflow.sklearn.log_model(
        sk_model=rf,
        artifact_path='model',
        input_example=X_train[:5],
    )

    # The logged model expects already-scaled features, so the fitted scaler is
    # stored next to it under the 'preprocessor' artifact folder.
    os.makedirs('models', exist_ok=True)
    scaler_path = os.path.join('models', 'scaler.pkl')
    joblib.dump(scaler, scaler_path)
    mlflow.log_artifact(
        scaler_path,
        artifact_path='preprocessor'
    )



