In [1]:
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


# Load the Iris dataset
X, y = datasets.load_iris(return_X_y=True)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


def train_model(X_train, y_train, params):
    """
    Train a logistic regression model with the given parameters.
    
    Args:
        X_train (array-like): Training features.
        y_train (array-like): Training labels.
        params (dict): Hyperparameters for the logistic regression model.
        
    Returns:
        LogisticRegression: Trained logistic regression model.
    """
    # train the logistic regression model
    lr = LogisticRegression(**params)
    lr.fit(X_train, y_train)
    # Predict on the test set
    y_pred = lr.predict(X_test)
    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    # Create a DataFrame to hold the metrics
    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1_score": f1,
        "model": lr
    }

In [2]:

# Define the model hyperparameters
params = {
    "solver": "lbfgs",
    "max_iter": 1000,
    "multi_class": "auto",
    "random_state": 43,
}

# Train the model and get the metrics
metrics = train_model(X_train, y_train, params)



In [3]:
metrics

{'accuracy': 1.0,
 'precision': 1.0,
 'recall': 1.0,
 'f1_score': 1.0,
 'model': LogisticRegression(max_iter=1000, multi_class='auto', random_state=43)}

In [4]:
import mlflow
from mlflow.models import infer_signature
# Set our tracking server uri for logging
mlflow.set_tracking_uri("sqlite:///mlflow.db")
# Create a new MLflow Experiment
mlflow.set_experiment("LogisticRegression")

2025/05/26 17:25:13 INFO mlflow.tracking.fluent: Experiment with name 'LogisticRegression' does not exist. Creating a new experiment.


<Experiment: artifact_location='/Users/gbemidebe/Documents/GitHub/learning-mlops-zoomcamp/02-experiment-tracking/examples/case04/mlruns/3', creation_time=1748298313573, experiment_id='3', last_update_time=1748298313573, lifecycle_stage='active', name='LogisticRegression', tags={}>

In [9]:
# Start an MLflow run
with mlflow.start_run():
    # Define the model hyperparameters
    params = {
        "solver": "lbfgs",
        "max_iter": 50,
        "multi_class": "auto",
        "random_state": 123,
    }
    # Log the hyperparameters
    mlflow.log_params(params)

    # Train the model
    results = train_model(X_train, y_train, params)

    # Log the loss metric
    mlflow.log_metric("accuracy", results["accuracy"])
    mlflow.log_metric("precision", results["precision"])
    mlflow.log_metric("recall", results["recall"])
    mlflow.log_metric("f1_score", results["f1_score"])

    # Set a tag that we can use to remind ourselves what this run was for
    mlflow.set_tag("Training Info", "Basic LR model for iris data")

    # Infer the model signature
    signature = infer_signature(X_train, results['model'].predict(X_train))

    # Log the model
    model_info = mlflow.sklearn.log_model(
        sk_model=results["model"],
        artifact_path="iris_model",
        signature=signature,
        input_example=X_train,
        registered_model_name="tracking-quickstart",
    )

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
Registered model 'tracking-quickstart' already exists. Creating a new version of this model...
Created version '4' of model 'tracking-quickstart'.


In [7]:
# load the model back
logged_model = 'runs:/26ce6c6dccf74635aa74c47b70e0baec/iris_model'
loaded_model = mlflow.pyfunc.load_model(logged_model)

In [8]:
loaded_model.predict(X_test)

array([1, 0, 2, 1, 1, 0, 1, 2, 1, 1, 2, 0, 0, 0, 0, 1, 2, 1, 1, 2, 0, 2,
       0, 2, 2, 2, 2, 2, 0, 0])

In [10]:
mlflow.search_experiments()

[<Experiment: artifact_location='/Users/gbemidebe/Documents/GitHub/learning-mlops-zoomcamp/02-experiment-tracking/examples/case04/mlruns/3', creation_time=1748298313573, experiment_id='3', last_update_time=1748298313573, lifecycle_stage='active', name='LogisticRegression', tags={}>,
 <Experiment: artifact_location='mlflow-artifacts:/0', creation_time=1748296312074, experiment_id='0', last_update_time=1748296312074, lifecycle_stage='active', name='Default', tags={}>]

In [11]:
# Create a new MLflow Experiment
mlflow.set_experiment("XGBRegression")

2025/05/26 18:14:22 INFO mlflow.tracking.fluent: Experiment with name 'XGBRegression' does not exist. Creating a new experiment.


<Experiment: artifact_location='/Users/gbemidebe/Documents/GitHub/learning-mlops-zoomcamp/02-experiment-tracking/examples/case04/mlruns/4', creation_time=1748301262517, experiment_id='4', last_update_time=1748301262517, lifecycle_stage='active', name='XGBRegression', tags={}>

In [18]:
from xgboost import XGBClassifier

In [19]:
XGBClassifier.fit??

[0;31mSignature:[0m
[0mXGBClassifier[0m[0;34m.[0m[0mfit[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mself[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mX[0m[0;34m:[0m [0mAny[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0my[0m[0;34m:[0m [0mAny[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0;34m*[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0msample_weight[0m[0;34m:[0m [0mOptional[0m[0;34m[[0m[0mAny[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mbase_margin[0m[0;34m:[0m [0mOptional[0m[0;34m[[0m[0mAny[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0meval_set[0m[0;34m:[0m [0mOptional[0m[0;34m[[0m[0mSequence[0m[0;34m[[0m[0mTuple[0m[0;34m[[0m[0mAny[0m[0;34m,[0m [0mAny[0m[0;34m][0m[0;34m][0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mverbose[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mbool[0m[0;34m,[0m [0mint[0m[0;34m

In [25]:
params

{'solver': 'lbfgs', 'max_iter': 50, 'multi_class': 'auto', 'random_state': 123}

In [26]:
md = XGBClassifier(
    **params)

In [27]:
md.fit(X=X_train, y=y_train, eval_set= [(X_test, y_test)], 
        verbose=False)

Parameters: { "max_iter", "multi_class", "solver" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [None]:
from sklearn.metrics import root_mean_squared_error, r2_score

In [40]:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

def objective(params):
    with mlflow.start_run():
        mlflow.set_tag("model", "xgboost")
        mlflow.log_params(params)
        # initialize the model
        md = XGBClassifier(**params)
        # Train the model
        md.fit(X_train, y_train, eval_set= [(X_test, y_test)], verbose=False)
        # Log the model
        y_pred = md.predict(X_test)
        mlflow.log_metric("rmse", root_mean_squared_error(y_test, y_pred))
        mlflow.log_metric("accuracy", accuracy_score(y_test, y_pred))
        mlflow.log_metric('precision', precision_score(y_test, y_pred, average='weighted'))
        mlflow.log_metric('recall', recall_score(y_test, y_pred, average='weighted'))
        mlflow.log_metric('f1_score', f1_score(y_test, y_pred, average='weighted'))
        mlflow.xgboost.log_model(md, artifact_path="xgboost_model")

    return {
            'loss': root_mean_squared_error(y_test, y_pred), 
            'status': STATUS_OK}

In [41]:
search_space = {
    'max_depth': scope.int(hp.quniform('max_depth', 1, 100, 10)),
    'learning_rate': hp.loguniform('learning_rate', -3, 0),
    'reg_alpha': hp.loguniform('reg_alpha', -5, -1),
    'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
    'min_child_weight': hp.loguniform('min_child_weight', -1, 3),
    'objective': 'reg:squarederror',
    'seed': 42
}

best_result = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=10,
    trials=Trials()
)

  0%|          | 0/10 [00:00<?, ?trial/s, best loss=?]

  self.get_booster().save_model(fname)



 10%|█         | 1/10 [00:02<00:21,  2.39s/trial, best loss: 0.0]



  self.get_booster().save_model(fname)



 20%|██        | 2/10 [00:04<00:16,  2.02s/trial, best loss: 0.0]



  self.get_booster().save_model(fname)



 30%|███       | 3/10 [00:06<00:13,  1.94s/trial, best loss: 0.0]



  self.get_booster().save_model(fname)



 40%|████      | 4/10 [00:07<00:11,  1.86s/trial, best loss: 0.0]



  self.get_booster().save_model(fname)



 50%|█████     | 5/10 [00:09<00:09,  1.84s/trial, best loss: 0.0]



  self.get_booster().save_model(fname)



 60%|██████    | 6/10 [00:11<00:07,  1.84s/trial, best loss: 0.0]



  self.get_booster().save_model(fname)



 70%|███████   | 7/10 [00:13<00:05,  1.86s/trial, best loss: 0.0]



  self.get_booster().save_model(fname)



 80%|████████  | 8/10 [00:15<00:03,  1.83s/trial, best loss: 0.0]



  self.get_booster().save_model(fname)



 90%|█████████ | 9/10 [00:16<00:01,  1.77s/trial, best loss: 0.0]



  self.get_booster().save_model(fname)



100%|██████████| 10/10 [00:18<00:00,  1.86s/trial, best loss: 0.0]




