In [3]:
!pip install sklearn flaml mlflow

Collecting mlflow
  Downloading mlflow-2.3.1-py3-none-any.whl (17.7 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.7/17.7 MB[0m [31m23.0 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m0:01[0m:01[0m
Collecting cloudpickle<3 (from mlflow)
  Using cached cloudpickle-2.2.1-py3-none-any.whl (25 kB)
Collecting databricks-cli<1,>=0.8.7 (from mlflow)
  Downloading databricks-cli-0.17.7.tar.gz (83 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m83.5/83.5 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting entrypoints<1 (from mlflow)
  Downloading entrypoints-0.4-py3-none-any.whl (5.3 kB)
Collecting gitpython<4,>=2.1.0 (from mlflow)
  Downloading GitPython-3.1.31-py3-none-any.whl (184 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m184.3/184.3 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m
Collecting pro

[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m65.5/65.5 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
Collecting pyparsing>=2.3.1 (from matplotlib<4->mlflow)
  Downloading pyparsing-3.0.9-py3-none-any.whl (98 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.3/98.3 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
Collecting greenlet!=0.4.17 (from sqlalchemy<3,>=1.4.0->mlflow)
  Downloading greenlet-2.0.2-cp39-cp39-macosx_11_0_x86_64.whl (241 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m241.4/241.4 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting smmap<6,>=3.0.1 (from gitdb<5,>=4.0.1->gitpython<4,>=2.1.0->mlflow)
  Downloading smmap-5.0.0-py3-none-any.whl (24 kB)
Collecting MarkupSafe>=2.0 (from Jinja2<4,>=2.11->mlflow)
  Downloading MarkupSafe-2.1.2-cp39-cp39-macosx_10_9_x86_64.whl (13 kB)
Building wheels for collected packages: databricks-cli
  Building wheel for databricks-c

In [None]:
# # For compilers to find libomp you may need to set:
# import os
# os.environ['LDFLAGS']="-L/usr/local/opt/libomp/lib"
# os.environ['CPPFLAGS']="-I/usr/local/opt/libomp/include"

In [6]:
import pandas as pd
import sklearn.datasets

# Load scikit-learn's bundled diabetes dataset as pandas objects.
data = sklearn.datasets.load_diabetes(as_frame=True)

# Build the modelling frame as a new object instead of adding a column to the
# feature frame stored inside the Bunch, so `data["data"]` stays features-only.
# NOTE(review): the label column is called "species" even though this is the
# diabetes dataset; the name is kept because the training cell sets
# label = "species".
df = data["data"].assign(species=data["target"])

In [16]:
from flaml import AutoML
import mlflow.sklearn
from mlflow.models.signature import infer_signature
import pandas as pd
from typing import Optional

# Experiment setup
# Name of the MLflow experiment the runs below are recorded under.
experiment_name = "Default"
# Name of the target column in `df`; log_best_model drops it to get the inputs.
label = "species"
# Work on a copy so later cells do not touch the frame loaded above.
df_automl = df.copy()
display(df_automl)
experiment = mlflow.set_experiment(experiment_name)

# The training cell opens its own run, so close every run that is still
# active first (e.g. one left open by an earlier execution of this notebook).
while mlflow.active_run():
    mlflow.end_run()

# Turn MLflow autologging off for this session.
mlflow.autolog(disable=True)


def set_best_run() -> Optional[str]:
    """Copy the winning run's params and metrics onto the active (parent) run.

    Searches the notebook-level ``experiment`` for runs whose ``best_config``
    param equals ``automl.best_config`` (both names are read from the
    notebook's global namespace). The first match is treated as the winner:
    the active run is tagged with its id, and its params and metrics are
    re-logged on the active run with the ``params.`` / ``metrics.`` column
    prefixes removed.

    Returns:
        The run id of the winning run, or ``None`` when no run matches.
    """
    best_run = mlflow.search_runs(
        experiment_ids=[experiment.experiment_id],
        filter_string=f'params.best_config = "{automl.best_config}"',
    )
    if best_run.empty:
        return None

    # Positional access: do not rely on the result frame's index labels.
    winner = best_run.iloc[0]
    best_run_id = winner["run_id"]
    mlflow.set_tag("Best run", best_run_id)

    def _collect(prefix: str) -> dict:
        """Return the winner's values for columns starting with ``prefix``, prefix removed."""
        # Slice the prefix off. str.lstrip() strips a *set of characters*, so
        # "params.max_iter".lstrip("params.") would yield "x_iter".
        # Null cells (columns with no value for this run) are skipped.
        return {
            column[len(prefix):]: value
            for column, value in winner.items()
            if column.startswith(prefix) and pd.notna(value)
        }

    # Copy the params and metrics from the winning run to the parent
    mlflow.log_params(_collect("params."))
    mlflow.log_metrics(_collect("metrics."))
    return best_run_id


def log_best_model(automl: AutoML, df_automl: pd.DataFrame) -> None:
    """Log the fitted AutoML object to the active MLflow run under "model".

    The first five rows of ``df_automl``, with the notebook-level ``label``
    column removed, are used as the input example. The model signature is
    inferred from those rows and the model's predictions on them.
    """
    sample_inputs = df_automl.drop(columns=label).head(5)
    sample_outputs = pd.DataFrame({label: automl.predict(sample_inputs)})
    model_signature = infer_signature(sample_inputs, sample_outputs)
    mlflow.sklearn.log_model(
        automl,
        "model",
        signature=model_signature,
        input_example=sample_inputs,
    )


# Parent run: the AutoML search, the fitted model and the copied best-run
# params/metrics are all logged while this run is active.
with mlflow.start_run() as run:
    run_name = run.data.tags["mlflow.runName"]
    print("Run name: ", run_name)
    # NOTE(review): `dca` is not imported in any visible cell — presumably it
    # is injected by the hosting notebook environment; confirm, otherwise this
    # line raises NameError on a fresh kernel.
    display(dca.automl.OpenExperiment(experiment_name))
    # NOTE(review): df_automl comes from load_diabetes, whose target holds many
    # distinct numeric values; task="classification" with metric="accuracy"
    # treats each distinct value as a class — confirm this is intended.
    automl = AutoML(metric="accuracy")
    automl.fit(
        dataframe=df_automl, 
        label=label, 
        task="classification", 
        time_budget=300, 
        max_iter=50, 
        eval_method="holdout", 
        split_ratio=0.1
    )
    
    # Save the winning run content
    log_best_model(automl, df_automl)
    # None when no run with the winning config was found; checked after this block.
    best_run_id = set_best_run()

    # Log the notebook cell execution history for reproducibility
    dca.mlflow_log_notebook(run_name)

    
# Tag the winning run as the winner and log the fitted model on it as well.
if best_run_id:
    # Re-open the winning run by id so the tags and the model land on that run.
    with mlflow.start_run(run_id=best_run_id):
        mlflow.set_tag("Status", "Winner")
        mlflow.set_tag("Winner", True)
        # `train` and `predictions` exist only as locals of log_best_model, so
        # referencing them here raised NameError; reuse the helper, which
        # builds the input example and signature itself.
        log_best_model(automl, df_automl)

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,species
0,0.038076,0.050680,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019907,-0.017646,151.0
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068332,-0.092204,75.0
2,0.085299,0.050680,0.044451,-0.005670,-0.045599,-0.034194,-0.032356,-0.002592,0.002861,-0.025930,141.0
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022688,-0.009362,206.0
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031988,-0.046641,135.0
...,...,...,...,...,...,...,...,...,...,...,...
437,0.041708,0.050680,0.019662,0.059744,-0.005697,-0.002566,-0.028674,-0.002592,0.031193,0.007207,178.0
438,-0.005515,0.050680,-0.015906,-0.067642,0.049341,0.079165,-0.028674,0.034309,-0.018114,0.044485,104.0
439,0.041708,0.050680,-0.015906,0.017293,-0.037344,-0.013840,-0.024993,-0.011080,-0.046883,0.015491,132.0
440,-0.045472,-0.044642,0.039062,0.001215,0.016318,0.015283,-0.028674,0.026560,0.044529,-0.025930,220.0


Run name:  stately-mule-984
[flaml.automl.logger: 07-02 17:09:52] {1693} INFO - task = classification
[flaml.automl.logger: 07-02 17:09:52] {1700} INFO - Data split method: stratified
[flaml.automl.logger: 07-02 17:09:52] {1703} INFO - Evaluation method: holdout
[flaml.automl.logger: 07-02 17:09:54] {1801} INFO - Minimizing error metric: 1-accuracy
[flaml.automl.logger: 07-02 17:09:54] {1911} INFO - List of ML learners in AutoML Run: ['lgbm', 'rf', 'xgboost', 'extra_tree', 'xgb_limitdepth', 'lrl1']
[flaml.automl.logger: 07-02 17:09:54] {2221} INFO - iteration 0, current learner lgbm
[flaml.automl.logger: 07-02 17:09:54] {2347} INFO - Estimated sufficient time budget=1432s. Estimated necessary time budget=33s.
[flaml.automl.logger: 07-02 17:09:54] {2394} INFO -  at 2.4s,	estimator lgbm's best error=0.4592,	best estimator lgbm's best error=0.4592
[flaml.automl.logger: 07-02 17:09:54] {2221} INFO - iteration 1, current learner lgbm
[flaml.automl.logger: 07-02 17:09:54] {2394} INFO -  at 2



[flaml.automl.logger: 07-02 17:09:54] {2394} INFO -  at 2.9s,	estimator xgboost's best error=0.5024,	best estimator lgbm's best error=0.3936
[flaml.automl.logger: 07-02 17:09:54] {2221} INFO - iteration 4, current learner lgbm
[flaml.automl.logger: 07-02 17:09:55] {2394} INFO -  at 3.2s,	estimator lgbm's best error=0.3392,	best estimator lgbm's best error=0.3392
[flaml.automl.logger: 07-02 17:09:55] {2221} INFO - iteration 5, current learner lgbm
[flaml.automl.logger: 07-02 17:09:55] {2394} INFO -  at 3.4s,	estimator lgbm's best error=0.3392,	best estimator lgbm's best error=0.3392
[flaml.automl.logger: 07-02 17:09:55] {2221} INFO - iteration 6, current learner lgbm
[flaml.automl.logger: 07-02 17:09:55] {2394} INFO -  at 3.5s,	estimator lgbm's best error=0.3392,	best estimator lgbm's best error=0.3392
[flaml.automl.logger: 07-02 17:09:55] {2221} INFO - iteration 7, current learner lgbm
[flaml.automl.logger: 07-02 17:09:55] {2394} INFO -  at 3.8s,	estimator lgbm's best error=0.3392,	bes



[flaml.automl.logger: 07-02 17:09:56] {2394} INFO -  at 4.1s,	estimator xgboost's best error=0.5024,	best estimator lgbm's best error=0.3392
[flaml.automl.logger: 07-02 17:09:56] {2221} INFO - iteration 10, current learner extra_tree
[flaml.automl.logger: 07-02 17:09:56] {2394} INFO -  at 4.1s,	estimator extra_tree's best error=0.8896,	best estimator lgbm's best error=0.3392
[flaml.automl.logger: 07-02 17:09:56] {2221} INFO - iteration 11, current learner extra_tree
[flaml.automl.logger: 07-02 17:09:56] {2394} INFO -  at 4.2s,	estimator extra_tree's best error=0.8896,	best estimator lgbm's best error=0.3392
[flaml.automl.logger: 07-02 17:09:56] {2221} INFO - iteration 12, current learner extra_tree
[flaml.automl.logger: 07-02 17:09:56] {2394} INFO -  at 4.2s,	estimator extra_tree's best error=0.8896,	best estimator lgbm's best error=0.3392
[flaml.automl.logger: 07-02 17:09:56] {2221} INFO - iteration 13, current learner extra_tree
[flaml.automl.logger: 07-02 17:09:56] {2394} INFO -  at



[flaml.automl.logger: 07-02 17:09:57] {2394} INFO -  at 5.0s,	estimator xgboost's best error=0.4064,	best estimator lgbm's best error=0.3392
[flaml.automl.logger: 07-02 17:09:57] {2221} INFO - iteration 17, current learner rf
[flaml.automl.logger: 07-02 17:09:57] {2394} INFO -  at 5.1s,	estimator rf's best error=0.8864,	best estimator lgbm's best error=0.3392
[flaml.automl.logger: 07-02 17:09:57] {2221} INFO - iteration 18, current learner lgbm
[flaml.automl.logger: 07-02 17:09:57] {2394} INFO -  at 5.2s,	estimator lgbm's best error=0.3392,	best estimator lgbm's best error=0.3392
[flaml.automl.logger: 07-02 17:09:57] {2221} INFO - iteration 19, current learner rf
[flaml.automl.logger: 07-02 17:09:57] {2394} INFO -  at 5.2s,	estimator rf's best error=0.8864,	best estimator lgbm's best error=0.3392
[flaml.automl.logger: 07-02 17:09:57] {2221} INFO - iteration 20, current learner rf
[flaml.automl.logger: 07-02 17:09:57] {2394} INFO -  at 5.3s,	estimator rf's best error=0.8864,	best estima



[flaml.automl.logger: 07-02 17:09:58] {2394} INFO -  at 6.4s,	estimator xgboost's best error=0.3904,	best estimator lgbm's best error=0.3392
[flaml.automl.logger: 07-02 17:09:58] {2221} INFO - iteration 25, current learner extra_tree
[flaml.automl.logger: 07-02 17:09:58] {2394} INFO -  at 6.4s,	estimator extra_tree's best error=0.8368,	best estimator lgbm's best error=0.3392
[flaml.automl.logger: 07-02 17:09:58] {2221} INFO - iteration 26, current learner rf
[flaml.automl.logger: 07-02 17:09:58] {2394} INFO -  at 6.5s,	estimator rf's best error=0.8000,	best estimator lgbm's best error=0.3392
[flaml.automl.logger: 07-02 17:09:58] {2221} INFO - iteration 27, current learner xgboost




[flaml.automl.logger: 07-02 17:09:58] {2394} INFO -  at 6.8s,	estimator xgboost's best error=0.3584,	best estimator lgbm's best error=0.3392
[flaml.automl.logger: 07-02 17:09:58] {2221} INFO - iteration 28, current learner extra_tree
[flaml.automl.logger: 07-02 17:09:58] {2394} INFO -  at 6.8s,	estimator extra_tree's best error=0.8368,	best estimator lgbm's best error=0.3392
[flaml.automl.logger: 07-02 17:09:58] {2221} INFO - iteration 29, current learner xgboost




[flaml.automl.logger: 07-02 17:09:59] {2394} INFO -  at 7.0s,	estimator xgboost's best error=0.3584,	best estimator lgbm's best error=0.3392
[flaml.automl.logger: 07-02 17:09:59] {2221} INFO - iteration 30, current learner lgbm
[flaml.automl.logger: 07-02 17:09:59] {2394} INFO -  at 7.2s,	estimator lgbm's best error=0.3392,	best estimator lgbm's best error=0.3392
[flaml.automl.logger: 07-02 17:09:59] {2221} INFO - iteration 31, current learner lgbm
[flaml.automl.logger: 07-02 17:09:59] {2394} INFO -  at 7.5s,	estimator lgbm's best error=0.3392,	best estimator lgbm's best error=0.3392
[flaml.automl.logger: 07-02 17:09:59] {2221} INFO - iteration 32, current learner rf
[flaml.automl.logger: 07-02 17:09:59] {2394} INFO -  at 7.5s,	estimator rf's best error=0.8000,	best estimator lgbm's best error=0.3392
[flaml.automl.logger: 07-02 17:09:59] {2221} INFO - iteration 33, current learner extra_tree
[flaml.automl.logger: 07-02 17:09:59] {2394} INFO -  at 7.6s,	estimator extra_tree's best error



[flaml.automl.logger: 07-02 17:10:00] {2394} INFO -  at 8.5s,	estimator xgboost's best error=0.3392,	best estimator lgbm's best error=0.3392
[flaml.automl.logger: 07-02 17:10:00] {2221} INFO - iteration 36, current learner extra_tree
[flaml.automl.logger: 07-02 17:10:00] {2394} INFO -  at 8.6s,	estimator extra_tree's best error=0.7328,	best estimator lgbm's best error=0.3392
[flaml.automl.logger: 07-02 17:10:00] {2221} INFO - iteration 37, current learner xgboost




[flaml.automl.logger: 07-02 17:10:01] {2394} INFO -  at 9.1s,	estimator xgboost's best error=0.3392,	best estimator lgbm's best error=0.3392
[flaml.automl.logger: 07-02 17:10:01] {2221} INFO - iteration 38, current learner xgboost




[flaml.automl.logger: 07-02 17:10:01] {2394} INFO -  at 9.4s,	estimator xgboost's best error=0.3392,	best estimator lgbm's best error=0.3392
[flaml.automl.logger: 07-02 17:10:01] {2221} INFO - iteration 39, current learner xgboost




[flaml.automl.logger: 07-02 17:10:02] {2394} INFO -  at 10.0s,	estimator xgboost's best error=0.3392,	best estimator lgbm's best error=0.3392
[flaml.automl.logger: 07-02 17:10:02] {2221} INFO - iteration 40, current learner extra_tree
[flaml.automl.logger: 07-02 17:10:02] {2394} INFO -  at 10.0s,	estimator extra_tree's best error=0.7328,	best estimator lgbm's best error=0.3392
[flaml.automl.logger: 07-02 17:10:02] {2221} INFO - iteration 41, current learner extra_tree
[flaml.automl.logger: 07-02 17:10:02] {2394} INFO -  at 10.1s,	estimator extra_tree's best error=0.5472,	best estimator lgbm's best error=0.3392
[flaml.automl.logger: 07-02 17:10:02] {2221} INFO - iteration 42, current learner lgbm
[flaml.automl.logger: 07-02 17:10:02] {2394} INFO -  at 10.4s,	estimator lgbm's best error=0.3392,	best estimator lgbm's best error=0.3392
[flaml.automl.logger: 07-02 17:10:02] {2221} INFO - iteration 43, current learner extra_tree
[flaml.automl.logger: 07-02 17:10:02] {2394} INFO -  at 10.4s,	



[flaml.automl.logger: 07-02 17:10:03] {2394} INFO -  at 11.4s,	estimator xgboost's best error=0.3392,	best estimator lgbm's best error=0.3392
[flaml.automl.logger: 07-02 17:10:03] {2630} INFO - retrain lgbm for 0.2s
[flaml.automl.logger: 07-02 17:10:03] {2633} INFO - retrained model: LGBMClassifier(colsample_bytree=0.9285002286474459,
               learning_rate=0.7260594590615893, max_bin=511,
               min_child_samples=9, n_estimators=9, num_leaves=4,
               reg_alpha=0.0036840681931986645, reg_lambda=0.7532480505730402,
               verbose=-1)
[flaml.automl.logger: 07-02 17:10:03] {1941} INFO - fit succeeded
[flaml.automl.logger: 07-02 17:10:03] {1942} INFO - Time taken to find the best model: 3.17272686958313
