In [6]:
import mlflow
import os

In [7]:
# Parameters handed to the MLflow project's "train" entry point.
# Note that the "None" values are strings, not Python's None
# (presumably parsed by the training script -- confirm there).
hyperparams = dict(
    # Example checkpoint: "restored_model_checkpoint/checkpoint_epoch=06.ckpt"
    resume_checkpoint="None",
    # Example run id: "ef539b4138fa4055bf65c58f30249211"
    resume_run_id="None",
    max_epochs=10,
    gpus=0,
    batch_size=32,
    lr=0.0001,
    num_samples=-1,
    val_ratio=0.2,
    test_ratio=0,
    random_seed="None",
    dataset="/dbfs/FileStore/tables/datasets/dummy.json",
    # Alternative dataset location:
    # dataset=os.path.join(os.getenv("PROCESSED_DATA_DIR"), "dummy.json"),
)

In [8]:
# Two options for backend_config:
#   * None  -> run locally
#   * dict  -> cluster specification used to run on Databricks
# Cluster spec reference: https://docs.databricks.com/dev-tools/api/latest/clusters.html
# Adapt the cloud-specific entries (here: "azure_attributes", "node_type_id",
# "driver_node_type_id") when targeting AWS or Google Cloud instead of Azure.

# backend_config = None

# Single-node cluster: no workers, Spark runs in local mode on the driver.
backend_config = {
    "num_workers": 0,
    "spark_version": "12.1.x-scala2.12",
    "spark_conf": {
        "spark.master": "local[*, 4]",
        "spark.databricks.cluster.profile": "singleNode"
    },
    "azure_attributes": {
        "first_on_demand": 1,
        "availability": "ON_DEMAND_AZURE",
        "spot_bid_max_price": -1
    },
    "node_type_id": "Standard_F4",
    "driver_node_type_id": "Standard_F4",
    "ssh_public_keys": [],
    "custom_tags": {
        "ResourceClass": "SingleNode"
    },
    "spark_env_vars": {
        "PYSPARK_PYTHON": "/databricks/python3/bin/python3",
        "LOGS_DIR": "/logs",
        "LOGGER_LEVEL": "INFO"
    },
    # Must be a real boolean so it serializes to JSON `true`; the string
    # "true" is not the type the Clusters API declares for this field.
    "enable_elastic_disk": True,
    "cluster_source": "UI",
    "init_scripts": [],
    # NOTE(review): hardcoded user identity -- consider reading it from the
    # environment before sharing this notebook.
    "single_user_name": "alexandre1.almeida@axians.com",
    "data_security_mode": "LEGACY_SINGLE_USER_STANDARD",
    "runtime_engine": "STANDARD"
}

In [9]:
def _require_env(name):
    """Return the value of environment variable ``name``.

    Raises:
        RuntimeError: if the variable is unset or empty, so that a missing
            setting is reported here instead of surfacing later as a
            malformed URI or an obscure MLflow/git error.
    """
    value = os.getenv(name)
    if not value:
        raise RuntimeError(
            f"Environment variable {name} must be set before running this cell."
        )
    return value


# Set databricks env variables
os.environ["MLFLOW_TRACKING_URI"] = "databricks"

# Set github access uri
# WARNING: the access token is embedded in the URI, and MLflow prints the
# project URI in its INFO logs ("Fetching project from ..."). Clear or redact
# the cell outputs before sharing or committing this notebook.
user = _require_env("GIT_USER")
token = _require_env("GIT_TOKEN")
git_uri = _require_env("GIT_URI")
uri = f"https://{user}:{token}@{git_uri}"
git_branch = "main"

# Local filesystem alternative: comment out the GitHub lines above and
# uncomment the line below.
# uri = os.getenv("ROOT_DIR") # Uncomment if using the local filesystem path

experiment_name = _require_env("EXP_NAME")
experiment = mlflow.set_experiment(experiment_name)
backend = "databricks" # Comment if using local backend
# backend = "local" # Uncomment if using local backend

In [10]:
# Learning rates to sweep over; one MLflow project run is submitted per value.
lrs = [0.001]  # , 0.0005, 0.0001, 0.00005, 0.00001, 0.000005, 0.000001]

# Handles returned by mlflow.projects.run, kept so that asynchronous runs can
# still be inspected, awaited or cancelled after this cell finishes.
submitted_runs = []

for lr in lrs:
    # Override the learning rate for this submission.
    hyperparams["lr"] = lr

    submitted_run = mlflow.projects.run(
        uri,
        version=git_branch, # Comment if using the local filesystem path
        entry_point="train",
        # env_manager="local",
        parameters=hyperparams,
        backend=backend,
        backend_config=backend_config,
        experiment_id=experiment.experiment_id,
        synchronous=False,  # Set to False, if you don't want to wait for the model to train
    )
    submitted_runs.append(submitted_run)

2023/03/02 15:24:04 INFO mlflow.projects.utils: === Fetching project from https://alexandre1-almeida:<REDACTED_TOKEN>@github.com/AxiansML/basics-mlops-python.git into /var/folders/p6/5l5fzz397vq3lwg4c8p3t7lw0000gn/T/tmpkuib3806 ===
2023/03/02 15:24:06 INFO mlflow.projects.databricks: === Creating tarball from /var/folders/p6/5l5fzz397vq3lwg4c8p3t7lw0000gn/T/tmpkuib3806 in temp directory /var/folders/p6/5l5fzz397vq3lwg4c8p3t7lw0000gn/T/tmp2uh__v2t ===
2023/03/02 15:24:06 INFO mlflow.projects.databricks: === Total file size to compress: 1267.8 KB ===
2023/03/02 15:24:07 INFO mlflow.projects.databricks: === Uploading project tarball (size: 1020.8 KB) to /dbfs/mlflow-experiments/4457423338043857/projects-code/22cd8988021b5758aa6e8e722feec97876ea040f532f9800ce38f7eb87908795.tar.gz ===
2023/03/02 15:24:07 INFO mlflow.projects.databricks: === Uploading project to DBFS path /dbfs/mlflow-experiments/4457423338043857/projects-code/22cd8988021b5758aa6e8e722feec97876ea040f5