# Define imports

In [24]:
# imports
import os
import mlflow
import argparse

import pandas as pd
import lightgbm as lgbm
import matplotlib.pyplot as plt

from sklearn.metrics import log_loss, accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Define functions

In [25]:
# define functions
def main(args):
    """Run the iris training pipeline: autolog, load the CSV, split, train.

    Parameters
    ----------
    args : object
        Any object exposing the attributes read below: ``iris_csv``,
        ``num_boost_round``, ``boosting``, ``num_iterations``,
        ``num_leaves``, ``num_threads``, ``learning_rate``, ``metric``,
        ``seed`` and ``verbose``.

    Returns
    -------
    None
        The trained model is not handed back to the caller.
    """
    # switch on MLflow autologging before anything is trained
    mlflow.autolog()

    # round count that is passed to the trainer as a keyword argument
    rounds = args.num_boost_round

    # settings copied verbatim from `args` into the params dict, in this order
    # NOTE(review): "num_iterations" below and `rounds` above both describe a
    # number of boosting rounds. In the run recorded in this notebook
    # (num_boost_round=10, num_iterations=100) the log goes past round 10, so
    # the params entry appears to take precedence -- confirm against the
    # LightGBM parameter docs.
    forwarded = (
        "boosting",
        "num_iterations",
        "num_leaves",
        "num_threads",
        "learning_rate",
        "metric",
        "seed",
        "verbose",
    )

    # fixed three-class objective first, then the forwarded settings
    params = {"objective": "multiclass", "num_class": 3}
    for key in forwarded:
        params[key] = getattr(args, key)

    # load the raw table
    frame = pd.read_csv(args.iris_csv)

    # encode the target and build the splits (the encoder is not used here)
    X_train, X_test, y_train, y_test, _encoder = process_data(frame)

    # fit the model; the returned object is not used further in this function
    train_model(params, rounds, X_train, X_test, y_train, y_test)


def process_data(df):
    """Split ``df`` into features/target, label-encode the target, and split.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a ``species`` column; every other column is used as a
        feature.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test, enc)`` where ``enc`` is the
        ``LabelEncoder`` that was fitted on the ``species`` column.
    """
    # every column except the label is a feature
    features = df.drop(columns=["species"])

    # label-encode the target: one integer code per class (this is not one-hot)
    enc = LabelEncoder()
    labels = enc.fit_transform(df["species"])

    # hold out 20% of the rows; the fixed random_state keeps the split repeatable
    splits = train_test_split(features, labels, test_size=0.2, random_state=42)

    # splits is X_train, X_test, y_train, y_test; append the fitted encoder
    return (*splits, enc)


def train_model(params, num_boost_round, X_train, X_test, y_train, y_test):
    """Train a LightGBM model on the training split, validating on the test split.

    Parameters
    ----------
    params : dict
        Parameter dict passed unchanged to ``lgbm.train``.
    num_boost_round : int
        Forwarded to ``lgbm.train`` as ``num_boost_round``.
    X_train, y_train
        Features and labels wrapped into the training dataset.
    X_test, y_test
        Features and labels wrapped into the validation dataset, which is
        reported under the name ``"test"``.

    Returns
    -------
    The object returned by ``lgbm.train``.
    """
    # wrap both splits in LightGBM's dataset container
    fit_set = lgbm.Dataset(X_train, label=y_train)
    eval_set = lgbm.Dataset(X_test, label=y_test)

    # train and hand the result straight back to the caller
    return lgbm.train(
        params,
        fit_set,
        num_boost_round=num_boost_round,
        valid_sets=[eval_set],
        valid_names=["test"],
    )

# Define function arguments

In [26]:
class args:
    """Plain attribute namespace holding the settings that ``main`` reads."""

    # --- data ---
    iris_csv = "https://azuremlexamples.blob.core.windows.net/datasets/iris.csv"

    # --- number of boosting rounds ---
    # main() passes num_boost_round to the trainer as a keyword argument and
    # places num_iterations inside the params dict.
    # NOTE(review): the recorded run logs more than 10 rounds, so
    # num_iterations appears to be the value that takes effect -- confirm.
    num_boost_round = 10
    num_iterations = 100

    # --- model settings forwarded in the params dict ---
    boosting = "gbdt"
    num_leaves = 31
    learning_rate = 0.1
    metric = "multi_logloss"

    # --- run settings forwarded in the params dict ---
    num_threads = 0
    seed = 42
    verbose = 0


args

__main__.args

In [27]:
# display the dataset URL that main() will pass to pd.read_csv
args.iris_csv

'https://azuremlexamples.blob.core.windows.net/datasets/iris.csv'

# Call function(s)

In [28]:
# run the pipeline with the settings defined in the `args` class above:
# enable autologging, read the CSV, split the data, and train the model
main(args)

2021/05/28 08:54:16 INFO mlflow.tracking.fluent: Autologging successfully enabled for lightgbm.
2021/05/28 08:54:16 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2021/05/28 08:54:16 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '9cd285649efe41b282b2e7dee59efa68', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current lightgbm workflow


You can set `force_col_wise=true` to remove the overhead.
[1]	test's multi_logloss: 0.930558
[2]	test's multi_logloss: 0.795536
[3]	test's multi_logloss: 0.68756
[4]	test's multi_logloss: 0.593833
[5]	test's multi_logloss: 0.51883
[6]	test's multi_logloss: 0.454422
[7]	test's multi_logloss: 0.401051
[8]	test's multi_logloss: 0.353053
[9]	test's multi_logloss: 0.313256
[10]	test's multi_logloss: 0.276926
[11]	test's multi_logloss: 0.247315
[12]	test's multi_logloss: 0.221442
[13]	test's multi_logloss: 0.199252
[14]	test's multi_logloss: 0.177485
[15]	test's multi_logloss: 0.160641
[16]	test's multi_logloss: 0.144921
[17]	test's multi_logloss: 0.129971
[18]	test's multi_logloss: 0.117683
[19]	test's multi_logloss: 0.108334
[20]	test's multi_logloss: 0.0977779
[21]	test's multi_logloss: 0.0942463
[22]	test's multi_logloss: 0.0894623
[23]	test's multi_logloss: 0.0851825
[24]	test's multi_logloss: 0.0783286
[25]	test's multi_logloss: 0.0751468
[26]	test's multi_logloss: 0.0715524
[27]	test'