In [None]:
import kcu
import pandas as pd
import numpy as np
import sqlalchemy
import mlflow
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_wine
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split

# Address of the MLflow tracking server used by the runs started in this notebook.
MLFLOW_TRACKING_URI = "http://mlflow:5000"
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)

# Store dataset in Backend

In [None]:
# Resolve the default backend configuration and open a SQLAlchemy engine on it.
backend = kcu.utils.get_default_backend_config()
url = kcu.utils.get_sql_url(backend)
engine = sqlalchemy.create_engine(url)

data = load_wine()

# Build the frame from the feature matrix and attach the target as a separate column,
# so the class ids keep their integer dtype (stacking them onto the float feature
# matrix with np.hstack would upcast them to float).
wine_df = pd.DataFrame(data["data"], columns=data["feature_names"]).assign(label=data["target"])

# index=False: by default to_sql also writes the pandas row index as an "index" column.
# load_wine rows are ordered by class, so that column would be a near-perfect proxy for
# the label and leak into any model trained on the full table.
wine_df.to_sql("wine", engine, if_exists="replace", index=False)

# Load Data via PySpark

In [None]:
# The PySpark session is requested for the database type named in the backend config.
db_type = backend["dbtype"]
sess = kcu.utils.get_pyspark_session(db_type)

In [None]:
# Fetch the "wine" table through the kcu helper using the Spark session, then
# convert the result to a pandas DataFrame for scikit-learn / XGBoost.
wine_from_backend = kcu.utils.get_df_from_backend("wine", backend=backend, sess=sess)
df = wine_from_backend.toPandas()

# Run training

In [None]:
mlflow.set_experiment("wine")
mlflow.xgboost.autolog()

# Fixed seed so the train/test split and the booster give the same result on every
# Restart & Run All.
SEED = 42

# Features are every column except the target. "index" is excluded too: a table written
# by DataFrame.to_sql with default arguments carries the pandas row index as a column,
# and because load_wine rows are ordered by class it would leak the label.
feature_cols = [col for col in df.columns if col not in ("label", "index")]
features = df[feature_cols]

# Class ids may come back from the backend as floats (0.0, 1.0, 2.0); make them integers.
labels = df["label"].astype(int)

with mlflow.start_run():
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=.2, random_state=SEED
    )

    # create model instance
    # load_wine has three classes, so declare a multiclass objective rather than
    # 'binary:logistic'.
    bst = XGBClassifier(
        n_estimators=2,
        max_depth=2,
        learning_rate=1,
        objective="multi:softprob",
        random_state=SEED,
    )

    # fit model
    bst.fit(X_train, y_train)

    # make predictions
    preds = bst.predict(X_test)

    # evaluate on the held-out split and record the score alongside the autologged run
    test_acc = accuracy_score(y_test, preds)
    mlflow.log_metric(key="test_accuracy", value=test_acc)