In [1]:
import kcu
import pandas as pd
import numpy as np
import sqlalchemy
import mlflow
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_wine
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split

# Tracking server that the MLflow calls in this notebook report to.
MLFLOW_TRACKING_URI = "http://mlflow:5000"
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)

# Store dataset in Backend

In [2]:
# Resolve the default backend configuration and open a SQLAlchemy engine on it.
backend = kcu.utils.get_default_backend_config()
url = kcu.utils.get_sql_url(backend)
engine = sqlalchemy.create_engine(url)

# Build a single frame: the wine feature matrix with the target appended as
# a trailing "label" column.
data = load_wine()
wine_columns = [*data["feature_names"], "label"]
wine_values = np.column_stack([data["data"], data["target"]])
data_pd = pd.DataFrame(wine_values, columns=wine_columns)

# (Re)create the "wine" table; the call's return value is the cell's output.
data_pd.to_sql("wine", engine, if_exists="replace")

178

# Load Data via PySpark

In [3]:
# Create the PySpark session through the kcu helper, selected by the
# backend configuration's "dbtype" entry.
sess = kcu.utils.get_pyspark_session(backend["dbtype"])

23/05/25 17:11:32 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [5]:
# Read the "wine" table back from the backend through the `sess` session.
df = kcu.utils.get_df_from_backend("wine", backend=backend, sess=sess)
# Keep a pandas copy next to `df` for code that needs the pandas API
# (presumably `df` is a PySpark DataFrame, given `toPandas` — confirm in kcu).
pandas_df = df.toPandas()

In [13]:
# Try out FeatureStore and ETL
import pyspark
import dill
import pandas as pd
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from kcu.functiontransform import FunctionTransform

# Raw wine columns that are packed into the single "features" vector column.
wine_feature_columns = [
    "alcohol",
    "malic_acid",
    "ash",
    "alcalinity_of_ash",
    "magnesium",
    "total_phenols",
    "flavanoids",
    "nonflavanoid_phenols",
    "proanthocyanins",
    "color_intensity",
    "hue",
    "od280/od315_of_diluted_wines",
    "proline",
]
va = VectorAssembler(inputCols=wine_feature_columns, outputCol="features")

def feature_identity(df, feature_col,
                  key_out_feature_column):
    """Copy one column of ``df`` into a new column without changing its values.

    Args:
        df: Frame exposing ``withColumn`` (the PySpark frame in this notebook).
        feature_col: Name of the column to copy.
        key_out_feature_column: Name of the column that receives the copy.

    Returns:
        Whatever ``df.withColumn`` returns for the added column.
    """
    # Imported inside the body so the function carries its own dependency;
    # presumably needed when the function is serialized — TODO confirm.
    from pyspark.sql.functions import col

    source_column = col(feature_col)
    return df.withColumn(key_out_feature_column, source_column)


# Parameters for the "ash" pass-through feature.
feature_ash_parameters = dict(
    feature_col="ash",
    key_out_feature_column="ash_feature",
)

# Each entry pairs a feature function with its parameter dict.
feature_param_pairs = [(feature_identity, feature_ash_parameters)]

# Build one FunctionTransform stage per (function, parameters) pair and
# remember each stage's parameters under its uid, so they can be re-attached
# to the matching stage after the pipeline is loaded back from disk.
stages = []
dict_to_save = dict()

for feature_fn, feature_params in feature_param_pairs:
    ft = FunctionTransform(
        default_value=feature_fn,
        parameter_value=feature_params
    )
    stages.append(ft)
    dict_to_save[ft.uid] = feature_params
# The vector assembler is appended after all feature stages.
stages.append(va)

pipe = Pipeline(stages=stages)
newpipe = pipe.fit(df)

# Overwrite any previously saved pipeline explicitly. The earlier
# try/bare-except retried on *every* exception (including KeyboardInterrupt),
# which hid real save failures; overwrite() only handles an existing path.
newpipe.write().overwrite().save("models/testpipe")

newpipe.transform(df).show()

# Persist the uid -> parameters mapping next to the saved pipeline.
with open('fpp.pickle', 'wb') as handle:
    dill.dump(dict_to_save, handle, protocol=dill.HIGHEST_PROTOCOL)

{'default_value': <function feature_identity at 0x7f6809028dc0>, 'parameter_value': {'feature_col': 'ash', 'key_out_feature_column': 'ash_feature'}}


23/05/25 17:21:22 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


+-----+-------+----------+----+-----------------+---------+-------------+----------+--------------------+---------------+---------------+----+----------------------------+-------+-----+-----------+--------------------+
|index|alcohol|malic_acid| ash|alcalinity_of_ash|magnesium|total_phenols|flavanoids|nonflavanoid_phenols|proanthocyanins|color_intensity| hue|od280/od315_of_diluted_wines|proline|label|ash_feature|            features|
+-----+-------+----------+----+-----------------+---------+-------------+----------+--------------------+---------------+---------------+----+----------------------------+-------+-----+-----------+--------------------+
|    0|  14.23|      1.71|2.43|             15.6|    127.0|          2.8|      3.06|                0.28|           2.29|           5.64|1.04|                        3.92| 1065.0|  0.0|       2.43|[14.23,1.71,2.43,...|
|    1|   13.2|      1.78|2.14|             11.2|    100.0|         2.65|      2.76|                0.26|           1.28|   

In [14]:
from pyspark.ml import PipelineModel

# Reload the fitted pipeline from disk.
newpipe = PipelineModel.load("models/testpipe")

# Stage parameters keyed by stage uid. NOTE: dill.load can execute arbitrary
# code, so only load files that this notebook itself produced.
with open('fpp.pickle', 'rb') as handle:
    fpp = dill.load(handle)

# Re-attach the stored parameters to the stage whose uid matches.
for stage_uid, stage_params in fpp.items():
    print(stage_uid)
    for position, stage in enumerate(newpipe.stages):
        if stage.uid != stage_uid:
            continue
        encoded_params = dill.dumps(stage_params).decode(encoding="raw_unicode_escape")
        newpipe.stages[position] = stage.setParameterValue(encoded_params)

transformed = newpipe.transform(df)

transformed.show()

{}
FunctionTransform_c0bdde491ec6
+-----+-------+----------+----+-----------------+---------+-------------+----------+--------------------+---------------+---------------+----+----------------------------+-------+-----+-----------+--------------------+
|index|alcohol|malic_acid| ash|alcalinity_of_ash|magnesium|total_phenols|flavanoids|nonflavanoid_phenols|proanthocyanins|color_intensity| hue|od280/od315_of_diluted_wines|proline|label|ash_feature|            features|
+-----+-------+----------+----+-----------------+---------+-------------+----------+--------------------+---------------+---------------+----+----------------------------+-------+-----+-----------+--------------------+
|    0|  14.23|      1.71|2.43|             15.6|    127.0|          2.8|      3.06|                0.28|           2.29|           5.64|1.04|                        3.92| 1065.0|  0.0|       2.43|[14.23,1.71,2.43,...|
|    1|   13.2|      1.78|2.14|             11.2|    100.0|         2.65|      2.76|      

# Run training

In [5]:
mlflow.set_experiment("wine")
mlflow.xgboost.autolog()

# Fixed seed so the split, and therefore the logged metric, is reproducible.
SPLIT_SEED = 42

# Train on the pandas copy: `df` is the PySpark frame, which supports neither
# `drop(columns=...)` nor scikit-learn's splitting utilities.
# "index" is the pandas index that `to_sql` wrote into the table; it is dropped
# (when present) so the row number is not used as a feature.
features = pandas_df.drop(columns=["label", "index"], errors="ignore")
# Labels were stored as floats (0.0, 1.0, ...); the classifier wants integer classes.
labels = pandas_df["label"].astype(int)

with mlflow.start_run():
    X_train, X_test, y_train, y_test = train_test_split(
        features,
        labels,
        test_size=.2,
        random_state=SPLIT_SEED,
        stratify=labels,  # keep the class proportions equal in both splits
    )

    # create model instance
    # load_wine has more than two classes, so use a multiclass objective
    # rather than 'binary:logistic'.
    bst = XGBClassifier(n_estimators=2, max_depth=2, learning_rate=1, objective='multi:softprob')

    # fit model
    bst.fit(X_train, y_train)

    # make predictions
    preds = bst.predict(X_test)

    # evaluate
    test_acc = accuracy_score(y_test, preds)
    mlflow.log_metric(key="test_accuracy", value=test_acc)

