In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import HistGradientBoostingClassifier
import mlflow

In [None]:
# path of the CSV to load (despite the name, this points at a file, not a directory)
DATA_DIR = "../data/covertype.csv"
# header=None below means the first line of the file is read as data,
# so the column labels are supplied explicitly, in file order
column_names = [
    'Elevation',
    'Aspect',
    'Slope',
    'Horizontal_Distance_To_Hydrology',
    'Vertical_Distance_To_Hydrology',
    'Horizontal_Distance_To_Roadways',
    'Hillshade_9am',
    'Hillshade_Noon',
    'Hillshade_3pm',
    'Horizontal_Distance_To_Fire_Points',
    'Wilderness_Area',
    'Soil_Type',
    'Cover_Type',
]
# load the file and discard fully duplicated rows; ignore_index renumbers the rows 0..n-1
data = (
    pd.read_csv(DATA_DIR, names=column_names, header=None)
    .drop_duplicates(ignore_index=True)
)
# preview the first rows as the cell output
data.head()

In [None]:
# columns that are ordinal-encoded; every other input column is standardized
categorical_variables = ['Wilderness_Area', 'Soil_Type']
target_variable = "Cover_Type"
# X keeps every column except the target (the categorical columns stay in X)
X = data.drop(columns=[target_variable])
# y is the target column on its own
y = data[target_variable]
# numerical inputs = all remaining columns of X, in their original order
numerical_variables = [column for column in X.columns if column not in categorical_variables]
# column-wise preprocessing; the categorical transformer is listed first on purpose,
# since later code may rely on the position of its output columns
preprocessor = ColumnTransformer([
    ('categorical', OrdinalEncoder(), categorical_variables),
    ('numerical', StandardScaler(), numerical_variables),
])

In [None]:
# fixed seed so repeated runs of the notebook train the same model; the estimator
# uses it for its binning subsample and for the early-stopping validation split
RANDOM_STATE = 42

# gradient boosting classifier; categorical_features=[0, 1] are positions in the
# preprocessor's OUTPUT: the 'categorical' transformer is listed first there with
# two columns, so its encoded columns are expected at positions 0 and 1
classifier = HistGradientBoostingClassifier(
    categorical_features=[0, 1],
    random_state=RANDOM_STATE,
)
# chain preprocessing and classifier so both are fitted (and logged) as one estimator
model = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('classifier', classifier)
])

In [None]:
import os

# point MLflow's S3 artifact client at the MinIO endpoint and supply its credentials
# NOTE(review): endpoint and credentials are hard-coded in the notebook; consider
# injecting them from the environment or a secrets store before sharing this file
os.environ['MLFLOW_S3_ENDPOINT_URL'] = "http://10.43.101.154:8081"
os.environ['AWS_ACCESS_KEY_ID'] = 'minio_user'
os.environ['AWS_SECRET_ACCESS_KEY'] = 'minio_pwd'

# connect to the MLflow tracking server and select the experiment
mlflow.set_tracking_uri("http://10.43.101.154:8083")
mlflow.set_experiment("gradient_boosting_covertype")
# autolog records the fit below and registers the resulting model as "final_model"
mlflow.sklearn.autolog(log_model_signatures=True, log_input_examples=True, registered_model_name="final_model")

# mlflow run
# the keyword is `description`; `desc` is not a parameter of mlflow.start_run and
# raised a TypeError before any training happened
with mlflow.start_run(run_name="gradient_boosting_training", description="Train gradient boosting model") as run:
    # fit the full pipeline (preprocessing + classifier) on all rows
    model.fit(X, y)