In [1]:
import os
import mlflow
import requests
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier


In [2]:
## download the dataset
# Directory of the raw data files
_data_root = './data/covertype'
# Path to the raw training data
_data_filepath = os.path.join(_data_root, 'covertype_train.csv')
# Download data
os.makedirs(_data_root, exist_ok=True)
if not os.path.isfile(_data_filepath):
    #https://archive.ics.uci.edu/ml/machine-learning-databases/covtype/
    url = 'https://docs.google.com/uc?export= \
    download&confirm={{VALUE}}&id=1lVF1BCWLH4eXXV_YOJzjR7xZjj-wAGj9'
    r = requests.get(url, allow_redirects=True, stream=True)
    open(_data_filepath, 'wb').write(r.content)

In [3]:
# Load the dataset to a dataframe
df = pd.read_csv(_data_filepath)

# Set the target values
y = df['Cover_Type']#.values

# Set the input values
df.drop('Cover_Type', axis=1, inplace=True)
X = df#.values

X_train, X_test, y_train, y_test = train_test_split(X, y)

In [4]:
column_trans = make_column_transformer((OneHotEncoder(handle_unknown='ignore'),
                                        ["Wilderness_Area", "Soil_Type"]),
                                      remainder='passthrough') # pass all the numeric values through the pipeline without any changes.

column_trans


In [5]:
pipe = Pipeline(steps=[("column_trans", column_trans),("scaler", StandardScaler(with_mean=False)), ("RandomForestClassifier", RandomForestClassifier())])

pipe

In [6]:
param_grid =  {'RandomForestClassifier__max_depth': [1,2,3,10], 'RandomForestClassifier__n_estimators': [10,11]}

search = GridSearchCV(pipe, param_grid, n_jobs=2)
search


In [7]:
search.get_params().keys()

dict_keys(['cv', 'error_score', 'estimator__memory', 'estimator__steps', 'estimator__verbose', 'estimator__column_trans', 'estimator__scaler', 'estimator__RandomForestClassifier', 'estimator__column_trans__n_jobs', 'estimator__column_trans__remainder', 'estimator__column_trans__sparse_threshold', 'estimator__column_trans__transformer_weights', 'estimator__column_trans__transformers', 'estimator__column_trans__verbose', 'estimator__column_trans__verbose_feature_names_out', 'estimator__column_trans__onehotencoder', 'estimator__column_trans__onehotencoder__categories', 'estimator__column_trans__onehotencoder__drop', 'estimator__column_trans__onehotencoder__dtype', 'estimator__column_trans__onehotencoder__handle_unknown', 'estimator__column_trans__onehotencoder__max_categories', 'estimator__column_trans__onehotencoder__min_frequency', 'estimator__column_trans__onehotencoder__sparse', 'estimator__column_trans__onehotencoder__sparse_output', 'estimator__scaler__copy', 'estimator__scaler__wit

In [8]:
search.fit(X_train, y_train)

In [15]:
EXPERIMENT_NAME = "mlflow_tutorial"
mlflow.set_tracking_uri("http://10.43.102.109:5000")
mlflow.set_experiment(EXPERIMENT_NAME)

# this is the magical stuff
mlflow.sklearn.autolog(log_input_examples=True, log_model_signatures=True)

with mlflow.start_run(run_name="autolog_with_grid_search") as run:
    search.fit(X_train, y_train)

                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('onehotencoder',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['Wilderness_Area',
                                                   'Soil_Type'])])),
                ('scaler', StandardScaler(with_mean=False)),
                ('RandomForestClassifier', Rand...`
2023/04/27 00:10:40 INFO mlflow.sklearn.utils: Logging the 5 best runs, 3 runs will be omitted.


## the simplest possible example

In [14]:
# run description (just metadata)
desc = "the simplest possible example"

# connects to the Mlflow tracking server that you started above
mlflow.set_tracking_uri("http://10.43.102.109:5000")

# executes the run
with mlflow.start_run(run_name="no_artifacts_logged", description=desc) as run:
    rf = RandomForestRegressor(n_estimators=100, max_depth=6, max_features=3)
    rf.fit(X_train, y_train)

ValueError: could not convert string to float: 'Commanche'