In [1]:
import pandas as pd

import mlflow
from  mlflow.tracking import MlflowClient

from time import time

from utils.grid_search import generate_combination
from utils.log_utils import calculate_elapsed_time
from utils.ml_utils import create_experiment
from utils.file_utils import make_path 

from dataset.make_dataset import make_dataset
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error


In [2]:
# Load the wine data and separate the 'quality' target from the predictors.
wine_df = make_dataset()

target_column = 'quality'
y = wine_df[target_column]
x = wine_df.drop(columns=[target_column])

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split stable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=42,
)


In [3]:
# Baseline: a random forest with default hyper-parameters, scored by MAE on the
# held-out set. The forest is seeded (same seed as the train/test split) so the
# displayed score is reproducible after Restart & Run All.
model = RandomForestRegressor(random_state=42)
model.fit(x_train, y_train)

y_pred = model.predict(x_test)
mean_absolute_error(y_test, y_pred)

0.44393749999999993

In [4]:
# Candidate values for each RandomForestRegressor hyper-parameter explored below.
param_grid = dict(
    n_jobs=[-1],
    n_estimators=list(range(50, 250, 50)),
    criterion=['squared_error', 'absolute_error', 'poisson'],
    max_depth=[None] + list(range(10, 25, 5)),
    min_samples_split=list(range(2, 10, 3)),
    min_samples_leaf=list(range(1, 10, 3)),
)

# Expand the grid into the list of parameter sets iterated by the search loop.
params_list = generate_combination(param_grid)
param_grid

{'n_jobs': [-1],
 'n_estimators': [50, 100, 150, 200],
 'criterion': ['squared_error', 'absolute_error', 'poisson'],
 'max_depth': [None, 10, 15, 20],
 'min_samples_split': [2, 5, 8],
 'min_samples_leaf': [1, 4, 7]}

In [5]:
# Progress-line template: both counters are left-aligned and padded to the number
# of digits in the total so the printed columns line up.
total_itter = len(params_list)

counter_width = len(str(total_itter))
itter_template = f'{{:<{counter_width}}}'

log_table = ' | '.join([
    f'{itter_template} / {itter_template}',
    'Error: {:<5.4f}',
    'Elapsed Time: {}',
])



In [7]:

# Grid search: train one forest per entry of `params_list` and record each attempt
# as its own MLflow run (parameters, MAE metric, feature-importance CSV artifact).
exp = create_experiment('Regression Approaches')

client = MlflowClient()

for i, params in enumerate(params_list):
    ini = time()

    run = client.create_run(exp.experiment_id)
    run_id = run.info.run_id

    # Runs opened through MlflowClient.create_run stay in the RUNNING state until
    # they are terminated explicitly, so the final status is tracked here and
    # always reported in the `finally` clause (even on error or interruption).
    run_status = 'FAILED'
    try:
        # Seed the forest (same seed as the train/test split) so every run is
        # reproducible; a 'random_state' entry in `params` takes precedence.
        model_params = {'random_state': 42, **params}

        model = RandomForestRegressor(**model_params)
        model.fit(x_train, y_train)

        # One row per feature, most important first.
        fi = pd.DataFrame({
            'Feature': x_train.columns,
            'Importance': model.feature_importances_,
        }).sort_values('Importance', ascending=False)

        fi_path = make_path('img/', file_name='fi')
        fi.to_csv(fi_path, index=False)

        y_pred = model.predict(x_test)
        error = mean_absolute_error(y_test, y_pred)

        for key, value in model_params.items():
            client.log_param(run_id, key, value)

        client.log_metric(run_id, 'MAE', error)
        client.log_artifact(run_id, fi_path)

        run_status = 'FINISHED'
    except KeyboardInterrupt:
        # Manual interruption of the cell: mark the run as killed, then propagate.
        run_status = 'KILLED'
        raise
    finally:
        client.set_terminated(run_id, status=run_status)

    print(log_table.format(i + 1, total_itter, error, calculate_elapsed_time(ini)))


1   / 432 | Error: 0.4420 | Elapsed Time: 0.133s
2   / 432 | Error: 0.4589 | Elapsed Time: 0.098s
3   / 432 | Error: 0.4760 | Elapsed Time: 0.112s
4   / 432 | Error: 0.4518 | Elapsed Time: 0.110s
5   / 432 | Error: 0.4634 | Elapsed Time: 0.106s
6   / 432 | Error: 0.4718 | Elapsed Time: 0.102s
7   / 432 | Error: 0.4510 | Elapsed Time: 0.180s
8   / 432 | Error: 0.4644 | Elapsed Time: 0.128s
9   / 432 | Error: 0.4746 | Elapsed Time: 0.115s
10  / 432 | Error: 0.4642 | Elapsed Time: 0.123s
11  / 432 | Error: 0.4702 | Elapsed Time: 0.114s
12  / 432 | Error: 0.4758 | Elapsed Time: 0.134s
13  / 432 | Error: 0.4608 | Elapsed Time: 0.118s
14  / 432 | Error: 0.4708 | Elapsed Time: 0.121s
15  / 432 | Error: 0.4698 | Elapsed Time: 0.104s
16  / 432 | Error: 0.4584 | Elapsed Time: 0.114s
17  / 432 | Error: 0.4639 | Elapsed Time: 0.109s
18  / 432 | Error: 0.4736 | Elapsed Time: 0.105s
19  / 432 | Error: 0.4463 | Elapsed Time: 0.119s
20  / 432 | Error: 0.4591 | Elapsed Time: 0.127s
21  / 432 | Error: 0

KeyboardInterrupt: 

<Run: data=<RunData: metrics={}, params={}, tags={}>, info=<RunInfo: artifact_uri='file:///C:/Users/danie/Desktop/Red-Wine-Quality/mlruns/2/44426afb41c24b498ea539cdaab1b656/artifacts', end_time=None, experiment_id='2', lifecycle_stage='active', run_id='44426afb41c24b498ea539cdaab1b656', run_uuid='44426afb41c24b498ea539cdaab1b656', start_time=1635472188386, status='RUNNING', user_id='unknown'>>