In [1]:
import mlflow
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt

from hyperopt import fmin, tpe, Trials, STATUS_OK, hp, space_eval
from functools import partial

from pickle import dump

from scripts.Preprocessing import Preprocessing
from scripts.BinaryClassificationTraining import BinaryClassificationTraining

from scripts.config import (year_month_train, 
    input_data_path_train,
    seed)

In [2]:
# Run configuration: reuse the training-period values imported from scripts.config.
input_data_path = input_data_path_train
year_month = year_month_train

# Local folder for the files written by this notebook. The trailing slash matters:
# file names are appended to it by plain string concatenation.
local_path_save = './local_artifacts_tmp/05_Decision_Tree/'

### MLflow settings

In [3]:
# Create the local output folder. exist_ok=True removes the check-then-create
# race and keeps the cell safe to re-run.
os.makedirs(local_path_save, exist_ok=True)

# Save all MLflow metadata in a local SQLite database (mlflow.db).
# NOTE(review): the artifact location is not configured here, so MLflow's
# default is used — confirm where artifacts actually end up.
mlflow.set_tracking_uri("sqlite:///mlflow.db")

# Name of the experiment
exp_name = "05 - Decision Tree"
# Set up the MLflow experiment (it is created the first time it is requested).
# set_experiment returns an Experiment entity, not an id, so the id is read
# from it explicitly.
experiment = mlflow.set_experiment(exp_name)
experiment_id = experiment.experiment_id

2023/04/25 15:19:28 INFO mlflow.tracking.fluent: Experiment with name '05 - Decision Tree' does not exist. Creating a new experiment.


### Experiments

In [5]:
# Label of this run: passed as run_name to the training helper and used as the
# file-name prefix of the pickled preprocessing objects.
run_name = "april2023_base"

In [6]:
# Training helper for this notebook, built from the input data path, the local
# output folder, the year/month label and the model identifier 'decision_tree'.
decision_tree_training = BinaryClassificationTraining(
    input_data_path, local_path_save, year_month, 'decision_tree'
)

In [7]:
# Load the features and the target for the classification task.
prepr = Preprocessing(input_data_path, task_type='classification')
X, Y = prepr.read_dataframe(request_tgt=True)

# Hold out 30% of the rows for evaluation; the seed keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=seed)

# Preprocessing (one-hot encoding + scaling). The train call is made with
# fit_ohe=True; the encoder and scaler it returns are passed to the test call
# (fit_ohe=False) so that the test split reuses them.
shapes_pre = (X_train.shape[0], X_test.shape[0])
X_train_ohe, ohe, scaler = prepr.preprocess_for_classification(
    df=X_train,
    fit_ohe=True,
    perform_scaling=True,
)
X_test_ohe, _, _ = prepr.preprocess_for_classification(
    df=X_test,
    fit_ohe=False,
    ohe=ohe,
    perform_scaling=True,
    scaler=scaler,
)

# Preprocessing must not add or drop rows, otherwise the features would no
# longer line up with Y_train / Y_test. Check the inputs (in-place changes)
# and the transformed outputs.
assert shapes_pre == (X_train.shape[0], X_test.shape[0]), \
    "preprocessing changed the number of rows of the input frames"
assert shapes_pre == (X_train_ohe.shape[0], X_test_ohe.shape[0]), \
    "preprocessed features do not have the same number of rows as the inputs"

# Persist the fitted preprocessing objects. The context managers guarantee
# that the files are flushed and closed.
with open(local_path_save + run_name + '_ohe.pkl', 'wb') as ohe_file:
    dump(ohe, ohe_file)
with open(local_path_save + run_name + '_scaler.pkl', 'wb') as scaler_file:
    dump(scaler, scaler_file)

  df['lpep_pickup_datetime_week'] = df['lpep_pickup_datetime'].dt.week
  df['lpep_pickup_datetime_week'] = df['lpep_pickup_datetime'].dt.week


In [8]:
# Compute the baseline for this run from the train and test targets.
decision_tree_training.calculate_classification_baseline(
    Y_train=Y_train,
    Y_test=Y_test,
    run_name=run_name,
)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'loss': 0.5, 'status': 'ok'}

In [9]:
# TODO complete the experiment using your implementation from BinaryClassificationTraining

In [11]:
# Number of hyperopt evaluations per depth. Every hyperparameter is fixed
# inside one iteration, so a single evaluation is enough.
max_evals = 1

# Best loss recorded by hyperopt for each max_depth, filled in by the loop
# and displayed as the output of the cell.
best_loss_by_depth = {}

# Same depths as np.arange(2, 20, 3), but as plain Python ints.
for depth in range(2, 20, 3):

    # Here we can decide which hyperparameters we want to tune
    classification_tree_parameters_search = {
        'max_depth': depth,
        'random_state': seed
    }

    decision_tree_training.set_hyperparameter_space(classification_tree_parameters_search)

    # Fresh Trials object per depth so results are not mixed between depths.
    trials = Trials()

    best_result = fmin(
        fn=partial(decision_tree_training.objective_decision_tree,
            X_train=X_train_ohe,
            X_test=X_test_ohe,
            Y_train=Y_train,
            Y_test=Y_test,
            run_name=run_name,
            threshold=0.5),
        space=decision_tree_training.hp_space,
        algo=tpe.suggest,
        max_evals=max_evals,
        trials=trials,
        rstate=np.random.default_rng(seed)
    )

    # A bare expression inside a loop is never displayed, so keep the loss of
    # the best trial for the summary below.
    best_loss_by_depth[depth] = trials.best_trial['result']['loss']

best_loss_by_depth

100%|██████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.89s/trial, best loss: 0.5850768124270515]
  0%|                                                                                    | 0/1 [00:00<?, ?trial/s, best loss=?]




100%|██████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.31s/trial, best loss: 0.5966125820801937]
100%|██████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.41s/trial, best loss: 0.5847588427689119]
100%|██████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.28s/trial, best loss: 0.5549439076537586]
100%|███████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.58s/trial, best loss: 0.539816999389641]
100%|██████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.61s/trial, best loss: 0.5314398820210614]
