# Load libraries

In [None]:

# General libraries
import numpy as np
import pandas as pd
import h2o
from h2o.automl import H2OAutoML
import logging


# Custom libraries
from shared_core_data_processing import * 
from spec_core_data_processing import *

from shared_testing_data_processing import * 
from spec_testing_data_processing import *


# Other setup
logger = logging.getLogger()
logger.setLevel(logging.INFO)


# Test custom libraries

In [None]:

## Test shared function ds_common_pandas_dataframe_split_main
run_unit_test_00_ds_common_pandas_dataframe_split_main()

## Test spec function extract_expiring_minutes
run_unit_test_00_extract_expiring_minutes()


# Define parameters

In [None]:

# Processing parameters
local_path_input          = 'USA_cars_datasets.csv'

# Modelling parameters
lst_features_categorical = ['brand', 'model', 'title_status', 'state', 'country', 'color']
lst_features_numerical   = ['year', 'mileage', 'condition']
str_target_variable      = 'price'
int_seed                  = 20                                
str_data_id               = 'data_id'

# Train/test and CV split parameters
dict_metrics              = {'price': ['mean']}
dict_metrics_tolerance    = {'price_mean': 0.01}
int_max_number_iterations = 100
dict_splits               = {'train': 0.75, 'test': 0.25}
dict_splits_cv            = {'1': 0.33, '2': 0.33, '3': 0.34}
str_colname_split         = 'flag_split_train_test'
str_colname_split_cv      = 'flag_split_folds'

# Training parameters
dict_h2o_automl = {
                   'max_models':         10, 
                   'max_runtime_secs':   600,    # 10 Minutes
                   'seed':               int_seed,
                   'sort_metric':        'RMSE',
                   'stopping_tolerance': 0.001,
                   'exclude_algos':      ['XGBoost'], # Not available on windows
                   'exploitation_ratio': 0.1
                  }


# Main processing

In [None]:

### Data processing
## Load data
df = pd.read_csv(local_path_input)

## Extract expiring minutes
df['condition'] = df['condition'].map(lambda x: extract_expiring_minutes(x))

## Create data id column
df[str_data_id] = df.index


### Data split
## Add train/test and CV split columns
# train/test split column
list_input_columns   = [key for key in dict_metrics.keys()] + [str_data_id]
df_split_col         = ds_common_pandas_dataframe_split_main(df[list_input_columns].copy(), str_colname_split , dict_splits, dict_metrics, dict_metrics_tolerance, int_seed, int_max_number_iterations)
df                   = pd.merge(left = df, right = df_split_col[[str_data_id , str_colname_split]], how = "left", on = 'data_id')

# CV split column (remark: it must be done on training set only)
list_input_columns       = [key for key in dict_metrics.keys()] + [str_data_id]
df_split_col_cv          = ds_common_pandas_dataframe_split_main(df[df[str_colname_split]=='train'][list_input_columns].copy(), str_colname_split_cv , dict_splits_cv, dict_metrics, dict_metrics_tolerance, int_seed, int_max_number_iterations)
df                       = pd.merge(left = df, right = df_split_col_cv[[str_data_id , str_colname_split_cv]], how = "left", on = 'data_id')
df[str_colname_split_cv] = df[str_colname_split_cv].map(lambda x: 'missing' if pd.isnull(x) else x)


# H2O Modeling

In [5]:

# Remove constant features from list of explanatory features
lst_features          = lst_features_numerical + lst_features_categorical
lst_constant_features = ds_common_get_list_constant_features_main(df, lst_features)
lst_features          = list(set(lst_features) - set(lst_constant_features))


## Initialize H2O
h2o.init(nthreads = 1)


## Convert pandas frame to H2O frame
df_h2o = h2o.H2OFrame(df)


## Split train / test H2O
mask_train   = df_h2o[str_colname_split] == "train"
mask_test    = df_h2o[str_colname_split] == "test"
df_h2o_train = df_h2o[mask_train, :]
df_h2o_test  = df_h2o[mask_test, :] 


## Convert fold columns to factor
df_h2o_train[str_colname_split_cv] = df_h2o_train[str_colname_split_cv].asfactor()


## Explore splits target variable mean and variance
logging.info(f'Train vs test target variable mean:     {df_h2o_train[str_target_variable].mean()[0]} vs {df_h2o_test[str_target_variable].mean()[0]}')
logging.info(f'Train vs test target variable variance: {df_h2o_train[str_target_variable].var()} vs {df_h2o_test[str_target_variable].var()}')


## Optimize memory usage
del df
h2o.remove(df_h2o)


## Fit H2O Automl on train set
aml = H2OAutoML(**dict_h2o_automl)
aml.train(x=lst_features, y=str_target_variable, training_frame=df_h2o_train, fold_column = str_colname_split_cv)


## Explore Automl cross-validation results
# Remark: the leaderboard contains the cross-validated metrics
lb = aml.leaderboard
lb.head(rows=lb.nrows)


  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,03 secs
H2O_cluster_timezone:,Europe/Berlin
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.30.0.4
H2O_cluster_version_age:,"1 year, 10 months and 21 days !!!"
H2O_cluster_name:,H2O_from_python_david_4frkb8
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,1.762 Gb
H2O_cluster_total_cores:,4
H2O_cluster_allowed_cores:,1


Parse progress: |█████████████████████████████████████████████████████████| 100%


INFO:root:Train vs test target variable mean:     18769.669185105253 vs 18761.941176470602
INFO:root:Train vs test target variable variance: 149831594.49363422 vs 138321938.14227074


AutoML progress: |
15:48:49.718: Fold column flag_split_folds will be used for cross-validation. nfolds parameter will be ignored.
15:48:49.719: Stopping tolerance set by the user is < 70% of the recommended default of 0.02323069967623409, so models may take a long time to converge or may not converge at all.

████████████████████████████████████████████████████████| 100%


model_id,rmse,mean_residual_deviance,mse,mae,rmsle
StackedEnsemble_BestOfFamily_AutoML_20220422_154849,6745.12,45496600.0,45496600.0,4194.34,
StackedEnsemble_AllModels_AutoML_20220422_154849,6746.21,45511400.0,45511400.0,4182.58,
GBM_1_AutoML_20220422_154849,6904.85,47677000.0,47677000.0,4186.71,
GBM_2_AutoML_20220422_154849,6935.78,48105100.0,48105100.0,4204.46,
GBM_3_AutoML_20220422_154849,6955.95,48385200.0,48385200.0,4211.05,
GBM_grid__1_AutoML_20220422_154849_model_1,6981.26,48738000.0,48738000.0,4386.39,
DRF_1_AutoML_20220422_154849,6985.82,48801700.0,48801700.0,4156.29,1.07768
GBM_4_AutoML_20220422_154849,7005.66,49079300.0,49079300.0,4225.1,
GBM_5_AutoML_20220422_154849,7091.99,50296300.0,50296300.0,4464.77,
DeepLearning_1_AutoML_20220422_154849,7545.77,56938600.0,56938600.0,5113.93,




# Explore top model 

In [6]:

top_ml_model = h2o.get_model(lb.as_data_frame()['model_id'][0])


In [7]:

# Test set
top_ml_model.model_performance(df_h2o_test)



ModelMetricsRegressionGLM: stackedensemble
** Reported on test data. **

MSE: 30051598.834669862
RMSE: 5481.933859019996
MAE: 3626.344927210771
RMSLE: NaN
R^2: 0.7824047951828679
Mean Residual Deviance: 30051598.834669862
Null degrees of freedom: 645
Residual degrees of freedom: 641
Null deviance: 89217688682.25267
Residual deviance: 19413332847.19673
AIC: 12968.372054379854




In [8]:

# Train set
top_ml_model.model_performance(df_h2o_train)



ModelMetricsRegressionGLM: stackedensemble
** Reported on test data. **

MSE: 10610582.884841708
RMSE: 3257.3889673850294
MAE: 2053.077049419546
RMSLE: NaN
R^2: 0.9291450366183611
Mean Residual Deviance: 10610582.884841708
Null degrees of freedom: 1852
Residual degrees of freedom: 1848
Null deviance: 277488113002.2109
Residual deviance: 19661410085.611687
AIC: 35247.23881721631




# Explain H2O Automl

In [9]:

aml.explain(df_h2o_test)


AttributeError: 'H2OAutoML' object has no attribute 'explain'