### This notebook is a brief analysis of how the AutoML RMSE changes when training on DataFrame subsets of different sizes.
The objective is to assess the robustness of the results to the training-set size.
#### Each AutoML run is limited to 20 models (`max_models=20`)

In [8]:
import os 
import sys
import pickle
import h2o

import pandas as pd

from h2o.automl import H2OAutoML

In [4]:
# Show at most 6 rows when pandas renders a frame, keeping notebook outputs short.
pd.options.display.max_rows = 6

In [5]:
# Load the structured dataset from its pickle file.
df = pd.read_pickle("../../data/structured/df.pkl")

# IA filter: keep only the rows whose `type_bool` flag equals True.
ia_mask = df["type_bool"] == True
df = df[ia_mask]

# X is the label slice from column 'f1' through 'REDSHIFT_SPEC' (label slices are
# inclusive, so X also carries the target column); y is the target column alone.
X = df.loc[:, "f1":"REDSHIFT_SPEC"]
y = df["REDSHIFT_SPEC"]

In [21]:
# Connect to the H2O instance at localhost:54322. Per the recorded output below,
# a local H2O server is started when no running instance is found at that address.
h2o.init(ip="localhost", port=54322)

Checking whether there is an H2O instance running at http://localhost:54322 ..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "11.0.9.1" 2020-11-04; OpenJDK Runtime Environment (build 11.0.9.1+1-Ubuntu-0ubuntu1.18.04); OpenJDK 64-Bit Server VM (build 11.0.9.1+1-Ubuntu-0ubuntu1.18.04, mixed mode, sharing)
  Starting server from /home/fmoliveira/.local/lib/python3.6/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmp3oyartz4
  JVM stdout: /tmp/tmp3oyartz4/h2o_fmoliveira_started_from_python.out
  JVM stderr: /tmp/tmp3oyartz4/h2o_fmoliveira_started_from_python.err
  Server is running at http://127.0.0.1:54326
Connecting to H2O server at http://127.0.0.1:54326 ... successful.


0,1
H2O_cluster_uptime:,02 secs
H2O_cluster_timezone:,America/Sao_Paulo
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.30.1.3
H2O_cluster_version_age:,1 month and 19 days
H2O_cluster_name:,H2O_from_python_fmoliveira_lri8wg
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.881 Gb
H2O_cluster_total_cores:,4
H2O_cluster_allowed_cores:,4


### Preparing DataFrames of different sizes

In [22]:
# Training-set sizes to evaluate; run `i` trains on the first `i` rows of X.
divisions = [500,1000,1500,2000,2500,3000,3500,4000,4500,5078]

# Predictor columns: the label slice 'f1'..'f20' of df. It is the same for every
# run, so it is computed once outside the loop.
feature_columns = list(df.loc[:, 'f1':'f20'].columns)

# Maps 'size_<n>' -> the H2OAutoML object trained on the first n rows.
amls = {}
for i in divisions:
    aml = H2OAutoML(max_models=20, seed=1)
    # Upload the first i rows of X (predictors plus the target column) to H2O.
    train_frame = h2o.H2OFrame(X.iloc[:i])
    aml.train(x=feature_columns, y='REDSHIFT_SPEC', training_frame=train_frame)
    # Print this run's leaderboard as a pandas DataFrame.
    print(aml.leaderboard.as_data_frame(use_pandas=True))
    amls[f'size_{i}'] = aml

Parse progress: |█████████████████████████████████████████████████████████| 100%
AutoML progress: |████████████████████████████████████████████████████████| 100%
                                             model_id  mean_residual_deviance  \
0    StackedEnsemble_AllModels_AutoML_20201116_235016                0.009922   
1   StackedEnsemble_BestOfFamily_AutoML_20201116_2...                0.009967   
2                        GBM_2_AutoML_20201116_235016                0.010381   
3                        GBM_4_AutoML_20201116_235016                0.010488   
4                        GBM_3_AutoML_20201116_235016                0.010525   
5          GBM_grid__1_AutoML_20201116_235016_model_1                0.010598   
6                        GBM_1_AutoML_20201116_235016                0.010772   
7          GBM_grid__1_AutoML_20201116_235016_model_2                0.010835   
8                        XRT_1_AutoML_20201116_235016                0.010990   
9                        DRF

In [34]:
# Display the mapping from 'size_<n>' keys to their trained H2OAutoML objects.
amls

{'size_500': <h2o.automl.autoh2o.H2OAutoML at 0x7f006b5576d8>,
 'size_1000': <h2o.automl.autoh2o.H2OAutoML at 0x7f006b5567b8>,
 'size_1500': <h2o.automl.autoh2o.H2OAutoML at 0x7f006b54aef0>,
 'size_2000': <h2o.automl.autoh2o.H2OAutoML at 0x7f006b531cc0>,
 'size_2500': <h2o.automl.autoh2o.H2OAutoML at 0x7f006b51d668>,
 'size_3000': <h2o.automl.autoh2o.H2OAutoML at 0x7f006b51f358>,
 'size_3500': <h2o.automl.autoh2o.H2OAutoML at 0x7f006b524b38>,
 'size_4000': <h2o.automl.autoh2o.H2OAutoML at 0x7f006b5703c8>,
 'size_4500': <h2o.automl.autoh2o.H2OAutoML at 0x7f006b569e48>,
 'size_5078': <h2o.automl.autoh2o.H2OAutoML at 0x7f006b5b8198>}

In [35]:
# Convert every run's leaderboard to a DataFrame, keyed by the same 'size_<n>'
# label used in `amls`.
results = {
    size_label: automl_run.leaderboard.as_data_frame()
    for size_label, automl_run in amls.items()
}

In [39]:
# Display the per-size leaderboard DataFrames collected above.
results

{'size_500':                                              model_id  mean_residual_deviance  \
 0    StackedEnsemble_AllModels_AutoML_20201116_235016                0.009922   
 1   StackedEnsemble_BestOfFamily_AutoML_20201116_2...                0.009967   
 2                        GBM_2_AutoML_20201116_235016                0.010381   
 3                        GBM_4_AutoML_20201116_235016                0.010488   
 4                        GBM_3_AutoML_20201116_235016                0.010525   
 5          GBM_grid__1_AutoML_20201116_235016_model_1                0.010598   
 6                        GBM_1_AutoML_20201116_235016                0.010772   
 7          GBM_grid__1_AutoML_20201116_235016_model_2                0.010835   
 8                        XRT_1_AutoML_20201116_235016                0.010990   
 9                        DRF_1_AutoML_20201116_235016                0.011121   
 10                   XGBoost_3_AutoML_20201116_235016                0.011597   
 11 

In [44]:
# Persist the per-size leaderboards to disk. The context manager closes (and thereby
# flushes) the file handle even if pickling raises; a bare `open(...)` passed straight
# to `pickle.dump` is never explicitly closed.
with open('../../data/enriched/evaluation_set_sizes_leaderboards.pkl', 'wb') as leaderboards_file:
    pickle.dump(results, leaderboards_file)

In [2]:
# Reload the saved leaderboards so the analysis below can run without retraining.
# The path literal previously lacked its closing quote (a SyntaxError); the file is
# now also opened in a context manager so the handle is always closed.
# NOTE(review): pickle.load can execute arbitrary code; only load files you trust.
with open('../../data/enriched/evaluation_set_sizes_leaderboards.pkl', 'rb') as leaderboards_file:
    results = pickle.load(leaderboards_file)

## ANS: The RMSE of the top leaderboard model for each DataFrame size is:

In [10]:
# Report the RMSE of the first leaderboard row for every training-set size.
for size_label, leaderboard in results.items():
    top_rmse = leaderboard.iloc[0]["rmse"]
    print(f'RMSE from {size_label} : {top_rmse}')

RMSE from size_500 : 0.09961174060221616
RMSE from size_1000 : 0.09651164679233867
RMSE from size_1500 : 0.09437005674909003
RMSE from size_2000 : 0.09211935261201423
RMSE from size_2500 : 0.09125601655626163
RMSE from size_3000 : 0.09025433133199964
RMSE from size_3500 : 0.09013429283580278
RMSE from size_4000 : 0.0881795003174765
RMSE from size_4500 : 0.0865652232731532
RMSE from size_5078 : 0.08696055831503532
