### Setup

In [1]:
import pandas as pd
import numpy as np
import os
from time import time

In [2]:
from vb_estimators import  LinRegSupreme,LinSVR,RBFSVR,ENet,L1Lars,GBR,HGBR,FlexiblePipe
from vb_helper import VBHelper
from vb_cross_validator import regressor_q_stratified_cv
from missing_val_transformer import missingValHandler

no daal4py


In [3]:

# Suppress all warnings from this point on; with no category/module filter
# given, the 'ignore' action applies to every warning raised in the process.
from warnings import filterwarnings

filterwarnings(action='ignore')


### Set up the experiment/project
#### Note the 'run_stacked' kwarg, which can be set to create the stacked_regressor

In [4]:
# Experiment-wide settings. `gridpoints` is forwarded to the pipelines further
# down; everything in `kwargs` is handed to VBHelper unchanged.
gridpoints=5
kwargs={
    'run_stacked':True,  # set to create the stacked regressor
    'test_share':0,  # keep at 0 for small datasets
    'cv_folds':5,
    'cv_reps':2,
    # 'cv_groupcount':5,
    'cv_strategy':('quantile',5),  # for stratified cv
    'random_state':2,  # random_state for reproducibility
}
vbhelper=VBHelper(**kwargs)

In [5]:
# Scoring metrics are given by name because cross_validate wants strings.
scorer_list=[
    'neg_mean_squared_error',
    'neg_mean_absolute_error',
    'r2',
]
vbhelper.scorer_list=scorer_list

##### User Import Dataset Step

In [6]:
# Read the sample dataset and list the columns it offers so the user can pick
# the target and predictor variables.
data_path=os.path.join('sample_data','ex1.csv')
df=pd.read_csv(data_path)
all_vars=df.columns.tolist()
print(all_vars)

['STA_ID', 'LONG', 'LAT', 'OrigHabCode', 'Date', 'THG_Fish', 'YEAR', 'SEASON', 'SUBAREA', 'HABCODE', 'Floc_Depth_ft', 'AFDW_Floc', 'MEHG_Floc', 'THG_floc', 'Tot_Phos_floc', 'Bulk_Dens_Floc', 'Soil_Thickness_FT', 'AFDW_Soil', 'Bulk_Dens_Soil', 'PH_soil', 'SO4_soil', 'MEHG_soil', 'THG_soil', 'Tot_Carbon_Soil_%', 'Tot_Nitrogen_Soil_%', 'Tot_Phos_soil', 'Wat_Depth_ft', 'COND_SW', 'DO_SW', 'TEMP_SW', 'PH_SW', 'TURB_SW', 'REDOX_SW', 'Alk_Phos_SW', 'CHLA_SW', 'CL_SW', 'MEHG_SW', 'NH4_SW', 'NO2_SW', 'NO3_SW', 'SO4_SW', 'Sol_Reac_Phos_SW', 'THG_SW', 'TOC_SW', 'Tot_Nitrogen_SW', 'Tot_Phos_SW', 'REDOX_PW', 'H2S_PW', 'Sol_Reac_Phos_PW', 'MEHG_Peri_AVG', 'THG_epi_peri']


#### The user has the option to specify a "regulatory standard"

In [7]:
#load data (re-read here so the cell gives the same result on every run,
#even though rows are removed from df further down)
data_path=os.path.join('sample_data','ex1.csv')
df=pd.read_csv(data_path)

#select variables
y_name='THG_Fish'  # target column
loc_vars=['LAT','LONG']
# columns excluded from the predictors: identifiers, location and the target
drop_vars=['Date','OrigHabCode','STA_ID']
drop_vars.extend(loc_vars)
drop_vars.append(y_name)
all_vars=list(df.columns)
x_vars=[var for var in all_vars if var not in drop_vars]

#extract random rows for prediction
predict_n=3
shuf=np.arange(df.shape[0])
seed=0  # fixed seed so the same rows are held out on every run
rng = np.random.default_rng(seed)
rng.shuffle(shuf)
predict_select=shuf[:predict_n]  # row POSITIONS of the held-out rows
X_predict=df.loc[:,x_vars].iloc[predict_select].copy()
y_predict=df.loc[:,y_name].iloc[predict_select].copy()
# predict_select holds positions, whereas drop(index=...) matches index LABELS.
# Translate through df.index so exactly the rows selected with .iloc above are
# removed, whatever index the frame carries. Reassigning (instead of
# inplace=True) keeps the removal an explicit, visible step.
df=df.drop(index=df.index[predict_select])
X_df=df.loc[:,x_vars]
y_df=df.loc[:,[y_name]]  # kept as a one-column DataFrame

In [8]:
# Hand the training predictors and target to the helper. Per the recorded
# output, this call also reports the number of duplicate rows it finds.
vbhelper.setData(X_df,y_df)

# of duplicate rows of data: 0
# of duplicate rows of X: 0


Take a look at the data

### Set up the analytical pipelines
#### Note the inner_cv_dict and prep_dict, which consolidate the vb_estimator kwargs and facilitate the division between prep and post steps.

In [9]:
# Cross-validation settings used inside the individual pipelines
# (a single repetition; the outer CV above uses two).
inner_cv_dict=dict(cv_reps=1,cv_folds=5,cv_strategy=('quantile',5))
inner_cv=vbhelper.getCV(cv_dict=inner_cv_dict)

# Settings for the prep step.
prep_dict=dict(impute_strategy='impute_knn5',cat_idx=vbhelper.cat_idx)

# do_prep is switched on for the pipelines only when run_stacked is off
# (presumably the stacked model handles prep itself - confirm in vb_helper).
needs_prep=not vbhelper.run_stacked

# Keyword arguments shared by the SVR and lasso-LARS pipelines.
pipe_kwargs={
    'do_prep':needs_prep,
    'prep_dict':prep_dict,
    'inner_cv':inner_cv,
    'gridpoints':gridpoints,
    'cat_idx':vbhelper.cat_idx,
    'float_idx':vbhelper.float_idx,
    'bestT':False,
}
# One entry per pipeline: the class to build and the kwargs to build it with.
pipe_dict={
    'gradient-boosting-reg':{
        'pipe':GBR,
        'pipe_kwargs':{'prep_dict':prep_dict,'do_prep':needs_prep},
    },
    'rbf-svr-cv':{
        'pipe':RBFSVR,
        'pipe_kwargs':pipe_kwargs,
    },
    'lassolars':{
        'pipe':L1Lars,
        'pipe_kwargs':pipe_kwargs,
    },
}
vbhelper.setPipeDict(pipe_dict) #formerly setEstimatorDict
vbhelper.setModelDict()


Run cross-validation

In [10]:
# Time the cross-validation run and report the wall-clock duration in minutes.
start=time()
# NOTE(review): try_load=True presumably reuses previously saved results when
# they exist - confirm in vb_helper.
vbhelper.runCrossValidate(try_load=True)
end=time()
elapsed_min=(end-start)/60
print(f'runtime:{elapsed_min} min.\n')


jhash:  aa5f20a9db137c08122785e91c79edd4
multi_pipe,[('neg_mean_squared_error', -7007.219952970123), ('neg_mean_absolute_error', -60.27451280124912), ('r2', 0.4843500488332192)], runtime:1.2658527851104737 min.
runtime:1.2667533953984578 min.



---

### Fit the final models for all estimators

In [11]:
# Refit only the model named 'multi_pipe' (the single model name reported in
# the cross-validation output above).
vbhelper.refitPredictiveModels(selected_models=['multi_pipe'])

In [18]:
# Predict the held-out rows with the refit model. Per the recorded output the
# result is a dict with per-model predictions, per-scorer weights and the
# weighted 'prediction' for each scorer.
yhat=vbhelper.predict(X_predict)
yhat

{'multi_pipe': array([193.77343614, 151.60281416, 114.2266863 ]),
 'weights': {'multi_pipe': {'neg_mean_squared_error': 1.0,
   'neg_mean_absolute_error': 1.0,
   'r2': 1.0}},
 'prediction': {'neg_mean_squared_error': array([193.77343614, 151.60281416, 114.2266863 ]),
  'neg_mean_absolute_error': array([193.77343614, 151.60281416, 114.2266863 ]),
  'r2': array([193.77343614, 151.60281416, 114.2266863 ])}}

In [21]:
# Observed target values of the held-out rows, for comparison with yhat.
y_predict

420    162.6
652    140.0
575     48.4
Name: THG_Fish, dtype: float64

In [16]:
# Relative prediction error for each held-out row: (predicted - observed) / observed,
# using the 'r2' entry of the combined prediction.
r2_prediction=yhat['prediction']['r2']
(r2_prediction-y_predict)/y_predict

420    0.191719
652    0.082877
575    1.360056
Name: THG_Fish, dtype: float64

In [19]:
# Predict the same held-out rows again, this time with model_type set to
# 'cross_validation' instead of the default used for yhat.
yhat_cv=vbhelper.predict(X_predict,model_type='cross_validation')

In [20]:
# Show the cross-validation predictions. The recorded output lists ten arrays
# under 'multi_pipe' - presumably one per fold and repetition (5 folds x 2 reps);
# confirm in vb_helper.
yhat_cv

{'multi_pipe': [array([192.62183397, 154.95313496, 114.38257264]),
  array([181.60079651, 139.8912016 , 129.62712158]),
  array([191.55153927, 163.08953749, 153.15662228]),
  array([176.21036286, 141.46260861,  77.13397768]),
  array([188.35610353, 139.27311688, 152.36084702]),
  array([176.0662644 , 136.76068423, 162.45384229]),
  array([198.55881302, 162.55107182, 113.37473735]),
  array([193.26532751, 149.17485897, 122.18682008]),
  array([171.88484434, 131.04093889, 133.13670021]),
  array([172.84199616, 139.46822921,  65.0461568 ])],
 'weights': {'multi_pipe': {'neg_mean_squared_error': 1.0,
   'neg_mean_absolute_error': 1.0,
   'r2': 1.0}},
 'prediction': [{'neg_mean_squared_error': array([192.62183397, 154.95313496, 114.38257264]),
   'neg_mean_absolute_error': array([192.62183397, 154.95313496, 114.38257264]),
   'r2': array([192.62183397, 154.95313496, 114.38257264])},
  {'neg_mean_squared_error': array([181.60079651, 139.8912016 , 129.62712158]),
   'neg_mean_absolute_error':