### Setup

In [1]:
import pandas as pd
import numpy as np
import os
from time import time

In [2]:
#import sys
#sys.path.append(os.path.abspath('..'))#sys.path[0] + '/..') 
from vb_estimators import  LinRegSupreme,LinSVR,RBFSVR,ENet,L1Lars,GBR,HGBR,FlexiblePipe
from vb_helper import VBHelper
from vb_cross_validator import regressor_q_stratified_cv

no daal4py


In [3]:

# Suppress every warning for the rest of the session: the 'ignore' action is installed
# without any category, module or message filter, so it matches all warnings.
# NOTE(review): this also hides any warnings emitted while the estimators below are fit — confirm that is intended.
from warnings import filterwarnings
filterwarnings('ignore')


### setup the experiment/project
#### note the 'run_stacked' kwarg that can be set to create the stacked_regressor

In [4]:
# Project-level settings: `gridpoints` is forwarded to the pipelines in a later cell,
# and `kwargs` holds the keyword arguments used to construct the VBHelper.
gridpoints=5
kwargs={
    'run_stacked':True,
    'test_share':0,  # keep at 0 for small datasets
    'cv_folds':5,
    'cv_reps':2,
    #'cv_groupcount':5,
    'cv_strategy':('quantile',5),  # for stratified cv
    'random_state':2,  # random_state for reproducibility
}
vbhelper=VBHelper(**kwargs)

In [5]:
# Scorers are given by name because cross_validate wants strings.
scorer_list=[
    'neg_mean_squared_error',
    'neg_mean_absolute_error',
    'r2',
]
vbhelper.scorer_list=scorer_list

##### User Import Dataset Step

In [6]:
# Read the example dataset and print the names of all of its columns.
data_path=os.path.join('sample_data','ex1.csv')
df=pd.read_csv(data_path)
all_vars=df.columns.tolist()
print(all_vars)

['STA_ID', 'LONG', 'LAT', 'OrigHabCode', 'Date', 'THG_Fish', 'YEAR', 'SEASON', 'SUBAREA', 'HABCODE', 'Floc_Depth_ft', 'AFDW_Floc', 'MEHG_Floc', 'THG_floc', 'Tot_Phos_floc', 'Bulk_Dens_Floc', 'Soil_Thickness_FT', 'AFDW_Soil', 'Bulk_Dens_Soil', 'PH_soil', 'SO4_soil', 'MEHG_soil', 'THG_soil', 'Tot_Carbon_Soil_%', 'Tot_Nitrogen_Soil_%', 'Tot_Phos_soil', 'Wat_Depth_ft', 'COND_SW', 'DO_SW', 'TEMP_SW', 'PH_SW', 'TURB_SW', 'REDOX_SW', 'Alk_Phos_SW', 'CHLA_SW', 'CL_SW', 'MEHG_SW', 'NH4_SW', 'NO2_SW', 'NO3_SW', 'SO4_SW', 'Sol_Reac_Phos_SW', 'THG_SW', 'TOC_SW', 'Tot_Nitrogen_SW', 'Tot_Phos_SW', 'REDOX_PW', 'H2S_PW', 'Sol_Reac_Phos_PW', 'MEHG_Peri_AVG', 'THG_epi_peri']


#### user has option to specify "regulatory standard"

In [7]:
# The user chooses the response (y) and the predictors (x).
y_name='THG_Fish'
loc_vars=['LAT','LONG']
# Columns excluded from x: 'Date', 'OrigHabCode', 'STA_ID', the location columns and the response itself.
drop_vars=['Date','OrigHabCode','STA_ID',*loc_vars,y_name]
# Keep the remaining columns in their original order.
x_vars=[col for col in all_vars if col not in drop_vars]
X_df=df.loc[:,x_vars]
y_df=df.loc[:,y_name]

In [8]:

# Shuffle the rows once with a fixed seed so the row order is reproducible.
# The shuffled frames are rebuilt from `df` (using x_vars / y_name from the previous cell)
# rather than from the current X_df / y_df, so re-running this cell gives the same row
# order instead of applying the permutation on top of already-shuffled data.
seed=0
rng = np.random.default_rng(seed)
shuf=np.arange(df.shape[0])
rng.shuffle(shuf)  # in-place permutation of the positional row indices
X_df=df.loc[:,x_vars].iloc[shuf]
y_df=df.loc[:,y_name].iloc[shuf]
vbhelper.setData(X_df,y_df)

### setup the analytical pipelines
#### note the inner_cv_dict and prep_dict that are used to consolidate vb_estimator kwargs and to facilitate the division between prep and post steps.

In [9]:
# Settings for the inner cross-validation object that is handed to the pipelines as `inner_cv`.
inner_cv_dict={'cv_reps':1,'cv_folds':5,'cv_strategy':('quantile',5)}
inner_cv=vbhelper.getCV(cv_dict=inner_cv_dict)

# Preprocessing options shared by the pipelines.
prep_dict={'impute_strategy':'impute_knn5','cat_idx':vbhelper.cat_idx}

# Keyword arguments shared by most pipelines. do_prep is the negation of run_stacked
# (NOTE(review): presumably because the stacked model does the prep step itself — confirm in vb_estimators).
pipe_kwargs=dict(do_prep=not vbhelper.run_stacked,prep_dict=prep_dict,inner_cv=inner_cv,gridpoints=gridpoints,cat_idx=vbhelper.cat_idx,float_idx=vbhelper.float_idx,bestT=False)
# Maps a pipeline name to its class ('pipe') and constructor kwargs ('pipe_kwargs').
# Commented-out entries are optional pipelines; uncomment a line to include it.
pipe_dict={
    # GBR is given only the prep settings, not the full pipe_kwargs.
    'gradient-boosting-reg':{'pipe':GBR,'pipe_kwargs':dict(
        prep_dict=prep_dict,do_prep=not vbhelper.run_stacked)},
    #'lin-reg-supreme':{'pipe':LinRegSupreme,'pipe_kwargs':pipe_kwargs},
    #'powXB-least-sq':{'pipe':FlexiblePipe,'pipe_kwargs':{**pipe_kwargs,'flex_kwargs':{'form':'powXB'}}},
    #'expXB-least-sq':{'pipe':FlexiblePipe,'pipe_kwargs':{**pipe_kwargs,'flex_kwargs':{'form':'expXB'}}}, #expXB is default
    #'nonlinear-search-least-sq': {'pipe':FlexiblePipe,'pipe_kwargs':{**pipe_kwargs,'functional_form_search':True}},
    #'robust-powXB-least-sq':{'pipe':FlexiblePipe,'pipe_kwargs':{**pipe_kwargs,'flex_kwargs':{'form':'powXB','robust':True}}},
    #'robust-expXB-least-sq':{'pipe':FlexiblePipe,'pipe_kwargs':{**pipe_kwargs,'flex_kwargs':{'form':'expXB','robust':True}}}, #expXB is default
    #'robust-nonlinear-search-least-sq': {'pipe':FlexiblePipe,'pipe_kwargs':{**pipe_kwargs,'functional_form_search':True,'flex_kwargs':{'robust':True}}},
    #'histogram-gradient-boosting-reg':{'pipe':HGBR,'pipe_kwargs':{'prep_dict':{'cat_idx':vbhelper.cat_idx}}},
    
    
    #'elastic-net':{'pipe':ENet,'pipe_kwargs':pipe_kwargs},
    #'linear-svr-cv':{'pipe':LinSVR,'pipe_kwargs':pipe_kwargs},
    'rbf-svr-cv':{'pipe':RBFSVR,'pipe_kwargs':pipe_kwargs}, 
    'lassolars':{'pipe':L1Lars,'pipe_kwargs':pipe_kwargs},
    }


# Legacy (kept for reference only):
#estimator_dict={'multi_pipe':{'pipe':MultiPipe(pipelist=[(k,v) for k,v in estimator_dict.items()],cat_idx=vbhelper.cat_idx)}
# Register the pipelines with the helper, then populate vbhelper.model_dict, which the test-run cells iterate over.
vbhelper.setPipeDict(pipe_dict) # formerly setEstimatorDict
vbhelper.setModelDict()
# Legacy (kept for reference only):
#vbhelper.model_dict={key:val() for key,val in vbhelper.estimator_dict.items()} # they will be models once .fit is called

### create a smaller test run to check runtime, debug, etc.

In [10]:
# 50/50 split used only by the small test-run cells below.
# random_state reuses the `seed` set in the shuffling cell so the split, and therefore the
# train/test R2 values printed below, are the same after a kernel restart.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test=train_test_split(X_df,y_df,test_size=0.5,random_state=seed) #just for debugging

In [11]:
# Smoke test: fit each model on the debugging split, print its train/test score
# (labelled R2) and the elapsed wall-clock time in minutes.
runtest=True
if runtest:
    for name,est in vbhelper.model_dict.items():
        t0=time()
        print(name)
        est.fit(X_train,y_train)
        print('train R2:',est.score(X_train,y_train))
        if X_test is not None:
            print('test R2:',est.score(X_test,y_test))
        elapsed_min=(time()-t0)/60  # timing covers fitting and both score calls
        print(f'runtime:{elapsed_min} min.\n')


multi_pipe
train R2: 0.6402879841330247
test R2: 0.44742136567652846
runtime:0.8441261132558187 min.



In [12]:
# For a stacked test run, score each individual fitted pipeline on the held-out debugging split.
if vbhelper.run_stacked and runtest:
    mp=vbhelper.model_dict['multi_pipe']
    fitted_ipipe_dict=mp.build_individual_fitted_pipelines()
    # Same pipelines, in the same print order, as listed in pipe_dict.
    for pipe_name in ('lassolars','gradient-boosting-reg','rbf-svr-cv'):
        print(fitted_ipipe_dict[pipe_name].score(X_test,y_test))
    
    

0.4367832529081146
0.3684239525511005
0.3758403231631384


### end small test-run debugging

---

### Fit the final models for all estimators

In [23]:
# Run the cross-validation (the saved output reports a runtime of several minutes).
# try_load=True speeds things up by reloading results if they have been run before
# with the same setup and data.
vbhelper.runCrossValidate(try_load=True)

multi_pipe,[('neg_mean_squared_error', -7067.8298618903455), ('neg_mean_absolute_error', -60.758502475688786), ('r2', 0.4807790619025935)], runtime:4.273568173249562 min.
est_n yhat test: [166.9090638  115.99347027]
est_n yhat test: [194.85110925 166.76664962]
est_n yhat test: [174.50179047 126.68168385]
est_n yhat test: [139.70812401 146.33550868]
est_n yhat test: [173.52362085 143.40533929]
est_n yhat test: [174.2710678  123.15098936]
est_n yhat test: [142.4653838  118.00589346]
est_n yhat test: [178.29265167 142.93621325]
est_n yhat test: [164.2416895  125.44156218]
est_n yhat test: [145.76483228 155.64500039]
est_n yhat test: [180.10927944 159.42072967]
est_n yhat test: [175.03078962 129.73049527]
est_n yhat test: [148.42457009 121.28402836]
est_n yhat test: [169.51104573 149.15323151]
est_n yhat test: [162.94852257 123.21175675]
est_n yhat test: [135.66019173 133.44997247]
est_n yhat test: [179.01571005 143.43809631]
est_n yhat test: [164.52333665 128.55306188]
est_n yhat test: [1

In [14]:
#vbhelper.fitFinalModelDict()

In [15]:
#plot cv_yhat

#### graphs and table to summarize results

In [16]:
# NOTE(review): presumably populates vbhelper.cv_score_dict, which the next cell displays — confirm in vb_helper.
vbhelper.buildCVScoreDict()

In [17]:
# Display the CV scores as a nested mapping: pipeline name -> scorer name -> array of scores.
vbhelper.cv_score_dict

{'gradient-boosting-reg': {'neg_mean_squared_error': array([-7691.48925745, -8168.19560515]),
  'neg_mean_absolute_error': array([-63.63023003, -65.15833111]),
  'r2': array([0.43296533, 0.39782142])},
 'rbf-svr-cv': {'neg_mean_squared_error': array([-7712.59685983, -7967.87033901]),
  'neg_mean_absolute_error': array([-60.0983606 , -61.36529471]),
  'r2': array([0.43140923, 0.41258987])},
 'lassolars': {'neg_mean_squared_error': array([-7964.42817191, -8256.18530277]),
  'neg_mean_absolute_error': array([-65.14373736, -66.73024803]),
  'r2': array([0.41284363, 0.39133461])},
 'multi_pipe': {'neg_mean_squared_error': array([-6907.36477077, -7226.58512824]),
  'neg_mean_absolute_error': array([-60.04320193, -61.47049705]),
  'r2': array([0.49077283, 0.46723916])}}

In [18]:
# Print a text summary of the CV scores, grouped by scorer.
vbhelper.viewCVScoreDict()

scores for scorer: neg_mean_squared_error:
    multi_pipe:-7066.974949502992
scores for scorer: neg_mean_absolute_error:
    multi_pipe:-60.756849491296045
scores for scorer: r2:
    multi_pipe:0.4790059956594133


In [19]:
# NOTE(review): presumably computes the cross-validated predictions that the yhat plots further down use — confirm in vb_helper.
vbhelper.predictCVYhat()

In [20]:
# Prints 'setting plotter data' when run.
# NOTE(review): presumably converts the CV results to a JSON-friendly form for the plotter — confirm in vb_helper.
vbhelper.jsonifyProjectCVResults()

setting plotter data


In [21]:
# Re-display vbhelper.cv_score_dict; the saved output matches the earlier display of the same attribute.
vbhelper.cv_score_dict

{'gradient-boosting-reg': {'neg_mean_squared_error': array([-7691.48925745, -8168.19560515]),
  'neg_mean_absolute_error': array([-63.63023003, -65.15833111]),
  'r2': array([0.43296533, 0.39782142])},
 'rbf-svr-cv': {'neg_mean_squared_error': array([-7712.59685983, -7967.87033901]),
  'neg_mean_absolute_error': array([-60.0983606 , -61.36529471]),
  'r2': array([0.43140923, 0.41258987])},
 'lassolars': {'neg_mean_squared_error': array([-7964.42817191, -8256.18530277]),
  'neg_mean_absolute_error': array([-65.14373736, -66.73024803]),
  'r2': array([0.41284363, 0.39133461])},
 'multi_pipe': {'neg_mean_squared_error': array([-6907.36477077, -7226.58512824]),
  'neg_mean_absolute_error': array([-60.04320193, -61.47049705]),
  'r2': array([0.49077283, 0.46723916])}}

In [22]:
# Always raises AssertionError, so a top-to-bottom run stops at this cell and the cells below it are not executed.
# NOTE(review): assert statements are removed when Python runs with -O; use an explicit raise if the stop must be unconditional.
assert False, 'plots moved elsewhere'

AssertionError: plots moved elsewhere

In [None]:
#vbhelper.pickleSelf() #for development in other notebooks

In [None]:
# Plot the CV scores. NOTE(review): the meaning of sort=1 is not visible in this notebook — confirm in vb_helper.
vbhelper.plotCVScores(sort=1)

In [None]:
# Plot CV predictions against y with the regulatory_standard and decision_criteria options switched off.
vbhelper.plotCVYhatVsY(regulatory_standard=False,decision_criteria=False)

#### Add table of results for the pipelines, coefficients, stats, etc. 

In [None]:
# Plot the CV predictions with single_plot=True.
vbhelper.plotCVYhat(single_plot=True) 
# TODO: make Y line faint and make dots at actual values
# TODO: add the mean of each series and make more visible
# TODO: sort by original row order or by increasing value of Y

In [None]:
# Same plot call with single_plot=False.
# NOTE(review): presumably draws a separate plot per pipeline — confirm in vb_helper.
vbhelper.plotCVYhat(single_plot=False)

### Sensitivity analysis of rows
Outlier plots
DFFITS, leverage

### Sensitivity analysis of columns/features
#### partial dependence plots (PDP in GBM)

## Next, user selects pipeline, final model is fit and ready for prediction

In [None]:
###