### Setup

In [1]:
import pandas as pd
import numpy as np
import os
from time import time

In [2]:
import numpy as np
from sklearn.datasets import make_regression
from sklearn.pipeline import make_pipeline,Pipeline
from sklearn.linear_model import ElasticNetCV, LinearRegression, Lars,LassoLarsCV
from sklearn.svm import LinearSVR, SVR
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_validate, train_test_split, RepeatedKFold, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import GradientBoostingRegressor
import matplotlib.pyplot as plt


In [3]:
from sklearn.model_selection import cross_validate, train_test_split, RepeatedKFold, GridSearchCV

In [4]:
#import sys
#sys.path.append(os.path.abspath('..'))#sys.path[0] + '/..') 
from vb_estimators import LinRegSupreme,LinSVR,RBFSVR,ENet,L1Lars
from vb_helper import VBHelper
from vb_cross_validator import regressor_q_stratified_cv

no daal4py


In [5]:

from warnings import filterwarnings
# Install an 'ignore' filter with no message/category/module restriction, so it
# matches every warning raised from here on (including convergence and
# deprecation warnings emitted while the models are fitted).
filterwarnings('ignore')


In [6]:
# Run-wide settings; edit here rather than further down the notebook.
gridpoints=5    # forwarded to every estimator constructor below
test_share=0.2  # forwarded to VBHelper
cv_folds,cv_reps=5,3
cv_count=cv_folds*cv_reps  # folds multiplied by repeats

# for stratified cv
group_count=5
strategy='quantile'

rs=2 # random_state for reproducibility
vbhelper=VBHelper(test_share,cv_folds,cv_reps,cv_count,rs)

In [7]:
# Metrics to evaluate during cross-validation, given by name.
scorer_list=['neg_mean_squared_error', 'neg_mean_absolute_error', 'r2'] #cross_validate wants strings
#cv=RepeatedKFold(n_splits=cv_folds, n_repeats=cv_reps, random_state=rs) # define separately to ensure same cv data used for each model
# Configure the helper's cross-validation with the group count and strategy from the config cell.
vbhelper.setCV(group_count=group_count,strategy=strategy)

# Attach the metric names to the helper so its later steps can read them.
vbhelper.scorer_list=scorer_list
# allow/generate water quality thresholds for stratified kfold sub-sampling to ensure cross-validation folds have full range of water quality

##### Example Dataset

In [8]:
# Read the example dataset (path is relative to the notebook's working directory)
# and show which columns it provides.
data_path=os.path.join('sample_data','ex1.csv')
df=pd.read_csv(data_path)
all_vars=df.columns.tolist()
print(all_vars)

['STA_ID', 'LONG', 'LAT', 'OrigHabCode', 'Date', 'THG_Fish', 'YEAR', 'SEASON', 'SUBAREA', 'HABCODE', 'Floc_Depth_ft', 'AFDW_Floc', 'MEHG_Floc', 'THG_floc', 'Tot_Phos_floc', 'Bulk_Dens_Floc', 'Soil_Thickness_FT', 'AFDW_Soil', 'Bulk_Dens_Soil', 'PH_soil', 'SO4_soil', 'MEHG_soil', 'THG_soil', 'Tot_Carbon_Soil_%', 'Tot_Nitrogen_Soil_%', 'Tot_Phos_soil', 'Wat_Depth_ft', 'COND_SW', 'DO_SW', 'TEMP_SW', 'PH_SW', 'TURB_SW', 'REDOX_SW', 'Alk_Phos_SW', 'CHLA_SW', 'CL_SW', 'MEHG_SW', 'NH4_SW', 'NO2_SW', 'NO3_SW', 'SO4_SW', 'Sol_Reac_Phos_SW', 'THG_SW', 'TOC_SW', 'Tot_Nitrogen_SW', 'Tot_Phos_SW', 'REDOX_PW', 'H2S_PW', 'Sol_Reac_Phos_PW', 'MEHG_Peri_AVG', 'THG_epi_peri']


In [9]:
# Target column, and the columns excluded from the predictors:
# identifiers/date, the location columns, and the target itself.
y_name='THG_Fish'
loc_vars=['LAT','LONG']
drop_vars=['Date','OrigHabCode','STA_ID',*loc_vars,y_name]

# Predictors are every remaining column, kept in the file's column order.
x_vars=[var for var in all_vars if var not in drop_vars]
X_df=df.loc[:,x_vars]
y_df=df.loc[:,y_name]

In [10]:

# Shuffle the rows with a fixed seed, applying one permutation to both X and y
# so the rows stay aligned, then hand the data to the helper and split it.
np.random.seed(0)
shuf=np.arange(len(y_df))
np.random.shuffle(shuf)
X_df,y_df=X_df.iloc[shuf],y_df.iloc[shuf]
vbhelper.setData(X_df,y_df)
X_train, X_test, y_train, y_test=vbhelper.train_test_split()

(0, 1, 2, 3) ('YEAR', 'SEASON', 'SUBAREA', 'HABCODE')


In [11]:
# Constructor arguments shared by all estimators; the two variants differ only in bestT.
est_kwargs=dict(
    gridpoints=gridpoints,
    cat_idx=vbhelper.cat_idx,
    float_idx=vbhelper.float_idx,
    bestT=False,
)
est_kwargs_bestT={**est_kwargs,'bestT':True}

# Each value is a zero-argument factory, so every call builds a fresh estimator.
# (The gradient-boosting / HistGradientBoostingRegressor entries remain disabled.)
estimator_dict1={
    'lin-reg-supreme': lambda: LinRegSupreme(**est_kwargs),
    'elastic-net': lambda: ENet(**est_kwargs),
    'linear-svr-cv': lambda: LinSVR(**est_kwargs),
    'rbf-svr-cv': lambda: RBFSVR(**est_kwargs),
    'lassolars': lambda: L1Lars(**est_kwargs),
}
estimator_dict2={
    'lin-reg-supreme-bestT': lambda: LinRegSupreme(**est_kwargs_bestT),
    'elastic-net-bestT': lambda: ENet(**est_kwargs_bestT),
    'linear-svr-cv-bestT': lambda: LinSVR(**est_kwargs_bestT),
    'rbf-svr-cv-bestT': lambda: RBFSVR(**est_kwargs_bestT),
    'lassolars-bestT': lambda: L1Lars(**est_kwargs_bestT),
}

# Merge both variants, keeping the bestT=False entries first.
estimator_dict=dict(estimator_dict1)
estimator_dict.update(estimator_dict2)
vbhelper.estimator_dict=estimator_dict
# they will be models once .fit is called
vbhelper.model_dict={name:make() for name,make in estimator_dict.items()}

In [12]:
# Fit each model on the training split, print the value of est.score on the
# train (and, when a test split exists, test) data, and report the elapsed time.
i=0  # number of models processed so far
for name,est in vbhelper.model_dict.items():
    start=time()
    i+=1
    print(name)
    est.fit(X_train,y_train)
    print('train R2:',est.score(X_train,y_train))
    if X_test is not None:
        print('test R2:',est.score(X_test,y_test))
    end=time()
    # elapsed wall-clock time, converted from seconds to minutes
    print(f'runtime:{(end-start)/60} min.\n')


lin-reg-supreme
train R2: 0.5436511787162075
test R2: 0.46007867335940555
runtime:0.3394910534222921 min.

elastic-net
train R2: 0.4649052799382922
test R2: 0.4555737126220476
runtime:0.5796512007713318 min.

linear-svr-cv
train R2: 0.5099788586436047
test R2: 0.4254598294175347
runtime:0.42827335198720295 min.

rbf-svr-cv
train R2: 0.5856578645935512
test R2: 0.4204126653959883
runtime:0.09013941685358683 min.

lassolars
train R2: 0.4522273776533162
test R2: 0.4536512580917407
runtime:0.03291046222050985 min.

lin-reg-supreme-bestT
train R2: 0.47202091442785343
test R2: 0.42102591051863447
runtime:0.10888811747233072 min.

elastic-net-bestT
train R2: 0.4884531711435921
test R2: 0.47573340034473266
runtime:0.5205970406532288 min.

linear-svr-cv-bestT
train R2: 0.4638112270199837
test R2: 0.4575537709707327
runtime:0.4306750218073527 min.

rbf-svr-cv-bestT
train R2: 0.593720647926708
test R2: 0.4561142637693395
runtime:0.08299835125605265 min.

lassolars-bestT
train R2: 0.49904382619039

In [13]:
# Run cross-validation for all registered estimators through the helper, using 10 parallel jobs.
# NOTE(review): the output recorded for this cell is "NameError: name 'time' is not defined",
# although the notebook's first cell runs `from time import time`; presumably the error is
# raised inside vb_helper rather than in this notebook — confirm and fix it there.
vbhelper.runCrossValidate(n_jobs=10)

NameError: name 'time' is not defined

In [None]:
#plot cv_yhat

#### Graphs and table to summarize results

In [None]:
# Collect, for every estimator, the per-fold test scores and their means.
# NOTE(review): `cv_results` is not assigned in any cell visible here — presumably it is
# produced by vbhelper.runCrossValidate; confirm before running on a fresh kernel.
cv_score_dict={}
cv_score_dict_means={}
for estimator_name,result in cv_results.items():
    per_fold_scores={}
    mean_scores={}
    for scorer in scorer_list:
        # cross_validate stores each metric under the key 'test_<scorer>'
        per_fold_scores[scorer]=result[f'test_{scorer}']
        mean_scores[scorer]=np.mean(per_fold_scores[scorer])
    cv_score_dict[estimator_name]=per_fold_scores
    cv_score_dict_means[estimator_name]=mean_scores

In [None]:
# Print the mean cross-validation score of every estimator, grouped by scorer.
# Estimator names are read from vbhelper.model_dict: the notebook only ever assigns
# the model dictionary as an attribute of the helper, so a bare `model_dict` would
# raise NameError on a fresh kernel.
for scorer in scorer_list:
    print(f'scores for scorer: {scorer}:')
    for estimator_name in vbhelper.model_dict:
        print(f'    {estimator_name}:{cv_score_dict_means[estimator_name][scorer]}')

In [None]:
# Pass the per-estimator, per-scorer cross-validation scores to the helper's plotting routine.
# NOTE(review): the meaning of sort=1 is defined by VBHelper.plotCVScores, which is not visible here.
vbhelper.plotCVScores(cv_score_dict,sort=1)