In [1]:
import numpy as np
import pandas as pd

import pickle
import ast
from itertools import combinations

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from sklearn import set_config
from sklearn.utils import shuffle
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, QuantileTransformer, PowerTransformer, PolynomialFeatures
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, cross_val_predict, cross_val_score ,train_test_split, LeaveOneOut
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import SequentialFeatureSelector, SelectKBest, SelectPercentile, mutual_info_regression

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, AdaBoostRegressor, GradientBoostingRegressor

import category_encoders as ce
import missingno as msno

from patsy import dmatrices
from statsmodels.compat import lzip
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.formula.api import ols
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.compat import lzip

from constants import *

%matplotlib inline

# Setup

In [2]:
set_config(transform_output="pandas")

In [3]:
html_folder = 'C:\\Users\\Brayden\\Desktop\\Personal Website\\Brayden-L.github.io\\_includes\\complex_linear_regression_routes\\'

In [4]:
df = pd.read_pickle('All_Loc_Cleaned_Stripped.pkl')

In [5]:
# This undoes the complex length imputation performed in "Length_Imputation.ipynb" for nullity analysis

undo_length_imp_bool = False

if undo_length_imp_bool:
        df1 = pd.read_csv('All_Loc.csv')
        # Create Route ID Column for joining and remove duplicates
        if "Route ID" not in df1.columns:
                df1.insert(len(df1.columns), "Route ID", "")
        df1["Route ID"] = df1["URL"].apply(lambda x: int(x.split("/")[4]))

        df1.drop_duplicates('Route ID', inplace=True)
        df1[['Route ID', 'Length']]
        df.drop('Length', axis=1, inplace=True)
        df = df.merge(df1[['Route ID', 'Length']], how='left', on='Route ID')

In [6]:
df = shuffle(df)

In [7]:
star_vote_cutoff = 5
bayesian_stars=True

# Cleaning

In [8]:
# Route ID
df.set_index('Route ID', inplace=True)

## Response Variable Cleaning

In [9]:
o_size = df.shape[0]

In [10]:
# Drop values with not enough vote ratings
df.drop(df[df['Num Star Ratings']<=star_vote_cutoff].index, axis=0, inplace=True)

In [11]:
# Drop MP entries
df.drop(df[df['SP/MP']=='MP'].index, axis=0, inplace=True)

In [12]:
df.shape[0]

12990

In [13]:
df.shape[0]/o_size

0.449620989235402

In [14]:
if bayesian_stars:
    m = df['Avg Stars'].mean()
    c = df['Num Star Ratings'].quantile(0.25)
    df['Bayesian Stars'] = ((df['Avg Stars'] * df['Num Star Ratings']) + (c*m)) / (df['Num Star Ratings'] + c)

    # Compare old rating scale to new
    x0 = df['Avg Stars']
    x1 = df['Bayesian Stars']
    rate_comp_df =pd.DataFrame(dict(
        series=np.concatenate((["Average Stars"]*len(x0), ["Bayesian Stars"]*len(x1))), 
        data  =np.concatenate((x0,x1))
    ))
    fig = px.histogram(rate_comp_df, color='series', barmode='overlay', marginal='box')
    fig.update_layout(title='Avg Vs. Bayesian Stars Histogram', title_x=0.5)
    fig.update_xaxes(title="Stars", row=1,col=1)
    fig.update_yaxes(title="Count")
    # fig.write_html(html_folder + 'Avg_Vs_Baye_Hist.html')
    fig.show()
else:
    fig = px.histogram(df['Avg Stars'])
    fig.show()

In [15]:
# Homogenize Rating
def grade_homo(df_source, r_type, r_direction, b_type, b_direction):
    """
    Reassigns grades to a single YDS or Vgrade schema.

    Parameters
    ----------
    df_source : df
        Original route df.
    r_type : str [letter, sign]
        YDS letter or sign style grades.
    r_direction : str [up, down, even_rand, manual]
        Unused if r_type='letter'. Which way to assign grades. even_rand rounds a randomly selected half up and the randomly remaining half down.
    b_type : str [flat, sign]
        Vgrade flat grades or include sign grades.
    b_direction : str [up, down, even_rand, manual]
        Used for both b_type.

    Return
    ------
    df_source : df
        Original df with grade homogenization
    """
    rating_isolate = df_source["Original Rating"].apply(
        lambda row: [val for val in row.split()][0]
    )  # This is a fail-safe to ensure we are only looking at the part of the rating we care about, not risk or sub-ratings.

    # Reset 'Rating' column so this mapping can be re-run
    df_source["Rating"] = df_source["Original Rating"]

    # Roped Grades
    def grademoderate():
        grade_change_subset = rating_isolate.isin(list(rgrademoderatemap.keys()))
        df_source.loc[grade_change_subset, "Rating"] = df_source.loc[
            grade_change_subset
        ]["Original Rating"].map(rgrademoderatemap)

    def grade_split(upmap, downmap):
        grade_change_subset = rating_isolate.isin(list(upmap.keys()))
        grade_change_subset_df = df_source[grade_change_subset]
        for grade in grade_change_subset_df["Original Rating"].unique():
            to_change = grade_change_subset_df[
                grade_change_subset_df["Original Rating"] == grade
            ]
            changed_up = to_change.sample(frac=0.5)["Original Rating"].map(upmap)
            df_source.loc[changed_up.index, "Rating"] = changed_up
        grade_change_subset = rating_isolate.isin(list(downmap.keys()))
        grade_change_subset_df = df_source[grade_change_subset]
        for grade in grade_change_subset_df["Original Rating"].unique():
            to_change = grade_change_subset_df[
                grade_change_subset_df["Original Rating"] == grade
            ]
            changed_down = to_change["Original Rating"].map(downmap)
            df_source.loc[changed_down.index, "Rating"] = changed_down

    if r_type == "sign":
        grade_change_subset = rating_isolate.isin(list(rgradecompmap.keys()))
        df_source.loc[grade_change_subset, "Rating"] = df_source[grade_change_subset][
            "Original Rating"
        ].map(rgradecompmap)
    else:
        if r_direction == "up":
            grademoderate()
            grade_change_subset = rating_isolate.isin(list(rgradeupmap.keys()))
            df_source.loc[grade_change_subset, "Rating"] = df_source[
                grade_change_subset
            ]["Original Rating"].map(rgradeupmap)
        if r_direction == "down":
            grademoderate()
            grade_change_subset = rating_isolate.isin(list(rgradedownmap.keys()))
            df_source.loc[grade_change_subset, "Rating"] = df_source[
                grade_change_subset
            ]["Original Rating"].map(rgradedownmap)
        if r_direction == "even_rand":
            grademoderate()
            grade_split(rgradeupmap, rgradedownmap)

    # Boulder Grades
    if b_type == "flat":
        # Remove all + and - grades
        grade_change_subset = rating_isolate.isin(list(bgradeconmapflat.keys()))
        df_source.loc[grade_change_subset, "Rating"] = df_source[grade_change_subset][
            "Original Rating"
        ].map(bgradeconmapflat)

        if b_direction == "up":
            grade_change_subset = rating_isolate.isin(list(bgradeupmapflat.keys()))
            df_source.loc[grade_change_subset, "Rating"] = df_source[
                grade_change_subset
            ]["Original Rating"].map(bgradeupmapflat)
        if b_direction == "down":
            grade_change_subset = rating_isolate.isin(list(bgradedownmapflat.keys()))
            df_source.loc[grade_change_subset, "Rating"] = df_source[
                grade_change_subset
            ]["Original Rating"].map(bgradedownmapflat)
        if b_direction == "even_rand":
            grade_split(bgradeupmapflat, bgradedownmapflat)

    if b_type == "sign":
        if b_direction == "up":
            grade_change_subset = rating_isolate.isin(list(bgradeupmapsign.keys()))
            df_source.loc[grade_change_subset, "Rating"] = df_source[
                grade_change_subset
            ]["Original Rating"].map(bgradeupmapsign)
        if b_direction == "down":
            grade_change_subset = rating_isolate.isin(list(bgradedownmapsign.keys()))
            df_source.loc[grade_change_subset, "Rating"] = df_source[
                grade_change_subset
            ]["Original Rating"].map(bgradedownmapsign)
        if b_direction == "even_rand":
            grade_split(bgradeupmapsign, bgradedownmapsign)

    return df_source

df['Original Rating'] = df['Rating']
df = grade_homo(df, 'letter', 'even_rand', 'flat', 'down')
df.drop('Original Rating', axis=1, inplace=True)

## Predictor Cleaning

### Null Handling

In [16]:
null_plot_bool = False

nulldf = df[['Rating', 'Length', 'Area Latitude', 'Area Longitude', 'Risk', 'Num Ticks', 'Lead Ratio', 'OS Ratio', 'Repeat Sender Ratio', 'Mean Attempts To RP', 'Route Type']]

In [17]:
if null_plot_bool:
    msno.matrix(nulldf)

In [18]:
if null_plot_bool:
    msno.heatmap(nulldf)

In [19]:
if null_plot_bool:
    msno.dendrogram(nulldf)

In [20]:
(nulldf.isnull().sum()/df.shape[0])*100

Rating                  0.246343
Length                  0.000000
Area Latitude           0.000000
Area Longitude          0.000000
Risk                   93.941493
Num Ticks               0.046189
Lead Ratio              0.400308
OS Ratio                2.609700
Repeat Sender Ratio     3.903002
Mean Attempts To RP    28.221709
Route Type              0.007698
dtype: float64

In [21]:
# Remove null rating and route type values which are likely 3rd, 4th class etc. Not relevant.
df.drop(df[df['Rating'].isna()].index, inplace=True)
df.drop(df[df['SP/MP'].isna()].index, inplace=True)
df.drop(df[df['Route Type'].isna()].index, inplace=True)
df.drop(df[df['Route Type']=='Boulder'].index, inplace=True)

In [22]:
df.shape[0]

12946

### Encoding

In [23]:
# Risk
# Replace NA with 'G'
df['Risk'] = df['Risk'].cat.set_categories(['G', 'PG13', 'R', 'X'], ordered=True)
df.loc[df['Risk'].isna(), 'Risk'] = 'G'

In [24]:
# Change Trad/Sport column to "Is Trad"
df['Is Trad'] = df['Route Type'].map({"Trad":1, "Sport":0}).astype(int)
df.drop('Route Type', axis=1, inplace=True)
# Change SP/MP solumn to "Is SP"
df['Is SP'] = df['SP/MP'].map({'SP':1, 'MP':0}).astype(int)
df.drop('SP/MP', axis=1, inplace=True)
# Change Length Missing to binary
df['Length Missing'] = df['Length Missing'].map({True:1, False:0}).astype(int)

In [25]:
# Encode Risk Ordinal
ce_ord_risk = ce.OrdinalEncoder(cols=['Risk'])
df = ce_ord_risk.fit_transform(df)

In [26]:
# Encode Rating Ordinal
ce_ord_rating = ce.OrdinalEncoder(cols=['Rating'])
df = ce_ord_rating.fit_transform(df)

In [27]:
# sns.pairplot(df[['Avg Stars', 'Lead Ratio', 'OS Ratio', 'Repeat Sender Ratio', 'Mean Attempts To RP']])

### Imputation

In [28]:
# Length imputation was done prior to this on the whole data set. It is a pretty involved custom imputation.

In [29]:
# Num Ticks and Num Tickers
df.loc[df['Num Ticks'].isna(), 'Num Ticks'] = 0
df.loc[df['Num Tickers'].isna(), 'Num Tickers'] = 0

In [30]:
preproc_feature_list = ['Rating', 'Length', 'Area Latitude', 'Area Longitude', 'Risk', 'Num Ticks', 'Lead Ratio', 'OS Ratio', 'Repeat Sender Ratio', 'Mean Attempts To RP', 'Is Trad']

tick_metric_simp_imp_bool = True
missing_ind_bool = False

if tick_metric_simp_imp_bool:
    feat_impute_method = make_column_transformer((SimpleImputer(strategy='median', add_indicator=missing_ind_bool), ['OS Ratio', 'Lead Ratio']), 
                                                 (SimpleImputer(strategy='constant', fill_value=1, add_indicator=missing_ind_bool), ['Repeat Sender Ratio', 'Mean Attempts To RP']), 
                                                 remainder='passthrough', verbose_feature_names_out=False)
else:
    feat_impute_method = make_column_transformer((IterativeImputer(min_value=0, add_indicator=missing_ind_bool), preproc_feature_list), 
                                                 remainder='passthrough', verbose_feature_names_out=False)
feat_impute_method.fit_transform(df[preproc_feature_list])

Unnamed: 0_level_0,OS Ratio,Lead Ratio,Repeat Sender Ratio,Mean Attempts To RP,Rating,Length,Area Latitude,Area Longitude,Risk,Num Ticks,Is Trad
Route ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
105721801,0.857143,0.666667,1.000000,1.000000,1,70.0,34.04253,-116.14939,1,69.0,0
107694338,0.666667,0.750000,1.000000,1.000000,2,75.0,41.22760,-105.40710,1,34.0,0
106129192,0.687500,0.466667,1.166667,1.500000,3,80.0,37.52838,-118.57396,1,103.0,1
111730442,0.000000,0.312500,1.000000,1.000000,2,55.0,49.70461,-123.14131,1,19.0,0
106224333,0.750000,0.875000,1.000000,2.000000,4,100.0,37.88397,-119.43086,1,28.0,0
...,...,...,...,...,...,...,...,...,...,...,...
107874765,0.779412,0.575342,1.140351,1.428571,3,60.0,38.11730,-109.57570,1,195.0,1
108523687,0.714286,0.600000,1.000000,1.000000,3,130.0,36.15151,-115.43125,1,26.0,0
110165436,0.272727,0.181818,1.000000,1.000000,15,50.0,47.81956,-121.57498,1,91.0,1
105723895,0.860000,0.580952,1.021739,1.333333,7,60.0,34.00470,-116.05874,1,143.0,1


In [31]:
(df.isnull().sum()/df.shape[0])*100

Route                   0.000000
Location                0.000000
Avg Stars               0.000000
Num Star Ratings        0.000000
Rating                  0.000000
Pitches                 0.000000
Length                  0.000000
Length Missing          0.000000
Area Latitude           0.000000
Area Longitude          0.000000
Risk                    0.000000
Base Location           0.000000
Num Ticks               0.000000
Num Tickers             0.000000
Lead Ratio              0.347598
OS Ratio                2.549050
Repeat Sender Ratio     3.823575
Mean Attempts To RP    28.070446
Bayesian Stars          0.000000
Is Trad                 0.000000
Is SP                   0.000000
dtype: float64

## Outliers

In [32]:
# Remove outliers
repeat_sender_ratio_cutoff = 1.6
mean_attempt_to_rp_cutoff = 3.1
print(df[df['Repeat Sender Ratio']>repeat_sender_ratio_cutoff].shape[0])
df.drop(df[df['Repeat Sender Ratio']>repeat_sender_ratio_cutoff].index, axis=0, inplace=True)
print(df[df['Mean Attempts To RP']>mean_attempt_to_rp_cutoff].shape[0])
df.drop(df[df['Mean Attempts To RP']>mean_attempt_to_rp_cutoff].index, axis=0, inplace=True)

93
59


In [33]:
# 28% of values are null
# 29% are a near default 1
# 2.1% are outliers

In [34]:
# px.histogram(df['Mean Attempts To RP'], marginal='box')

In [35]:
# px.histogram(df[df['Mean Attempts To RP']>1]['Mean Attempts To RP'], marginal='box')

In [36]:
# 3.8% are null
# 58.5% are near default 1
# 3.4% outliers

In [37]:
# px.histogram(df['Repeat Sender Ratio'], marginal='box')

In [38]:
# px.histogram(df[df['Repeat Sender Ratio']>1]['Repeat Sender Ratio'], marginal='box')

In [39]:
df.shape[0]

12794

## EDA

In [40]:
# sns.pairplot(df[['Rating', 'Is Trad', 'Risk', 'Length', 'Area Latitude', 'Area Longitude', 'Num Ticks', 'Lead Ratio', 'OS Ratio', 'Repeat Sender Ratio', 'Mean Attempts To RP']], kind='kde', diag_kind='kde', corner=True)

In [41]:
# sns.pairplot(df[['Rating', 'Is Trad', 'Risk', 'Length', 'Area Latitude', 'Area Longitude', 'Num Ticks', 'Lead Ratio', 'OS Ratio', 'Repeat Sender Ratio', 'Mean Attempts To RP']], kind='reg', diag_kind='kde', corner=True, plot_kws={'line_kws':{'color':'red'}})

In [42]:
### Correlation Heatmap
# Calculate correlation using the default method ( "pearson")
corr = df[['Rating', 'Is Trad', 'Risk', 'Length', 'Area Latitude', 'Area Longitude', 'Num Ticks', 'Lead Ratio', 'OS Ratio', 'Repeat Sender Ratio', 'Mean Attempts To RP']].corr()
# optimize aesthetics: generate mask for removing duplicate / unnecessary info
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True
# Generate a custom diverging colormap as indicator for correlations:
cmap = sns.diverging_palette(220, 10, as_cmap=True)
# Plot
sns.set(rc={"figure.figsize":(12, 12)})
# sns.heatmap(corr, mask=mask, cmap=cmap, annot=True,  square=True, annot_kws={"size": 12}, vmin=-1, vmax=1)

In [43]:
df.drop(['Location', 'Base Location', 'Num Tickers', 'Pitches', 'Is SP', 'Route'], axis=1, inplace=True)

## Scale

In [44]:
std_scaler = StandardScaler()
normalize_scaler = MinMaxScaler(feature_range = (0, 1))
robust_scaler = RobustScaler()
quant_scaler = QuantileTransformer()
power_scaler = PowerTransformer()

scaler_list = (std_scaler, normalize_scaler, robust_scaler, quant_scaler, power_scaler)

scaler_sel_std = make_column_transformer((std_scaler, preproc_feature_list), remainder='passthrough', verbose_feature_names_out=False)
scaler_sel_norm = make_column_transformer((normalize_scaler, preproc_feature_list), remainder='passthrough', verbose_feature_names_out=False)
scaler_sel_robust = make_column_transformer((robust_scaler, preproc_feature_list), remainder='passthrough', verbose_feature_names_out=False)
scaler_sel_quant = make_column_transformer((quant_scaler, preproc_feature_list), remainder='passthrough', verbose_feature_names_out=False)
scaler_sel_power = make_column_transformer((power_scaler, preproc_feature_list), remainder='passthrough', verbose_feature_names_out=False)

scaler_sel_list = [scaler_sel_std, scaler_sel_norm, scaler_sel_robust, scaler_sel_quant, scaler_sel_power]

## Preproc Pipeline

In [45]:
# Define preproc pipeline
preproc_pipe = Pipeline([
    ("imputer", feat_impute_method),
    ("scaler", scaler_sel_std)
])

# Pipeline Setup

In [46]:
# Create hold-out test set
feature_col_ful = df[['Rating', 'Is Trad', 'Risk', 'Length', 'Area Latitude', 'Area Longitude', 'Num Ticks', 'Lead Ratio', 'OS Ratio', 'Repeat Sender Ratio', 'Mean Attempts To RP']]
X = feature_col_ful
y = df['Bayesian Stars']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, shuffle=True)

# Utility Functions

In [47]:
def find_prop_of_param(param, prop):
    param_len_list = [len(value) for value in param.values()]
    num_comb = np.prod(param_len_list)
    num_prop = prop*num_comb
    return num_prop

# Model Settings

In [48]:
num_cpu_util = 3 # -1 uses all cores
sklearn_verbosity = 10

rand_prop = 0.2 # Proportion of total combinations to try randomly
rand_bool = True # Whether to use random over a full grid search


# Simple Tree

In [49]:
simple_tree_model = DecisionTreeRegressor()
simple_tree_pipe = Pipeline([('preproc', preproc_pipe), ('stree_model', simple_tree_model)])
parameters = {'stree_model__criterion': ['squared_error'],
              'stree_model__splitter': ['best'],
              }
stree_reg= GridSearchCV(simple_tree_pipe, parameters, scoring='neg_mean_squared_error', cv=5, n_jobs=num_cpu_util, verbose=1)
stree_reg.fit(X_train, y_train)
print(f'RMSE: {mean_squared_error(y_test, stree_reg.best_estimator_.predict(X_test), squared=False)}')
print(f'R-Squared: {stree_reg.best_estimator_.score(X_test, y_test)}')
print(stree_reg.best_params_)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
RMSE: 0.43065058440555926
R-Squared: 0.07822729269918416
{'stree_model__criterion': 'squared_error', 'stree_model__splitter': 'best'}


In [50]:
simple_tree_model = DecisionTreeRegressor()
simple_tree_pipe = Pipeline([('preproc', preproc_pipe), ('stree_model', simple_tree_model)])
parameters = {'stree_model__criterion': ['squared_error', 'friedman_mse', 'poisson'],
              'stree_model__splitter': ['best'],
              'stree_model__min_samples_split': np.linspace(2,40,20, dtype=int),
              'stree_model__min_samples_leaf': np.linspace(1,40,20, dtype=int),
              'stree_model__max_features': [1.0, 'sqrt', 'log2'],
            #   'stree_model__ccp_alpha': np.linspace(0,0.5,10),
              }
num_prop = find_prop_of_param(parameters, rand_prop)

if rand_bool:
    stree_reg= RandomizedSearchCV(simple_tree_pipe, parameters, n_iter=num_prop, scoring='neg_mean_squared_error', cv=5, n_jobs=num_cpu_util, verbose=sklearn_verbosity)
else:
    stree_reg= GridSearchCV(simple_tree_pipe, parameters, scoring='neg_mean_squared_error', cv=5, n_jobs=num_cpu_util, verbose=sklearn_verbosity)
    
stree_reg.fit(X_train, y_train)
print(f'RMSE: {mean_squared_error(y_test, stree_reg.best_estimator_.predict(X_test), squared=False)}')
print(f'R-Squared: {stree_reg.best_estimator_.score(X_test, y_test)}')
print(stree_reg.best_params_)

Fitting 5 folds for each of 180 candidates, totalling 900 fits
RMSE: 0.33745759073892384
R-Squared: 0.434005558228838
{'stree_model__splitter': 'best', 'stree_model__min_samples_split': 18, 'stree_model__min_samples_leaf': 33, 'stree_model__max_features': 1.0, 'stree_model__criterion': 'poisson'}


# Random Forest

In [53]:

rf_model = RandomForestRegressor()
rf_pipe = Pipeline([('preproc', preproc_pipe), ('rf_model', rf_model)])
parameters = {'rf_model__criterion': ['squared_error', 'friedman_mse', 'poisson'],
              'rf_model__min_samples_split': np.linspace(2,40,10, dtype=int),
              'rf_model__min_samples_leaf': np.linspace(1,40,10, dtype=int),
              'rf_model__max_features': [1.0, 'sqrt', 'log2'],
              'rf_model__n_estimators': np.linspace(20, 2000, 5, dtype=int),
              # 'rf_model__ccp_alpha': np.linspace(0,0.5,10),
              }
num_prop = find_prop_of_param(parameters, rand_prop)

if rand_bool:
    rf_reg= RandomizedSearchCV(rf_pipe, parameters, n_iter=num_prop, scoring='neg_mean_squared_error', cv=5, n_jobs=num_cpu_util, verbose=sklearn_verbosity)
else:
    rf_reg= GridSearchCV(rf_pipe, parameters, scoring='neg_mean_squared_error', cv=5, n_jobs=num_cpu_util, verbose=sklearn_verbosity)
    
rf_reg.fit(X_train, y_train)
print(f'RMSE: {mean_squared_error(y_test, rf_reg.best_estimator_.predict(X_test), squared=False)}')
print(f'R-Squared: {rf_reg.best_estimator_.score(X_test, y_test)}')
print(rf_reg.best_params_)

Fitting 5 folds for each of 225 candidates, totalling 1125 fits
RMSE: 0.2961129232286437
R-Squared: 0.5641987350055515
{'rf_model__n_estimators': 1505, 'rf_model__min_samples_split': 2, 'rf_model__min_samples_leaf': 1, 'rf_model__max_features': 'sqrt', 'rf_model__criterion': 'squared_error'}


# Extremely Random Forest

In [55]:
xrf_model = ExtraTreesRegressor()
xrf_pipe = Pipeline([('preproc', preproc_pipe), ('xrf_model', xrf_model)])
parameters = {'xrf_model__criterion': ['squared_error', 'friedman_mse', 'poisson'],
              'xrf_model__min_samples_split': np.linspace(2,40,10, dtype=int),
              'xrf_model__min_samples_leaf': np.linspace(1,40,10, dtype=int),
              'xrf_model__max_features': [1.0, 'sqrt', 'log2'],
              'xrf_model__n_estimators': np.linspace(20, 2000, 5, dtype=int),
            #   'xrf_model__ccp_alpha': np.linspace(0,0.5,10),
              }
num_prop = find_prop_of_param(parameters, rand_prop)

if rand_bool:
    xrf_reg= RandomizedSearchCV(xrf_pipe, parameters, n_iter=num_prop, scoring='neg_mean_squared_error', cv=5, n_jobs=num_cpu_util, verbose=sklearn_verbosity)
else:
    xrf_reg= GridSearchCV(xrf_pipe, parameters, scoring='neg_mean_squared_error', cv=5, n_jobs=num_cpu_util, verbose=sklearn_verbosity)
    
xrf_reg.fit(X_train, y_train)
print(f'RMSE: {mean_squared_error(y_test, xrf_reg.best_estimator_.predict(X_test), squared=False)}')
print(f'R-Squared: {xrf_reg.best_estimator_.score(X_test, y_test)}')
print(xrf_reg.best_params_)

Fitting 5 folds for each of 225 candidates, totalling 1125 fits
RMSE: 0.29462663672282746
R-Squared: 0.5685626105188284
{'xrf_model__n_estimators': 2000, 'xrf_model__min_samples_split': 2, 'xrf_model__min_samples_leaf': 1, 'xrf_model__max_features': 'sqrt', 'xrf_model__criterion': 'poisson'}


# AdaBoost

In [56]:
base = DecisionTreeRegressor()
ada_model = AdaBoostRegressor(estimator=base)
ada_pipe = Pipeline([('preproc', preproc_pipe), ('ada_model', ada_model)])
parameters = {'ada_model__n_estimators': [10, 50, 100, 500, 1000],
              'ada_model__learning_rate': [0.01, 0.05, 0.1, 0.5, 2, 10, 50],
              'ada_model__estimator__max_depth': [1, 2, 3, 5, 10],
              }
num_prop = find_prop_of_param(parameters, rand_prop)

if rand_bool:
    ada_reg= RandomizedSearchCV(ada_pipe, parameters, n_iter=num_prop, scoring='neg_mean_squared_error', cv=5, n_jobs=num_cpu_util, verbose=sklearn_verbosity)
else:
    ada_reg= GridSearchCV(ada_pipe, parameters, scoring='neg_mean_squared_error', cv=5, n_jobs=num_cpu_util, verbose=sklearn_verbosity)

ada_reg.fit(X_train, y_train)
print(f'RMSE: {mean_squared_error(y_test, ada_reg.best_estimator_.predict(X_test), squared=False)}')
print(f'R-Squared: {ada_reg.best_estimator_.score(X_test, y_test)}')
print(ada_reg.best_params_)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
RMSE: 0.31769517723941787
R-Squared: 0.49835671015213834
{'ada_model__n_estimators': 10, 'ada_model__learning_rate': 0.5, 'ada_model__estimator__max_depth': 10}


# Gradient Boost

In [57]:
grad_model = GradientBoostingRegressor()
grad_pipe = Pipeline([('preproc', preproc_pipe), ('grad_model', grad_model)])
# parameters = {'grad_model__loss': ['squared_error', 'huber', 'quantile'],
#               'grad_model__learning_rate': [0.01, 0.05, 0.1, 0.5, 2, 10, 50],
#               'grad_model__n_estimators': [10, 50, 100, 500, 1000],
#               'grad_model__min_samples_split': [2, 100, 1000],
#               'grad_model__min_samples_leaf': [1, 10, 100],
#               'grad_model__max_depth': [1, 2, 3, 5, 10],
#               'grad_model__max_features': [1.0, 'sqrt', 'log2']
#               }
parameters = {'grad_model__loss': ['huber'],
              'grad_model__learning_rate': [0.01, 0.05, 0.1, 0.5, 2, 10, 50],
              'grad_model__n_estimators': [10, 50, 100, 500, 1000],
              'grad_model__min_samples_split': [2, 100, 1000],
              'grad_model__min_samples_leaf': [1, 10, 100],
              'grad_model__max_depth': [1, 2, 3, 5, 10],
              'grad_model__max_features': ['log2']
              }
num_prop = find_prop_of_param(parameters, rand_prop)

if rand_bool:
    grad_reg= RandomizedSearchCV(grad_pipe, parameters, n_iter=num_prop, scoring='neg_mean_squared_error', cv=5, n_jobs=num_cpu_util, verbose=sklearn_verbosity)
else:
    grad_reg= GridSearchCV(grad_pipe, parameters, scoring='neg_mean_squared_error', cv=5, n_jobs=num_cpu_util, verbose=sklearn_verbosity)


grad_reg.fit(X_train, y_train)
print(f'RMSE: {mean_squared_error(y_test, grad_reg.best_estimator_.predict(X_test), squared=False)}')
print(f'R-Squared: {grad_reg.best_estimator_.score(X_test, y_test)}')
print(grad_reg.best_params_)

Fitting 5 folds for each of 78 candidates, totalling 390 fits



One or more of the test scores are non-finite: [-1.70603256e-01 -1.68008205e-01             nan -8.46871581e-02
 -9.42052665e-02 -1.45504951e-01 -8.27428831e-02 -8.94476640e-02
 -8.25674436e-02 -1.88907613e-01 -9.91917392e-02             nan
 -8.70261489e-02 -1.18354127e-01 -1.58195475e-01 -8.30031433e-02
 -9.32612880e-02             nan -1.30043493e+31 -8.53809695e-02
 -4.24825751e+92 -1.03996329e-01 -2.35479640e-01 -1.41606243e-01
 -3.99107472e-01 -1.18431725e-01 -1.72315209e-01 -1.26953840e+31
 -2.33436631e-01 -4.79276702e-01 -1.04032658e-01 -1.04943940e-01
            -inf -1.05860243e-01 -1.22941390e-01 -1.89126028e-01
 -2.68549635e-01 -1.87557539e-01 -8.21848441e-02 -9.27355845e-02
 -5.20849899e-01 -8.09460665e-02 -2.31796153e-01 -1.51859450e-01
 -2.43889718e+00 -9.04853678e-02 -8.32465117e-02 -1.03103812e-01
 -9.65638847e-02            -inf             nan -4.49167705e-01
 -1.92370867e+00 -9.27083008e-02 -8.81235553e-02 -9.65794334e-02
 -1.14896992e-01 -1.55881679e-01 -2.027344

RMSE: 0.2706247213133648
R-Squared: 0.6359938710743643
{'grad_model__n_estimators': 1000, 'grad_model__min_samples_split': 1000, 'grad_model__min_samples_leaf': 10, 'grad_model__max_features': 'log2', 'grad_model__max_depth': 10, 'grad_model__loss': 'huber', 'grad_model__learning_rate': 0.05}


# XGBoost