In [3776]:
import numpy as np
import pandas as pd

import pickle
import ast
from itertools import combinations

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from sklearn import set_config
from sklearn.utils import shuffle
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, QuantileTransformer, PowerTransformer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer
from sklearn.model_selection import GridSearchCV, cross_val_predict, cross_val_score ,train_test_split, LeaveOneOut
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_squared_error
from sklearn.cross_decomposition import PLSRegression
from sklearn.feature_selection import SequentialFeatureSelector, SelectKBest, mutual_info_regression

import category_encoders as ce
import missingno as msno

from patsy import dmatrices
from statsmodels.compat import lzip
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.formula.api import ols
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.compat import lzip

from constants import *

%matplotlib inline

In [3777]:
set_config(transform_output="pandas")

In [3778]:
html_folder = 'C:\\Users\\Brayden\\Desktop\\Personal Website\\Brayden-L.github.io\\_includes\\complex_linear_regression_routes\\'

In [3779]:
df = pd.read_pickle('All_Loc_Cleaned_Stripped.pkl')

In [3780]:
# This undoes the complex length imputation performed in "Length_Imputation.ipynb" for nullity analysis

undo_length_imp_bool = False

if undo_length_imp_bool:
        df1 = pd.read_csv('All_Loc.csv')
        # Create Route ID Column for joining and remove duplicates
        if "Route ID" not in df1.columns:
                df1.insert(len(df1.columns), "Route ID", "")
        df1["Route ID"] = df1["URL"].apply(lambda x: int(x.split("/")[4]))

        df1.drop_duplicates('Route ID', inplace=True)
        df1[['Route ID', 'Length']]
        df.drop('Length', axis=1, inplace=True)
        df = df.merge(df1[['Route ID', 'Length']], how='left', on='Route ID')

In [3781]:
df = shuffle(df)

In [3782]:
star_vote_cutoff = 5
bayesian_stars=True


# Cleaning

In [3783]:
# Route ID
df.set_index('Route ID', inplace=True)

## Response Variable Cleaning

In [3784]:
o_size = df.shape[0]

In [3785]:
# Drop values with not enough vote ratings
df.drop(df[df['Num Star Ratings']<=star_vote_cutoff].index, axis=0, inplace=True)

In [3786]:
# Drop MP entries
df.drop(df[df['SP/MP']=='MP'].index, axis=0, inplace=True)

In [3787]:
df.shape[0]

12990

In [3788]:
df.shape[0]/o_size

0.449620989235402

In [3789]:
if bayesian_stars:
    m = df['Avg Stars'].mean()
    c = df['Num Star Ratings'].quantile(0.25)
    df['Bayesian Stars'] = ((df['Avg Stars'] * df['Num Star Ratings']) + (c*m)) / (df['Num Star Ratings'] + c)

    # Compare old rating scale to new
    x0 = df['Avg Stars']
    x1 = df['Bayesian Stars']
    rate_comp_df =pd.DataFrame(dict(
        series=np.concatenate((["Average Stars"]*len(x0), ["Bayesian Stars"]*len(x1))), 
        data  =np.concatenate((x0,x1))
    ))
    fig = px.histogram(rate_comp_df, color='series', barmode='overlay', marginal='box')
    fig.update_layout(title='Avg Vs. Bayesian Stars Histogram', title_x=0.5)
    fig.update_xaxes(title="Stars", row=1,col=1)
    fig.update_yaxes(title="Count")
    # fig.write_html(html_folder + 'Avg_Vs_Baye_Hist.html')
    fig.show()
else:
    fig = px.histogram(df['Avg Stars'])
    fig.show()

In [3790]:
# Homogenize Rating
def grade_homo(df_source, r_type, r_direction, b_type, b_direction):
    """
    Reassigns grades to a single YDS or Vgrade schema.

    Parameters
    ----------
    df_source : df
        Original route df.
    r_type : str [letter, sign]
        YDS letter or sign style grades.
    r_direction : str [up, down, even_rand, manual]
        Unused if r_type='letter'. Which way to assign grades. even_rand rounds a randomly selected half up and the randomly remaining half down.
    b_type : str [flat, sign]
        Vgrade flat grades or include sign grades.
    b_direction : str [up, down, even_rand, manual]
        Used for both b_type.

    Return
    ------
    df_source : df
        Original df with grade homogenization
    """
    rating_isolate = df_source["Original Rating"].apply(
        lambda row: [val for val in row.split()][0]
    )  # This is a fail-safe to ensure we are only looking at the part of the rating we care about, not risk or sub-ratings.

    # Reset 'Rating' column so this mapping can be re-run
    df_source["Rating"] = df_source["Original Rating"]

    # Roped Grades
    def grademoderate():
        grade_change_subset = rating_isolate.isin(list(rgrademoderatemap.keys()))
        df_source.loc[grade_change_subset, "Rating"] = df_source.loc[
            grade_change_subset
        ]["Original Rating"].map(rgrademoderatemap)

    def grade_split(upmap, downmap):
        grade_change_subset = rating_isolate.isin(list(upmap.keys()))
        grade_change_subset_df = df_source[grade_change_subset]
        for grade in grade_change_subset_df["Original Rating"].unique():
            to_change = grade_change_subset_df[
                grade_change_subset_df["Original Rating"] == grade
            ]
            changed_up = to_change.sample(frac=0.5)["Original Rating"].map(upmap)
            df_source.loc[changed_up.index, "Rating"] = changed_up
        grade_change_subset = rating_isolate.isin(list(downmap.keys()))
        grade_change_subset_df = df_source[grade_change_subset]
        for grade in grade_change_subset_df["Original Rating"].unique():
            to_change = grade_change_subset_df[
                grade_change_subset_df["Original Rating"] == grade
            ]
            changed_down = to_change["Original Rating"].map(downmap)
            df_source.loc[changed_down.index, "Rating"] = changed_down

    if r_type == "sign":
        grade_change_subset = rating_isolate.isin(list(rgradecompmap.keys()))
        df_source.loc[grade_change_subset, "Rating"] = df_source[grade_change_subset][
            "Original Rating"
        ].map(rgradecompmap)
    else:
        if r_direction == "up":
            grademoderate()
            grade_change_subset = rating_isolate.isin(list(rgradeupmap.keys()))
            df_source.loc[grade_change_subset, "Rating"] = df_source[
                grade_change_subset
            ]["Original Rating"].map(rgradeupmap)
        if r_direction == "down":
            grademoderate()
            grade_change_subset = rating_isolate.isin(list(rgradedownmap.keys()))
            df_source.loc[grade_change_subset, "Rating"] = df_source[
                grade_change_subset
            ]["Original Rating"].map(rgradedownmap)
        if r_direction == "even_rand":
            grademoderate()
            grade_split(rgradeupmap, rgradedownmap)

    # Boulder Grades
    if b_type == "flat":
        # Remove all + and - grades
        grade_change_subset = rating_isolate.isin(list(bgradeconmapflat.keys()))
        df_source.loc[grade_change_subset, "Rating"] = df_source[grade_change_subset][
            "Original Rating"
        ].map(bgradeconmapflat)

        if b_direction == "up":
            grade_change_subset = rating_isolate.isin(list(bgradeupmapflat.keys()))
            df_source.loc[grade_change_subset, "Rating"] = df_source[
                grade_change_subset
            ]["Original Rating"].map(bgradeupmapflat)
        if b_direction == "down":
            grade_change_subset = rating_isolate.isin(list(bgradedownmapflat.keys()))
            df_source.loc[grade_change_subset, "Rating"] = df_source[
                grade_change_subset
            ]["Original Rating"].map(bgradedownmapflat)
        if b_direction == "even_rand":
            grade_split(bgradeupmapflat, bgradedownmapflat)

    if b_type == "sign":
        if b_direction == "up":
            grade_change_subset = rating_isolate.isin(list(bgradeupmapsign.keys()))
            df_source.loc[grade_change_subset, "Rating"] = df_source[
                grade_change_subset
            ]["Original Rating"].map(bgradeupmapsign)
        if b_direction == "down":
            grade_change_subset = rating_isolate.isin(list(bgradedownmapsign.keys()))
            df_source.loc[grade_change_subset, "Rating"] = df_source[
                grade_change_subset
            ]["Original Rating"].map(bgradedownmapsign)
        if b_direction == "even_rand":
            grade_split(bgradeupmapsign, bgradedownmapsign)

    return df_source

df['Original Rating'] = df['Rating']
df = grade_homo(df, 'letter', 'even_rand', 'flat', 'down')
df.drop('Original Rating', axis=1, inplace=True)

## Predictor Cleaning

### Null Handling

In [3791]:
null_plot_bool = False

nulldf = df[['Rating', 'Length', 'Area Latitude', 'Area Longitude', 'Risk', 'Num Ticks', 'Lead Ratio', 'OS Ratio', 'Repeat Sender Ratio', 'Mean Attempts To RP', 'Route Type']]

In [3792]:
if null_plot_bool:
    msno.matrix(nulldf)

In [3793]:
if null_plot_bool:
    msno.heatmap(nulldf)

In [3794]:
if null_plot_bool:
    msno.dendrogram(nulldf)

In [3795]:
(nulldf.isnull().sum()/df.shape[0])*100

Rating                  0.246343
Length                  0.000000
Area Latitude           0.000000
Area Longitude          0.000000
Risk                   93.941493
Num Ticks               0.046189
Lead Ratio              0.400308
OS Ratio                2.609700
Repeat Sender Ratio     3.903002
Mean Attempts To RP    28.221709
Route Type              0.007698
dtype: float64

In [3796]:
# Remove null rating and route type values which are likely 3rd, 4th class etc. Not relevant.
df.drop(df[df['Rating'].isna()].index, inplace=True)
df.drop(df[df['SP/MP'].isna()].index, inplace=True)
df.drop(df[df['Route Type'].isna()].index, inplace=True)
df.drop(df[df['Route Type']=='Boulder'].index, inplace=True)

In [3797]:
df.shape[0]

12946

### Encoding

In [3798]:
# Risk
# Replace NA with 'G'
df['Risk'] = df['Risk'].cat.set_categories(['G', 'PG13', 'R', 'X'], ordered=True)
df.loc[df['Risk'].isna(), 'Risk'] = 'G'

In [3799]:
# Change Trad/Sport column to "Is Trad"
df['Is Trad'] = df['Route Type'].map({"Trad":1, "Sport":0}).astype(int)
df.drop('Route Type', axis=1, inplace=True)
# Change SP/MP solumn to "Is SP"
df['Is SP'] = df['SP/MP'].map({'SP':1, 'MP':0}).astype(int)
df.drop('SP/MP', axis=1, inplace=True)
# Change Length Missing to binary
df['Length Missing'] = df['Length Missing'].map({True:1, False:0}).astype(int)

In [3800]:
# Encode Risk Ordinal
ce_ord_risk = ce.OrdinalEncoder(cols=['Risk'])
df = ce_ord_risk.fit_transform(df)

In [3801]:
# Encode Rating Ordinal
ce_ord_rating = ce.OrdinalEncoder(cols=['Rating'])
df = ce_ord_rating.fit_transform(df)

In [3802]:
# sns.pairplot(df[['Avg Stars', 'Lead Ratio', 'OS Ratio', 'Repeat Sender Ratio', 'Mean Attempts To RP']])

### Imputation

In [3803]:
# Length imputation was done prior to this on the whole data set. It is a pretty involved custom imputation.

In [3804]:
# Num Ticks and Num Tickers
df.loc[df['Num Ticks'].isna(), 'Num Ticks'] = 0
df.loc[df['Num Tickers'].isna(), 'Num Tickers'] = 0

In [3805]:
tick_metric_simp_imp_bool = False
missing_ind_bool = False
if tick_metric_simp_imp_bool:
    col_trans_tick_met = make_column_transformer((SimpleImputer(strategy='median', add_indicator=missing_ind_bool), ['OS Ratio', 'Lead Ratio']), 
                                                 (SimpleImputer(strategy='constant', fill_value=1, add_indicator=missing_ind_bool), ['Repeat Sender Ratio', 'Mean Attempts To RP']), 
                                                 remainder='passthrough', verbose_feature_names_out=False)
else:
    col_trans_tick_met = make_column_transformer((IterativeImputer(min_value=0, add_indicator=missing_ind_bool), ['Rating', 'Length', 'Area Latitude', 'Area Longitude', 'Risk', 'Num Ticks', 'Lead Ratio', 'OS Ratio', 'Repeat Sender Ratio', 'Mean Attempts To RP', 'Is Trad']), 
                                                 remainder='passthrough', verbose_feature_names_out=False)
col_trans_tick_met.fit_transform(df[['Rating', 'Length', 'Area Latitude', 'Area Longitude', 'Risk', 'Num Ticks', 'Lead Ratio', 'OS Ratio', 'Repeat Sender Ratio', 'Mean Attempts To RP', 'Is Trad']])

Unnamed: 0,Rating,Length,Area Latitude,Area Longitude,Risk,Num Ticks,Lead Ratio,OS Ratio,Repeat Sender Ratio,Mean Attempts To RP,Is Trad
0,1.0,85.0,49.64460,-123.20520,1.0,37.0,0.787879,0.192308,1.000000,1.066667,0.0
1,2.0,100.0,37.65292,-83.72495,1.0,142.0,1.000000,0.104348,1.018868,1.162791,0.0
2,3.0,40.0,37.64780,-119.00300,1.0,227.0,0.678571,0.680851,1.056338,1.900000,0.0
3,4.0,95.0,38.57140,-109.58150,1.0,98.0,0.802817,0.770833,1.000000,1.250000,0.0
4,1.0,45.0,37.81449,-83.66388,1.0,147.0,0.991453,0.088496,1.016667,1.300000,0.0
...,...,...,...,...,...,...,...,...,...,...,...
12941,9.0,70.0,37.64669,-83.72282,1.0,24.0,1.000000,0.842105,1.000000,1.000000,0.0
12942,6.0,65.0,39.32001,-120.32196,1.0,146.0,0.930693,0.186813,1.111111,1.184211,0.0
12943,23.0,90.0,34.03943,-116.13434,1.0,4.0,1.000000,0.000000,1.000000,1.000000,1.0
12944,14.0,60.0,44.13293,-107.25738,1.0,385.0,0.901887,0.772321,1.022989,1.400000,0.0


In [3806]:
(df.isnull().sum()/df.shape[0])*100

Route                   0.000000
Location                0.000000
Avg Stars               0.000000
Num Star Ratings        0.000000
Rating                  0.000000
Pitches                 0.000000
Length                  0.000000
Length Missing          0.000000
Area Latitude           0.000000
Area Longitude          0.000000
Risk                    0.000000
Base Location           0.000000
Num Ticks               0.000000
Num Tickers             0.000000
Lead Ratio              0.347598
OS Ratio                2.549050
Repeat Sender Ratio     3.823575
Mean Attempts To RP    28.070446
Bayesian Stars          0.000000
Is Trad                 0.000000
Is SP                   0.000000
dtype: float64

## Outliers

In [3807]:
# Remove outliers
repeat_sender_ratio_cutoff = 1.6
mean_attempt_to_rp_cutoff = 3.1
print(df[df['Repeat Sender Ratio']>repeat_sender_ratio_cutoff].shape[0])
df.drop(df[df['Repeat Sender Ratio']>repeat_sender_ratio_cutoff].index, axis=0, inplace=True)
print(df[df['Mean Attempts To RP']>mean_attempt_to_rp_cutoff].shape[0])
df.drop(df[df['Mean Attempts To RP']>mean_attempt_to_rp_cutoff].index, axis=0, inplace=True)

93
59


In [3808]:
# 28% of values are null
# 29% are a near default 1
# 2.1% are outliers

In [3809]:
# px.histogram(df['Mean Attempts To RP'], marginal='box')

In [3810]:
# px.histogram(df[df['Mean Attempts To RP']>1]['Mean Attempts To RP'], marginal='box')

In [3811]:
# 3.8% are null
# 58.5% are near default 1
# 3.4% outliers

In [3812]:
# px.histogram(df['Repeat Sender Ratio'], marginal='box')

In [3813]:
# px.histogram(df[df['Repeat Sender Ratio']>1]['Repeat Sender Ratio'], marginal='box')

In [3814]:
df.shape[0]

12794

## EDA

In [3815]:
# sns.pairplot(df[['Rating', 'Is Trad', 'Risk', 'Length', 'Area Latitude', 'Area Longitude', 'Num Ticks', 'Lead Ratio', 'OS Ratio', 'Repeat Sender Ratio', 'Mean Attempts To RP']], kind='kde', diag_kind='kde', corner=True)

In [3816]:
# sns.pairplot(df[['Rating', 'Is Trad', 'Risk', 'Length', 'Area Latitude', 'Area Longitude', 'Num Ticks', 'Lead Ratio', 'OS Ratio', 'Repeat Sender Ratio', 'Mean Attempts To RP']], kind='reg', diag_kind='kde', corner=True, plot_kws={'line_kws':{'color':'red'}})

In [3817]:
### Correlation Heatmap
# Calculate correlation using the default method ( "pearson")
corr = df[['Rating', 'Is Trad', 'Risk', 'Length', 'Area Latitude', 'Area Longitude', 'Num Ticks', 'Lead Ratio', 'OS Ratio', 'Repeat Sender Ratio', 'Mean Attempts To RP']].corr()
# optimize aesthetics: generate mask for removing duplicate / unnecessary info
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True
# Generate a custom diverging colormap as indicator for correlations:
cmap = sns.diverging_palette(220, 10, as_cmap=True)
# Plot
sns.set(rc={"figure.figsize":(12, 12)})
# sns.heatmap(corr, mask=mask, cmap=cmap, annot=True,  square=True, annot_kws={"size": 12}, vmin=-1, vmax=1)

In [3818]:
df.drop(['Location', 'Base Location', 'Num Tickers', 'Pitches', 'Is SP', 'Route'], axis=1, inplace=True)

## Scale

In [3819]:
std_scaler = StandardScaler()
normalize_scaler = MinMaxScaler(feature_range = (0, 1))
robust_scaler = RobustScaler()
quant_scaler = QuantileTransformer()
power_scaler = PowerTransformer()

scaler_list = (std_scaler, normalize_scaler, robust_scaler, quant_scaler, power_scaler)
scaler_sel = make_column_transformer((quant_scaler, ['Rating', 'Is Trad', 'Risk', 'Length', 'Area Latitude', 'Area Longitude', 'Num Ticks', 'Lead Ratio', 'OS Ratio', 'Repeat Sender Ratio', 'Mean Attempts To RP']), remainder='passthrough', verbose_feature_names_out=False)

## Preproc Pipeline

In [3820]:
# Define preproc pipeline
preproc_pipe = Pipeline([
    ("imputer", col_trans_tick_met),
    ("scaler", scaler_sel)
])

# Pipeline Setup

In [3823]:
# Create hold-out test set
col_ful = df[['Rating', 'Is Trad', 'Risk', 'Length', 'Area Latitude', 'Area Longitude', 'Num Ticks', 'Lead Ratio', 'OS Ratio', 'Repeat Sender Ratio', 'Mean Attempts To RP']]
X = col_ful
y = df['Bayesian Stars']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

## Basic Linear Regression

In [3824]:
basic_linreg = LinearRegression()
basic_linreg_pipe = Pipeline([('preproc', preproc_pipe), ('model', basic_linreg)])
print(np.mean(np.sqrt(-cross_val_score(basic_linreg_pipe, X_train, y_train, cv=5, scoring='neg_mean_squared_error'))))
print(np.mean(cross_val_score(basic_linreg_pipe, X_train, y_train, cv=5)))

0.3677901509108435
0.3793335758060593


In [3715]:
X_train_forward_subselect = preproc_pipe.fit_transform(X_train)
sfs = SequentialFeatureSelector(basic_linreg, n_features_to_select='auto', tol=0.001, cv=5, scoring='neg_mean_squared_error')
sfs.fit(X_train_forward_subselect, y_train)
sfs.get_support
sfs.transform(X_train_forward_subselect)

Unnamed: 0,Is Trad,Length,Num Ticks,Lead Ratio,OS Ratio
0,0.0,0.480480,0.437437,0.636637,0.826326
1,0.0,0.374875,0.790791,0.472815,0.508366
2,0.0,0.570070,0.421421,0.865365,0.098829
3,1.0,0.374875,0.634134,0.150216,0.826326
4,1.0,0.706707,0.485986,0.052252,1.000000
...,...,...,...,...,...
11509,0.0,0.570070,0.164665,0.853353,0.000000
11510,0.0,0.783784,0.024525,1.000000,0.000000
11511,0.0,0.570070,0.903904,0.809794,0.397248
11512,0.0,0.706707,0.777778,0.582119,0.570571


In [3716]:
X_train_mi = preproc_pipe.fit_transform(X_train)
sfs = SelectKBest(mutual_info_regression, k=5)
sfs.fit_transform(X_train_mi, y_train)
sfs.get_support
sfs.transform(X_train_mi)

Unnamed: 0,Rating,Num Ticks,Lead Ratio,OS Ratio,Repeat Sender Ratio
0,0.185686,0.436937,0.636136,0.826827,0.296296
1,0.773273,0.791291,0.473763,0.508366,0.895896
2,0.638639,0.422422,0.863363,0.095943,0.296296
3,0.092092,0.634635,0.148078,0.826827,0.824324
4,0.092092,0.486987,0.051781,1.000000,0.296296
...,...,...,...,...,...
11509,0.988488,0.164164,0.850851,0.000000,0.296296
11510,0.988488,0.024024,1.000000,0.000000,0.296296
11511,0.513514,0.905572,0.808702,0.394562,0.877534
11512,0.867367,0.778779,0.583060,0.569052,0.956690


In [3717]:
print(X_train_forward_subselect.columns)
pd.DataFrame(mutual_info_regression(X_train_forward_subselect, y_train), X_train_forward_subselect.columns).sort_values(0, ascending=False)

Index(['Rating', 'Is Trad', 'Risk', 'Length', 'Area Latitude',
       'Area Longitude', 'Num Ticks', 'Lead Ratio', 'OS Ratio',
       'Repeat Sender Ratio', 'Mean Attempts To RP'],
      dtype='object')


Unnamed: 0,0
Num Ticks,0.544759
OS Ratio,0.246584
Rating,0.221365
Repeat Sender Ratio,0.139014
Lead Ratio,0.128011
Area Latitude,0.118916
Mean Attempts To RP,0.118625
Area Longitude,0.118038
Length,0.092261
Is Trad,0.017737


In [3825]:
pls_model = Ridge()
ridge_pipe = Pipeline([('preproc', preproc_pipe), ('ridge_model', ridge)])

pls_mse=[]
for i in np.arange(1, X_train.shape[1]+1):
    pls_model = PLSRegression(n_components=i)
    pls_pipe = Pipeline([('preproc', preproc_pipe), ('pls_model', pls_model)])
    score = cross_val_score(pls_pipe, X_train, y_train, cv=5, scoring='neg_mean_squared_error').mean()
    pls_mse.append(-score)
    
fig = px.scatter(pd.DataFrame(np.sqrt(pls_mse), np.arange(1, X_train.shape[1]+1)))
fig.update_xaxes(title='Number of Features')
fig.update_xaxes(title='RMSE')
fig.update_layout(showlegend=False, title='RMSE Vs. Num Features by PLS', title_x=0.5, width=800)
fig.write_html(html_folder + 'PLS.html')
fig

## Ridge

In [3830]:
ridge = Ridge()
ridge_pipe = Pipeline([('preproc', preproc_pipe), ('ridge_model', ridge)])
parameters = {'ridge_model__alpha':np.logspace(-5, 3, 100)}
ridge_reg= GridSearchCV(ridge_pipe, parameters, scoring='neg_mean_squared_error',cv=5)
ridge_reg.fit(X_train, y_train)
print(mean_squared_error(y_test, ridge_reg.best_estimator_.predict(X_test), squared=False))
print(ridge_reg.best_estimator_.score(X_test, y_test))

0.3689514915443525
0.3821700544784029


In [3831]:
ridge_reg.best_estimator_['ridge_model'].coef_

array([ 1.02071130e-01,  5.67533038e-01,  6.23148279e-02,  2.81693663e-01,
        2.20942272e-01, -6.68509444e-01, -3.51487201e-03, -2.38535191e-04,
       -2.44665225e-02, -1.96087640e-01,  5.28063923e-02])

## LASSO

In [3832]:
lasso = Lasso()
lasso_pipe = Pipeline([('preproc', preproc_pipe), ('lasso_model', lasso)])
parameters = {'lasso_model__alpha':np.logspace(-5, 3, 100)}
lasso_reg= GridSearchCV(lasso_pipe, parameters, scoring='neg_mean_squared_error',cv=5)
lasso_reg.fit(X_train, y_train)
print(mean_squared_error(y_test, lasso_reg.best_estimator_.predict(X_test), squared=False))
print(lasso_reg.best_estimator_.score(X_test, y_test))

0.3687951173437637
0.3826936582158955


In [3833]:
lasso_reg.best_estimator_['lasso_model'].coef_

array([ 1.02842997e-01,  5.67865996e-01,  6.20702912e-02,  2.85297248e-01,
        2.22172617e-01, -6.71348320e-01, -3.51781813e-03, -2.56266872e-04,
       -2.42724735e-02, -2.13337722e-01,  5.25110865e-02])

## Elastic

In [3834]:
elastic = ElasticNet()
elastic_pipe = Pipeline([('preproc', preproc_pipe), ('elastic_model', elastic)])
parameters = {'elastic_model__alpha':np.logspace(-5, 3, 100), 'elastic_model__l1_ratio':np.linspace(0,1,10)}
elastic_reg= GridSearchCV(elastic_pipe, parameters, scoring='neg_mean_squared_error',cv=5)
elastic_reg.fit(X_train, y_train)
print(mean_squared_error(y_test, elastic_reg.best_estimator_.predict(X_test), squared=False))
print(elastic_reg.best_estimator_.score(X_test, y_test))


Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 6.195e+02, tolerance: 2.005e-01 Linear regression models with null weight for the l1 regularization term are more efficiently fitted using one of the solvers implemented in sklearn.linear_model.Ridge/RidgeCV instead.


Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 6.263e+02, tolerance: 2.009e-01 Linear regression models with null weight for the l1 regularization term are more efficiently fitted using one of the solvers implemented in sklearn.linear_model.Ridge/RidgeCV instead.


Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 6.219e+02, tolerance: 2.006e-01 Linear regression models with null

0.36895750336104166
0.3821499200665708



Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 7.797e+02, tolerance: 2.510e-01 Linear regression models with null weight for the l1 regularization term are more efficiently fitted using one of the solvers implemented in sklearn.linear_model.Ridge/RidgeCV instead.



In [3835]:
elastic_reg.best_estimator_['elastic_model'].coef_

array([ 1.01859463e-01,  5.66333751e-01,  6.27485204e-02,  2.81264899e-01,
        2.20549909e-01, -6.67446111e-01, -3.50920480e-03, -2.35048561e-04,
       -2.43778425e-02, -1.91476855e-01,  5.28217202e-02])

In [3837]:
elastic_reg.best_estimator_['elastic_model'].get_params

<bound method BaseEstimator.get_params of ElasticNet(alpha=0.00041320124001153384, l1_ratio=0.0)>