In [44]:
import pandas as pd
import numpy as np

from numpy import mean
from numpy import std

from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
#from sklearn.svm import SVR


from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.model_selection import RepeatedKFold

from matplotlib import pyplot


In [45]:
# Basic Dummified Dataset for Linear Models.
# The path is relative to the notebook's working directory.
base = pd.read_csv('../Data/BaseData.csv')
# Display (rows, columns) as a quick sanity check on the load.
base.shape

(2580, 299)

In [46]:
# Standardize every feature column (all columns except the SalePrice target).
# NOTE(review): the scaler is fit on all rows before cross-validation, so the
# held-out folds contribute to the scaling statistics -- consider fitting it
# inside a Pipeline per fold instead.
scaler = StandardScaler()
X = scaler.fit_transform(base.drop(columns=['SalePrice']).values)

# Target vector: raw SalePrice. A log10 transform of y was tried here and is
# currently disabled.
y = base['SalePrice'].values

In [47]:
# list of penalized regression models to evaluate

def get_models():
    """Build the penalized linear regressors to compare.

    Returns:
        dict: maps a short label ('ridge', 'lasso', 'elastic_net') to a new
        estimator instance created with its default hyper-parameters.
    """
    return {
        'ridge': Ridge(),
        'lasso': Lasso(),
        'elastic_net': ElasticNet(),
    }


In [48]:
# Use cross-validation to evaluate model performance

def evaluate_model(model, X, y, n_splits=10, n_repeats=1, random_state=1,
                   scoring=None, n_jobs=-1):
    """Cross-validate ``model`` on ``(X, y)`` using repeated k-fold splits.

    The defaults reproduce the original fixed procedure: one repeat of
    10-fold cross-validation seeded with 1, scored with R^2 and negated
    mean absolute error, using all available cores.

    Args:
        model: estimator to evaluate; passed unchanged to ``cross_validate``.
        X: feature matrix.
        y: target values.
        n_splits: number of folds per repeat.
        n_repeats: how many times the k-fold split is repeated.
        random_state: seed given to ``RepeatedKFold`` so the folds are
            reproducible between runs and between models.
        scoring: scorer name(s) forwarded to ``cross_validate``. ``None``
            selects ``['r2', 'neg_mean_absolute_error']``.
        n_jobs: parallelism forwarded to ``cross_validate``.

    Returns:
        dict: the ``cross_validate`` result. With the default scoring it
        contains 'fit_time', 'score_time', 'test_r2' and
        'test_neg_mean_absolute_error', each with one entry per fold.
    """
    # Resolve the default here rather than in the signature to avoid a
    # shared mutable default argument.
    if scoring is None:
        scoring = ['r2', 'neg_mean_absolute_error']

    # define the evaluation procedure
    cv = RepeatedKFold(n_splits=n_splits, n_repeats=n_repeats,
                       random_state=random_state)

    # evaluate the model and return the per-fold results
    return cross_validate(model, X, y, scoring=scoring, cv=cv, n_jobs=n_jobs)

In [49]:
# Instantiate the candidate models once; this dict is iterated in the
# comparison loop further down.
models = get_models()

In [50]:
# Try the evaluation helper on a single, default-configured Ridge model
# (a fresh instance, not the one stored in `models`).
scores = evaluate_model(Ridge(), X, y)

In [51]:
# Inspect the raw cross_validate output: timings plus per-fold test scores.
scores

{'fit_time': array([0.03617692, 0.01906991, 0.02771401, 0.02484894, 0.02312922,
        0.02729797, 0.01761889, 0.02449894, 0.02235293, 0.02082992]),
 'score_time': array([0.00154305, 0.00038195, 0.00099993, 0.00042105, 0.00035191,
        0.00098014, 0.00042486, 0.00107598, 0.00041318, 0.00043488]),
 'test_r2': array([0.90794069, 0.91835231, 0.92737793, 0.79440718, 0.86954043,
        0.8834173 , 0.92623184, 0.91807102, 0.95360891, 0.9334045 ]),
 'test_neg_mean_absolute_error': array([-15324.29766639, -14576.88213751, -13822.34838153, -15680.85328561,
        -15493.87253426, -16324.49987266, -13910.66264679, -14802.2469141 ,
        -12289.71114789, -12904.29339444])}

In [52]:
# Summary statistics of the per-fold R^2 scores.
pd.DataFrame(scores['test_r2']).describe()

Unnamed: 0,0
count,10.0
mean,0.903235
std,0.045218
min,0.794407
25%,0.889548
50%,0.918212
75%,0.927091
max,0.953609


In [53]:
# Per-fold mean absolute error as reported by the 'neg_mean_absolute_error'
# scorer, i.e. negated so that larger values are better.
scores['test_neg_mean_absolute_error']

array([-15324.29766639, -14576.88213751, -13822.34838153, -15680.85328561,
       -15493.87253426, -16324.49987266, -13910.66264679, -14802.2469141 ,
       -12289.71114789, -12904.29339444])

In [54]:
# Drop the sign to read the scores as plain per-fold mean absolute error.
np.abs(scores['test_neg_mean_absolute_error'])

array([15324.29766639, 14576.88213751, 13822.34838153, 15680.85328561,
       15493.87253426, 16324.49987266, 13910.66264679, 14802.2469141 ,
       12289.71114789, 12904.29339444])

In [10]:
# Show the label -> estimator mapping built by get_models().
models

{'ridge': Ridge(), 'lasso': Lasso(), 'elastic_net': ElasticNet()}

In [40]:
# Per-model cross-validation results, collected in evaluation order.
# These lists must be created once, before the loop: creating them inside the
# loop body would discard every model's results except the last one's.
# NOTE: `results_neg_mse` keeps its existing name but holds the per-fold
# 'neg_mean_absolute_error' scores (negated MAE), not MSE.
results_r2, results_neg_mse, names = list(), list(), list()

for name, model in models.items():

    # evaluate the model
    scores = evaluate_model(model, X, y)

    # store the results for this model
    results_r2.append(scores['test_r2'])
    results_neg_mse.append(scores['test_neg_mean_absolute_error'])
    names.append(name)

    # summarize the performance along the way: mean R^2, then mean MAE in
    # parentheses. Use the current model's scores, not the accumulated lists,
    # so each line reports only the model named on it.
    print('>%s %.3f (%.3f)' % (name,
                               mean(scores['test_r2']),
                               mean(np.abs(scores['test_neg_mean_absolute_error']))))

>ridge 0.910 (0.033)
>lasso -0.004 (0.130)
>elastic_net -0.004 (0.130)
