# ISLR Sec 6-6 Ridge and Lasso Regression

In [41]:
from __future__ import print_function
import numpy as np
import pandas as pd
from sklearn.linear_model import RidgeCV,LassoCV,LinearRegression
from sklearn.cross_validation import cross_val_score

# Read the already-cleaned Hitters data; the first CSV column is used as the row index.
hitters_csv = '../Data/Hitters-cleaned.csv'
df = pd.read_csv(hitters_csv, index_col=0)
print(df.shape)
df.head(3)

(263, 20)


Unnamed: 0,AtBat,Hits,HmRun,Runs,RBI,Walks,Years,CAtBat,CHits,CHmRun,CRuns,CRBI,CWalks,League,Division,PutOuts,Assists,Errors,Salary,NewLeague
-Alan Ashby,315,81,7,24,38,39,14,3449,835,69,321,414,375,N,W,632,43,10,475,N
-Alvin Davis,479,130,18,66,72,76,3,1624,457,63,224,266,263,A,W,880,82,14,480,A
-Andre Dawson,496,141,20,65,78,37,11,5628,1575,225,828,838,354,N,E,200,11,3,500,N


In [2]:
# One-hot encode the qualitative columns: each listed column is replaced by
# one indicator column per level (e.g. League -> League_A, League_N).
# NOTE: df is rebound to the encoded frame, so run this cell exactly once per
# load; on a second run the listed columns are no longer present in df.
qualitative_cols = ['League', 'Division', 'NewLeague']
df = pd.get_dummies(df, columns=qualitative_cols)
print(df.shape)
df.head(3)

(263, 23)


Unnamed: 0,AtBat,Hits,HmRun,Runs,RBI,Walks,Years,CAtBat,CHits,CHmRun,...,PutOuts,Assists,Errors,Salary,League_A,League_N,Division_E,Division_W,NewLeague_A,NewLeague_N
-Alan Ashby,315,81,7,24,38,39,14,3449,835,69,...,632,43,10,475,0,1,0,1,0,1
-Alvin Davis,479,130,18,66,72,76,3,1624,457,63,...,880,82,14,480,1,0,0,1,1,0
-Andre Dawson,496,141,20,65,78,37,11,5628,1575,225,...,200,11,3,500,0,1,1,0,0,1


### Ridge Regression (L2 Regularization)

Note that the regularization parameter `alpha` is defined differently in scikit-learn than it is in R, so the results here will differ from those obtained in R. The "null model" is a model in which all the coefficients are equal to (or nearly equal to) zero; it is the model that results from very large values of `alpha`.

In [37]:
# Candidate penalties: 100 values, log-spaced from 10**10 down to 10**-2.
exponents = np.linspace(10, -2, 100)
alpha = 10**exponents
#alpha = [10**10] # null model

# Split the frame once into the predictor matrix and the response.
predictors = df.drop('Salary', axis=1)
col = predictors.columns
x = predictors.values
# The response is turned into a column vector of shape (n, 1); the fitted
# coef_ is then passed straight to DataFrame as a row labelled by `col`.
y = df['Salary'].values.reshape(-1, 1)

# RidgeCV picks the penalty from `alpha` by its built-in cross-validation.
LR_ridge = RidgeCV(alphas=alpha, normalize=True)
LR_ridge.fit(x, y)

print('optimal alpha = %0.3f' % LR_ridge.alpha_)
print('intercept = %0.3f' % LR_ridge.intercept_)
ridge_table = pd.DataFrame(LR_ridge.coef_, columns=col).T
print(ridge_table)

# Euclidean (L2) length of the fitted coefficient vector.
v = LR_ridge.coef_
print('magnitude of coefficients vector %0.3f' % np.sqrt(np.square(v).sum()))

optimal alpha = 0.013
intercept = 95.085
                     0
AtBat        -1.465962
Hits          5.134054
HmRun         0.210809
Runs         -0.014820
RBI           0.184246
Walks         4.930611
Years       -10.759929
CAtBat       -0.041510
CHits         0.186562
CHmRun        0.706264
CRuns         0.605212
CRBI          0.356873
CWalks       -0.553759
PutOuts       0.277622
Assists       0.270305
Errors       -3.819765
League_A    -31.376282
League_N     31.376282
Division_E   62.171514
Division_W  -62.171514
NewLeague_A  14.641322
NewLeague_N -14.641322
magnitude of coefficients vector 101.553


The features `Hits`, `Walks`, `Years`, `League`, `Division`, and `NewLeague` stand out as important. Uncomment the `alpha = [10**10]` line to see what happens for a very large alpha: you should get the null model.

### Lasso Regression (L1 Regularization)

In [39]:
# Candidate penalties: 100 values, log-spaced from 10**10 down to 10**-2.
exponents = np.linspace(10, -2, 100)
alpha = 10**exponents
#alpha = [10**10] # null model

# Split the frame once into the predictor matrix and the response.
predictors = df.drop('Salary', axis=1)
col = predictors.columns
x = predictors.values
# y is left 1-D here (no reshape to a column vector is applied).
y = df['Salary'].values

# LassoCV picks the penalty from `alpha` by its built-in cross-validation.
LR_lasso = LassoCV(alphas=alpha, normalize=True)
LR_lasso.fit(x, y)

print('optimal alpha = %0.3f' % LR_lasso.alpha_)
print('intercept = %0.3f' % LR_lasso.intercept_)

# Because y is 1-D, coef_ is a flat vector; lift it to a single row of
# shape (1, n_features) so the columns can be labelled with the feature names.
coef = LR_lasso.coef_.reshape(1, -1)
print(pd.DataFrame(coef, columns=col).T)

# Euclidean (L2) length of the fitted coefficient vector.
v = coef
print('magnitude of coefficients vector %0.3f' % np.sqrt(np.square(v).sum()))

optimal alpha = 0.285
intercept = -12.770
                        0
AtBat       -1.017647e+00
Hits         4.500007e+00
HmRun        0.000000e+00
Runs         0.000000e+00
RBI          0.000000e+00
Walks        3.768155e+00
Years       -6.357509e+00
CAtBat      -0.000000e+00
CHits        0.000000e+00
CHmRun       3.202674e-01
CRuns        4.886699e-01
CRBI         3.981709e-01
CWalks      -3.110171e-01
PutOuts      2.582885e-01
Assists      7.206160e-02
Errors      -1.168519e+00
League_A    -2.903631e+01
League_N     0.000000e+00
Division_E   1.187075e+02
Division_W  -2.103449e-14
NewLeague_A -0.000000e+00
NewLeague_N  0.000000e+00
magnitude of coefficients vector 122.526


A different combination of features stands out as important. Uncomment the `alpha = [10**10]` line to see what happens for a very large alpha: you should get the null model.

### Comparisons using 10-fold cross-validation

In [51]:
def cv_mse(model, x, y, folds=10):
    """Return the per-fold mean squared errors of `model` under k-fold CV.

    Parameters
    ----------
    model : estimator passed to cross_val_score
    x, y  : predictor matrix and response used for the cross-validation
    folds : number of folds passed as `cv` (10 by default)

    The 'mean_squared_error' scorer yields negated MSE values, so the sign
    is flipped here to report ordinary MSEs.
    """
    return -1.0*cross_val_score(model, x, y, scoring='mean_squared_error', cv=folds)


# Uses x and y as left by the preceding cells.
# The ridge and lasso penalties are the optimal alphas printed by the fits
# above, rounded to three decimals; 10**10 reproduces the null model.
# NOTE(review): those alphas were selected on the full data set, so the
# cross-validated errors for ridge and lasso are likely somewhat optimistic.
LR = LinearRegression(normalize=True)
LR_ridge = RidgeCV(alphas=[0.013], normalize=True)
LR_lasso = LassoCV(alphas=[0.285], normalize=True)
LR_null = LassoCV(alphas=[10**10], normalize=True)

models = [
    ('ordinary regression', LR),
    ('ridge (L2) regression', LR_ridge),
    ('lasso (L1) regression', LR_lasso),
    ('null model', LR_null),
]

for label, model in models:
    # cross_val_score works on clones of the estimator, so this fit does not
    # influence the scores; it only leaves each named model fitted on all data.
    model.fit(x, y)
    MSE = cv_mse(model, x, y)
    print('%s MSE = %0.3f' % (label, MSE.mean()))

ordinary regression MSE = 116599.014
ridge (L2) regression MSE = 114211.536
lasso (L1) regression MSE = 115289.938
null model MSE = 204591.764


L2 regularization has the lowest MSE, but L1 is easier to interpret because many of its coefficients are exactly zero. The null model (all coefficients equal to 0) is the worst.