In [252]:
# %load ../standard_import.txt
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt

from sklearn import linear_model
from sklearn.preprocessing import scale
from sklearn import cross_validation

# Notebook-wide pandas display settings, applied in a single call:
# plain-text (non-HTML) frame reprs, no cap on displayed columns,
# up to 150 displayed rows, and no truncation of long sequences.
pd.set_option(
    'display.notebook_repr_html', False,
    'display.max_columns', None,
    'display.max_rows', 150,
    'display.max_seq_items', None,
)
 
# Optional: uncomment the next line to render inline figures as SVG.
#%config InlineBackend.figure_formats = {'svg',}
# Render matplotlib figures inline, directly below the cell that creates them.
%matplotlib inline

import seaborn as sns
# Seaborn theme for every figure in the notebook: 'notebook' scaling context
# and the 'darkgrid' axes style.
sns.set_context('notebook')
sns.set_style('darkgrid')

# Lab

### § 6.6.1 Ridge Regression 

In [14]:
# Read the Hitters data and keep only the rows that have no missing values.
df = (
    pd.read_csv('Data/Hitters.csv')
    .dropna()
)
# Summarise column dtypes and non-null counts as a sanity check.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 263 entries, 1 to 321
Data columns (total 21 columns):
Unnamed: 0    263 non-null object
AtBat         263 non-null int64
Hits          263 non-null int64
HmRun         263 non-null int64
Runs          263 non-null int64
RBI           263 non-null int64
Walks         263 non-null int64
Years         263 non-null int64
CAtBat        263 non-null int64
CHits         263 non-null int64
CHmRun        263 non-null int64
CRuns         263 non-null int64
CRBI          263 non-null int64
CWalks        263 non-null int64
League        263 non-null object
Division      263 non-null object
PutOuts       263 non-null int64
Assists       263 non-null int64
Errors        263 non-null int64
Salary        263 non-null float64
NewLeague     263 non-null object
dtypes: float64(1), int64(16), object(4)
memory usage: 41.1+ KB


In [234]:
# Indicator (dummy) encoding of the three categorical columns; each one
# expands into a column per level, named <column>_<level>.
dummies = pd.get_dummies(
    df.loc[:, ['League', 'Division', 'NewLeague']]
)
dummies.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 263 entries, 1 to 321
Data columns (total 6 columns):
League_A       263 non-null float64
League_N       263 non-null float64
Division_E     263 non-null float64
Division_W     263 non-null float64
NewLeague_A    263 non-null float64
NewLeague_N    263 non-null float64
dtypes: float64(6)
memory usage: 14.4 KB


In [293]:
# Numeric predictors: remove the unnamed object-typed column, the response
# (Salary) and the raw categorical columns, then cast everything to float64.
X_ = (
    df
    .drop(['Unnamed: 0', 'Salary', 'League', 'Division', 'NewLeague'], axis=1)
    .astype('float64')
)
# Append one indicator column per categorical variable; the level that is
# left out acts as the baseline.
X = pd.concat(
    [X_, dummies[['League_N', 'Division_W', 'NewLeague_N']]],
    axis=1,
)
# Response variable.
y = df['Salary']
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 263 entries, 1 to 321
Data columns (total 19 columns):
AtBat          263 non-null float64
Hits           263 non-null float64
HmRun          263 non-null float64
Runs           263 non-null float64
RBI            263 non-null float64
Walks          263 non-null float64
Years          263 non-null float64
CAtBat         263 non-null float64
CHits          263 non-null float64
CHmRun         263 non-null float64
CRuns          263 non-null float64
CRBI           263 non-null float64
CWalks         263 non-null float64
PutOuts        263 non-null float64
Assists        263 non-null float64
Errors         263 non-null float64
League_N       263 non-null float64
Division_W     263 non-null float64
NewLeague_N    263 non-null float64
dtypes: float64(19)
memory usage: 41.1 KB


In [262]:
# Grid of 100 candidate tuning-parameter values, logarithmically spaced
# from 1e10 (first element) down to 1e-2 (last element).
alpha = np.logspace(10, -2, num=100)

#### I can't reproduce the intercept and coefficients from the book with alpha = 11498. I need to check the R and scikit-learn docs for possible differences in how the parameters are defined. But maybe I am doing something else wrong...

In [338]:
# Ridge regression with a single fixed penalty, fitted on standardised
# predictors (scale() centres each column and scales it to unit variance);
# the response y is left on its original scale.
clf = linear_model.Ridge(alpha=11498).fit(scale(X), y)
# fit() returns the estimator itself, so displaying clf shows the same repr.
clf

Ridge(alpha=11498, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, solver='auto', tol=0.001)

In [339]:
# Intercept of the model currently held in clf (the ridge fit on scale(X)).
clf.intercept_

535.92588212927751

In [340]:
# Coefficients of the model currently held in clf, one per column of X in
# X's column order. The model was fitted on scale(X), so the values are on
# the standardised-predictor scale.
clf.coef_

array([ 3.49446172,  3.95260562,  2.9838061 ,  3.75456785,  3.98094819,
        3.99172605,  3.46110101,  4.61699477,  4.84809468,  4.62583947,
        4.96977862,  5.00879671,  4.27539184,  2.83788836,  0.23643382,
       -0.0696133 , -0.03422498, -1.87773486,  0.04848511])

### § 6.6.2 The Lasso

In [323]:
# Randomly split the data: 131 rows for training, the remaining rows for testing.
# random_state is fixed so the shuffle -- and therefore every model fitted on
# this split -- is identical on each Restart & Run All.
# NOTE(review): later scikit-learn releases provide train_test_split from
# sklearn.model_selection instead of sklearn.cross_validation; update the
# import cell when upgrading.
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    X, y, train_size=131, random_state=0
)

In [336]:
# Lasso with the penalty chosen by 10-fold cross-validation over the alpha
# grid. Both the training predictors and the training response are
# standardised, so the fitted coefficients are in standardised units.
# Note: this rebinds clf, which previously held the ridge model.
clf = linear_model.LassoCV(
    alphas=alpha,
    cv=10,
    verbose=True,
).fit(scale(X_train), scale(y_train))
# fit() returns the estimator itself, so displaying clf shows the same repr.
clf

....................................................................................................[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:    0.1s
.......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................

LassoCV(alphas=array([  1.00000e+10,   7.56463e+09, ...,   1.32194e-02,   1.00000e-02]),
    copy_X=True, cv=10, eps=0.001, fit_intercept=True, max_iter=1000,
    n_alphas=100, n_jobs=1, normalize=False, positive=False,
    precompute='auto', random_state=None, selection='cyclic', tol=0.0001,
    verbose=True)