In [191]:
# ridge model
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
Prostate=pd.read_csv('Prostate.txt', sep=' ')
Prostate.head()

Unnamed: 0,lcavol,lweight,age,lbph,svi,lcp,gleason,pgg45,lpsa
1,-0.579818,2.769459,50,-1.386294,0,-1.386294,6,0,-0.430783
2,-0.994252,3.319626,58,-1.386294,0,-1.386294,6,0,-0.162519
3,-0.510826,2.691243,74,-1.386294,0,-1.386294,7,20,-0.162519
4,-1.203973,3.282789,58,-1.386294,0,-1.386294,6,0,-0.162519
5,0.751416,3.432373,62,-1.386294,0,-1.386294,6,0,0.371564


In [133]:
# define x and y
X=Prostate[['lcavol', 'lweight', 'age', 'lbph', 'svi', 'lcp', 'gleason', 'pgg45']]
Y=Prostate['lpsa']

In [192]:
# standardize X
Prostate_std=((Prostate - Prostate.mean(0)) / Prostate.std(0))
Prostate_std.head()

Unnamed: 0,lcavol,lweight,age,lbph,svi,lcp,gleason,pgg45,lpsa
1,-1.637356,-2.006212,-1.862426,-1.024706,-0.522941,-0.863171,-1.042157,-0.864467,-2.520226
2,-1.98898,-0.722009,-0.787896,-1.024706,-0.522941,-0.863171,-1.042157,-0.864467,-2.287827
3,-1.578819,-2.188784,1.361163,-1.024706,-0.522941,-0.863171,0.342627,-0.155348,-2.287827
4,-2.166917,-0.807994,-0.787896,-1.024706,-0.522941,-0.863171,-1.042157,-0.864467,-2.287827
5,-0.507874,-0.458834,-0.250631,-1.024706,-0.522941,-0.863171,-1.042157,-0.864467,-1.82515


In [193]:
print Prostate.mean(0)
print Prostate.std(0)

lcavol      1.350010
lweight     3.628943
age        63.865979
lbph        0.100356
svi         0.216495
lcp        -0.179366
gleason     6.752577
pgg45      24.381443
lpsa        2.478387
dtype: float64
lcavol      1.178625
lweight     0.428411
age         7.445117
lbph        1.450807
svi         0.413995
lcp         1.398250
gleason     0.722134
pgg45      28.204035
lpsa        1.154329
dtype: float64


In [194]:
print Prostate_std.mean(0)
print Prostate_std.std(0)

lcavol     7.325183e-17
lweight    8.011919e-16
age        5.036063e-16
lbph       2.231892e-17
svi        3.296332e-16
lcp        3.296332e-16
gleason   -3.319223e-17
pgg45     -4.091801e-17
lpsa      -7.554095e-16
dtype: float64
lcavol     1.0
lweight    1.0
age        1.0
lbph       1.0
svi        1.0
lcp        1.0
gleason    1.0
pgg45      1.0
lpsa       1.0
dtype: float64


In [195]:
# centralize y
Prostate_ctl=Prostate - Prostate.mean(0)
Prostate_ctl.head()

Unnamed: 0,lcavol,lweight,age,lbph,svi,lcp,gleason,pgg45,lpsa
1,-1.929828,-0.859484,-13.865979,-1.48665,-0.216495,-1.206929,-0.752577,-24.381443,-2.90917
2,-2.344262,-0.309317,-5.865979,-1.48665,-0.216495,-1.206929,-0.752577,-24.381443,-2.640906
3,-1.860835,-0.9377,10.134021,-1.48665,-0.216495,-1.206929,0.247423,-4.381443,-2.640906
4,-2.553982,-0.346154,-5.865979,-1.48665,-0.216495,-1.206929,-0.752577,-24.381443,-2.640906
5,-0.598593,-0.19657,-1.865979,-1.48665,-0.216495,-1.206929,-0.752577,-24.381443,-2.106823


In [197]:
X_std=Prostate_std[['lcavol', 'lweight', 'age', 'lbph', 'svi', 'lcp', 'gleason', 'pgg45']]
Y_ctl=Prostate_ctl['lpsa']

In [196]:
print Prostate_ctl.mean(0)
print Prostate.mean(0)

lcavol     1.831296e-17
lweight    3.158985e-16
age        3.076577e-15
lbph       5.036063e-17
svi        2.037316e-16
lcp        5.402322e-16
gleason   -4.669804e-16
pgg45     -1.065814e-14
lpsa      -7.599877e-16
dtype: float64
lcavol      1.350010
lweight     3.628943
age        63.865979
lbph        0.100356
svi         0.216495
lcp        -0.179366
gleason     6.752577
pgg45      24.381443
lpsa        2.478387
dtype: float64


In [169]:
print Y_ctl[:5]
print Y[:5]

1   -2.909170
2   -2.640906
3   -2.640906
4   -2.640906
5   -2.106823
Name: lpsa, dtype: float64
1   -0.430783
2   -0.162519
3   -0.162519
4   -0.162519
5    0.371564
Name: lpsa, dtype: float64


In [198]:
# split to training/test set
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [199]:
# split to training/test set
from sklearn.cross_validation import train_test_split
X_trainstd, X_teststd, y_trainctl, y_testctl = train_test_split(X_std, Y_ctl, test_size=0.2, random_state=42)

In [172]:
#Grid Search Parameter Tuning
# prepare a range of alpha values to test
alphas = np.array([1,0.1,0.01,0.001,0.0001,0])
# create and fit a ridge regression model, testing each alpha
model1 = Ridge()
grid1 = GridSearchCV(estimator=model1, param_grid=dict(alpha=alphas),cv=5)
grid1.fit(X_train, y_train)
# summarize the results of the grid search
print grid1
# summarize the results of the grid search
print(grid1.best_score_)
print(grid1.best_estimator_.alpha)

GridSearchCV(cv=5, error_score='raise',
       estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'alpha': array([  1.00000e+00,   1.00000e-01,   1.00000e-02,   1.00000e-03,
         1.00000e-04,   0.00000e+00])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)
0.434291640231
0.1


In [173]:
#Grid Search Parameter Tuning
# prepare a range of alpha values to test
alphas = np.array([1,0.1,0.01,0.001,0.0001,0])
# create and fit a ridge regression model, testing each alpha
model2 = Ridge()
grid2 = GridSearchCV(estimator=model2, param_grid=dict(alpha=alphas), cv=5)
grid2.fit(X_trainstd, y_trainctl)
print(grid2)
# summarize the results of the grid search
print(grid2.best_score_)
print(grid2.best_estimator_.alpha)

GridSearchCV(cv=5, error_score='raise',
       estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'alpha': array([  1.00000e+00,   1.00000e-01,   1.00000e-02,   1.00000e-03,
         1.00000e-04,   0.00000e+00])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)
0.435218492106
1.0


In [174]:
# fit the model
ridge1 = Ridge(alpha=0.1)
ridge1.fit(X_train, y_train)
print "R^2 for training set:",
print ridge1.score(X_train, y_train)

print '-'*50

print "R^2 for test set:",
print ridge1.score(X_test, y_test)

print '-'*50

print "RSS: %.2f" % np.sum((ridge1.predict(X_test) - y_test) ** 2)

R^2 for training set: 0.629995283603
--------------------------------------------------
R^2 for test set: 0.757481079807
--------------------------------------------------
RSS: 6.94


In [175]:
ridge1.coef_

array([ 0.56928441,  0.55222504, -0.02263402,  0.07106148,  0.78005884,
       -0.12959716,  0.06915147,  0.00415408])

In [176]:
colnames = ['lcavol', 'lweight', 'age', 'lbph', 'svi', 'lcp', 'gleason', 'pgg45']
result1 = pd.DataFrame(ridge1.coef_).transpose()
result1.columns = colnames
result1['intercept'] = ridge1.intercept_ 
result1 = result1.transpose()
result1.columns = ['coefficient']
result1

Unnamed: 0,coefficient
lcavol,0.569284
lweight,0.552225
age,-0.022634
lbph,0.071061
svi,0.780059
lcp,-0.129597
gleason,0.069151
pgg45,0.004154
intercept,0.405359


In [177]:
# fit the model
ridge2 = Ridge(alpha=1.0)
ridge2.fit(X_trainstd, y_trainctl)
print "R^2 for training set:",
print ridge2.score(X_trainstd, y_trainctl)

print '-'*50

print "R^2 for test set:",
print ridge2.score(X_teststd, y_testctl)

print '-'*50

print "RSS: %.2f" % np.sum((ridge2.predict(X_teststd) - y_testctl) ** 2)

R^2 for training set: 0.629815215247
--------------------------------------------------
R^2 for test set: 0.758109326274
--------------------------------------------------
RSS: 6.92


In [178]:
colnames = ['lcavol', 'lweight', 'age', 'lbph', 'svi', 'lcp', 'gleason', 'pgg45']
result2 = pd.DataFrame(ridge2.coef_).transpose()
result2.columns = colnames
result2['intercept'] = ridge2.intercept_ 
result2 = result2.transpose()
result2.columns = ['coefficient']
result2

Unnamed: 0,coefficient
lcavol,0.654348
lweight,0.237256
age,-0.161959
lbph,0.10099
svi,0.320171
lcp,-0.164235
gleason,0.050796
pgg45,0.109944
intercept,0.020865


In [179]:
# lasso model
from sklearn.linear_model import Lasso
#Grid Search Parameter Tuning
# prepare a range of alpha values to test
alphas = np.array([1,0.1,0.01,0.001,0.0001,0])
# create and fit a lasso regression model, testing each alpha
model3 = Lasso()
grid3 = GridSearchCV(estimator=model3, param_grid=dict(alpha=alphas),cv=5)
grid3.fit(X_train, y_train)
print(grid3)
# summarize the results of the grid search
print(grid3.best_score_)
print(grid3.best_estimator_.alpha)

GridSearchCV(cv=5, error_score='raise',
       estimator=Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'alpha': array([  1.00000e+00,   1.00000e-01,   1.00000e-02,   1.00000e-03,
         1.00000e-04,   0.00000e+00])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)
0.435259527531
0.01


In [186]:
# lasso model
from sklearn.linear_model import Lasso
#Grid Search Parameter Tuning
# prepare a range of alpha values to test
alphas = np.array([1,0.1,0.01,0.001,0.0001,0])
# create and fit a lasso regression model, testing each alpha
model4 = Lasso()
grid4 = GridSearchCV(estimator=model4, param_grid=dict(alpha=alphas), cv=5)
grid4.fit(X_trainstd, y_trainctl)
print(grid4)
# summarize the results of the grid search
print(grid4.best_score_)
print(grid4.best_estimator_.alpha)

GridSearchCV(cv=5, error_score='raise',
       estimator=Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'alpha': array([  1.00000e+00,   1.00000e-01,   1.00000e-02,   1.00000e-03,
         1.00000e-04,   0.00000e+00])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)
0.437597484705
0.1


In [187]:
# fit the model
lasso1 = Lasso(alpha=0.01)
lasso1.fit(X_train, y_train)
print "R^2 for training set:",
print lasso1.score(X_train, y_train)

print '-'*50

print "R^2 for test set:",
print lasso1.score(X_test, y_test)

print '-'*50

print "RSS: %.2f" % np.sum((lasso1.predict(X_test) - y_test) ** 2)

R^2 for training set: 0.627625020466
--------------------------------------------------
R^2 for test set: 0.756182139353
--------------------------------------------------
RSS: 6.98


In [188]:
colnames = ['lcavol', 'lweight', 'age', 'lbph', 'svi', 'lcp', 'gleason', 'pgg45']
result3 = pd.DataFrame(lasso1.coef_).transpose()
result3.columns = colnames
result3['intercept'] = lasso1.intercept_ 
result3 = result3.transpose()
result3.columns = ['coefficient']
result3

Unnamed: 0,coefficient
lcavol,0.566727
lweight,0.499849
age,-0.019827
lbph,0.069023
svi,0.660666
lcp,-0.088302
gleason,0.0
pgg45,0.004676
intercept,0.906338


In [189]:
# shrink the coefficients of gleason to 0

In [190]:
# fit the model
lasso2 = Lasso(alpha=0.1)
lasso2.fit(X_trainstd, y_trainctl)
print "R^2 for training set:",
print lasso2.score(X_trainstd, y_trainctl)

print '-'*50

print "R^2 for test set:",
print lasso2.score(X_teststd, y_testctl)

print '-'*50

print "RSS: %.2f" % np.sum((lasso2.predict(X_teststd) - y_testctl) ** 2)

R^2 for training set: 0.590395701258
--------------------------------------------------
R^2 for test set: 0.687950325307
--------------------------------------------------
RSS: 8.93


In [185]:
colnames = ['lcavol', 'lweight', 'age', 'lbph', 'svi', 'lcp', 'gleason', 'pgg45']
result4 = pd.DataFrame(lasso2.coef_).transpose()
result4.columns = colnames
result4['intercept'] = lasso2.intercept_ 
result4 = result4.transpose()
result4.columns = ['coefficient']
result4

Unnamed: 0,coefficient
lcavol,0.562257
lweight,0.178074
age,-0.0
lbph,0.0
svi,0.202819
lcp,0.0
gleason,0.0
pgg45,0.0
intercept,0.00808


In [11]:
# shrink some coefficients of age, lbph,lcp, gleason, pgg45 to 0