In [93]:
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import t, f, multivariate_normal, norm
import numpy as np
import statsmodels.api as sm
from Linear_Reg_Diagnostic import Linear_Reg_Diagnostic
import statsmodels.stats.outliers_influence as smoi
from statsmodels.graphics.api import interaction_plot, qqplot
from statsmodels.stats.anova import anova_lm
import statsmodels.formula.api as smf
from statsmodels.genmod.generalized_linear_model import GLMResults
import patsy
from sklearn import model_selection, linear_model, metrics




In [94]:
data = pd.read_csv("https://hastie.su.domains/ElemStatLearn/datasets/prostate.data",  delim_whitespace=True)
data.head()

Unnamed: 0,lcavol,lweight,age,lbph,svi,lcp,gleason,pgg45,lpsa,train
1,-0.579818,2.769459,50,-1.386294,0,-1.386294,6,0,-0.430783,T
2,-0.994252,3.319626,58,-1.386294,0,-1.386294,6,0,-0.162519,T
3,-0.510826,2.691243,74,-1.386294,0,-1.386294,7,20,-0.162519,T
4,-1.203973,3.282789,58,-1.386294,0,-1.386294,6,0,-0.162519,T
5,0.751416,3.432373,62,-1.386294,0,-1.386294,6,0,0.371564,T


sol 1:

In [95]:
def add_noise_features(dmatrix, k):
    return dmatrix.assign(**{f'noise_{i}': norm.rvs(size=dmatrix.shape[0]) for i in range(k)})

In [96]:
def get_noisy(x, y, k):
    var = sm.OLS(y, x).fit().mse_resid
    noisy_y = y + norm.rvs(scale=var, size=y.size)
    noisy_x = add_noise_features(x, k)
    return noisy_x, noisy_y

In [97]:
x, y = data.drop(columns=['train', 'lpsa']), data['lpsa']

In [98]:
noisy_x, noisy_y = get_noisy(x, y, 20)

In [99]:
X1, X2, Y1, Y2 = model_selection.train_test_split(noisy_x, noisy_y, test_size=0.5, random_state=42)

sol 2:

In [100]:
lasso = linear_model.Lasso()
parameters = {'alpha':[0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1, 3]}
clf = model_selection.GridSearchCV(lasso, parameters, scoring='neg_mean_squared_error', )
clf.fit(X1, Y1,)
print(pd.DataFrame(clf.cv_results_)[['param_alpha', 'mean_test_score']])
beta_lasso = clf.best_estimator_.coef_
lasso_model = clf.best_estimator_

  param_alpha  mean_test_score
0       0.001        -3.066264
1       0.003        -2.817068
2        0.01        -2.330086
3        0.03        -1.787618
4         0.1        -1.281062
5         0.3        -1.130332
6           1        -1.721935
7           3        -1.698885


In [101]:
ridge = linear_model.Ridge()
parameters = {'alpha':[0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100, 300]}
clf = model_selection.GridSearchCV(ridge, parameters, scoring='neg_mean_squared_error', )
clf.fit(X1, Y1,)
print(pd.DataFrame(clf.cv_results_)[['param_alpha', 'mean_test_score']])
beta_ridge = clf.best_estimator_.coef_
ridge_model = clf.best_estimator_

   param_alpha  mean_test_score
0        0.001        -3.239941
1        0.003        -3.234293
2         0.01        -3.215012
3         0.03        -3.163703
4          0.1        -3.017571
5          0.3        -2.749483
6            1        -2.302759
7            3        -1.843990
8           10        -1.500448
9           30        -1.413104
10         100        -1.508258
11         300        -1.630660


sol 3:

In [102]:
metrics.mean_squared_error(lasso_model.predict(X2), Y2)

0.7324019852636844

In [103]:
metrics.mean_squared_error(ridge_model.predict(X2), Y2)

0.7368060048928802

we can see that lasso got here a slightly better score

sol 4:

In [111]:
x, y = data.drop(columns=['train', 'lpsa']), data['lpsa']

In [112]:
noisy_x, noisy_y = get_noisy(x, y, 50)

In [113]:
X1, X2, Y1, Y2 = model_selection.train_test_split(noisy_x, noisy_y, test_size=0.5, random_state=42)

In [122]:
lasso = linear_model.Lasso(max_iter=5000)
parameters = {'alpha':[0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1, 3]}
clf = model_selection.GridSearchCV(lasso, parameters, scoring='neg_mean_squared_error', )
clf.fit(X1, Y1,)
print(pd.DataFrame(clf.cv_results_)[['param_alpha', 'mean_test_score']])
beta_lasso = clf.best_estimator_.coef_
lasso_model = clf.best_estimator_

  param_alpha  mean_test_score
0       0.001        -2.671088
1       0.003        -2.512719
2        0.01        -2.080017
3        0.03        -1.413301
4         0.1        -1.022331
5         0.3        -1.172024
6           1        -1.854838
7           3        -1.839602


In [123]:
ridge = linear_model.Ridge()
parameters = {'alpha':[0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100, 300]}
clf = model_selection.GridSearchCV(ridge, parameters, scoring='neg_mean_squared_error', )
clf.fit(X1, Y1,)
print(pd.DataFrame(clf.cv_results_)[['param_alpha', 'mean_test_score']])
beta_ridge = clf.best_estimator_.coef_
ridge_model = clf.best_estimator_

   param_alpha  mean_test_score
0        0.001        -2.889300
1        0.003        -2.887870
2         0.01        -2.882887
3         0.03        -2.868815
4          0.1        -2.821421
5          0.3        -2.700135
6            1        -2.391155
7            3        -1.952030
8           10        -1.521462
9           30        -1.359946
10         100        -1.464597
11         300        -1.660897


In [124]:
metrics.mean_squared_error(lasso_model.predict(X2), Y2)

1.09400901541346

In [125]:
metrics.mean_squared_error(ridge_model.predict(X2), Y2)

1.1802662619010904

lasso was better again