In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
%matplotlib notebook

from sklearn.metrics import r2_score
import statsmodels.api as sm
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

  from pandas.core import datetools


In [2]:
# Load the spam data
data = pd.read_csv('../data/prostate.data',sep='\t',index_col=0)

X_cols = [col for col in data.columns if (col not in ['train','lpsa'])]
y_cols = ['lpsa']

# Scale the columns to mean 0 and var 
scaler = StandardScaler()
data[X_cols] = scaler.fit_transform(data[X_cols])

### Split into training and test data

In [3]:
# # # Split in train/test
train_data = data[data.train=='T']
test_data = data[data.train=='F']

train_data = train_data.drop(['train'],axis=1)
test_data = test_data.drop(['train'],axis=1)

### Training data correlation matix (Table 3.1 on page 50)

In [4]:
train_data[X_cols].corr()

Unnamed: 0,lcavol,lweight,age,lbph,svi,lcp,gleason,pgg45
lcavol,1.0,0.300232,0.286324,0.063168,0.592949,0.692043,0.426414,0.483161
lweight,0.300232,1.0,0.316723,0.437042,0.181054,0.156829,0.023558,0.074166
age,0.286324,0.316723,1.0,0.287346,0.128902,0.172951,0.365915,0.275806
lbph,0.063168,0.437042,0.287346,1.0,-0.139147,-0.088535,0.032992,-0.030404
svi,0.592949,0.181054,0.128902,-0.139147,1.0,0.67124,0.306875,0.481358
lcp,0.692043,0.156829,0.172951,-0.088535,0.67124,1.0,0.476437,0.662533
gleason,0.426414,0.023558,0.365915,0.032992,0.306875,0.476437,1.0,0.757056
pgg45,0.483161,0.074166,0.275806,-0.030404,0.481358,0.662533,0.757056,1.0


### Fitting a linear regression (Table 3.2 page 50)

In [5]:
X_train = train_data[X_cols].values
y_train = train_data[y_cols].values

In [6]:
# Linear regression OLS
X_train_ols = sm.add_constant(X_train)

model = sm.OLS(y_train, X_train_ols)
results = model.fit()
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.694
Model:                            OLS   Adj. R-squared:                  0.652
Method:                 Least Squares   F-statistic:                     16.47
Date:                Sun, 24 Mar 2019   Prob (F-statistic):           2.04e-12
Time:                        21:30:57   Log-Likelihood:                -67.505
No. Observations:                  67   AIC:                             153.0
Df Residuals:                      58   BIC:                             172.9
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          2.4649      0.089     27.598      0.0

### Best subset selection through exhaustive search $2^{nfeatures}$

In [12]:
def fit_linear_reg(X,y,fit_intercept=True):
    #Fit linear regression model and return mse and R squared values
    model_k = LinearRegression(fit_intercept)
    model_k.fit(X,y)
    MSE = mean_squared_error(y,model_k.predict(X)) * len(y)
    R_squared = model_k.score(X,y)
    return MSE, R_squared

def fit_stat_linear_reg(X,y):
    X_train_ols = sm.add_constant(X)
    model = sm.OLS(y, X_train_ols)
    results = model.fit()
    return results

In [15]:
import itertools

n_features = len(X_cols)
total_combinations = 2**n_features

results_df = pd.DataFrame(columns=['features','mse','r2','n_features'])
stuff = np.arange(n_features)

rownum = 0
for L in range(0, len(stuff)+1):
    for subset in itertools.combinations(stuff, L):
        if len(subset)>0:
            mse,r2 = fit_linear_reg(X_train[:,subset],y_train)
            results_df.loc[rownum] = [subset, mse, r2, len(subset)]
            rownum = rownum + 1
        else: # Just fit the intercept for 0 size
            mse,r2 = fit_linear_reg(np.ones(y_train.shape),y_train,fit_intercept=False)
            results_df.loc[rownum] = [subset, mse, r2, len(subset)]
            rownum = rownum + 1

### Figure 3.5 (page 58): Plot all RSS for all models and select the best for each subset size

In [24]:
best_mse = results_df.groupby('n_features')['mse'].min()
fig,ax = plt.subplots()
ax.plot(results_df.n_features,results_df.mse,'r.')
ax.plot(best_mse.index,best_mse.values,'go-')
ax.set_xlabel('# of features')
ax.set_ylabel('mse')

<IPython.core.display.Javascript object>

Text(0,0.5,'mse')

## Now choose the best models among these using cross validation

In [32]:
best_models = results_df[results_df.groupby('n_features')['mse'].transform(min)==results_df['mse']]

In [33]:
best_models

Unnamed: 0,features,mse,r2,n_features
0,(),96.281445,1.110223e-16,0
1,"(0,)",44.528583,0.5375165,1
9,"(0, 1)",37.091846,0.614756,2
39,"(0, 1, 4)",34.907749,0.6374405,3
98,"(0, 1, 3, 4)",32.814995,0.6591763,4
175,"(0, 1, 3, 4, 7)",32.069447,0.6669198,5
230,"(0, 1, 3, 4, 5, 7)",30.539778,0.6828072,6
248,"(0, 1, 2, 3, 4, 5, 7)",29.4373,0.6942578,7
255,"(0, 1, 2, 3, 4, 5, 6, 7)",29.426384,0.6943712,8


In [43]:
from sklearn.model_selection import cross_val_score

mean_CV_error = {}
std_CV_error = {}

model = LinearRegression()
for ind in range(best_models.shape[0]):
    row = best_models.iloc[ind]
    if row.n_features>0:
        scores = cross_val_score(model, X_train[:,row.features], y_train, cv=5)
    else:
        scores = cross_val_score(model, np.ones(y_train.shape), y_train, cv=5)
    
    mean_CV_error[row.n_features] = np.mean(scores)
    std_CV_error[row.n_features] = np.std(scores)
    print(scores)

[ -9.48374589 -11.50130146  -0.94792447 -14.68846983 -13.4413298 ]
[ -3.67997145 -10.99371548 -10.26667086  -8.77431186  -4.15020233]
[ -2.86506763 -13.106284    -6.35091259  -6.2267032   -3.98743112]
[ -3.03799703 -13.03371854  -9.61463632  -6.97557557  -3.91701092]
[ -2.83931189 -15.50115236 -11.07932933  -6.79135855  -3.83813068]
[ -2.82735373 -15.57246565 -11.86124528  -6.42679413  -4.07057405]
[ -2.80479078 -14.5544904  -10.802942    -6.19631344  -3.44592275]
[ -2.649473   -15.13155607  -9.40641216  -7.72033969  -3.30079344]
[ -2.74533614 -16.63145179 -10.07079694  -7.72076706  -3.51815561]


In [37]:
row

features      (0, 1, 2, 3, 4, 5, 6, 7)
mse                            29.4264
r2                            0.694371
n_features                           8
Name: 255, dtype: object