In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
%matplotlib notebook

from sklearn.metrics import r2_score
import statsmodels.api as sm
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

  from pandas.core import datetools


In [2]:
# Load the spam data
data = pd.read_csv('../data/prostate.data',sep='\t',index_col=0)
data = data.sample(frac=1).reset_index(drop=True)


X_cols = [col for col in data.columns if (col not in ['train','lpsa'])]
y_cols = ['lpsa']

# Scale the columns to mean 0 and var 
scaler = StandardScaler()
data[X_cols] = scaler.fit_transform(data[X_cols])

### Split into training and test data

In [3]:
# # # Split in train/test
train_data = data[data.train=='T']
test_data = data[data.train=='F']

train_data = train_data.drop(['train'],axis=1)
test_data = test_data.drop(['train'],axis=1)

### Training data correlation matix (Table 3.1 on page 50)

In [4]:
train_data[X_cols].corr()

Unnamed: 0,lcavol,lweight,age,lbph,svi,lcp,gleason,pgg45
lcavol,1.0,0.300232,0.286324,0.063168,0.592949,0.692043,0.426414,0.483161
lweight,0.300232,1.0,0.316723,0.437042,0.181054,0.156829,0.023558,0.074166
age,0.286324,0.316723,1.0,0.287346,0.128902,0.172951,0.365915,0.275806
lbph,0.063168,0.437042,0.287346,1.0,-0.139147,-0.088535,0.032992,-0.030404
svi,0.592949,0.181054,0.128902,-0.139147,1.0,0.67124,0.306875,0.481358
lcp,0.692043,0.156829,0.172951,-0.088535,0.67124,1.0,0.476437,0.662533
gleason,0.426414,0.023558,0.365915,0.032992,0.306875,0.476437,1.0,0.757056
pgg45,0.483161,0.074166,0.275806,-0.030404,0.481358,0.662533,0.757056,1.0


### Test Error : Fitting a linear regression (Table 3.2 page 50)

In [5]:
X_train = train_data[X_cols].values
y_train = train_data[y_cols].values
X_test = test_data[X_cols].values
y_test = test_data[y_cols].values

In [6]:
# Linear regression

model = LinearRegression(fit_intercept=True)
model.fit(X_train[:,[0,1]],y_train)

y_pred = np.array(model.predict(X_test[:,[0,1]]))
# The mean squared error
print("Mean squared error: %.3f"
      % mean_squared_error(y_test, y_pred))


Mean squared error: 0.492


### Bootstrap Error estimate (eq 3.57 on page 251)
### $\hat{Err}^{(.632)} = .368 \bar{err} + .632 \hat{Err}^{(1)}$
### Where $\hat{Err}^{(1)}$ is the Leave-one-out bootstrap estimate of prediction error

In [52]:
# Training error:
training_error = mean_squared_error(y_train, model.predict(X_train[:,[0,1]]))

# Creat Bootstrap samples
bootstrap_samples = 100000
Replications = np.array([np.random.choice(np.arange(len(y_train)), len(y_train), replace = True) for _ in range(bootstrap_samples)])

# Compute equation 7.56 (pg 251)
train_ind = set(np.arange(len(y_train)))
loo_error = np.zeros(y_train.shape)
C_i = np.zeros(y_train.shape) # How many times an index was left out and used for error computation
for n in range(bootstrap_samples):
    boot_ind = Replications[n,:]
    X_boot = X_train[boot_ind,:]
    X_boot = X_boot[:,[0,1]]
    y_boot = y_train[boot_ind]
    model = LinearRegression(fit_intercept=True)
    model.fit(X_boot,y_boot)
    
    ind_for_error = list(train_ind.difference(set(boot_ind)))
    y_true = y_train[ind_for_error]
    
    X_pred = X_train[ind_for_error,:]
    y_pred = model.predict(X_pred[:,[0,1]])

    loo_error[ind_for_error] = loo_error[ind_for_error] + (y_pred-y_true)**2
    C_i[ind_for_error] = C_i[ind_for_error] + 1

loo_error = loo_error/C_i
loo_error = loo_error/len(y_train)
loo_error = np.sum(loo_error)
total_error = 0.368*training_error + 0.632*loo_error
print('Boostrap .632 error = ',total_error)

NotFittedError: This LinearRegression instance is not fitted yet. Call 'fit' with appropriate arguments before using this method.

Boostrap .632 error =  0.60517277745


### 5 fold cross validation

In [51]:
from sklearn.model_selection import cross_val_score

model = LinearRegression(fit_intercept=True)
scores = -cross_val_score(model, X_train[:,[0,1]], y_train, cv=5, scoring='neg_mean_absolute_error')
print('CV 5-fold error = ',np.mean(scores))

CV 5-fold error =  0.614724659451


### 10 fold cross validation

In [50]:
from sklearn.model_selection import cross_val_score

model = LinearRegression(fit_intercept=True)
scores = -cross_val_score(model, X_train[:,[0,1]], y_train, cv=10, scoring='neg_mean_absolute_error')
print('CV 10-fold error = ',np.mean(scores))

CV 10-fold error =  0.627722501022
