In [296]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
import statsmodels.api as sm
import statsmodels.formula.api as smf
np.random.seed(1)

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

## Lab

In [297]:
df = pd.read_csv('http://www-bcf.usc.edu/~gareth/ISL/Auto.csv'
                 , na_values='?').dropna() # drop columns with NaN
print("Shape of dataframe: " + str(df.shape))
df.head()

Shape of dataframe: (392, 9)


Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name
0,18.0,8,307.0,130.0,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150.0,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150.0,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140.0,3449,10.5,70,1,ford torino


### Pandas test/train split method

Do not need to define your response (y) and predictors (X).

In [298]:
train = df.sample(frac=0.5, replace=False, random_state=1)
test = df.loc[df.index.difference(train.index)]
print(train.shape)
print(test.shape)

(196, 9)
(196, 9)


### Sklearn test/train split method

You must define your response (y) and predictors (X).

In [299]:
X = df.iloc[:,1:] #all columns except 'mpg'
y = df['mpg']
print(X.shape)
print(y.shape)

(392, 8)
(392,)


In [300]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=1)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(196, 8)
(196,)
(196, 8)
(196,)


#### 5.3.1 The Validation Set Approach

* statmodels style

In [301]:
# build model with train set
lm = smf.ols('mpg ~ horsepower', data=train).fit()

# predict on test set and get MSE
np.mean((test['mpg']-lm.predict(test))**2)

23.36190289258723

Resample our original df to create new test/train sets.

In [302]:
# adjust 'random_state'. equiv. to set.seed
train = df.sample(frac=0.5, replace=False, random_state=2)
test = df.loc[df.index.difference(train.index)]

Slightly different test MSE bc we are using a different training set.

In [303]:
# build model with train set
lm = smf.ols('mpg ~ horsepower', data=train).fit()

# predict on test set and get MSE
np.mean((test['mpg']-lm.predict(test))**2)

25.10853905288965

#### The Validation Set Approach

* sklearn style

The LinearRegression module (and other modules) in sklearn requires both the `fit()` and `predict()` to be an array with shape `(n_samples, n_features)`. Since we are only using one feature in this example from the book, we use `reshape` to get our data in the correct shape. If we use >1 predictors we do not need to take the `reshape` path.

In [304]:
X = df['horsepower'].values.reshape(-1,1)
y = df['mpg'].values.reshape(-1,1)

# split data. adjust 'random_state' for new seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=1)

# instantiate and train model
clf = LinearRegression()
clf.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [305]:
# predict on our test set
y_pred = clf.predict(X_test)

# get MSE on our test set
mean_squared_error(y_test, y_pred)

24.80212062059356

#### 5.3.2 Leave-One-Out Cross-Validation

In [306]:
from sklearn.model_selection import KFold
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import cross_val_score

clf = LinearRegression()
loo = LeaveOneOut()
loo.get_n_splits(X) # = n

392

In [307]:
scores = cross_val_score(clf, X, y, cv=loo, scoring='neg_mean_squared_error')
print(abs(scores.mean()))

24.23151351792923


#### 5.3.3 k-fold Cross-Validation

In [308]:
# k=10
# no polynomials used in this example. unlike in the book
kf = KFold(n_splits=10, random_state=1, shuffle=True)
scores = cross_val_score(clf, X, y, cv=kf, scoring='neg_mean_squared_error')
print(abs(scores.mean()))
print(abs(scores))

24.09767573188306
[25.65015985 30.29781332 23.56550989 20.75383058 23.42372298 32.93506361
 21.89071342 27.35148938 17.85005823 17.25839605]


In [309]:
# polynomials loop for horsepower

from sklearn.preprocessing import PolynomialFeatures
import time

p_order = np.arange(1,11)
r_state = np.arange(0,10)

# LeaveOneOut CV
regr = LinearRegression()
loo = LeaveOneOut()
loo.get_n_splits(df)
scores = list()

# loop through polynomials to the the 10th degree
start = time.time()
for i in range(1,11):
    poly = PolynomialFeatures(i)
    X_poly = poly.fit_transform(df.horsepower.values.reshape(-1,1))
    score = cross_val_score(regr, X_poly, df.mpg, cv=loo, scoring='neg_mean_squared_error').mean()
    scores.append(abs(score))
end = time.time()    
print(scores)
print("Took: " + str(end-start) + " seconds to run.")
#loo takes much longer to run than k=10

[24.231513517929226, 19.248213124489787, 19.334984064106756, 19.42443031166752, 19.033207024628698, 18.99500404220338, 19.125478734084673, 19.224245984955925, 19.133856717639716, 18.94686917818659]
Took: 7.097242832183838 seconds to run.


In [310]:
# polynomials loop for horsepower

p_order = np.arange(1,11)
r_state = np.arange(0,10)

# k=10
regr = LinearRegression()
kf = KFold(n_splits=10, random_state=1, shuffle=True)
scores = list()

# loop through polynomials to the the 10th degree
start = time.time()
for i in range(1,11):
    poly = PolynomialFeatures(i)
    X_poly = poly.fit_transform(df.horsepower.values.reshape(-1,1))
    score = cross_val_score(regr, X_poly, df.mpg, cv=kf, scoring='neg_mean_squared_error').mean()
    scores.append(abs(score))
end = time.time()    
print(scores)
print("Took: " + str(end-start) + " seconds to run.")
# runs much quicker than loocv

[24.097675731883058, 19.17888986488969, 19.213859523750337, 19.212807015950848, 18.757987366593643, 18.64228358781309, 18.820947526949453, 18.975741568188322, 18.93752011116788, 18.791101516272114]
Took: 0.19959187507629395 seconds to run.


#### 5.3.4 The Bootstrap

In [311]:
df = pd.read_csv("https://raw.githubusercontent.com/vincentarelbundock/Rdatasets/master/csv/ISLR/Portfolio.csv", 
                 index_col=0).dropna()

In [376]:
print(df.shape)
df.head()

(100, 2)


Unnamed: 0,X,Y
1,-0.895251,-0.234924
2,-1.562454,-0.885176
3,-0.41709,0.271888
4,1.044356,-0.734198
5,-0.315568,0.841983


In [361]:
def alpha_fn(data, r, c):
    X = data.X[r:c]
    Y = data.Y[r:c]
    return ((np.var(Y)-np.cov(X,Y)[0,1])/(np.var(X)+np.var(Y) - 2 * np.cov(X,Y)[0,1]))

alpha_fn(df, 0, 100)

0.5766511516104116

In [450]:
# cleaner version
def alpha_fn(data, index):
    X = data.X[index]
    Y = data.Y[index]
    return (np.var(Y) - np.cov(X,Y)[0,1])/(np.var(X) + np.var(Y) - 2 * np.cov(X, Y)[0,1])

alpha_fn(Portfolio, list(range(100)))

0.5766511516104116

In [449]:
# numpy version of random sampling for bootstrapping

np.random.seed(1)
alpha_fn(Portfolio, np.random.choice(list(range(100)), size=100))

0.4504820492455901

In [488]:
# sklearn version of sampling for bootstrapping

from sklearn.utils import resample

def alpha_fn(data, r, c):
    data = resample(data, n_samples=100, random_state=1)
    X = data.X[r:c]
    Y = data.Y[r:c]
    return ((np.var(Y)-np.cov(X,Y)[0,1])/(np.var(X)+np.var(Y) - 2 * np.cov(X,Y)[0,1]))

alpha_fn(df, 0, 100)

0.4504820492455901

In [485]:
# R equivalent: sample(100,100,replace = T)

np.sort(np.random.choice(100, 100)) # replace=True by default

array([ 0,  2,  2,  2,  5,  5,  6,  7,  8, 10, 10, 11, 11, 12, 12, 13, 14,
       14, 16, 16, 17, 18, 18, 19, 21, 22, 22, 23, 25, 26, 27, 27, 28, 28,
       28, 28, 29, 30, 30, 32, 33, 35, 35, 36, 36, 36, 37, 38, 40, 40, 41,
       41, 42, 42, 46, 46, 46, 48, 48, 49, 50, 52, 52, 56, 57, 58, 58, 61,
       63, 63, 64, 66, 70, 72, 72, 75, 75, 75, 78, 78, 80, 81, 81, 81, 85,
       85, 86, 87, 87, 89, 91, 92, 92, 93, 94, 96, 97, 97, 98, 99])

In [480]:
x = [1,2,3]
y = [6,5,4]
X = np.stack((x,y), axis=1)
print(X)
print(np.cov(x,y))
print(np.cov(x))

[[1 6]
 [2 5]
 [3 4]]
[[ 1. -1.]
 [-1.  1.]]
1.0


In [482]:
X.T

array([[1, 2, 3],
       [6, 5, 4]])