In [1]:
# %load ../standard_import.txt
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt

import sklearn.linear_model as skl_lm
from sklearn.metrics import mean_squared_error
from sklearn import cross_validation

# Plain-text frame reprs, no column/sequence truncation, up to 150 rows shown.
for option_name, option_value in (
    ('display.notebook_repr_html', False),
    ('display.max_columns', None),
    ('display.max_rows', 150),
    ('display.max_seq_items', None),
):
    pd.set_option(option_name, option_value)
 
#%config InlineBackend.figure_formats = {'svg',}
%matplotlib inline

import seaborn as sns
# Seaborn plot defaults: 'notebook' scaling context and the 'darkgrid' axes style.
sns.set_context('notebook')
sns.set_style('darkgrid')

In [2]:
# Load the Auto data ('?' marks a missing value in the CSV), drop every row
# that has a missing value, then append squared and cubed horsepower columns
# for the polynomial fits below.
# Alternative source kept for reference: pd.read_excel('Data/Auto2.xlsx')
df1 = (
    pd.read_csv('Data/Auto.csv', na_values='?')
    .dropna()
    .assign(horsepower2=lambda frame: frame.horsepower**2,
            horsepower3=lambda frame: frame.horsepower**3)
)
df1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 392 entries, 0 to 396
Data columns (total 11 columns):
mpg             392 non-null float64
cylinders       392 non-null int64
displacement    392 non-null float64
horsepower      392 non-null float64
weight          392 non-null int64
acceleration    392 non-null float64
year            392 non-null int64
origin          392 non-null int64
name            392 non-null object
horsepower2     392 non-null float64
horsepower3     392 non-null float64
dtypes: float64(6), int64(4), object(1)
memory usage: 35.2+ KB


## Lab

### § 5.3.1 Validation Set
Split the data into a training set and a test (validation) set. In this case the two sets are of equal size.

In [47]:
# Fraction of the observations held out for validation (0.5 -> equal halves).
t_prop = 0.5

# The whole frame is split here; the predictor columns are selected when the
# models are fitted.  The response is passed as an (n, 1) column vector.
# Series.reshape is deprecated in pandas, so the underlying NumPy array is
# reshaped instead -- the resulting array is the same.
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    df1, df1.mpg.values.reshape(-1, 1), test_size=t_prop, random_state=0)

# Alias kept because the model-evaluation cell refers to the held-out response as `y`.
y = y_test

In [4]:
def _fit_and_predict(feature_cols):
    """Fit a linear regression on the training split and predict the test split.

    Parameters
    ----------
    feature_cols : list of str
        Columns of ``X_train`` / ``X_test`` used as predictors.

    Returns
    -------
    tuple
        ``(model, predictions)``: the fitted ``LinearRegression`` and its
        predictions for ``X_test``.
    """
    model = skl_lm.LinearRegression()
    # `.values` replaces the deprecated DataFrame.as_matrix().  Selecting with
    # a list of columns already yields a 2-D (n_samples, n_features) array, so
    # no extra reshape is required.
    model.fit(X_train[feature_cols].values, y_train)
    return model, model.predict(X_test[feature_cols].values)


# Three fits of mpg on increasing powers of horsepower.
# NOTE(review): the third model uses horsepower and horsepower3 only (no
# squared term), so it is not a full cubic polynomial -- confirm this is intended.
regr1, pred1 = _fit_and_predict(['horsepower'])
regr2, pred2 = _fit_and_predict(['horsepower', 'horsepower2'])
regr3, pred3 = _fit_and_predict(['horsepower', 'horsepower3'])

# Validation-set mean squared error of each model, in the order fitted.
for pred in (pred1, pred2, pred3):
    print(mean_squared_error(y, pred))

23.6166170697
18.7630313469
18.8813346895


### § 5.3.2 Leave-One-Out Cross-Validation

In [46]:
# One fold per observation.  The number of observations is taken from the
# frame instead of being hard-coded, so it stays correct if the rows kept
# after dropna() ever change.
loo = cross_validation.LeaveOneOut(len(df1))

# (estimator, predictor columns) for each model being compared.
loo_specs = (
    (regr1, ['horsepower']),
    (regr2, ['horsepower', 'horsepower2']),
    (regr3, ['horsepower', 'horsepower3']),
)

# The 'mean_squared_error' scorer reports the *negated* MSE (the printed
# means are negative); the magnitude is the cross-validated MSE.
scores1, scores2, scores3 = [
    cross_validation.cross_val_score(estimator, df1[feature_cols], df1.mpg,
                                     cv=loo, scoring='mean_squared_error')
    for estimator, feature_cols in loo_specs
]

for scores in (scores1, scores2, scores3):
    print(scores.mean())

-24.2315135179
-19.2482131245
-19.4271386046


### § 5.3.3 k-Fold Cross-Validation
Using k=10

In [48]:
# An integer `cv` makes cross_val_score() build a KFold iterator itself, which
# is equivalent to constructing one explicitly and passing it as `cv`, e.g.
#   cross_validation.KFold(392, n_folds=10)
n_folds = 10

# (estimator, predictor columns) for each model being compared.
kfold_specs = (
    (regr1, ['horsepower']),
    (regr2, ['horsepower', 'horsepower2']),
    (regr3, ['horsepower', 'horsepower3']),
)

scores1, scores2, scores3 = [
    cross_validation.cross_val_score(estimator, df1[feature_cols], df1.mpg,
                                     cv=n_folds, scoring='mean_squared_error')
    for estimator, feature_cols in kfold_specs
]

# Mean score over the folds for each model, in the order listed above.
for fold_scores in (scores1, scores2, scores3):
    print(fold_scores.mean())

-27.4399336523
-21.2358400558
-21.4956756318
