## Learning objectives
* Chain multiple data processing steps together using `Pipeline`
* Use the `KFold` object to split data into multiple folds.
* Perform cross-validation using scikit-learn with `cross_val_predict` and `GridSearchCV`


In [18]:
import numpy as np
import pickle
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re

from sklearn.preprocessing import StandardScaler, PolynomialFeatures, MinMaxScaler
from sklearn.model_selection import KFold, cross_val_predict
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import r2_score
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import RFE

import warnings # suppress warnings
# NOTE(review): the 'ignore' action with no category silences every warning
# for the rest of the session, including deprecation and convergence warnings.
warnings.filterwarnings('ignore')

In [19]:
# Load the pickled Boston housing bundle.
# The context manager closes the file handle deterministically instead of
# leaving an open descriptor around until garbage collection.
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only load pickle files from a source you trust.
with open('boston_housing_clean.pickle', 'rb') as pickle_file:
    boston = pickle.load(pickle_file)
boston.keys()

dict_keys(['dataframe', 'description'])

In [23]:
# Pull the two entries of the loaded bundle into their own names, then
# preview the first rows of the frame.
data, data_Description = boston['dataframe'], boston['description']
data.head(5)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [24]:
# number of observations (rows) in the frame
data.shape[0]

506

In [25]:
# Predictors: every column except the response; response: the MEDV column.
Y = data['MEDV']
X = data.drop(columns='MEDV')


# Steps:
* Split the data into the response variable and the predictors
* Use `KFold` to create 3 splits, each with its own independent training and test set
* Build a regression model on each cross-validation split
* Calculate the accuracy ($R^2$ score) of each model



In [26]:
# 3-fold splitter; shuffling with a fixed seed keeps the folds reproducible.
KF = KFold(n_splits=3, shuffle=True, random_state=123)



In [27]:
# Show the first ten row positions and the size of each fold's train/test part.
for train_index, test_index in KF.split(X):
    for label, fold_index in (("Train index:", train_index),
                              ("Test index: ", test_index)):
        print(label, fold_index[:10], len(fold_index))
    print('')

Train index: [ 1  2  3  4  7  8 10 12 14 16] 337
Test index:  [ 0  5  6  9 11 13 15 20 21 24] 169

Train index: [ 0  1  2  3  5  6  8  9 11 13] 337
Test index:  [ 4  7 10 12 19 22 23 28 29 30] 169

Train index: [ 0  4  5  6  7  9 10 11 12 13] 338
Test index:  [ 1  2  3  8 14 16 17 18 25 27] 168



In [28]:
from sklearn.metrics import r2_score, mean_squared_error

# Manual 3-fold cross-validation of a plain linear regression:
# fit on each training fold, then record R^2 on the held-out fold.
scores = []
lr = LinearRegression()

for train_index, test_index in KF.split(X):
    # KFold.split yields positional row indices, so select with .iloc on
    # both X and Y. Plain Y[train_index] is a label lookup and only gives
    # the right rows while the index is the default 0..n-1 range.
    X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
    y_train, y_test = Y.iloc[train_index], Y.iloc[test_index]

    lr.fit(X_train, y_train)
    y_pred = lr.predict(X_test)

    score = r2_score(y_test.values, y_pred)
    scores.append(score)

scores

[0.6548764502968953, 0.7702392531467167, 0.6671723298202625]

In [32]:
# Repeat the manual cross-validation, standardizing the features first.
scores = []

lr = LinearRegression()
s = StandardScaler()

for train_index, test_index in KF.split(X):
    # KFold.split yields positional row indices, so select with .iloc on
    # both X and Y. Plain Y[train_index] is a label lookup and only gives
    # the right rows while the index is the default 0..n-1 range.
    X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
    y_train, y_test = Y.iloc[train_index], Y.iloc[test_index]

    # Fit the scaler on the training fold only and reuse its statistics on
    # the test fold, so the held-out rows do not influence preprocessing.
    X_train_s = s.fit_transform(X_train)
    X_test_s = s.transform(X_test)

    lr.fit(X_train_s, y_train)
    y_pred = lr.predict(X_test_s)

    score = r2_score(y_test.values, y_pred)
    scores.append(score)
scores

[0.6548764502968956, 0.7702392531467168, 0.6671723298202646]

(The scores are the same up to floating-point rounding, because for ordinary linear regression with no regularization, scaling the features doesn't change the model's predictions or its performance.)