In [1]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.feature_selection import SelectKBest, f_regression
import numpy as np
import pandas as pd
import math

In the following cells, you will load in the Brooklyn Bridge pedestrian traffic dataset, which you have worked with before for exploratory data analysis. 

You will train a model to predict pedestrian traffic based on the following features: temperature, precipitation, hour, whether or not it is a weekend, and whether or not it is a holiday or other special event.

Your feature data is loaded into `X` and the target variable is loaded into `y`.

In [2]:
# Load the Brooklyn Bridge pedestrian counts (one row per `hour_beginning` timestamp).
df = pd.read_excel('brooklyn-bridge-automated-counts.xlsx')

# Calendar features derived from the timestamp of each record.
df['hour'] = df['hour_beginning'].dt.hour
df['date'] = df['hour_beginning'].dt.date
df['day_name'] = df['hour_beginning'].dt.day_name()
df['day_no'] = df['hour_beginning'].dt.dayofweek  # Monday=0 ... Sunday=6

# Fill gaps in the weather columns with the most recent earlier observation.
# `.ffill()` is used instead of `fillna(method="ffill")`, whose `method`
# argument is deprecated in recent pandas releases.
weather_cols = ['temperature', 'precipitation', 'weather_summary']
df[weather_cols] = df[weather_cols].ffill()

# Binary indicators: Saturday/Sunday, and any row with a non-null `events` entry.
df['is_weekend'] = df['day_no'].isin([5, 6]).astype('int')
df['is_holiday'] = df['events'].notnull().astype('int')
df

Unnamed: 0,hour_beginning,location,Pedestrians,Towards Manhattan,Towards Brooklyn,weather_summary,temperature,precipitation,lat,long,events,hour,date,day_name,day_no,is_weekend,is_holiday
0,2017-10-01 00:00:00,Brooklyn Bridge,44,30,14,clear-night,52.0,0.0001,40.708164,-73.999509,,0,2017-10-01,Sunday,6,1,0
1,2017-10-01 01:00:00,Brooklyn Bridge,30,17,13,partly-cloudy-night,53.0,0.0002,40.708164,-73.999509,,1,2017-10-01,Sunday,6,1,0
2,2017-10-01 02:00:00,Brooklyn Bridge,25,13,12,partly-cloudy-night,52.0,0.0000,40.708164,-73.999509,,2,2017-10-01,Sunday,6,1,0
3,2017-10-01 03:00:00,Brooklyn Bridge,20,11,9,partly-cloudy-night,51.0,0.0000,40.708164,-73.999509,,3,2017-10-01,Sunday,6,1,0
4,2017-10-01 04:00:00,Brooklyn Bridge,18,10,8,partly-cloudy-night,51.0,0.0000,40.708164,-73.999509,,4,2017-10-01,Sunday,6,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7291,2018-07-31 19:00:00,Brooklyn Bridge,1913,1009,904,partly-cloudy-day,76.0,0.0000,40.708164,-73.999509,,19,2018-07-31,Tuesday,1,0,0
7292,2018-07-31 20:00:00,Brooklyn Bridge,1302,591,711,partly-cloudy-day,75.0,0.0000,40.708164,-73.999509,,20,2018-07-31,Tuesday,1,0,0
7293,2018-07-31 21:00:00,Brooklyn Bridge,880,331,549,partly-cloudy-night,74.0,0.0000,40.708164,-73.999509,,21,2018-07-31,Tuesday,1,0,0
7294,2018-07-31 22:00:00,Brooklyn Bridge,463,206,257,partly-cloudy-night,74.0,0.0000,40.708164,-73.999509,,22,2018-07-31,Tuesday,1,0,0


In [3]:
# Predictor columns, in the order that the generated names x0..x4 refer to,
# and the column holding the regression target.
feature_columns = ['temperature', 'precipitation', 'hour', 'is_weekend', 'is_holiday']
target_column = 'Pedestrians'

# Copy both out of the DataFrame as plain NumPy arrays.
X = np.array(df.loc[:, feature_columns])
y = np.array(df.loc[:, target_column])
X

array([[5.2e+01, 1.0e-04, 0.0e+00, 1.0e+00, 0.0e+00],
       [5.3e+01, 2.0e-04, 1.0e+00, 1.0e+00, 0.0e+00],
       [5.2e+01, 0.0e+00, 2.0e+00, 1.0e+00, 0.0e+00],
       ...,
       [7.4e+01, 0.0e+00, 2.1e+01, 0.0e+00, 0.0e+00],
       [7.4e+01, 0.0e+00, 2.2e+01, 0.0e+00, 0.0e+00],
       [7.3e+01, 0.0e+00, 2.3e+01, 0.0e+00, 0.0e+00]])

You have reason to believe that there may be interaction effects or non-linear effects of these features on the target variable. For example, if it is cold *and* rainy, that may have more of a deterrent effect on pedestrians than just the sum of the effects of cold and rainy individually.

So, before training a model, you will use the `sklearn` `PolynomialFeatures` transformer to generate polynomial and interaction features. According to its documentation, this transformer will:

> Generate a new feature matrix consisting of all polynomial combinations of the features with degree less than or equal to the specified degree. For example, if an input sample is two dimensional and of the form [a, b], the degree-2 polynomial features are [1, a, b, a^2, ab, b^2].


For example, this code will generate the degree-2 polynomial features for the Brooklyn Bridge data in `X`:





In [4]:
# Demonstration: expand X into every monomial of total degree <= 2
# (the constant bias column is included as the first output column).
poly = PolynomialFeatures(degree=2)
poly.fit(X)
X_trans = poly.transform(X)
X_trans.shape

(7296, 21)

where the new features are:

In [5]:
# `get_feature_names()` was removed in scikit-learn 1.2. Prefer its replacement,
# `get_feature_names_out()`, and fall back only on older releases that lack it.
if hasattr(poly, 'get_feature_names_out'):
    feature_names = poly.get_feature_names_out()
else:
    feature_names = poly.get_feature_names()

# Print each output column index next to the monomial it holds.
for i, f in enumerate(feature_names):
    print(i, f)

0 1
1 x0
2 x1
3 x2
4 x3
5 x4
6 x0^2
7 x0 x1
8 x0 x2
9 x0 x3
10 x0 x4
11 x1^2
12 x1 x2
13 x1 x3
14 x1 x4
15 x2^2
16 x2 x3
17 x2 x4
18 x3^2
19 x3 x4
20 x4^2


You are interested in training a linear regression on this data, to predict the number of pedestrians, but you don't know what degree of polynomial to use. 

You decide to evaluate linear models on transformed versions of `X` up to degree 5 (including degree 5), to see which has the best performance in a linear regression.

First, you use `PolynomialFeatures` to create a transformed data set with polynomial features up to and including degree 5. 

In [6]:
# Fit the degree-5 expansion on X, then apply it to obtain the full feature matrix
# containing every monomial of total degree <= 5.
poly = PolynomialFeatures(degree=5).fit(X)
X_trans = poly.transform(X)

In [7]:
# Display the (rows, columns) shape of the degree-5 feature matrix.
X_trans.shape

(7296, 252)

In [8]:
# Names of the degree-5 output columns, kept as a plain list.
# `get_feature_names()` was removed in scikit-learn 1.2; use `get_feature_names_out()`
# when it exists and fall back to the old method on releases that predate it.
if hasattr(poly, 'get_feature_names_out'):
    X_trans_names = list(poly.get_feature_names_out())
else:
    X_trans_names = poly.get_feature_names()

Then, you set aside 30% of `X_trans` for evaluating the final model at the end.  Save the result in `X_tr`, `y_tr`, `X_ts`, and `y_ts`. 

You use `sklearn`'s `train_test_split` without shuffling (because of the temporal structure of the data).

In [9]:
# Chronological hold-out: with shuffling disabled the row order is preserved,
# so the last 30% of the rows form the test set.
X_tr, X_ts, y_tr, y_ts = train_test_split(
    X_trans,
    y,
    test_size=0.3,
    shuffle=False,
)

In [10]:
# Display the (rows, columns) shape of the training portion of the split.
X_tr.shape

(5107, 252)

Now, you will use 10-fold cross validation (with `sklearn`'s `KFold`) to evaluate each `degree` from 0 to 5 (including 5) in an `sklearn` `LinearRegression` model, using `r2_score` for the metric.  

In your cross validation, you will save the validation R2 for each degree in an array called `r2_val`, and save the training R2 in an array called `r2_train`.





In [11]:
# Number of candidate degrees (0 through 5) and number of cross-validation folds.
nd, nfold = 6, 10

# One row per degree, one column per fold; filled in by the cross-validation loop.
r2_train = np.zeros(shape=(nd, nfold))
r2_val = np.zeros_like(r2_train)

In [12]:
#grade (write your code in this cell and DO NOT DELETE THIS LINE)

# note: only the code in this cell and the code provided for you will be 
# passed to the autograder. If you define any additional variables
# that are required to run this cell, make sure they are defined in this cell!

kf = KFold(n_splits=nfold, shuffle=False)

# PolynomialFeatures orders its output columns by total degree, so the features of
# degree <= d are exactly the first C(n + d, d) columns of X_tr, where n is the number
# of original features. Derive those counts instead of hard-coding them, using
#   C(n + d, d) = C(n + d - 1, d - 1) * (n + d) / d      (the division is always exact).
# For n = 5 and nd = 6 this yields [1, 6, 21, 56, 126, 252].
n_features = X.shape[1]
num_poly = [1]
for deg in range(1, nd):
    num_poly.append(num_poly[-1] * (n_features + deg) // deg)

# A slice past the last column would silently return fewer features than intended,
# making the higher degrees indistinguishable; fail loudly instead.
if num_poly[-1] > X_tr.shape[1]:
    raise ValueError(
        "X_tr has %d columns but degree %d needs %d; was X_tr built with a lower degree?"
        % (X_tr.shape[1], nd - 1, num_poly[-1])
    )

for isplit, (idx_tr, idx_val) in enumerate(kf.split(X_tr)):

    # The targets depend only on the fold, so slice them once per fold.
    y_train_fold = y_tr[idx_tr]
    y_val_fold = y_tr[idx_val]

    for degree in range(nd):

        # Keep only the columns belonging to polynomial terms of degree <= `degree`.
        n_cols = num_poly[degree]
        X_train_fold = X_tr[idx_tr, :n_cols]
        X_val_fold = X_tr[idx_val, :n_cols]

        # Train the model
        reg_dtest = LinearRegression().fit(X_train_fold, y_train_fold)

        # Compute R^2 for training and validation data
        r2_train[degree, isplit] = r2_score(y_train_fold, reg_dtest.predict(X_train_fold))
        r2_val[degree, isplit] = r2_score(y_val_fold, reg_dtest.predict(X_val_fold))

Then, create an array `r2_mean` with the mean R2 value for each degree, across K folds. 

In [13]:
#grade (write your code in this cell and DO NOT DELETE THIS LINE)
# Average the validation R2 over the folds (columns), leaving one value per degree (rows).
r2_mean = r2_val.mean(axis=1)

In [14]:
# Display the mean validation R2 per degree (position in the array = degree).
r2_mean

array([-0.08015254,  0.12073071,  0.42391391,  0.3977008 , -6.88709641,
       -6.92755144])

Finally, select the model with the best validation R2. Save the model order in `d_opt`.

In [15]:
#grade (write your code in this cell and DO NOT DELETE THIS LINE)
# Position of the largest mean validation R2; positions correspond to degrees 0..nd-1.
d_opt = r2_mean.argmax()

In [16]:
# Display the selected polynomial degree.
d_opt

2