In [2]:
%%javascript

window.load_remote_theme = false
var theme_url = "https://drostehk.github.io/ipynb-theme/";
var asset_url = 'https://raw.githubusercontent.com/tijptjik/DS_assets/master/';

window.load_local_theme = function(){
    var hostname = document.location.hostname
    return ((hostname == "localhost" || hostname == '127.0.0.1') && !load_remote_theme)
}

var url = load_local_theme() ? document.location.origin + "/files/theme/custom.js" : theme_url + 'custom.js'

$.getScript(url)

<IPython.core.display.Javascript object>

# Kaggle Competition

## 2. Modeling

In [3]:
from __future__ import unicode_literals, division

import IPython
import warnings
import requests
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt

from pylab import *
from IPython.display import HTML
from IPython.display import display as prnt

# Matplotlib in notebook
%matplotlib inline

# Notebook Options
warnings.filterwarnings('ignore')
np.set_printoptions(suppress=True)

# Matplotlib Styles
c = {'axes.labelsize': 17,
'axes.titlesize': 16,
'figure.figsize': [18, 8],
'grid.linewidth': 1.6,
'legend.fontsize': 17,
'lines.linewidth': 2,
'lines.markeredgewidth': 0.0,
'lines.markersize': 11,
'patch.linewidth': 0.5,
'xtick.labelsize': 16,
'xtick.major.pad': 20,
'xtick.major.width': 2,
'xtick.minor.width': 1,
'ytick.labelsize': 16.0,
'ytick.major.pad': 20,
'ytick.major.width': 2,
'ytick.minor.width': 1 }
wide_c = dict(c, **{'figure.figsize':[20,8]})

#### Utility Functions

In [4]:
def table(df,replace_match="",replace_str=""):
    return IPython.display.display(HTML(df.to_html().replace('<table border="1" class="dataframe">','<table class="table table-striped table-hover">').replace(replace_match,replace_str)))

### Load the Data

In [5]:
DATA_DIR = '../../data/bikeshare/'
TRAIN_FILE = DATA_DIR + 'train.csv'
TEST_FILE = DATA_DIR + 'test.csv'
df = pd.read_csv(TRAIN_FILE)

#### Training / Test Split

In [6]:
def get_train_data():
    # Loads the training data, but splits the y from the X
    dfx = pd.read_csv(TRAIN_FILE)
    return dfx.iloc[:, 0:9], dfx.iloc[:,-1]

### Scoring Method

In [7]:
from sklearn.metrics import make_scorer

# First, we should set up some sort of testing framework, so that we can benchmark our progress as we go
# The evaluation metric is Root mean squared logarithmic error.
def rmsele(actual, pred):
    """
    Given a column of predictions and a column of actuals, calculate the RMSELE
    """
    squared_errors = (np.log(pred + 1) - np.log(actual + 1)) ** 2
    mean_squared = np.sum(squared_errors) / len(squared_errors)
    return np.sqrt(mean_squared)

# This helper function will make a callable that we can use in cross_val_score
rmsele_scorer = make_scorer(rmsele, greater_is_better=False)

### Baseline Model

In [8]:
from sklearn.cross_validation import KFold, cross_val_score

expected_value = df['count'].mean()
yhat = np.array([expected_value] * len(df['count']))

rmsele(df['count'].values, yhat)

1.5691983019475926

In [9]:
from sklearn.linear_model import Ridge
from sklearn.cross_validation import KFold, cross_val_score

# Lets just train a basic model so that we can test if our scoring and
# cross validation framework works well. We'll use a Ridge regression,
# which is a form of linear regression
X, y = get_train_data()
# Subset the X to just use temp, atemp, and workingday
Xhat = X[['temp', 'atemp', 'humidity']]
ridge_estimator = Ridge(normalize=True)
scores = cross_val_score(ridge_estimator, Xhat, y, scoring=rmsele_scorer, cv=5, verbose=1)


[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.0s finished


In [10]:
X, y = get_train_data()
# Subset the X to just use temp, atemp, and workingday
Xhat = X[['temp', 'atemp', 'humidity']]
ridge_estimator = Ridge()
ridge_estimator.fit(Xhat,y).score(Xhat,y)

0.24286060181991254

### CrossValidation

In [11]:
# Fill in some of the parameters on cross_val_score
def perform_cv(estimator, X, y):
    return cross_val_score(estimator, X, y, scoring=rmsele_scorer, cv=5, verbose=1)

In [12]:
X[ridge_estimator.predict(Xhat) < 0]

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed
169,2011-01-08 08:00:00,1,0,0,3,6.56,9.09,93,7.0015
170,2011-01-08 09:00:00,1,0,0,3,6.56,9.09,93,7.0015
247,2011-01-11 16:00:00,1,0,1,2,6.56,7.575,86,15.0013
248,2011-01-11 17:00:00,1,0,1,2,6.56,7.575,86,15.0013
249,2011-01-11 18:00:00,1,0,1,3,6.56,9.09,93,7.0015
250,2011-01-11 19:00:00,1,0,1,3,6.56,11.365,93,0.0
251,2011-01-11 20:00:00,1,0,1,3,6.56,7.575,93,12.998
253,2011-01-11 22:00:00,1,0,1,3,6.56,9.09,93,7.0015
254,2011-01-11 23:00:00,1,0,1,3,6.56,9.85,93,6.0032
257,2011-01-12 02:00:00,1,0,1,1,5.74,7.575,86,8.9981


### Grid Search

In [13]:
from sklearn.grid_search import GridSearchCV

# Try a simple grid search with the estimator
parameters = {'alpha': np.logspace(0, 2, 10)}
grid = GridSearchCV(ridge_estimator, parameters, scoring=rmsele_scorer, cv=5)
grid.fit(Xhat, y)
grid.grid_scores_

[mean: -1.40707, std: 0.13649, params: {u'alpha': 1.0},
 mean: -1.40707, std: 0.13649, params: {u'alpha': 1.6681005372000588},
 mean: -1.40707, std: 0.13649, params: {u'alpha': 2.7825594022071245},
 mean: -1.40707, std: 0.13649, params: {u'alpha': 4.6415888336127784},
 mean: -1.40707, std: 0.13649, params: {u'alpha': 7.7426368268112693},
 mean: -1.40706, std: 0.13649, params: {u'alpha': 12.915496650148841},
 mean: -1.40705, std: 0.13650, params: {u'alpha': 21.544346900318832},
 mean: -1.40703, std: 0.13650, params: {u'alpha': 35.938136638046259},
 mean: -1.40701, std: 0.13651, params: {u'alpha': 59.948425031894089},
 mean: -1.40697, std: 0.13652, params: {u'alpha': 100.0}]

In [14]:
# And for grid_search
def perform_grid_search(estimator, parameters, X, y):
    grid_search = GridSearchCV(estimator, parameters, scoring=rmsele_scorer, cv=5)
    grid_search.fit(X, y)
    return grid_search

## Transform Data

In [15]:
from sklearn.preprocessing import StandardScaler

In [16]:
Xhat = X[['temp', 'atemp', 'humidity']]
normalize = StandardScaler()
normalize.fit(Xhat)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [17]:
print normalize.std_
print normalize.mean_
(Xhat - Xhat.mean()) / Xhat.std()
((Xhat - normalize.mean_) / normalize.std_).head()

[  7.79123196   8.47421137  19.24414932]
[ 20.23085982  23.65508405  61.88645967]


Unnamed: 0,temp,atemp,humidity
0,-1.333661,-1.092737,0.993213
1,-1.438907,-1.182421,0.941249
2,-1.438907,-1.182421,0.941249
3,-1.333661,-1.092737,0.68143
4,-1.333661,-1.092737,0.68143


In [18]:
from sklearn.preprocessing import StandardScaler

# Now lets move on to the actual transformation of the inputs
# First, not every estimator we'll use will have the "normalize" keyword
# So let's break it out into a transformer, so that we have better control over it
ridge_estimator = Ridge()

normalize = StandardScaler()
Xhat = X[['temp', 'atemp', 'humidity']]
Xhat = normalize.fit_transform(Xhat)

scores = perform_cv(ridge_estimator, Xhat, y)
scores

[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.0s finished


array([-1.63396695, -1.47964703, -1.34164532, -1.33777304, -1.241764  ])

### Pipeline

In [19]:
from sklearn.pipeline import Pipeline, FeatureUnion

# Now we have the beginnings of a multi-step pipeline
# Scikit lets you wrap each of these steps into a Pipeline object,
# so you just have to run fit / predict once
# instead of manually feeding the data from one transformer to the next
normalize = StandardScaler()
ridge_estimator = Ridge()
pipeline = Pipeline([('normalize', normalize), ('ridge', ridge_estimator)])
Xhat = X[['temp', 'atemp', 'humidity']]
scores = perform_cv(pipeline, Xhat, y)
scores

[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.1s finished


array([-1.63397871, -1.4796492 , -1.34164469, -1.33777294, -1.24175033])

In [20]:
# Additionally, you can perform grid search over all of the steps of the pipeline
# So you don't have to tune each step manually
# The pipeline exposes the underlying steps' parameters like so:
# ridge__alpha, and normalize__with_mean
ridge_estimator = Ridge()

normalize = StandardScaler()
parameters = {'ridge__alpha': np.logspace(0, 3, 10)}
Xhat = X[['temp', 'atemp', 'humidity']]
pipeline = Pipeline([('normalize', normalize), ('ridge', ridge_estimator)])

grid = GridSearchCV(pipeline, parameters, scoring=rmsele_scorer, cv=5)
grid.fit(Xhat, y)
grid.grid_scores_

[mean: -1.40698, std: 0.13651, params: {u'ridge__alpha': 1.0},
 mean: -1.40688, std: 0.13653, params: {u'ridge__alpha': 2.1544346900318838},
 mean: -1.40671, std: 0.13658, params: {u'ridge__alpha': 4.6415888336127784},
 mean: -1.40642, std: 0.13668, params: {u'ridge__alpha': 10.0},
 mean: -1.40603, std: 0.13688, params: {u'ridge__alpha': 21.544346900318832},
 mean: -1.40652, std: 0.13871, params: {u'ridge__alpha': 46.415888336127772},
 mean: -1.41912, std: 0.13725, params: {u'ridge__alpha': 100.0},
 mean: -1.41035, std: 0.13756, params: {u'ridge__alpha': 215.44346900318823},
 mean: -1.40951, std: 0.13878, params: {u'ridge__alpha': 464.15888336127773},
 mean: -1.42265, std: 0.13961, params: {u'ridge__alpha': 1000.0}]

### Feature Engineering

#### Encoding Dummy Variables

In [21]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelBinarizer
# Lets move on to including more features in our model
# We probably want to use a factor like Season in our model, but it's
# a categorical feature, and we'll need to convert it to a series of booleans
one_hot = OneHotEncoder()
season = one_hot.fit_transform(X['season'].reshape(X.shape[0], 1)).toarray()

In [22]:
one_hot.fit_transform(X['season'].reshape(X.shape[0], 1)).toarray()

array([[ 1.,  0.,  0.,  0.],
       [ 1.,  0.,  0.,  0.],
       [ 1.,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0.,  1.],
       [ 0.,  0.,  0.,  1.],
       [ 0.,  0.,  0.,  1.]])

In [23]:
# We then have to join this with the other variables
normalize = StandardScaler()
ridge_estimator = Ridge()
pipeline = Pipeline([('normalize', normalize), ('ridge', ridge_estimator)])
Xhat = np.hstack([X[['temp', 'atemp', 'humidity']], season])
scores = perform_cv(pipeline, Xhat, y)
scores

[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.0s finished


array([-1.48713408, -1.44375715, -1.42074189, -1.27959987, -1.2144014 ])

In [29]:
from sklearn.base import BaseEstimator, TransformerMixin

# Actually there's a faster way of doing this with the argument 'categorical_features'
class ToArray(BaseEstimator, TransformerMixin):
    # We need this because OneHotEncoder returns a sparse array, and normalize requires a non-sparse array
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        return X.toarray()
        
Xhat = X[['season', 'weather', 'temp', 'atemp', 'humidity']]
# I think it needs to be 5 here, because it assumes that '0' is a possible value for an int datatype
# Should probably specify the data types in read_csv
one_hot = OneHotEncoder(n_values=[5, 5], categorical_features=[0, 1])
desparse = ToArray()
normalize = StandardScaler()
ridge_estimator = Ridge()
pipeline = Pipeline([('onehot', one_hot), ('desparse', desparse), ('normalize', normalize), ('ridge', ridge_estimator)])
scores = perform_cv(pipeline, Xhat, y)
scores

[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.1s finished


array([-1.49094937, -1.44178361, -1.41395979, -1.27575541, -1.21552556])

#### Selective Normalisation

In [31]:
# OK, so now we've got a pipeline that does one-hot encoding of two categorical variables
# and then normalizes the variables
# But actually we're not supposed to normalize the the dummy variables.
# So we need some way of only normalizing non-dummy variables

# Oops, actually the CV splitting converts the Pandas DF to an array, so we can't rely
# on the normalize having the proper column names
class SelectiveNormalize(StandardScaler):
    def __init__(self, cols, copy=True, with_mean=True, with_std=True):
        self.cols = cols
        super(SelectiveNormalize, self).__init__(copy, with_mean, with_std)
    
    def fit(self, X, y=None):
        subset = X[:, self.cols]
        return super(SelectiveNormalize, self).fit(subset, y)
        
    def transform(self, X):
        subset = X[:, self.cols]
        normalized = super(SelectiveNormalize, self).transform(subset)
        others = [col for col in range(X.shape[1]) if col not in self.cols]
        res = np.hstack([normalized, X[:, others]])
        return res

Xhat = X[['season', 'weather', 'temp', 'atemp', 'humidity']]
one_hot = OneHotEncoder(n_values=[5, 5], categorical_features=[3, 4])
normalize = SelectiveNormalize([2, 3, 4])
desparse = ToArray()
ridge_estimator = Ridge()
pipeline = Pipeline([('normalize', normalize), ('onehot', one_hot), ('desparse', desparse), ('ridge', ridge_estimator)])
scores = perform_cv(pipeline, Xhat, y)
scores

TypeError: unhashable type

#### Datetime

In [25]:
from sklearn.ensemble import RandomForestRegressor

# Lets try tackling the date column now.  The time of day is probably really important
# So we need some way of extracting the hour
# We'll use a FeatureUnion to do this, to demonstrate the functionality
def get_train_data():
    # Loads the training data, but splits the y from the X
    df = pd.read_csv(TRAIN_FILE, parse_dates=['datetime'])
    return df.iloc[:, 0:9], df.iloc[:,-1]


class SelectColumns(BaseEstimator, TransformerMixin):
    """
    Passes on a subset of columns from an input ndarray
    """
    def __init__(self, cols):
        self.cols = cols
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        return X[:, self.cols]
    

class ExtractHour(BaseEstimator, TransformerMixin):
    """
    Extracts hour from a datetime series
    """
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        res = np.zeros(X.shape)
        for xx in xrange(X.shape[0]):
            res[xx] = X[xx, 0].hour
        return res.reshape(res.shape[0], 1)
    

class CastType(BaseEstimator, TransformerMixin):
    def __init__(self, cast_to):
        self.cast_to = cast_to
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        return X.astype(self.cast_to)

X, y = get_train_data()
# Reminder of the columns:
# ['datetime', 'season', 'holiday', 'workingday', 'weather', 'temp', 'atemp', 'humidity', 'windspeed']
select_date = SelectColumns([0])
select_others = SelectColumns(range(1, 9))
cast_float = CastType(np.float64)
one_hot = OneHotEncoder(n_values=[5, 5], categorical_features=[0, 3])
get_hour = ExtractHour()
normalize = SelectiveNormalize(range(2, 8))
desparse = ToArray()
ridge_estimator = RandomForestRegressor(n_estimators=200)

hour_feature = Pipeline([('select_date', select_date), ('get_hour', get_hour)])
other_features = Pipeline([('select_others', select_others), ('cast_float', cast_float), ('onehot', one_hot), ('desparse', desparse)])
join_features = FeatureUnion([('hour', hour_feature), ('others', other_features)])
predict = Pipeline([('featurize', join_features), ('estimator', ridge_estimator)])
scores = perform_cv(predict, X, y)
scores

NameError: name 'SelectiveNormalize' is not defined

## Submission

In [118]:
def make_submission(df_test, prediction, filename='submission.csv'):
    with open(filename, 'w') as f:
        f.write('datetime,count\n')
        submission_strings = df_test.reset_index()['datetime'] + ',' + prediction.astype(str)
        for row in submission_strings:
            f.write(row + '\n')

# make_submission(df_test, prediction, 'submission1.csv')