In [2]:
%%javascript

// Flag read by load_local_theme(); a truthy value forces the hosted theme
// even when the notebook is served from the local machine.
window.load_remote_theme = false
// Base URL of the hosted theme.
var theme_url = "https://drostehk.github.io/ipynb-theme/";
// Base URL for shared assets. Not referenced anywhere in this cell --
// presumably consumed by the loaded theme script; TODO confirm.
var asset_url = 'https://raw.githubusercontent.com/tijptjik/DS_assets/master/';

// Returns true only when the page hostname is "localhost" or "127.0.0.1"
// AND load_remote_theme is falsy.
window.load_local_theme = function(){
    var hostname = document.location.hostname
    return ((hostname == "localhost" || hostname == '127.0.0.1') && !load_remote_theme)
}

// Local case: custom.js under /files/theme/ on the current origin.
// Otherwise: custom.js under theme_url.
var url = load_local_theme() ? document.location.origin + "/files/theme/custom.js" : theme_url + 'custom.js'

// Hand the chosen URL to jQuery's getScript.
$.getScript(url)

<IPython.core.display.Javascript object>

# Kaggle Competition

## 2. Modeling

In [6]:
from __future__ import unicode_literals, division

import IPython
import warnings
import requests
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt

from pylab import *
from IPython.display import HTML
from IPython.display import display as prnt

# Matplotlib in notebook
%matplotlib inline

# Notebook options: print floats without scientific notation and
# silence all warnings.
np.set_printoptions(suppress=True)
warnings.filterwarnings('ignore')

# Matplotlib style overrides (rc-parameter name -> value).
c = {
    'axes.labelsize': 17,
    'axes.titlesize': 16,
    'figure.figsize': [18, 8],
    'grid.linewidth': 1.6,
    'legend.fontsize': 17,
    'lines.linewidth': 2,
    'lines.markeredgewidth': 0.0,
    'lines.markersize': 11,
    'patch.linewidth': 0.5,
    'xtick.labelsize': 16,
    'xtick.major.pad': 20,
    'xtick.major.width': 2,
    'xtick.minor.width': 1,
    'ytick.labelsize': 16.0,
    'ytick.major.pad': 20,
    'ytick.major.width': 2,
    'ytick.minor.width': 1,
}

# Same overrides with a wider figure; `c` itself is left untouched.
wide_c = dict(c)
wide_c['figure.figsize'] = [20, 8]

#### Utility Functions

In [7]:
def table(df, replace_match="", replace_str=""):
    """Display ``df`` as an HTML table with striped / hover table classes.

    The opening ``<table>`` tag emitted by ``df.to_html()`` is swapped for one
    carrying the ``table table-striped table-hover`` classes, after which every
    occurrence of ``replace_match`` in the markup is replaced by
    ``replace_str``. The result is wrapped in ``HTML`` and displayed.

    Returns whatever ``IPython.display.display`` returns.
    """
    pandas_tag = '<table border="1" class="dataframe">'
    styled_tag = '<table class="table table-striped table-hover">'

    markup = df.to_html()
    markup = markup.replace(pandas_tag, styled_tag)
    markup = markup.replace(replace_match, replace_str)
    return IPython.display.display(HTML(markup))

### Load the Data

In [120]:
# Directory holding the competition CSV files (relative path).
DATA_DIR = '../../data/bikeshare/'
# Paths of the training and test CSVs inside DATA_DIR.
TRAIN_FILE = DATA_DIR + 'train.csv'
TEST_FILE = DATA_DIR + 'test.csv'
# Full training table, every column included.
df = pd.read_csv(TRAIN_FILE)

#### Feature / Target Split (X / y)

In [121]:
def get_train_data():
    """Read TRAIN_FILE and separate the inputs from the target.

    Returns:
        A ``(X, y)`` tuple: ``X`` is the first nine columns of the file,
        ``y`` is its last column.
    """
    frame = pd.read_csv(TRAIN_FILE)
    features = frame.iloc[:, :9]
    target = frame.iloc[:, -1]
    return features, target

### Scoring Method

In [10]:
from sklearn.metrics import make_scorer

# Testing framework: a single metric used to benchmark progress as we go.
# The evaluation metric is Root Mean Squared Logarithmic Error.
def rmsele(actual, pred):
    """Root mean squared logarithmic error between actuals and predictions.

    Computes ``sqrt(mean((log(pred + 1) - log(actual + 1)) ** 2))``.

    Args:
        actual: observed values; array-like (list, ndarray, Series) or scalar.
        pred: predicted values, same length as ``actual``.

    Returns:
        The error as a float. The result is NaN if any value is below -1
        (logarithm of a negative number) and infinite if a value equals -1
        while its counterpart does not.
    """
    # Coerce to plain float arrays so lists and scalars are accepted and two
    # pandas Series are compared by position rather than aligned on index.
    actual = np.asarray(actual, dtype=float)
    pred = np.asarray(pred, dtype=float)
    # log1p(x) == log(1 + x) but keeps precision when x is close to zero.
    squared_errors = (np.log1p(pred) - np.log1p(actual)) ** 2
    return np.sqrt(np.mean(squared_errors))

# Wrap rmsele with make_scorer so it can be passed as `scoring=` to
# cross_val_score and GridSearchCV. greater_is_better=False declares it a
# loss; the scores reported with this scorer further down are negative.
rmsele_scorer = make_scorer(rmsele, greater_is_better=False)

### Baseline Model

In [127]:
from sklearn.cross_validation import KFold, cross_val_score

# Naive baseline: predict the overall mean of `count` for every row and
# score that constant prediction against the observed counts.
expected_value = df['count'].mean()
yhat = np.repeat(expected_value, len(df))

rmsele(df['count'].values, yhat)

1.5691983019475926

In [114]:
from sklearn.linear_model import Ridge
from sklearn.cross_validation import KFold, cross_val_score

# Train a basic model to check that the scoring and cross-validation
# framework works. Ridge regression (a form of linear regression) is used.
X, y = get_train_data()
# Restrict the inputs to temp, atemp and humidity
Xhat = X[['temp', 'atemp', 'humidity']]
ridge_estimator = Ridge(normalize=True)
scores = cross_val_score(
    ridge_estimator, Xhat, y,
    cv=5, scoring=rmsele_scorer, verbose=1
)


[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.0s finished


In [115]:
# Fit a default Ridge on temp, atemp and humidity, then score it on the
# same rows it was fitted on.
X, y = get_train_data()
Xhat = X[['temp', 'atemp', 'humidity']]
ridge_estimator = Ridge()
ridge_estimator.fit(Xhat, y)
ridge_estimator.score(Xhat, y)


-0.9917841659309734

### Cross-Validation

In [98]:
# cross_val_score with the notebook's standard settings pre-filled
def perform_cv(estimator, X, y):
    """Cross-validate ``estimator`` on ``(X, y)`` with the shared settings.

    Uses ``rmsele_scorer`` for scoring, 5 folds and ``verbose=1``.

    Returns:
        The value returned by ``cross_val_score``.
    """
    cv_options = dict(scoring=rmsele_scorer, cv=5, verbose=1)
    return cross_val_score(estimator, X, y, **cv_options)

In [113]:
# Rows of X for which the fitted ridge model predicts a value below zero.
X[ridge_estimator.predict(Xhat) < 0]

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed
1603,2011-07-21 19:00:00,3,0,1,1,35.26,46.21,67,19.0012
1604,2011-07-21 20:00:00,3,0,1,1,34.44,44.695,71,15.0013
1605,2011-07-21 21:00:00,3,0,1,1,33.62,44.695,75,15.0013


### Grid Search

In [99]:
from sklearn.grid_search import GridSearchCV

# Simple grid search over the estimator's `alpha`:
# 10 log-spaced candidates between 10**0 and 10**2.
parameters = {'alpha': np.logspace(0, 2, 10)}
grid = GridSearchCV(
    ridge_estimator, parameters,
    cv=5, scoring=rmsele_scorer
).fit(Xhat, y)
grid.grid_scores_

[mean: -1.40707, std: 0.13649, params: {u'alpha': 1.0},
 mean: -1.40707, std: 0.13649, params: {u'alpha': 1.6681005372000588},
 mean: -1.40707, std: 0.13649, params: {u'alpha': 2.7825594022071245},
 mean: -1.40707, std: 0.13649, params: {u'alpha': 4.6415888336127784},
 mean: -1.40707, std: 0.13649, params: {u'alpha': 7.7426368268112693},
 mean: -1.40706, std: 0.13649, params: {u'alpha': 12.915496650148841},
 mean: -1.40705, std: 0.13650, params: {u'alpha': 21.544346900318832},
 mean: -1.40703, std: 0.13650, params: {u'alpha': 35.938136638046259},
 mean: -1.40701, std: 0.13651, params: {u'alpha': 59.948425031894089},
 mean: -1.40697, std: 0.13652, params: {u'alpha': 100.0}]

In [100]:
# GridSearchCV with the notebook's standard settings pre-filled
def perform_grid_search(estimator, parameters, X, y):
    """Run a grid search for ``estimator`` over ``parameters`` on ``(X, y)``.

    Uses ``rmsele_scorer`` for scoring and 5 folds.

    Returns:
        The ``GridSearchCV`` object after ``fit(X, y)`` has been called.
    """
    search_options = dict(scoring=rmsele_scorer, cv=5)
    return GridSearchCV(estimator, parameters, **search_options).fit(X, y)

## Transform Data

In [128]:
from sklearn.preprocessing import StandardScaler

In [130]:
# Fit a StandardScaler on the temp / atemp / humidity columns so that the
# statistics it learned can be inspected in the next cell.
Xhat = X[['temp', 'atemp', 'humidity']]
normalize = StandardScaler()
# Last expression of the cell: the fitted scaler is what gets displayed.
normalize.fit(Xhat)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [131]:
# Per-column statistics learned by the fitted scaler.
# print(...) with a single argument prints the same text on Python 2 and
# Python 3, whereas the bare `print x` statement is Python 2 only.
print(normalize.std_)
print(normalize.mean_)
# Manual standardisation using pandas' own mean / std. This value is computed
# but not displayed, because it is not the last expression of the cell.
# NOTE(review): pandas' .std() defaults to the sample estimate (ddof=1), so it
# may differ slightly from the scaler's std_ -- confirm before comparing.
(Xhat - Xhat.mean()) / Xhat.std()
# Same standardisation using the scaler's fitted parameters; this is the
# value the cell displays.
(Xhat - normalize.mean_) / normalize.std_

[  8.05896237   8.78206495  19.29190522]
[ 20.62060681  24.01286462  64.12521177]


Unnamed: 0,temp,atemp,humidity
0,-1.235966,-1.440193,-0.421172
1,-1.235966,-1.181711,-0.421172
2,-1.235966,-1.181711,-0.421172
3,-1.235966,-1.267682,-0.421172
4,-1.235966,-1.267682,-0.421172
5,-1.337716,-1.440193,-0.213831
6,-1.439467,-1.526733,-0.213831
7,-1.439467,-1.526733,-0.473007
8,-1.439467,-1.526733,-0.473007
9,-1.337716,-1.440193,-0.628513


In [132]:
from sklearn.preprocessing import StandardScaler

# On to the actual transformation of the inputs.
# Not every estimator accepts a "normalize" keyword, so scaling is pulled
# out into its own transformer, which gives better control over it.
ridge_estimator = Ridge()
normalize = StandardScaler()
Xhat = normalize.fit_transform(X[['temp', 'atemp', 'humidity']])
scores = perform_cv(ridge_estimator, Xhat, y)
scores

[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.0s finished


array([-0.79759964, -1.1612211 , -0.95312686, -0.9597229 , -1.13519441])

In [133]:
from sklearn.pipeline import Pipeline, FeatureUnion

# This is the beginning of a multi-step pipeline.
# A scikit-learn Pipeline wraps the steps in one object, so fit / predict is
# run once instead of feeding data by hand from one transformer to the next.
Xhat = X[['temp', 'atemp', 'humidity']]
normalize = StandardScaler()
ridge_estimator = Ridge()
pipeline = Pipeline(steps=[
    ('normalize', normalize),
    ('ridge', ridge_estimator),
])
scores = perform_cv(pipeline, Xhat, y)
scores

[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.0s finished


array([-0.79748819, -1.16231977, -0.95315771, -0.95972978, -1.13522833])

In [134]:
# Grid search can also run over every step of a pipeline at once, so the
# steps do not have to be tuned one by one. Step parameters are addressed as
# <step name>__<parameter>, e.g. ridge__alpha or normalize__with_mean.
Xhat = X[['temp', 'atemp', 'humidity']]
normalize = StandardScaler()
ridge_estimator = Ridge()
pipeline = Pipeline(steps=[
    ('normalize', normalize),
    ('ridge', ridge_estimator),
])
# 10 log-spaced alpha candidates between 10**0 and 10**3.
parameters = {'ridge__alpha': np.logspace(0, 3, 10)}
grid = GridSearchCV(
    pipeline, parameters,
    cv=5, scoring=rmsele_scorer
).fit(Xhat, y)
grid.grid_scores_

[mean: -1.00157, std: 0.13375, params: {u'ridge__alpha': 1.0},
 mean: -1.00007, std: 0.13095, params: {u'ridge__alpha': 2.1544346900318838},
 mean: -1.00438, std: 0.13395, params: {u'ridge__alpha': 4.6415888336127784},
 mean: -1.00412, std: 0.12983, params: {u'ridge__alpha': 10.0},
 mean: -1.00532, std: 0.12538, params: {u'ridge__alpha': 21.544346900318832},
 mean: -1.00748, std: 0.12074, params: {u'ridge__alpha': 46.415888336127772},
 mean: -1.01160, std: 0.11830, params: {u'ridge__alpha': 100.0},
 mean: -1.01541, std: 0.11720, params: {u'ridge__alpha': 215.44346900318823},
 mean: -1.01812, std: 0.11709, params: {u'ridge__alpha': 464.15888336127773},
 mean: -1.02011, std: 0.11788, params: {u'ridge__alpha': 1000.0}]

## Submission

In [118]:
def make_submission(df_test, prediction, filename='submission.csv'):
    """Write a two-column ``datetime,count`` submission CSV.

    Args:
        df_test: DataFrame with ``datetime`` either as a column or as its
            index (the index is reset before the column is read).
        prediction: one predicted value per row of ``df_test``; any
            array-like (list, ndarray, Series). Values are matched to rows
            by position, so a Series index is ignored.
        filename: path of the CSV file to create or overwrite.

    Raises:
        ValueError: if the number of predictions differs from the number of
            rows in ``df_test``. Nothing is written in that case.
    """
    timestamps = df_test.reset_index()['datetime']
    # Positional values only: combining a Series with a differently indexed
    # prediction would align on index and produce missing rows.
    counts = np.asarray(prediction).ravel()
    if len(counts) != len(timestamps):
        raise ValueError(
            'prediction has %d values but df_test has %d rows'
            % (len(counts), len(timestamps))
        )

    submission = pd.DataFrame(
        {'datetime': timestamps.values, 'count': counts},
        columns=['datetime', 'count'],
    )
    # index=False keeps the file to exactly the header and the two columns.
    submission.to_csv(filename, index=False)

# make_submission(df_test, prediction, 'submission1.csv')