# Imports

In [19]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [34]:
from sklearn import linear_model
from sklearn import cross_validation
from sklearn import svm
from sklearn import tree
from sklearn.ensemble import RandomForestRegressor
from sklearn.learning_curve import learning_curve
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import explained_variance_score

# Importing and Creating Training & Testing Sets


In [10]:
# Load the raw competition files; the first row of each CSV is the header.
original_train, original_test = [
    pd.read_csv(csv_path, header = 0)
    for csv_path in ('../train.csv', '../test.csv')
]

In [63]:
# Build the training frame: add month / day-of-week / hour columns taken from
# the timestamp, then drop the columns that are not used as model inputs.
train_stamps = pd.DatetimeIndex(original_train.datetime)
df_train = original_train.copy()
# Columns are added one by one (not via assign) so their order stays
# month, day, hour.
for new_column, stamp_field in (('month', 'month'), ('day', 'dayofweek'), ('hour', 'hour')):
    df_train[new_column] = getattr(train_stamps, stamp_field)
df_train = df_train.drop(['datetime','casual','registered'], axis = 1)

In [64]:
# Split the training frame into a feature matrix and a target vector,
# both as plain NumPy arrays.
train_features = df_train.drop(['count'], axis = 1)
df_train_x = train_features.values
df_train_y = df_train['count'].values

In [65]:
# Build the test frame with the same month / day-of-week / hour columns as the
# training frame, then drop the raw timestamp.
test_stamps = pd.DatetimeIndex(original_test.datetime)
df_test = original_test.copy()
# Columns are added one by one so their order stays month, day, hour.
for new_column, stamp_field in (('month', 'month'), ('day', 'dayofweek'), ('hour', 'hour')):
    df_test[new_column] = getattr(test_stamps, stamp_field)
df_test = df_test.drop(['datetime'], axis = 1)

# Initial Data View

In [22]:
# Show the first two rows of the engineered training frame.
df_train.head(2)

Unnamed: 0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,count,month,day,hour
0,1,0,0,1,9.84,14.395,81,0.0,16,1,5,0
1,1,0,0,1,9.02,13.635,80,0.0,40,1,5,1


In [27]:
# Show the dtype of every column in the training frame.
df_train.dtypes

season          int64
holiday         int64
workingday      int64
weather         int64
temp          float64
atemp         float64
humidity        int64
windspeed     float64
count           int64
month           int32
day             int32
hour            int32
dtype: object

In [26]:
# Show the (rows, columns) size of the training frame.
df_train.shape

(10886, 12)

# Utils

In [56]:
# Cross Validation Generator
# Three shuffled splits, each holding out 20% of the training rows; the fixed
# random_state keeps the splits the same on every pass.
cv = cross_validation.ShuffleSplit(
    df_train_x.shape[0],
    n_iter=3,
    test_size=0.2,
    random_state=0,
)

In [69]:
# Function that prints the prediction according to the submission format
def printPrediction(pred, fileName='pred.csv', datetimes=None):
    """Write predictions to a CSV file in the ``datetime,count`` format.

    Parameters
    ----------
    pred : sequence of numbers
        Predicted counts. Each value is passed through the built-in
        ``round`` and converted to ``int`` before being written.
    fileName : str
        Path of the CSV file to create (an existing file is overwritten).
    datetimes : sequence, optional
        Timestamps paired positionally with ``pred``. When omitted, the
        ``datetime`` column of the notebook-level ``original_test`` frame
        is used.

    Raises
    ------
    ValueError
        If ``pred`` holds more values than there are timestamps.
    """
    if datetimes is None:
        datetimes = original_test['datetime']

    # Fail loudly instead of silently dropping the surplus predictions.
    if len(pred) > len(datetimes):
        raise ValueError(
            "got {} predictions but only {} timestamps".format(
                len(pred), len(datetimes)))

    # Collect the lines and join once rather than growing a string in a loop.
    lines = ["datetime,count\n"]
    lines.extend(
        "{},{}\n".format(stamp, int(round(value)))
        for stamp, value in zip(datetimes, pred))

    # The context manager closes the file even if the write fails.
    with open(fileName, 'w') as out_file:
        out_file.write("".join(lines))

# Baseline comparison: score three regressors on every split of `cv`.
# The data comes from df_train_x / df_train_y, the arrays this notebook
# builds; the previous df_train_data / df_train_target names are not defined
# anywhere here and fail on a fresh kernel.
# Each entry pairs the label to print with a factory, so every fold gets a
# freshly constructed estimator.
baseline_models = [
    ("SVR(kernel='rbf',C=10,gamma=.001)",
     lambda: svm.SVR(kernel ='rbf', C = 10, gamma = .001)),
    ("Ridge",
     lambda: linear_model.Ridge()),
    ("Random Forest(n_estimators = 100)",
     lambda: RandomForestRegressor(n_estimators = 100)),
]

for model_label, make_model in baseline_models:
    print(model_label)
    for train, test in cv:
        svc = make_model().fit(df_train_x[train], df_train_y[train])
        print("train score: {0:.3f}, test score: {1:.3f}\n".format(
            svc.score(df_train_x[train], df_train_y[train]),
            svc.score(df_train_x[test], df_train_y[test])))

# Models

## Decision Tree

In [51]:
#Finding best parameters
# Hold out 30% of the training rows; the grid search only sees the other 70%.
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    df_train_x, df_train_y, test_size=0.3, random_state=0)

# A list of single-key dicts: each parameter is searched on its own, with
# every other parameter left at the estimator's default.
tuned_parameters = [
    {'max_depth':[9,10,11,20,25,50]},
    {'min_samples_leaf':[1,2,5,10,50,100]},
    {'min_samples_split':[1,2,5,10,50,100]},
    {'presort':[True, False]},
    {'min_weight_fraction_leaf':[0.0, 0.1, 0.3, 0.4, 0.5]}]

scores = ['r2']

for score in scores:

    print(score)

    clf = GridSearchCV(tree.DecisionTreeRegressor(), tuned_parameters, cv=5, scoring=score)
    clf.fit(X_train, y_train)

    print("Best parameters set found on development set:")

    #best_estimator_ returns the best estimator chosen by the search
    print(clf.best_estimator_)
    print("")
    print("Grid scores on development set:")
    print("")
    #grid_scores_ returns:
    #    * a dict of parameter settings
    #    * the mean score over the cross-validation folds
    #    * the list of scores for each fold
    # The per-fold array is named fold_scores so it no longer rebinds
    # `scores`, the list the outer loop iterates over.
    # NOTE(review): the printed +/- value is half a standard deviation;
    # confirm this is intended (a 2 * std band is the more common choice).
    for params, mean_score, fold_scores in clf.grid_scores_:
        print("%0.3f (+/-%0.03f) for %r"
              % (mean_score, fold_scores.std() / 2, params))
    print("")

r2
Best parameters set found on development set:
DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_samples_leaf=10, min_samples_split=2,
           min_weight_fraction_leaf=0.0, presort=False, random_state=None,
           splitter='best')

Grid scores on development set:

0.795 (+/-0.007) for {'max_depth': 9}
0.795 (+/-0.006) for {'max_depth': 10}
0.787 (+/-0.009) for {'max_depth': 11}
0.740 (+/-0.008) for {'max_depth': 20}
0.738 (+/-0.009) for {'max_depth': 25}
0.740 (+/-0.007) for {'max_depth': 50}
0.741 (+/-0.010) for {'min_samples_leaf': 1}
0.761 (+/-0.007) for {'min_samples_leaf': 2}
0.797 (+/-0.006) for {'min_samples_leaf': 5}
0.806 (+/-0.006) for {'min_samples_leaf': 10}
0.758 (+/-0.010) for {'min_samples_leaf': 50}
0.702 (+/-0.007) for {'min_samples_leaf': 100}
0.741 (+/-0.008) for {'min_samples_split': 1}
0.738 (+/-0.009) for {'min_samples_split': 2}
0.753 (+/-0.008) for {'min_samples_split': 5}
0.778 (+/-0.009) for 

In [67]:
# Cross Validation
# Decision tree configured with the settings reported by the grid search.
dtr_settings = dict(
    criterion='mse',
    max_depth=None,
    max_features=None,
    max_leaf_nodes=None,
    min_samples_leaf=10,
    min_samples_split=2,
    min_weight_fraction_leaf=0.0,
    presort=False,
    random_state=None,
    splitter='best',
)
dtr = tree.DecisionTreeRegressor(**dtr_settings)

score_line = "train score: {0:.3f}, test score: {1:.3f}\n"
for train, test in cv:
    # fit() returns the estimator itself, so clf and dtr are the same object.
    clf = dtr.fit(df_train_x[train], df_train_y[train])
    # Score the training rows first, then the held-out rows.
    split_scores = [clf.score(df_train_x[rows], df_train_y[rows])
                    for rows in (train, test)]
    print(score_line.format(*split_scores))

train score: 0.882, test score: 0.804

train score: 0.881, test score: 0.832

train score: 0.881, test score: 0.818



In [70]:
# Predicting & Exporting File
# Refit on the complete training set first: the cross-validation loop leaves
# `dtr` fitted on only the training rows of its last split.
dtr.fit(df_train_x, df_train_y)
# Pass a plain array, matching the array form used for fitting.
dtr_pred = dtr.predict(df_test.values)
printPrediction(dtr_pred)


## Random Forest

In [77]:
#Finding best parameters
# Hold out 30% of the training rows; the grid search only sees the other 70%.
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    df_train_x, df_train_y, test_size=0.3, random_state=0)

# A list of single-key dicts: each parameter is searched on its own, with
# every other parameter left at the estimator's default.
tuned_parameters = [
    {'max_depth':[50,100, 500, 1000]}, #Best: 1000
    #{'n_estimators':[1000, 10000]}, #Not searched; assumed to be 1000
    {'min_samples_leaf':[1,2,5,10,50,100]}, #2
    {'min_samples_split':[1,2,5,10,50,100]}, #10
    {'min_weight_fraction_leaf':[0.0, 0.1, 0.3, 0.4, 0.5]}] #0.0

scores = ['r2']

for score in scores:

    print(score)

    clf = GridSearchCV(RandomForestRegressor(), tuned_parameters, cv=5, scoring=score)
    clf.fit(X_train, y_train)

    print("Best parameters set found on development set:")

    #best_estimator_ returns the best estimator chosen by the search
    print(clf.best_estimator_)
    print("")
    print("Grid scores on development set:")
    print("")
    #grid_scores_ returns:
    #    * a dict of parameter settings
    #    * the mean score over the cross-validation folds
    #    * the list of scores for each fold
    # The per-fold array is named fold_scores so it no longer rebinds
    # `scores`, the list the outer loop iterates over.
    # NOTE(review): the printed +/- value is half a standard deviation;
    # confirm this is intended (a 2 * std band is the more common choice).
    for params, mean_score, fold_scores in clf.grid_scores_:
        print("%0.3f (+/-%0.03f) for %r"
              % (mean_score, fold_scores.std() / 2, params))
    print("")





r2
Best parameters set found on development set:
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None, min_samples_leaf=1,
           min_samples_split=1, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

Grid scores on development set:

0.847 (+/-0.003) for {'max_depth': 50}
0.847 (+/-0.004) for {'max_depth': 100}
0.847 (+/-0.003) for {'max_depth': 500}
0.848 (+/-0.002) for {'max_depth': 1000}
0.843 (+/-0.004) for {'min_samples_leaf': 1}
0.845 (+/-0.003) for {'min_samples_leaf': 2}
0.844 (+/-0.004) for {'min_samples_leaf': 5}
0.837 (+/-0.005) for {'min_samples_leaf': 10}
0.771 (+/-0.004) for {'min_samples_leaf': 50}
0.693 (+/-0.007) for {'min_samples_leaf': 100}
0.849 (+/-0.003) for {'min_samples_split': 1}
0.846 (+/-0.005) for {'min_samples_split': 2}
0.843 (+/-0.004) for {'min_samples_split': 5}
0.848 (+/-0.005) for {'mi

In [78]:
# Cross Validation
# Random forest with hand-picked settings (1000 trees, depth capped at 1000).
rfr_settings = dict(
    bootstrap=True,
    criterion='mse',
    max_depth=1000,
    max_features='auto',
    max_leaf_nodes=None,
    min_samples_leaf=2,
    min_samples_split=10,
    min_weight_fraction_leaf=0.0,
    n_estimators=1000,
    n_jobs=1,
    oob_score=False,
    random_state=None,
    verbose=0,
    warm_start=False,
)
rfr = RandomForestRegressor(**rfr_settings)

score_line = "train score: {0:.3f}, test score: {1:.3f}\n"
for train, test in cv:
    # fit() returns the estimator itself, so clf and rfr are the same object.
    clf = rfr.fit(df_train_x[train], df_train_y[train])
    # Score the training rows first, then the held-out rows.
    split_scores = [clf.score(df_train_x[rows], df_train_y[rows])
                    for rows in (train, test)]
    print(score_line.format(*split_scores))

train score: 0.941, test score: 0.860

train score: 0.940, test score: 0.876

train score: 0.941, test score: 0.866



In [79]:
# Predicting & Exporting File
# Refit on the complete training set first: the cross-validation loop leaves
# `rfr` fitted on only the training rows of its last split.
rfr.fit(df_train_x, df_train_y)
# Pass a plain array, matching the array form used for fitting.
rfr_pred = rfr.predict(df_test.values)
printPrediction(rfr_pred)


In [96]:
#TRAINING WITH BEST PARAMETERS FOUND
rfr = RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=500, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

# Take the split explicitly instead of relying on `train` / `test` left over
# from an earlier loop. The last split of `cv` is what those leftover names
# would hold after looping over it.
train, test = list(cv)[-1]

# Uses df_train_x / df_train_y, the arrays this notebook builds; the old
# df_train_data / df_train_target names are not defined anywhere here.
svc2 = rfr.fit(df_train_x[train], df_train_y[train])
print("train score: {0:.3f}, test score: {1:.3f}\n".format(
    svc2.score(df_train_x[train], df_train_y[train]),
    svc2.score(df_train_x[test], df_train_y[test])))

train score: 0.982, test score: 0.871



In [97]:
#PREDICT TEST DATA
# Drop 'datetime' / 'year' only if they are still present: the cell that
# builds df_test already removes 'datetime' and never adds 'year', so an
# unconditional drop raises.
non_feature_columns = [name for name in ('datetime', 'year')
                       if name in df_test.columns]
df_test_to_pred = df_test.drop(non_feature_columns, axis = 1)
test_pred = rfr.predict(df_test_to_pred)

# Reuse the shared writer rather than a hand-rolled copy of it. It takes the
# timestamps from original_test, which still has the 'datetime' column that
# df_test has lost.
printPrediction(test_pred, 'pred.csv')