### Load the datasets

In [1]:
import numpy as np
import pandas as pd

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

print('train.shape: {}'.format(train.shape))
print('test.shape: {}'.format(test.shape))

train.shape: (10886, 12)
test.shape: (6493, 9)


### Drop the useless 'casual' and 'registiered' attributes

In [3]:
train = train.drop(['casual', 'registered'], axis=1)
print('train.shape: {}'.format(train.shape))
print('test.shape: {}'.format(test.shape))

train.shape: (10886, 10)
test.shape: (6493, 9)


### Combine the training and testing data for easy processing

In [4]:
sep = train.shape[0]
combine = pd.concat([train.drop('count', axis=1), test], axis=0)
print('combine.shape: {}'.format(combine.shape))

combine.shape: (17379, 9)


### Log transform the target 'count'

In [5]:
train['count'] = np.log1p(train['count'])

### Feature engineering

- datetime: to pandas datetime dtype and isolate the time attributes
- season: to pandas category dtype
- holiday: to binary, already done
- workingday: to binary, already done
- weather: to pandas category dtype
- temp: stay the same
- atemp: to drop
- humidity: stay the same
- windspeed: stay the same
- year: stay the same (already in binary style)
- month: to pandas category dtype
- day: to drop
- hour: to pandas category dtype

In [6]:
combine['datetime'] = pd.to_datetime(combine['datetime'])

combine['year'] = combine['datetime'].dt.year
combine['month'] = combine['datetime'].dt.month
combine['day'] = combine['datetime'].dt.day
combine['hour'] = combine['datetime'].dt.hour

combine = combine.drop('datetime', axis=1)

combine['season'] = combine.season.astype('category')
combine['weather'] = combine.weather.astype('category')
combine = combine.drop('atemp', axis=1)
combine['month'] = combine.month.astype('category')
combine = combine.drop('day', axis=1)
combine['hour'] = combine.hour.astype('category')

#### Transform categorical data into dummy variables

In [7]:
combine = pd.get_dummies(combine)

In [8]:
combine.head()

Unnamed: 0,holiday,workingday,temp,humidity,windspeed,year,season_1,season_2,season_3,season_4,...,hour_14,hour_15,hour_16,hour_17,hour_18,hour_19,hour_20,hour_21,hour_22,hour_23
0,0,0,9.84,81,0.0,2011,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,9.02,80,0.0,2011,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,9.02,80,0.0,2011,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,9.84,75,0.0,2011,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,9.84,75,0.0,2011,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Prepare the data for modeling

In [9]:
X_train = combine[:sep]
y_train = train['count']
X_test = combine[sep:]

print('X_train.shape: {}'.format(X_train.shape))
print('y_train.shape: {}'.format(y_train.shape))
print('X_test.shape: {}'.format(X_test.shape))

X_train.shape: (10886, 50)
y_train.shape: (10886,)
X_test.shape: (6493, 50)


### Use xgboost to predict the target

In [10]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from xgboost.sklearn import XGBRegressor

import warnings
warnings.filterwarnings('ignore')

#### Set up the metric

In [11]:
mt = 'neg_mean_squared_error'
kf = KFold(n_splits=4, shuffle=True, random_state=0)

def performance(model):
    scores = - cross_val_score(model, X_train, y_train, cv=kf, scoring=mt, n_jobs=4)
    score_mean = scores.mean()
    score_std = scores.std()
    print('score mean: {:.4f}'.format(score_mean))
    print('score std: {:.4f}'.format(score_std))
    return

def grid_search(model, params):
    grid = GridSearchCV(model, params, cv=kf, scoring=mt, verbose=True, n_jobs=4).fit(X_train, y_train)
    print('grid.best_score_: {:.4f}'.format(-grid.best_score_))
    print('grid.best_params:\n{}'.format(grid.best_params_))
    return grid.best_estimator_

#### Baseline

In [12]:
performance(XGBRegressor())

score mean: 0.2288
score std: 0.0041


#### Grid search for best parameters

In [13]:
params = {
    'objective': ['reg:squarederror'],
    'n_estimators': [500],
    'learning_rate': [0.09],
    'gamma': [1e-3],
    'max_depth': [5],
    'reg_alpha': [1.0],
    'reg_lambda': [0.01],
    'subsample_by_tree': [0.1]
}

reg = grid_search(XGBRegressor(), params)

Fitting 4 folds for each of 1 candidates, totalling 4 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:   25.0s remaining:   25.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   25.1s finished


grid.best_score_: 0.1002
grid.best_params:
{'gamma': 0.001, 'learning_rate': 0.09, 'max_depth': 5, 'n_estimators': 500, 'objective': 'reg:squarederror', 'reg_alpha': 1.0, 'reg_lambda': 0.01, 'subsample_by_tree': 0.1}


#### Make prediction and submit the result

In [14]:
reg = reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)
y_pred = np.exp(y_pred) - 1
test['count'] = y_pred
sub = test[['datetime', 'count']]
sub.to_csv('sub.csv', index=False)