# Demand prediction baseline solution

Victor Kantor, xead.wl@gmail.com

## Let's load our data

In [165]:
import numpy as np
import pandas as pd
import pickle

In [166]:
sample_submission = pd.read_csv("sample_submission.tsv")

In [167]:
def save_obj(obj, name ):
    with open('obj/'+ name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(name):
    with open('obj/' + name + '.pkl', 'rb') as f:
        return pickle.load(f)

In [168]:
data = load_obj('data')

In [169]:
X_train = data['train'][0][0]
m_train = data['train'][0][1]
y_train = data['train'][1]

In [170]:
X_test = data['test'][0][0]
m_test = data['test'][0][1]

In [171]:
blocks = data['blocks']
blocks

[0, 4, 57, 60, 73, 133]

## Let's make log transform of our data

In [172]:
print X_train

[[  1.00000000e+00   0.00000000e+00   0.00000000e+00 ...,   6.22070000e+04
    6.39600000e+04   7.62860000e+04]
 [  1.00000000e+00   0.00000000e+00   0.00000000e+00 ...,   1.27999000e+05
    1.29234000e+05   1.48216000e+05]
 [  1.00000000e+00   0.00000000e+00   0.00000000e+00 ...,   4.00000000e+01
    3.00000000e+01   4.00000000e+01]
 ..., 
 [  0.00000000e+00   0.00000000e+00   0.00000000e+00 ...,   1.75780000e+04
    1.34970000e+04   3.03900000e+03]
 [  0.00000000e+00   0.00000000e+00   0.00000000e+00 ...,   5.80650000e+04
    7.57210000e+04   6.52000000e+03]
 [  0.00000000e+00   0.00000000e+00   0.00000000e+00 ...,   5.95320000e+04
    9.17580000e+04   1.02126000e+05]]


In [122]:
x = X_train[:,73:]
x[x == 0] = 1e-7
X_train[:,73:] = np.log(x)
y_train = np.log(y_train)

In [123]:
x = X_test[:,73:]
x[x == 0] = 1e-7
X_test[:,73:] = np.log(x)

## Let's set up validation for parameter tuning

In [173]:
# Доля в трейн сет
frac = 0.8
l = len (X_train)
ind = int(l*frac)
print ind
X_t = X_train[:ind,:]
y_t = y_train[:ind]
X_v = X_train[ind:,:]
y_v = y_train[ind:] 

5796


In [174]:
X_train.shape

(7246, 133)

In [None]:
X_

## Parameter tuning  for GBR

In [175]:
from itertools import product

In [176]:
n_est = [50, 100, 200]
max_depth = [4, 5, 7]
grid = list(product(n_est, max_depth))
mse_score = []

In [177]:
%%time
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

for p in grid:
    model = GradientBoostingRegressor(n_estimators=p[0], max_depth=p[1], random_state=42)
    model.fit(X_t, y_t)

    preds = model.predict(X_v)
    mse_err = mean_squared_error(y_v, preds)
    print p, mse_err
    mse_score.append(mse_err)    

(50, 4) 14856881054.6
(50, 5) 15944948175.1
(50, 7) 17976400358.0
(100, 4) 14092333770.9
(100, 5) 15445530245.9
(100, 7) 17839784072.9
(200, 4) 13964273125.8
(200, 5) 15302403817.1
(200, 7) 17726758242.6
CPU times: user 4min 19s, sys: 240 ms, total: 4min 20s
Wall time: 4min 24s


## Evaluate best parameters

In [131]:
model = GradientBoostingRegressor(n_estimators=50, max_depth=5, random_state=42)
model.fit(X_train, y_train)
preds = np.exp(model.predict(X_test))
preds = np.multiply(preds, m_test)
print preds

[   1133.8607276    18595.15539151  256640.26877896 ...,   12752.4394081
     364.12401065    2818.86984272]


In [132]:
sample_submission['y'] = preds

In [133]:
sample_submission.head(5)

Unnamed: 0,Num,y
0,348622,1133.860728
1,348623,18595.155392
2,348624,256640.268779
3,348625,20164.130725
4,348626,18.553431


In [134]:
# In GBM you can get some negative predictions:
print sample_submission[sample_submission['y'] < 0]

Empty DataFrame
Columns: [Num, y]
Index: []


In [135]:
sample_submission['y'] = sample_submission['y'].map(lambda x: x if x > 0 else 0.0)

In [136]:
sample_submission.to_csv("baseline_submission.tsv", sep=',', index=False)