In [1]:
from pandas.tseries.offsets import *
import pandas as pd
import numpy as np
import csv
import matplotlib.pyplot as plt
import itertools
from sklearn.tree import DecisionTreeRegressor
from sklearn import grid_search
from sklearn.metrics import make_scorer
from sklearn.metrics import mean_squared_error

%matplotlib inline

In [2]:
path = {}
path['train'] = {'order':'./training_data_2/order_data/order_data_{}', 
                    'weather': './training_data_2/weather_data/weather_data_{}',
                    'traffic': './training_data_2/traffic_data/traffic_data_{}',
                    'district':'./training_data_2/cluster_map/cluster_map',
                    'poi':'./training_data_2/poi_data/poi_data'}
path['test'] = {'order':'./test_data_2/order_data/order_data_{}_test', 
                'weather': './test_data_2/weather_data/weather_data_{}_test',
                'traffic': './test_data_2/traffic_data/traffic_data_{}_test',
                'district':'./test_data_2/cluster_map/cluster_map'}

M = np.timedelta64(1, 'm') # base time stamp of 1 minute

test_slot1 = range(46,154,12) # The test slot for day 22, 26, 30
test_slot2 = range(58,154,12) # The test slot for day 24, 28

D_range = range(1,67) # List of all district Ids
T_range = range(1,145) # List of all time slots

# Dictionary of District Info Table
district_dict = pd.read_table(path['train']['district'], header=None, index_col=0)
district_dict = district_dict[1].to_dict()

In [3]:
print len(district_dict) == 66

True


## Utility Funtion

In [4]:
def index(district, slot):
    if type(district) is int:
        if type(slot) is int:
            return [x for x in itertools.product([district],[slot])]
        else:
            return [x for x in itertools.product([district],slot)]
    else:
        if type(slot) is int:
            return [x for x in itertools.product(district,[slot])]
        else:
            return [x for x in itertools.product(district,slot)]

In [5]:
def District(df):
    return df['district'].apply(lambda x: district_dict[x])

In [6]:
def Weekday(df):
    return pd.to_datetime(df['time']).apply(lambda x: x.weekday())

In [7]:
def Time(df, day):
    time = pd.to_datetime(df['time'])
    time = (time - pd.Timestamp(day)) / M / 10 + 1
    return time.astype(int)

## Getting testing data and training data

In [9]:
def Traffic(day, option):
    df = pd.read_table(path[option]['traffic'].format(day.date()), header=None,
                      names=['district', 'LV1', 'LV2', 'LV3', 'LV4','time'])
    df['district'] = District(df)
    df['weekday'] = Weekday(df)
    df['time'] = Time(df, str(day.date()))
    for L in ['LV{}'.format(n) for n in range(1,5)]:
        df[L]=df[L].apply(lambda x: x.split(':')[1]).astype(int)
    index = pd.MultiIndex.from_arrays([df['district'].values, df['time'].values], names=('district', 'time'))
    return pd.DataFrame({'weekday':df['weekday'].values,
                         'day':day.day,
                         'district':df['district'].values,
                         'time':df['time'].values,
                         'LV1':df['LV1'].values, 
                         'LV2':df['LV2'].values,
                         'LV3':df['LV3'].values,
                         'LV4':df['LV4'].values,}, index=index).sort_index()

In [10]:
# DTG: District Time Gap
def DTG(day, option):
    df = pd.read_table(path[option]['order'].format(day.date()), header=None, usecols=[1,3,6],
                      names=['driver', 'district', 'time'])
    df = df[df['driver'].isnull()] 
    df['district'] = District(df)
    df['time'] = Time(df, day.date())
    Order = df.groupby(['district', 'time'])
    return pd.DataFrame({'gap':Order.size()})

In [11]:
def Weather(day, option):
    df = pd.read_table(path[option]['weather'].format(day.date()), header=None,
                      names=['time', 'weather', 'temprature', 'pm2.5'])
    df['time'] = Time(df, day.date())
    df = df.drop_duplicates(subset='time')
    DF = pd.DataFrame({'time': T_range}, columns=df.columns)
    DF = DF.set_index('time')
    DF.update(df.set_index('time'))
    return DF.fillna(method='bfill').fillna(method='ffill')

In [12]:
# Dictionary of order data for testing, indexed by date, cols = [gap]
test_order = {} 
# Dictionary of traffic data for testing, indexed by date, cols = [weekday, district, time, LV1, LV2, LV3, LV4]
test_traffic = {}
# Dictionary of weather data for testing, indexed by date, cols = [temperature, weather, pm2.5]
test_weather = {} 
for day in pd.date_range('1/23/2016', periods=5, freq='2D'):
    test_order[str(day.date())] = DTG(day, 'test')
    test_traffic[str(day.date())] = Traffic(day, 'test')
    test_weather[str(day.date())] = Weather(day, 'test')

In [13]:
# Check data
print len(test_order.keys()) == 5
print len(test_traffic.keys()) == 5
print len(test_weather.keys()) == 5
print all(test_order['2016-01-23'].columns.values == np.array(['gap']))
print all(test_traffic['2016-01-23'].columns.values == np.array(['LV1', 'LV2', 'LV3', 'LV4', 'day', 'district', 'time', 'weekday']))
print all(test_weather['2016-01-23'].columns.values == np.array(['weather', 'temprature', 'pm2.5']))

True
True
True
True
True
True


In [14]:
# Dictionary of order data for training, indexed by date, cols = [gap]
train_order = {}
# Dictionary of traffic data for training, indexed by date, cols = [weekday, district, time, LV1, LV2, LV3, LV4]
train_traffic = {}
# Dictionary of weather data for training, indexed by date, cols = [temperature, weather, pm2.5]
train_weather = {}
for day in pd.date_range('1/1/2016', periods=21, freq='D'):
    train_order[str(day.date())] = DTG(day, 'train')
    train_traffic[str(day.date())] = Traffic(day, 'train')
    train_weather[str(day.date())] = Weather(day, 'train')

In [15]:
# Check data
print len(train_order.keys()) == 21
print len(train_traffic.keys()) == 21
print len(train_weather.keys()) == 21

True
True
True


### POI

In [16]:
columns = ['district'] + range(1,26)
POI = pd.DataFrame(columns=columns)
with open(path['train']['poi'], 'r') as f:
    for i, line in enumerate(f): 
        interests = line.strip().split('\t')
        row = {'district': interests[0]}
        for item in interests[1:]:
            category,num = item.split(':')
            category = int(category.split('#')[0])
            if category in row:
                row[category] += int(num)
            else:
                row[category] = int(num)
        POI = pd.concat( [POI, pd.DataFrame(row, index=[i], columns=columns)])
POI['district'] = District(POI)
POI = POI.set_index('district').sort_index()
POI = POI.fillna(0)
# Standardization
POI = (POI - POI.mean()) / POI.std()

## Preparing training data X and Y

In [222]:
def preprocessing(day):
    X = train_order[day].reindex(index(D_range,T_range)).fillna(0)
    X['last'] = X['gap'].shift().fillna(method='bfill')
    X['diff'] = pd.Series(index=X.index)
    X['GAP'] = pd.Series(index=X.index)
    for D in D_range:
        X.loc[(D, T_range),'diff'] = X.loc[(D, T_range),'gap'].diff().shift(-1).fillna(0)
        X.loc[(D, T_range), 'GAP'] = X.loc[(D, T_range),'gap'].shift(-1).fillna(method='ffill')
    # filling nearest value for missin traffic data 
    tempX = train_traffic[day].reindex(index(D_range,T_range)).fillna(method='bfill').fillna(method='ffill')
    X = pd.concat([X, tempX],axis=1)
    X = X.join(train_weather[day], on='time')
    Y_diff = X['diff']
    Y_gap = X['GAP']
    X.drop('diff', axis=1, inplace=True)
    X.drop('GAP', axis=1, inplace=True)
    return X, Y_diff, Y_gap

In [287]:
# Create X, Y_diff, Y_gap
# X = []
# Y_diff = []
# Y_gap = []
# for d in pd.date_range('1/2/2016', periods=20, freq='D'):
#     tempX, tempY_diff, tempY_gap = preprocessing(str(d.date()))
#     X.append(tempX)
#     Y_diff.append(tempY_diff)
#     Y_gap.append(tempY_gap)
# X = pd.concat(X)
# X = X.join(POI,on='district')
# X.sort_index(inplace=True)
# Y_diff = pd.concat(Y_diff)
# Y_diff.sort_index(inplace=True)
# Y_gap = pd.concat(Y_gap)
# Y_gap.sort_index(inplace=True)
# X.to_csv('./X.csv', columns=X.columns, header=True)
# Y_diff.to_csv('./Y_diff.csv', header=True)
# Y_gap.to_csv('./Y_gap.csv', header=True)

X = pd.read_csv('./X.csv', index_col=('district', 'time'))
Y_gap = pd.read_csv('./Y_gap.csv', index_col=('district', 'time'))
Y_diff = pd.read_csv('./Y_diff.csv', index_col=('district', 'time'))

In [241]:
print len(X.columns) == 38
print Y_diff.shape[0] == X.shape[0]
print Y_gap.shape[0] == X.shape[0]

True
True
True


## Developing Zone

In [None]:
def add_test_data(day):
    X = train_order[day]
    X['diff'] = pd.Series(index=X.index)
    for D in D_range:
        X.loc[(D, T_range),'diff'] = X.loc[(D, T_range),'gap'].diff().shift(-1).fillna(0)
    # filling nearest value for missin traffic data 
    tempX = train_traffic[day].reindex(index(D_range,T_range)).fillna(method='bfill').fillna(method='ffill')
    X = pd.concat([X, tempX],axis=1)
    X = X.join(train_weather[day], on='time')
    Y = X['diff']
    X.drop('diff', axis=1, inplace=True)
    return X, Y

## Identified the most common gap growth each day

In [87]:
DF = pd.DataFrame(columns=range(1,12))
for d in pd.date_range('1/2/2016', periods=20, freq='1D'):
    df = train_order[str(d.date())]
    df = df.reindex(index(D_range, T_range)).fillna(0)
    for D in D_range:
        df.loc[(D, T_range),'diff'] = df.loc[(D, T_range),'gap'].diff().shift(-1).fillna(0)
    row = pd.DataFrame(df['diff'].value_counts().sort_values(ascending=False).iloc[:11].index.values.reshape((1,11)),
                       columns=range(1,12), index=[d.day])
    DF = DF.append(row)
print DF

    1   2   3   4   5   6   7   8   9   10  11
2    0  -1   1   2  -2   3  -3  -4   4   5  -5
3    0   1  -1   2  -2  -3   3   4  -4   5  -5
4    0   1  -1  -2   2  -3   3   4  -4   5  -5
5    0  -1   1   2  -2   3  -3   4  -4  -5   5
6    0   1  -1   2  -2   3  -3  -4   4   5  -5
7    0  -1   1  -2   2   3  -3  -4   4   5  -5
8    0  -1   1  -2   2   3  -3  -4   4   5  -5
9    0   1  -1  -2   2   3  -3   4  -4  -5   5
10   0   1  -1  -2   2   3  -3   4  -4  -5   5
11   0   1  -1   2  -2   3  -3  -4   4   5  -5
12   0   1  -1  -2   2  -3   3   4  -4   5  -5
13   0   1  -1  -2   2  -3   3   4  -4  -5   5
14   0  -1   1   2  -2   3  -3   4  -4   5  -5
15   0   1  -1  -2   2  -3   3  -4   4  -5   5
16   0   1  -1  -2   2  -3   3   4  -4  -5   5
17   0  -1   1   2  -2  -3   3   4  -4  -5   5
18   0  -1   1  -2   2   3  -3  -4   4   5  -5
19   0  -1   1   2  -2   3  -3   4  -4   5  -5
20   0  -1   1  -2   2   3  -3   4  -4  -5   5
21   0   1  -1  -2   2   3  -3  -4   4  -5   5


## Identified the most common gap each day

In [88]:
DF = pd.DataFrame(columns=range(1,12))
for d in pd.date_range('1/2/2016', periods=20, freq='1D'):
    df = train_order[str(d.date())]
    row = pd.DataFrame(df['gap'].value_counts().sort_values(ascending=False)[:11].index.values.reshape((1,11)),
                       columns=range(1,12), index=[d.day])
    DF = DF.append(row)

In [89]:
for d in pd.date_range('1/23/2016', periods=5, freq='2D'):
    df = test_order[str(d.date())]
    row = pd.DataFrame(df['gap'].value_counts().sort_values(ascending=False)[:11].index.values.reshape((1,11)),
                       columns=range(1,12), index=[d.day])
    DF = DF.append(row)

In [90]:
# columns is the top 11 most common gap
# index is each day
print DF

    1   2   3   4   5   6   7   8   9   10  11
2    1   2   3   4   5   6   7   8   9  10  11
3    1   2   3   4   5   6   7   8  10   9  11
4    1   2   3   4   5   6   7   9   8  10  11
5    1   2   3   4   5   6   7   8   9  10  11
6    1   2   3   4   5   6   7   8   9  10  12
7    1   2   3   4   5   6   7   8   9  10  11
8    1   2   3   4   5   6   7   8   9  10  12
9    1   2   3   4   5   6   7   8   9  10  11
10   1   2   3   4   5   6   7   8   9  10  11
11   1   2   3   4   5   6   7   8   9  10  11
12   1   2   3   4   5   6   7   8   9  11  10
13   1   2   3   4   5   6   7   8   9  11  10
14   1   2   3   4   5   6   7   8   9  10  11
15   1   2   3   4   5   6   7   8   9  10  11
16   1   2   3   4   5   6   7   8   9  10  11
17   1   2   3   4   5   6   7   8   9  10  11
18   1   2   3   4   5   6   7   8   9  11  10
19   1   2   3   4   5   6   7   8   9  12  11
20   1   2   3   4   5   6   7   8   9  10  11
21   1   2   3   4   5   6   7   8   9  10  11
23   1   2   

## Method4 by GradientBoosting on classification

In [105]:
def score3(day, pred):
    ans = test_order[day].reindex(index(D_range, S_range[day])).fillna(0)
    ans = ans['gap'].values
    pred = pred[ans>0]
    ans = ans[ans>0]
    gap = (ans - pred) / ans
    return np.fabs(gap).sum()/ans.shape[0]

In [106]:
def select_last_points(day, slot):
    slot = np.array(slot)
    x = test_order[day].reindex(index(D_range, slot)).fillna(0)
    x['last'] = test_order[day].reindex(index(D_range, slot-1)).fillna(0)
    x = pd.concat([x, test_traffic[day]],axis=1)
    x = x.drop(x[x['gap'].isnull()].index)
    # For missing traffic data on district 54, replaced by district 17 
    for t in slot:
        x.loc[(54,t)]['LV1':'weekday'] = x.loc[(17,t)]['LV1':'weekday']
    x = x.join(test_weather[day], on='time')
    x = x.join(POI,on='district')
    print "Select data from {} on {}".format(day, slot)
    print "\t shape: {}".format(x.shape)
    return x

In [94]:
from sklearn.ensemble import GradientBoostingClassifier
slot1 = range(45,153,12) # Last time slot of test slot for day 23, 27, 31
slot2 = range(57,153,12) # Last time slot of test slot for day 25, 29
S_range = {'2016-01-23':slot1, '2016-01-25':slot2, '2016-01-27':slot1, '2016-01-29':slot2, '2016-01-31':slot1}

In [250]:
params = {'loss': 'deviance', 'learning_rate': 0.1, 'n_estimators': 50, 'min_samples_leaf':10, 'min_samples_split':100,
          'max_depth': 3, 'max_features': 0.8, 'subsample': 1.0, 'min_samples_split': 20}
clf = GradientBoostingClassifier(**params)
clf.fit(X,Y)

GradientBoostingClassifier(init=None, learning_rate=0.1, loss='deviance',
              max_depth=3, max_features=0.8, max_leaf_nodes=None,
              min_samples_leaf=10, min_samples_split=20,
              min_weight_fraction_leaf=0.0, n_estimators=50,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)

In [380]:
# 
SLOT1 = range(44,152,12)
SLOT2 = range(56,152,12)
RANGE = {'2016-01-23': SLOT1, '2016-01-25': SLOT2, '2016-01-27': SLOT1, '2016-01-29': SLOT2, '2016-01-31': SLOT1}
scores = []
for day in RANGE.keys():
    x = select_last_points(day, RANGE[day])
    score = score3(day, clf.predict(x[columns]))
    scores.append(score * x.shape[0])
    print '\t score: {}'.format(score)
print np.array(scores).sum()/2838

Select data from 2016-01-31 on [ 44  56  68  80  92 104 116 128 140]
	 shape: (594, 38)
	 score: 0.487475705874
Select data from 2016-01-23 on [ 44  56  68  80  92 104 116 128 140]
	 shape: (594, 38)
	 score: 0.455771252706
Select data from 2016-01-29 on [ 56  68  80  92 104 116 128 140]
	 shape: (528, 38)
	 score: 0.5231750185
Select data from 2016-01-27 on [ 44  56  68  80  92 104 116 128 140]
	 shape: (594, 38)
	 score: 0.486871117277
Select data from 2016-01-25 on [ 56  68  80  92 104 116 128 140]
	 shape: (528, 38)
	 score: 0.527273014907
0.494759231394


In [263]:
columns = []
for i, important in enumerate(clf.feature_importances_ > 0.03):
    if important:
        columns.append(X.columns[i])

In [377]:
newY = Y_gap[Y_gap>0]
newX = X[Y_gap>0]
newY[newY>12]=13

In [379]:
clf = GradientBoostingClassifier(**params)
clf.fit(newX[columns],newY)

GradientBoostingClassifier(init=None, learning_rate=0.1, loss='deviance',
              max_depth=3, max_features=0.8, max_leaf_nodes=None,
              min_samples_leaf=10, min_samples_split=20,
              min_weight_fraction_leaf=0.0, n_estimators=50,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)

### Feature transformations with ensembles of trees

In [419]:
def get_score(clf, columns):
    SLOT1 = range(44,152,12)
    SLOT2 = range(56,152,12)
    RANGE = {'2016-01-23': SLOT1, '2016-01-25': SLOT2, '2016-01-27': SLOT1, '2016-01-29': SLOT2, '2016-01-31': SLOT1}
    scores = []
    for day in RANGE.keys():
        x = select_last_points(day, RANGE[day])
        score = score3(day, clf.predict(x[columns]))
        scores.append(score * x.shape[0])
        print '\t score: {}'.format(score)
    print np.array(scores).sum()/2838

In [387]:
from sklearn.cross_validation import train_test_split
X_train, X_train_lr, y_train, y_train_lr = train_test_split(newX[columns],
                                                            newY,
                                                            test_size=0.5)
grd = GradientBoostingClassifier(**params)
grd.fit(X_train,y_train)

NameError: name 'LogisticRegression' is not defined

In [420]:
get_score(grd, columns)

Select data from 2016-01-31 on [ 44  56  68  80  92 104 116 128 140]
	 shape: (594, 38)
	 score: 0.514295689059
Select data from 2016-01-23 on [ 44  56  68  80  92 104 116 128 140]
	 shape: (594, 38)
	 score: 0.464276793287
Select data from 2016-01-29 on [ 56  68  80  92 104 116 128 140]
	 shape: (528, 38)
	 score: 0.538297314982
Select data from 2016-01-27 on [ 44  56  68  80  92 104 116 128 140]
	 shape: (594, 38)
	 score: 0.509096692953
Select data from 2016-01-25 on [ 56  68  80  92 104 116 128 140]
	 shape: (528, 38)
	 score: 0.541246695547
0.512218015394


In [403]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
enc = OneHotEncoder()
lm = LogisticRegression()
enc.fit(grd.apply(X_train)[:, :, 0])
lm.fit(enc.transform(grd.apply(X_train_lr)[:, :, 0]), y_train_lr)
# lm.fit(X_train_lr, y_train_lr)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [427]:
SLOT1 = range(44,152,12)
SLOT2 = range(56,152,12)
RANGE = {'2016-01-23': SLOT1, '2016-01-25': SLOT2, '2016-01-27': SLOT1, '2016-01-29': SLOT2, '2016-01-31': SLOT1}
scores = []
for day in RANGE.keys():
    x = select_last_points(day, RANGE[day])
    y_pred_grd_lm = lm.predict_proba(enc.transform(grd.apply(x[columns])[:, :, 0]))[:, 1]
    score = score3(day, pd.Series(y_pred_grd_lm, index=x.index))
    scores.append(score * x.shape[0])
    print '\t score: {}'.format(score)
print np.array(scores).sum()/2838

Select data from 2016-01-31 on [ 44  56  68  80  92 104 116 128 140]
	 shape: (594, 38)
	 score: 0.908838025346
Select data from 2016-01-23 on [ 44  56  68  80  92 104 116 128 140]
	 shape: (594, 38)
	 score: 0.904813715905
Select data from 2016-01-29 on [ 56  68  80  92 104 116 128 140]
	 shape: (528, 38)
	 score: 0.914107711282
Select data from 2016-01-27 on [ 44  56  68  80  92 104 116 128 140]
	 shape: (594, 38)
	 score: 0.900150672539
Select data from 2016-01-25 on [ 56  68  80  92 104 116 128 140]
	 shape: (528, 38)
	 score: 0.909535937323
0.90728769565


### Explore which gap is the most common errors

In [369]:
def score4(day, pred):
    a = test_order[day].reindex(index(D_range, S_range[day])).fillna(0)
    ans = a['gap'].values
    pred = pred[ans>0]
    ans = ans[ans>0]
    gap = (ans - pred) / ans
    temp = pd.DataFrame(ans, index=pred.index,columns=[day])
    return temp, pd.DataFrame(np.fabs(gap),columns=[day])

In [381]:
SLOT1 = range(44,152,12)
SLOT2 = range(56,152,12)
RANGE = {'2016-01-23': SLOT1, '2016-01-25': SLOT2, '2016-01-27': SLOT1, '2016-01-29': SLOT2, '2016-01-31': SLOT1}
ans = pd.DataFrame()
scores = pd.DataFrame()
for day in RANGE.keys():
    x = select_last_points(day, RANGE[day])
    a, s = score4(day, pd.Series(clf.predict(x[columns]), index=x.index))
    ans = pd.concat([ans, a], axis=1)
    scores = pd.concat([scores, s], axis=1)

Select data from 2016-01-31 on [ 44  56  68  80  92 104 116 128 140]
	 shape: (594, 38)
Select data from 2016-01-23 on [ 44  56  68  80  92 104 116 128 140]
	 shape: (594, 38)
Select data from 2016-01-29 on [ 56  68  80  92 104 116 128 140]
	 shape: (528, 38)
Select data from 2016-01-27 on [ 44  56  68  80  92 104 116 128 140]
	 shape: (594, 38)
Select data from 2016-01-25 on [ 56  68  80  92 104 116 128 140]
	 shape: (528, 38)


In [382]:
worst = scores[(scores>=0.8).any(1)]
print ans.loc[worst.index]['2016-01-23'].value_counts()[:10]
print ans.loc[worst.index]['2016-01-25'].value_counts()[:10]
print ans.loc[worst.index]['2016-01-27'].value_counts()[:10]
print ans.loc[worst.index]['2016-01-29'].value_counts()[:10]
print ans.loc[worst.index]['2016-01-31'].value_counts()[:10]

1     25
5     17
4     16
2     11
6     11
3      9
9      8
8      6
10     6
Name: 2016-01-23, dtype: int64
1     16
2     16
5     13
3     13
9     11
4      9
7      8
6      6
10     6
Name: 2016-01-25, dtype: int64
1     26
5     17
3     15
4     14
2     12
6      8
7      5
11     4
13     3
22     3
20     3
19     3
15     3
12     3
10     3
Name: 2016-01-27, dtype: int64
1     16
2     12
4     12
6     11
7     11
3     10
5      6
10     5
Name: 2016-01-29, dtype: int64
1     21
2     19
6     12
3      9
5      9
4      8
8      8
10     8
Name: 2016-01-31, dtype: int64


## Method3 by GradientBoosting on regression

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
slot1 = range(45,153,12) # Last time slot of test slot for day 23, 27, 31
slot2 = range(57,153,12) # Last time slot of test slot for day 25, 29
S_range = {'2016-01-23':slot1, '2016-01-25':slot2, '2016-01-27':slot1, '2016-01-29':slot2, '2016-01-31':slot1}

In [None]:
def write3(x, day, slot, regr, mode):
    with open('ans3_v1.csv', mode) as f:
        writer = csv.writer(f, delimiter=',')
        for D in D_range:
            for S in slot:
                key = (D,S)
                if key in x.index:
                    gap = x['gap'].loc[key] + regr.predict(x.loc[key].reshape(1, -1))[0]
                else:
                    gap = 1
                gap = 0 if gap < 0 else gap
                writer.writerow([str(D),'{}-{}'.format(day,S+1), '{:.15f}'.format(gap)])

### Training Regressor

In [None]:
# ans3_v1.csv
# params = {'loss': 'huber', 'alpha': 0.9, 'n_estimators':250, 'max_features':0.5, 'random_state':1,
#           'warm_start':False,'max_depth': 3, 'learning_rate': 0.1, 'subsample': 1.0}
#-----brand new turing
params = {'loss': 'huber', 'alpha': 0.9, 'n_estimators':20, 'max_features':1.0, 'random_state':1,
          'warm_start':False,'max_depth': 10, 'learning_rate': 0.25, 'subsample': 0.85,
          'min_samples_leaf': 25, 'min_samples_split':100}

# Paramters need to be searched
parameters = {'n_estimators': np.arange(100,600,150)}

In [None]:
# Searching best parameters
# scoring_function = make_scorer(mean_squared_error, greater_is_better=False)
# regr = grid_search.GridSearchCV(GradientBoostingRegressor(**params), 
#                                 param_grid=parameters, scoring=scoring_function, cv=3)
# regr.fit(X, Y)
# Regr = regr.best_estimator_

Regr = GradientBoostingRegressor(**params)
Regr.fit(X, Y)
print Regr

### Predict Score

In [None]:
# max_features: sqrt -> 0.5 : 0.538287 -> 0.528706599387
# max_features: 0.5 -> 0.8 : 0.528706599387 -> 0.530272920336
#-----brand new turing
# alpha: 0.9 -> 0.6 : 0.537699383041 -> 0.576867122284
# learning_rate: 0.2 : 0.536744595218
SLOT1 = range(44,152,12)
SLOT2 = range(56,152,12)
RANGE = {'2016-01-23': SLOT1, '2016-01-25': SLOT2, '2016-01-27': SLOT1, '2016-01-29': SLOT2, '2016-01-31': SLOT1}
scores = []
for day in RANGE.keys():
    x = select_last_points(day, RANGE[day])
    score = score3(day, x['gap']+Regr.predict(x))
    scores.append(score * x.shape[0])
    print '\t score: {}'.format(score)
print np.array(scores).sum()/2838

### Writing ANS

In [None]:
SLOT1 = range(45,153,12)
SLOT2 = range(57,153,12)
RANGE = {'2016-01-23': SLOT1, '2016-01-25': SLOT2, '2016-01-27': SLOT1, '2016-01-29': SLOT2, '2016-01-31': SLOT1}
scores = []
for day in RANGE.keys():
    x = select_last_points(day, RANGE[day])
    write3(x, day, S_range[day], Regr, 'a')

## Naive Method 2 by using interpolation

In [None]:
slot1 = range(45,153,12)
slot2 = range(57,153,12)
S_range = {'2016-01-22':slot1, '2016-01-24':slot2, '2016-01-26':slot1, '2016-01-28':slot2, '2016-01-30':slot1}
D_range = range(1,67)

In [None]:
Day_range = {'2016-01-22':pd.date_range('1/2/2016', periods=3, freq='7D'),
             '2016-01-24':pd.date_range('1/3/2016', periods=3, freq='7D'),
             '2016-01-26':pd.date_range('1/5/2016', periods=3, freq='7D'),
             '2016-01-28':pd.date_range('1/7/2016', periods=3, freq='7D'),
             '2016-01-30':pd.date_range('1/9/2016', periods=2, freq='7D')}
def naive_score2(day):
    ans = test_order[day].select(lambda x: x[1] in S_range[day])
    
    
    deltas = []
    for d in Day_range[day]:
        data = train_order[str(d.date())].reindex([x for x in itertools.product(D_range,range(1,145))]).fillna(0)
        data = data.diff().shift(-1)
        data = data.loc[ans.index]
        deltas.append(data)

    delta = deltas[0]
    for i in range(1,len(deltas)):
        delta.add(deltas[i], fill_value=0)
    pred = test_order[day].shift().loc[ans.index].fillna(0)+(delta/len(deltas))
    gap = (ans - pred) / ans

    return gap.abs().sum()/(66*len(S_range[day]))

In [None]:
print naive_score2('2016-01-22')
print naive_score2('2016-01-24')
print naive_score2('2016-01-26')
print naive_score2('2016-01-28')
print naive_score2('2016-01-30')

In [None]:
def slope(day):
    base_points = test_order[day].select(lambda x: x[1] in S_range[day])
    deltas = []
    for d in Day_range[day]:
        data = train_order[str(d.date())].reindex([x for x in itertools.product(D_range,range(1,145))]).fillna(0)
        data = data.diff().shift(-1)
        deltas.append(data)
    delta = deltas[0]
    for i in range(1,len(deltas)):
        delta = delta + deltas[i]
    return base_points, delta/len(deltas)

In [None]:
def write2(day, base_points, slot, delta, mode):
    with open('ans.csv', mode) as f:
        writer = csv.writer(f, delimiter=',')
        for D in range(1,67):
            for S in slot:
                key = (D,S-1)
                if key in base_points.index:
                    gap = base_points['gap'].loc[key] + delta['gap'].loc[key]
                    gap = base_points['gap'].loc[key] if gap < 0 else gap
                else:
                    gap = 0.0
                writer.writerow([D,'{}-{}'.format(day,S), '{:.3f}'.format(gap)])

### Predict Day22

In [None]:
day = '2016-01-22'
base_points, delta = slope(day)
write2(day, base_points, test_slot1, delta, 'w')

### Predict Day24

In [None]:
day = '2016-01-24'
base_points, delta = slope(day)
write2(day, base_points, test_slot2, delta, 'a')

### Predict Day26

In [None]:
day = '2016-01-26'
base_points, delta = slope(day)
write2(day, base_points, test_slot1, delta, 'a')

### Predict Day28

In [None]:
day = '2016-01-28'
base_points, delta = slope(day)
write2(day, base_points, test_slot2, delta, 'a')

### Predict Day30

In [None]:
day = '2016-01-30'
base_points, delta = slope(day)
write2(day, base_points, test_slot1, delta, 'a')

## Predict the score of naive method 1

In [None]:
Day_range = {'2016-01-22':pd.date_range('1/1/2016', periods=3, freq='7D'),
             '2016-01-24':pd.date_range('1/3/2016', periods=3, freq='7D'),
             '2016-01-26':pd.date_range('1/5/2016', periods=3, freq='7D'),
             '2016-01-28':pd.date_range('1/7/2016', periods=3, freq='7D'),
             '2016-01-30':pd.date_range('1/9/2016', periods=2, freq='7D')}
def naive_score1(day):
    ans = test_order[day].select(lambda x: x[1] in S_range[day])
    
    prediction = []
    for d in Day_range[day]:
        data = train_order[str(d.date())]
        data = data.loc[ans.index].fillna(0)
        prediction.append(data)

    pred = prediction[0]
    for i in range(1,len(prediction)):
        pred.add(prediction[i], fill_value=0)
    pred = pred/len(prediction)
    gap = (ans - pred) / ans

    return gap.abs().sum()/(66*len(S_range[day]))

In [None]:
print naive_score1('2016-01-22')
print naive_score1('2016-01-24')
print naive_score1('2016-01-26')
print naive_score1('2016-01-28')
print naive_score1('2016-01-30')

## Naive Method 1 by using mean

In [None]:
test_slot1 = range(46,154,12)
test_slot2 = range(58,154,12)

### Predict Day22

In [None]:
test_day22 = []

In [None]:
for day in pd.date_range('1/1/2016', periods=3, freq='7D'):
    # Read data
    order = pd.read_table(path['order'].format(day.date()), header=None, usecols=[1,3,6],
                      names=['driver', 'district', 'time'])
    # Select NA for calculating the value of gap
    order = order[order['driver'].isnull()] 
    # Translating hash to id
    order['district'] = order['district'].apply(lambda x: district[x])
    # Translating timestamp to time_id
    order['time'] = pd.to_datetime(order['time'])
    order['time'] = (order['time'] - day) / M / 10 + 1
    order['time'] = order['time'].astype(int)
    # Grouping by district and time
    test_day22.append(order.groupby(['district', 'time']))

In [None]:
with open('ans.csv', 'w') as f:
    writer = csv.writer(f, delimiter=',')
    for D in range(1,67):
        for S in test_slot1:
            key = (D,S)
            gap = []
            for data in test_day22:
                if key in data.groups:
                    gap.append(data.get_group(key).shape[0])
                else:
                    gap.append(0)
            writer.writerow([D,'2016-01-22-{}'.format(S), '{:.3f}'.format(np.mean(gap))])

### Predict Day24

In [None]:
test_day24 = []

In [None]:
for day in pd.date_range('1/3/2016', periods=3, freq='7D'):
    # Read data
    order = pd.read_table(path['order'].format(day.date()), header=None, usecols=[1,3,6],
                      names=['driver', 'district', 'time'])
    # Select NA for calculating the value of gap
    order = order[order['driver'].isnull()] 
    # Translating hash to id
    order['district'] = order['district'].apply(lambda x: district[x])
    # Translating timestamp to time_id
    order['time'] = pd.to_datetime(order['time'])
    order['time'] = (order['time'] - day) / M / 10 + 1
    order['time'] = order['time'].astype(int)
    # Grouping by district and time
    test_day24.append(order.groupby(['district', 'time']))

In [None]:
with open('ans.csv', 'a') as f:
    writer = csv.writer(f, delimiter=',')
    for D in range(1,67):
        for S in test_slot2:
            key = (D,S)
            gap = []
            for data in test_day24:
                if key in data.groups:
                    gap.append(data.get_group(key).shape[0])
                else:
                    gap.append(0)
            writer.writerow([D,'2016-01-24-{}'.format(S), '{:.3f}'.format(np.mean(gap))])

### Predict Day26

In [None]:
test_day26 = []

In [None]:
for day in pd.date_range('1/5/2016', periods=3, freq='7D'):
    # Read data
    order = pd.read_table(path['order'].format(day.date()), header=None, usecols=[1,3,6],
                      names=['driver', 'district', 'time'])
    # Select NA for calculating the value of gap
    order = order[order['driver'].isnull()] 
    # Translating hash to id
    order['district'] = order['district'].apply(lambda x: district[x])
    # Translating timestamp to time_id
    order['time'] = pd.to_datetime(order['time'])
    order['time'] = (order['time'] - day) / M / 10 + 1
    order['time'] = order['time'].astype(int)
    # Grouping by district and time
    test_day26.append(order.groupby(['district', 'time']))

In [None]:
with open('ans.csv', 'a') as f:
    writer = csv.writer(f, delimiter=',')
    for D in range(1,67):
        for S in test_slot1:
            key = (D,S)
            gap = []
            for data in test_day26:
                if key in data.groups:
                    gap.append(data.get_group(key).shape[0])
                else:
                    gap.append(0)
            writer.writerow([D,'2016-01-26-{}'.format(S), '{:.3f}'.format(np.mean(gap))])

### Predict Day28

In [None]:
test_day28 = []

In [None]:
for day in pd.date_range('1/7/2016', periods=3, freq='7D'):
    # Read data
    order = pd.read_table(path['order'].format(day.date()), header=None, usecols=[1,3,6],
                      names=['driver', 'district', 'time'])
    # Select NA for calculating the value of gap
    order = order[order['driver'].isnull()] 
    # Translating hash to id
    order['district'] = order['district'].apply(lambda x: district[x])
    # Translating timestamp to time_id
    order['time'] = pd.to_datetime(order['time'])
    order['time'] = (order['time'] - day) / M / 10 + 1
    order['time'] = order['time'].astype(int)
    # Grouping by district and time
    test_day28.append(order.groupby(['district', 'time']))

In [None]:
with open('ans.csv', 'a') as f:
    writer = csv.writer(f, delimiter=',')
    for D in range(1,67):
        for S in test_slot2:
            key = (D,S)
            gap = []
            for data in test_day28:
                if key in data.groups:
                    gap.append(data.get_group(key).shape[0])
                else:
                    gap.append(0)
            writer.writerow([D,'2016-01-28-{}'.format(S), '{:.3f}'.format(np.mean(gap))])

### Predict Day30

In [None]:
test_day30 = []

In [None]:
for day in pd.date_range('1/9/2016', periods=2, freq='7D'):
    # Read data
    order = pd.read_table(path['order'].format(day.date()), header=None, usecols=[1,3,6],
                      names=['driver', 'district', 'time'])
    # Select NA for calculating the value of gap
    order = order[order['driver'].isnull()] 
    # Translating hash to id
    order['district'] = order['district'].apply(lambda x: district[x])
    # Translating timestamp to time_id
    order['time'] = pd.to_datetime(order['time'])
    order['time'] = (order['time'] - day) / M / 10 + 1
    order['time'] = order['time'].astype(int)
    # Grouping by district and time
    test_day30.append(order.groupby(['district', 'time']))

In [None]:
with open('ans.csv', 'a') as f:
    writer = csv.writer(f, delimiter=',')
    for D in range(1,67):
        for S in test_slot1:
            key = (D,S)
            gap = []
            for data in test_day30:
                if key in data.groups:
                    gap.append(data.get_group(key).shape[0])
                else:
                    gap.append(0)
            writer.writerow([D,'2016-01-30-{}'.format(S), '{:.3f}'.format(np.mean(gap))])

## Process order

In [None]:
day = '2016-01-01'
Day = pd.Timestamp(day)

In [None]:
order = pd.read_table(path['order'].format(day), header=None, usecols=[1,3,6],
                      names=['driver', 'district_id', 'time'])
order = order[order['driver'].isnull()] # Select NA for calculating the value of gap

### Translating district hash to id

In [None]:
order['district_id'] = order['district_id'].apply(lambda x: district[x])

### Translating timestamp to slot

In [None]:
order['time'] = pd.to_datetime(order['time'])

In [None]:
order['time_slot'] = (order['time'] - Day) / M / 10 + 1
order['time_slot'] = order['time_slot'].astype(int)