In [1]:
from pandas.tseries.offsets import *
import pandas as pd
import numpy as np
import csv
import matplotlib.pyplot as plt
import itertools
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn import grid_search
from sklearn.metrics import make_scorer
from sklearn.metrics import mean_squared_error

%matplotlib inline

In [2]:
# Input file templates, keyed by data set and then by table.  A '{}' placeholder
# is filled with the date (YYYY-MM-DD) by the loader functions.
path = {
    'train': {
        'order': './training_data/order_data/order_data_{}',
        'weather': './training_data/weather_data/weather_data_{}',
        'traffic': './training_data/traffic_data/traffic_data_{}',
        'district': './training_data/cluster_map/cluster_map',
        'poi': './training_data/poi_data/poi_data',
    },
    'test': {
        'order': './test_data/order_data/order_data_{}_test',
        'weather': './test_data/weather_data/weather_data_{}_test',
        'traffic': './test_data/traffic_data/traffic_data_{}_test',
        'district': './test_data/cluster_map/cluster_map',
        'poi': './test_data/poi_data/poi_data',
    },
}

# One minute; the unit used when converting timestamps to 10-minute slot numbers.
M = np.timedelta64(1, 'm')

# Slots to predict on the test days.
test_slot1 = range(46,154,12)  # days 22, 26, 30
test_slot2 = range(58,154,12)  # days 24, 28

D_range = range(1,67)   # all district ids
T_range = range(1,145)  # all time slots of a day

# District lookup: maps the cluster map's first column (the index) to its second column.
district_dict = pd.read_table(path['train']['district'], header=None, index_col=0)[1].to_dict()

## Utility Functions

In [3]:
def index(district, slot):
    """Return every (district, slot) combination as a list of tuples.

    Parameters
    ----------
    district, slot : int or iterable
        A single integer is treated as a one-element list; anything else is
        iterated as given.  NumPy integer scalars (for example values taken
        from a pandas index) count as single integers too, instead of being
        iterated and raising TypeError.

    Returns
    -------
    list of tuple
        Pairs in district-major order, suitable for ``DataFrame.reindex``.
    """
    districts = [district] if isinstance(district, (int, np.integer)) else district
    slots = [slot] if isinstance(slot, (int, np.integer)) else slot
    return list(itertools.product(districts, slots))

In [4]:
def District(df):
    """Translate df['district'] through ``district_dict``.

    Returns a Series with one looked-up value per row; a value missing from
    ``district_dict`` raises KeyError.
    """
    def lookup(raw_value):
        return district_dict[raw_value]

    return df['district'].apply(lookup)

In [5]:
def Weekday(df):
    """Return the weekday number (Monday = 0 ... Sunday = 6) of each df['time'] value."""
    stamps = pd.to_datetime(df['time'])
    return stamps.apply(lambda stamp: stamp.weekday())

In [6]:
def Time(df, day):
    """Convert df['time'] timestamps into 1-based 10-minute slot numbers.

    ``day`` is anything ``pd.Timestamp`` accepts and marks the start of slot 1.
    The slot is (minutes elapsed since ``day``) / 10 + 1, truncated to int.
    """
    start = pd.Timestamp(day)
    elapsed_minutes = (pd.to_datetime(df['time']) - start) / M
    slot = elapsed_minutes / 10 + 1
    return slot.astype(int)

In [15]:
def preprocessing(day):
    """Build the feature frame X and the target frame Y for one training day.

    Parameters
    ----------
    day : str
        Key into ``train_order``, ``train_traffic`` and ``train_weather``.

    Returns
    -------
    (X, Y)
        X holds the order gap, the traffic columns and the weather columns for
        every (district, slot) that has both an order row and a traffic row.
        Y holds, for the same rows, the change in gap from this slot to the
        next one in the same district (0 where that change is undefined).
    """
    X = train_order[day].reindex(index(D_range,T_range))
    Y = pd.DataFrame(columns=X.columns,index=X.index)
    for D in D_range:
        # Difference per district so a target never mixes two districts.
        Y.loc[(D, T_range),:] = X.loc[(D, T_range),:].diff().shift(-1).fillna(0)
    X = pd.concat([X,train_traffic[day]],axis=1).dropna()
    X = X.join(train_weather[day], on='time')
    # dropna() above removed rows from X only; keep the matching target rows so
    # that X and Y have the same length when they are passed to an estimator.
    Y = Y.loc[X.index]
    return X, Y

## Getting testing data and training data

In [8]:
def Traffic(day, option):
    """Load one day of traffic data as a frame indexed by (district, time).

    ``day`` is a timestamp with a ``.date()`` method; ``option`` is the key into
    ``path`` ('train' or 'test').  Each LV column in the file holds a
    'label:value' string, of which the integer after the colon is kept.
    The result is sorted by its index.
    """
    date = day.date()
    raw = pd.read_table(path[option]['traffic'].format(date), header=None,
                        names=['district', 'LV1', 'LV2', 'LV3', 'LV4','time'])
    raw['district'] = District(raw)
    # The weekday must be taken before 'time' is overwritten with slot numbers.
    raw['weekday'] = Weekday(raw)
    raw['time'] = Time(raw, str(date))
    for level in range(1,5):
        column = 'LV{}'.format(level)
        raw[column] = raw[column].apply(lambda cell: cell.split(':')[1]).astype(int)
    row_index = pd.MultiIndex.from_arrays([raw['district'].values, raw['time'].values],
                                          names=('district', 'time'))
    columns = {'weekday':raw['weekday'].values,
               'district':raw['district'].values,
               'time':raw['time'].values,
               'LV1':raw['LV1'].values,
               'LV2':raw['LV2'].values,
               'LV3':raw['LV3'].values,
               'LV4':raw['LV4'].values,}
    return pd.DataFrame(columns, index=row_index).sort_index()

In [9]:
# DTG: District Time Gap
def DTG(day, option):
    """Count the orders without a driver per (district, time slot) for one day.

    ``day`` is a timestamp with a ``.date()`` method; ``option`` is the key into
    ``path`` ('train' or 'test').  Returns a frame indexed by (district, time)
    with a single 'gap' column; pairs with no such order are absent.
    """
    df = pd.read_table(path[option]['order'].format(day.date()), header=None, usecols=[1,3,6],
                      names=['driver', 'district', 'time'])
    # Keep only the rows whose driver is missing.  The copy makes the column
    # assignments below act on an independent frame instead of a filtered
    # slice of the original (avoids SettingWithCopyWarning).
    df = df[df['driver'].isnull()].copy()
    df['district'] = District(df)
    df['time'] = Time(df, day.date())
    Order = df.groupby(['district', 'time'])
    return pd.DataFrame({'gap':Order.size()})

In [10]:
def Weather(day, option):
    """Load one day of weather data with one row per slot in ``T_range``.

    When several readings fall into the same slot the first is kept.  Slots
    without a reading are back-filled from the next reading, and any slots
    still empty after that are forward-filled.  The result is indexed by 'time'.
    """
    date = day.date()
    raw = pd.read_table(path[option]['weather'].format(date), header=None,
                        names=['time', 'weather', 'temprature', 'pm2.5'])
    raw['time'] = Time(raw, date)
    observed = raw.drop_duplicates(subset='time').set_index('time')
    # Empty grid covering every slot, with the same columns as the raw file.
    grid = pd.DataFrame({'time': T_range}, columns=raw.columns).set_index('time')
    grid.update(observed)
    return grid.fillna(method='bfill').fillna(method='ffill')

In [None]:
# Dictionary of order data for testing indexed by date, cols = [gap]
test_order = {}
# Dictionary of traffic data for testing indexed by date, cols = [weekday, district, time, LV1, LV2, LV3, LV4]
test_traffic = {}
# Dictionary of weather data for testing indexed by date, cols = [weather, temprature, pm2.5]
test_weather = {}
for day in pd.date_range('1/22/2016', periods=5, freq='2D'):
    date_key = str(day.date())
    test_order[date_key] = DTG(day, 'test')
    test_traffic[date_key] = Traffic(day, 'test')
    # Previously test_weather was declared but never filled.
    test_weather[date_key] = Weather(day, 'test')

In [11]:
# Dictionary of order data for training indexed by date, cols = [gap]
train_order = {}
# Dictionary of traffic data for training indexed by date, cols = [weekday, district, time, LV1, LV2, LV3, LV4]
train_traffic = {}
# Dictionary of weather data for training indexed by date, cols = [weather, temprature, pm2.5]
train_weather = {}
# Load 2016-01-01 .. 2016-01-21: later cells look these dictionaries up by
# dates as late as 2016-01-21, so loading a single day raises KeyError there.
for day in pd.date_range('1/1/2016', periods=21, freq='D'):
    date_key = str(day.date())
    train_order[date_key] = DTG(day, 'train')
    train_traffic[date_key] = Traffic(day, 'train')
    train_weather[date_key] = Weather(day, 'train')

## Preparing training data X and Y

In [None]:
# Preprocess each training day, then stack the per-day frames and sort them by index.
per_day = [preprocessing(str(d.date()))
           for d in pd.date_range('1/1/2016', periods=1, freq='D')]
X = pd.concat([pair[0] for pair in per_day]).sort_index()
# Only the 'gap' column of the target frame is used for fitting.
Y = pd.concat([pair[1] for pair in per_day]).sort_index()['gap']

## Developing Zone

In [14]:
# Inspect the assembled training features.
print(X)

               gap   LV1  LV2  LV3  LV4  district  time  weekday
district time                                                   
1        2       7  1399  318  102   94         1     2        4
         3      10  1491  322   99   64         1     3        4
         4       5  1490  287   98   78         1     4        4
         5       1  1425  302   95   51         1     5        4
         6       1  1327  313   94   66         1     6        4
         7       6  1361  258   68   55         1     7        4
         8       2  1395  280   97   69         1     8        4
         9       6  1348  272   93   97         1     9        4
         10      6  1417  236   93   53         1    10        4
         11      2  1316  241   82   64         1    11        4
         12      3  1393  254   65   65         1    12        4
         13      1  1339  241   87   69         1    13        4
         14      4  1390  198   69   47         1    14        4
         15      3  1303 

## Method 3 by GradientBoosting

In [None]:
# Slot immediately before each slot to predict (one less than test_slot1 / test_slot2).
slot1 = range(45,153,12)  # days 22, 26, 30
slot2 = range(57,153,12)  # days 24, 28
# Test day -> its slots.
S_range = dict.fromkeys(['2016-01-22', '2016-01-26', '2016-01-30'], slot1)
S_range.update(dict.fromkeys(['2016-01-24', '2016-01-28'], slot2))

In [None]:
def write3(x, district, day, slot, regr, mode):
    """Write one district's predictions for ``day`` to 'ans3.csv'.

    Parameters
    ----------
    x : DataFrame indexed by (district, time) with a 'gap' column; the whole
        row is passed to the model as its feature vector.
    district : district id to write.
    day : date string used in the output label.
    slot : iterable of slots to predict.
    regr : fitted model whose ``predict`` returns the expected change in gap.
    mode : file mode passed to ``open`` ('w' or 'a').

    For each slot S the prediction is the gap at S-1 plus the predicted change,
    clipped at 0; when no row exists for S-1 the prediction is 0.
    Each output row is: district, '<day>-<S>', gap with three decimals.
    """
    with open('ans3.csv', mode) as f:
        writer = csv.writer(f, delimiter=',')
        for S in slot:
            key = (district,S-1)
            if key in x.index:
                # Series.reshape no longer exists in pandas; reshape the
                # underlying array into the single-sample 2-D form predict expects.
                features = x.loc[key].values.reshape(1, -1)
                gap = x['gap'].loc[key] + regr.predict(features)[0]
            else:
                gap = 0
            gap = 0 if gap < 0 else gap
            writer.writerow([str(district),'{}-{}'.format(day,S), '{:.3f}'.format(gap)])

In [None]:
def score3(day, pred):
    """Mean absolute relative error of ``pred`` against the true gaps of ``day``.

    The true gaps are read from ``test_order[day]`` on the grid
    D_range x S_range[day], with missing pairs counted as 0.  Only positions
    whose true gap is positive enter the average.  ``pred`` must be ordered
    like that grid.
    """
    truth = test_order[day].reindex(index(D_range, S_range[day])).fillna(0)['gap'].values
    positive = truth > 0
    kept_truth = truth[positive]
    relative_error = (kept_truth - pred[positive]) / kept_truth
    return np.fabs(relative_error).sum()/kept_truth.shape[0]

### Predict Day22

In [None]:
# Day whose score is evaluated below.
day = '2016-01-22'
# all areas mixed: 0.770059708462
params = dict(loss='huber', alpha=0.9, n_estimators=250,
              max_depth=3, learning_rate=0.1, subsample=1.0)

# Parameter grid for the (currently disabled) grid search.
parameters = dict(n_estimators=np.arange(100,600,150))

In [None]:
# Grid search over `parameters` (disabled):
# scoring_function = make_scorer(mean_squared_error, greater_is_better=False)
# regr = grid_search.GridSearchCV(GradientBoostingRegressor(loss='quantile'), 
#                                 param_grid=parameters, scoring=scoring_function, cv=3)
# regr.fit(X, Y)
# Regr = regr.best_estimator_

# Fit a single model with the hand-picked `params`; fit() returns the estimator.
Regr = GradientBoostingRegressor(**params).fit(X, Y)
print(Regr)

In [None]:
# Scores recorded while changing one parameter at a time (before -> after):
# alpha: 0.9 -> 0.1 : 0.775124360324 -> 1.03167874288
# loss: quantile -> ls : 0.775124360324 -> 0.686632655895
# loss: quantile -> huber : 0.775124360324 -> 0.559654872126
# learning_rate: 0.1 -> 0.9 : 0.559654872126 -> 0.584819846794
# max_depth: 3 -> 5 : 0.584819846794 -> 0.624282373776
# n_estimators: 250 -> 400 : 0.584819846794 -> 0.607329151263
# subsample: 1.0 -> 0.5 : 0.559654872126 -> 0.56019559633

# Feature slots: one slot before the slots scored by score3 (S_range[day]).
SLOT1 = range(44,152,12)
x = test_order[day].reindex(index(D_range,SLOT1)).fillna(0)
# The join_axes argument was removed from pd.concat; reindexing the concatenated
# frame to x's index selects the same rows.
x = pd.concat([x,test_traffic[day]], axis=1).reindex(x.index).fillna(0)
# NOTE(review): preprocessing() also joins weather columns into the training
# features, while x holds only order and traffic columns - confirm that the
# feature sets match before calling predict.
print(score3(day,x['gap']+Regr.predict(x)))

### Predict Day24

In [None]:
day = '2016-01-24'
# Keep the test rows whose time slot (index level 1) is in S_range[day].
# DataFrame.select was removed from pandas, so filter with an index mask.
in_slots = test_order[day].index.get_level_values(1).isin(list(S_range[day]))
x = test_order[day][in_slots]
x = pd.concat([x,test_traffic[day]],axis=1).fillna(0)

# Train on the same weekday of the three preceding weeks.
X = []
Y = []
for d in ['2016-01-03','2016-01-10','2016-01-17']:
    tempX, tempY = preprocessing(d)
    X.append(tempX)
    Y.append(tempY)
X = pd.concat(X)
Y = pd.concat(Y)

In [None]:
# The training data does not depend on the district, so fit the model once
# instead of re-fitting an identical model for each of the 66 districts.
regr = GradientBoostingRegressor(**params)
regr.fit(X,Y['gap'])
# NOTE(review): every write3 call in this section opens ans3.csv in append
# mode, so re-running these cells adds duplicate rows - confirm whether the
# first day written should use 'w'.
for D in D_range:
    write3(x, D, day, test_slot2, regr, 'a')

### Predict Day26

In [None]:
day = '2016-01-26'
# Keep the test rows whose time slot (index level 1) is in S_range[day].
# DataFrame.select was removed from pandas, so filter with an index mask.
in_slots = test_order[day].index.get_level_values(1).isin(list(S_range[day]))
x = test_order[day][in_slots]
x = pd.concat([x,test_traffic[day]],axis=1).fillna(0)

# Train on the same weekday of the three preceding weeks.
X = []
Y = []
for d in ['2016-01-05','2016-01-12','2016-01-19']:
    tempX, tempY = preprocessing(d)
    X.append(tempX)
    Y.append(tempY)
X = pd.concat(X)
Y = pd.concat(Y)

In [None]:
# The training data does not depend on the district, so fit the model once
# instead of re-fitting an identical model for each of the 66 districts.
regr = GradientBoostingRegressor(**params)
regr.fit(X,Y['gap'])
for D in D_range:
    write3(x, D, day, test_slot1, regr, 'a')

### Predict Day28

In [None]:
day = '2016-01-28'
# Keep the test rows whose time slot (index level 1) is in S_range[day].
# DataFrame.select was removed from pandas, so filter with an index mask.
in_slots = test_order[day].index.get_level_values(1).isin(list(S_range[day]))
x = test_order[day][in_slots]
x = pd.concat([x,test_traffic[day]],axis=1).fillna(0)

# Train on the same weekday of the three preceding weeks.
X = []
Y = []
for d in ['2016-01-07','2016-01-14','2016-01-21']:
    tempX, tempY = preprocessing(d)
    X.append(tempX)
    Y.append(tempY)
X = pd.concat(X)
Y = pd.concat(Y)

In [None]:
# The training data does not depend on the district, so fit the model once
# instead of re-fitting an identical model for each of the 66 districts.
regr = GradientBoostingRegressor(**params)
regr.fit(X,Y['gap'])
for D in D_range:
    write3(x, D, day, test_slot2, regr, 'a')

### Predict Day30

In [None]:
day = '2016-01-30'
# Keep the test rows whose time slot (index level 1) is in S_range[day].
# DataFrame.select was removed from pandas, so filter with an index mask.
in_slots = test_order[day].index.get_level_values(1).isin(list(S_range[day]))
x = test_order[day][in_slots]
x = pd.concat([x,test_traffic[day]],axis=1).fillna(0)

# Train on the same weekday of the two listed earlier weeks.
X = []
Y = []
for d in ['2016-01-09','2016-01-16']:
    tempX, tempY = preprocessing(d)
    X.append(tempX)
    Y.append(tempY)
X = pd.concat(X)
Y = pd.concat(Y)

In [None]:
# The training data does not depend on the district, so fit the model once
# instead of re-fitting an identical model for each of the 66 districts.
regr = GradientBoostingRegressor(**params)
regr.fit(X,Y['gap'])
for D in D_range:
    write3(x, D, day, test_slot1, regr, 'a')

## Naive Method 2 by using interpolation

In [None]:
# Slot immediately before each slot to predict.
slot1 = range(45,153,12)  # days 22, 26, 30
slot2 = range(57,153,12)  # days 24, 28
# Test day -> its slots.
S_range = dict.fromkeys(['2016-01-22', '2016-01-26', '2016-01-30'], slot1)
S_range.update(dict.fromkeys(['2016-01-24', '2016-01-28'], slot2))
D_range = range(1,67)  # all district ids

In [None]:
# Test day -> training days used as its reference: the same weekday in the
# preceding weeks.  Day 22 must start at 1/1/2016 (2016-01-22 minus 21 days);
# starting at 1/2/2016 would compare it with the following weekday.
Day_range = {'2016-01-22':pd.date_range('1/1/2016', periods=3, freq='7D'),
             '2016-01-24':pd.date_range('1/3/2016', periods=3, freq='7D'),
             '2016-01-26':pd.date_range('1/5/2016', periods=3, freq='7D'),
             '2016-01-28':pd.date_range('1/7/2016', periods=3, freq='7D'),
             '2016-01-30':pd.date_range('1/9/2016', periods=2, freq='7D')}
def naive_score2(day):
    """Score the interpolation baseline on test day ``day``.

    The prediction is the preceding row of ``test_order[day]`` plus the average
    slot-to-slot change observed on the reference days ``Day_range[day]``.
    Returns the summed absolute relative error divided by
    66 * len(S_range[day]), as the result of a DataFrame column sum.
    """
    # DataFrame.select was removed from pandas; filter on index level 1 (time).
    test = test_order[day]
    ans = test[test.index.get_level_values(1).isin(list(S_range[day]))]

    deltas = []
    for d in Day_range[day]:
        data = train_order[str(d.date())].reindex([x for x in itertools.product(D_range,range(1,145))]).fillna(0)
        # Change from each slot to the next one.
        data = data.diff().shift(-1)
        data = data.loc[ans.index]
        deltas.append(data)

    delta = deltas[0]
    for i in range(1,len(deltas)):
        # DataFrame.add returns a new frame; the result must be kept, otherwise
        # only the first reference day contributes to the average.
        delta = delta.add(deltas[i], fill_value=0)
    pred = test_order[day].shift().loc[ans.index].fillna(0)+(delta/len(deltas))
    gap = (ans - pred) / ans

    return gap.abs().sum()/(66*len(S_range[day]))

In [None]:
# Score the interpolation baseline on every test day.
for scored_day in ['2016-01-22', '2016-01-24', '2016-01-26', '2016-01-28', '2016-01-30']:
    print(naive_score2(scored_day))

In [None]:
def slope(day):
    """Return the base gaps and the average slot-to-slot change for ``day``.

    Returns
    -------
    (base_points, delta)
        base_points: rows of ``test_order[day]`` whose slot is in S_range[day].
        delta: for every (district, slot) on the full grid, the change in gap
        to the next slot, averaged over the reference days ``Day_range[day]``.
    """
    # DataFrame.select was removed from pandas; filter on index level 1 (time).
    test = test_order[day]
    base_points = test[test.index.get_level_values(1).isin(list(S_range[day]))]
    deltas = []
    for d in Day_range[day]:
        data = train_order[str(d.date())].reindex([x for x in itertools.product(D_range,range(1,145))]).fillna(0)
        data = data.diff().shift(-1)
        deltas.append(data)
    delta = deltas[0]
    for i in range(1,len(deltas)):
        delta = delta + deltas[i]
    return base_points, delta/len(deltas)

In [None]:
def write2(day, base_points, slot, delta, mode):
    """Write the interpolation predictions for ``day`` to 'ans.csv'.

    For districts 1..66 and each slot S in ``slot`` the prediction is the base
    gap at S-1 plus ``delta`` at S-1; when that sum is negative the base gap is
    written instead, and when no base row exists for S-1 the prediction is 0.
    ``mode`` is the file mode passed to ``open``.  Each output row is:
    district, '<day>-<S>', gap with three decimals.
    """
    known = base_points.index
    with open('ans.csv', mode) as out:
        rows = csv.writer(out, delimiter=',')
        for D in range(1,67):
            for S in slot:
                previous = (D,S-1)
                if previous not in known:
                    value = 0.0
                else:
                    base = base_points['gap'].loc[previous]
                    value = base + delta['gap'].loc[previous]
                    if value < 0:
                        value = base
                rows.writerow([D,'{}-{}'.format(day,S), '{:.3f}'.format(value)])

### Predict Day22

In [None]:
# First day written: mode 'w' starts ans.csv from scratch.
day = '2016-01-22'
base_points, delta = slope(day)
write2(day=day, base_points=base_points, slot=test_slot1, delta=delta, mode='w')

### Predict Day24

In [None]:
# Append this day's predictions to ans.csv.
day = '2016-01-24'
base_points, delta = slope(day)
write2(day=day, base_points=base_points, slot=test_slot2, delta=delta, mode='a')

### Predict Day26

In [None]:
# Append this day's predictions to ans.csv.
day = '2016-01-26'
base_points, delta = slope(day)
write2(day=day, base_points=base_points, slot=test_slot1, delta=delta, mode='a')

### Predict Day28

In [None]:
# Append this day's predictions to ans.csv.
day = '2016-01-28'
base_points, delta = slope(day)
write2(day=day, base_points=base_points, slot=test_slot2, delta=delta, mode='a')

### Predict Day30

In [None]:
# Append this day's predictions to ans.csv.
day = '2016-01-30'
base_points, delta = slope(day)
write2(day=day, base_points=base_points, slot=test_slot1, delta=delta, mode='a')

## Predict the score of naive method 1

In [None]:
# Test day -> weekly-spaced training days used as its reference,
# given as (test day, first reference day, number of weeks).
reference_weeks = [('2016-01-22', '1/1/2016', 3),
                   ('2016-01-24', '1/3/2016', 3),
                   ('2016-01-26', '1/5/2016', 3),
                   ('2016-01-28', '1/7/2016', 3),
                   ('2016-01-30', '1/9/2016', 2)]
Day_range = {}
for test_day, first_day, weeks in reference_weeks:
    Day_range[test_day] = pd.date_range(first_day, periods=weeks, freq='7D')
def naive_score1(day):
    """Score the mean baseline on test day ``day``.

    The prediction for each (district, slot) is the gap at the same pair
    averaged over the reference days ``Day_range[day]``; pairs absent from a
    reference day count as 0.  Returns the summed absolute relative error
    divided by 66 * len(S_range[day]), as the result of a DataFrame column sum.
    """
    # DataFrame.select was removed from pandas; filter on index level 1 (time).
    test = test_order[day]
    ans = test[test.index.get_level_values(1).isin(list(S_range[day]))]

    prediction = []
    for d in Day_range[day]:
        data = train_order[str(d.date())]
        # reindex rather than .loc: labels missing from the reference day
        # become NaN (then 0) instead of raising KeyError.
        data = data.reindex(ans.index).fillna(0)
        prediction.append(data)

    pred = prediction[0]
    for i in range(1,len(prediction)):
        # DataFrame.add returns a new frame; the result must be kept, otherwise
        # only the first reference day contributes to the mean.
        pred = pred.add(prediction[i], fill_value=0)
    pred = pred/len(prediction)
    gap = (ans - pred) / ans

    return gap.abs().sum()/(66*len(S_range[day]))

In [None]:
# Score the mean baseline on every test day.
for scored_day in ['2016-01-22', '2016-01-24', '2016-01-26', '2016-01-28', '2016-01-30']:
    print(naive_score1(scored_day))

## Naive Method 1 by using mean

In [None]:
# Slots to predict: test_slot1 for days 22/26/30, test_slot2 for days 24/28.
test_slot1, test_slot2 = range(46,154,12), range(58,154,12)

### Predict Day22

In [None]:
# Collects one (district, time) groupby per reference day for predicting day 22.
test_day22 = []

In [None]:
for day in pd.date_range('1/1/2016', periods=3, freq='7D'):
    # Read the training orders (`path` is keyed by data set first, then by table)
    order = pd.read_table(path['train']['order'].format(day.date()), header=None, usecols=[1,3,6],
                      names=['driver', 'district', 'time'])
    # Select NA for calculating the value of gap; copy so that the column
    # assignments below do not act on a slice of the original frame
    order = order[order['driver'].isnull()].copy()
    # Translating hash to id (the lookup table is named district_dict)
    order['district'] = order['district'].apply(lambda x: district_dict[x])
    # Translating timestamp to time_id
    order['time'] = pd.to_datetime(order['time'])
    order['time'] = (order['time'] - day) / M / 10 + 1
    order['time'] = order['time'].astype(int)
    # Grouping by district and time
    test_day22.append(order.groupby(['district', 'time']))

In [None]:
# For every (district, slot), write the mean number of driverless orders over
# the reference days; a pair absent from a day counts as 0 for that day.
with open('ans.csv', 'w') as f:
    writer = csv.writer(f, delimiter=',')
    for D in range(1,67):
        for S in test_slot1:
            key = (D,S)
            gap = [data.get_group(key).shape[0] if key in data.groups else 0
                   for data in test_day22]
            writer.writerow([D,'2016-01-22-{}'.format(S), '{:.3f}'.format(np.mean(gap))])

### Predict Day24

In [None]:
# Collects one (district, time) groupby per reference day for predicting day 24.
test_day24 = []

In [None]:
for day in pd.date_range('1/3/2016', periods=3, freq='7D'):
    # Read the training orders (`path` is keyed by data set first, then by table)
    order = pd.read_table(path['train']['order'].format(day.date()), header=None, usecols=[1,3,6],
                      names=['driver', 'district', 'time'])
    # Select NA for calculating the value of gap; copy so that the column
    # assignments below do not act on a slice of the original frame
    order = order[order['driver'].isnull()].copy()
    # Translating hash to id (the lookup table is named district_dict)
    order['district'] = order['district'].apply(lambda x: district_dict[x])
    # Translating timestamp to time_id
    order['time'] = pd.to_datetime(order['time'])
    order['time'] = (order['time'] - day) / M / 10 + 1
    order['time'] = order['time'].astype(int)
    # Grouping by district and time
    test_day24.append(order.groupby(['district', 'time']))

In [None]:
# For every (district, slot), append the mean number of driverless orders over
# the reference days; a pair absent from a day counts as 0 for that day.
with open('ans.csv', 'a') as f:
    writer = csv.writer(f, delimiter=',')
    for D in range(1,67):
        for S in test_slot2:
            key = (D,S)
            gap = [data.get_group(key).shape[0] if key in data.groups else 0
                   for data in test_day24]
            writer.writerow([D,'2016-01-24-{}'.format(S), '{:.3f}'.format(np.mean(gap))])

### Predict Day26

In [None]:
# Collects one (district, time) groupby per reference day for predicting day 26.
test_day26 = []

In [None]:
for day in pd.date_range('1/5/2016', periods=3, freq='7D'):
    # Read the training orders (`path` is keyed by data set first, then by table)
    order = pd.read_table(path['train']['order'].format(day.date()), header=None, usecols=[1,3,6],
                      names=['driver', 'district', 'time'])
    # Select NA for calculating the value of gap; copy so that the column
    # assignments below do not act on a slice of the original frame
    order = order[order['driver'].isnull()].copy()
    # Translating hash to id (the lookup table is named district_dict)
    order['district'] = order['district'].apply(lambda x: district_dict[x])
    # Translating timestamp to time_id
    order['time'] = pd.to_datetime(order['time'])
    order['time'] = (order['time'] - day) / M / 10 + 1
    order['time'] = order['time'].astype(int)
    # Grouping by district and time
    test_day26.append(order.groupby(['district', 'time']))

In [None]:
# For every (district, slot), append the mean number of driverless orders over
# the reference days; a pair absent from a day counts as 0 for that day.
with open('ans.csv', 'a') as f:
    writer = csv.writer(f, delimiter=',')
    for D in range(1,67):
        for S in test_slot1:
            key = (D,S)
            gap = [data.get_group(key).shape[0] if key in data.groups else 0
                   for data in test_day26]
            writer.writerow([D,'2016-01-26-{}'.format(S), '{:.3f}'.format(np.mean(gap))])

### Predict Day28

In [None]:
# Collects one (district, time) groupby per reference day for predicting day 28.
test_day28 = []

In [None]:
for day in pd.date_range('1/7/2016', periods=3, freq='7D'):
    # Read the training orders (`path` is keyed by data set first, then by table)
    order = pd.read_table(path['train']['order'].format(day.date()), header=None, usecols=[1,3,6],
                      names=['driver', 'district', 'time'])
    # Select NA for calculating the value of gap; copy so that the column
    # assignments below do not act on a slice of the original frame
    order = order[order['driver'].isnull()].copy()
    # Translating hash to id (the lookup table is named district_dict)
    order['district'] = order['district'].apply(lambda x: district_dict[x])
    # Translating timestamp to time_id
    order['time'] = pd.to_datetime(order['time'])
    order['time'] = (order['time'] - day) / M / 10 + 1
    order['time'] = order['time'].astype(int)
    # Grouping by district and time
    test_day28.append(order.groupby(['district', 'time']))

In [None]:
# For every (district, slot), append the mean number of driverless orders over
# the reference days; a pair absent from a day counts as 0 for that day.
with open('ans.csv', 'a') as f:
    writer = csv.writer(f, delimiter=',')
    for D in range(1,67):
        for S in test_slot2:
            key = (D,S)
            gap = [data.get_group(key).shape[0] if key in data.groups else 0
                   for data in test_day28]
            writer.writerow([D,'2016-01-28-{}'.format(S), '{:.3f}'.format(np.mean(gap))])

### Predict Day30

In [None]:
# Collects one (district, time) groupby per reference day for predicting day 30.
test_day30 = []

In [None]:
for day in pd.date_range('1/9/2016', periods=2, freq='7D'):
    # Read the training orders (`path` is keyed by data set first, then by table)
    order = pd.read_table(path['train']['order'].format(day.date()), header=None, usecols=[1,3,6],
                      names=['driver', 'district', 'time'])
    # Select NA for calculating the value of gap; copy so that the column
    # assignments below do not act on a slice of the original frame
    order = order[order['driver'].isnull()].copy()
    # Translating hash to id (the lookup table is named district_dict)
    order['district'] = order['district'].apply(lambda x: district_dict[x])
    # Translating timestamp to time_id
    order['time'] = pd.to_datetime(order['time'])
    order['time'] = (order['time'] - day) / M / 10 + 1
    order['time'] = order['time'].astype(int)
    # Grouping by district and time
    test_day30.append(order.groupby(['district', 'time']))

In [None]:
# For every (district, slot), append the mean number of driverless orders over
# the reference days; a pair absent from a day counts as 0 for that day.
with open('ans.csv', 'a') as f:
    writer = csv.writer(f, delimiter=',')
    for D in range(1,67):
        for S in test_slot1:
            key = (D,S)
            gap = [data.get_group(key).shape[0] if key in data.groups else 0
                   for data in test_day30]
            writer.writerow([D,'2016-01-30-{}'.format(S), '{:.3f}'.format(np.mean(gap))])

## Process order

In [None]:
# Day to process, as a string for file names and as a Timestamp for slot arithmetic.
day = '2016-01-01'
Day = pd.to_datetime(day)

In [None]:
# `path` is keyed by data set first ('train' / 'test'), then by table.
order = pd.read_table(path['train']['order'].format(day), header=None, usecols=[1,3,6],
                      names=['driver', 'district_id', 'time'])
# Select NA for calculating the value of gap; copy so that later column
# assignments do not act on a slice of the original frame.
order = order[order['driver'].isnull()].copy()

### Translating district hash to id

In [None]:
# The hash -> id lookup table is named district_dict; `district` is not defined.
order['district_id'] = order['district_id'].apply(lambda x: district_dict[x])

### Translating timestamp to slot

In [None]:
# Parse the raw 'time' column into pandas datetimes so it can be subtracted from Day.
order['time'] = pd.to_datetime(order['time'])

In [None]:
# Slot number = minutes elapsed since the start of Day / 10 + 1, truncated to int.
elapsed_minutes = (order['time'] - Day) / M
order['time_slot'] = (elapsed_minutes / 10 + 1).astype(int)