In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
import datetime



In [2]:
# Directory holding the competition CSV files. The loader functions below build
# their read paths from it, and the submission file is written into it as well.
# NOTE(review): hard-coded absolute path on drive F: -- adjust for your machine.
input_path = "F:/meituan_Project/20170930 MDD Cup/data/total"

def load_order_data(file_name, tz=None):
    """Read an order CSV from ``input_path`` and derive calendar columns.

    The ``order_unix_time`` column (epoch seconds) is replaced by a datetime
    column, and three columns are added from it: ``date`` (``'%Y%m%d'``
    string), ``hour`` and ``minute``. Rows whose ``order_unix_time`` is
    missing get missing values in all four columns.

    Parameters
    ----------
    file_name : str
        Name of the CSV file inside ``input_path``.
    tz : str or tzinfo, optional
        Timezone in which the wall-clock time is expressed. ``None`` (the
        default) keeps the historical behaviour of using the local timezone
        of the machine running the notebook; pass an explicit zone to make
        the derived ``date``/``hour``/``minute`` machine-independent.

    Returns
    -------
    pandas.DataFrame
        The loaded frame with the converted and derived columns.
    """
    df = pd.read_csv('%s/%s' % (input_path, file_name))
    c = 'order_unix_time'

    if tz is None:
        # Per-value conversion in the machine's local timezone, exactly as
        # datetime.fromtimestamp does; missing values are skipped -> NaT.
        order_time = pd.to_datetime(
            df[c].map(datetime.datetime.fromtimestamp, na_action='ignore'))
    else:
        # Epoch seconds are UTC-based: convert to the requested zone, then
        # drop the tz info so the column holds naive wall-clock times.
        order_time = (pd.to_datetime(df[c], unit='s', utc=True)
                      .dt.tz_convert(tz)
                      .dt.tz_localize(None))

    # Replace whole columns instead of writing datetimes into a numeric
    # column through a boolean mask (an incompatible-dtype assignment).
    df[c] = order_time
    df['date'] = order_time.dt.strftime('%Y%m%d')
    df['hour'] = order_time.dt.hour
    df['minute'] = order_time.dt.minute
    return df

def load_area_data(file_name):
    """Read a per-area realtime CSV from ``input_path``.

    ``date`` and ``time`` are read as strings. For rows that have a ``time``
    value, its first two characters are parsed as ``hour`` and the rest as
    ``minute``. The ``log_unix_time`` and ``time`` columns are dropped from
    the returned frame.
    """
    def _hour_part(hhmm):
        # First two characters of the time string.
        return int(hhmm[:2])

    def _minute_part(hhmm):
        # Everything after the first two characters.
        return int(hhmm[2:])

    csv_path = '%s/%s' % (input_path, file_name)
    area = pd.read_csv(csv_path, dtype={'date': str, 'time': str})

    has_time = area['time'].notnull()
    time_text = area.loc[has_time, 'time']
    area.loc[has_time, 'hour'] = time_text.apply(_hour_part)
    area.loc[has_time, 'minute'] = time_text.apply(_minute_part)

    return area.drop(['log_unix_time', 'time'], axis=1)

In [3]:
print('loading data...')
df_tr = load_order_data('waybill_info.csv')
# Training rows are restricted to delivery durations strictly inside
# (663.0, 4654.0) and to orders placed in hour 11 or hour 17.
# NOTE(review): the thresholds look like outlier cut-offs -- confirm their origin.
duration_in_range = (df_tr.delivery_duration > 663.0) & (df_tr.delivery_duration < 4654.0)
in_selected_hour = df_tr.hour.isin([11, 17])
mask = duration_in_range & in_selected_hour
df_tr = df_tr.loc[mask]
df_te = load_order_data('waybill_info_test_b.csv')

df_tr_weather = load_area_data('weather_realtime.csv')
df_te_weather = load_area_data('weather_realtime_test.csv')

df_tr_area = load_area_data('area_realtime.csv')
df_te_area = load_area_data('area_realtime_test.csv')

print('merging data...')
# Weather and area snapshots are joined on the same date/time/area key;
# left joins keep every order even when no snapshot matches.
merge_keys = ['date', 'hour', 'minute', 'area_id']
df_tr = (df_tr
         .merge(df_tr_weather, on=merge_keys, how='left')
         .merge(df_tr_area, on=merge_keys, how='left'))

df_te = (df_te
         .merge(df_te_weather, on=merge_keys, how='left')
         .merge(df_te_area, on=merge_keys, how='left'))

loading data...
merging data...


In [4]:
print('constructing training data...')
cols = df_tr.columns.tolist()
# Raw timestamps, identifiers, the target itself and the date string are not
# used as model inputs.
to_drop = ['order_unix_time', 'arriveshop_unix_time', 'fetch_unix_time', 'finish_unix_time', 'order_id', 'delivery_duration', 'date']
# setdiff1d returns the remaining names sorted; convert the NumPy string
# scalars to plain str so the printed list stays readable on every NumPy version.
features = [str(name) for name in np.setdiff1d(cols, to_drop)]
print(features)

x_train = df_tr[features]
y_train = df_tr['delivery_duration']

x_test = df_te[features]
id_test = df_te['order_id']

print(x_train.shape)
print(x_test.shape)

dtrain = xgb.DMatrix(x_train.values, y_train)
dtest = xgb.DMatrix(x_test.values)

print('training model...')
# Only the training set is monitored, so the logged MAE is a training error.
watchlist = [(dtrain, 'train')]
# NOTE(review): 'reg:linear' and 'silent' are older XGBoost spellings (newer
# releases use 'reg:squarederror' and 'verbosity'); kept unchanged so the cell
# still matches the library version it was written for -- confirm before upgrading.
param = {
        'booster': 'gbtree',
        'objective': 'reg:linear',
        'eval_metric': 'mae',
        'eta': 0.15,
        'num_round': 1000,
        'colsample_bytree': 0.65,
        'subsample': 0.8,
        'max_depth': 5,
        'nthread': -1,
        'seed': 20171001,
        'silent': 1,
    }
# 'num_round' is the number of boosting rounds, an argument of xgb.train and
# not a booster parameter, so it is stripped from the dict handed to the booster.
booster_params = {key: value for key, value in param.items() if key != 'num_round'}
bst = xgb.train(booster_params, dtrain, param['num_round'], watchlist, verbose_eval=10)

print('generating prediction...')
pred = bst.predict(dtest)

print('generating submission...')
# Pin the column order explicitly instead of relying on dict ordering, which
# differs between pandas/Python versions.
sub = pd.DataFrame({'order_id': id_test, 'delivery_duration': pred},
                   columns=['order_id', 'delivery_duration'])

print('saving submission...')
sub.to_csv(input_path+'/'+'sub_xgb_starter.csv', index=False)

constructing training data...
['area_id', 'box_total_value', 'customer_latitude', 'customer_longitude', 'delivery_distance', 'deliverying_order_num', 'food_num', 'food_total_value', 'hour', 'minute', 'not_fetched_order_num', 'notbusy_working_rider_num', 'poi_id', 'poi_lat', 'poi_lng', 'rain', 'temperature', 'waiting_order_num', 'wind', 'working_rider_num']
(283815, 20)
(251864, 20)
training model...
[0]	train-mae:1987.16
[10]	train-mae:575.099
[20]	train-mae:459.698
[30]	train-mae:444.07
[40]	train-mae:436.818
[50]	train-mae:431.652
[60]	train-mae:428.265
[70]	train-mae:425.238
[80]	train-mae:422.607
[90]	train-mae:420.692
[100]	train-mae:419.127
[110]	train-mae:417.21
[120]	train-mae:415.581
[130]	train-mae:414.508
[140]	train-mae:413.255
[150]	train-mae:412.037
[160]	train-mae:411.171
[170]	train-mae:410.128
[180]	train-mae:409.113
[190]	train-mae:408.298
[200]	train-mae:407.644
[210]	train-mae:406.968
[220]	train-mae:406.289
[230]	train-mae:405.657
[240]	train-mae:405.061
[250]	trai

In [6]:
# Writes the submission to the same file as the last line of the training cell.
sub.to_csv('{}/{}'.format(input_path, 'sub_xgb_starter.csv'), index=False)