In [1]:
import pandas as pd
from os import walk

"""import the order datas into a dataframe""" 

# Use path to data files.
order_path = "season_2/test_set_2/order_data/" 
_, _, filenames = next(walk(order_path), (None, None, []))

mSeq = len(filenames)
frame = pd.DataFrame()
list_ = []

for i in range(mSeq-1):
    # White Space as Delim, import all the data file.
    df = pd.read_csv(str(order_path) + filenames[i+1],  delim_whitespace = 1, dtype = str, header = None) 
    list_.append(df)
orders = pd.concat(list_)

# Rename all the columns.
orders.columns = ['order_id', 'driver_id', 'passenger_id', 
                  'start_district_hash', 'dest_district_hash', 
                  'price', 'date', 'time']
# Convert the Date and Time column in orders from an object to a datetime value. 
# This makes it easier to work with as a date.
# Extract the year and month from from date_time, and assign them to their own columns.
orders['datetime'] = pd.to_datetime(orders['date'] + ' ' + orders['time'])
orders = orders.drop('date', 1)
orders = orders.drop('time', 1)
orders['day'] = orders["datetime"].dt.day
# Every 10 minutes is considered as a time slot. It can be calculated by following fomula
orders['time_slot'] = orders["datetime"].dt.hour*6 + orders["datetime"].dt.minute//10+1

In [2]:
"""Categorize the start and destination with its id in """

# Import the hash id into a dictionary
cluster_path = "season_1/training_data/cluster_map/"
cluster = {}
with open(cluster_path + "cluster_map") as f:
    for line in f:
       (key, val) = line.split()
       cluster[str(key)] = val
        
# Update the start and destination district ID
orders['start_district_id'] = orders['start_district_hash'].map(cluster)
orders['dest_district_id'] = orders['dest_district_hash'].map(cluster)

In [3]:
# Now try to generate the gap data. We're looking for the orders which were not answered by a driver.
# Hence we extract the orders with missing driver and count the number of them in each distrct and time slot.
# The number of the not answered orders should be the gap we're looking for

gaporders = orders[pd.isnull(orders['driver_id'])]
gap = pd.DataFrame({'gap' : gaporders.groupby( ['day', 'start_district_id', 'time_slot'] ).size()}).reset_index()
gap['key'] = gap.day.astype(str).str.cat(gap.start_district_id.astype(str), sep='_').str.cat(gap.time_slot.astype(str), sep='_')
gap["district_id"] = gap['start_district_id']

In [6]:
# Create a new DataFrame to save the useful information
time_slot = pd.DataFrame({'stat' : orders.groupby(["time_slot"]).size()})
szone = pd.DataFrame({'stat' : orders.groupby(["start_district_id"]).size()})
ezone = pd.DataFrame({'stat' : orders.groupby(["dest_district_id"]).size()})
day = pd.DataFrame({'stat' : orders.groupby(["day"]).size()})
print (len(time_slot),len(szone), len(ezone), len(day))
in_d = pd.DataFrame(columns=['day', 'start_district_id', 'time_slot',"gap"])
n = 1
for d in day.index.values:    
    date = d
    print('day: '+ str(d))
    for z in szone.index.values:
        start = z
        for t in time_slot.index.values:
            time = t
            in_d.loc[n]=[date, start, time, 0.0]
            n+=1

27 66 66 5
day: 23
day: 25
day: 27
day: 29
day: 31


In [7]:
# Create a new column called key which combines day, start district id and time slot. 
in_d['key'] = in_d["day"]
in_d['key'] = in_d.day.astype(int).astype(str).str.cat(in_d.start_district_id.astype(str), sep='_').str.cat(in_d.time_slot.astype(int).astype(str), sep='_')
in_d["district_id"] = in_d["start_district_id"]

In [8]:
# Generate demand, supply and future supply (which depends on the destination of each order). Our naive model 
# will be based on these variables. Future supply will be an interesting attribute to dig in,

demand = pd.DataFrame({'demand' : orders.groupby( ['day', 'start_district_id', 'time_slot'] ).size()}).reset_index()
supply = pd.DataFrame({'demand' : orders.groupby( ['day', 'driver_id', 'time_slot'] ).size()}).reset_index()
fsupply = pd.DataFrame({'future_supply' : orders.groupby( ['day', 'dest_district_id', 'time_slot'] ).size()}).reset_index()
demand["district_id"] = demand['start_district_id']
fsupply["district_id"] = fsupply['dest_district_id']

In [9]:
# fill in the table with the information generated.

def join_value(left, right, key):
    left['key'] = left.day.astype(int).astype(str).str.cat(left.district_id.astype(int).astype(str), sep='_').str.cat(left.time_slot.astype(int).astype(str), sep='_')
    right['key'] = right.day.astype(int).astype(str).str.cat(right.district_id.astype(int).astype(str), sep='_').str.cat(right.time_slot.astype(int).astype(str), sep='_')
    left_a = left.set_index(key)
    right_a = right.set_index(key)

    res = left_a.loc[:, left_a.columns.union(right_a.columns)]
    res.update(right_a)
    return res

res = join_value(in_d, gap, "key")
res = join_value(res, demand, "key")
res = join_value(res, fsupply, "key")
res["time"] = res["time_slot"]/144*24
res.drop(["dest_district_id", "start_district_id"], axis=1,inplace=True) 
res["supply"] = res["demand"]-res["gap"]

print (res.sort_values(["gap"], ascending=False))


            day  demand district_id  future_supply     gap  time_slot  \
key                                                                     
29_51_93   29.0  1735.0          51         1233.0  1185.0       93.0   
25_51_103  25.0  1876.0          51         1202.0  1173.0      103.0   
29_51_128  29.0  1672.0          51          792.0  1037.0      128.0   
23_51_128  23.0  1462.0          51          646.0  1011.0      128.0   
23_51_127  23.0  1466.0          51          671.0   959.0      127.0   
25_51_128  25.0  1385.0          51          616.0   918.0      128.0   
29_51_92   29.0  1358.0          51         1119.0   906.0       92.0   
29_51_91   29.0  1321.0          51         1114.0   902.0       91.0   
25_51_105  25.0  1443.0          51         1005.0   885.0      105.0   
27_51_104  27.0  1493.0          51         1085.0   884.0      104.0   
29_51_103  29.0  1577.0          51         1140.0   841.0      103.0   
27_51_105  27.0  1403.0          51         1054.0 

In [10]:
res['fri'] = res['day']==1
res['sat'] = res['day']==2
res['sun'] = res['day']==3
res['mon'] = res['day']==4
res['tue'] = res['day']==5
res['wed'] = res['day']==6
res['thu'] = res['day']==7

res['gap_slot'] = res['gap']
res['gap_max'] = 100
res['gap_slot'] = res[['gap_slot', 'gap_max']].min(axis=1)
res['gap_slot'] = res['gap_slot']//5

In [11]:
res.shape

(8910, 17)

In [12]:
input = pd.DataFrame(res[['day', 'district_id', 'gap', 'time_slot']])
output = pd.DataFrame(res[['day', 'time_slot', 'gap', 'gap_slot']])

time_slot_dummies = pd.get_dummies(input['time_slot'])

for i in range(142):
    input['time_slot_'+str(i+1)] = 0
for i in list(time_slot_dummies.columns.values):
    input['time_slot_'+str(int(i))] = time_slot_dummies[float(i)]
district_dummies = pd.get_dummies(input['district_id'])
for i in range(65):
    input['district_id_'+str(i+1)] = district_dummies[str(i+1)]
input['fri'] = input['day']//7==1
input['sat'] = input['day']//7==2
input['sun'] = input['day']//7==3
input['mon'] = input['day']//7==4
input['tue'] = input['day']//7==5
input['wed'] = input['day']//7==6

In [13]:
input.shape

(8910, 217)

In [14]:
input = input.reset_index()

In [15]:
for i in range(input.shape[0]-2):
    if i%5000==0:
        print(i)
    input.loc[i+1, 'gap_past1'] = input.loc[i, 'gap']
    input.loc[i+2, 'gap_past2'] = input.loc[i, 'gap']

0
5000


In [16]:
input

Unnamed: 0,key,day,district_id,gap,time_slot,time_slot_1,time_slot_2,time_slot_3,time_slot_4,time_slot_5,...,district_id_64,district_id_65,fri,sat,sun,mon,tue,wed,gap_past1,gap_past2
0,23_1_43,23.0,1,9.0,43.0,0,0,0,0,0,...,0.0,0.0,False,False,True,False,False,False,,
1,23_1_44,23.0,1,11.0,44.0,0,0,0,0,0,...,0.0,0.0,False,False,True,False,False,False,9.0,
2,23_1_45,23.0,1,6.0,45.0,0,0,0,0,0,...,0.0,0.0,False,False,True,False,False,False,11.0,9.0
3,23_1_55,23.0,1,8.0,55.0,0,0,0,0,0,...,0.0,0.0,False,False,True,False,False,False,6.0,11.0
4,23_1_56,23.0,1,6.0,56.0,0,0,0,0,0,...,0.0,0.0,False,False,True,False,False,False,8.0,6.0
5,23_1_57,23.0,1,7.0,57.0,0,0,0,0,0,...,0.0,0.0,False,False,True,False,False,False,6.0,8.0
6,23_1_67,23.0,1,3.0,67.0,0,0,0,0,0,...,0.0,0.0,False,False,True,False,False,False,7.0,6.0
7,23_1_68,23.0,1,4.0,68.0,0,0,0,0,0,...,0.0,0.0,False,False,True,False,False,False,3.0,7.0
8,23_1_69,23.0,1,4.0,69.0,0,0,0,0,0,...,0.0,0.0,False,False,True,False,False,False,4.0,3.0
9,23_1_79,23.0,1,7.0,79.0,0,0,0,0,0,...,0.0,0.0,False,False,True,False,False,False,4.0,4.0


In [17]:
re = input
re['0'] = 1
a = range(0,len(re))
re['0'] = a
re['0'] = re['0']%3
re = re.drop(re[re['0']!=2].index)

In [18]:
re

Unnamed: 0,key,day,district_id,gap,time_slot,time_slot_1,time_slot_2,time_slot_3,time_slot_4,time_slot_5,...,district_id_65,fri,sat,sun,mon,tue,wed,gap_past1,gap_past2,0
2,23_1_45,23.0,1,6.0,45.0,0,0,0,0,0,...,0.0,False,False,True,False,False,False,11.0,9.0,2
5,23_1_57,23.0,1,7.0,57.0,0,0,0,0,0,...,0.0,False,False,True,False,False,False,6.0,8.0,2
8,23_1_69,23.0,1,4.0,69.0,0,0,0,0,0,...,0.0,False,False,True,False,False,False,4.0,3.0,2
11,23_1_81,23.0,1,8.0,81.0,0,0,0,0,0,...,0.0,False,False,True,False,False,False,6.0,7.0,2
14,23_1_93,23.0,1,4.0,93.0,0,0,0,0,0,...,0.0,False,False,True,False,False,False,4.0,3.0,2
17,23_1_105,23.0,1,4.0,105.0,0,0,0,0,0,...,0.0,False,False,True,False,False,False,7.0,16.0,2
20,23_1_117,23.0,1,4.0,117.0,0,0,0,0,0,...,0.0,False,False,True,False,False,False,5.0,10.0,2
23,23_1_129,23.0,1,4.0,129.0,0,0,0,0,0,...,0.0,False,False,True,False,False,False,10.0,8.0,2
26,23_1_141,23.0,1,6.0,141.0,0,0,0,0,0,...,0.0,False,False,True,False,False,False,5.0,4.0,2
29,23_10_45,23.0,10,1.0,45.0,0,0,0,0,0,...,0.0,False,False,True,False,False,False,1.0,0.0,2


In [19]:
re['district_id'] = re['district_id'].astype(int)
re = re.sort_values(['day','time_slot', "district_id"], ascending=True)

In [20]:
re = re.drop('key', 1)
re = re.drop('district_id', 1)
re = re.drop('day', 1)
re = re.drop('time_slot', 1)
re = re.drop('0', 1)

In [24]:
import numpy as np
test_x = re.as_matrix().astype(float)

In [25]:
np.savetxt('test_x.txt', test_x, fmt='%.18e', delimiter=' ', newline='\n', header='', footer='', comments='# ')

In [None]:
re.shape

In [None]:
re['time_slot_']

In [4]:
input

<bound method Kernel.raw_input of <ipykernel.ipkernel.IPythonKernel object at 0x106054be0>>