In [2]:
# to clean and pre-process the data
# import data using pandas
import glob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [None]:
# label the cluster map
# labels:
# region_hash, region_id

columns = ['region_hash', 'region_id']
# read the cluster map
cluster_map = pd.read_csv('../dataset/training_data/cluster_map/cluster_map', sep='\t', on_bad_lines='skip', header=None, names=columns)
print('cluster_map finished')

print('cluster_map.head(): \n', cluster_map.head())


cluster_map.to_csv('../dataset/labeledData/cluster_map.csv', index=False)



In [None]:
# label the orders data
# labels:
# order_id, driver_id, passenger_id, start_district_hash, dest_district_hash, price, time
columns = ['order_id', 'driver_id', 'passenger_id', 'start_region_hash', 'dest_region_hash', 'price', 'time']

# read the orders data
orders_data = []
for f in glob.glob('../dataset/training_data/order_data/order_data_*'):
    # file name
    print('filename: ', f)
    df = pd.read_csv(f, sep='\t', on_bad_lines='skip', header=None, names=columns)
    orders_data.append(df)

print('orders_data finished')
orders_data = pd.concat(orders_data,  ignore_index=True)

# print('orders_data.head(): ', orders_data.head())
orders_data.to_csv('../dataset/labeledData/orders_data.csv', index=False)


In [None]:
# label the weather data
# labels:
# time, weather, temperature, pm25
columns = ['time', 'weather', 'temperature', 'pm25']

# print('weather_data.head(): \n', weather_data.head())


# # read the weather data
weather_data = []
for f in glob.glob('../dataset/training_data/weather_data/weather_data_*'):
    # file name
    print('filename: ', f)
    df = pd.read_csv(f, sep='\t', on_bad_lines='skip', header=None, names=columns)
    weather_data.append(df)

print('weather_data finished')
weather_data = pd.concat(weather_data, ignore_index=True)


# weather_data.to_csv('../dataset/labeledData/weather_data.csv', index=False)


In [None]:
# label the poi data
# labels:
# region_hash, poi_id 
# 1st column: district_hash
# whole next column is: poi_id
columns = ['region_hash', 'poi_id']



# read the poi data
poi_data = pd.read_csv('../dataset/training_data/poi_data/poi_data', sep='\t', header=None, on_bad_lines='skip')

# extract the district_hash column and the POI ID columns
district_hash = poi_data.iloc[:, 0]
poi_ids = poi_data.iloc[:, 1:]

# combine all the POI IDs for each row into a list
poi_ids_list = poi_ids.apply(lambda x: x.tolist(), axis=1)

# combine the district_hash and poi_ids_list into a new DataFrame
labeled_poi_data = pd.concat([district_hash, poi_ids_list], axis=1)
labeled_poi_data.columns = ['region_hash', 'poi_ids']

# print the result
# print(labeled_poi_data.head())

# updated list
updated_list = []

# convert the column of lists to a list of lists
list_of_lists_poi_id = labeled_poi_data['poi_ids'].tolist()

# poi format poi_id = class1#class2:numofFacilities
# seperate numofFacilities from list_of_lists_poi_id and sum them up

# for each list in list_of_lists_poi_id 
# change the list of poi_id to weighted sum of numofFacilities 
for poi_list in list_of_lists_poi_id:
    weighted_sum = 0
    for poi in poi_list:
        if(pd.isna(poi)==False):
            poi_id, num_of_facilities = poi.split(':')
            poi_class = poi_id.split('#')
            # combine the class1 and class2 numbers
            if(len(poi_class) == 1):
                poi_class[0] = '0' + poi_class[0]
            else:
                poi_number = poi_class[0] + '' + poi_class[1]
            weighted_sum += int(num_of_facilities) * int(poi_number)

    updated_list.append(weighted_sum)

# print(list_of_lists_poi_id)

# change labeled_poi_data['poi_ids'] to list_of_lists_poi_id
labeled_poi_data['poi_ids'] = updated_list

print('labeled_poi_data.head(): ', labeled_poi_data.head())

labeled_poi_data.to_csv('../dataset/labeledData/poi_data.csv', index=False)



In [3]:
# read from the labeled data
cluster_map = pd.read_csv('../dataset/labeledData/cluster_map.csv')
orders_data = pd.read_csv('../dataset/labeledData/orders_data.csv')
weather_data = pd.read_csv('../dataset/labeledData/weather_data.csv')
poi_data = pd.read_csv('../dataset/labeledData/poi_data.csv')


In [4]:
# map time to time slot
# devide day in 10 min time slots (144 time slots)


# convert time to datetime
orders_data['time'] = pd.to_datetime(orders_data['time'])
weather_data['time'] = pd.to_datetime(weather_data['time'])

# map time to time slot
orders_data['time_slot'] = orders_data['time'].dt.hour * 6 + orders_data['time'].dt.minute // 10
weather_data['time_slot'] = weather_data['time'].dt.hour * 6 + weather_data['time'].dt.minute // 10

# map time to time slot with weekday
orders_data['weekday'] = orders_data['time'].dt.weekday
weather_data['weekday'] = weather_data['time'].dt.weekday

# remove the time column
orders_data = orders_data.drop(['time'], axis=1)
weather_data = weather_data.drop(['time'], axis=1)

print(orders_data.head())
print(weather_data.head())

                           order_id                         driver_id  \
0  97ebd0c6680f7c0535dbfdead6e51b4b  dd65fa250fca2833a3a8c16d2cf0457c   
1  92c3ac9251cc9b5aab90b114a1e363be  c077e0297639edcb1df6189e8cda2c3d   
2  abeefc3e2aec952468e2fd42a1649640  86dbc1b68de435957c61b5a523854b69   
3  cb31d0be64cda3cc66b46617bf49a05c  4fadfa6eeaa694742de036dddf02b0c4   
4  139d492189ae5a933122c098f63252b3                               NaN   

                       passenger_id                 start_region_hash  \
0  ed180d7daf639d936f1aeae4f7fb482f  4725c39a5e5f4c188d382da3910b3f3f   
1  191a180f0a262aff3267775c4fac8972  82cc4851f9e4faa4e54309f8bb73fd7c   
2  7029e813bb3de8cc73a8615e2785070c  fff4e8465d1e12621bc361276b6217cf   
3  21dc133ac68e4c07803d1c2f48988a83  4b7f6f4e2bf237b6cc58f57142bea5c0   
4  26963cc76da2d8450d8f23fc357db987  fc34648599753c9e74ab238e9a4a07ad   

                   dest_region_hash  price  time_slot  weekday  
0  3e12208dd0be281c92a6ab57d9a6fb32   24.0         81    

In [5]:
# group the orders data by time slot 
# aggregate count the number of orders where driver_id = NULL

# this is supply demand deficit - order gap
orders_data_grouped = orders_data[orders_data['driver_id'].isnull()].groupby(['start_region_hash','dest_region_hash','time_slot', 'weekday']).agg({'order_id': 'count'}).rename(columns={'order_id': 'order_gap'}).reset_index()

# this is the total demand
total_orders_grouped = orders_data.groupby(['start_region_hash','dest_region_hash','time_slot', 'weekday']).agg({'order_id': 'count', 'price': 'mean'}).rename(columns={'order_id': 'demand'}).reset_index()

# merge the two dataframes on the region, time slot and weekday
orders_data_grouped = pd.merge(orders_data_grouped, total_orders_grouped, on=['start_region_hash','dest_region_hash','time_slot', 'weekday'])

# calculate the supply variable as the difference between total_orders and order_gap
orders_data_grouped['supply'] = orders_data_grouped['demand'] - orders_data_grouped['order_gap']

print(orders_data_grouped.head())

                  start_region_hash                  dest_region_hash  \
0  08232402614a9b48895cc3d0aeb0e9f2  08232402614a9b48895cc3d0aeb0e9f2   
1  08232402614a9b48895cc3d0aeb0e9f2  08232402614a9b48895cc3d0aeb0e9f2   
2  08232402614a9b48895cc3d0aeb0e9f2  08232402614a9b48895cc3d0aeb0e9f2   
3  08232402614a9b48895cc3d0aeb0e9f2  08232402614a9b48895cc3d0aeb0e9f2   
4  08232402614a9b48895cc3d0aeb0e9f2  08232402614a9b48895cc3d0aeb0e9f2   

   time_slot  weekday  order_gap  demand      price  supply  
0          0        4          5       5   9.000000       0  
1          1        4          3       3  10.666667       0  
2          1        6          1       1  13.000000       0  
3          2        4          3       3  45.333333       0  
4          3        6          1       1  11.000000       0  


In [6]:
# group the weather data by time slot
# aggregate the mean of temperature and pm25
weather_data_grouped = weather_data.groupby(['time_slot', 'weekday']).agg({'temperature': 'mean', 'pm25': 'mean'}).reset_index()

print(weather_data_grouped)

      time_slot  weekday  temperature        pm25
0             0        0     8.000000  134.200000
1             0        1     7.500000  182.500000
2             0        2     5.500000   73.500000
3             0        3     5.500000  102.500000
4             0        4     4.333333  171.333333
...         ...      ...          ...         ...
1003        143        2     5.500000  102.500000
1004        143        3     2.666667   92.666667
1005        143        4     5.500000  130.000000
1006        143        5     8.000000  147.000000
1007        143        6     8.000000  137.500000

[1008 rows x 4 columns]


In [7]:
# merge the orders data and weather data
orders_weather_data = pd.merge(orders_data_grouped, weather_data_grouped, on=['time_slot', 'weekday'], how='inner' )

print(orders_weather_data)


                       start_region_hash                  dest_region_hash  \
0       08232402614a9b48895cc3d0aeb0e9f2  08232402614a9b48895cc3d0aeb0e9f2   
1       0a5fef95db34383403d11cb6af937309  c119d09aebdac22f875d38fd982bd24b   
2       1afd7afbc81ecc1b13886a569d869e8a  0c48862b9748fe682b4c8ee996ebe26a   
3       1afd7afbc81ecc1b13886a569d869e8a  1afd7afbc81ecc1b13886a569d869e8a   
4       1afd7afbc81ecc1b13886a569d869e8a  2407d482f0ffa22a947068f2551fe62c   
...                                  ...                               ...   
262915  d4ec2125aff74eded207d2d915ef682f  b05379ac3f9b7d99370d443cfd5dcc28   
262916  d4ec2125aff74eded207d2d915ef682f  d4ec2125aff74eded207d2d915ef682f   
262917  d4ec2125aff74eded207d2d915ef682f  ed8eb1876d270f25e29fe4339ad41524   
262918  dd8d3b9665536d6e05b29c2648c0e69a  dd8d3b9665536d6e05b29c2648c0e69a   
262919  fff4e8465d1e12621bc361276b6217cf  8316146a6f78cc6d9f113f0390859417   

        time_slot  weekday  order_gap  demand       price  supp

In [8]:
# merge the poi_list class characteristics with the cluster_map
# cluster_map: region_hash, region_id
# poi_data: district_hash, poi_ids
# merge on district_hash
cluster_map_poi = pd.merge(cluster_map, poi_data, left_on='region_hash', right_on='region_hash', how='inner')

# remove the region_hash column
# cluster_map_poi = cluster_map_poi.drop(['region_id'], axis=1)

print(cluster_map_poi)


                         region_hash  region_id    poi_ids
0   90c5a34f06ac86aee0fd70e2adce7d8a          1  118257404
1   f2c8c4bb99e6377d21de71275afd6cd2          2   68155035
2   58c7a4888306d8ff3a641d1c0feccbe3          3    5013449
3   b26a240205c852804ff8758628c0a86a          4   42874231
4   4b9e4cf2fbdc8281b8a1f9f12b80ce4d          5    4327122
..                               ...        ...        ...
56  a735449c5c09df639c35a7d61fad3ee5         62     363374
57  0a5fef95db34383403d11cb6af937309         63   12680076
58  bf44d327f0232325c6d5280926d7b37d         64   41160364
59  825a21aa308dea206adb49c4b77c7805         65   16275636
60  1ecbb52d73c522f184a6fc53128b1ea1         66   23374709

[61 rows x 3 columns]


In [9]:
# merge the orders_data with the cluster_map_poi
# orders_weather_data: start_district_hash, time_slot, weekday, order_gap, temperature, pm25
# cluster_map_poi: region_id, poi_ids
# merge on start_district_hash
# print(orders_weather_data.head())
# print(cluster_map_poi.head())

orders_weather_cluster_map_poi = pd.merge(orders_weather_data, cluster_map_poi, left_on='start_region_hash', right_on='region_hash', how='inner')
orders_weather_cluster_map_poi=orders_weather_cluster_map_poi.rename(columns={'poi_ids': 'start_poi_ids'}).rename(columns={'region_id': 'start_region_id'})

orders_weather_cluster_map_poi = pd.merge(orders_weather_cluster_map_poi, cluster_map_poi, left_on='dest_region_hash', right_on='region_hash', how='inner')
orders_weather_cluster_map_poi=orders_weather_cluster_map_poi.rename(columns={'poi_ids': 'dest_poi_ids'}).rename(columns={'region_id': 'dest_region_id'})

# remove the start_district_hash column
orders_weather_cluster_map_poi = orders_weather_cluster_map_poi.drop(['start_region_hash', 'dest_region_hash', 'region_hash_x','region_hash_y'], axis=1)

print(orders_weather_cluster_map_poi)


# save the data
orders_weather_cluster_map_poi.to_csv('../dataset/processedData/orders_weather_cluster_map_poi.csv', index=False)


        time_slot  weekday  order_gap  demand      price  supply  temperature  \
0               0        4          5       5   9.000000       0     4.333333   
1               1        4          3       3  10.666667       0     4.500000   
2               1        6          1       1  13.000000       0     8.000000   
3               2        4          3       3  45.333333       0     4.000000   
4               3        6          1       1  11.000000       0     8.000000   
...           ...      ...        ...     ...        ...     ...          ...   
100164        109        5          1       1   1.000000       0    10.600000   
100165        126        1          2       3   2.000000       1     5.500000   
100166        135        1          1       1   1.000000       0     5.250000   
100167        133        0          1       1   4.000000       0    12.000000   
100168         78        4          2       3  47.000000       1     8.750000   

              pm25  start_r