# Preparing Uber Movement travel time data
---

### Overview
1. Inspect raw data
2. Process raw data
3. Process all geographic data
4. Training, validation, testing data
---

We start by importing the hyperparameters and the required modules.

In [1]:
import hyper
import prep_uber_movement
import pandas as pd

# instantiate the hyper parameter object; it is passed to the
# prep_uber_movement functions in the cells below
HYPER = hyper.HyperParameter()

### 1. Inspect raw data
Let us first inspect the available raw data files for an example city. This will allow us to better understand our raw data and what our features and labels are. Data is available for the following cities, any of which can be chosen:

Amsterdam, Atlanta, Auckland, Bangalore, Bogota, Boston, Brisbane, Bristol, Brussels, Cairo, Cape Town, Cincinnati, Guadalajara, Hyderabad, Johannesburg and Pretoria, Kolkata, Leeds, London, Los Angeles, Madrid, Manchester, Melbourne, Mexico City, Miami, Mumbai, Nairobi, New Delhi, Orlando, Paris, Perth, Pittsburgh, San Francisco, Santiago De Chile, Sao Paulo, Seattle, Stockholm, Sydney, Taipei, Tampa Bay, Toronto, Vienna, 'Washington D.C.', 'West Midlands, UK'.

The raw data shows a number of characteristics that are worth noting:

* Travel time data is described by four distinct values: mean, standard deviation, geometric mean and geometric standard deviation. These are our labels.
* Our features are the hour of day, a source ID and a destination ID. We can further see that the filenames of our .csv files contain additional metadata that is useful for describing features: the year, the quarter of the year and the day type (weekday or weekend).
* The geojson file further maps a set of latitudinal and longitudinal coordinates to each city zone ID. These coordinates describe a two-dimensional polygon representing each zone.

In [2]:
# choose a city from the list of available ones above
city = 'Brussels'

# import the zone polygons (geojson) and the travel time tables (csv)
# of the chosen city
df_geojson = prep_uber_movement.import_geojson(HYPER, city)
df_csv_dict_list = prep_uber_movement.import_csvdata(HYPER, city)

# inspect only the first csv file; each list entry is a dict that holds
# the table under 'df' and the name of its source file under 'filename'
df_csv_dict = df_csv_dict_list[0]
df_csv = df_csv_dict['df']

# set maximum column width to see more of geojson; the fully qualified
# option name is used because abbreviated names are matched by pattern
# and can become ambiguous when pandas adds similarly named options
pd.set_option('display.max_colwidth', 400)

# print filename, then show the travel time table and the geojson table
print(df_csv_dict['filename'])
display(df_csv)
display(df_geojson)

brussels-statisticaldistrict-2017-2-OnlyWeekdays-HourlyAggregate.csv


Unnamed: 0,sourceid,dstid,hod,mean_travel_time,standard_deviation_travel_time,geometric_mean_travel_time,geometric_standard_deviation_travel_time
0,498,427,0,211.04,162.73,171.52,1.82
1,493,477,0,142.33,70.15,129.83,1.51
2,506,478,2,505.00,172.17,476.60,1.41
3,482,587,0,333.00,232.60,277.98,1.74
4,526,278,2,238.20,227.53,188.32,1.90
...,...,...,...,...,...,...,...
1185080,724,723,14,134.31,100.06,110.21,1.79
1185081,724,723,16,98.13,58.57,61.36,3.95
1185082,724,723,18,118.83,95.79,63.42,4.49
1185083,724,723,20,103.33,67.51,81.07,2.07


Unnamed: 0,type,features
0,FeatureCollection,"{'type': 'Feature', 'properties': {'ID': 137, 'INSPIRE_ID': 'BE.BRUSSELS.BRIC.ADM.SD.260', 'SDDC': 'G310', 'MD_ID': 78, 'MU_ID': 7400, 'NAME_DUT': 'HAREN-ZUIDWEST', 'NAME_FRE': 'HAREN-SUD-OUEST', 'MOVEMENT_ID': '1', 'DISPLAY_NAME': 'HAREN-ZUIDWEST / HAREN-SUD-OUEST'}, 'geometry': {'type': 'MultiPolygon', 'coordinates': [[[[4.4048543, 50.8841368], [4.4049443, 50.8842297], [4.4050271, 50.8844293..."
1,FeatureCollection,"{'type': 'Feature', 'properties': {'ID': 163, 'INSPIRE_ID': 'BE.BRUSSELS.BRIC.ADM.SD.714', 'SDDC': 'F572', 'MD_ID': 800, 'MU_ID': 7400, 'NAME_DUT': 'MARLY-ZUID', 'NAME_FRE': 'MARLY-SUD', 'MOVEMENT_ID': '2', 'DISPLAY_NAME': 'MARLY-ZUID / MARLY-SUD'}, 'geometry': {'type': 'MultiPolygon', 'coordinates': [[[[4.3762399, 50.8834765], [4.3770824, 50.88395], [4.3777652, 50.884485], [4.3780837, 50.8849..."
2,FeatureCollection,"{'type': 'Feature', 'properties': {'ID': 141, 'INSPIRE_ID': 'BE.BRUSSELS.BRIC.ADM.SD.393', 'SDDC': 'G371', 'MD_ID': 800, 'MU_ID': 7400, 'NAME_DUT': 'VORMINGSSTATION', 'NAME_FRE': 'GARE DE FORMATION', 'MOVEMENT_ID': '3', 'DISPLAY_NAME': 'VORMINGSSTATION / GARE DE FORMATION'}, 'geometry': {'type': 'MultiPolygon', 'coordinates': [[[[4.4136255, 50.9116693], [4.4137834, 50.9116272], [4.4139327, 50...."
3,FeatureCollection,"{'type': 'Feature', 'properties': {'ID': 144, 'INSPIRE_ID': 'BE.BRUSSELS.BRIC.ADM.SD.679', 'SDDC': 'G321', 'MD_ID': 78, 'MU_ID': 7400, 'NAME_DUT': 'HAREN-OOST', 'NAME_FRE': 'HAREN-EST', 'MOVEMENT_ID': '4', 'DISPLAY_NAME': 'HAREN-OOST / HAREN-EST'}, 'geometry': {'type': 'MultiPolygon', 'coordinates': [[[[4.4202233, 50.8891119], [4.4206459, 50.8895716], [4.4211542, 50.8901286], [4.42133609999999..."
4,FeatureCollection,"{'type': 'Feature', 'properties': {'ID': 164, 'INSPIRE_ID': 'BE.BRUSSELS.BRIC.ADM.SD.711', 'SDDC': 'F91-', 'MD_ID': 77, 'MU_ID': 7400, 'NAME_DUT': 'OORLOGSKRUISENLAAN', 'NAME_FRE': 'CROIX DE GUERRE (AVENUE DES)', 'MOVEMENT_ID': '5', 'DISPLAY_NAME': 'OORLOGSKRUISENLAAN / CROIX DE GUERRE (AVENUE DES)'}, 'geometry': {'type': 'MultiPolygon', 'coordinates': [[[[4.3756208, 50.8884238], [4.3758454, 5..."
...,...,...
719,FeatureCollection,"{'type': 'Feature', 'properties': {'ID': 632, 'INSPIRE_ID': 'BE.BRUSSELS.BRIC.ADM.SD.132', 'SDDC': 'A432', 'MD_ID': 105, 'MU_ID': 8700, 'NAME_DUT': 'DRIES', 'NAME_FRE': 'DRIES', 'MOVEMENT_ID': '720', 'DISPLAY_NAME': 'DRIES / DRIES'}, 'geometry': {'type': 'MultiPolygon', 'coordinates': [[[[4.3968626, 50.8017591], [4.3970588, 50.8018771], [4.397167, 50.8019422], [4.3975726, 50.8021818], [4.39770..."
720,FeatureCollection,"{'type': 'Feature', 'properties': {'ID': 631, 'INSPIRE_ID': 'BE.BRUSSELS.BRIC.ADM.SD.153', 'SDDC': 'A421', 'MD_ID': 105, 'MU_ID': 8700, 'NAME_DUT': 'WATERMAAL - STATION', 'NAME_FRE': 'WATERMAEL - STATION', 'MOVEMENT_ID': '721', 'DISPLAY_NAME': 'WATERMAAL - STATION / WATERMAEL - STATION'}, 'geometry': {'type': 'MultiPolygon', 'coordinates': [[[[4.3972698, 50.8072647], [4.3972928, 50.8073743], [..."
721,FeatureCollection,"{'type': 'Feature', 'properties': {'ID': 641, 'INSPIRE_ID': 'BE.BRUSSELS.BRIC.ADM.SD.49', 'SDDC': 'A451', 'MD_ID': 105, 'MU_ID': 8700, 'NAME_DUT': 'HOGEBOMEN', 'NAME_FRE': 'FUTAIE', 'MOVEMENT_ID': '722', 'DISPLAY_NAME': 'HOGEBOMEN / FUTAIE'}, 'geometry': {'type': 'MultiPolygon', 'coordinates': [[[[4.3973337, 50.8011919], [4.3977137, 50.8011707], [4.3979294, 50.801173], [4.3981885, 50.8012202],..."
722,FeatureCollection,"{'type': 'Feature', 'properties': {'ID': 625, 'INSPIRE_ID': 'BE.BRUSSELS.BRIC.ADM.SD.102', 'SDDC': 'A523', 'MD_ID': 104, 'MU_ID': 8700, 'NAME_DUT': 'IJSVOGEL', 'NAME_FRE': 'MARTIN-PECHEUR', 'MOVEMENT_ID': '723', 'DISPLAY_NAME': 'IJSVOGEL / MARTIN-PECHEUR'}, 'geometry': {'type': 'MultiPolygon', 'coordinates': [[[[4.3981615, 50.813307], [4.3983521, 50.8133286], [4.3987911, 50.8135303], [4.399248..."


### 2. Process raw data

The data is already clean. The only part that must be processed is the geojson data: the coordinates describing each city zone polygon with latitudes and longitudes. We write a recursive function that traverses the json files and extracts only the latitude and longitude coordinates mapped to each city zone ID.

The format of the files resulting from this step is shown below.

In [3]:
# build the processed travel time table from the csv dict inspected above
df_augmented_csvdata = prep_uber_movement.process_csvdata(df_csv_dict, city)

# split the geojson zone polygons into one latitude and one longitude table
(df_latitudes, df_longitudes) = prep_uber_movement.process_geojson(df_geojson)

# show all three resulting tables, in the order they were created
display(df_augmented_csvdata, df_latitudes, df_longitudes)

Unnamed: 0,sourceid,dstid,hod,mean_travel_time,standard_deviation_travel_time,geometric_mean_travel_time,geometric_standard_deviation_travel_time,year,quarter,daytype,city
0,498,427,0,211.04,162.73,171.52,1.82,2017,2,weekdays,Brussels
1,493,477,0,142.33,70.15,129.83,1.51,2017,2,weekdays,Brussels
2,506,478,2,505.00,172.17,476.60,1.41,2017,2,weekdays,Brussels
3,482,587,0,333.00,232.60,277.98,1.74,2017,2,weekdays,Brussels
4,526,278,2,238.20,227.53,188.32,1.90,2017,2,weekdays,Brussels
...,...,...,...,...,...,...,...,...,...,...,...
1185080,724,723,14,134.31,100.06,110.21,1.79,2017,2,weekdays,Brussels
1185081,724,723,16,98.13,58.57,61.36,3.95,2017,2,weekdays,Brussels
1185082,724,723,18,118.83,95.79,63.42,4.49,2017,2,weekdays,Brussels
1185083,724,723,20,103.33,67.51,81.07,2.07,2017,2,weekdays,Brussels


Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,715,716,717,718,719,720,721,722,723,724
0,50.884137,50.883477,50.911669,50.889112,50.888424,50.890075,50.889147,50.895430,50.902151,50.913103,...,50.807983,50.812090,50.807694,50.801043,50.803459,50.801759,50.807265,50.801192,50.813307,50.809587
1,50.884230,50.883950,50.911627,50.889572,50.888629,50.890184,50.889957,50.895448,50.902328,50.913090,...,50.808077,50.812094,50.807769,50.801127,50.804093,50.801877,50.807374,50.801171,50.813329,50.809539
2,50.884429,50.884485,50.911580,50.890129,50.889102,50.890642,50.890012,50.895518,50.902827,50.912951,...,50.808262,50.812103,50.808163,50.801235,50.804522,50.801942,50.807561,50.801173,50.813530,50.809525
3,50.884658,50.884994,50.911570,50.890310,50.889178,50.890749,50.890356,50.895553,50.903037,50.912886,...,50.808382,50.812137,50.808208,50.801417,50.804516,50.802182,50.807652,50.801220,50.813784,50.809497
4,50.884973,50.885351,50.911565,50.890348,50.889254,50.890846,50.890371,50.895560,50.903191,50.912542,...,50.808712,50.812158,50.808253,50.801851,50.804395,50.802256,50.807684,50.801267,50.813895,50.809274
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
463,,,,,,,,,,,...,,,,,,,,,,
464,,,,,,,,,,,...,,,,,,,,,,
465,,,,,,,,,,,...,,,,,,,,,,
466,,,,,,,,,,,...,,,,,,,,,,


Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,715,716,717,718,719,720,721,722,723,724
0,4.404854,4.376240,4.413626,4.420223,4.375621,4.381591,4.411048,4.371622,4.412416,4.406289,...,4.392083,4.392237,4.392370,4.393083,4.396395,4.396863,4.397270,4.397334,4.398161,4.399693
1,4.404944,4.377082,4.413783,4.420646,4.375845,4.381769,4.411455,4.371861,4.412427,4.406335,...,4.392230,4.392251,4.392553,4.393158,4.396784,4.397059,4.397293,4.397714,4.398352,4.400018
2,4.405027,4.377765,4.413933,4.421154,4.376421,4.382725,4.411489,4.372889,4.412783,4.407222,...,4.392498,4.392290,4.393173,4.393178,4.396897,4.397167,4.397341,4.397929,4.398791,4.400112
3,4.405336,4.378084,4.413966,4.421336,4.376496,4.382938,4.411675,4.374406,4.412966,4.407637,...,4.392657,4.392436,4.393230,4.393211,4.397003,4.397573,4.397328,4.398188,4.399249,4.400305
4,4.405755,4.378055,4.413990,4.421370,4.376571,4.383145,4.411680,4.375064,4.413100,4.409826,...,4.392881,4.392526,4.393287,4.393515,4.399139,4.397705,4.397390,4.398448,4.399555,4.401802
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
463,,,,,,,,,,,...,,,,,,,,,,
464,,,,,,,,,,,...,,,,,,,,,,
465,,,,,,,,,,,...,,,,,,,,,,
466,,,,,,,,,,,...,,,,,,,,,,


### 3. Process all geographic data

Now that we know what our raw data looks like and in what format we want to have our geographic data, we can continue with processing the geojson files of all cities into this format.

In [4]:
# process the geojson files of all cities; no return value is captured
# here, so results are presumably stored by the function itself
# (e.g. written to disk) -- TODO confirm in prep_uber_movement
prep_uber_movement.process_all_raw_geojson_data(HYPER)

### 4. Training, validation, testing data

In [5]:
# split the data into training, validation and testing tables
(df_train, df_val, df_test) = prep_uber_movement.train_val_test_split(HYPER)

# show the three splits one after another
display(df_train, df_val, df_test)

Training data   :    17% 
 Validation data :    17% 
 Testing data    :    66% 



Unnamed: 0,sourceid,dstid,hod,mean_travel_time,standard_deviation_travel_time,geometric_mean_travel_time,geometric_standard_deviation_travel_time,year,quarter,daytype,city
0,529,511,11,1422.00,501.89,1347.77,1.38,2016,1,weekdays,Guadalajara
1,1396,1023,19,1668.15,208.02,1656.23,1.12,2020,1,weekends,Guadalajara
2,287,364,3,643.72,259.99,598.03,1.46,2020,1,weekends,London
3,77,1346,22,746.71,361.34,556.41,2.78,2020,1,weekdays,Guadalajara
4,139,578,15,502.41,280.90,436.73,1.69,2019,4,weekdays,Guadalajara
...,...,...,...,...,...,...,...,...,...,...,...
40137209,371,953,17,2820.72,1005.62,2659.80,1.41,2016,1,weekdays,London
40137210,164,473,23,1660.00,247.12,1642.34,1.16,2020,1,weekdays,London
40137211,506,458,23,994.60,492.60,914.21,1.46,2016,1,weekends,London
40137212,1103,1002,3,1127.33,186.90,1110.65,1.19,2019,4,weekends,Boston


Unnamed: 0,sourceid,dstid,hod,mean_travel_time,standard_deviation_travel_time,geometric_mean_travel_time,geometric_standard_deviation_travel_time,year,quarter,daytype,city
0,742,1642,23,1327.70,282.94,1298.34,1.24,2020,1,weekends,Guadalajara
1,1271,1149,5,398.79,106.57,388.93,1.23,2020,1,weekdays,Guadalajara
2,334,296,19,686.73,336.19,639.62,1.40,2019,4,weekends,Brussels
3,532,341,17,417.92,166.98,389.22,1.44,2020,1,weekdays,Brussels
4,210,782,11,722.44,222.84,693.16,1.33,2019,4,weekdays,Guadalajara
...,...,...,...,...,...,...,...,...,...,...,...
40137209,414,125,1,581.15,250.76,554.05,1.31,2020,1,weekends,Auckland
40137210,557,629,11,1401.45,89.36,1398.69,1.06,2019,4,weekdays,Perth
40137211,1094,1327,14,1343.87,476.11,1279.40,1.34,2016,1,weekdays,Guadalajara
40137212,885,77,11,1007.71,278.99,969.83,1.32,2019,4,weekends,London


Unnamed: 0,sourceid,dstid,hod,mean_travel_time,standard_deviation_travel_time,geometric_mean_travel_time,geometric_standard_deviation_travel_time,year,quarter,daytype,city
0,2307,1935,20,1220.80,170.53,1211.21,1.13,2019,4,weekdays,San Francisco
1,309,527,5,438.00,60.01,434.30,1.14,2019,4,weekdays,Brussels
2,2162,985,11,743.94,316.85,690.69,1.44,2018,3,weekdays,San Francisco
3,2067,791,16,484.61,446.92,376.21,1.91,2016,1,weekdays,San Francisco
4,847,439,3,1124.55,204.97,1105.92,1.20,2017,2,weekdays,London
...,...,...,...,...,...,...,...,...,...,...,...
157186498,834,1468,10,1128.00,780.71,981.97,1.59,2019,4,weekdays,Guadalajara
157186499,2231,8,10,4384.00,458.95,4360.65,1.11,2020,1,weekdays,San Francisco
157186500,619,480,6,283.23,126.63,261.12,1.48,2018,3,weekdays,Brussels
157186501,26,510,9,1301.57,166.16,1291.30,1.13,2015,4,weekdays,Perth
