In [1]:
import math
import pandas as pd 
import geopandas as gpd

import h3 # h3 bins from uber

In [2]:
def create_crash_df(train_file = '../Inputs/Train.csv'):  
    crash_df = pd.read_csv(train_file, parse_dates=['datetime'])
    return crash_df

def create_temporal_features(df):
    dict_windows = {1: "00-03", 2: "03-06", 3: "06-09", 4: "09-12", 5: "12-15", 6: "15-18", 7: "18-21", 8: "21-24"}
    dict_months = {1: "Jan", 2: "Feb", 3: "Mar", 4: "Apr", 5: "May", 6: "Jun",
               7: "Jul", 8: "Aug", 9: "Sep", 10: "Oct", 11: "Nov", 12: "Dec"}
    
    df["time_window"] = df["datetime"].apply(lambda x: math.floor(x.hour / 3) + 1)
    df["time_window_str"] = df["time_window"].apply(lambda x: dict_windows.get(x))
    df["day"] = df["datetime"].apply(lambda x: x.day)
    df["month"] = df["datetime"].apply(lambda x: dict_months.get(x.month))
    df["year"] = df["datetime"].apply(lambda x: x.year)
    df["weekday"] = df["datetime"].apply(lambda x: x.weekday())
    return df

def assign_hex_bin(df,lat_column="latitude",lon_column="longitude"):
    df["h3_zone_5"] = df.apply(lambda x: h3.geo_to_h3(x[lat_column], x[lon_column], 5),axis=1)
    df["h3_zone_6"] = df.apply(lambda x: h3.geo_to_h3(x[lat_column], x[lon_column], 6),axis=1)
    df["h3_zone_7"] = df.apply(lambda x: h3.geo_to_h3(x[lat_column], x[lon_column], 7),axis=1)
    return df

def export_df_to_csv(df,path_file='../Inputs/train_h3.csv'):
    df.to_csv(path_file,index=False)
    print(f'file created {path_file}')
    

In [46]:
# create command line commands for downlaoding uber movement data with OSM segment info
month_list = [('01','31'),
              ('02','28'),
              ('03','31'),
              ('04','30'),
              ('05','31'),
              ('06','30'),
              ('07','31'),
              ('08','31'),
              ('09','30'),
              ('10','31'),
              ('11','30'),
              ('12','31')]
for year in ['2018','2019']:
    for month, end_day in month_list:
        break # remove when you want the commands
        # print([f'mdt speeds-to-geojson nairobi {year}-{month}-01 {year}-{month}-{end_day} --output=Inputs/nairobi_{year}_{month}geojson.geojson'])
        # print([f'mdt speeds-transform historical nairobi {year}-{month}-1 {year}-{month}-{end_day} --output=Inputs/nairobi_{year}_{month}_osm.csv'])

['mdt speeds-transform historical nairobi 2018-01-1 2018-01-31 --output=Inputs/nairobi_2018_01_osm.csv']
['mdt speeds-transform historical nairobi 2018-02-1 2018-02-28 --output=Inputs/nairobi_2018_02_osm.csv']
['mdt speeds-transform historical nairobi 2018-03-1 2018-03-31 --output=Inputs/nairobi_2018_03_osm.csv']
['mdt speeds-transform historical nairobi 2018-04-1 2018-04-30 --output=Inputs/nairobi_2018_04_osm.csv']
['mdt speeds-transform historical nairobi 2018-05-1 2018-05-31 --output=Inputs/nairobi_2018_05_osm.csv']
['mdt speeds-transform historical nairobi 2018-06-1 2018-06-30 --output=Inputs/nairobi_2018_06_osm.csv']
['mdt speeds-transform historical nairobi 2018-07-1 2018-07-31 --output=Inputs/nairobi_2018_07_osm.csv']
['mdt speeds-transform historical nairobi 2018-08-1 2018-08-31 --output=Inputs/nairobi_2018_08_osm.csv']
['mdt speeds-transform historical nairobi 2018-09-1 2018-09-30 --output=Inputs/nairobi_2018_09_osm.csv']
['mdt speeds-transform historical nairobi 2018-10-1 201

In [4]:
def join_segment_files(path='../Inputs/', road_surveys='Segment_info.csv',segments_geometry='segments_geometry.geojson'):
    ''' 
        Load the survey data, Load the segment geometry, Join the two segment dfs.
        return a combined dataframe
    '''
    road_surveys = pd.read_csv(path+road_surveys)
    road_segment_locs = gpd.read_file(path+segments_geometry)
    segments_merged = pd.merge(road_segment_locs, road_surveys, on='segment_id', how='left')
    segments_merged["longitude"] = segments_merged.geometry.centroid.x
    segments_merged["latitude"] = segments_merged.geometry.centroid.y
    segments_merged = assign_hex_bin(segments_merged)
    return segments_merged

In [5]:
crash_df = create_crash_df(train_file = '../Inputs/Train.csv')
crash_df = create_temporal_features(crash_df)
crash_df = assign_hex_bin(crash_df)
#crash_df.head()

In [7]:
segments_merged = join_segment_files()


  segments_merged["longitude"] = segments_merged.geometry.centroid.x

  segments_merged["latitude"] = segments_merged.geometry.centroid.y


In [61]:
segments_merged.describe()

Unnamed: 0,side,25_48,79_76,79_65,79_39,28_78_50,55_90,26_76,92_30,62_51,...,80_97_61_65_15,80_97_61_65_16,80_97_61_39_0,80_97_61_39_10,80_97_61_39_11,80_97_61_39_12,80_97_61_39_15,80_97_61_39_16,longitude,latitude
count,1535.0,1535.0,1535.0,1535.0,1535.0,1246.0,1535.0,1535.0,1535.0,1535.0,...,1245.0,1245.0,1232.0,1232.0,1232.0,1232.0,1232.0,1232.0,1535.0,1535.0
mean,1.484039,1.855375,0.03127,3.370033,3.400651,0.074639,8.607818,0.006515,2.481433,1.967427,...,0.079518,0.04498,0.171266,0.064123,0.225649,0.599838,0.078734,0.043831,36.839393,-1.280532
std,0.499908,1.207833,0.174104,0.656837,0.562055,0.262913,4.976077,0.080476,0.631228,0.839993,...,0.270654,0.207343,0.376894,0.245072,0.418179,0.49013,0.269432,0.204802,0.055332,0.029959
min,1.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,36.693519,-1.365542
25%,1.0,1.0,0.0,3.0,3.0,0.0,5.0,0.0,2.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,36.811194,-1.296877
50%,1.0,1.0,0.0,3.0,3.0,0.0,6.0,0.0,3.0,2.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,36.831715,-1.283121
75%,2.0,3.0,0.0,4.0,4.0,0.0,13.0,0.0,3.0,2.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,36.880048,-1.263554
max,2.0,6.0,1.0,4.0,4.0,1.0,16.0,1.0,3.0,4.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,37.088976,-1.174576


In [63]:

segments_h3_zone_7= segments_merged.groupby(by='h3_zone_7').max()
segments_h3_zone_7['h3_zone_5']= segments_merged.groupby(by='h3_zone_5').latitude.max()
segments_h3_zone_7['h3_zone_6']= segments_merged.groupby(by='h3_zone_6').latitude.max()
segments_h3_zone_7['latitude']= segments_merged.groupby(by='h3_zone_7').latitude.mean()
segments_h3_zone_7['longitude']= segments_merged.groupby(by='h3_zone_7').longitude.mean()
segments_h3_zone_7.head()

ValueError: Length of values (6) does not match length of index (66)

In [33]:
path = '../Inputs/'
road_surveys='Segment_info.csv'
segments_geometry='segments_geometry.geojson'
road_segment_locs = gpd.read_file(path+segments_geometry)
road_surveys = pd.read_csv(path+road_surveys)

In [36]:
road_segment_locs.segment_id.nunique()

792

In [37]:
road_surveys.segment_id.nunique()

792

In [11]:
def join_segment_crash_files(crash_data=crash_df, segments=segments_merged, h3_zone='h3_zone_5'):
    ''' 
        Combine the segment data and the crash data by chosen hex.
        return a combined dataframe
    '''
    # Add some groupby function here
    segment_crash_df = pd.merge(crash_data, segments, on=h3_zone, how='left')
    return segment_crash_df

In [12]:
segment_crash_df = join_segment_crash_files()

In [13]:
segment_crash_df.head()

Unnamed: 0,uid,datetime,latitude_x,longitude_x,time_window,time_window_str,day,month,year,weekday,...,80_97_61_39_0,80_97_61_39_10,80_97_61_39_11,80_97_61_39_12,80_97_61_39_15,80_97_61_39_16,longitude_y,latitude_y,h3_zone_6_y,h3_zone_7_y
0,1,2018-01-01 00:25:46,-1.18885,36.931382,1,00-03,1,Jan,2018,0,...,0.0,1.0,0.0,0.0,0.0,0.0,36.826302,-1.284096,867a6e42fffffff,877a6e42cffffff
1,1,2018-01-01 00:25:46,-1.18885,36.931382,1,00-03,1,Jan,2018,0,...,0.0,1.0,0.0,0.0,0.0,0.0,36.826302,-1.284096,867a6e42fffffff,877a6e42cffffff
2,1,2018-01-01 00:25:46,-1.18885,36.931382,1,00-03,1,Jan,2018,0,...,1.0,1.0,0.0,0.0,0.0,0.0,36.826838,-1.283306,867a6e42fffffff,877a6e42cffffff
3,1,2018-01-01 00:25:46,-1.18885,36.931382,1,00-03,1,Jan,2018,0,...,1.0,1.0,0.0,0.0,1.0,0.0,36.826838,-1.283306,867a6e42fffffff,877a6e42cffffff
4,1,2018-01-01 00:25:46,-1.18885,36.931382,1,00-03,1,Jan,2018,0,...,1.0,1.0,0.0,0.0,1.0,0.0,36.82752,-1.28228,867a6e42fffffff,877a6e42cffffff


### The crash data and the segment data needs to be grouped before this join makes sense
### Also need to deal with the issue of missing segments


In [42]:
uber_movement_osm = pd.read_csv('../Inputs/nairobi_2018_01_osm.csv')

In [47]:
uber_movement_osm.head()

Unnamed: 0,year,month,day,hour,utc_timestamp,segment_id,start_junction_id,end_junction_id,osm_way_id,osm_start_node_id,osm_end_node_id,speed_kph_mean,speed_kph_stddev
0,2018,1,2,14,2018-01-02T11:00:00.000Z,41297ea07006246e330f9767862c5b85167eb37a,d97c55174d6c011dcd2bf3d1e05629b09dba739b,a996c01a24deb369c03c19b95b587a688ab2f9b8,8462710,3716534178,6287511790,20.844,6.105
1,2018,1,2,17,2018-01-02T14:00:00.000Z,41297ea07006246e330f9767862c5b85167eb37a,d97c55174d6c011dcd2bf3d1e05629b09dba739b,a996c01a24deb369c03c19b95b587a688ab2f9b8,8462710,3716534178,6287511790,21.637,5.526
2,2018,1,2,15,2018-01-02T12:00:00.000Z,41297ea07006246e330f9767862c5b85167eb37a,d97c55174d6c011dcd2bf3d1e05629b09dba739b,a996c01a24deb369c03c19b95b587a688ab2f9b8,8462710,3716534178,6287511790,21.122,11.239
3,2018,1,1,4,2018-01-01T01:00:00.000Z,41297ea07006246e330f9767862c5b85167eb37a,d97c55174d6c011dcd2bf3d1e05629b09dba739b,a996c01a24deb369c03c19b95b587a688ab2f9b8,8462710,3716534178,6287511790,26.36,29.398
4,2018,1,2,10,2018-01-02T07:00:00.000Z,41297ea07006246e330f9767862c5b85167eb37a,d97c55174d6c011dcd2bf3d1e05629b09dba739b,a996c01a24deb369c03c19b95b587a688ab2f9b8,8462710,3716534178,6287511790,19.39,4.314


In [9]:
geojsonfile = gpd.read_file('../Inputs/nairobi_2018_01_speeds.geojson', parse_dates=['utc_timestamp'])

In [13]:
geojsonfile.osmhighway.unique()

array(['unclassified', 'residential', 'service', 'tertiary', 'secondary',
       'primary', 'trunk', 'road', 'primary_link', 'secondary_link',
       'motorway', 'trunk_link', 'tertiary_link', 'motorway_link',
       'living_street'], dtype=object)

In [17]:
geojsonfile.speed_mean_kph.nunique()

95

In [16]:
geojsonfile.head()

Unnamed: 0,osmstartnodeid,osmhighway,osmendnodeid,osmwayid,osmname,speed_mean_kph,pct_from_freeflow,speed_freeflow_kph,geometry
0,6542149439,unclassified,6542149435,696644681,,,,,"LINESTRING (36.07340 -0.56126, 36.07386 -0.561..."
1,6542149435,unclassified,6542149439,696644681,,,,,"LINESTRING (36.07617 -0.56080, 36.07555 -0.560..."
2,6542149668,unclassified,6542149435,693064841,,,,,"LINESTRING (36.07622 -0.56415, 36.07617 -0.563..."
3,6542149435,unclassified,6542149668,693064841,,,,,"LINESTRING (36.07615 -0.56081, 36.07615 -0.561..."
4,6308021041,residential,6308021072,673589411,,,,,"LINESTRING (37.08520 -1.29747, 37.08469 -1.297..."
