# ACT Bike Infrastructure Project

## Bike Paths and Street Lights Data Wrangling

In [1]:
import pandas as pd
import numpy as np
import itertools as it
import pygeohash as pgh

from cross_arc_distance import cross_arc_distance_vectorised, haversine_vectorised
import loader
import common
import sys
import time

### Load Data

In [2]:
## Unpack the four frames returned by the project loader, in the order it returns them
(
    all_paths_df,
    bike_paths_df,
    pedestrian_paths_df,
    street_lights_df,
) = loader.load_data()

### Wrangling bike path data

In [4]:
## Make a copy for modification
bike_paths_mod_df = bike_paths_df.copy()

## Split 'the_geom' dict into one column per key ('type' and 'coordinates' are used below), dropping original
bike_paths_mod_df = pd.concat([
        bike_paths_mod_df.drop(['the_geom'], axis=1),
        bike_paths_mod_df['the_geom'].apply(pd.Series)], axis=1)

## Flatten the 'coordinates' list of lists by depth 1, so every row holds a single line (a list of points).
## explode() repeats the feature's index label on every row it produces. Move that label into an 'index'
## column so that each row has a unique label from here on: joining on the repeated labels would pair every
## line of a feature with every segment of that feature and manufacture duplicate segments.
bike_paths_mod_df = (bike_paths_mod_df
                     .explode('coordinates')
                     .dropna(subset=['coordinates'])   # features without any line contribute no segments
                     .reset_index())

## Split every line into coordinate quadruples, one per pair of consecutive points
## Each quadruple is of the form [A_long, A_lat, B_long, B_lat]
bike_paths_mod_df['coordinate_pair'] = bike_paths_mod_df['coordinates'].apply(
        lambda line: [[a[0], a[1], b[0], b[1]] for a, b in zip(line, line[1:])])

## One row per quadruple. A line with fewer than two points yields no quadruple (NaN after explode), so drop it.
## Relabel the rows 0..n-1 so the split columns below line up with them.
bike_paths_mod_df = (bike_paths_mod_df
                     .explode('coordinate_pair')
                     .dropna(subset=['coordinate_pair'])
                     .reset_index(drop=True))

## Split the coordinates quadruples list into separate lat and long columns
coordinate_pair_split = pd.DataFrame(
        bike_paths_mod_df["coordinate_pair"].to_list(), columns=[
            "pointA_longitude",
            "pointA_latitude",
            "pointB_longitude",
            "pointB_latitude"
        ])

## Append the separated lat and long columns, then restore the original feature label as the index
bike_paths_mod_df = bike_paths_mod_df.join(coordinate_pair_split).set_index('index')

## Drop superfluous columns
bike_paths_mod_df = bike_paths_mod_df.drop(['coordinates', 'coordinate_pair', 'type'], axis=1)

## Define a unique path ID geohash based on the new path segment coordinates. Precision is high so we don't lose info.
## The ID is the geohash of point A followed by the geohash of point B.
bike_paths_mod_df['path_segment_ID'] = [
        pgh.encode(a_lat, a_long, precision=17) + pgh.encode(b_lat, b_long, precision=17)
        for a_lat, a_long, b_lat, b_long in zip(
            bike_paths_mod_df['pointA_latitude'], bike_paths_mod_df['pointA_longitude'],
            bike_paths_mod_df['pointB_latitude'], bike_paths_mod_df['pointB_longitude'])
]

## Remove duplicate path entries; any that remain at this point come from the original data.
## There is one instance where we have the same path segments assigned to neighbouring different suburbs;
## only the first occurrence is kept.
bike_paths_mod_df = bike_paths_mod_df.drop_duplicates(['path_segment_ID'])


## use haversine_vectorised to get the length of the segments from the points I've defined.
bike_paths_mod_df['sub_segment_length'] = haversine_vectorised(np.deg2rad(bike_paths_mod_df['pointA_latitude']), np.deg2rad(bike_paths_mod_df['pointA_longitude']),
                                                               np.deg2rad(bike_paths_mod_df['pointB_latitude']), np.deg2rad(bike_paths_mod_df['pointB_longitude']))

bike_paths_mod_df.head()

Unnamed: 0_level_0,path_type,path_surf,ave_width,seg_length,suburb,owner,pointA_longitude,pointA_latitude,pointB_longitude,pointB_latitude,path_segment_ID,sub_segment_length
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
81,CYCLEPATH,BITUMEN,2.5,24.11545,AMAROO,ROADS_ACT,149.124824,-35.176958,149.124812,-35.177008,r3dpckjrk5xzgwgf0r3dpckjr5fre2ge5k,5.694729
81,CYCLEPATH,BITUMEN,2.5,24.11545,AMAROO,ROADS_ACT,149.124812,-35.177008,149.124806,-35.177035,r3dpckjr5fre2ge5kr3dpckjqgv6mk8r3u,3.012691
81,CYCLEPATH,BITUMEN,2.5,24.11545,AMAROO,ROADS_ACT,149.124806,-35.177035,149.12481,-35.177061,r3dpckjqgv6mk8r3ur3dpckjqgbmz5xh5x,2.976032
81,CYCLEPATH,BITUMEN,2.5,24.11545,AMAROO,ROADS_ACT,149.12481,-35.177061,149.124838,-35.177097,r3dpckjqgbmz5xh5xr3dpckjqs9f8d5n6q,4.73026
81,CYCLEPATH,BITUMEN,2.5,24.11545,AMAROO,ROADS_ACT,149.124838,-35.177097,149.124896,-35.177148,r3dpckjqs9f8d5n6qr3dpckjqmbjxg40tt,7.731752


### Wrangling street light data

In [5]:
## Work on a copy so the loaded street light frame is left as it was
street_lights_mod_df = street_lights_df.copy()

## Add a Boolean flag column and a distance column for later, both starting at their "nothing found" values
street_lights_mod_df['close_to_path'] = False
street_lights_mod_df['distance_nearest_path'] = np.inf

## Lower-cased suburb names present in each dataset
bike_path_suburbs = bike_paths_mod_df['suburb'].str.lower().unique()
street_light_suburbs = street_lights_mod_df['suburb'].str.lower().unique()

## Suburbs that have street lights but no bike paths
light_suburb_set = set(street_light_suburbs)
path_suburb_set = set(bike_path_suburbs)
street_lights_no_bike_paths = list(light_suburb_set - path_suburb_set)

## Keep only the lights whose suburb is not in that list
in_suburb_without_paths = street_lights_mod_df['suburb'].str.lower().isin(street_lights_no_bike_paths)
street_lights_mod_df = street_lights_mod_df[~in_suburb_without_paths]

## Expand the 'location' dict column into one column per key, replacing the original column
location_columns = street_lights_mod_df['location'].apply(pd.Series)
remaining_columns = street_lights_mod_df.drop(['location'], axis=1)
street_lights_mod_df = pd.concat([remaining_columns, location_columns], axis=1)

## Make sure the coordinates are numeric
for coordinate_column in ("latitude", "longitude"):
    street_lights_mod_df[coordinate_column] = pd.to_numeric(street_lights_mod_df[coordinate_column])

street_lights_mod_df.head()

Unnamed: 0,column_type,:@computed_region_h8vr_r9vc,suburb,address,outreach_arm_length,lamp_count,lamp_type,height,luminaire,column_material,close_to_path,distance_nearest_path,latitude,human_address,needs_recoding,longitude
0,Rigid column,104,WANNIASSA,,,,,,,,False,inf,-35.403386,"{""address"": """", ""city"": """", ""state"": """", ""zip""...",False,149.094254
1,Rigid column,67,HARRISON,,,,,,,,False,inf,-35.206829,"{""address"": """", ""city"": """", ""state"": """", ""zip""...",False,149.14904
2,VICPOLE,89,GUNGAHLIN,HORSE PARK DRIVE,4.5 M SINGLE,1.0,LIGHT EMITTING DIODE,12 metres,SYLVANIA ROAD LED,,False,inf,-35.175655,"{""address"": """", ""city"": """", ""state"": """", ""zip""...",False,149.139648
3,Energy Absorbing column,83,CITY,VERITY LANE BTWN EAST ROW & NTHBOURNE AVE,3.5 M TWIN,2.0,METAL HALIDE,5 metres,SPECIAL,STEEL ORDINARY,False,inf,-35.279123,"{""address"": """", ""city"": """", ""state"": """", ""zip""...",False,149.129941
4,FORDE COLUMN,19,WRIGHT,JOHN GORTON DRIVE,3.0 M SINGLE,1.0,LIGHT EMITTING DIODE,9 metres,PECAN NXT-36S,,False,inf,-35.314926,"{""address"": """", ""city"": """", ""state"": """", ""zip""...",False,149.032875


## Find lights that are close to bike paths

In [6]:
## Cutoff radius in metres. Lights at a distance < cutoff will be added to our output.
cutoff = common.cutoff

output_filename = common.lights_close_to_paths_data
street_lights = np.array(street_lights_mod_df[['close_to_path', 'latitude', 'longitude', 'distance_nearest_path']])
bike_paths = np.array(bike_paths_mod_df[['pointA_longitude', 'pointA_latitude', 'pointB_longitude', 'pointB_latitude', 'path_segment_ID']])
total = len(bike_paths)

## The light positions are the same for every path segment, so extract them and convert them to radians once
light_lat_array = street_lights[:, 1].astype(float)
light_long_array = street_lights[:, 2].astype(float)
pointC_lat, pointC_long = np.deg2rad(light_lat_array), np.deg2rad(light_long_array)
light_count = len(street_lights)

output_columns = ["light_lat", "light_long", "close_to_path",
                  "distance_nearest_path", "path_segmentA_lat", "path_segmentA_long",
                  "path_segmentB_lat", "path_segmentB_long", "path_segment_length", "path_segment_ID"]

## The loop below appends to the output file. Start from an empty file so that re-running this cell
## does not write every light + path pair a second time.
with open(output_filename, 'w'):
    pass

## All lights against all path segments at once is too big to hold in memory,
## so test every light against one path segment at a time.
## Iterate over range(total): every segment, including the last one, must be tested.
for i in range(total):

    ## Select the path segment. Column order is [A_long, A_lat, B_long, B_lat, ID].
    segmentA_long, segmentA_lat, segmentB_long, segmentB_lat, segment_ID = bike_paths[i]
    segmentA_lat, segmentA_long = float(segmentA_lat), float(segmentA_long)
    segmentB_lat, segmentB_long = float(segmentB_lat), float(segmentB_long)

    ## Repeat the segment end points (in radians) once per light, so all inputs have one entry per light
    pointA_lat, pointA_long, pointB_lat, pointB_long = (
        np.full(light_count, np.deg2rad(value))
        for value in (segmentA_lat, segmentA_long, segmentB_lat, segmentB_long))

    ## Calculate the cross-arc distance between this path segment and every light
    distance, segment_length = cross_arc_distance_vectorised(pointA_lat, pointA_long, pointB_lat, pointB_long, pointC_lat, pointC_long)
    distance, segment_length = np.asarray(distance), np.asarray(segment_length)

    ## Positions (rows of street_lights) of the lights that are closer than the cutoff
    close_positions = np.flatnonzero(distance < cutoff)

    ## Keep only the close lights and append them to the output. The row label written to the
    ## file is the light's position in street_lights.
    if close_positions.size:
        lights_close_to_path_df = pd.DataFrame({
            "light_lat": light_lat_array[close_positions],
            "light_long": light_long_array[close_positions],
            "close_to_path": True,
            "distance_nearest_path": distance[close_positions],
            "path_segmentA_lat": segmentA_lat,
            "path_segmentA_long": segmentA_long,
            "path_segmentB_lat": segmentB_lat,
            "path_segmentB_long": segmentB_long,
            "path_segment_length": segment_length[close_positions],
            "path_segment_ID": segment_ID,
        }, index=close_positions, columns=output_columns)
        lights_close_to_path_df.to_csv(output_filename, mode='a', header=False)

    ## Progress: number of segments processed so far out of the total
    sys.stdout.write("\r" + f"{i + 1}/{total}")
    sys.stdout.flush()


97409/97411

## Which paths are not close to any lights?

In [7]:
## We will load the dataset from the above, for the sake of time.
## The file was written without a header; its first column is the row label that was written with it.
lights_close_to_path_df = pd.read_csv(common.lights_close_to_paths_data, index_col=None, header=None, 
                names=["index", "light_latitude", "light_longitude", "close_to_path", "distance_to_path",
                "pathA_latitude", "pathA_longitude", "pathB_latitude", "pathB_longitude", "path_segment_length", "path_segment_ID"])

## Which path segment IDs are in bike_paths_mod_df['path_segment_ID'] and which are also in lights_close_to_path_df['path_segment_ID']?
## Because there will potentially be multiple light+path entries for a given path, de-duplicate lights_close_to_path on path_segment_ID
## so that the left merge keeps exactly one row per path segment. The '_merge' indicator then says where each segment was found.
paths_with_light_indicator = bike_paths_mod_df.merge(
    lights_close_to_path_df.drop_duplicates(subset=['path_segment_ID']),
    on=['path_segment_ID'], how='left', indicator=True)

columns_of_interest = ['pointA_latitude', 'pointA_longitude', 'pointB_latitude', 'pointB_longitude', 'path_segment_ID', 'sub_segment_length']

## 'left_only': the segment is in bike_paths_mod_df but has no close light
paths_not_close_to_lights = paths_with_light_indicator.loc[
    paths_with_light_indicator['_merge'] == 'left_only', columns_of_interest]

## 'both': the segment is in bike_paths_mod_df and has at least one close light
paths_close_to_lights = paths_with_light_indicator.loc[
    paths_with_light_indicator['_merge'] == 'both', columns_of_interest]

paths_not_close_to_lights.to_csv(common.paths_not_close_to_light_data)
paths_close_to_lights.to_csv(common.paths_close_to_light_data)

In [8]:
## Build the summary report first, one sentence per entry, then print the sentences in order.
## Percentages by count are relative to the number of path segments; percentages by length are
## relative to the summed 'sub_segment_length' of all path segments.
summary_lines = [
    f"There are {len(bike_paths_mod_df)} unique path segments, and {len(street_lights_df)} unique lights, making a total of {len(bike_paths_mod_df)*len(street_lights_df)} combinations",
    f"Of these combinations, there are {len(lights_close_to_path_df)} street light + path pairs that are within a {common.cutoff}m cutoff",
    f"There are {len(paths_not_close_to_lights)} path segments that have zero lights within that cutoff, or {(len(paths_not_close_to_lights)/len(bike_paths_mod_df))*100}% that are entirely unlit (by number of segments, not lengths)",
    f"Naturally, this means there are {len(paths_close_to_lights)} path segments that have lights within that cutoff, or {(len(paths_close_to_lights)/len(bike_paths_mod_df))*100}% that are lit",
    f"The length of all bike paths in the ACT is {bike_paths_mod_df['sub_segment_length'].sum()}m, of which {paths_not_close_to_lights['sub_segment_length'].sum()}, or {(paths_not_close_to_lights['sub_segment_length'].sum()/bike_paths_mod_df['sub_segment_length'].sum())*100}%, is unlit",
    f"Conversely, the length of all bike paths in the ACT is {bike_paths_mod_df['sub_segment_length'].sum()}m, of which {paths_close_to_lights['sub_segment_length'].sum()}, or {(paths_close_to_lights['sub_segment_length'].sum()/bike_paths_mod_df['sub_segment_length'].sum())*100}%, is lit",
]

for summary_line in summary_lines:
    print(summary_line)

There are 97411 unique path segments, and 81228 unique lights, making a total of 7912500708 combinations
Of these combinations, there are 26602 street light + path pairs that are within a 10m cutoff
There are 75109 path segments that have zero lights within that cutoff, or 77.10525505333074% that are entirely unlit (by number of segments, not lengths)
Naturally, this means there are 22302 path segments that have lights within that cutoff, or 22.894744946669267% that are lit
The length of all bike paths in the ACT is 498859.7229937521m, of which 314678.5106914509, or 63.079558478484756%, is unlit
Conversely, the length of all bike paths in the ACT is 498859.7229937521m, of which 184181.21230230108, or 36.92044152151522%, is lit
