# ACT Bike Infrastructure Project

## Bike Paths and Street Lights Data Wrangling

In [10]:
import pandas as pd
import numpy as np
import itertools as it

from cross_arc_distance import cross_arc_distance_vectorised
import loader
import common
import sys
import time

### Load Data

In [2]:
## Load the raw datasets through the project-local `loader` module.
## load_data() is unpacked into four objects; bike_paths_df and street_lights_df are
## copied and wrangled as pandas DataFrames in the cells below.
## NOTE(review): all_paths_df and pedestrian_paths_df are not used in the visible cells;
## presumably they are DataFrames too -- confirm against loader.load_data().
all_paths_df, bike_paths_df, pedestrian_paths_df, street_lights_df = loader.load_data()

### Wrangling bike path data

In [3]:
## Reshape the bike path data so that every row is ONE straight segment of a path,
## described by its two end points (A and B) in separate longitude/latitude columns.

## Make a copy for modification
bike_paths_mod_df = bike_paths_df.copy()

## Split the 'the_geom' dict into one column per key, dropping the original.
## The keys used below are 'type' and 'coordinates'.
bike_paths_mod_df = pd.concat([
        bike_paths_mod_df.drop(['the_geom'], axis=1),
        bike_paths_mod_df['the_geom'].apply(pd.Series)], axis=1)

## Flatten the 'coordinates' list of lists by depth 1: one row per line of each path.
## explode() repeats the original row label for every line of the same path, so the
## label is moved into an 'index' column and we continue on a unique positional index.
## Joining on the duplicated labels instead would match every line of a path with the
## segments of ALL lines of that path (a many-to-many join) and emit duplicate segments.
## This relies on the original index being unnamed, so that reset_index() calls it 'index'.
bike_paths_mod_df = bike_paths_mod_df.explode('coordinates').reset_index()

## Split every line (a list of coordinate pairs) into coordinate quadruples, one per
## pair of consecutive points.
## Each quadruple is of the form [A_long, A_lat, B_long, B_lat]
coordinate_pairs = (bike_paths_mod_df['coordinates']
                    .apply(lambda x: [[i[0], i[1], j[0], j[1]] for i, j in zip(x, x[1:])])
                    .explode().rename('coordinate_pair'))
bike_paths_mod_df = bike_paths_mod_df.join(coordinate_pairs)

## A line with fewer than two points has no segment and explodes to NaN; drop those rows
## so that every remaining 'coordinate_pair' is a real quadruple, then renumber the rows
## so the positional join below lines up.
bike_paths_mod_df = bike_paths_mod_df.dropna(subset=['coordinate_pair']).reset_index(drop=True)

## Split the coordinates quadruples list into separate lat and long columns
coordinate_pair_split = pd.DataFrame(
        bike_paths_mod_df["coordinate_pair"].to_list(), columns=[
            "pointA_longitude",
            "pointA_latitude",
            "pointB_longitude",
            "pointB_latitude"
        ])

## Append the separated lat and long columns (row i of the split belongs to row i of
## the df), then restore the original path label as the index.
bike_paths_mod_df = bike_paths_mod_df.join(coordinate_pair_split).set_index('index')

## Drop superfluous columns
bike_paths_mod_df = bike_paths_mod_df.drop(['coordinates', 'coordinate_pair', 'type'], axis=1)

bike_paths_mod_df.head()

Unnamed: 0_level_0,path_type,path_surf,ave_width,seg_length,suburb,owner,pointA_longitude,pointA_latitude,pointB_longitude,pointB_latitude
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
81,CYCLEPATH,BITUMEN,2.5,24.11545,AMAROO,ROADS_ACT,149.124824,-35.176958,149.124812,-35.177008
81,CYCLEPATH,BITUMEN,2.5,24.11545,AMAROO,ROADS_ACT,149.124812,-35.177008,149.124806,-35.177035
81,CYCLEPATH,BITUMEN,2.5,24.11545,AMAROO,ROADS_ACT,149.124806,-35.177035,149.12481,-35.177061
81,CYCLEPATH,BITUMEN,2.5,24.11545,AMAROO,ROADS_ACT,149.12481,-35.177061,149.124838,-35.177097
81,CYCLEPATH,BITUMEN,2.5,24.11545,AMAROO,ROADS_ACT,149.124838,-35.177097,149.124896,-35.177148


### Wrangling street light data

In [4]:
## Work on a copy, with two extra columns that are filled in later:
## a Boolean flag and the distance to the nearest path (infinite until computed).
street_lights_mod_df = street_lights_df.copy().assign(
    close_to_path=False,
    distance_nearest_path=np.inf,
)

## Suburb names are compared case-insensitively, so lower-case both sides first
bike_path_suburbs = bike_paths_mod_df['suburb'].str.lower().unique()
light_suburb_lower = street_lights_mod_df['suburb'].str.lower()
street_light_suburbs = light_suburb_lower.unique()

## Suburbs that have street lights but no bike path at all...
street_lights_no_bike_paths = list(set(street_light_suburbs).difference(bike_path_suburbs))

## ...and discard every light located in one of them
in_suburb_without_paths = light_suburb_lower.isin(street_lights_no_bike_paths)
street_lights_mod_df = street_lights_mod_df[~in_suburb_without_paths]

## Expand the 'location' dict column into one column per key, replacing the original
location_columns = street_lights_mod_df['location'].apply(pd.Series)
street_lights_mod_df = pd.concat(
    [street_lights_mod_df.drop(columns='location'), location_columns], axis=1)

## The coordinates must be numerical for the distance calculation
for coordinate_column in ("latitude", "longitude"):
    street_lights_mod_df[coordinate_column] = pd.to_numeric(street_lights_mod_df[coordinate_column])

street_lights_mod_df.head()

Unnamed: 0,column_type,:@computed_region_h8vr_r9vc,suburb,address,outreach_arm_length,lamp_count,lamp_type,height,luminaire,column_material,close_to_path,distance_nearest_path,latitude,human_address,needs_recoding,longitude
0,Rigid column,104,WANNIASSA,,,,,,,,False,inf,-35.403386,"{""address"": """", ""city"": """", ""state"": """", ""zip""...",False,149.094254
1,Rigid column,67,HARRISON,,,,,,,,False,inf,-35.206829,"{""address"": """", ""city"": """", ""state"": """", ""zip""...",False,149.14904
2,VICPOLE,89,GUNGAHLIN,HORSE PARK DRIVE,4.5 M SINGLE,1.0,LIGHT EMITTING DIODE,12 metres,SYLVANIA ROAD LED,,False,inf,-35.175655,"{""address"": """", ""city"": """", ""state"": """", ""zip""...",False,149.139648
3,Energy Absorbing column,83,CITY,VERITY LANE BTWN EAST ROW & NTHBOURNE AVE,3.5 M TWIN,2.0,METAL HALIDE,5 metres,SPECIAL,STEEL ORDINARY,False,inf,-35.279123,"{""address"": """", ""city"": """", ""state"": """", ""zip""...",False,149.129941
4,FORDE COLUMN,19,WRIGHT,JOHN GORTON DRIVE,3.0 M SINGLE,1.0,LIGHT EMITTING DIODE,9 metres,PECAN NXT-36S,,False,inf,-35.314926,"{""address"": """", ""city"": """", ""state"": """", ""zip""...",False,149.032875


## Find lights that are close to bike paths

In [18]:
## Find every street light that lies closer than `cutoff` to at least one bike path
## segment, and write those lights with their distance to the nearest segment to
## `common.output_name` as a header-less CSV with the columns
## index (position of the light in `street_lights`), latitude, longitude,
## close_to_path, distance_to_path.

## Cutoff radius in metres (assuming cross_arc_distance_vectorised returns metres).
## Lights at a distance < cutoff will be added to our output.
cutoff = 5

## Number of bike path segments tested against ALL lights per iteration.
## Each iteration builds arrays of len(street_lights) * chunk_size elements, so the
## full lights x segments product never has to fit in RAM. 1 keeps the smallest
## footprint; raise it to trade memory for fewer Python-level iterations.
chunk_size = 1

output_filename = common.output_name
street_lights = np.array(street_lights_mod_df[['close_to_path', 'latitude', 'longitude', 'distance_nearest_path']])
bike_paths = np.array(bike_paths_mod_df[['pointA_longitude', 'pointA_latitude', 'pointB_longitude', 'pointB_latitude']])
total = len(bike_paths)
n_lights = len(street_lights)

## Light positions in degrees (for the output) and in radians (for the distance function).
## These never change, so convert them once instead of once per chunk.
light_lat_array = street_lights[:, 1].astype(float)
light_long_array = street_lights[:, 2].astype(float)
light_lat_rad, light_long_rad = np.deg2rad(light_lat_array), np.deg2rad(light_long_array)

## Segment end points in radians; columns are [A_long, A_lat, B_long, B_lat]
bike_paths_rad = np.deg2rad(bike_paths.astype(float))

## Running minimum distance from every light to any segment seen so far.
## Only distances below the cutoff are recorded; everything else stays infinite.
## Keeping this in memory replaces the append / re-read / de-duplicate round trip
## through the CSV on every iteration.
distance_nearest_path = np.full(n_lights, np.inf)

## range(0, total, ...) covers every segment, including the last one
for start in range(0, total, chunk_size):

    ## Select the chunk. We will test the distance between each light and this chunk.
    bike_paths_iter = bike_paths_rad[start:start + chunk_size]
    n_segments = len(bike_paths_iter)

    ## Pair every light with every segment of the chunk, light-major:
    ## (light 0, seg 0), (light 0, seg 1), ..., (light 1, seg 0), ...
    pointA_long, pointA_lat, pointB_long, pointB_lat = (
        np.tile(bike_paths_iter[:, column], n_lights) for column in range(4))
    pointC_lat = np.repeat(light_lat_rad, n_segments)
    pointC_long = np.repeat(light_long_rad, n_segments)

    ## Calculate the cross-arc distance array between the bike path segments chunk and the light array
    distance = np.asarray(
        cross_arc_distance_vectorised(pointA_lat, pointA_long, pointB_lat, pointB_long, pointC_lat, pointC_long),
        dtype=float)

    ## Keep only distances below the cutoff (NaN compares False and is discarded too),
    ## then fold the chunk into the running per-light minimum.
    distance_array = np.where(distance < cutoff, distance, np.inf).reshape(n_lights, n_segments)
    distance_nearest_path = np.minimum(distance_nearest_path, distance_array.min(axis=1))

    sys.stdout.write("\r" + f"{min(start + chunk_size, total)}/{total}")
    sys.stdout.flush()

## Keep the lights with a finite nearest distance (i.e. closer than the cutoff to some
## segment) and write the result once, replacing any file left over from a previous run.
street_light_close = distance_nearest_path < cutoff
lights_close_to_path_df = pd.DataFrame(
    {
        "latitude": light_lat_array[street_light_close],
        "longitude": light_long_array[street_light_close],
        "close_to_path": True,
        "distance_to_path": distance_nearest_path[street_light_close],
    },
    index=pd.Index(np.flatnonzero(street_light_close), name="index"),
)
lights_close_to_path_df.to_csv(output_filename, header=False)


219/103648

## Which paths are not close to any lights?

In [35]:
## Now that we have a list of lights that are close to bike paths, which bike paths are not close to any light?
## (Deciding which paths are not *sufficiently* lit is a harder question: it probably cannot be answered
## without assumptions about how much light a lamp produces.)