# Retrieve and clean Project Sidewalk data

In [1]:
import requests
import geopandas as gpd
import pandas as pd

### Settings

In [2]:
# Coordinate reference systems used throughout the notebook.
# CRS_map is the CRS the Project Sidewalk features are declared in when they are
# loaded; CRS is the CRS every geodataframe is reprojected to afterwards.
CRS_map = "epsg:4326"
CRS = "epsg:28992"

## Get project sidewalk data

In [3]:
# Build the request URL for the Project Sidewalk API.
# The bounding box is passed in the order the query string expects:
# (lat1, lng1, lat2, lng2).
amsterdam_bbox_coords = (52.265, 4.73, 52.435, 5.07)
base_url = (
    "https://sidewalk-amsterdam.cs.washington.edu/v2/access/attributesWithLabels"
    "?lat1={}&lng1={}&lat2={}&lng2={}"
)
url = base_url.format(*amsterdam_bbox_coords)

In [4]:
# Retrieve data
# `url` already has the bounding box filled in, so it is requested as-is
# (formatting it a second time was a no-op).
# The timeout keeps the cell from hanging forever on an unresponsive server, and
# raise_for_status() reports HTTP errors here rather than as a confusing
# JSON-decoding or KeyError failure further down the notebook.
response = requests.get(url, timeout=120)
response.raise_for_status()
project_sidewalk_labels = response.json()

### Create sidewalk data geodataframe

In [5]:
# Turn the downloaded features into a geodataframe declared in CRS_map,
# then reproject it to the working CRS.
p_sw_gdf = (
    gpd.GeoDataFrame
    .from_features(project_sidewalk_labels['features'], crs=CRS_map)
    .to_crs(CRS)
)
p_sw_gdf.head()

Unnamed: 0,geometry,attribute_id,label_type,street_edge_id,osm_street_id,neighborhood,severity,is_temporary,label_id,gsv_panorama_id,...,image_capture_date,label_date,label_severity,label_is_temporary,agree_count,disagree_count,notsure_count,label_tags,label_description,user_id
0,POINT (124044.481 486005.544),468706,CurbRamp,22157,195649386,Zuidwestkwadrant Indische buurt,1.0,False,31039,NBEqPUMDUqKwPhwBf26iiQ,...,2022-04,2023-05-03 08:24:32.511,1.0,False,0,0,0,[],,fae98f96-fccf-459b-a4f3-c3e9126b460e
1,POINT (124042.998 486007.251),468703,CurbRamp,22157,195649386,Zuidwestkwadrant Indische buurt,1.0,False,31042,UNHLw0adBnLFakzcnj5MqA,...,2022-04,2023-05-03 08:25:11.103,1.0,False,0,0,0,[],,fae98f96-fccf-459b-a4f3-c3e9126b460e
2,POINT (124042.727 486010.648),468707,CurbRamp,22157,195649386,Zuidwestkwadrant Indische buurt,1.0,False,31043,UNHLw0adBnLFakzcnj5MqA,...,2022-04,2023-05-03 08:25:18.43,1.0,False,0,0,0,[],,fae98f96-fccf-459b-a4f3-c3e9126b460e
3,POINT (124045.596 486012.328),468705,CurbRamp,22157,195649386,Zuidwestkwadrant Indische buurt,1.0,False,31040,NBEqPUMDUqKwPhwBf26iiQ,...,2022-04,2023-05-03 08:24:53.571,1.0,False,0,0,0,[],,fae98f96-fccf-459b-a4f3-c3e9126b460e
4,POINT (124899.861 485946.349),463042,CurbRamp,3253,7044940,Zuidoostkwadrant Indische buurt,3.0,False,31051,tzozHRKm6OXWhq2HGwA2Bw,...,2022-02,2023-05-04 10:04:11.187,2.0,False,0,0,0,[debris / pooled water],,d64192aa-666b-47fd-85ed-a57042ce4ab5


In [6]:
# Save data 
# p_sw_gdf.to_csv("/home/azureuser/cloudfiles/code/Users/l.vanoorschot/Project sidewalk/saved dataframes/psw Amsterdam df", index=False)

### Get mask of area

In [7]:
# Area granularity to download (buurten/wijken/ggwgebieden/stadsdelen)
area_choice = 'wijken'

# Download the area polygons as GeoJSON
area_url = f"https://api.data.amsterdam.nl/v1/gebieden/{area_choice}/?_format=geojson"
gdf_area_raw = gpd.read_file(area_url)

# Keep only the geometry, code and name columns, reprojected to the working CRS
gdf_area = gdf_area_raw[['geometry', 'code', 'naam']].to_crs(CRS)
gdf_area.head(2)

Unnamed: 0,geometry,code,naam
0,"POLYGON ((121536.251 488898.477, 121550.576 48...",AA,Haarlemmerbuurt
1,"POLYGON ((120675.483 487550.142, 120673.947 48...",AB,Jordaan


### Extract sidewalk data in polygon of area

In [8]:
# Function to get psw data from certain wijk
def get_psw_df(wijk):
    """Return the Project Sidewalk rows that lie inside one area.

    Parameters
    ----------
    wijk : str
        Value of the 'naam' column of `gdf_area` identifying the area.

    Returns
    -------
    GeoDataFrame
        The rows of `p_sw_gdf` whose geometry is within the polygon of the
        first `gdf_area` row carrying that name.

    Raises
    ------
    IndexError
        If no row of `gdf_area` has the requested name.
    """
    matches = gdf_area[gdf_area['naam'] == wijk]
    if matches.empty:
        # Same exception type as the positional lookup below would raise,
        # but with a message that names the actual problem.
        raise IndexError(f"No area named {wijk!r} found in gdf_area")
    polygon = matches.iloc[0]['geometry']

    # point.within(polygon) is equivalent to polygon.contains(point); the
    # GeoSeries method is vectorised and returns an index-aligned boolean mask.
    sw_data_in_area_mask = p_sw_gdf['geometry'].within(polygon)
    p_sw_area_gdf = p_sw_gdf.loc[sw_data_in_area_mask]

    return p_sw_area_gdf

In [9]:
# Extract the sidewalk data for Osdorp-Midden and Venserpolder
Osdorp_df, Venserpolder_df = map(get_psw_df, ("Osdorp-Midden", "Venserpolder"))

In [10]:
# Save data
# Osdorp_df.to_csv("../../data/raw PSW df Osdorp-Midden", index=False)

## Clean data

In [11]:
# Load PSW data to clean
# Reads the raw Project Sidewalk exports back from CSV as plain pandas DataFrames
# (pd.read_csv does not produce GeoDataFrames).
# NOTE(review): presumably the 'geometry' column holds text such as
# "POINT (x y)", which is what the parsing further down expects — verify.
# NOTE(review): the commented-out save cells earlier in this notebook write to
# different paths than the ones read here — confirm these files exist.
psw_Amsterdam_raw = pd.read_csv('../data/PSW dfs/raw PSW df Amsterdam')
psw_Osdorp_raw = pd.read_csv('../data/PSW dfs/raw PSW df Osdorp-Midden')

In [12]:
# Drop the columns that are not needed (for now).
# The same set of columns is removed from both dataframes.
PSW_UNUSED_COLUMNS = [
    'heading', 'pitch', 'zoom',
    'canvas_x', 'canvas_y', 'canvas_width', 'canvas_height',
    'label_id', 'label_is_temporary', 'label_date', 'label_severity', 'label_description',
    'gsv_panorama_id', 'gsv_url',
    'street_edge_id', 'neighborhood', 'user_id',
]
psw_Ams_raw = psw_Amsterdam_raw.drop(columns=PSW_UNUSED_COLUMNS)
psw_Os_raw = psw_Osdorp_raw.drop(columns=PSW_UNUSED_COLUMNS)

In [13]:
# Split each dataframe into one dataframe per annotation type.
# The order of the label types matches the order of the names being assigned.
_label_types = ('Crosswalk', 'CurbRamp', 'NoCurbRamp')

cross_Ams_dupes, curbs_Ams_dupes, mCurbs_Ams_dupes = (
    psw_Ams_raw[psw_Ams_raw['label_type'] == label_type] for label_type in _label_types
)

cross_Os_dupes, curbs_Os_dupes, mCurbs_Os_dupes = (
    psw_Os_raw[psw_Os_raw['label_type'] == label_type] for label_type in _label_types
)

### Validation filter
We only want annotations that are sufficiently validated. This means we filter out:
- Annotations of which the disagree count is higher than or equal to the agree count
- Annotations of which the disagree count plus the not-sure count is higher than or equal to the agree count
- Annotations that have an agree count of zero

In [14]:
# Function to only keep sufficiently validated rows
def validation(df):
    """Return the rows of `df` that are sufficiently validated.

    A row is kept only when all of the following hold:
    - its agree count is strictly greater than its disagree count,
    - its agree count is strictly greater than disagree count + not-sure count,
    - its agree count is not zero.

    Reads the 'agree_count', 'disagree_count' and 'notsure_count' columns.
    """
    agree = df['agree_count']
    disagree = df['disagree_count']
    not_sure = df['notsure_count']

    outvotes_disagree = agree > disagree
    outvotes_doubt = agree > (disagree + not_sure)
    has_agreement = agree != 0

    return df[outvotes_disagree & outvotes_doubt & has_agreement]

In [15]:
# Apply the validation filter to every per-type dataframe
cross_Ams_val, curbs_Ams_val, mCurbs_Ams_val = map(
    validation, (cross_Ams_dupes, curbs_Ams_dupes, mCurbs_Ams_dupes)
)

cross_Os_val, curbs_Os_val, mCurbs_Os_val = map(
    validation, (cross_Os_dupes, curbs_Os_dupes, mCurbs_Os_dupes)
)

### Removing duplicates
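The Project Sidewalk data contains duplicate annotations: several rows can share the same location. We collapse these into one row per location, taking the values from the annotation with the highest agree count and collecting the label tags of all duplicates in a list.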

In [16]:
# Function to remove duplicates
def duplicates(df):
    """Collapse rows that share the same 'geometry' value into one row.

    Rows are ordered by geometry (ascending) and agree count (descending)
    before grouping, so the 'first' value of each column is taken from the
    top of each geometry group. The 'label_tags' values of all rows in a
    group are collected into a list.

    Returns a dataframe with 'geometry' as the first column, followed by the
    aggregated columns in the order listed below.

    NOTE(review): pandas' groupby 'first' appears to skip missing values, so a
    column may be filled from a lower-ranked row when the top row is empty —
    confirm this is acceptable.
    """
    first_value_columns = [
        'attribute_id', 'label_type', 'osm_street_id', 'severity',
        'is_temporary', 'image_capture_date',
        'agree_count', 'disagree_count', 'notsure_count',
    ]
    aggregations = {column: 'first' for column in first_value_columns}
    aggregations['label_tags'] = lambda tags: list(tags)

    ranked = df.sort_values(by=['geometry', 'agree_count'], ascending=[True, False])
    return ranked.groupby('geometry').agg(aggregations).reset_index()

In [17]:
# De-duplicate every validated per-type dataframe
cross_Ams_noDupes, curbs_Ams_noDupes, mCurbs_Ams_noDupes = map(
    duplicates, (cross_Ams_val, curbs_Ams_val, mCurbs_Ams_val)
)

cross_Os_noDupes, curbs_Os_noDupes, mCurbs_Os_noDupes = map(
    duplicates, (cross_Os_val, curbs_Os_val, mCurbs_Os_val)
)

### Add point cloud location

In [18]:
# Function for adding point cloud coordinate labels to df
def add_PC_coordinates(PSW_df, tile_size=50):
    """Return a copy of `PSW_df` with a 'pc_file_name' column added.

    For every row the x and y coordinates of the 'geometry' value are divided
    by `tile_size` and truncated to integers, giving a name of the form
    "filtered_{x}_{y}".

    Parameters
    ----------
    PSW_df : DataFrame
        Must have a 'geometry' column holding either text of the form
        "POINT (x y)" or objects exposing numeric `.x` and `.y` attributes.
    tile_size : number, optional
        Divisor applied to both coordinates (default 50).
        NOTE(review): presumably the edge length of one point cloud tile in
        CRS units — confirm.

    Returns
    -------
    DataFrame
        A new dataframe; the input is left unmodified so that re-running the
        calling cell is idempotent and slices of other dataframes are not
        written to.
    """
    pc_file_names = []

    for geom in PSW_df['geometry']:
        if isinstance(geom, str):
            # Text form, e.g. "POINT (124044.481 486005.544)"
            x_str, y_str = geom.replace("POINT (", "").replace(")", "").split()
            x_coord, y_coord = float(x_str), float(y_str)
        else:
            # Geometry object form (anything with .x / .y)
            x_coord, y_coord = geom.x, geom.y

        # int() truncates towards zero, matching the original naming scheme
        x = int(x_coord / tile_size)
        y = int(y_coord / tile_size)
        pc_file_names.append(f"filtered_{x}_{y}")

    return PSW_df.assign(pc_file_name=pc_file_names)

In [19]:
# Attach the point cloud file name to every de-duplicated dataframe
cross_Ams, curbs_Ams, mCurbs_Ams = map(
    add_PC_coordinates, (cross_Ams_noDupes, curbs_Ams_noDupes, mCurbs_Ams_noDupes)
)

cross_Os, curbs_Os, mCurbs_Os = map(
    add_PC_coordinates, (cross_Os_noDupes, curbs_Os_noDupes, mCurbs_Os_noDupes)
)

### Count annotations

In [20]:
# Report the number of annotations left per type and area
annotation_frames = (
    ("Crossings total:", cross_Ams),
    ("Curbs total:", curbs_Ams),
    ("Missing curbs total:", mCurbs_Ams),
    ("Crossings Osdorp:", cross_Os),
    ("Curbs Osdorp:", curbs_Os),
    ("Missing curbs Osdorp:", mCurbs_Os),
)
for caption, frame in annotation_frames:
    print(caption, len(frame))

Crossings total: 786
Curbs total: 2514
Missing curbs total: 318
Crossings Osdorp: 106
Curbs Osdorp: 432
Missing curbs Osdorp: 18


### Save data

In [21]:
# cross_Ams.to_csv("../../data/clean CW PSW df Amsterdam", index=False)
# cross_Os.to_csv("../../data/clean CW PSW df Osdorp-Midden", index=False)

# curbs_Ams.to_csv("../../data/clean curb PSW df Amsterdam", index=False)
# curbs_Os.to_csv("../../data/clean curb PSW df Osdorp-Midden", index=False)

# mCurbs_Ams.to_csv("../../data/clean missing curb PSW df Amsterdam", index=False)
# mCurbs_Os.to_csv("../../data/clean missing curb PSW df Osdorp-Midden", index=False)

In [22]:
# Stack the per-type dataframes of each area and renumber the rows from zero
Ams_complete = pd.concat([cross_Ams, curbs_Ams, mCurbs_Ams]).reset_index(drop=True)
Os_complete = pd.concat([cross_Os, curbs_Os, mCurbs_Os]).reset_index(drop=True)

In [23]:
# Ams_complete.to_csv("../../data/clean PSW df Amsterdam", index=False)
# Os_complete.to_csv("../../data/clean PSW df Osdorp-Midden", index=False)