# Retrieve and clean Project Sidewalk data

In [41]:
import requests
import geopandas as gpd
import pandas as pd

## Settings

In [42]:
# Coordinate reference systems used throughout the notebook:
# - CRS is the projected system every layer is converted to with `to_crs`
#   (EPSG:28992, the Dutch national grid "Amersfoort / RD New").
# - CRS_map is the system assigned to the raw Project Sidewalk features when
#   they are loaded (EPSG:4326, WGS 84 latitude/longitude).
CRS = 'epsg:28992'
CRS_map = 'epsg:4326'

## Get Project Sidewalk data

In [43]:
# Bounding box of the region to request; the four values fill the
# lat1, lng1, lat2 and lng2 query parameters of the template below, in that order
amsterdam_bbox_coords = (52.265, 4.73, 52.435, 5.07)

# URL template of the Project Sidewalk endpoint to query
# (point it at another deployment to use different Project Sidewalk data)
base_url = "https://sidewalk-amsterdam.cs.washington.edu/v2/access/attributesWithLabels?lat1={}&lng1={}&lat2={}&lng2={}"

# Final request URL with the bounding box filled in
url = base_url.format(*amsterdam_bbox_coords)

In [44]:
# Retrieve Project Sidewalk data
# `url` already has the bounding box filled in, so it is used as-is.
# The timeout keeps the cell from hanging forever on an unresponsive server, and
# raise_for_status() turns an HTTP error response into an exception here instead of
# a confusing JSON/KeyError failure further down.
response = requests.get(url, timeout=120)
response.raise_for_status()
project_sidewalk_labels = response.json()

### Create sidewalk data geodataframe

In [45]:
# Build a GeoDataFrame from the downloaded GeoJSON features (read as CRS_map)
# and reproject it to the notebook's working CRS in one go
p_sw_gdf = (
    gpd.GeoDataFrame
    .from_features(project_sidewalk_labels['features'], crs=CRS_map)
    .to_crs(CRS)
)

### Get mask of area of interest

In [54]:
# Area granularity to use (buurten/wijken/ggwgebieden/stadsdelen)
area_choice = 'wijken'

# Download the boundaries of all areas at that granularity as GeoJSON
area_url = "https://api.data.amsterdam.nl/v1/gebieden/{}/?_format=geojson".format(area_choice)
gdf_area_raw = gpd.read_file(area_url)

# Keep only the geometry, code and name of each area, reprojected to the working CRS
gdf_area = gdf_area_raw[['geometry', 'code', 'naam']].to_crs(CRS)

### Extract sidewalk data in polygon of area

In [100]:
# Function to get psw data from certain area
def get_psw_df_area(area):
    """Select the Project Sidewalk labels that lie inside one named area.

    Reads the notebook-level `gdf_area` (area polygons) and `p_sw_gdf`
    (Project Sidewalk labels); both are expected to be in the same CRS.

    Parameters
    ----------
    area : str
        Value of the 'naam' column of the area to select. If several areas
        share the name, the first one is used.

    Returns
    -------
    (GeoDataFrame, geometry)
        The rows of `p_sw_gdf` whose geometry lies within the area, and the
        geometry of the area itself.

    Raises
    ------
    IndexError
        If no area with the given name exists in `gdf_area`.
    """
    matching_areas = gdf_area[gdf_area['naam'] == area]
    if matching_areas.empty:
        # Same exception type as the bare `.iloc[0]` lookup, but with a usable message
        raise IndexError("No area named {!r} found in gdf_area".format(area))
    polygon = matching_areas.iloc[0]['geometry']

    # Use the geopandas predicate: it returns a boolean Series aligned with
    # p_sw_gdf's index, rather than relying on shapely accepting a whole
    # GeoSeries as the argument of a single geometry's `contains`.
    sw_data_in_area_mask = p_sw_gdf.within(polygon)
    p_sw_area_gdf = p_sw_gdf.loc[sw_data_in_area_mask]

    return p_sw_area_gdf, polygon

In [101]:
# Select the Project Sidewalk labels inside the 'Osdorp-Midden' area; the area's
# polygon is returned as well and is saved to disk in the next cell
Osdorp_Midden_p_sw_gdf, polygon = get_psw_df_area('Osdorp-Midden')

In [105]:
# Save the polygon, as it is needed in a subsequent notebook
# NOTE(review): the path ends in ".csv" but the file is written with
# GeoDataFrame.to_file, not to_csv - confirm that the resulting file format is
# what the notebook reading this file expects
polygon_path = "../data/polygons/Osdorp polygon.csv"

# Wrap the polygon in a single-row GeoDataFrame tagged with the working CRS
polygon_gdf = gpd.GeoDataFrame([{'geometry': polygon}], crs=CRS)

# Write the GeoDataFrame to disk
polygon_gdf.to_file(polygon_path)

## Clean data

In [68]:
# Function to drop the label attributes that are not needed for the cleaning steps
def drop_rows(gdf):
    """Return `gdf` without the per-label columns that are not used later on.

    Despite its name, this function removes columns, not rows. The input is
    left untouched; a new frame is returned. Every listed column must be
    present in `gdf` (pandas raises a KeyError otherwise).
    """
    unused_columns = [
        'heading', 'label_is_temporary', 'pitch', 'zoom', 'label_id',
        'gsv_panorama_id', 'canvas_x', 'canvas_y', 'canvas_width',
        'street_edge_id', 'canvas_height', 'gsv_url', 'label_date',
        'label_severity', 'neighborhood', 'label_description', 'user_id',
    ]

    return gdf.drop(columns=unused_columns)

In [70]:
# Function to split the annotations into one frame per label type
def separate_annotations(gdf):
    """Split `gdf` by its 'label_type' column.

    Returns a tuple (crosswalks, curbs, missing_curbs) holding the rows whose
    label type is 'Crosswalk', 'CurbRamp' and 'NoCurbRamp' respectively.
    Rows with any other label type are not returned.
    """
    label_types = gdf['label_type']

    def _rows_labelled(label):
        # Rows of gdf carrying exactly this label type
        return gdf[label_types == label]

    return (
        _rows_labelled('Crosswalk'),
        _rows_labelled('CurbRamp'),
        _rows_labelled('NoCurbRamp'),
    )

In [71]:
# Remove the unused columns from the Osdorp-Midden labels
Osdorp_Midden_p_sw_gdf_dropped = drop_rows(Osdorp_Midden_p_sw_gdf)

# Split what is left into crosswalks, curb ramps and missing curb ramps
Osdorp_Midden_CW, Osdorp_Midden_curb, Osdorp_Midden_mCurb = separate_annotations(Osdorp_Midden_p_sw_gdf_dropped)

### Validation filter
We only want annotations that are sufficiently validated. This means we filter out:
- Annotations whose disagree count is equal to or higher than the agree count
- Annotations whose disagree count plus not sure count is equal to or higher than the agree count
- Annotations that have an agree count of zero

In [85]:
# Function to only keep sufficiently validated rows
def validation(gdf):
    """Keep the annotations whose agree count outweighs the doubts about them.

    A row is kept only when its disagree count is strictly lower than its agree
    count AND its disagree count plus not-sure count is strictly lower than its
    agree count. The number of removed rows is printed.

    Returns the filtered frame; `gdf` itself is not modified.
    """
    agree = gdf['agree_count']
    disagree = gdf['disagree_count']
    not_sure = gdf['notsure_count']

    more_agree_than_disagree = disagree < agree
    more_agree_than_doubt = (disagree + not_sure) < agree
    validated = gdf[more_agree_than_disagree & more_agree_than_doubt]

    n_total = len(gdf)
    print("Removed annotations:", n_total - len(validated), "/", n_total)

    return validated

In [91]:
# Apply the validation filter to each label type, in the order
# crosswalks, curb ramps, missing curb ramps (one "Removed annotations" line each)
Osdorp_Midden_CW_validated, Osdorp_Midden_curb_validated, Osdorp_Midden_mCurb_validated = (
    validation(label_gdf)
    for label_gdf in (Osdorp_Midden_CW, Osdorp_Midden_curb, Osdorp_Midden_mCurb)
)

Removed annotations: 38 / 316
Removed annotations: 486 / 1098
Removed annotations: 6 / 27


### Removing duplicates
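The Project Sidewalk data contains duplicate annotations. We filter these out based on their location: annotations with identical geometry are merged into a single row that keeps the attributes of the annotation with the highest agree count and collects the label tags of all merged annotations.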

In [87]:
# Function to remove duplicates
def duplicates(df):
    """Collapse annotations that share the same geometry into a single row.

    Within each geometry the rows are ordered by descending agree count, so the
    scalar attributes of the result come from the most agreed-upon annotation
    ('first' takes the first non-missing value per column). The 'label_tags'
    values of all rows in a group are gathered into one list. The number of
    removed rows is printed.

    Returns a new frame with one row per distinct geometry and 'geometry' as a
    regular column.
    """
    # Put the annotation with the highest agree count first within each geometry
    ordered = df.sort_values(by=['geometry', 'agree_count'], ascending=[True, False])

    # Columns that take their value from the first row of each group
    first_value_columns = [
        'attribute_id', 'label_type', 'osm_street_id', 'severity',
        'is_temporary', 'image_capture_date', 'agree_count',
        'disagree_count', 'notsure_count',
    ]
    aggregation = {column: 'first' for column in first_value_columns}
    # The tags of every row in the group are kept, collected into a list
    aggregation['label_tags'] = lambda tags: list(tags)

    deduplicated = ordered.groupby('geometry').agg(aggregation).reset_index()

    n_before = len(df)
    print("Removed annotations:", n_before - len(deduplicated), "/", n_before)

    return deduplicated

In [92]:
# Remove duplicate annotations per label type, in the order
# crosswalks, curb ramps, missing curb ramps (one "Removed annotations" line each)
(
    Osdorp_Midden_CW_validated_no_dupes,
    Osdorp_Midden_curb_validated_no_dupes,
    Osdorp_Midden_mCurb_validated_no_dupes,
) = (
    duplicates(validated_gdf)
    for validated_gdf in (
        Osdorp_Midden_CW_validated,
        Osdorp_Midden_curb_validated,
        Osdorp_Midden_mCurb_validated,
    )
)

Removed annotations: 172 / 278
Removed annotations: 180 / 612
Removed annotations: 2 / 21


## Save data

In [94]:
# Transform dfs to gdfs
# The CRS is passed explicitly so the resulting GeoDataFrames are tagged with the
# coordinate system the geometries were reprojected to earlier in the notebook,
# instead of depending on whether the groupby result still carries it.
Osdorp_Midden_CW_validated_no_dupes_gdf = gpd.GeoDataFrame(Osdorp_Midden_CW_validated_no_dupes, geometry='geometry', crs=CRS)
Osdorp_Midden_curb_validated_no_dupes_gdf = gpd.GeoDataFrame(Osdorp_Midden_curb_validated_no_dupes, geometry='geometry', crs=CRS)
Osdorp_Midden_mCurb_validated_no_dupes_gdf = gpd.GeoDataFrame(Osdorp_Midden_mCurb_validated_no_dupes, geometry='geometry', crs=CRS)

In [98]:
# Output locations for the cleaned crosswalk, curb ramp and missing curb ramp data
CW_path, curb_path, mCurb_path = (
    "../data/PSW gdfs/Osdorp CW PSW",
    "../data/PSW gdfs/Osdorp curb PSW",
    "../data/PSW gdfs/Osdorp missing curb PSW",
)

In [133]:
# Write each cleaned GeoDataFrame to its output path as CSV
for cleaned_gdf, csv_path in (
    (Osdorp_Midden_CW_validated_no_dupes_gdf, CW_path),
    (Osdorp_Midden_curb_validated_no_dupes_gdf, curb_path),
    (Osdorp_Midden_mCurb_validated_no_dupes_gdf, mCurb_path),
):
    cleaned_gdf.to_csv(csv_path)