# Scrape and parse reference data

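This notebook scrapes and parses the necessary reference data: mainly BGT-like data (BGT is the Dutch *Basisregistratie Grootschalige Topografie*, the large-scale topographic base map) in the form of points and polygons.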

In [None]:
# Add project src to path.
import set_path

import shapely.geometry as sg
import pandas as pd
import geopandas as gpd
from tqdm.notebook import tqdm
tqdm.pandas()

import upcp.scrapers.ams_bgt_scraper as ams_bgt_scraper
import upcp.utils.las_utils as las_utils
import upcp.utils.csv_utils as csv_utils

import upc_sw.poly_utils as poly_utils

In [None]:
# Helper functions

def get_bgt_data_for_bbox(bbox, layers):
    """Scrape and parse the BGT polygons of every requested layer inside a bounding box.

    Args:
        bbox: Bounding box, passed on unchanged to the Amsterdam BGT scraper.
        layers: Iterable of BGT layer names; each layer is scraped separately.

    Returns:
        A new list with the parsed polygon records of all layers, in layer order.
    """
    parsed_records = []
    for layer_name in layers:
        # The Amsterdam WFS answers with a json response, which is then parsed
        # into polygon records.
        wfs_response = ams_bgt_scraper.scrape_amsterdam_bgt(layer_name, bbox=bbox)
        parsed_records.extend(ams_bgt_scraper.parse_polygons(wfs_response))
    return parsed_records

def process_tiles(tiles, bgt_layers, out_file, csv_headers):
    """Scrape BGT data tile by tile, i.e. precisely for the needed area, and write it to CSV.

    Args:
        tiles: Iterable of tile codes to scrape data for.
        bgt_layers: BGT layer names to scrape for every tile.
        out_file: Path of the CSV file to write.
        csv_headers: Column names passed to the CSV writer.
    """
    collected = []

    progress = tqdm(tiles, unit='tile', smoothing=0)
    for code in progress:
        # Show the tile that is currently being scraped next to the progress bar.
        progress.set_postfix_str(code)

        tile_bbox = las_utils.get_bbox_from_tile_code(code, padding=0)
        collected += get_bgt_data_for_bbox(tile_bbox, bgt_layers)

    # All tiles end up in one single CSV file.
    csv_utils.write_csv(out_file, collected, csv_headers)

def process_folder(folder, bgt_layers, out_file, csv_headers):
    """Scrape BGT data for the bounding box of all point cloud tiles in a folder.

    The bounding box encloses every tile in the folder, so some unnecessary
    data is scraped as well. In return this is much faster when the folder
    contains many files and / or the tiles are densely packed within the
    bounding box.

    Args:
        folder: Folder containing the point cloud files.
        bgt_layers: BGT layer names to scrape.
        out_file: Path of the CSV file to write.
        csv_headers: Column names passed to the CSV writer.
    """
    folder_bbox = las_utils.get_bbox_from_las_folder(folder, padding=0)

    # Scrape everything inside the bounding box and write it to CSV in one go.
    csv_utils.write_csv(out_file,
                        get_bgt_data_for_bbox(folder_bbox, bgt_layers),
                        csv_headers)

In [None]:
### SETTINGS ###

# Tile codes of the individual tiles to scrape reference data for.
tiles = [
    '2386_9702',
]

# Alternatively, derive the tile codes from all point clouds in a given folder:
# pc_folder = '../datasets/pointclouds/run1/'
# tiles = las_utils.get_tilecodes_from_folder(pc_folder)

# Folder in which the output files of this notebook are stored.
bgt_folder = '../datasets/bgt/'

## 1. Sidewalk polygons

We first scrape sidewalk ("voetpad") polygons from the Amsterdam BGT API. The documentation of this data source can be found at:
- https://www.amsterdam.nl/stelselpedia/bgt-index/producten-bgt/prodspec-bgt-dgn-imgeo

In [None]:
# BGT layer to scrape, and the column names of the resulting CSV.
bgt_layers = ['BGT_WGL_voetpad']
csv_headers = ['bgt_name', 'polygon', 'x_min', 'y_max', 'x_max', 'y_min']

# CSV file (inside the output folder) to which the scraped polygons are written.
bgt_data_file = f'{bgt_folder}bgt_voetpad.csv'

In [None]:
# Scrape the tile(s) listed in `tiles` one by one and write the result to CSV.
process_tiles(tiles=tiles, bgt_layers=bgt_layers,
              out_file=bgt_data_file, csv_headers=csv_headers)

In [None]:
# Alternative: scrape the bounding box of all point cloud files in a folder.
folder = '../datasets/pointclouds/run1/'
process_folder(folder=folder, bgt_layers=bgt_layers,
               out_file=bgt_data_file, csv_headers=csv_headers)

## 2. "Terras" polygons

We use these to mark "horeca" (hotel, restaurant and café) terraces as static obstacles.

See https://data.amsterdam.nl/datasets/GsY50tEkoJKCGw/

In [None]:
# Local file in which the downloaded terrace database is stored.
terras_database = 'temp_terras_shapes.csv'

# CSV file (inside the output folder) to which the terrace polygons are written.
terras_data_file = f'{bgt_folder}terras_polygons.csv'

In [None]:
# Download the terrace database with the standard library instead of shelling
# out to `wget`. This also works on systems without wget, and a failed download
# raises (urllib.error.URLError / HTTPError) instead of leaving an empty or
# stale file behind for the next cell to read.
import shutil
import urllib.request

terras_url = 'https://api.data.amsterdam.nl/dcatd/datasets/GsY50tEkoJKCGw/purls/4'
with urllib.request.urlopen(terras_url) as response, \
        open(terras_database, 'wb') as download:
    # Stream the response to disk so it never has to fit in memory as a whole.
    shutil.copyfileobj(response, download)

In [None]:
# Load the downloaded terrace database.
df = pd.read_csv(terras_database)

# Every 'terrasgeometrie' value consists of ';'-separated parts, of which the
# second one is parsed as WKT below (presumably the first part is an SRID
# prefix - TODO confirm against the data).
# The function is applied to that single column rather than row-wise to the
# whole frame: this avoids building a Series object per row and always yields
# a Series, also when the table is empty.
df['geometry'] = df['terrasgeometrie'].progress_apply(lambda value: value.split(';')[1])

# Build a GeoDataFrame in which every shape is named 'terras'.
terras_gpd = gpd.GeoDataFrame({'name': ['terras']*len(df),
                               'geometry': gpd.GeoSeries.from_wkt(df['geometry'])},
                              geometry='geometry', crs='wgs84')
# Reproject from WGS84 to EPSG:28992.
terras_gpd.to_crs(epsg='28992', inplace=True)
# Drop the reference to the raw table; only the GeoDataFrame is used from here on.
df = None

In [None]:
# Fix invalid polygons: run poly_utils.fix_invalid on every geometry (showing a
# progress bar) and store the results back in the geometry column.
terras_gpd['geometry'] = terras_gpd['geometry'].progress_apply(poly_utils.fix_invalid)

In [None]:
from shapely.ops import unary_union

# NOTE(review): not used in this cell; kept in case a later cell relies on it.
terras_data = []

# Merge the tile polygons with unary_union instead of wrapping them in a
# MultiPolygon: neighbouring tiles share an edge, and a MultiPolygon whose
# members touch along a line is not a valid geometry, which makes spatial
# predicates such as `intersects` unreliable.
target_area = unary_union([poly_utils.tilecode_to_poly(tilecode) for tilecode in tiles])

# Keep only the terraces that intersect the area covered by the tiles.
terras_gpd = terras_gpd[terras_gpd.intersects(target_area)]

# Write to CSV
terras_gpd.to_csv(terras_data_file, index=False)

## 3. BGT obstacle points

In [None]:
# Column names of the CSV with point data.
csv_headers = ['bgt_type', 'x', 'y']

# BGT layers to scrape (the traffic sign layer is not among them).
bgt_layers = [
    'BGTPLUS_PAL_lichtmast',
    'BGTPLUS_VGT_boom',
]

In [None]:
# The notebook's import cell does not provide the NDW scraper, so import it here.
# NOTE(review): module path assumed to mirror upcp.scrapers.ams_bgt_scraper - confirm.
import upcp.scrapers.ndw_scraper as ndw_scraper

# Start the list of point objects, unless an earlier cell has created it already.
if 'bgt_point_objects_csv' not in globals():
    bgt_point_objects_csv = []

# Scrape traffic sign data from NDW. The municipality code of Amsterdam is GM0363.
json_content = ndw_scraper.scrape_ndw(town_code='GM0363')

# Parse the downloaded json response once per tile. No notebook-level `bbox`
# exists, so the bounding box is derived from each tile code in `tiles`.
for tilecode in tiles:
    bbox = las_utils.get_bbox_from_tile_code(tilecode, padding=0)
    parsed_content = ndw_scraper.parse_traffic_signs(json_content, bbox)
    bgt_point_objects_csv += parsed_content

In [None]:
# Read the 'obstakels_totaal_2018' GeoPackage file from the datasets folder.
obst_2018 = gpd.read_file('../datasets/obstakels_totaal_2018.gpkg')