In [1]:
import os
from pathlib import Path

In [2]:
from logzero import logger

In [3]:
import geopandas as gpd
import pandas as pd



In [4]:
# Directory holding the GeoJSON files read by this notebook; used both for
# building file paths and for listing the available datasets.
GEOJSONS = "nl_osm_locations"

In [5]:
# Load one dataset and show its columns to see which fields are available.
childcare = gpd.read_file(f'{GEOJSONS}/nl_childcare.geojson')
childcare.columns

Index(['id', '@id', 'addr:city', 'addr:country', 'addr:housenumber',
       'addr:postcode', 'addr:province', 'addr:street', 'addr:unit',
       'after_school', 'alt_name', 'amenity', 'branch', 'building:levels',
       'contact:email', 'contact:facebook', 'contact:phone', 'contact:twitter',
       'contact:website', 'description', 'email', 'facebook', 'fee',
       'internet_access', 'kindergarten', 'leisure', 'level', 'max_age',
       'min_age', 'name', 'name:en', 'name:nl', 'name_1', 'name_2', 'name_3',
       'note', 'nursery', 'old_name', 'opening_hours', 'operator',
       'operator:type', 'payment:bitcoin', 'phone', 'phone:mobile',
       'preschool', 'religion', 'social_facility', 'source', 'source:date',
       'start_date', 'survey:date', 'telecom', 'url', 'website', 'wheelchair',
       'geometry'],
      dtype='object')

In [6]:
# Load the college dataset under its own name so that `childcare` keeps
# referring to the childcare data instead of being silently overwritten.
college = gpd.read_file(f'{GEOJSONS}/nl_college.geojson')
college.columns

Index(['id', '@id', 'addr:city', 'addr:country', 'addr:housename',
       'addr:housenumber', 'addr:postcode', 'addr:province', 'addr:street',
       'addr:unit', 'alt_name', 'amenity', 'brand', 'brand:wikidata',
       'brand:wikipedia', 'building', 'contact:email', 'contact:facebook',
       'contact:instagram', 'contact:linkedin', 'contact:phone',
       'contact:website', 'created_by', 'description', 'designation', 'email',
       'facebook', 'image', 'internet_access', 'internet_access:fee',
       'isced:level', 'layer', 'max_age', 'min_age', 'name', 'name:en',
       'name:nl', 'note', 'official_name', 'opening_hours', 'operator',
       'operator:type', 'phone', 'source', 'source:date', 'toilets:wheelchair',
       'twitter', 'website', 'wheelchair', 'wheelchair:description',
       'wikidata', 'wikipedia', 'geometry'],
      dtype='object')

**Note**: The column names are inconsistent across the data files, so I'll create a function that extracts a unified set of columns from each file and returns a single combined data frame, which is easier to work with.

In [7]:
def read_geojson(
    name,
    path=GEOJSONS,
    ext="geojson",
    columns=('id', 'name', 'addr:postcode', 'geometry'),
    include_name=True,   # include a column containing the file name. Helps with future filtering.
    skip_error=False
):
    """Read one GeoJSON file and keep only the requested columns.

    Parameters
    ----------
    name : str
        File name without extension. Also stored in the ``type`` column when
        ``include_name`` is true.
    path : str
        Directory that contains the file.
    ext : str
        File extension, without the leading dot.
    columns : sequence of str
        Columns to select from the file. Any list/tuple of column names is
        accepted; a plain string is passed through to the indexer unchanged.
    include_name : bool
        If true, add a ``type`` column whose value is ``name`` on every row.
    skip_error : bool
        If true, any exception raised while reading the file or selecting the
        columns is logged as a warning and ``None`` is returned instead.

    Returns
    -------
    The selected columns of what ``gpd.read_file`` returned, or ``None`` when
    reading failed and ``skip_error`` is true.

    Raises
    ------
    Exception
        Whatever ``gpd.read_file`` or the column selection raised, when
        ``skip_error`` is false.
    """
    file_path = os.path.join(path, f"{name}.{ext}")
    # Normalise tuples/other sequences to a list so they are interpreted as a
    # list of column names rather than a single (tuple) key.
    selected = columns if isinstance(columns, str) else list(columns)
    try:
        # .copy() detaches the selection from the full frame so that adding
        # the 'type' column below does not write into a slice.
        df = gpd.read_file(file_path)[selected].copy()
    except Exception as e:
        if skip_error:
            logger.warning(f"failed to read: {file_path}")
            logger.warning(e)
            return None
        # Bare raise re-raises the active exception with its original traceback.
        raise

    if include_name:
        df['type'] = name
    return df

In [8]:
def read_geojson_multi(names, *args, **kwargs):
    """Read several GeoJSON files and stack them into one frame.

    Parameters
    ----------
    names : iterable of str
        File names (without extension) passed one by one to ``read_geojson``.
    *args, **kwargs
        Forwarded unchanged to every ``read_geojson`` call.

    Returns
    -------
    The row-wise concatenation of all frames that were read, with a fresh
    0..n-1 index.

    Raises
    ------
    ValueError
        If no frame could be read at all (``names`` is empty, or every call
        to ``read_geojson`` returned ``None``).
    """
    frames = []
    skipped = []
    for name in names:
        frame = read_geojson(name, *args, **kwargs)
        # read_geojson returns None for files it was told to skip on error;
        # keep track of them explicitly instead of relying on pd.concat
        # dropping None entries silently.
        if frame is None:
            skipped.append(name)
        else:
            frames.append(frame)

    if not frames:
        raise ValueError(
            f"no GeoJSON data could be read (skipped: {skipped!r})"
        )
    return pd.concat(frames, axis=0, ignore_index=True)

In [9]:
# os.listdir returns every directory entry (hidden files, checkpoint folders,
# ...), so keep only the *.geojson files; their stems are the dataset names
# that read_geojson expects.
geojsons = [
    Path(i).stem
    for i in os.listdir(GEOJSONS)
    if Path(i).suffix == ".geojson"
]
geojsons

['nl_sports_centre',
 'nl_sports_hall',
 'nl_kindergarten',
 'nl_childcare',
 'nl_fitness_station',
 'nl_pitch',
 'nl_school',
 'nl_swimming_pool',
 'nl_stadium',
 'nl_college',
 'nl_fitness_centre',
 'nl_university',
 'nl_nursing_house',
 'nl_park',
 'nl_golf_course',
 'nl_horse_riding',
 'nl_track']

In [10]:
# Combine every discovered dataset into one frame. skip_error=True is
# forwarded to read_geojson, which then logs a warning and returns None for a
# file it cannot read instead of raising.
df = read_geojson_multi(names=geojsons, skip_error=True)
df

Unnamed: 0,id,name,addr:postcode,geometry,type
0,node/26306647,De Drietip,,POINT (5.55069 51.47122),nl_sports_centre
1,node/30780468,Snellerpoort,,POINT (4.90902 52.08953),nl_sports_centre
2,node/31257636,Star,,POINT (4.89228 52.08340),nl_sports_centre
3,node/34045101,Sport- en Leisurepark Strijthagen,,POINT (6.02307 50.88430),nl_sports_centre
4,node/34046657,De Hoge Devel,,POINT (4.62403 51.81923),nl_sports_centre
...,...,...,...,...,...
9180,node/7653181888,Diomedon,,POINT (4.32363 51.58049),nl_track
9181,node/7656252182,ARSV Thor,,POINT (4.48228 51.51143),nl_track
9182,node/7856570194,Bikepark Gorinchem,,POINT (4.96505 51.84758),nl_track
9183,node/8625065980,Pumptrackbaan Drielanden Harderwijk,,POINT (5.59042 52.32917),nl_track


Extract latitude and longitude from the `POINT` geometry:

In [11]:
# Guarded so the cell can be re-run safely: once 'geometry' has been dropped
# there is nothing left to extract and the frame is left untouched.
if 'geometry' in df.columns:
    df['lat'] = df['geometry'].y   # y coordinate of the point = latitude
    df['lon'] = df['geometry'].x   # x coordinate of the point = longitude
    df = df.drop(columns=['geometry'])
df

Unnamed: 0,id,name,addr:postcode,type,lat,lon
0,node/26306647,De Drietip,,nl_sports_centre,51.471218,5.550687
1,node/30780468,Snellerpoort,,nl_sports_centre,52.089531,4.909018
2,node/31257636,Star,,nl_sports_centre,52.083400,4.892285
3,node/34045101,Sport- en Leisurepark Strijthagen,,nl_sports_centre,50.884301,6.023069
4,node/34046657,De Hoge Devel,,nl_sports_centre,51.819226,4.624031
...,...,...,...,...,...,...
9180,node/7653181888,Diomedon,,nl_track,51.580488,4.323628
9181,node/7656252182,ARSV Thor,,nl_track,51.511428,4.482278
9182,node/7856570194,Bikepark Gorinchem,,nl_track,51.847583,4.965053
9183,node/8625065980,Pumptrackbaan Drielanden Harderwijk,,nl_track,52.329166,5.590416


In [12]:
# Persist the combined table. With pandas' default index=True the row index is
# written as the first, unnamed CSV column.
df.to_csv("nl_osm_locations.csv")