In [1]:
import os
from pathlib import Path

In [2]:
from logzero import logger

In [3]:
import geopandas as gpd
import pandas as pd

In [4]:
GEOJSONS = "nl_osm_locations"

In [5]:
!ls locations

ls: cannot access 'locations': No such file or directory


In [6]:
childcare = gpd.read_file(f'{GEOJSONS}/nl_childcare.geojson')
childcare.columns

Index(['id', '@id', 'addr:city', 'addr:country', 'addr:housenumber',
       'addr:postcode', 'addr:province', 'addr:street', 'addr:unit',
       'after_school', 'alt_name', 'amenity', 'branch', 'building:levels',
       'contact:email', 'contact:facebook', 'contact:phone', 'contact:twitter',
       'contact:website', 'description', 'email', 'facebook', 'fee',
       'internet_access', 'kindergarten', 'leisure', 'level', 'max_age',
       'min_age', 'name', 'name:en', 'name:nl', 'name_1', 'name_2', 'name_3',
       'note', 'nursery', 'old_name', 'opening_hours', 'operator',
       'operator:type', 'payment:bitcoin', 'phone', 'phone:mobile',
       'preschool', 'religion', 'social_facility', 'source', 'source:date',
       'start_date', 'survey:date', 'telecom', 'url', 'website', 'wheelchair',
       'geometry'],
      dtype='object')

In [7]:
childcare = gpd.read_file(f'{GEOJSONS}/nl_college.geojson')
childcare.columns

Index(['id', '@id', 'addr:city', 'addr:country', 'addr:housename',
       'addr:housenumber', 'addr:postcode', 'addr:province', 'addr:street',
       'addr:unit', 'alt_name', 'amenity', 'brand', 'brand:wikidata',
       'brand:wikipedia', 'building', 'contact:email', 'contact:facebook',
       'contact:instagram', 'contact:linkedin', 'contact:phone',
       'contact:website', 'created_by', 'description', 'designation', 'email',
       'facebook', 'image', 'internet_access', 'internet_access:fee',
       'isced:level', 'layer', 'max_age', 'min_age', 'name', 'name:en',
       'name:nl', 'note', 'official_name', 'opening_hours', 'operator',
       'operator:type', 'phone', 'source', 'source:date', 'toilets:wheelchair',
       'twitter', 'website', 'wheelchair', 'wheelchair:description',
       'wikidata', 'wikipedia', 'geometry'],
      dtype='object')

**Note**: The column names are inconsistent in data, so I'll create a function to extract unified data from all data frames an return a single union out of it to make it easier to work with. 

In [8]:
def read_geojson(
    name, 
    path=GEOJSONS, 
    ext="geojson",
    columns=['id', 'name', 'addr:postcode', 'geometry'],
    include_name=True,   # include a column containing the file name. Helps with future filtering.
    skip_error=False
):
    file_path = os.path.join(path, f"{name}.{ext}")
    try:
        df = gpd.read_file(file_path)[columns]
    except Exception as e:
        if skip_error:
            logger.warning(f"failed to read: {file_path}")
            logger.warning(e)
            return None
        raise e

    if include_name:
        df['type'] = name
    return df

In [9]:
def read_geojson_multi(names, *args, **kwargs):
    dfs = [read_geojson(name, *args, **kwargs) for name in names]
    return pd.concat(dfs, axis=0, ignore_index=True)

In [10]:
geojsons = [Path(i).stem for i in os.listdir(GEOJSONS)]
geojsons

['nl_kindergarten',
 'nl_childcare',
 'nl_school',
 'nl_college',
 'nl_university',
 'nl_nursing_house',
 'nl_park',
 'nl_gyms']

In [11]:
df = read_geojson_multi(names=geojsons, skip_error=True)
df

Unnamed: 0,id,name,addr:postcode,geometry,type
0,node/290848021,PSZ Het Hummelhonk,,POINT (3.79618 51.22780),nl_kindergarten
1,node/299240246,Mareland,,POINT (4.49153 52.16506),nl_kindergarten
2,node/304623674,Angela's Kroost,,POINT (4.46853 52.19076),nl_kindergarten
3,node/304626605,Het kindercircus,,POINT (4.47969 52.19694),nl_kindergarten
4,node/305501081,De Hobbit,5663ST,POINT (5.54686 51.41163),nl_kindergarten
...,...,...,...,...,...
5445,node/9418182063,Fit-Now!,9231DX,POINT (6.17622 53.17208),nl_gyms
5446,node/9452429013,Vikings gym,6431DH,POINT (5.92217 50.93254),nl_gyms
5447,node/9452717476,CrossFit Hilversum - CrossFit & Personal Training,1216SL,POINT (5.14585 52.21238),nl_gyms
5448,node/9495450226,Dudok Gym,1217ET,POINT (5.16862 52.22747),nl_gyms


Extract `POINT` object:

In [17]:
df['lat'] = df['geometry'].x
df['lon'] = df['geometry'].y
df = df.drop(columns=['geometry'])
df

Unnamed: 0,id,name,addr:postcode,type,lat,lon
0,node/290848021,PSZ Het Hummelhonk,,nl_kindergarten,3.796176,51.227796
1,node/299240246,Mareland,,nl_kindergarten,4.491535,52.165060
2,node/304623674,Angela's Kroost,,nl_kindergarten,4.468531,52.190755
3,node/304626605,Het kindercircus,,nl_kindergarten,4.479686,52.196943
4,node/305501081,De Hobbit,5663ST,nl_kindergarten,5.546865,51.411625
...,...,...,...,...,...,...
5445,node/9418182063,Fit-Now!,9231DX,nl_gyms,6.176224,53.172076
5446,node/9452429013,Vikings gym,6431DH,nl_gyms,5.922174,50.932541
5447,node/9452717476,CrossFit Hilversum - CrossFit & Personal Training,1216SL,nl_gyms,5.145847,52.212379
5448,node/9495450226,Dudok Gym,1217ET,nl_gyms,5.168618,52.227471


In [12]:
df.to_csv("nl_osm_locations.csv")