# OpenStreetMap Features

In this notebook we extract features from OpenStreetMap (OSM) using the [ohsome API](https://docs.ohsome.org/ohsome-api/v1/), which lets us query the data as it was at a specific point in time. A major advantage is that we don't need to process the OSM data locally, which can be painful. Note that a full run can take up to 2 hours.

The following features will be extracted:
- Buildings
- Amenities
- Roads

In [1]:
from ohsome import OhsomeClient
from tqdm import tqdm
import geopandas as gpd
import os
import pandas as pd

In [2]:
# Shared ohsome API client; the queries in the cells below are all issued through it.
client = OhsomeClient()

We need to create a buffer around each cluster. Each cluster covers 6.74km x 6.74km, so we need to create an area of the same size. The buffer distance has to be half of the side length, since it acts as the radius. We do the CRS conversion so that we don't have to calculate the buffer manually (which is not trivial).

In [3]:
# Half the side length (in metres) of the square window drawn around every cluster.
# NOTE(review): the markdown above quotes a 6.74 km side, which would give 3370 m —
# confirm which of the two values is intended.
BUFFER_RADIUS_M = 3360

# Read the table with pandas instead of relying on how a geospatial reader treats a
# geometry-less CSV. Every column is kept as text so identifiers and years keep their
# on-disk representation; only the coordinates are converted to numbers.
clusters = pd.read_csv("../../data/lsms/processed/_all_nominal.csv", dtype=str)
gdf = gpd.GeoDataFrame(
    clusters,
    geometry=gpd.points_from_xy(clusters["lon"].astype(float), clusters["lat"].astype(float)),
    crs="EPSG:4326",
)

# Buffer in a metric CRS, then convert the resulting squares back to lon/lat.
# cap_style=3 requests square caps, i.e. a box instead of a circle around each point.
# The point geometry itself stays in EPSG:4326 and is not round-tripped.
# NOTE(review): EPSG:3857 stretches distances away from the equator, so the box is
# somewhat smaller than BUFFER_RADIUS_M on the ground at higher latitudes — confirm
# this is acceptable for the study area.
gdf["buffer"] = gdf.to_crs(3857).geometry.buffer(BUFFER_RADIUS_M, cap_style=3).to_crs(4326)

  return GeometryArray(vectorized.points_from_xy(x, y, z), crs=crs)


## Building Features

In [4]:
# ohsome filter expressions, one per building-related feature family.
filter_building = "building=*"
filter_residential = " or ".join([
    "residential=*",
    "building=residential",
    "abutters=residential",
    "construction=residential",
    "landuse=residential",
])
filter_industry = "industry=*"
# NOTE(review): filter_commercial is defined but is not part of the queried filters
# below, so no commercial_* columns are produced — confirm whether that is intended.
filter_commercial = " or ".join([
    "landuse=commercial",
    "building=commercial",
    "building=office",
])
filter_education = " or ".join([
    "amenity=school",
    "amenity=kindergarten",
    "amenity=university",
    "amenity=college",
    "landuse=education",
])
filter_health = " or ".join([
    "healthcare=*",
    "amenity=doctors",
    "amenity=hospital",
    "amenity=pharmacy",
])

# Column-name prefix -> filter expression, in the order the features are queried.
_building_filter_by_key = {
    "building": filter_building,
    "residential": filter_residential,
    "industry": filter_industry,
    "education": filter_education,
    "health": filter_health,
}
filter_buildings = list(_building_filter_by_key.values())
keys_building = list(_building_filter_by_key)


In [5]:
# Aggregation endpoints used for the building features, keyed by the suffix that
# ends up in the output column names.
_building_endpoints = {
    "count": client.elements.count.groupByBoundary,
    "area": client.elements.area.groupByBoundary,
    "density": client.elements.area.density.groupByBoundary,
}
count_func, area_func, area_dens_func = _building_endpoints.values()
keys_func = list(_building_endpoints)
funcs = list(_building_endpoints.values())

In [6]:
def process_response(resp, value_name):
    """Flatten a grouped response frame into an ``id`` / value table.

    Args:
        resp: DataFrame with a ``value`` column whose index entries are
            indexable; element 0 of each entry is used as the id.
        value_name: Name given to the value column of the result.

    Returns:
        DataFrame with the columns ``id`` and ``value_name``, one row per
        row of ``resp``.
    """
    boundary_ids = [index_key[0] for index_key in resp.value.keys()]
    return pd.DataFrame({"id": boundary_ids, value_name: resp.value.to_list()})


In [7]:
def extract_buildings(bboxes, year, country):
    """Query all building features for one survey and write them to a CSV file.

    Every filter in ``filter_buildings`` is sent to every endpoint in ``funcs``;
    each response becomes one ``<key>_<func_key>`` column, joined on ``id``.

    Args:
        bboxes: Mapping of cluster id to bounding box, passed to the API as is.
        year: Survey year; the data is queried as of ``<year>-12-31``.
        country: Survey country code, used in the output file name.

    Writes:
        ``../../data/osm_features/<country>_<year>_buildings.csv``
    """
    df_full = None
    # `osm_filter` instead of `filter`, so the builtin is not shadowed.
    for key, osm_filter in zip(keys_building, filter_buildings):
        for func_key, func in zip(keys_func, funcs):
            response = func.post(bboxes=bboxes, time=f"{year}-12-31", filter=osm_filter).as_dataframe()
            processed_response = process_response(response, f"{key}_{func_key}")

            # The first table seeds the result and later ones are joined on the
            # cluster id. DataFrame.append is avoided because it no longer
            # exists in pandas 2.x.
            if df_full is None:
                df_full = processed_response
            else:
                df_full = df_full.merge(processed_response, on="id")

    if df_full is None:
        # Nothing was queried (empty filter/endpoint lists): still write a file.
        df_full = pd.DataFrame()
    df_full.to_csv(f"../../data/osm_features/{country}_{year}_buildings.csv", index=False)

In [8]:
# Query the building features survey by survey; a survey whose CSV already exists
# is skipped, so an interrupted run can be resumed.
surveys = gdf.groupby(["country", "year"]).groups.keys()
for country, year in tqdm(surveys, total=len(surveys)):
    if not os.path.exists(f"../../data/osm_features/{country}_{year}_buildings.csv"):
        in_survey = (gdf['country'] == country) & (gdf['year'] == year)
        subset_df = gdf[in_survey].reset_index(drop=True)
        # The four bounds of each buffer polygon are passed on in their original order.
        bboxes = {item.id: list(item.buffer.bounds) for _, item in subset_df.iterrows()}

        extract_buildings(bboxes, year, country)

100%|██████████| 10/10 [00:00<00:00, 20001.45it/s]


In [10]:
# Combine the per-survey building CSVs into a single file.
# The frames are collected first and concatenated once: growing a frame with
# pd.concat inside the loop re-copies all previous rows on every iteration and
# starts from an empty frame, which pandas warns about when inferring dtypes.
building_frames = [
    pd.read_csv(f"../../data/osm_features/{country}_{year}_buildings.csv")
    for country, year in tqdm(surveys, total=len(surveys))
]
# pd.concat raises on an empty list, so fall back to an empty frame.
total_df = pd.concat(building_frames) if building_frames else pd.DataFrame()
total_df.to_csv("../../data/osm_features/_all_buildings.csv", index=False)

100%|██████████| 10/10 [00:00<00:00, 333.38it/s]


## POI Features

In [11]:
# POI names that become columns of the POI feature table. A name is matched
# against the value part of the `amenity=*` tags returned by the API.
# NOTE(review): some entries (e.g. "toilet", "tourist_info", "comms_tower") look like
# class names from a pre-processed OSM extract rather than raw `amenity` values; such
# entries would always come out as 0 — confirm against the OSM tagging scheme.
pois = {
    "monument", "kindergarten", "town_hall", "stadium", "optician",
    "post_box", "laundry", "playground", "computer_shop", "outdoor_shop",
    "florist", "prison", "atm", "mall", "camp_site",
    "gift_shop", "community_centre", "veterinary", "greengrocer", "bar",
    "sports_centre", "university", "jeweller", "bank", "mobile_phone_shop",
    "camera_surveillance", "drinking_water", "pitch", "track", "toilet",
    "water_tower", "chalet", "car_rental", "dentist", "furniture_shop",
    "artwork", "beauty_shop", "library", "tourist_info", "park",
    "viewpoint", "motel", "graveyard", "hospital", "comms_tower",
    "shelter", "hostel", "beverages", "public_building", "museum",
    "swimming_pool", "kiosk", "college", "hairdresser", "attraction",
    "water_well", "bookshop", "recycling", "pharmacy", "sports_shop",
    "cafe", "theatre", "guesthouse", "stationery", "picnic_site",
    "clothes", "pub", "hotel", "nightclub", "fire_station",
    "cinema", "restaurant", "waste_basket", "shoe_shop", "bicycle_shop",
    "police", "school", "butcher", "doityourself", "chemist",
    "car_wash", "telephone", "car_dealership", "toy_shop", "fast_food",
    "food_court", "tower", "bakery", "memorial", "others",
    "supermarket", "post_office", "courthouse", "doctors", "convenience",
    "embassy", "bench", "department_store", "travel_agent", "fountain",
    "water_works",
}

In [12]:
def _amenity_counts_by_boundary(resp_pois, wanted):
    """Group the flat (boundary, tag) response rows into one dict per boundary.

    Args:
        resp_pois: Response frame with a ``value`` column; element 0 of each
            index entry is the boundary id, element 1 the tag string.
        wanted: Collection of normalised POI names to keep.

    Returns:
        Dict mapping each boundary id (in order of first appearance) to a
        ``{poi_name: value}`` dict holding only names from ``wanted``.
    """
    counts = {}
    for index_key, value in zip(resp_pois.value.keys(), resp_pois.value.to_list()):
        boundary, tag = index_key[0], index_key[1]
        # Look up the boundary before the tag is handled, so a row is always
        # credited to its own boundary, and a boundary without any wanted tag
        # still gets an (all-zero) row.
        per_boundary = counts.setdefault(boundary, {})
        poi = tag.replace("amenity=", "").replace(" ", "_").lower()
        if poi in wanted:
            # When several raw tags normalise to the same name, keep the first.
            per_boundary.setdefault(poi, value)
    return counts


def get_pois(bboxes, year, country):
    """Count the amenities of interest per cluster and write them to a CSV file.

    The result has one row per boundary, one column per entry of ``pois``
    (0 where the API reported nothing for that name) and a trailing ``id``
    column. POI columns are written in sorted order so the layout does not
    depend on set iteration order.

    Args:
        bboxes: Mapping of cluster id to bounding box, passed to the API as is.
        year: Survey year; the data is queried as of ``<year>-12-31``.
        country: Survey country code, used in the output file name.

    Writes:
        ``../../data/osm_features/<country>_<year>_pois.csv``
    """
    resp_pois = client.elements.count.groupByBoundary.groupByTag.post(
        bboxes=bboxes, time=f"{year}-12-31", filter="amenity=*", groupByKey="amenity"
    ).as_dataframe()
    counts = _amenity_counts_by_boundary(resp_pois, pois)

    pois_dic = {
        poi: [per_boundary.get(poi, 0) for per_boundary in counts.values()]
        for poi in sorted(pois)
    }
    pois_dic["id"] = list(counts)

    pd.DataFrame(pois_dic).to_csv(f"../../data/osm_features/{country}_{year}_pois.csv", index=False)

In [13]:
# Query the POI features survey by survey; a survey whose CSV already exists is
# skipped, so an interrupted run can be resumed.
surveys = gdf.groupby(["country", "year"]).groups.keys()
for country, year in tqdm(surveys, total=len(surveys)):
    print(country, year)
    if not os.path.exists(f"../../data/osm_features/{country}_{year}_pois.csv"):
        in_survey = (gdf['country'] == country) & (gdf['year'] == year)
        subset_df = gdf[in_survey].reset_index(drop=True)
        # The four bounds of each buffer polygon are passed on in their original order.
        bboxes = {item.id: list(item.buffer.bounds) for _, item in subset_df.iterrows()}

        get_pois(bboxes, year, country)

100%|██████████| 10/10 [00:00<00:00, 20058.84it/s]

ETH 2013
ETH 2015
ETH 2018
MW 2016
MW 2019
NG 2012
NG 2015
NG 2018
TZA 2012
TZA 2014





In [15]:
# Combine the per-survey POI CSVs into a single file.
# The frames are collected first and concatenated once: growing a frame with
# pd.concat inside the loop re-copies all previous rows on every iteration and
# starts from an empty frame, which pandas warns about when inferring dtypes.
# Columns are aligned by name, so files with a different column order are fine.
poi_frames = [
    pd.read_csv(f"../../data/osm_features/{country}_{year}_pois.csv")
    for country, year in tqdm(surveys, total=len(surveys))
]
# pd.concat raises on an empty list, so fall back to an empty frame.
total_df = pd.concat(poi_frames) if poi_frames else pd.DataFrame()
total_df.to_csv("../../data/osm_features/_all_pois.csv", index=False)

100%|██████████| 10/10 [00:00<00:00, 83.68it/s]


## Road Features

In [16]:
# NOTE(review): this module-level frame is not read by extract_road_features, which
# builds its own local `df_roads` — it looks like a leftover; confirm before removing.
df_roads = pd.DataFrame()

### Fine Road

In [17]:
# Road classes that get their own count / length / density columns. A name is
# matched against the value part of the `highway=*` tags returned by the API.
# NOTE(review): "intersection" does not look like a common `highway` value — confirm.
road_filters = {
    "residential",
    "track",
    "living_street",
    "trunk",
    "primary",
    "secondary",
    "tertiary",
    "service",
    "pedestrian",
    "intersection",
}

In [18]:
# Tag-grouped aggregation endpoints used for the per-road-class features, keyed by
# the prefix that ends up in the output column names.
# Note: `count_func` is rebound here; the `funcs` list built for the building
# features keeps the endpoint object it was created with.
_road_endpoints = {
    "count": client.elements.count.groupByBoundary.groupByTag,
    "length": client.elements.length.groupByBoundary.groupByTag,
    "density": client.elements.length.density.groupByBoundary.groupByTag,
}
count_func, len_func, dens_func = _road_endpoints.values()
road_func_keys = list(_road_endpoints)
road_funcs = list(_road_endpoints.values())

In [20]:
def _road_values_by_boundary(resp_road, prefix, wanted):
    """Pivot a flat (boundary, tag) response into one row per boundary.

    Args:
        resp_road: Response frame with a ``value`` column; element 0 of each
            index entry is the boundary id, element 1 the tag string.
        prefix: Prefix of the generated column names (``<prefix>_<road_type>``).
        wanted: Collection of normalised road types to keep.

    Returns:
        DataFrame with an ``id`` column followed by one column per wanted road
        type (sorted by name), 0 where the response has no value for it.
    """
    per_boundary = {}
    for index_key, value in zip(resp_road.value.keys(), resp_road.value.to_list()):
        boundary, tag = index_key[0], index_key[1]
        # Look up the boundary before the tag is handled, so a row is always
        # credited to its own boundary, and a boundary without any wanted road
        # type still gets an (all-zero) row.
        row = per_boundary.setdefault(boundary, {})
        road_type = tag.replace("highway=", "").replace(" ", "_").lower()
        if road_type in wanted:
            # When several raw tags normalise to the same name, keep the first.
            row.setdefault(road_type, value)

    table = {"id": list(per_boundary)}
    for road_type in sorted(wanted):
        table[f"{prefix}_{road_type}"] = [row.get(road_type, 0) for row in per_boundary.values()]
    return pd.DataFrame(table)


def extract_road_features(bboxes, year, country):
    """Query all road features for one survey and write them to a CSV file.

    The result contains, per cluster, the total count / length / density of all
    ``highway=*`` ways plus the same three measures for each entry of
    ``road_filters``. All tables are joined on ``id``.

    Args:
        bboxes: Mapping of cluster id to bounding box, passed to the API as is.
        year: Survey year; the data is queried as of ``<year>-12-31``.
        country: Survey country code, used in the output file name.

    Writes:
        ``../../data/osm_features/<country>_<year>_road.csv``
    """
    time = f"{year}-12-31"
    way_filter = "highway=* and type:way"

    # Totals over all highways: one query per measure, joined on the cluster id.
    total_endpoints = [
        ("total_count", client.elements.count.groupByBoundary),
        ("total_length", client.elements.length.groupByBoundary),
        ("total_density", client.elements.length.density.groupByBoundary),
    ]
    df_roads = None
    for column, endpoint in total_endpoints:
        response = endpoint.post(bboxes=bboxes, time=time, filter=way_filter)
        totals = process_response(response.as_dataframe(), column)
        # The first table seeds the result. DataFrame.append is avoided because
        # it no longer exists in pandas 2.x.
        df_roads = totals if df_roads is None else df_roads.merge(totals, on="id")

    # Per-road-class measures: one tag-grouped query per measure.
    for key, func in zip(road_func_keys, road_funcs):
        resp_road = func.post(bboxes=bboxes, time=time, filter=way_filter, groupByKey="highway").as_dataframe()
        per_class = _road_values_by_boundary(resp_road, key, road_filters)
        df_roads = df_roads.merge(per_class, on="id")

    df_roads.to_csv(f"../../data/osm_features/{country}_{year}_road.csv", index=False)

In [21]:
# Query the road features survey by survey; a survey whose CSV already exists is
# skipped, so an interrupted run can be resumed.
surveys = gdf.groupby(["country", "year"]).groups.keys()
for country, year in tqdm(surveys, total=len(surveys)):
    print(country, year)
    if not os.path.exists(f"../../data/osm_features/{country}_{year}_road.csv"):
        in_survey = (gdf['country'] == country) & (gdf['year'] == year)
        subset_df = gdf[in_survey].reset_index(drop=True)
        # The four bounds of each buffer polygon are passed on in their original order.
        bboxes = {item.id: list(item.buffer.bounds) for _, item in subset_df.iterrows()}

        extract_road_features(bboxes, year, country)

100%|██████████| 10/10 [00:00<00:00, 20030.11it/s]

ETH 2013
ETH 2015
ETH 2018
MW 2016
MW 2019
NG 2012
NG 2015
NG 2018
TZA 2012
TZA 2014





In [23]:
# Combine the per-survey road CSVs into a single file.
# The frames are collected first and concatenated once: growing a frame with
# pd.concat inside the loop re-copies all previous rows on every iteration and
# starts from an empty frame, which pandas warns about when inferring dtypes.
# Columns are aligned by name, so files with a different column order are fine.
road_frames = [
    pd.read_csv(f"../../data/osm_features/{country}_{year}_road.csv")
    for country, year in tqdm(surveys, total=len(surveys))
]
# pd.concat raises on an empty list, so fall back to an empty frame.
total_df = pd.concat(road_frames) if road_frames else pd.DataFrame()
total_df.to_csv("../../data/osm_features/_all_road.csv", index=False)

100%|██████████| 10/10 [00:00<00:00, 186.83it/s]
