In [165]:
import osmium
import shapely.wkb as wkblib
#import osmnx as ox
import geopandas
import pandas as pd
import numpy as np
from scipy.spatial.distance import cdist
import matplotlib.pyplot as plt

In [3]:
#load any country you need -- exchange "Sweden" for some other country at your own risk!
!wget https://download.geofabrik.de/europe/estonia-latest.osm.pbf -P ./

In [4]:
#you will need docker for this cell, but you can also skip this, it makes loading the osm-file faster
!docker run -d -w /wkd -v $(pwd):/wkd stefda/osmium-tool osmium tags-filter -o sweden-filtered.osm.pbf sweden-latest.osm.pbf building

In [2]:
class BuildingHandler(osmium.SimpleHandler):
    def __init__(self):
        osmium.SimpleHandler.__init__(self)
        self.nodes_count = 0
        self.nodes = []
        self.building_count = 0
        self.buildings = []
        # A global factory that creates WKB from a osmium geometry
        self.wkbfab = osmium.geom.WKBFactory()

    def node(self, n):
        if n.tags.get("building") == 'yes':
            try:
                wkb = self.wkbfab.create_point(n)
                geo = wkblib.loads(wkb, hex=True)
            except Exception as e:
                print(e)
                return
            row = { "w_id": n.id, "geometry": geo }
            
            for key, value in n.tags:
                row[key] = value
                
            self.nodes.append(row)
            self.nodes_count += 1
        
    def area(self, w):
        if w.tags.get("building") == 'yes':
            try:
                wkb = self.wkbfab.create_multipolygon(w)
                geo = wkblib.loads(wkb, hex=True)
            except Exception as e:
                print(e)
                return
            row = { "w_id": w.id, "geometry": geo }

            for key, value in w.tags:
                row[key] = value

            self.buildings.append(row)
            self.building_count += 1

## finding closest building with levels info for every building without levels info

In [150]:
file = "/wrk/eurastof/EnergyMap/manipulating_osm_data/estonia-latest.osm.pbf"

In [151]:
#loading the file into
buildinghandler = BuildingHandler()
buildinghandler.apply_file(file, locations=True)

In [152]:
i = 200000
while i-200000 < len(buildinghandler.buildings):
    dfx = pd.DataFrame(buildinghandler.buildings[(i-200000):min([i, len(buildinghandler.buildings)-1])])
    gdfx = geopandas.GeoDataFrame(dfx, geometry='geometry')
    gdfx = gdfx.set_crs("EPSG:4326")
    #gdfx = ox.project_gdf(gdfx)
    #gdfx = gdfx.dropna(subset=['building:levels'])
    gdfx = gdfx[['w_id', 'geometry', 'building:levels']]
    if i < 200001:
        meta = gdfx
    else:
        meta = pd.concat([meta, gdfx])
    print(meta.shape)
    i += 200000

(200000, 3)
(400000, 3)
(600000, 3)
(800000, 3)
(808304, 3)


In [153]:
meta["geometry"] = meta["geometry"].to_crs("EPSG:3857") # to meters instead of lat, lon degees

In [154]:
meta[(meta['building:levels'].str.contains('[A-Za-z]')) | (meta['building:levels'].str.contains('[;-]'))]["building:levels"] = None
meta[meta['building:levels'] == "0"] = None
meta["building:levels"] = meta["building:levels"].str.replace(",", ".")
meta["building:levels"] = meta["building:levels"].astype("float")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


In [155]:
areas = meta["geometry"].area
locations = meta["geometry"].centroid
levels = meta["building:levels"]
data = np.array([areas, locations.x, locations.y, levels]).T

In [156]:
with_levels_info = data[np.where(~np.isnan(data[:,3]))]

In [157]:
# Predicting level number using base area and building location

step = 1000
L = []
for i in range(0, data.shape[0], step):
    current = data[i:i+step,:]
    current_without_levels_info = np.where(np.isnan(current[:,3]))[0]
    distances = cdist(current[current_without_levels_info,:-1], with_levels_info[:,:-1])
    closest_with_info = np.argmin(distances, axis=1)
    current[current_without_levels_info,3] = with_levels_info[closest_with_info, 3]
    L.append(current)
    
data = np.concatenate(L)


In [190]:
# Calculating prediction accuracy

import random

known_levels_index = list(levels.dropna().index)

for _ in range(5):
    
    random.shuffle(known_levels_index)
    left_out = known_levels_index[:100]
    left_out_data = data[left_out,:]
    included = known_levels_index[100:]
    included_data = data[included,:]
    
    distances = cdist(left_out_data[:,:-1], included_data[:,:-1])
    closest = np.argmin(distances, axis=1)
    predicted_levels = included_data[closest,3]
    true_levels = left_out_data[:,3]
    
    mean_abs_error = np.mean(np.abs(true_levels - predicted_levels))
    var = np.var(true_levels - predicted_levels)
    
    print(f"Mean Absolute Error: {mean_abs_error}, variance in error {var}")









Mean Absolute Error: 0.785, variance in error 3.7654749999999995
Mean Absolute Error: 0.575, variance in error 2.0718750000000004
Mean Absolute Error: 0.85, variance in error 3.0650999999999997
Mean Absolute Error: 0.782, variance in error 2.538536
Mean Absolute Error: 1.225, variance in error 8.983274999999999
