In [1]:
import osmium
import shapely.wkb as wkblib
#import osmnx as ox
import geopandas
import pandas as pd
import numpy as np
from scipy.spatial.distance import cdist
import matplotlib.pyplot as plt
import random

In [2]:
#load any country you need -- exchange "Sweden" for some other country at your own risk!
!wget https://download.geofabrik.de/europe/austria-latest.osm.pbf -P ./

--2023-04-15 20:23:32--  https://download.geofabrik.de/europe/austria-latest.osm.pbf
Resolving download.geofabrik.de (download.geofabrik.de)... 65.109.50.43, 65.109.48.72
Connecting to download.geofabrik.de (download.geofabrik.de)|65.109.50.43|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 714846436 (682M) [application/octet-stream]
Saving to: ‘./austria-latest.osm.pbf.1’


2023-04-15 20:23:50 (36.8 MB/s) - ‘./austria-latest.osm.pbf.1’ saved [714846436/714846436]



In [3]:
#you will need docker for this cell, but you can also skip this, it makes loading the osm-file faster
#!docker run -d -w /wkd -v $(pwd):/wkd stefda/osmium-tool osmium tags-filter -o sweden-filtered.osm.pbf sweden-latest.osm.pbf building

In [4]:
class BuildingHandler(osmium.SimpleHandler):
    def __init__(self):
        osmium.SimpleHandler.__init__(self)
        self.nodes_count = 0
        self.nodes = []
        self.building_count = 0
        self.buildings = []
        # A global factory that creates WKB from a osmium geometry
        self.wkbfab = osmium.geom.WKBFactory()

    def node(self, n):
        if n.tags.get("building") == 'yes':
            try:
                wkb = self.wkbfab.create_point(n)
                geo = wkblib.loads(wkb, hex=True)
            except Exception as e:
                print(e)
                return
            row = { "w_id": n.id, "geometry": geo }
            
            for key, value in n.tags:
                row[key] = value
                
            self.nodes.append(row)
            self.nodes_count += 1
        
    def area(self, w):
        if w.tags.get("building") == 'yes':
            try:
                wkb = self.wkbfab.create_multipolygon(w)
                geo = wkblib.loads(wkb, hex=True)
            except Exception as e:
                print(e)
                return
            row = { "w_id": w.id, "geometry": geo }

            for key, value in w.tags:
                row[key] = value

            self.buildings.append(row)
            self.building_count += 1

## finding closest building with levels info for every building without levels info

In [5]:
file = "/wrk/eurastof/EnergyMap/manipulating_osm_data/austria-latest.osm.pbf"

In [6]:
def build_geodf(file):
    
    buildinghandler = BuildingHandler()
    buildinghandler.apply_file(file, locations=True)
    
    i = 200000
    while i-200000 < len(buildinghandler.buildings):
        dfx = pd.DataFrame(buildinghandler.buildings[(i-200000):min([i, len(buildinghandler.buildings)-1])])
        gdfx = geopandas.GeoDataFrame(dfx, geometry='geometry')
        gdfx = gdfx.set_crs("EPSG:4326")
        #gdfx = ox.project_gdf(gdfx)
        #gdfx = gdfx.dropna(subset=['building:levels'])
        gdfx = gdfx[['w_id', 'geometry', 'building:levels']]
        if i < 200001:
            meta = gdfx
        else:
            meta = pd.concat([meta, gdfx])
        print(meta.shape)
        i += 200000
    
    return meta

In [7]:
df = build_geodf(file)

(200000, 3)
(400000, 3)
(600000, 3)
(800000, 3)
(1000000, 3)
(1200000, 3)
(1400000, 3)
(1600000, 3)
(1800000, 3)
(2000000, 3)
(2200000, 3)
(2400000, 3)
(2600000, 3)
(2800000, 3)
(3000000, 3)
(3200000, 3)
(3247276, 3)


In [8]:
geodf = df.copy()

In [9]:
def predict_levels(with_levels_info, without_levels_info, k_neighbours):
     # scaling to zero mean and unit variance, seems to lower accuracy.
    """for i in [0,1,2]:
        with_levels_info[:,i] = (with_levels_info[:,0] - np.mean(with_levels_info[:,0])) / np.std(with_levels_info[:,0])
        without_levels_info[:,i] = (without_levels_info[:,0] - np.mean(without_levels_info[:,0])) / np.std(without_levels_info[:,0])"""
    
    distances = cdist(without_levels_info[:,[0,1,2]], with_levels_info[:,[0,1,2]])
    idx = np.argpartition(distances, k_neighbours, axis=1)
    closest_k = idx[:,:k_neighbours]
    means = []
    for c in range(closest_k.shape[0]):
        means.append(int(np.mean(with_levels_info[closest_k[c,:], 3])))
    return means

## Making DataFrame of buildings along with predicted number of levels

In [10]:
k_neighbours = 3

In [11]:
def build_data_with_predictions(geodf, k_neighbours):
    
    geodf["geometry"] = geodf["geometry"].to_crs("EPSG:3857") # to meters instead of lat, lon degees
    geodf.loc[geodf['building:levels'].str.contains('[A-Za-z]', na=False)] = None
    geodf.loc[geodf['building:levels'].str.contains('[;,.-]', na=False)] = None
    geodf.loc[geodf['building:levels'] == "0"] = None
    geodf["building:levels"] = geodf["building:levels"].astype("float")
    
    areas = geodf["geometry"].area
    locations = geodf["geometry"].centroid
    levels = geodf["building:levels"]
    data = np.array([areas, locations.x, locations.y, levels]).T
    predicted = np.isnan(data[:,3]).reshape(-1,1)
    data = np.concatenate([data, predicted], axis=1)
    
    with_levels_info = data[np.where(~np.isnan(data[:,3]))]
    
    step = 10000
    L = []

    for i in range(0, data.shape[0], step):
        current = data[i:i+step,:]
        current_without_levels_info = np.where(np.isnan(current[:,3]))[0]
        means = predict_levels(with_levels_info, current[current_without_levels_info,:], k_neighbours)
        current[current_without_levels_info, 3] = means
        L.append(current)
        
    data = np.concatenate(L)
    
    df = pd.DataFrame({"base_area": data[:,0], "x": data[:,1], "y": data[:,2],
                       "levels": data[:,3], "predicted": data[:,4]})
    return df

D = build_data_with_predictions(geodf, k_neighbours)

## Testing k-nearest accuracy against dummy model

In [13]:

def accuracy_test(dataframe, leave_out_fraction, k_neighbours):
    
    known = dataframe.loc[dataframe["predicted"] == 0]
    known_index = list(known.index)
    
    mean_errors = []
    mean_dummy_errors = []
     
    for i in range(10):
        
        random.shuffle(known_index)
        to_predict = known_index[:int(leave_out_fraction*len(known_index))]
        to_use = known_index[int(leave_out_fraction*len(known_index)):]
        true_levels = dataframe.iloc[to_predict]["levels"]
        predicted_levels = predict_levels(np.array(dataframe.iloc[to_use]), np.array(dataframe.iloc[to_predict]), k_neighbours)
        
        mean_abs_error = np.mean(np.abs(true_levels - predicted_levels))
        mean_errors.append(mean_abs_error)
        
        mean_dummy_error = np.mean(np.abs(true_levels - np.mean(np.array(dataframe.iloc[to_use])[:,3]))) # error in assigning unknown values as mean of known ones
        mean_dummy_errors.append(mean_dummy_error)

    print(f"Mean Absolute Error using k-nearest: {np.mean(mean_errors)}")
    print(f"Mean Absolute Error using dummy model: {np.mean(mean_dummy_errors)}")
    
    

    
accuracy_test(D, 0.1, k_neighbours)

Mean Absolute Error using k-nearest: 0.6325642141924249
Mean Absolute Error using dummy model: 1.0203369445792005
