In [None]:
import os
from pathlib import Path

import geopandas as gpd
import numpy as np
import pandas as pd
from tqdm import tqdm

In [None]:
# Root folder of the Amsterdam input data. Defaults to the original author's
# location; set the AMS_DATA_DIR environment variable to run the notebook on
# another machine without editing this cell. The path is made absolute so that
# later joins behave the same when a relative override is supplied.
BASE_DATA_DIR = Path(
    os.environ.get("AMS_DATA_DIR", "/home/rico/Documents/thesis/eda/data/Amsterdam/")
).expanduser().absolute()

In [None]:
# Load the Amsterdam neighbourhood layer from the base data folder.
ams_neighborhoods = gpd.read_file(BASE_DATA_DIR / 'ams-neighbourhoods.geojson')

In [None]:
# Turn the raw coordinate columns into point geometries tagged with EPSG:4326.
ams_neighborhoods['centroid'] = gpd.points_from_xy(
    ams_neighborhoods['cent_x'],
    ams_neighborhoods['cent_y'],
    crs='EPSG:4326',
)
ams_neighborhoods['res_centroid'] = gpd.points_from_xy(
    ams_neighborhoods['res_cent_x'],
    ams_neighborhoods['res_cent_y'],
    crs='EPSG:4326',
)

# Places without residential buildings have no residential centroid: for those
# rows fall back to the geographical centroid.
lacks_res_centroid = ams_neighborhoods['res_cent_x'].isna()
ams_neighborhoods.loc[lacks_res_centroid, 'res_centroid'] = ams_neighborhoods.loc[lacks_res_centroid, 'centroid']

# The coordinate columns are now represented by the two geometry columns.
ams_neighborhoods = ams_neighborhoods.drop(
    columns=['res_cent_x', 'res_cent_y', 'cent_x', 'cent_y']
)

In [None]:
# Folder holding the yearly CBS "kwb" spreadsheets. Defaults to the original
# author's location; override it with the CBS_DATA_DIR environment variable.
CBS_DATA_DIR = Path(
    os.environ.get("CBS_DATA_DIR", "/home/rico/Documents/thesis/eda/data/CBS")
).expanduser()


def _read_kwb(year: int) -> pd.DataFrame:
    """Read ``kwb-<year>.xls`` from CBS_DATA_DIR, treating '.' cells as missing."""
    return pd.read_excel(CBS_DATA_DIR / f"kwb-{year}.xls", na_values='.')


kwb_19 = _read_kwb(2019)
kwb_20 = _read_kwb(2020)
kwb_21 = _read_kwb(2021)

# Columns that are skipped by the float conversion and dropped before the
# tables are merged onto the neighbourhood geometries.
metadata_cols = ["gwb_code_10", "gwb_code_8", "regio", "gm_naam", "recs", "gwb_code"]

In [None]:
def kwb_to_float(
    kwb: pd.DataFrame,
    metadata_columns: "list[str] | None" = None,
) -> pd.DataFrame:
    """Convert decimal-comma text columns of a kwb table to float64.

    Every non-metadata column that holds at least one string containing a
    comma is treated as numbers written with a decimal comma (e.g. "12,5"):
    commas inside its string cells are replaced by dots and the column is cast
    to float64. Cells that are already numeric or missing keep their value.
    All other columns are left untouched.

    Args:
        kwb: Table to convert. It is modified in place and also returned.
        metadata_columns: Names of the columns to skip. Defaults to the
            notebook-level ``metadata_cols`` list. Every name must exist in
            ``kwb``.

    Returns:
        The same ``kwb`` object, with the detected columns converted.

    Raises:
        KeyError: If one of the metadata columns is missing from ``kwb``.
        ValueError: If a detected column contains text that is still not a
            number after the comma replacement.
    """
    if metadata_columns is None:
        metadata_columns = metadata_cols

    def _has_decimal_comma(column: pd.Series) -> bool:
        # Only text-like columns can hold "12,5"-style values; checking the
        # cells directly also copes with object columns that mix strings,
        # numbers and NaN, where the `.str` accessor may refuse to work.
        if not (pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column)):
            return False
        return any(isinstance(cell, str) and ',' in cell for cell in column)

    def _comma_to_dot(cell):
        # Non-string cells are passed through unchanged: `.str.replace` would
        # turn them into NaN and silently discard real values.
        return cell.replace(',', '.') if isinstance(cell, str) else cell

    data_columns = kwb.columns.drop(metadata_columns)
    cs_float_cols = [name for name in data_columns if _has_decimal_comma(kwb[name])]
    for name in cs_float_cols:
        kwb[name] = pd.to_numeric(kwb[name].map(_comma_to_dot)).astype('float64')
    return kwb


# Apply the decimal-comma conversion to each yearly table, in year order.
kwb_19, kwb_20, kwb_21 = [
    kwb_to_float(yearly_table) for yearly_table in (kwb_19, kwb_20, kwb_21)
]

In [None]:
def kwb_merge_ams_neighborhoods(
    kwb: pd.DataFrame,
    ams_neighborhoods: gpd.GeoDataFrame,
    metadata_columns: "list[str] | None" = None,
) -> gpd.GeoDataFrame:
    """Attach the kwb columns to the Amsterdam neighbourhood geometries.

    The kwb ``gwb_code`` column is copied to ``BU_CODE``, the metadata columns
    are dropped, and the remainder is merged onto ``ams_neighborhoods`` on
    ``BU_CODE``. The merge uses the pandas default (inner join), so only
    neighbourhoods with a matching kwb row are kept.

    Args:
        kwb: kwb table containing ``gwb_code`` and the metadata columns. It is
            not modified.
        ams_neighborhoods: Neighbourhood layer with a ``BU_CODE`` column.
        metadata_columns: Columns to drop from ``kwb`` before merging.
            Defaults to the notebook-level ``metadata_cols`` list.

    Returns:
        The merged neighbourhood table.

    Raises:
        KeyError: If ``gwb_code``, ``BU_CODE`` or a metadata column is missing.
    """
    if metadata_columns is None:
        metadata_columns = metadata_cols
    # assign() returns a new frame, so the caller's kwb table does not gain a
    # BU_CODE column as a side effect of calling this function.
    kwb_stats = kwb.assign(BU_CODE=kwb['gwb_code']).drop(columns=metadata_columns)
    # Merging from the GeoDataFrame side keeps the result a GeoDataFrame.
    return ams_neighborhoods.merge(kwb_stats, on='BU_CODE')


# Merge every yearly kwb table onto the same neighbourhood layer.
kwb_19_ams_neighborhoods, kwb_20_ams_neighborhoods, kwb_21_ams_neighborhoods = [
    kwb_merge_ams_neighborhoods(yearly_kwb, ams_neighborhoods)
    for yearly_kwb in (kwb_19, kwb_20, kwb_21)
]

# The three merged tables must expose identical columns in identical order.
columns_19 = kwb_19_ams_neighborhoods.columns.tolist()
columns_20 = kwb_20_ams_neighborhoods.columns.tolist()
columns_21 = kwb_21_ams_neighborhoods.columns.tolist()
assert columns_19 == columns_20 and columns_20 == columns_21

In [None]:
# Fill NaNs
def inverse_dist_weighted_average(weights, values):
    """Average ``values`` with weights proportional to ``1 / weights``.

    Despite its name, ``weights`` holds the distances to the sampled points:
    the closer a point, the more its value contributes.

    Args:
        weights: Non-negative distances, one per value (array-like).
        values: Numeric values observed at those distances (array-like).

    Returns:
        The inverse-distance weighted mean. If one or more distances are
        exactly zero, the mean of the values at distance zero is returned
        (the naive formula would yield NaN from inf / inf). For empty input
        the result is 0.0, matching the empty-sum result of the previous
        implementation.
    """
    distances = np.asarray(weights, dtype=float)
    samples = np.asarray(values, dtype=float)
    if distances.size == 0:
        return 0.0

    coincident = distances == 0
    if coincident.any():
        return samples[coincident].mean()

    # Compute the normaliser once instead of once per term.
    inverse = 1.0 / distances
    return (inverse * samples).sum() / inverse.sum()

# Number of nearest neighbourhoods (with a known value) used per interpolation.
N_NEAREST = 3

for kwb_ams_nhs in tqdm([kwb_19_ams_neighborhoods, kwb_20_ams_neighborhoods, kwb_21_ams_neighborhoods], position=0):
    # The residential centroids do not change while values are filled in, so
    # project them to EPSG:32631 once per table instead of once per missing cell.
    projected_centroids = kwb_ams_nhs['res_centroid'].to_crs('epsg:32631')

    for col in tqdm(kwb_ams_nhs.columns, position=1, leave=False):
        if not kwb_ams_nhs[col].hasnans:
            continue

        # Row labels that are missing a value at the start of this column.
        missing_labels = kwb_ams_nhs.index[kwb_ams_nhs[col].isna()]
        for label in missing_labels:
            # Re-evaluated for every cell on purpose: values filled in earlier
            # in this column count as known neighbours for later cells, which
            # is how the fill has always behaved.
            has_value = kwb_ams_nhs[col].notna()
            distances = (
                projected_centroids[has_value]
                .distance(projected_centroids.loc[label])
                .sort_values()
            )
            nearest = distances.iloc[:N_NEAREST]
            kwb_ams_nhs.loc[label, col] = inverse_dist_weighted_average(
                nearest.values,
                kwb_ams_nhs.loc[nearest.index, col].values,
            )


In [None]:
def check_na(gdf: gpd.GeoDataFrame) -> bool:
    """Return True if at least one cell of ``gdf`` is missing.

    Args:
        gdf: Table to inspect.

    Returns:
        True when ``isna()`` flags any cell in any column, otherwise False.
    """
    # isna() is already element-wise over the whole frame, so no Python-level
    # row-by-row apply is needed; bool() yields a plain bool as annotated.
    return bool(gdf.isna().any().any())

# After the interpolation step no table may contain a missing value.
assert not any(
    check_na(gdf)
    for gdf in (kwb_19_ams_neighborhoods, kwb_20_ams_neighborhoods, kwb_21_ams_neighborhoods)
)

In [None]:
# Folder that receives the cleaned per-year tables.
CLEANED_DIR = BASE_DATA_DIR.joinpath('cleaned_neighbourhood_data')

# exist_ok avoids the check-then-create race of os.path.exists + os.mkdir;
# parents=True also creates missing ancestor folders.
CLEANED_DIR.mkdir(parents=True, exist_ok=True)

# CLEANED_DIR already starts with BASE_DATA_DIR. Joining it onto BASE_DATA_DIR
# again only produced the right path because the base path is absolute, so the
# files are addressed relative to CLEANED_DIR directly.
kwb_19_ams_neighborhoods.to_parquet(CLEANED_DIR / 'kwb_19_ams_neighborhoods.parquet')
kwb_20_ams_neighborhoods.to_parquet(CLEANED_DIR / 'kwb_20_ams_neighborhoods.parquet')
kwb_21_ams_neighborhoods.to_parquet(CLEANED_DIR / 'kwb_21_ams_neighborhoods.parquet')