In [1]:
import logging
import sqlite3
import pandas as pd
import geopandas as gpd

# Columns of the grid DataFrame that are treated as explanatory variables
# (covariates); the summary-statistics cell iterates over this list.
# "region_code" is (re)computed in load_urban_centers_grid() from the category
# codes of the "region_wb" column.
COVARIATE_COLUMNS = [
    "worldcover_2020_built_up_sqkm",
    "worldcover_2020_tree_cover_sqkm",
    "worldcover_2020_sparse_vegetation_sqkm",
    "ghspop_2020",
    "vnl_2020",
    "shdi_2019",
    "osm_road_length_km_2023",
    "region_code",
]


import os
os.environ['USE_PYGEOS'] = '0'
import geopandas

In a future release, GeoPandas will switch to using Shapely by default. If you are using PyGEOS directly (calling PyGEOS functions on geometries from GeoPandas), this will then stop working and you are encouraged to migrate from PyGEOS to Shapely 2.0 (https://shapely.readthedocs.io/en/latest/migration_pygeos.html).
  import geopandas as gpd


In [2]:
def load_urban_centers_grid(gpkg_path="../data/global_urban_building_completeness.gpkg"):
    """Load the urban-centers grid table and impute missing values.

    Reads every column of ``all_parameters_urban_centers_grid`` (plus
    ``grid_fid`` aliased as ``fid``) through ``sqlite3`` and then:

    * converts ``region_wb`` to a categorical and stores its integer category
      codes in ``region_code`` (pandas uses ``-1`` for missing categories),
    * replaces missing ``shdi_2019`` and ``osm_road_length_km_2023`` values
      with the mean of the respective column,
    * replaces the remaining missing values with ``0`` in every column except
      the building-area / completeness columns and ``region_wb`` listed in
      ``skip_zero_fill`` below.

    Parameters
    ----------
    gpkg_path : str or path-like, optional
        Location of the database file to read. Defaults to the path that was
        previously hard-coded, so existing calls without arguments keep working.

    Returns
    -------
    pandas.DataFrame
        One row per record of ``all_parameters_urban_centers_grid``.
    """
    query = """
        SELECT
           a.grid_fid as fid
          ,a.*
        FROM all_parameters_urban_centers_grid a
    """

    # ``with sqlite3.connect(...)`` only manages transactions, it does not close
    # the connection, so close it explicitly even if the query fails.
    con = sqlite3.connect(gpkg_path)
    try:
        df = pd.read_sql_query(query, con=con)
    finally:
        con.close()

    df["region_wb"] = pd.Categorical(df["region_wb"])
    df["region_code"] = df["region_wb"].cat.codes

    # Mean imputation. Assign the result back instead of calling
    # ``fillna(..., inplace=True)`` on a column selection: that chained in-place
    # form is deprecated and does not modify ``df`` under pandas Copy-on-Write.
    for mean_filled in ("shdi_2019", "osm_road_length_km_2023"):
        df[mean_filled] = df[mean_filled].fillna(df[mean_filled].mean())

    skip_zero_fill = {
        # Missing values are kept as missing in these columns.
        "external_reference_building_area_sqkm",
        "microsoft_building_area_sqkm",
        "reference_building_area_sqkm",
        "reference_osm_completeness",
        # Categorical column: pandas rejects a fill value (0) that is not one
        # of the existing categories, so it must not be zero-filled.
        "region_wb",
    }
    for column in df.columns:
        if column in skip_zero_fill:
            continue

        df[column] = df[column].fillna(0)

    logging.info(len(df))
    return df

# Display summary stats for explanatory variables
Note: Feature importance is logged when running the model in `scripts/run_prediction.py`.

In [3]:
# Load the imputed grid once; the summary-statistics cell reads from grid_df.
grid_df = load_urban_centers_grid()

In [4]:
# Show floats with three decimals whenever pandas renders them.
pd.set_option("display.float_format", lambda value: "%.3f" % value)

# One line per covariate: min, mean, median, std and max, each rounded to three
# decimals and separated by " & ".
for column in COVARIATE_COLUMNS:
    series = grid_df[column]
    stats = (
        series.min(),
        series.mean(),
        series.median(),
        series.std(),
        series.max(),
    )
    row = " & ".join(f"{round(stat, 3)}" for stat in stats)
    print(f"{column}: {row} ")

worldcover_2020_built_up_sqkm: 0.0 & 0.403 & 0.376 & 0.27 & 1.005 
worldcover_2020_tree_cover_sqkm: 0.0 & 0.228 & 0.149 & 0.228 & 1.008 
worldcover_2020_sparse_vegetation_sqkm: 0.0 & 0.083 & 0.045 & 0.108 & 1.003 
ghspop_2020: 0.0 & 4484.223 & 2750.0 & 6104.975 & 615757.0 
vnl_2020: 0 & 121.375 & 78.0 & 231.79 & 71509 
shdi_2019: 0.276 & 0.77 & 0.77 & 0.126 & 0.985 
osm_road_length_km_2023: 0.0 & 9.106 & 8.133 & 6.453 & 57.635 
region_code: 0 & 2.428 & 2.0 & 2.155 & 6 


In [24]:
# Feature importance per covariate (same keys as COVARIATE_COLUMNS; the values
# sum to ~1). Hard-coded here; presumably copied from the log output of
# scripts/run_prediction.py — TODO confirm they match the latest model run.
feature_importance = {
    'worldcover_2020_built_up_sqkm': 0.6849695124111642,
    'worldcover_2020_tree_cover_sqkm': 0.0365150867377664,
    'worldcover_2020_sparse_vegetation_sqkm': 0.037499777590713504,
    'ghspop_2020': 0.08174518956675597,
    'vnl_2020': 0.02801227382896272,
    'shdi_2019': 0.04355697627407274,
    'osm_road_length_km_2023': 0.056407616161615,
    'region_code': 0.03129356742894947
}