In [21]:
import logging
import sqlite3
import pandas as pd
import geopandas as gpd

# Explanatory variables of interest. The order here is the column order of the
# summary-statistics table produced further down in this notebook.
COVARIATE_COLUMNS = [
    "built_up_sqkm", "built_up_sqkm_moran_loc",
    "tree_cover_sqkm", "sparse_vegetation_sqkm",
    "ghspop", "ghspop_moran_loc",
    "vnl", "shdi",
    "osm_other_major_roads_length_km",
    "region_code",
]

In [14]:
def load_urban_centers_grid(unit="area_sqkm", gpkg_path="../data/global_urban_building_completeness.gpkg"):
    """Load the urban-centers grid table and prepare it for analysis.

    Reads ``all_parameters_urban_centers_grid`` from the GeoPackage (opened as
    a plain SQLite database) and then:

    * adds ``reference_completeness_{unit}`` = ``osm_building_{unit}`` /
      ``reference_building_{unit}``, rounded to 3 decimals (no guard against a
      zero reference value, so pandas' inf/NaN results are kept as-is);
    * replaces missing values with 0 in the covariate columns listed below;
    * drops rows where ``total_sqkm`` or ``shdi`` is missing;
    * converts ``region_wb`` to a categorical and stores its integer codes in
      ``region_code``.

    The row count is logged at INFO level before and after the row drop.

    Parameters
    ----------
    unit : str, default "area_sqkm"
        Suffix selecting the ``reference_building_*`` / ``osm_building_*``
        column pair. NOTE: it is interpolated directly into the SQL text, so
        it must be a trusted column-name suffix, never user-supplied input.
    gpkg_path : str, default "../data/global_urban_building_completeness.gpkg"
        Path of the GeoPackage / SQLite file to read.

    Returns
    -------
    pandas.DataFrame
        The selected columns plus ``reference_completeness_{unit}`` and
        ``region_code``.
    """
    query = f"""
        SELECT
          a.id as fid
          ,a.id as grid_fid
          ,a.urban_center_id
          ,a."name_main" as name
          ,a.iso_a3
          ,a.country_id
          ,a.continent
          ,a.region_wb
          -- reference
          ,a.reference_building_{unit}
          -- OSM
          ,a.osm_building_{unit}
          -- covariates
          ,a.ghspop
          ,a.shdi
          ,a.vnl
          ,a.osm_motorway_roads_length_km
          ,a.osm_other_major_roads_length_km
          ,a.osm_airports_area_sqkm
          ,a.osm_railway_length_km
          -- ,a.osm_amenity_count
          ,a.wsf_built_up_area_sqkm
          -- worldcover
          ,a.tree_cover_sqkm
          ,a.shrubland_sqkm
          ,a.grassland_sqkm
          ,a.cropland_sqkm
          ,a.built_up_sqkm
          ,a.sparse_vegetation_sqkm
          ,a.snow_and_ice_sqkm
          ,a.permanent_water_bodies_sqkm
          ,a.herbaceous_wetland_sqkm
          ,a.mangroves_sqkm
          ,a.moss_and_lichen_sqkm
          ,a.total_sqkm
          -- spatial autocorrelation
          ,a.ghspop_moran_loc
          ,a.osm_motorway_roads_length_km_moran
          ,a.osm_other_major_roads_length_km_moran
          ,a.built_up_sqkm_moran_loc
          -- CO2
          ,a.fossil_fuel
        FROM all_parameters_urban_centers_grid a
    """

    # Always release the database handle, even if the query fails; the
    # original left the connection open after every call.
    con = sqlite3.connect(gpkg_path)
    try:
        df = pd.read_sql(query, con=con)
    finally:
        con.close()

    df[f"reference_completeness_{unit}"] = (
        df[f"osm_building_{unit}"] / df[f"reference_building_{unit}"]
    ).round(3)

    # Columns in which a missing value is replaced by 0.
    zero_fill_columns = [
        "ghspop",
        "vnl",
        "osm_motorway_roads_length_km",
        "osm_other_major_roads_length_km",
        "osm_airports_area_sqkm",
        # "osm_amenity_count",
        "tree_cover_sqkm",
        "shrubland_sqkm",
        "grassland_sqkm",
        "cropland_sqkm",
        "built_up_sqkm",
        "sparse_vegetation_sqkm",
        "snow_and_ice_sqkm",
        "permanent_water_bodies_sqkm",
        "herbaceous_wetland_sqkm",
        "mangroves_sqkm",
        "moss_and_lichen_sqkm",
        "ghspop_moran_loc",
        "osm_motorway_roads_length_km_moran",
        "osm_other_major_roads_length_km_moran",
        "built_up_sqkm_moran_loc",
        "fossil_fuel",
        "wsf_built_up_area_sqkm",
    ]
    df[zero_fill_columns] = df[zero_fill_columns].fillna(0)

    # Row count before dropping incomplete rows.
    logging.info(len(df))

    # These two columns are not zero-filled; rows lacking either are removed.
    df = df.dropna(subset=["total_sqkm", "shdi"])

    # Row count after dropping incomplete rows.
    logging.info(len(df))

    # Codes are derived from the categories present after the row drop.
    df["region_wb"] = pd.Categorical(df["region_wb"])
    df["region_code"] = df["region_wb"].cat.codes

    return df

# Display summary stats for explanatory variables
Note: Feature importance is logged when running the model in `scripts/run_prediction.py`.

In [None]:
# Load the prepared grid using the building-area columns (the function's default unit).
grid_df = load_urban_centers_grid(unit="area_sqkm")

In [26]:
# Render every float with three decimals (this changes the global pandas display option).
pd.set_option("display.float_format", "{:.3f}".format)
# Summary statistics for the covariate columns only.
display(grid_df.loc[:, COVARIATE_COLUMNS].describe())

Unnamed: 0,built_up_sqkm,built_up_sqkm_moran_loc,tree_cover_sqkm,sparse_vegetation_sqkm,ghspop,ghspop_moran_loc,vnl,shdi,osm_other_major_roads_length_km,region_code
count,663761.0,663761.0,663761.0,663761.0,663761.0,663761.0,663761.0,663761.0,663761.0,663761.0
mean,0.403,0.126,0.228,0.083,5318.811,0.141,121.6,0.77,8.356,2.433
std,0.27,0.365,0.229,0.108,8193.263,0.557,231.98,0.126,6.314,2.155
min,0.0,-1.872,0.0,0.0,0.0,-2.796,0.0,0.276,0.0,0.0
25%,0.173,-0.026,0.039,0.017,1622.0,-0.027,27.0,0.694,3.344,0.0
50%,0.376,0.036,0.149,0.045,2935.0,0.03,78.0,0.771,7.296,2.0
75%,0.614,0.198,0.361,0.105,5929.0,0.169,163.0,0.898,12.05,4.0
max,1.005,27.243,1.008,1.003,442591.0,35.912,71509.0,0.985,57.635,6.0


In [24]:
# Hard-coded feature importance per covariate.
# NOTE(review): presumably copied from the log output of
# scripts/run_prediction.py — confirm, and refresh after re-training.
feature_importance = {
    "built_up_sqkm": 0.6809662247111471,
    "built_up_sqkm_moran_loc": 0.01988053508452799,
    "tree_cover_sqkm": 0.026500526254896822,
    "sparse_vegetation_sqkm": 0.028962523717579694,
    "ghspop": 0.07120179247278358,
    "ghspop_moran_loc": 0.019240979999603704,
    "vnl": 0.021913688552227174,
    "shdi": 0.041209153160937186,
    "osm_other_major_roads_length_km": 0.05906996101074447,
    "region_code": 0.03105461503555239,
}