Note: for performance scores at the urban-center level, see the file `Figure 7 - scatterplots of residuals`.

In [12]:
import sqlite3
import pandas as pd
import numpy as np

import matplotlib
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns

from sklearn import metrics

In [13]:
def load_avg_prediction_dataframe(training_data, split):
    """Load averaged predictions joined with the reference data per grid cell.

    Predictions from ``performance_20_clusters_reference_and_osm`` are averaged
    per ``(grid_fid, urban_center_id, split)`` and left-joined onto
    ``all_parameters_urban_centers_grid`` by ``grid_fid``. Only rows that have
    both a reference building area and a prediction are returned.

    Parameters
    ----------
    training_data : str
        Currently unused: the prediction table name is hard-coded in the
        query. Kept so existing callers keep working.
    split : str
        Currently unused: the query does not filter on it. Kept so existing
        callers keep working.
        NOTE(review): the join is on ``grid_fid`` only, so a table holding
        several splits would yield one row per split — confirm the table
        contains a single split.

    Returns
    -------
    pandas.DataFrame
        Columns: ``grid_fid``, ``urban_center_id``, ``region_wb``,
        ``model_name`` (constant ``'rf_adjusted'``), ``split``, ``prediction``,
        ``reference_osm_completeness``, ``reference_building_area_sqkm`` and
        ``prediction_osm_completeness`` (OSM building area 2023 divided by the
        averaged prediction).
    """
    # Plain string: the query has no placeholders, so no f-string is needed.
    query = """
        with agg_prediction as (
            select
              a.grid_fid
              ,a.urban_center_id
              ,a.split
              ,avg(a.prediction) as prediction
            from performance_20_clusters_reference_and_osm as a 
            group by grid_fid, urban_center_id, split
        )
        select
          a.grid_fid
          ,a.urban_center_id
          ,a.region_wb
          ,'rf_adjusted' as model_name
          ,b.split
          ,b.prediction
          ,a.reference_osm_completeness
          ,a.reference_building_area_sqkm
          ,a.osm_building_area_sqkm_2023 / b.prediction as prediction_osm_completeness
        from all_parameters_urban_centers_grid a
        left join agg_prediction b
            on a.grid_fid = b.grid_fid
        where
            reference_building_area_sqkm is not null
            and
            prediction is not null
    """
    con = sqlite3.connect("../data/global_urban_building_completeness.gpkg")
    try:
        df = pd.read_sql(query, con=con)
    finally:
        # Release the file handle even if the query fails; the connection
        # would otherwise stay open for the lifetime of the kernel.
        con.close()
    print(f"got dataframe with {len(df)} samples")
    return df


def get_all_samples():
    """Load every grid cell together with its World Bank region.

    Reads ``all_parameters_urban_centers_grid`` and left-joins
    ``ne_10m_admin_0_countries`` on ``iso_a3`` to attach ``region_wb``.

    Returns
    -------
    pandas.DataFrame
        Columns: ``id`` (the grid cell's ``grid_fid``), ``urban_center_id``
        and ``region_wb`` (null when no country row matches).
    """
    # Plain string: the query has no placeholders, so no f-string is needed.
    query = """
        select 
          a.grid_fid as id
          ,a.urban_center_id
          ,b.region_wb 
        from all_parameters_urban_centers_grid a
        left join ne_10m_admin_0_countries b
            on a.iso_a3 = b.iso_a3
    """
    con = sqlite3.connect("../data/global_urban_building_completeness.gpkg")
    try:
        df = pd.read_sql(query, con=con)
    finally:
        # Release the file handle even if the query fails; the connection
        # would otherwise stay open for the lifetime of the kernel.
        con.close()
    print(f"got dataframe with {len(df)} samples from table: all_parameters_urban_centers_grid")
    return df

## configuration

In [14]:
# Model names to evaluate.
models = ["rf_adjusted"]

# Split identifiers handed to the prediction loader.
splits = ["cluster_20"]

# [label, scoring function] pairs; each function is called as f(y_true, y_pred).
# NOTE: despite the "neg_" prefix on two labels, the mapped sklearn functions
# return the plain (non-negated) error. The labels are used as result keys
# elsewhere in this notebook, so they are kept as-is.
score_names = [
    ["r2", metrics.r2_score],
    ["explained_variance", metrics.explained_variance_score],
    ["neg_mean_squared_error", metrics.mean_squared_error],
    ["neg_mean_absolute_error", metrics.mean_absolute_error],
]

# Training-data identifiers handed to the prediction loader.
training_data_sets = ["reference_and_osm"]

# Each group is a list of `region_wb` values that are scored together;
# currently every group holds exactly one World Bank region.
wb_regions_groups = [
    [region_name]
    for region_name in (
        "Latin America & Caribbean",
        "East Asia & Pacific",
        "South Asia",
        "Europe & Central Asia",
        "North America",
        "Middle East & North Africa",
        "Sub-Saharan Africa",
    )
]

In [15]:
# Score the predictions per World Bank region group, then combine the regional
# scores into one global score weighted by the number of grid cells per region.
#
# Layout of `results`:
#   results[split][training_data][model_name] = {
#       "samples":           [all grid cells in the region, ...],
#       "reference_samples": [grid cells with reference data, ...],
#       <score label>:       [score for the region, ...],
#   }
# Every list has exactly one entry per element of `wb_regions_groups`, in the
# same order, so entries can be looked up by region position.
df_all_samples = get_all_samples()

results_list = []
results = {}

for split in splits:
    results[split] = {}
    for training_data in training_data_sets:
        results[split][training_data] = {}
        avg_prediction_df = load_avg_prediction_dataframe(training_data, split)

        # Sanity check: show any rows that still contain a null in some column.
        missing_data = avg_prediction_df[avg_prediction_df.isnull().any(axis=1)]
        display(missing_data)

        for model_name in models:
            print(model_name)
            model_results = {"samples": [], "reference_samples": []}
            for score_name, _ in score_names:
                model_results[score_name] = []
            results[split][training_data][model_name] = model_results

            for wb_regions in wb_regions_groups:
                # Build the region mask once and reuse it for both series.
                region_mask = avg_prediction_df["region_wb"].isin(wb_regions)
                y_test = avg_prediction_df.loc[region_mask, "reference_building_area_sqkm"]
                y_pred = avg_prediction_df.loc[region_mask, "prediction"]

                samples = int(df_all_samples["region_wb"].isin(wb_regions).sum())
                reference_samples = len(y_test)

                # Always record one entry per region. Skipping the appends for
                # an empty region would shift every later region's entries and
                # break positional lookups against `wb_regions_groups`.
                model_results["samples"].append(samples)
                model_results["reference_samples"].append(reference_samples)

                if reference_samples < 1:
                    print(f"no test samples for {wb_regions}")
                    for score_name, _ in score_names:
                        model_results[score_name].append(np.nan)
                    continue

                for score_name, score_function in score_names:
                    val = score_function(y_test, y_pred)
                    model_results[score_name].append(val)

            # get weighted average global score
            list_item = [training_data, model_name, split]
            samples = model_results["samples"]

            print(samples)

            # Regions without test samples carry NaN scores and are left out of
            # the average, which matches the result of not recording them.
            weights = np.asarray(samples)
            scored = np.asarray(model_results["reference_samples"]) > 0

            for score_name, _ in score_names:
                vals = np.asarray(model_results[score_name], dtype=float)
                if scored.any() and weights[scored].sum() > 0:
                    avg_score = np.average(vals[scored], weights=weights[scored])
                else:
                    # Nothing to average: report NaN instead of raising.
                    avg_score = np.nan
                list_item.append(avg_score)
            results_list.append(list_item)

columns = [
    "training_data",
    "model_name",
    "split",
    "r2",
    "explained_variance",
    "MSE",
    "MAE"
]
list_df = pd.DataFrame(results_list, columns=columns)
display(list_df.sort_values("model_name", ascending=False))

got dataframe with 671113 samples from table: all_parameters_urban_centers_grid
got dataframe with 442933 samples


Unnamed: 0,grid_fid,urban_center_id,region_wb,model_name,split,prediction,reference_osm_completeness,reference_building_area_sqkm,prediction_osm_completeness


rf_adjusted
[60321, 207631, 111920, 95817, 103621, 39154, 52649]


Unnamed: 0,training_data,model_name,split,r2,explained_variance,MSE,MAE
0,reference_and_osm,rf_adjusted,cluster_20,0.739129,0.741497,0.002465,0.033932


In [16]:
# Per-region breakdown of the scores collected in `results`: one table per
# (split, model), one row per entry of `wb_regions_groups`, followed by the
# total number of reference samples.
for split in splits:
    for model_name in ["rf_adjusted"]:
        # Result keys, in the order they appear as table columns after the
        # three leading identifier columns.
        metric_keys = (
            "samples",
            "reference_samples",
            "r2",
            "explained_variance",
            "neg_mean_squared_error",
            "neg_mean_absolute_error",
        )

        results_list_regions = []
        for i, wb_region in enumerate(wb_regions_groups):
            model_results = results[split]["reference_and_osm"][model_name]
            row = [model_name, wb_region, split]
            # Entries are looked up by region position `i`.
            row.extend(model_results[key][i] for key in metric_keys)
            results_list_regions.append(row)

        columns = [
            "model_name",
            "region",
            "split",
            "samples",
            "reference_samples",
            "r2",
            "explained_variance",
            "MSE",
            "MAE",
        ]
        list_df = pd.DataFrame(results_list_regions, columns=columns)
        display(list_df.sort_values("region", ascending=True))

        print(list_df["reference_samples"].sum())

Unnamed: 0,model_name,region,split,samples,reference_samples,r2,explained_variance,MSE,MAE
1,rf_adjusted,[East Asia & Pacific],cluster_20,207631,86417,0.765553,0.765582,0.002504,0.036276
3,rf_adjusted,[Europe & Central Asia],cluster_20,95817,76714,0.711372,0.712091,0.002008,0.029893
0,rf_adjusted,[Latin America & Caribbean],cluster_20,60321,51792,0.67901,0.680341,0.004917,0.048681
5,rf_adjusted,[Middle East & North Africa],cluster_20,39154,31748,0.765276,0.76796,0.002792,0.037197
4,rf_adjusted,[North America],cluster_20,103621,99064,0.68043,0.685607,0.001286,0.026023
2,rf_adjusted,[South Asia],cluster_20,111920,55743,0.833454,0.837093,0.00242,0.030334
6,rf_adjusted,[Sub-Saharan Africa],cluster_20,52649,41455,0.649888,0.65721,0.002507,0.035925


442933
