In [1]:
import pandas as pd
import sqlite3

In [14]:
def load_dataframe(model_name, training_data, column):
    """Aggregate humanitarian and corporate OSM building-area shares per group.

    Runs one SQL query against the GeoPackage (SQLite) file at
    ``../data/global_urban_building_completeness.gpkg`` and returns the result
    as a DataFrame with one row per distinct value of ``column``.

    The query builds two per-urban-center CTEs from
    ``osm_user_contributions_per_urban_center_per_day_with_flag``:

    * ``hot_stats`` -- rows with ``hot_tm_user = 1``.
    * ``corporate_stats`` -- rows that also match a row in
      ``osm_user_contributions_per_urban_center_with_data_teams_csv`` on
      ``user_id``, ``timestamp`` and ``urban_center_id``.

    Both CTEs keep only rows with ``timestamp <= '2022-01-01'``. Their
    ``overall`` value is ``sum(CREATION_AREA + CHANGE_AREA - DELETION_AREA)``
    divided by ``1000*1000`` (presumably m^2 -> km^2; confirm against the data
    dictionary), clamped to the range ``[0, osm_building_area_sqkm_2023]``.

    Parameters
    ----------
    model_name : str
        Currently unused: the prediction table name
        ``rf_adjusted_prediction_reference_and_osm_urban_centers`` is
        hard-coded in the query.
    training_data : str
        Currently unused, for the same reason as ``model_name``.
    column : str
        SQL expression to group by, e.g. a column name such as
        ``"region_wb"`` or a quoted literal such as ``"'global'"`` to get a
        single overall row. It is interpolated verbatim into the SQL text, so
        it must come from trusted code, never from user input.

    Returns
    -------
    pandas.DataFrame
        Columns ``group_name``, ``n``, ``prediction_osm_completeness_2023``,
        ``humanitarian``, ``humanitarian_overall``, ``corporate`` and
        ``corporate_overall``, ordered by ``group_name``. Rows where
        ``column`` is null are excluded.
    """
    # NOTE: the `where` clauses below filter on columns of the left-joined
    # `hot` table, so urban centers without matching contributions are dropped
    # from the CTEs; they reappear with NULL stats via the final left joins.
    query = f"""
        with hot_stats as
            (
            select
              a.urban_center_id
              ,case
                when sum("CREATION_AREA" + "CHANGE_AREA" - "DELETION_AREA") / (1000*1000) > a.osm_building_area_sqkm_2023 then a.osm_building_area_sqkm_2023
                when sum("CREATION_AREA" + "CHANGE_AREA" - "DELETION_AREA") / (1000*1000) < 0 then 0
                else sum("CREATION_AREA" + "CHANGE_AREA" - "DELETION_AREA") / (1000*1000)
              end as overall
              ,sum("CREATION_AREA")  / (1000*1000) as created
              ,sum("CHANGE_AREA")  / (1000*1000) as changed
              ,sum("DELETION_AREA")  / (1000*1000) as deleted
            from all_parameters_urban_centers a
            left join osm_user_contributions_per_urban_center_per_day_with_flag hot
                on a.urban_center_id = hot.urban_center_id 
            where hot.hot_tm_user = 1 and hot.timestamp <= '2022-01-01'
            group by
                a.urban_center_id
                ,a.osm_building_area_sqkm_2023
            ),
            corporate_stats as (
              select
                  a.urban_center_id
                  ,case
                    when sum("CREATION_AREA" + "CHANGE_AREA" - "DELETION_AREA") / (1000*1000) > a.osm_building_area_sqkm_2023 then a.osm_building_area_sqkm_2023
                    when sum("CREATION_AREA" + "CHANGE_AREA" - "DELETION_AREA") / (1000*1000) < 0 then 0
                    else sum("CREATION_AREA" + "CHANGE_AREA" - "DELETION_AREA") / (1000*1000)
                  end as overall
                  ,sum("CREATION_AREA")  / (1000*1000) as created
                  ,sum("CHANGE_AREA")  / (1000*1000) as changed
                  ,sum("DELETION_AREA")  / (1000*1000) as deleted
                from all_parameters_urban_centers a
                left join osm_user_contributions_per_urban_center_per_day_with_flag hot
                    on a.urban_center_id = hot.urban_center_id
                left join osm_user_contributions_per_urban_center_with_data_teams_csv b 
                    on a.urban_center_id = b.urban_center_id
                where
                    hot.user_id = b.user_id and
                    hot.timestamp = b.timestamp and
                    hot.urban_center_id  = b.urban_center_id and
                    hot.timestamp <= '2022-01-01'
                group by 
                    a.urban_center_id
                    ,a.osm_building_area_sqkm_2023
            )
            select 
              {column} as group_name
              ,count(*) as n
              ,round(avg(prediction_osm_completeness_2023), 2) as prediction_osm_completeness_2023
              ,round((sum(hot.overall) / sum(osm_building_area_sqkm_2023)), 3) as humanitarian
              ,sum(hot.overall) as humanitarian_overall
              ,round((sum(corporate.overall) / sum(osm_building_area_sqkm_2023)), 3) as corporate
              ,sum(corporate.overall) as corporate_overall
            from rf_adjusted_prediction_reference_and_osm_urban_centers a
            left join hot_stats as hot
                on hot.urban_center_id = a.urban_center_id 
            left join corporate_stats as corporate
                on corporate.urban_center_id = a.urban_center_id 
            where {column} is not null
            group by group_name
            order by group_name 
    """
    con = sqlite3.connect("../data/global_urban_building_completeness.gpkg")
    try:
        df = pd.read_sql(query, con=con)
    finally:
        # Always release the database handle, even if the query fails; the
        # function is called repeatedly and would otherwise leak connections.
        con.close()
    return df

In [16]:
# Identify the prediction run whose results are summarised below.
model_name, training_data = "rf_adjusted", "reference_and_osm"

# Grouping expressions passed to load_dataframe; "'global'" is a quoted SQL
# literal, so every row falls into one group and a single overall row results.
columns = ["'global'", "region_wb", "shdi_2019_class", "ghspop_2020_class"]

# Render one summary table per grouping expression.
for column in columns:
    df = load_dataframe(model_name, training_data, column)
    display(df)

Unnamed: 0,group_name,n,prediction_osm_completeness_2023,humanitarian,humanitarian_overall,corporate,corporate_overall
0,global,13189,0.24,0.071,2452.669301,0.001,36.746796


Unnamed: 0,group_name,n,prediction_osm_completeness_2023,humanitarian,humanitarian_overall,corporate,corporate_overall
0,East Asia & Pacific,3068,0.18,0.097,761.770748,0.002,12.006835
1,Europe & Central Asia,1351,0.71,0.014,133.145224,0.001,6.206655
2,Latin America & Caribbean,1073,0.2,0.11,241.065818,0.003,7.604951
3,Middle East & North Africa,901,0.12,0.068,78.573414,0.002,2.165647
4,North America,378,0.64,0.003,27.635821,0.001,6.65946
5,South Asia,3997,0.09,0.165,325.843519,0.0,0.275399
6,Sub-Saharan Africa,2421,0.29,0.366,884.634755,0.001,1.82785


Unnamed: 0,group_name,n,prediction_osm_completeness_2023,humanitarian,humanitarian_overall,corporate,corporate_overall
0,high,3883,0.16,0.115,853.868814,0.002,13.261594
1,low,2289,0.27,0.418,463.899969,0.001,1.399312
2,medium,4960,0.14,0.215,885.733979,0.003,12.33406
3,very high,1967,0.59,0.011,249.166538,0.0,9.751831


Unnamed: 0,group_name,n,prediction_osm_completeness_2023,humanitarian,humanitarian_overall,corporate,corporate_overall
0,large metropolitan areas,287,0.41,0.078,1256.006407,0.001,17.402425
1,medium-size urban areas,1348,0.32,0.062,306.632983,0.0,1.017019
2,metropolitan areas,563,0.36,0.067,428.208377,0.001,9.062322
3,small urban areas,10930,0.22,0.065,461.821534,0.001,9.265031
