In [1]:
import pandas as pd
import sqlite3

In [8]:
def load_dataframe(model_name: str, training_data: str, column: str) -> pd.DataFrame:
    """Summarise humanitarian and corporate OSM building contributions per group.

    The query reads ``../data/global_urban_building_completeness.gpkg`` and
    builds two per-urban-center aggregates over contributions with
    ``timestamp <= '2022-01-01'``:

    * ``hot_stats``: contributions flagged ``hot_tm_user = 1``.
    * ``corporate_stats``: contributions that have a row with the same user,
      timestamp and urban center in
      ``osm_user_contributions_per_urban_center_with_data_teams_csv``.

    In both, ``overall`` is created + changed - deleted area divided by
    1000*1000 (presumably m^2 -> km^2; confirm against the data dictionary),
    clamped to the range ``[0, osm_building_area_sqkm]``.

    These aggregates are left-joined onto the prediction table
    ``{model_name}_prediction_{training_data}_urban_centers`` and grouped by
    ``column``.

    Parameters
    ----------
    model_name : str
        Prefix of the prediction table name (e.g. ``"rf_adjusted"``).
    training_data : str
        Middle part of the prediction table name (e.g. ``"reference_and_osm"``).
    column : str
        SQL expression to group by. It is inserted into the statement
        verbatim, so it may be a column name (``"region_wb"``) or a quoted
        literal (``"'global'"``). Only pass trusted values.

    Returns
    -------
    pandas.DataFrame
        One row per non-null group with the columns ``group_name``, ``n``,
        ``prediction_osm_completeness_2022``, ``humanitarian``,
        ``humanitarian_overall``, ``corporate`` and ``corporate_overall``.
    """
    # Build the prediction table name from the arguments instead of
    # hard-coding one model/training-data combination. It is emitted as a
    # double-quoted SQL identifier with embedded quotes doubled, so the
    # arguments cannot break out of the identifier.
    table_name = f"{model_name}_prediction_{training_data}_urban_centers"
    prediction_table = '"' + table_name.replace('"', '""') + '"'

    query = f"""
        with hot_stats as
            (
            select
              a.urban_center_id
              ,case
                when sum("CREATION_AREA" + "CHANGE_AREA" - "DELETION_AREA") / (1000*1000) > a.osm_building_area_sqkm then a.osm_building_area_sqkm
                when sum("CREATION_AREA" + "CHANGE_AREA" - "DELETION_AREA") / (1000*1000) < 0 then 0
                else sum("CREATION_AREA" + "CHANGE_AREA" - "DELETION_AREA") / (1000*1000)
              end as overall
              ,sum("CREATION_AREA")  / (1000*1000) as created
              ,sum("CHANGE_AREA")  / (1000*1000) as changed
              ,sum("DELETION_AREA")  / (1000*1000) as deleted
            from all_parameters_urban_centers a
            left join osm_user_contributions_per_urban_center_per_day_with_flag hot
                on a.urban_center_id = hot.urban_center_id
            where hot.hot_tm_user = 1 and hot.timestamp <= '2022-01-01'
            group by
                a.urban_center_id
                ,a.osm_building_area_sqkm
            ),
            corporate_stats as (
              select
                  a.urban_center_id
                  ,case
                    when sum("CREATION_AREA" + "CHANGE_AREA" - "DELETION_AREA") / (1000*1000) > a.osm_building_area_sqkm then a.osm_building_area_sqkm
                    when sum("CREATION_AREA" + "CHANGE_AREA" - "DELETION_AREA") / (1000*1000) < 0 then 0
                    else sum("CREATION_AREA" + "CHANGE_AREA" - "DELETION_AREA") / (1000*1000)
                  end as overall
                  ,sum("CREATION_AREA")  / (1000*1000) as created
                  ,sum("CHANGE_AREA")  / (1000*1000) as changed
                  ,sum("DELETION_AREA")  / (1000*1000) as deleted
                from all_parameters_urban_centers a
                left join osm_user_contributions_per_urban_center_per_day_with_flag hot
                    on a.urban_center_id = hot.urban_center_id
                left join osm_user_contributions_per_urban_center_with_data_teams_csv b
                    on a.urban_center_id = b.urban_center_id
                where
                    hot.user_id = b.user_id and
                    hot.timestamp = b.timestamp and
                    hot.urban_center_id  = b.urban_center_id and
                    hot.timestamp <= '2022-01-01'
                group by
                    a.urban_center_id
                    ,a.osm_building_area_sqkm
            )
            select
              {column} as group_name
              ,count(*) as n
              ,round(avg(prediction_osm_completeness), 2) as prediction_osm_completeness_2022
              ,round((sum(hot.overall) / sum(osm_building_area_sqkm)), 3) as humanitarian
              ,sum(hot.overall) as humanitarian_overall
              ,round((sum(corporate.overall) / sum(osm_building_area_sqkm)), 3) as corporate
              ,sum(corporate.overall) as corporate_overall
            from {prediction_table} a
            left join hot_stats as hot
                on hot.urban_center_id = a.urban_center_id
            left join corporate_stats as corporate
                on corporate.urban_center_id = a.urban_center_id
            where {column} is not null
            group by group_name
            order by group_name
    """

    con = sqlite3.connect("../data/global_urban_building_completeness.gpkg")
    try:
        return pd.read_sql(query, con=con)
    finally:
        # Always release the database handle, even when the query fails;
        # otherwise every call leaves one connection open.
        con.close()

In [9]:
# Model / training-data combination passed to load_dataframe for every table.
model_name = "rf_adjusted"
training_data = "reference_and_osm"

# Grouping expressions, one output table each. "'global'" is a quoted SQL
# string literal rather than a column name, so every row lands in a single
# group labelled "global".
columns = ["'global'", "region_wb", "shdi_class", "ghspop_class"]

for column in columns:
    df = load_dataframe(
        model_name=model_name,
        training_data=training_data,
        column=column,
    )
    display(df)

Unnamed: 0,group_name,n,prediction_osm_completeness_2022,humanitarian,humanitarian_overall,corporate,corporate_overall
0,global,13189,0.21,0.077,2424.905412,0.001,36.746696


Unnamed: 0,group_name,n,prediction_osm_completeness_2022,humanitarian,humanitarian_overall,corporate,corporate_overall
0,East Asia & Pacific,3068,0.17,0.109,758.488386,0.002,12.006835
1,Europe & Central Asia,1351,0.67,0.014,133.148356,0.001,6.206655
2,Latin America & Caribbean,1073,0.17,0.117,238.966176,0.004,7.604951
3,Middle East & North Africa,901,0.11,0.076,79.799847,0.002,2.165647
4,North America,378,0.57,0.003,27.635821,0.001,6.65946
5,South Asia,3997,0.07,0.174,324.319633,0.0,0.275399
6,Sub-Saharan Africa,2421,0.24,0.425,862.547193,0.001,1.82775


Unnamed: 0,group_name,n,prediction_osm_completeness_2022,humanitarian,humanitarian_overall,corporate,corporate_overall
0,high,3883,0.15,0.127,852.471887,0.002,13.261594
1,low,2289,0.23,0.468,456.304067,0.001,1.399212
2,medium,4960,0.12,0.233,867.091687,0.003,12.33406
3,very high,1967,0.55,0.012,249.037771,0.0,9.751831


Unnamed: 0,group_name,n,prediction_osm_completeness_2022,humanitarian,humanitarian_overall,corporate,corporate_overall
0,large metropolitan areas,309,0.36,0.086,1262.806832,0.001,17.665096
1,medium-size urban areas,1922,0.25,0.069,323.141655,0.0,1.036177
2,metropolitan areas,687,0.3,0.076,457.370534,0.001,8.865385
3,small urban areas,10271,0.2,0.062,381.58639,0.001,9.180039
