In [6]:
import pandas as pd
import sqlite3

In [10]:
def load_dataframe(model_name, training_data, column):
    """Summarise humanitarian and corporate mapping shares per group.

    Runs one SQL query against the GeoPackage (opened as a plain SQLite
    database) and returns one row per distinct value of ``column``.

    The query has three parts:

    * ``hot_stats`` -- per urban center, sums CREATION_AREA + CHANGE_AREA -
      DELETION_AREA over contributions with ``hot_tm_user = 1`` and
      ``timestamp <= '2023-01-01'``, divides by 1000*1000 and clamps the
      result to the range [0, osm_building_area_sqkm_2023].
    * ``corporate_stats`` -- the same aggregation, restricted to
      contributions that have a matching row (same user, same calendar
      date, same urban center) in the data-teams table.
    * final select -- groups the prediction table by ``column`` and reports
      the row count, the mean predicted completeness and, for each of the
      two CTEs, the summed ``overall`` area plus its ratio to the summed
      OSM building area.

    NOTE(review): ``hot_stats`` reads the current ``..._with_flag`` table
    whereas ``corporate_stats`` reads ``..._with_flag_OLD``; confirm that
    this difference is intended.

    Parameters
    ----------
    model_name, training_data : str
        Currently unused: the prediction table name is hard-coded in the
        query. Both are kept so existing calls keep working.
    column : str
        SQL expression used as the grouping key (a column name, or a quoted
        literal such as ``"'global'"`` to get a single group). It is
        interpolated into the SQL text verbatim, so it must come from
        trusted code, never from user input.

    Returns
    -------
    pandas.DataFrame
        Columns: group_name, n, prediction_osm_completeness_2023,
        humanitarian, humanitarian_overall, corporate, corporate_overall.
    """
    query = f"""
        with hot_stats as
            (
            select
              a.urban_center_id
              ,case
                when sum("CREATION_AREA" + "CHANGE_AREA" - "DELETION_AREA") / (1000*1000) > a.osm_building_area_sqkm_2023 then a.osm_building_area_sqkm_2023
                when sum("CREATION_AREA" + "CHANGE_AREA" - "DELETION_AREA") / (1000*1000) < 0 then 0
                else sum("CREATION_AREA" + "CHANGE_AREA" - "DELETION_AREA") / (1000*1000)
              end as overall
              ,sum("CREATION_AREA")  / (1000*1000) as created
              ,sum("CHANGE_AREA")  / (1000*1000) as changed
              ,sum("DELETION_AREA")  / (1000*1000) as deleted
            from all_parameters_urban_centers a
            --left join osm_user_contributions_per_urban_center_per_day_with_flag_OLD hot
            left join osm_user_contributions_per_urban_center_per_day_with_flag hot
                on a.urban_center_id = hot.urban_center_id 
            where
                hot.hot_tm_user = 1
                and hot.timestamp <= '2023-01-01'
            group by
                a.urban_center_id
                ,a.osm_building_area_sqkm_2023
            ),
            corporate_stats as (
              select
                  a.urban_center_id
                  ,case
                    when sum("CREATION_AREA" + "CHANGE_AREA" - "DELETION_AREA") / (1000*1000) > a.osm_building_area_sqkm_2023 then a.osm_building_area_sqkm_2023
                    when sum("CREATION_AREA" + "CHANGE_AREA" - "DELETION_AREA") / (1000*1000) < 0 then 0
                    else sum("CREATION_AREA" + "CHANGE_AREA" - "DELETION_AREA") / (1000*1000)
                  end as overall
                  ,sum("CREATION_AREA")  / (1000*1000) as created
                  ,sum("CHANGE_AREA")  / (1000*1000) as changed
                  ,sum("DELETION_AREA")  / (1000*1000) as deleted
                from all_parameters_urban_centers a
                left join osm_user_contributions_per_urban_center_per_day_with_flag_OLD hot
                --left join osm_user_contributions_per_urban_center_per_day_with_flag hot
                    on a.urban_center_id = hot.urban_center_id
                left join osm_user_contributions_per_urban_center_with_data_teams_csv b 
                    on a.urban_center_id = b.urban_center_id
                where
                    hot.user_id = b.user_id and
                    date(hot.timestamp) = date(b.timestamp) and
                    hot.urban_center_id  = b.urban_center_id and
                    hot.timestamp <= '2023-01-01'
                group by 
                    a.urban_center_id
                    ,a.osm_building_area_sqkm_2023
            )
            select 
              {column} as group_name
              ,count(*) as n
              ,round(avg(prediction_osm_completeness_2023), 2) as prediction_osm_completeness_2023
              ,round((sum(hot.overall) / sum(osm_building_area_sqkm_2023)), 3) as humanitarian
              ,sum(hot.overall) as humanitarian_overall
              ,round((sum(corporate.overall) / sum(osm_building_area_sqkm_2023)), 3) as corporate
              ,sum(corporate.overall) as corporate_overall
            from rf_adjusted_prediction_reference_and_osm_urban_centers a
            left join hot_stats as hot
                on hot.urban_center_id = a.urban_center_id 
            left join corporate_stats as corporate
                on corporate.urban_center_id = a.urban_center_id 
            where {column} is not null
            group by group_name
            order by group_name 
    """
    con = sqlite3.connect("../data/global_urban_building_completeness.gpkg")
    try:
        return pd.read_sql(query, con=con)
    finally:
        # Always release the database handle, even when the query fails;
        # this function is called once per grouping column.
        con.close()

In [11]:
# Identifiers handed to load_dataframe for every table below.
model_name, training_data = "rf_adjusted", "reference_and_osm"

# Grouping expressions, one summary table each. "'global'" is a quoted SQL
# literal, so every row lands in a single group.
columns = ["'global'", "region_wb", "shdi_2019_class", "ghspop_2020_class"]

for column in columns:
    df = load_dataframe(model_name, training_data, column)
    display(df)

Unnamed: 0,group_name,n,prediction_osm_completeness_2023,humanitarian,humanitarian_overall,corporate,corporate_overall
0,global,13189,0.24,0.1,3460.281099,0.001,36.746796


Unnamed: 0,group_name,n,prediction_osm_completeness_2023,humanitarian,humanitarian_overall,corporate,corporate_overall
0,East Asia & Pacific,3068,0.2,0.134,1045.906484,0.002,12.006835
1,Europe & Central Asia,1351,0.71,0.011,104.033038,0.001,6.206655
2,Latin America & Caribbean,1073,0.2,0.152,334.867426,0.003,7.604951
3,Middle East & North Africa,901,0.12,0.169,195.929692,0.002,2.165647
4,North America,378,0.64,0.021,193.418548,0.001,6.65946
5,South Asia,3997,0.09,0.178,350.267513,0.0,0.275399
6,Sub-Saharan Africa,2421,0.3,0.511,1235.858398,0.001,1.82785


Unnamed: 0,group_name,n,prediction_osm_completeness_2023,humanitarian,humanitarian_overall,corporate,corporate_overall
0,high,3883,0.17,0.158,1170.339389,0.002,13.261594
1,low,2289,0.28,0.523,580.545276,0.001,1.399312
2,medium,4960,0.15,0.303,1249.682433,0.003,12.33406
3,very high,1967,0.59,0.021,459.714,0.0,9.751831


Unnamed: 0,group_name,n,prediction_osm_completeness_2023,humanitarian,humanitarian_overall,corporate,corporate_overall
0,large metropolitan areas,287,0.41,0.104,1673.188487,0.001,17.402425
1,medium-size urban areas,1348,0.32,0.095,467.93563,0.0,1.017019
2,metropolitan areas,563,0.37,0.094,596.051184,0.001,9.062322
3,small urban areas,10930,0.23,0.101,723.105799,0.001,9.265031


## Population and Count stats for Completeness classes

In [23]:
def load_dataframe():
    """Aggregate urban centers by predicted-completeness class.

    Buckets ``prediction_osm_completeness_2023`` into five classes
    ('20' for < 0.2, '20-40', '40-60', '60-80', '80' for >= 0.8) and
    returns one row per class. Rows with a NULL prediction match no
    ``when`` branch and therefore end up in a NULL class.

    Note: this definition reuses the name ``load_dataframe`` and so replaces
    the three-argument function defined earlier in this notebook; cells that
    need the earlier version must re-run its defining cell first.

    Returns
    -------
    pandas.DataFrame
        Columns: completeness_class, n_urban_centers (row count),
        buildings (sum of osm_building_area_sqkm_2023) and ghspop_2020
        (sum of ghspop_2020 divided by 1,000,000).
    """
    query = """   
        select 
            case 
                when prediction_osm_completeness_2023 < 0.2 then '20'
                when prediction_osm_completeness_2023 >= 0.2 and prediction_osm_completeness_2023 < 0.4 then '20-40'
                when prediction_osm_completeness_2023 >= 0.4 and prediction_osm_completeness_2023 < 0.6 then '40-60'
                when prediction_osm_completeness_2023 >= 0.6 and prediction_osm_completeness_2023 < 0.8 then '60-80'
                when prediction_osm_completeness_2023 >= 0.8 then '80'
            end as completeness_class
            ,count(*) n_urban_centers
            ,sum(osm_building_area_sqkm_2023) buildings
            ,sum(ghspop_2020) / 1000000.0 as ghspop_2020 
        from rf_adjusted_prediction_reference_and_osm_urban_centers a
        group by completeness_class
        order by completeness_class 
    """
    con = sqlite3.connect("../data/global_urban_building_completeness.gpkg")
    try:
        return pd.read_sql(query, con=con)
    finally:
        # Release the database handle whether or not the query succeeds.
        con.close()

In [24]:
# Add each class's share of total population and of all urban centers.
df = load_dataframe().assign(
    ghspop_2020_share=lambda frame: frame["ghspop_2020"] / frame["ghspop_2020"].sum(),
    share_urban_centers=lambda frame: frame["n_urban_centers"] / frame["n_urban_centers"].sum(),
)
display(df)

Unnamed: 0,completeness_class,n_urban_centers,buildings,ghspop_2020,ghspop_2020_share,share_urban_centers
0,20,9163,2932.789766,1450.993887,0.486114,0.694746
1,20-40,908,3630.44748,435.473228,0.145893,0.068845
2,40-60,588,4930.389251,309.735159,0.103768,0.044583
3,60-80,682,5815.2842,296.715572,0.099406,0.05171
4,80,1848,17221.775807,491.964808,0.164819,0.140117
