In [1]:

import os
import json
import pandas as pd
import geopandas as gpd
import re
import numpy as np

### Definitions

In [2]:
# dictionary definitions

json_data_dic = {0: "blds_continent_data_18-23_ai.json",
            1: "blds_africa_states_18-23_ai.json",
            2: "blds_northamerica_states_18-23_ai.json",
            3: "blds_europe_states_18-23_ai.json",
            4: "blds_asia_states_18-23_ai.json",
            
            }

shp_dic = {0: "contionents-borders.shp",
                      1: "africa-states-borders.shp",
                      2: "northamerica-states-borders.shp",
                      3: "europe-states-borders.shp",
                      4: "asia-states-borders.shp",}

geojson_folder_dic = {0: "geojson-continent-states",
                      1: "geojson-africa-states",
                      2: "geojson-northamerica-states",
                      3: "geojson-europe-states",
                      4: "geojson-asia-states",}

In [3]:
# user input definition - json data

while True:
    try:
        user_json_spec = int(input("Which data do you want to join?\
                            Enter 0 for blds by continents (2018–2023),\
                            1 for blds by Africa states (2018–2023),\
                            2 for blds by North-Central America states (2018–2023),\
                            3 for blds by Europe states (2018–2023),\
                            4 for blds by Asia states (2018–2023),\
                                :"
                                ))
    except ValueError:
        print("Please enter a number.")
        continue
    if user_json_spec not in range(len(json_data_dic)):
        print(f"Please enter one of these numbers: {json_data_dic.keys()}.")
        continue
    break


In [4]:
# user input definition - spatial data to conect

while True:
    try:
        user_spatial = int(input("Which data do you want to plot?\
                            Enter 0 for stats by continents (2018–2023),\
                            1 for stats by Africa states (2018–2023),\
                            2 for stats by North-Central America states (2018–2023),\
                            3 for stats by Europe states (2018–2023),\
                            4 for stats by Asia states (2018–2023):"
                                ))
    except ValueError:
        print("Please enter a number.")
        continue
    if user_spatial not in range(len(shp_dic)):
        print(f"Please enter one of these numbers: {shp_dic.keys()}.")
        continue
    break


#### Read the downloaded/nonspatial and spatial data

In [5]:

os.chdir('..')
home_dir = os.getcwd()
ns_d_path = os.path.join(home_dir, "downloaded-data", "")
s_d_path = os.path.join(home_dir, "shp", "")


In [6]:
# replace... blds_continents_data_18-23_ai.json; blds_africa_states_18-23_ai.json; blds_northamerica_states_18-23_ai.json
with open(f"{ns_d_path}{json_data_dic[user_json_spec]}", "r") as f:
    ns_d = json.load(f)
# ns_d



In [7]:
ns_d


[[{'groupByObject': [0, 'remainder'],
   'result': [{'timestamp': '2018-01-01T00:00:00Z', 'value': 284919.0},
    {'timestamp': '2019-01-01T00:00:00Z', 'value': 350579.0},
    {'timestamp': '2020-01-01T00:00:00Z', 'value': 395155.0},
    {'timestamp': '2021-01-01T00:00:00Z', 'value': 470680.0},
    {'timestamp': '2022-01-01T00:00:00Z', 'value': 629701.0},
    {'timestamp': '2023-01-01T00:00:00Z', 'value': 753591.0}]},
  {'groupByObject': [0, 'source=microsoft/BuildingFootprints'],
   'result': [{'timestamp': '2018-01-01T00:00:00Z', 'value': 0.0},
    {'timestamp': '2019-01-01T00:00:00Z', 'value': 0.0},
    {'timestamp': '2020-01-01T00:00:00Z', 'value': 0.0},
    {'timestamp': '2021-01-01T00:00:00Z', 'value': 0.0},
    {'timestamp': '2022-01-01T00:00:00Z', 'value': 0.0},
    {'timestamp': '2023-01-01T00:00:00Z', 'value': 6327.0}]}],
 [{'groupByObject': [10, 'remainder'],
   'result': [{'timestamp': '2018-01-01T00:00:00Z', 'value': 1361613.0},
    {'timestamp': '2019-01-01T00:00:00Z', 'v

In [8]:
os.getcwd()

'c:\\Users\\milan\\OneDrive - MUNI\\VŠ\\PhD\\Zahraniční stáž\\Work\\HeiGIT_notebooks\\analysis\\ai-assisted-osm-mapping-stats'

In [9]:

# change name of the layer - africa-states-borders.shp; northamerica-states-borders.shp

s_df = gpd.read_file(f"{s_d_path}{shp_dic[user_spatial]}")
# s_df["s_id"] = s_df.index
s_df.head()


Unnamed: 0,SOVEREIGNT,LEVEL,TYPE,ADMIN,NAME,CONTINENT,NAME_EN,geometry
0,Ethiopia,2,Sovereign country,Ethiopia,Ethiopia,Africa,Ethiopia,"POLYGON ((34.07070 9.45459, 34.06689 9.53118, ..."
1,South Sudan,2,Sovereign country,South Sudan,S. Sudan,Africa,South Sudan,"POLYGON ((35.92084 4.61933, 35.85654 4.61960, ..."
2,Somalia,2,Sovereign country,Somalia,Somalia,Africa,Somalia,"POLYGON ((46.46696 6.53829, 46.48805 6.55864, ..."
3,Kenya,2,Sovereign country,Kenya,Kenya,Africa,Kenya,"MULTIPOLYGON (((35.70585 4.61945, 35.70594 4.6..."
4,Malawi,2,Sovereign country,Malawi,Malawi,Africa,Malawi,"MULTIPOLYGON (((34.96461 -11.57356, 34.65125 -..."


#### Edit the obtained data into desired shape to create a DataFrame

In [10]:
d = {
    "feature": [],
    "source": [],
    "timestamp": [],
    "value": []
}


In [11]:
for region in ns_d:
    for dic1 in region:
        feature, source = dic1["groupByObject"][0], dic1["groupByObject"][1]
        for res in dic1["result"]:
            d["feature"].append(feature)
            d["source"].append(source)
            d["timestamp"].append(res["timestamp"])
            d["value"].append(res["value"])

ns_df = pd.DataFrame(d)
# ns_df

In [12]:
# extract only the year
ns_df["year"] = pd.to_datetime(ns_df["timestamp"]).dt.year

In [13]:
ns_df

Unnamed: 0,feature,source,timestamp,value,year
0,0,remainder,2018-01-01T00:00:00Z,284919.0,2018
1,0,remainder,2019-01-01T00:00:00Z,350579.0,2019
2,0,remainder,2020-01-01T00:00:00Z,395155.0,2020
3,0,remainder,2021-01-01T00:00:00Z,470680.0,2021
4,0,remainder,2022-01-01T00:00:00Z,629701.0,2022
...,...,...,...,...,...
655,9,source=microsoft/BuildingFootprints,2019-01-01T00:00:00Z,0.0,2019
656,9,source=microsoft/BuildingFootprints,2020-01-01T00:00:00Z,0.0,2020
657,9,source=microsoft/BuildingFootprints,2021-01-01T00:00:00Z,0.0,2021
658,9,source=microsoft/BuildingFootprints,2022-01-01T00:00:00Z,0.0,2022


#### Append the names of the regions and numbers to the DataFrame

In [14]:

# replace... geojson-continent-borders, geojson-africa-states; geojson-northamerica-states
geojson_dir = os.path.join(home_dir, f"{geojson_folder_dic[user_spatial]}", "")

geojson_names = []
for (dir_path, dir_names, file_names) in os.walk(geojson_dir):  # give the path
    geojson_names.extend(file_names)
print(geojson_names)
print(os.getcwd())


['0_Ethiopia.geojson', '10_Democratic Republic of the Congo.geojson', '11_Namibia.geojson', '12_South Africa.geojson', '13_Libya.geojson', '14_Tunisia.geojson', '15_Zambia.geojson', '16_Sierra Leone.geojson', '17_Guinea.geojson', '18_Liberia.geojson', '19_Central African Republic.geojson', '1_South Sudan.geojson', '20_Sudan.geojson', '21_Djibouti.geojson', '22_Eritrea.geojson', '23_Ivory Coast.geojson', '24_Mali.geojson', '25_Senegal.geojson', '26_Nigeria.geojson', '27_Benin.geojson', '28_Angola.geojson', '29_Botswana.geojson', '2_Somalia.geojson', '30_Zimbabwe.geojson', '31_Chad.geojson', '32_Algeria.geojson', '33_Mozambique.geojson', '34_Eswatini.geojson', '35_Burundi.geojson', '36_Rwanda.geojson', '37_Uganda.geojson', '38_Lesotho.geojson', '39_Cameroon.geojson', '3_Kenya.geojson', '40_Gabon.geojson', '41_Niger.geojson', '42_Burkina Faso.geojson', '43_Togo.geojson', '44_Ghana.geojson', '45_Guinea-Bissau.geojson', '46_Egypt.geojson', '47_Mauritania.geojson', '48_Equatorial Guinea.geoj

In [15]:
geojson_names_2 = geojson_names

id_name_dic = {int(re.search(r'^\d+', k)[0]): re.search(r'\_.+\.', v)[0][1:-1]
                for k, v in zip(geojson_names, geojson_names_2)}
id_name_ser = pd.Series(id_name_dic)
id_name_df = pd.DataFrame(id_name_ser, columns=["name"])
id_name_df = id_name_df.reset_index().rename(columns={"index": "id"})
id_name_df["id"] = id_name_df["id"].astype(int)
# id_name_df


In [16]:
id_name_df


Unnamed: 0,id,name
0,0,Ethiopia
1,10,Democratic Republic of the Congo
2,11,Namibia
3,12,South Africa
4,13,Libya
5,14,Tunisia
6,15,Zambia
7,16,Sierra Leone
8,17,Guinea
9,18,Liberia


In [17]:
# merge df with region names based on ids
ns_df = ns_df.merge(id_name_df, how="left", left_on="feature",
               right_on="id")
ns_df.head(20)


Unnamed: 0,feature,source,timestamp,value,year,id,name
0,0,remainder,2018-01-01T00:00:00Z,284919.0,2018,0,Ethiopia
1,0,remainder,2019-01-01T00:00:00Z,350579.0,2019,0,Ethiopia
2,0,remainder,2020-01-01T00:00:00Z,395155.0,2020,0,Ethiopia
3,0,remainder,2021-01-01T00:00:00Z,470680.0,2021,0,Ethiopia
4,0,remainder,2022-01-01T00:00:00Z,629701.0,2022,0,Ethiopia
5,0,remainder,2023-01-01T00:00:00Z,753591.0,2023,0,Ethiopia
6,0,source=microsoft/BuildingFootprints,2018-01-01T00:00:00Z,0.0,2018,0,Ethiopia
7,0,source=microsoft/BuildingFootprints,2019-01-01T00:00:00Z,0.0,2019,0,Ethiopia
8,0,source=microsoft/BuildingFootprints,2020-01-01T00:00:00Z,0.0,2020,0,Ethiopia
9,0,source=microsoft/BuildingFootprints,2021-01-01T00:00:00Z,0.0,2021,0,Ethiopia


In [18]:
ns_df.loc[ns_df.isna().any(axis=1),:]

Unnamed: 0,feature,source,timestamp,value,year,id,name


In [19]:
# ns_df.sort_values(by=["value"], ascending=False, inplace=False).head(20)


#### Data preparation and filtering

In [20]:
ns_df

Unnamed: 0,feature,source,timestamp,value,year,id,name
0,0,remainder,2018-01-01T00:00:00Z,284919.0,2018,0,Ethiopia
1,0,remainder,2019-01-01T00:00:00Z,350579.0,2019,0,Ethiopia
2,0,remainder,2020-01-01T00:00:00Z,395155.0,2020,0,Ethiopia
3,0,remainder,2021-01-01T00:00:00Z,470680.0,2021,0,Ethiopia
4,0,remainder,2022-01-01T00:00:00Z,629701.0,2022,0,Ethiopia
...,...,...,...,...,...,...,...
655,9,source=microsoft/BuildingFootprints,2019-01-01T00:00:00Z,0.0,2019,9,Republic of the Congo
656,9,source=microsoft/BuildingFootprints,2020-01-01T00:00:00Z,0.0,2020,9,Republic of the Congo
657,9,source=microsoft/BuildingFootprints,2021-01-01T00:00:00Z,0.0,2021,9,Republic of the Congo
658,9,source=microsoft/BuildingFootprints,2022-01-01T00:00:00Z,0.0,2022,9,Republic of the Congo


In [21]:
# get the number of AI Microsoft (source=microsoft/BuildingFootprints) / manually mapped (remainder) buildings for every year
grouped = ns_df.groupby(["name", "year"])["value"].sum()
# grouped

In [22]:
grouped

name      year
Algeria   2018     261826.0
          2019     365631.0
          2020     404078.0
          2021     428225.0
          2022     538250.0
                    ...    
Zimbabwe  2019    3481873.0
          2020    3668184.0
          2021    3696191.0
          2022    3809584.0
          2023    3869906.0
Name: value, Length: 330, dtype: float64

In [23]:
# merge the DataFrame with groupBy DataFrame and compute the percentage
merged_data = ns_df.merge(grouped, on=["name", 'year'], how='left', )
merged_data = merged_data.rename(columns={"value_x":"num_of_blds", "value_y":"total_blds"})
merged_data["percentage"] = (merged_data["num_of_blds"] / merged_data["total_blds"]) * 100
# merged_data.head(10)


In [24]:
# percentage verification
merged_data.groupby(["name", "year"])["percentage"].sum()


name      year
Algeria   2018    100.0
          2019    100.0
          2020    100.0
          2021    100.0
          2022    100.0
                  ...  
Zimbabwe  2019    100.0
          2020    100.0
          2021    100.0
          2022    100.0
          2023    100.0
Name: percentage, Length: 330, dtype: float64

In [25]:
ns_df_rem = merged_data.loc[merged_data["source"] == "remainder", :]
ns_df_ai = merged_data.loc[merged_data["source"] == "source=microsoft/BuildingFootprints", :]


In [26]:
ns_df_rem_sel = ns_df_rem.groupby("name")[["num_of_blds", "year", "percentage", "total_blds"]].last().reset_index()
ns_df_ai_sel = ns_df_ai.groupby("name")[["num_of_blds", "year", "percentage", "total_blds"]].last().reset_index()
ns_df_ai_sel.head()

Unnamed: 0,name,num_of_blds,year,percentage,total_blds
0,Algeria,539.0,2023,0.094237,571962.0
1,Angola,513.0,2023,0.08821,581565.0
2,Benin,439.0,2023,0.05292,829557.0
3,Bir Tawil,0.0,2023,0.0,7.0
4,Botswana,9.0,2023,0.000584,1542327.0


In [27]:
ns_df_rem_sel.head()

Unnamed: 0,name,num_of_blds,year,percentage,total_blds
0,Algeria,571423.0,2023,99.905763,571962.0
1,Angola,581052.0,2023,99.91179,581565.0
2,Benin,829118.0,2023,99.94708,829557.0
3,Bir Tawil,7.0,2023,100.0,7.0
4,Botswana,1542318.0,2023,99.999416,1542327.0


In [28]:
# pd.set_option('display.max_rows', None)
# pd.set_option('display.max_columns', None)


In [29]:
s_df.head(2)

Unnamed: 0,SOVEREIGNT,LEVEL,TYPE,ADMIN,NAME,CONTINENT,NAME_EN,geometry
0,Ethiopia,2,Sovereign country,Ethiopia,Ethiopia,Africa,Ethiopia,"POLYGON ((34.07070 9.45459, 34.06689 9.53118, ..."
1,South Sudan,2,Sovereign country,South Sudan,S. Sudan,Africa,South Sudan,"POLYGON ((35.92084 4.61933, 35.85654 4.61960, ..."


In [30]:
merged_d_rem = s_df.merge(ns_df_rem_sel, how="left",
                          left_on="NAME", right_on="name").rename(columns={"name_y": "added_name"})  # change left_on
merged_d_ai = s_df.merge(ns_df_ai_sel, how="left",
                         left_on="NAME", right_on="name").rename(columns={"name_y": "added_name"})  # change left_on
merged_d_rem.head(2)


Unnamed: 0,SOVEREIGNT,LEVEL,TYPE,ADMIN,NAME,CONTINENT,NAME_EN,geometry,name,num_of_blds,year,percentage,total_blds
0,Ethiopia,2,Sovereign country,Ethiopia,Ethiopia,Africa,Ethiopia,"POLYGON ((34.07070 9.45459, 34.06689 9.53118, ...",Ethiopia,753591.0,2023.0,99.16741,759918.0
1,South Sudan,2,Sovereign country,South Sudan,S. Sudan,Africa,South Sudan,"POLYGON ((35.92084 4.61933, 35.85654 4.61960, ...",,,,,


In [31]:
# verify the merge is successful
merged_d_rem.loc[merged_d_rem.isna().any(axis=1),:]


Unnamed: 0,SOVEREIGNT,LEVEL,TYPE,ADMIN,NAME,CONTINENT,NAME_EN,geometry,name,num_of_blds,year,percentage,total_blds
1,South Sudan,2,Sovereign country,South Sudan,S. Sudan,Africa,South Sudan,"POLYGON ((35.92084 4.61933, 35.85654 4.61960, ...",,,,,
8,Western Sahara,2,Indeterminate,Western Sahara,W. Sahara,Africa,Western Sahara,"POLYGON ((-8.81703 27.66146, -8.81654 27.66147...",,,,,
9,Republic of the Congo,2,Sovereign country,Republic of the Congo,Congo,Africa,Republic of the Congo,"POLYGON ((18.62639 3.47687, 18.63455 3.44922, ...",,,,,
10,Democratic Republic of the Congo,2,Sovereign country,Democratic Republic of the Congo,Dem. Rep. Congo,Africa,Democratic Republic of the Congo,"MULTIPOLYGON (((18.62639 3.47687, 18.59300 3.7...",,,,,
19,Central African Republic,2,Sovereign country,Central African Republic,Central African Rep.,Africa,Central African Republic,"POLYGON ((22.86106 10.91915, 22.86323 10.89182...",,,,,
23,Ivory Coast,2,Sovereign country,Ivory Coast,Côte d'Ivoire,Africa,Ivory Coast,"MULTIPOLYGON (((-7.98966 10.16199, -7.97090 10...",,,,,
34,eSwatini,2,Sovereign country,eSwatini,eSwatini,Africa,Eswatini,"POLYGON ((31.94924 -25.95810, 31.97529 -25.980...",,,,,
48,Equatorial Guinea,2,Sovereign country,Equatorial Guinea,Eq. Guinea,Africa,Equatorial Guinea,"MULTIPOLYGON (((9.79957 2.34174, 9.81106 2.324...",,,,,
49,Gambia,2,Sovereign country,Gambia,Gambia,Africa,The Gambia,"POLYGON ((-16.75365 13.06501, -16.76887 13.077...",,,,,
53,São Tomé and Principe,2,Sovereign country,São Tomé and Principe,São Tomé and Principe,Africa,São Tomé and Príncipe,"MULTIPOLYGON (((6.68141 0.40713, 6.70387 0.391...",,,,,


In [32]:
merged_d_ai.head(2)

Unnamed: 0,SOVEREIGNT,LEVEL,TYPE,ADMIN,NAME,CONTINENT,NAME_EN,geometry,name,num_of_blds,year,percentage,total_blds
0,Ethiopia,2,Sovereign country,Ethiopia,Ethiopia,Africa,Ethiopia,"POLYGON ((34.07070 9.45459, 34.06689 9.53118, ...",Ethiopia,6327.0,2023.0,0.83259,759918.0
1,South Sudan,2,Sovereign country,South Sudan,S. Sudan,Africa,South Sudan,"POLYGON ((35.92084 4.61933, 35.85654 4.61960, ...",,,,,


In [33]:
print(merged_d_rem.columns)
print(merged_d_ai.columns)


Index(['SOVEREIGNT', 'LEVEL', 'TYPE', 'ADMIN', 'NAME', 'CONTINENT', 'NAME_EN',
       'geometry', 'name', 'num_of_blds', 'year', 'percentage', 'total_blds'],
      dtype='object')
Index(['SOVEREIGNT', 'LEVEL', 'TYPE', 'ADMIN', 'NAME', 'CONTINENT', 'NAME_EN',
       'geometry', 'name', 'num_of_blds', 'year', 'percentage', 'total_blds'],
      dtype='object')


#### Export the data into layers

In [34]:
os.getcwd()

'c:\\Users\\milan\\OneDrive - MUNI\\VŠ\\PhD\\Zahraniční stáž\\Work\\HeiGIT_notebooks\\analysis\\ai-assisted-osm-mapping-stats'

In [35]:
# change name of the layer
# merged_d_rem.to_file(f"maps/shp/mm_blds_{geojson_folder_dic[user_json_spec][8:]}_18-21.shp", driver='ESRI Shapefile')


In [36]:
# change name of the layer
# merged_d_ai.to_file(f"maps/shp/ai_blds_{geojson_folder_dic[user_json_spec][8:]}_18-21.shp", driver='ESRI Shapefile')
