In [1]:

import os
import json
import pandas as pd
import geopandas as gpd
import re
import numpy as np

### Definitions

In [2]:
# Lookup tables keyed by the number the user types in the prompts below.
# Each table is built by enumerating its entries, so the keys are 0..4 in listing order.

# JSON files with the building statistics.
json_data_dic = dict(enumerate([
    "blds_continent_data_18-23_ai.json",
    "blds_africa_states_18-23_ai.json",
    "blds_northamerica_states_18-23_ai.json",
    "blds_europe_states_18-23_ai.json",
    "blds_asia_states_18-23_ai.json",
]))

# Shapefiles with the region borders.
# NOTE(review): "contionents" looks like a misspelling of "continents" - confirm it matches
# the file name on disk before changing it.
shp_dic = dict(enumerate([
    "contionents-borders.shp",
    "africa-states-borders.shp",
    "northamerica-states-borders.shp",
    "europe-states-borders.shp",
    "asia-states-borders.shp",
]))

# Folders holding one GeoJSON file per region.
geojson_folder_dic = dict(enumerate([
    "geojson-continent-states",
    "geojson-africa-states",
    "geojson-northamerica-states",
    "geojson-europe-states",
    "geojson-asia-states",
]))

In [3]:
# user input definition - json data
# Ask which JSON data set to load and repeat the question until the answer is one of the
# keys of json_data_dic. The result is stored in user_json_spec.

# Adjacent string literals are concatenated by the parser, so the prompt stays on one line
# without the indentation that backslash continuations inside a string would embed.
json_prompt = (
    "Which data do you want to join? "
    "Enter 0 for blds by continents (2018–2023), "
    "1 for blds by Africa states (2018–2023), "
    "2 for blds by North-Central America states (2018–2023), "
    "3 for blds by Europe states (2018–2023), "
    "4 for blds by Asia states (2018–2023): "
)

while True:
    try:
        json_answer = int(input(json_prompt))
    except ValueError:
        print("Please enter a number.")
        continue
    # Validate against the actual keys instead of assuming they are 0..len-1.
    if json_answer not in json_data_dic:
        print(f"Please enter one of these numbers: {sorted(json_data_dic)}.")
        continue
    user_json_spec = json_answer
    break


In [4]:
# user input definition - spatial data to connect
# Ask which border layer to join the statistics to and repeat the question until the answer
# is one of the keys of shp_dic. The result is stored in user_spatial.
# NOTE(review): later cells index geojson_folder_dic with both user_spatial and
# user_json_spec, so the two answers presumably have to refer to the same region set - confirm.

# Adjacent string literals are concatenated by the parser, so the prompt stays on one line
# without the indentation that backslash continuations inside a string would embed.
spatial_prompt = (
    "Which data do you want to plot? "
    "Enter 0 for stats by continents (2018–2023), "
    "1 for stats by Africa states (2018–2023), "
    "2 for stats by North-Central America states (2018–2023), "
    "3 for stats by Europe states (2018–2023), "
    "4 for stats by Asia states (2018–2023): "
)

while True:
    try:
        spatial_answer = int(input(spatial_prompt))
    except ValueError:
        print("Please enter a number.")
        continue
    # Validate against the actual keys instead of assuming they are 0..len-1.
    if spatial_answer not in shp_dic:
        print(f"Please enter one of these numbers: {sorted(shp_dic)}.")
        continue
    user_spatial = spatial_answer
    break


#### Read the downloaded/nonspatial and spatial data

In [5]:

# Move the working directory one level up and remember it as the project root.
# The chdir is guarded so that re-running this cell in the same kernel session does not
# climb one more directory each time; after a kernel restart home_dir is gone and the
# move happens again exactly once.
if "home_dir" not in globals():
    os.chdir('..')
    home_dir = os.getcwd()

# The trailing "" makes both paths end with a separator, because later cells append the
# file name directly to these strings.
ns_d_path = os.path.join(home_dir, "downloaded-data", "")
s_d_path = os.path.join(home_dir, "shp", "")


In [6]:
# Load the JSON file chosen via user_json_spec from the downloaded-data folder into ns_d.
# The encoding is passed explicitly so the read does not depend on the platform's locale
# default (JSON text is expected to be UTF-8).
json_file_path = os.path.join(ns_d_path, json_data_dic[user_json_spec])
with open(json_file_path, "r", encoding="utf-8") as f:
    ns_d = json.load(f)



In [7]:
# Display the parsed JSON content loaded in the previous cell.
ns_d


[[{'groupByObject': [0, 'remainder'],
   'result': [{'timestamp': '2018-01-01T00:00:00Z', 'value': 61906.0},
    {'timestamp': '2019-01-01T00:00:00Z', 'value': 76289.0},
    {'timestamp': '2020-01-01T00:00:00Z', 'value': 86564.0},
    {'timestamp': '2021-01-01T00:00:00Z', 'value': 106221.0},
    {'timestamp': '2022-01-01T00:00:00Z', 'value': 134409.0},
    {'timestamp': '2023-01-01T00:00:00Z', 'value': 149282.0}]},
  {'groupByObject': [0, 'source=microsoft/BuildingFootprints'],
   'result': [{'timestamp': '2018-01-01T00:00:00Z', 'value': 0.0},
    {'timestamp': '2019-01-01T00:00:00Z', 'value': 0.0},
    {'timestamp': '2020-01-01T00:00:00Z', 'value': 0.0},
    {'timestamp': '2021-01-01T00:00:00Z', 'value': 0.0},
    {'timestamp': '2022-01-01T00:00:00Z', 'value': 0.0},
    {'timestamp': '2023-01-01T00:00:00Z', 'value': 150.0}]}],
 [{'groupByObject': [10, 'remainder'],
   'result': [{'timestamp': '2018-01-01T00:00:00Z', 'value': 781204.0},
    {'timestamp': '2019-01-01T00:00:00Z', 'value'

In [8]:
# Show the current working directory (it was changed by the os.chdir('..') call above).
os.getcwd()

'c:\\Users\\milan\\OneDrive - MUNI\\VŠ\\PhD\\Zahraniční stáž\\Work\\HeiGIT_notebooks\\analysis\\ai-assisted-osm-mapping-stats'

In [9]:

# Read the border layer chosen via user_spatial into a GeoDataFrame and preview it.
# s_d_path already ends with a path separator, so the file name is appended directly.
shp_file_path = s_d_path + shp_dic[user_spatial]
s_df = gpd.read_file(shp_file_path)
s_df.head()


Unnamed: 0,SOVEREIGNT,LEVEL,TYPE,ADMIN,NAME,CONTINENT,NAME_EN,geometry
0,Costa Rica,2,Sovereign country,Costa Rica,Costa Rica,North America,Costa Rica,"MULTIPOLYGON (((-83.69650 10.93659, -83.68687 ..."
1,Nicaragua,2,Sovereign country,Nicaragua,Nicaragua,North America,Nicaragua,"MULTIPOLYGON (((-85.70174 11.08088, -85.70242 ..."
2,Haiti,2,Sovereign country,Haiti,Haiti,North America,Haiti,"MULTIPOLYGON (((-71.75744 19.71011, -71.74861 ..."
3,Dominican Republic,2,Sovereign country,Dominican Republic,Dominican Rep.,North America,Dominican Republic,"MULTIPOLYGON (((-71.75744 19.71011, -71.73827 ..."
4,El Salvador,2,Sovereign country,El Salvador,El Salvador,North America,El Salvador,"MULTIPOLYGON (((-90.09831 13.73140, -90.11431 ..."


#### Edit the obtained data into desired shape to create a DataFrame

In [10]:
# Column-oriented container for the flattened JSON: one independent empty list per column.
d = {column: [] for column in ("feature", "source", "timestamp", "value")}


In [11]:
# Flatten the nested JSON into one row per (feature, source, timestamp).
# ns_d is a list of regions; each region is a list of groups carrying a two-element
# "groupByObject" ([feature id, source label]) and a "result" list of timestamp/value dicts.
#
# d is rebuilt from scratch here rather than appended to, so running this cell more than
# once does not duplicate every row.
flat_rows = [
    (group["groupByObject"][0], group["groupByObject"][1], res["timestamp"], res["value"])
    for region in ns_d
    for group in region
    for res in group["result"]
]

flat_columns = ["feature", "source", "timestamp", "value"]
d = {
    column: [row[position] for row in flat_rows]
    for position, column in enumerate(flat_columns)
}

ns_df = pd.DataFrame(d)

In [12]:
# Derive an integer "year" column from the ISO timestamp strings.
parsed_timestamps = pd.to_datetime(ns_df["timestamp"])
ns_df = ns_df.assign(year=parsed_timestamps.dt.year)

In [13]:
# Display the flattened statistics table including the derived year column.
ns_df

Unnamed: 0,feature,source,timestamp,value,year
0,0,remainder,2018-01-01T00:00:00Z,61906.0,2018
1,0,remainder,2019-01-01T00:00:00Z,76289.0,2019
2,0,remainder,2020-01-01T00:00:00Z,86564.0,2020
3,0,remainder,2021-01-01T00:00:00Z,106221.0,2021
4,0,remainder,2022-01-01T00:00:00Z,134409.0,2022
...,...,...,...,...,...
283,9,source=microsoft/BuildingFootprints,2019-01-01T00:00:00Z,0.0,2019
284,9,source=microsoft/BuildingFootprints,2020-01-01T00:00:00Z,0.0,2020
285,9,source=microsoft/BuildingFootprints,2021-01-01T00:00:00Z,39290.0,2021
286,9,source=microsoft/BuildingFootprints,2022-01-01T00:00:00Z,110173.0,2022


#### Append the names of the regions and numbers to the DataFrame

In [14]:

# Collect the names of all files found under the GeoJSON folder chosen via user_spatial.
# os.walk also descends into sub-folders, so files from every level end up in the list.
geojson_dir = os.path.join(home_dir, geojson_folder_dic[user_spatial], "")

geojson_names = [
    file_name
    for _dir_path, _dir_names, file_names in os.walk(geojson_dir)
    for file_name in file_names
]
print(geojson_names)
print(os.getcwd())


['0_Costa Rica.geojson', '10_Mexico.geojson', '11_Belize.geojson', '12_Panama.geojson', '13_Bahamas.geojson', '14_Trinidad and Tobago.geojson', '15_Grenada.geojson', '16_St. Vin. and Gren..geojson', '17_Barbados.geojson', '18_Saint Lucia.geojson', '19_Dominica.geojson', '1_Nicaragua.geojson', '20_Antigua and Barb..geojson', '21_St. Kitts and Nevis.geojson', '22_Jamaica.geojson', '23_Bajo Nuevo Bank.geojson', '24_Serranilla Bank.geojson', '2_Haiti.geojson', '3_Dominican Rep..geojson', '4_El Salvador.geojson', '5_Guatemala.geojson', '6_Cuba.geojson', '7_Honduras.geojson', '8_United States of America.geojson', '9_Canada.geojson']
c:\Users\milan\OneDrive - MUNI\VŠ\PhD\Zahraniční stáž\Work\HeiGIT_notebooks\analysis\ai-assisted-osm-mapping-stats


In [15]:
# Build the id -> region-name lookup from the GeoJSON file names ("<id>_<name>.<extension>").
# File names that do not follow this pattern are reported and skipped instead of raising a
# TypeError on a failed regex match.
name_pattern = re.compile(r'^(\d+)_(.+)\.[^.]*$')

id_name_dic = {}
skipped_names = []
for file_name in geojson_names:
    match = name_pattern.match(file_name)
    if match is None:
        skipped_names.append(file_name)
        continue
    # The name group is greedy, so only the final extension is stripped and names that
    # themselves end with a dot (e.g. "St. Vin. and Gren.") keep it.
    id_name_dic[int(match.group(1))] = match.group(2)

if skipped_names:
    print(f"Skipped files without an '<id>_<name>' pattern: {skipped_names}")

# One row per region: integer id and the name taken from the file name.
id_name_df = pd.DataFrame({"id": list(id_name_dic), "name": list(id_name_dic.values())})
id_name_df["id"] = id_name_df["id"].astype(int)


In [16]:
# Display the id -> name lookup table.
id_name_df


Unnamed: 0,id,name
0,0,Costa Rica
1,10,Mexico
2,11,Belize
3,12,Panama
4,13,Bahamas
5,14,Trinidad and Tobago
6,15,Grenada
7,16,St. Vin. and Gren.
8,17,Barbados
9,18,Saint Lucia


In [17]:
# merge df with region names based on ids
# Any "id"/"name" columns left over from an earlier run of this cell are dropped first, so
# re-running it does not produce id_x/id_y/name_x/name_y duplicates that break later cells.
ns_df = (
    ns_df.drop(columns=["id", "name"], errors="ignore")
    .merge(id_name_df, how="left", left_on="feature", right_on="id")
)
ns_df.head(20)


Unnamed: 0,feature,source,timestamp,value,year,id,name
0,0,remainder,2018-01-01T00:00:00Z,61906.0,2018,0,Costa Rica
1,0,remainder,2019-01-01T00:00:00Z,76289.0,2019,0,Costa Rica
2,0,remainder,2020-01-01T00:00:00Z,86564.0,2020,0,Costa Rica
3,0,remainder,2021-01-01T00:00:00Z,106221.0,2021,0,Costa Rica
4,0,remainder,2022-01-01T00:00:00Z,134409.0,2022,0,Costa Rica
5,0,remainder,2023-01-01T00:00:00Z,149282.0,2023,0,Costa Rica
6,0,source=microsoft/BuildingFootprints,2018-01-01T00:00:00Z,0.0,2018,0,Costa Rica
7,0,source=microsoft/BuildingFootprints,2019-01-01T00:00:00Z,0.0,2019,0,Costa Rica
8,0,source=microsoft/BuildingFootprints,2020-01-01T00:00:00Z,0.0,2020,0,Costa Rica
9,0,source=microsoft/BuildingFootprints,2021-01-01T00:00:00Z,0.0,2021,0,Costa Rica


In [18]:
# Show every row with at least one missing value, e.g. features that found no matching
# region name in the left merge above. An empty result means nothing is missing.
ns_df.loc[ns_df.isna().any(axis=1),:]

Unnamed: 0,feature,source,timestamp,value,year,id,name


In [19]:
# ns_df.sort_values(by=["value"], ascending=False, inplace=False).head(20)


#### Data preparation and filtering

In [20]:
# Total per region and year: "value" is summed over all rows sharing the same name and year,
# i.e. across the source groups (source=microsoft/BuildingFootprints and remainder).
# The result is a Series indexed by (name, year).
grouped = ns_df.groupby(["name", "year"])["value"].sum()

In [21]:
# Attach the per-region/per-year total to every row and express each row's count as a
# percentage of that total. A total of 0 yields NaN for the percentage.
yearly_totals = grouped.rename("total_blds")
merged_data = (
    ns_df.rename(columns={"value": "num_of_blds"})
    .merge(yearly_totals, on=["name", "year"], how="left")
    .assign(percentage=lambda frame: (frame["num_of_blds"] / frame["total_blds"]) * 100)
)


In [22]:
# percentage verification
# Sum the percentage column per region and year; the shares of the source groups are
# expected to add up to 100.
merged_data.groupby(["name", "year"])["percentage"].sum()


name                      year
Antigua and Barb.         2018    100.0
                          2019    100.0
                          2020    100.0
                          2021    100.0
                          2022    100.0
                                  ...  
United States of America  2019    100.0
                          2020    100.0
                          2021    100.0
                          2022    100.0
                          2023    100.0
Name: percentage, Length: 150, dtype: float64

In [23]:
# Split the table by source label: "remainder" rows and Microsoft BuildingFootprints rows.
is_remainder = merged_data["source"] == "remainder"
is_ms_footprints = merged_data["source"] == "source=microsoft/BuildingFootprints"

ns_df_rem = merged_data[is_remainder]
ns_df_ai = merged_data[is_ms_footprints]


In [24]:
def _latest_per_region(frame):
    """Return one row per region name holding the values of that region's latest year.

    GroupBy.last() picks the last non-null entry of every column independently and depends
    on the incoming row order, so it can combine values from different years. Selecting the
    whole row with the highest year keeps all columns consistent.

    Parameters
    ----------
    frame : pandas.DataFrame
        Needs the columns name, num_of_blds, year, percentage and total_blds.

    Returns
    -------
    pandas.DataFrame
        Columns name, num_of_blds, year, percentage, total_blds; sorted by name with a
        fresh 0..n-1 index.
    """
    columns = ["name", "num_of_blds", "year", "percentage", "total_blds"]
    # idxmax returns, per name (in sorted order), the index label of the row with max year.
    latest_labels = frame.groupby("name")["year"].idxmax()
    return frame.loc[latest_labels, columns].reset_index(drop=True)


ns_df_rem_sel = _latest_per_region(ns_df_rem)
ns_df_ai_sel = _latest_per_region(ns_df_ai)
ns_df_ai_sel.head()

Unnamed: 0,name,num_of_blds,year,percentage,total_blds
0,Antigua and Barb.,0.0,2023,0.0,39148.0
1,Bahamas,0.0,2023,0.0,92148.0
2,Bajo Nuevo Bank,0.0,2023,,0.0
3,Barbados,17.0,2023,0.011399,149138.0
4,Belize,0.0,2023,0.0,147207.0


In [25]:
# Preview the per-region selection for the "remainder" source.
ns_df_rem_sel.head()

Unnamed: 0,name,num_of_blds,year,percentage,total_blds
0,Antigua and Barb.,39148.0,2023,100.0,39148.0
1,Bahamas,92148.0,2023,100.0,92148.0
2,Barbados,149121.0,2023,99.988601,149138.0
3,Belize,147207.0,2023,100.0,147207.0
4,Canada,5638598.0,2023,97.194618,5801348.0


In [26]:
# pd.set_option('display.max_rows', None)
# pd.set_option('display.max_columns', None)


In [27]:
# Preview the border layer again to see which column holds the names used for the join below.
s_df.head(2)

Unnamed: 0,SOVEREIGNT,LEVEL,TYPE,ADMIN,NAME,CONTINENT,NAME_EN,geometry
0,Costa Rica,2,Sovereign country,Costa Rica,Costa Rica,North America,Costa Rica,"MULTIPOLYGON (((-83.69650 10.93659, -83.68687 ..."
1,Nicaragua,2,Sovereign country,Nicaragua,Nicaragua,North America,Nicaragua,"MULTIPOLYGON (((-85.70174 11.08088, -85.70242 ..."


In [28]:
def _join_stats_to_borders(stats):
    """Left-join a per-region statistics table onto the border layer s_df.

    Border features are matched through s_df["NAME"] == stats["name"]; features without a
    match keep their geometry and get NaN statistics. If the merge produces a "name_y"
    column, it is renamed to "added_name".
    """
    # left_on may have to be changed for layers whose name column is not "NAME"
    joined = s_df.merge(stats, how="left", left_on="NAME", right_on="name")
    return joined.rename(columns={"name_y": "added_name"})


merged_d_rem = _join_stats_to_borders(ns_df_rem_sel)
merged_d_ai = _join_stats_to_borders(ns_df_ai_sel)
merged_d_rem.head(2)


Unnamed: 0,SOVEREIGNT,LEVEL,TYPE,ADMIN,NAME,CONTINENT,NAME_EN,geometry,name,num_of_blds,year,percentage,total_blds
0,Costa Rica,2,Sovereign country,Costa Rica,Costa Rica,North America,Costa Rica,"MULTIPOLYGON (((-83.69650 10.93659, -83.68687 ...",Costa Rica,149282.0,2023.0,99.89962,149432.0
1,Nicaragua,2,Sovereign country,Nicaragua,Nicaragua,North America,Nicaragua,"MULTIPOLYGON (((-85.70174 11.08088, -85.70242 ...",Nicaragua,148643.0,2023.0,99.874353,148830.0


In [29]:
# verify the merge is successful
# Rows listed here contain at least one missing value, e.g. border features that received
# no statistics from the left join.
merged_d_rem.loc[merged_d_rem.isna().any(axis=1),:]


Unnamed: 0,SOVEREIGNT,LEVEL,TYPE,ADMIN,NAME,CONTINENT,NAME_EN,geometry,name,num_of_blds,year,percentage,total_blds
23,Bajo Nuevo Bank (Petrel Is.),2,Indeterminate,Bajo Nuevo Bank (Petrel Is.),Bajo Nuevo Bank,North America,Bajo Nuevo Bank,"POLYGON ((-79.98929 15.79495, -79.98782 15.796...",,,,,
24,Serranilla Bank,2,Indeterminate,Serranilla Bank,Serranilla Bank,North America,Serranilla Bank,"POLYGON ((-78.63707 15.86209, -78.64041 15.864...",,,,,


In [30]:
# Preview the border layer joined with the Microsoft BuildingFootprints statistics.
merged_d_ai.head(2)

Unnamed: 0,SOVEREIGNT,LEVEL,TYPE,ADMIN,NAME,CONTINENT,NAME_EN,geometry,name,num_of_blds,year,percentage,total_blds
0,Costa Rica,2,Sovereign country,Costa Rica,Costa Rica,North America,Costa Rica,"MULTIPOLYGON (((-83.69650 10.93659, -83.68687 ...",Costa Rica,150.0,2023,0.10038,149432.0
1,Nicaragua,2,Sovereign country,Nicaragua,Nicaragua,North America,Nicaragua,"MULTIPOLYGON (((-85.70174 11.08088, -85.70242 ...",Nicaragua,187.0,2023,0.125647,148830.0


In [31]:
# Print the column index of both joined layers to confirm they share the same schema.
for joined_layer in (merged_d_rem, merged_d_ai):
    print(joined_layer.columns)


Index(['SOVEREIGNT', 'LEVEL', 'TYPE', 'ADMIN', 'NAME', 'CONTINENT', 'NAME_EN',
       'geometry', 'name', 'num_of_blds', 'year', 'percentage', 'total_blds'],
      dtype='object')
Index(['SOVEREIGNT', 'LEVEL', 'TYPE', 'ADMIN', 'NAME', 'CONTINENT', 'NAME_EN',
       'geometry', 'name', 'num_of_blds', 'year', 'percentage', 'total_blds'],
      dtype='object')


#### Export the data into layers

In [32]:
# Show the current working directory; the export cells below write to paths relative to it.
os.getcwd()

'c:\\Users\\milan\\OneDrive - MUNI\\VŠ\\PhD\\Zahraniční stáž\\Work\\HeiGIT_notebooks\\analysis\\ai-assisted-osm-mapping-stats'

In [35]:
# Export the "remainder" layer as an ESRI Shapefile under maps/shp (relative to the cwd).
# The region part of the file name is the GeoJSON folder name without its 8-character
# "geojson-" prefix.
# NOTE(review): the file name ends in "18-21" although the input JSON files are named
# "18-23" - confirm which range is intended before renaming.
rem_layer_suffix = geojson_folder_dic[user_json_spec][8:]
rem_layer_path = f"maps/shp/mm_blds_{rem_layer_suffix}_18-21.shp"
merged_d_rem.to_file(rem_layer_path, driver='ESRI Shapefile')


  merged_d_rem.to_file(f"maps/shp/mm_blds_{geojson_folder_dic[user_json_spec][8:]}_18-21.shp", driver='ESRI Shapefile')


In [36]:
# Export the Microsoft BuildingFootprints layer as an ESRI Shapefile under maps/shp
# (relative to the cwd). The region part of the file name is the GeoJSON folder name
# without its 8-character "geojson-" prefix.
# NOTE(review): the file name ends in "18-21" although the input JSON files are named
# "18-23" - confirm which range is intended before renaming.
ai_layer_suffix = geojson_folder_dic[user_json_spec][8:]
ai_layer_path = f"maps/shp/ai_blds_{ai_layer_suffix}_18-21.shp"
merged_d_ai.to_file(ai_layer_path, driver='ESRI Shapefile')


  merged_d_ai.to_file(f"maps/shp/ai_blds_{geojson_folder_dic[user_json_spec][8:]}_18-21.shp", driver='ESRI Shapefile')
