In [1]:
import os
import json
import pandas as pd
import geopandas as gpd
import re
import numpy as np

### Definitions

In [2]:
# Lookup tables keyed by the menu number the user types in the prompts below.
# enumerate() numbers the entries 0..4 in the order they are listed.

# JSON files with the road statistics
json_data_dic = dict(enumerate((
    "roads_continent_data_18-23_ai.json",
    "roads_africa_states_18-23_ai.json",
    "roads_northamerica_states_18-23_ai.json",
    "roads_europe_states_18-23_ai.json",
    "roads_asia_states_18-23_ai.json",
)))

# shapefiles with the matching borders
shp_dic = dict(enumerate((
    "contionents-borders.shp",
    "africa-states-borders.shp",
    "northamerica-states-borders.shp",
    "europe-states-borders.shp",
    "asia-states-borders.shp",
)))

# folders holding one GeoJSON file per region
geojson_folder_dic = dict(enumerate((
    "geojson-continent-states",
    "geojson-africa-states",
    "geojson-northamerica-states",
    "geojson-europe-states",
    "geojson-asia-states",
)))

In [3]:
# user input definition - json data

# The prompt is assembled from separate lines: a backslash continuation inside
# a string literal keeps the indentation of the following line, which put long
# runs of spaces into the text shown to the user.
json_prompt = "\n".join([
    "Which data do you want to join? Enter a number:",
    "  0 for roads by continents (2018–2023),",
    "  1 for roads by Africa states (2018–2023),",
    "  2 for roads by North-Central America states (2018–2023),",
    "  3 for roads by Europe states (2018–2023),",
    "  4 for roads by Asia states (2018–2023)",
    ": ",
])

# Keep asking until the answer is an integer that is a key of json_data_dic.
while True:
    try:
        user_json_spec = int(input(json_prompt))
    except ValueError:
        print("Please enter a number.")
        continue
    if user_json_spec in json_data_dic:
        break
    # list the valid keys as plain numbers instead of a dict_keys([...]) repr
    print(f"Please enter one of these numbers: {', '.join(map(str, json_data_dic))}.")


In [4]:
# user input definition - spatial data to connect

# The prompt is assembled from separate lines: a backslash continuation inside
# a string literal keeps the indentation of the following line, which put long
# runs of spaces into the text shown to the user.
spatial_prompt = "\n".join([
    "Which data do you want to plot? Enter",
    "  0 for stats by continents (2018–2023),",
    "  1 for stats by Africa states (2018–2023),",
    "  2 for stats by North-Central America states (2018–2023),",
    "  3 for stats by Europe states (2018–2023),",
    "  4 for stats by Asia states (2018–2023)",
    ": ",
])

# Keep asking until the answer is an integer that is a key of shp_dic.
while True:
    try:
        user_spatial = int(input(spatial_prompt))
    except ValueError:
        print("Please enter a number.")
        continue
    if user_spatial in shp_dic:
        break
    # list the valid keys as plain numbers instead of a dict_keys([...]) repr
    print(f"Please enter one of these numbers: {', '.join(map(str, shp_dic))}.")


#### Read the downloaded/nonspatial and spatial data

In [5]:

# Work from the parent of the current working directory.
# The guard keeps the cell idempotent: without it, every re-run in the same
# kernel would climb one more directory up and silently change all paths.
if "home_dir" not in globals():
    os.chdir('..')
    home_dir = os.getcwd()

# The trailing "" makes os.path.join end each path with a separator, so a
# file name can be appended to it directly.
ns_d_path = os.path.join(home_dir, "downloaded-data", "")  # non-spatial (JSON) data
s_d_path = os.path.join(home_dir, "shp", "")  # spatial layers


In [6]:
# Load the JSON file chosen in the first prompt.
# The encoding is named explicitly: JSON text is UTF-8, while open() would
# otherwise fall back to the platform's default encoding.
with open(os.path.join(ns_d_path, json_data_dic[user_json_spec]), "r", encoding="utf-8") as f:
    ns_d = json.load(f)



In [7]:
# Preview only the first two regions; echoing the whole payload floods the notebook output.
ns_d[:2]


[[{'groupByObject': [0, 'remainder'],
   'result': [{'timestamp': '2018-01-01T00:00:00Z', 'value': 180487646.72},
    {'timestamp': '2019-01-01T00:00:00Z', 'value': 229330580.6},
    {'timestamp': '2020-01-01T00:00:00Z', 'value': 241543940.76},
    {'timestamp': '2021-01-01T00:00:00Z', 'value': 260468976.28},
    {'timestamp': '2022-01-01T00:00:00Z', 'value': 286336593.29},
    {'timestamp': '2023-01-01T00:00:00Z', 'value': 300970970.6}]},
  {'groupByObject': [0, 'source=maxar'],
   'result': [{'timestamp': '2018-01-01T00:00:00Z', 'value': 0.0},
    {'timestamp': '2019-01-01T00:00:00Z', 'value': 0.0},
    {'timestamp': '2020-01-01T00:00:00Z', 'value': 0.0},
    {'timestamp': '2021-01-01T00:00:00Z', 'value': 6176933.08},
    {'timestamp': '2022-01-01T00:00:00Z', 'value': 10896248.26},
    {'timestamp': '2023-01-01T00:00:00Z', 'value': 12435673.65}]}],
 [{'groupByObject': [10, 'remainder'],
   'result': [{'timestamp': '2018-01-01T00:00:00Z', 'value': 257777598.24},
    {'timestamp': '201

In [8]:
# Debug check: show the working directory that relative paths are resolved against.
os.getcwd()

'c:\\Users\\milan\\OneDrive - MUNI\\VŠ\\PhD\\Zahraniční stáž\\Work\\HeiGIT_notebooks\\analysis\\ai-assisted-osm-mapping-stats'

In [9]:

# Read the border layer chosen in the second prompt
# (e.g. africa-states-borders.shp or northamerica-states-borders.shp).
shp_file = os.path.join(s_d_path, shp_dic[user_spatial])
s_df = gpd.read_file(shp_file)
s_df.head()


Unnamed: 0,SOVEREIGNT,LEVEL,TYPE,ADMIN,NAME,CONTINENT,NAME_EN,geometry
0,Ethiopia,2,Sovereign country,Ethiopia,Ethiopia,Africa,Ethiopia,"POLYGON ((34.07070 9.45459, 34.06689 9.53118, ..."
1,South Sudan,2,Sovereign country,South Sudan,S. Sudan,Africa,South Sudan,"POLYGON ((35.92084 4.61933, 35.85654 4.61960, ..."
2,Somalia,2,Sovereign country,Somalia,Somalia,Africa,Somalia,"POLYGON ((46.46696 6.53829, 46.48805 6.55864, ..."
3,Kenya,2,Sovereign country,Kenya,Kenya,Africa,Kenya,"MULTIPOLYGON (((35.70585 4.61945, 35.70594 4.6..."
4,Malawi,2,Sovereign country,Malawi,Malawi,Africa,Malawi,"MULTIPOLYGON (((34.96461 -11.57356, 34.65125 -..."


#### Edit the obtained data into desired shape to create a DataFrame

In [10]:
# Column-wise accumulator: one independent empty list per output column.
d = {column: [] for column in ("feature", "source", "timestamp", "value")}


In [11]:
# Flatten the nested JSON (regions -> groups -> yearly results) into columns.
# The accumulator is re-created here so that re-running this cell does not
# append every row a second time.
d = {"feature": [], "source": [], "timestamp": [], "value": []}

for region in ns_d:
    for group in region:
        # the first groupByObject entry is stored as the feature, the second as the source
        feature, source = group["groupByObject"][0], group["groupByObject"][1]
        for res in group["result"]:
            d["feature"].append(feature)
            d["source"].append(source)
            d["timestamp"].append(res["timestamp"])
            d["value"].append(res["value"])

ns_df = pd.DataFrame(d)

In [12]:
# keep just the calendar year of each timestamp
parsed_timestamps = pd.to_datetime(ns_df["timestamp"])
ns_df["year"] = parsed_timestamps.dt.year

# lengths are in meters; add them in kilometers as well
ns_df["value_km"] = ns_df["value"].div(1000)

In [13]:
# Show the flattened table, including the added year and value_km columns.
ns_df

Unnamed: 0,feature,source,timestamp,value,year,value_km
0,0,remainder,2018-01-01T00:00:00Z,1.804876e+08,2018,180487.64672
1,0,remainder,2019-01-01T00:00:00Z,2.293306e+08,2019,229330.58060
2,0,remainder,2020-01-01T00:00:00Z,2.415439e+08,2020,241543.94076
3,0,remainder,2021-01-01T00:00:00Z,2.604690e+08,2021,260468.97628
4,0,remainder,2022-01-01T00:00:00Z,2.863366e+08,2022,286336.59329
...,...,...,...,...,...,...
655,9,source=maxar,2019-01-01T00:00:00Z,0.000000e+00,2019,0.00000
656,9,source=maxar,2020-01-01T00:00:00Z,0.000000e+00,2020,0.00000
657,9,source=maxar,2021-01-01T00:00:00Z,0.000000e+00,2021,0.00000
658,9,source=maxar,2022-01-01T00:00:00Z,7.901474e+04,2022,79.01474


#### Append the names of the regions and numbers to the DataFrame

In [14]:

# Folder with one GeoJSON file per region, e.g. geojson-africa-states.
# NOTE(review): the folder is picked with user_spatial although the JSON data was
# picked with user_json_spec — confirm the two answers are meant to be equal.
geojson_dir = os.path.join(home_dir, geojson_folder_dic[user_spatial], "")

# Collect only *.geojson files (a stray file would break the id/name parsing in
# the next cell) and sort them so the order does not depend on the file system.
geojson_names = sorted(
    file_name
    for _dir_path, _dir_names, file_names in os.walk(geojson_dir)
    for file_name in file_names
    if file_name.lower().endswith(".geojson")
)
print(geojson_names)
print(os.getcwd())


['0_Ethiopia.geojson', '10_Democratic Republic of the Congo.geojson', '11_Namibia.geojson', '12_South Africa.geojson', '13_Libya.geojson', '14_Tunisia.geojson', '15_Zambia.geojson', '16_Sierra Leone.geojson', '17_Guinea.geojson', '18_Liberia.geojson', '19_Central African Republic.geojson', '1_South Sudan.geojson', '20_Sudan.geojson', '21_Djibouti.geojson', '22_Eritrea.geojson', '23_Ivory Coast.geojson', '24_Mali.geojson', '25_Senegal.geojson', '26_Nigeria.geojson', '27_Benin.geojson', '28_Angola.geojson', '29_Botswana.geojson', '2_Somalia.geojson', '30_Zimbabwe.geojson', '31_Chad.geojson', '32_Algeria.geojson', '33_Mozambique.geojson', '34_Eswatini.geojson', '35_Burundi.geojson', '36_Rwanda.geojson', '37_Uganda.geojson', '38_Lesotho.geojson', '39_Cameroon.geojson', '3_Kenya.geojson', '40_Gabon.geojson', '41_Niger.geojson', '42_Burkina Faso.geojson', '43_Togo.geojson', '44_Ghana.geojson', '45_Guinea-Bissau.geojson', '46_Egypt.geojson', '47_Mauritania.geojson', '48_Equatorial Guinea.geoj

In [15]:
# File names look like "<id>_<region name>.<extension>"; capture the id and the name.
# The name runs up to the last dot, so names that contain dots are kept whole.
name_pattern = re.compile(r"^(\d+)_(.+)\.[^.]*$")

id_name_dic = {}
for file_name in geojson_names:
    match = name_pattern.match(file_name)
    if match is None:
        # report and skip instead of failing with an opaque TypeError on None
        print(f"Skipping file without '<id>_<name>.<ext>' pattern: {file_name}")
        continue
    id_name_dic[int(match.group(1))] = match.group(2)

# lookup table with the numeric id and the region name
id_name_df = pd.DataFrame(
    {"id": list(id_name_dic.keys()), "name": list(id_name_dic.values())}
).astype({"id": int})


In [16]:
# Show the id -> region name lookup table built from the GeoJSON file names.
id_name_df


Unnamed: 0,id,name
0,0,Ethiopia
1,10,Democratic Republic of the Congo
2,11,Namibia
3,12,South Africa
4,13,Libya
5,14,Tunisia
6,15,Zambia
7,16,Sierra Leone
8,17,Guinea
9,18,Liberia


In [17]:
# Add the region names by matching the feature id against the lookup table.
# Columns left by a previous run of this cell are dropped first, so re-running
# it does not produce id_x/name_x duplicates that break the later groupby.
ns_df = (
    ns_df
    .drop(columns=["id", "name"], errors="ignore")
    .merge(id_name_df, how="left", left_on="feature", right_on="id")
)
ns_df.head(20)


Unnamed: 0,feature,source,timestamp,value,year,value_km,id,name
0,0,remainder,2018-01-01T00:00:00Z,180487600.0,2018,180487.64672,0.0,Ethiopia
1,0,remainder,2019-01-01T00:00:00Z,229330600.0,2019,229330.5806,0.0,Ethiopia
2,0,remainder,2020-01-01T00:00:00Z,241543900.0,2020,241543.94076,0.0,Ethiopia
3,0,remainder,2021-01-01T00:00:00Z,260469000.0,2021,260468.97628,0.0,Ethiopia
4,0,remainder,2022-01-01T00:00:00Z,286336600.0,2022,286336.59329,0.0,Ethiopia
5,0,remainder,2023-01-01T00:00:00Z,300971000.0,2023,300970.9706,0.0,Ethiopia
6,0,source=maxar,2018-01-01T00:00:00Z,0.0,2018,0.0,0.0,Ethiopia
7,0,source=maxar,2019-01-01T00:00:00Z,0.0,2019,0.0,0.0,Ethiopia
8,0,source=maxar,2020-01-01T00:00:00Z,0.0,2020,0.0,0.0,Ethiopia
9,0,source=maxar,2021-01-01T00:00:00Z,6176933.0,2021,6176.93308,0.0,Ethiopia


In [18]:
# Rows with at least one missing value, e.g. features that found no name in the lookup.
has_missing = ns_df.isna().any(axis="columns")
ns_df[has_missing]

Unnamed: 0,feature,source,timestamp,value,year,value_km,id,name
480,feature1,remainder,2018-01-01T00:00:00Z,150069000.0,2018,150068.99739,,
481,feature1,remainder,2019-01-01T00:00:00Z,166600200.0,2019,166600.17954,,
482,feature1,remainder,2020-01-01T00:00:00Z,308629800.0,2020,308629.78029,,
483,feature1,remainder,2021-01-01T00:00:00Z,366448700.0,2021,366448.74572,,
484,feature1,remainder,2022-01-01T00:00:00Z,383330600.0,2022,383330.62688,,
485,feature1,remainder,2023-01-01T00:00:00Z,385392300.0,2023,385392.33478,,
486,feature1,source=maxar,2018-01-01T00:00:00Z,0.0,2018,0.0,,
487,feature1,source=maxar,2019-01-01T00:00:00Z,0.0,2019,0.0,,
488,feature1,source=maxar,2020-01-01T00:00:00Z,0.0,2020,0.0,,
489,feature1,source=maxar,2021-01-01T00:00:00Z,4887.21,2021,4.88721,,


In [19]:
# ns_df.sort_values(by=["value"], ascending=False, inplace=False).head(20)


#### Data preparation and filtering

In [20]:
# Total road length per region and year, summed over all sources
# (the "remainder" rows plus the "source=maxar" rows).
group_keys = ["name", "year"]
grouped = ns_df.groupby(group_keys)["value"].sum()

In [21]:
# Attach the per-region/per-year total to every row and express each row's
# length as a percentage of that total.
totals = grouped.rename("total_roads")
merged_data = (
    ns_df
    .rename(columns={"value": "len_of_roads"})
    .merge(totals, on=["name", "year"], how="left")
)
merged_data["percentage"] = merged_data["len_of_roads"].div(merged_data["total_roads"]) * 100
merged_data.head(10)


Unnamed: 0,feature,source,timestamp,len_of_roads,year,value_km,id,name,total_roads,percentage
0,0,remainder,2018-01-01T00:00:00Z,180487600.0,2018,180487.64672,0.0,Ethiopia,180487600.0,100.0
1,0,remainder,2019-01-01T00:00:00Z,229330600.0,2019,229330.5806,0.0,Ethiopia,229330600.0,100.0
2,0,remainder,2020-01-01T00:00:00Z,241543900.0,2020,241543.94076,0.0,Ethiopia,241543900.0,100.0
3,0,remainder,2021-01-01T00:00:00Z,260469000.0,2021,260468.97628,0.0,Ethiopia,266645900.0,97.68347
4,0,remainder,2022-01-01T00:00:00Z,286336600.0,2022,286336.59329,0.0,Ethiopia,297232800.0,96.334104
5,0,remainder,2023-01-01T00:00:00Z,300971000.0,2023,300970.9706,0.0,Ethiopia,313406600.0,96.032096
6,0,source=maxar,2018-01-01T00:00:00Z,0.0,2018,0.0,0.0,Ethiopia,180487600.0,0.0
7,0,source=maxar,2019-01-01T00:00:00Z,0.0,2019,0.0,0.0,Ethiopia,229330600.0,0.0
8,0,source=maxar,2020-01-01T00:00:00Z,0.0,2020,0.0,0.0,Ethiopia,241543900.0,0.0
9,0,source=maxar,2021-01-01T00:00:00Z,6176933.0,2021,6176.93308,0.0,Ethiopia,266645900.0,2.31653


In [22]:
# Sanity check: the percentages of all sources should add up to 100 for each region and year.
share_sums = merged_data.groupby(["name", "year"])["percentage"].sum()
share_sums


name      year
Algeria   2018    100.0
          2019    100.0
          2020    100.0
          2021    100.0
          2022    100.0
                  ...  
Zimbabwe  2019    100.0
          2020    100.0
          2021    100.0
          2022    100.0
          2023    100.0
Name: percentage, Length: 324, dtype: float64

In [23]:
# Split the table by source tag: everything else ("remainder") vs. "source=maxar".
source_tag = merged_data["source"]
ns_df_rem = merged_data[source_tag == "remainder"]
ns_df_ai = merged_data[source_tag == "source=maxar"]


In [24]:
def latest_per_name(stats):
    """Return one row per region name holding its values for the latest year.

    The rows are sorted by ``year`` (stable sort) before grouping, so ``last()``
    picks the most recent year instead of depending on the incoming row order.
    ``last()`` takes the last non-null value of each column.
    """
    value_cols = ["len_of_roads", "year", "percentage", "total_roads"]
    return (
        stats.sort_values("year", kind="stable")
        .groupby("name")[value_cols]
        .last()
        .reset_index()
    )


ns_df_rem_sel = latest_per_name(ns_df_rem)
ns_df_ai_sel = latest_per_name(ns_df_ai)
ns_df_ai_sel.head()

Unnamed: 0,name,len_of_roads,year,percentage,total_roads
0,Algeria,1580443.38,2023,0.533287,296358700.0
1,Angola,589916.04,2023,0.322686,182814400.0
2,Benin,423180.59,2023,0.640173,66104140.0
3,Bir Tawil,0.0,2023,0.0,499639.1
4,Botswana,699073.54,2023,0.456002,153305000.0


In [25]:
# Preview the per-name summary of the "remainder" source.
ns_df_rem_sel.head()

Unnamed: 0,name,len_of_roads,year,percentage,total_roads
0,Algeria,294778200.0,2023,99.466713,296358700.0
1,Angola,182224500.0,2023,99.677314,182814400.0
2,Benin,65680960.0,2023,99.359827,66104140.0
3,Bir Tawil,499639.1,2023,100.0,499639.1
4,Botswana,152605900.0,2023,99.543998,153305000.0


In [26]:
# pd.set_option('display.max_rows', None)
# pd.set_option('display.max_columns', None)


In [27]:
# Preview the border layer columns before the statistics are joined onto it.
s_df.head(2)

Unnamed: 0,SOVEREIGNT,LEVEL,TYPE,ADMIN,NAME,CONTINENT,NAME_EN,geometry
0,Ethiopia,2,Sovereign country,Ethiopia,Ethiopia,Africa,Ethiopia,"POLYGON ((34.07070 9.45459, 34.06689 9.53118, ..."
1,South Sudan,2,Sovereign country,South Sudan,S. Sudan,Africa,South Sudan,"POLYGON ((35.92084 4.61933, 35.85654 4.61960, ..."


In [28]:
# Join the statistics onto the border layer; both joins share the same settings.
# Every border row is kept, matched on NAME_EN (adjust left_on for other layers).
join_kwargs = dict(how="left", left_on="NAME_EN", right_on="name")
name_fix = {"name_y": "added_name"}

merged_d_rem = s_df.merge(ns_df_rem_sel, **join_kwargs).rename(columns=name_fix)
merged_d_ai = s_df.merge(ns_df_ai_sel, **join_kwargs).rename(columns=name_fix)
merged_d_rem.head(2)


Unnamed: 0,SOVEREIGNT,LEVEL,TYPE,ADMIN,NAME,CONTINENT,NAME_EN,geometry,name,len_of_roads,year,percentage,total_roads
0,Ethiopia,2,Sovereign country,Ethiopia,Ethiopia,Africa,Ethiopia,"POLYGON ((34.07070 9.45459, 34.06689 9.53118, ...",Ethiopia,300971000.0,2023.0,96.032096,313406600.0
1,South Sudan,2,Sovereign country,South Sudan,S. Sudan,Africa,South Sudan,"POLYGON ((35.92084 4.61933, 35.85654 4.61960, ...",South Sudan,96821430.0,2023.0,94.940547,101981100.0


In [29]:
# Check the join: list rows with any missing value, such as regions it left without statistics.
missing_mask = merged_d_rem.isna().any(axis="columns")
merged_d_rem[missing_mask]

Unnamed: 0,SOVEREIGNT,LEVEL,TYPE,ADMIN,NAME,CONTINENT,NAME_EN,geometry,name,len_of_roads,year,percentage,total_roads
46,Egypt,2,Sovereign country,Egypt,Egypt,Africa,Egypt,"MULTIPOLYGON (((34.24835 31.21145, 34.25861 31...",,,,,


In [30]:
# Preview the border layer joined with the "source=maxar" statistics.
merged_d_ai.head(2)

Unnamed: 0,SOVEREIGNT,LEVEL,TYPE,ADMIN,NAME,CONTINENT,NAME_EN,geometry,name,len_of_roads,year,percentage,total_roads
0,Ethiopia,2,Sovereign country,Ethiopia,Ethiopia,Africa,Ethiopia,"POLYGON ((34.07070 9.45459, 34.06689 9.53118, ...",Ethiopia,12435673.65,2023.0,3.967904,313406600.0
1,South Sudan,2,Sovereign country,South Sudan,S. Sudan,Africa,South Sudan,"POLYGON ((35.92084 4.61933, 35.85654 4.61960, ...",South Sudan,5159686.36,2023.0,5.059453,101981100.0


In [31]:
# Print the column index of both joined layers, one after the other.
for joined_layer in (merged_d_rem, merged_d_ai):
    print(joined_layer.columns)


Index(['SOVEREIGNT', 'LEVEL', 'TYPE', 'ADMIN', 'NAME', 'CONTINENT', 'NAME_EN',
       'geometry', 'name', 'len_of_roads', 'year', 'percentage',
       'total_roads'],
      dtype='object')
Index(['SOVEREIGNT', 'LEVEL', 'TYPE', 'ADMIN', 'NAME', 'CONTINENT', 'NAME_EN',
       'geometry', 'name', 'len_of_roads', 'year', 'percentage',
       'total_roads'],
      dtype='object')


#### Export the data into layers

In [32]:
# Debug check: show the working directory that the relative export paths are resolved against.
os.getcwd()

'c:\\Users\\milan\\OneDrive - MUNI\\VŠ\\PhD\\Zahraniční stáž\\Work\\HeiGIT_notebooks\\analysis\\ai-assisted-osm-mapping-stats'

In [35]:
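# change name of the layer
# NOTE(review): the file name ends in "18-21" although the loaded data covers 2018–2023, and the
# folder key uses user_json_spec while the GeoJSON names were read with user_spatial — confirm both.
# merged_d_rem.to_file(f"maps/shp/mm_roads_{geojson_folder_dic[user_json_spec][8:]}_18-21.shp", driver='ESRI Shapefile')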


  merged_d_rem.to_file(f"maps/shp/mm_roads_{geojson_folder_dic[user_json_spec][8:]}_18-21.shp", driver='ESRI Shapefile')


In [36]:
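# change name of the layer
# NOTE(review): the file name ends in "18-21" although the loaded data covers 2018–2023, and the
# folder key uses user_json_spec while the GeoJSON names were read with user_spatial — confirm both.
# merged_d_ai.to_file(f"maps/shp/ai_roads_{geojson_folder_dic[user_json_spec][8:]}_18-21.shp", driver='ESRI Shapefile')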


  merged_d_ai.to_file(f"maps/shp/ai_roads_{geojson_folder_dic[user_json_spec][8:]}_18-21.shp", driver='ESRI Shapefile')
