In [1]:
import os
import json
import pandas as pd
import geopandas as gpd
import re
import numpy as np

### Definitions

In [2]:
# dictionary definitions

json_data_dic = {
            0: "roads_continent_data_18-23_ai.json",
            1: "roads_africa_states_18-23_ai.json",
            2: "roads_northamerica_states_18-23_ai.json",
            3: "roads_europe_states_18-23_ai.json",
            4: "roads_asia_states_18-23_ai.json",
            }

shp_dic = {
            0: "contionents-borders.shp",
            1: "africa-states-borders.shp",
            2: "northamerica-states-borders.shp",
            3: "europe-states-borders.shp",
            4: "asia-states-borders.shp",
                      }

geojson_folder_dic = {
                    0: "geojson-continent-states",
                    1: "geojson-africa-states",
                    2: "geojson-northamerica-states",
                    3: "geojson-europe-states",
                    4: "geojson-asia-states",
                      }

In [3]:
# user input definition - json data

while True:
    try:
        user_json_spec = int(input("Which data do you want to join?\
                            Enter a number:\
                            0 for roads by continents (2018–2023),\
                            1 for roads by Africa states (2018–2023),\
                            2 for roads by North-Central America states (2018–2023),\
                            3 for roads by Europe states (2018–2023),\
                            4 for roads by Asia states (2018–2023),\
                                :"
                                ))
    except ValueError:
        print("Please enter a number.")
        continue
    if user_json_spec not in range(len(json_data_dic)):
        print(f"Please enter one of these numbers: {json_data_dic.keys()}.")
        continue
    break


In [4]:
# user input definition - spatial data to conect

while True:
    try:
        user_spatial = int(input("Which data do you want to plot?\
                            Enter 0 for stats by continents (2018–2023),\
                            1 for stats by Africa states (2018–2023),\
                            2 for stats by North-Central America states (2018–2023),\
                            3 for stats by Europe states (2018–2023),\
                            4 for stats by Asia states (2018–2023):"
                                ))
    except ValueError:
        print("Please enter a number.")
        continue
    if user_spatial not in range(len(shp_dic)):
        print(f"Please enter one of these numbers: {shp_dic.keys()}.")
        continue
    break


#### Read the downloaded/nonspatial and spatial data

In [5]:

os.chdir('..')
home_dir = os.getcwd()
ns_d_path = os.path.join(home_dir, "downloaded-data", "")
s_d_path = os.path.join(home_dir, "shp", "")


In [6]:
# replace... blds_continents_data_18-23_ai.json; blds_africa_states_18-23_ai.json; blds_northamerica_states_18-23_ai.json
with open(f"{ns_d_path}{json_data_dic[user_json_spec]}", "r") as f:
    ns_d = json.load(f)
# ns_d



In [7]:
ns_d


[[{'groupByObject': [0, 'remainder'],
   'result': [{'timestamp': '2018-01-01T00:00:00Z', 'value': 412573538.52},
    {'timestamp': '2019-01-01T00:00:00Z', 'value': 729335700.28},
    {'timestamp': '2020-01-01T00:00:00Z', 'value': 1096717478.93},
    {'timestamp': '2021-01-01T00:00:00Z', 'value': 1206910324.62},
    {'timestamp': '2022-01-01T00:00:00Z', 'value': 1246753928.28},
    {'timestamp': '2023-01-01T00:00:00Z', 'value': 1255846149.67}]},
  {'groupByObject': [0, 'source=maxar'],
   'result': [{'timestamp': '2018-01-01T00:00:00Z', 'value': 0.0},
    {'timestamp': '2019-01-01T00:00:00Z', 'value': 0.0},
    {'timestamp': '2020-01-01T00:00:00Z', 'value': 8588.36},
    {'timestamp': '2021-01-01T00:00:00Z', 'value': 34164.54},
    {'timestamp': '2022-01-01T00:00:00Z', 'value': 2673150.37},
    {'timestamp': '2023-01-01T00:00:00Z', 'value': 3060945.91}]}],
 [{'groupByObject': [10, 'remainder'],
   'result': [{'timestamp': '2018-01-01T00:00:00Z', 'value': 9869635.94},
    {'timestamp': 

In [8]:
os.getcwd()

'c:\\Users\\milan\\OneDrive - MUNI\\VŠ\\PhD\\Zahraniční stáž\\Work\\HeiGIT_notebooks\\analysis\\ai-assisted-osm-mapping-stats'

In [9]:

# change name of the layer - africa-states-borders.shp; northamerica-states-borders.shp

s_df = gpd.read_file(f"{s_d_path}{shp_dic[user_spatial]}")
# s_df["s_id"] = s_df.index
s_df.head()


Unnamed: 0,SOVEREIGNT,LEVEL,TYPE,ADMIN,NAME,CONTINENT,NAME_EN,geometry
0,Indonesia,2,Sovereign country,Indonesia,Indonesia,Asia,Indonesia,"MULTIPOLYGON (((117.70361 4.16341, 117.70361 4..."
1,Malaysia,2,Sovereign country,Malaysia,Malaysia,Asia,Malaysia,"MULTIPOLYGON (((117.70361 4.16341, 117.69711 4..."
2,Cyprus,2,Sovereign country,Cyprus,Cyprus,Asia,Cyprus,"MULTIPOLYGON (((33.78183 34.97622, 33.78094 34..."
3,India,2,Sovereign country,India,India,Asia,India,"MULTIPOLYGON (((77.80035 35.49541, 77.81533 35..."
4,China,1,Sovereignty,China,China,Asia,People's Republic of China,"MULTIPOLYGON (((78.91769 33.38626, 78.91595 33..."


#### Edit the obtained data into desired shape to create a DataFrame

In [10]:
d = {
    "feature": [],
    "source": [],
    "timestamp": [],
    "value": []
}


In [11]:
for region in ns_d:
    for dic1 in region:
        feature, source = dic1["groupByObject"][0], dic1["groupByObject"][1]
        for res in dic1["result"]:
            d["feature"].append(feature)
            d["source"].append(source)
            d["timestamp"].append(res["timestamp"])
            d["value"].append(res["value"])

ns_df = pd.DataFrame(d)
# ns_df

In [12]:
# extract only the year
ns_df["year"] = pd.to_datetime(ns_df["timestamp"]).dt.year

# convert meters to kilometers
ns_df["value_km"] = ns_df["value"] / 1000

In [13]:
ns_df

Unnamed: 0,feature,source,timestamp,value,year,value_km
0,0,remainder,2018-01-01T00:00:00Z,4.125735e+08,2018,4.125735e+05
1,0,remainder,2019-01-01T00:00:00Z,7.293357e+08,2019,7.293357e+05
2,0,remainder,2020-01-01T00:00:00Z,1.096717e+09,2020,1.096717e+06
3,0,remainder,2021-01-01T00:00:00Z,1.206910e+09,2021,1.206910e+06
4,0,remainder,2022-01-01T00:00:00Z,1.246754e+09,2022,1.246754e+06
...,...,...,...,...,...,...
625,9,source=maxar,2019-01-01T00:00:00Z,0.000000e+00,2019,0.000000e+00
626,9,source=maxar,2020-01-01T00:00:00Z,0.000000e+00,2020,0.000000e+00
627,9,source=maxar,2021-01-01T00:00:00Z,8.688680e+03,2021,8.688680e+00
628,9,source=maxar,2022-01-01T00:00:00Z,1.986731e+04,2022,1.986731e+01


#### Append the names of the regions and numbers to the DataFrame

In [14]:

# replace... geojson-continent-borders, geojson-africa-states; geojson-northamerica-states
geojson_dir = os.path.join(home_dir, f"{geojson_folder_dic[user_spatial]}", "")

geojson_names = []
for (dir_path, dir_names, file_names) in os.walk(geojson_dir):  # give the path
    geojson_names.extend(file_names)
print(geojson_names)
print(os.getcwd())


['0_Indonesia.geojson', '10_Bhutan.geojson', '11_Oman.geojson', '12_Uzbekistan.geojson', '13_Kazakhstan.geojson', '14_Tajikistan.geojson', '15_Mongolia.geojson', '16_Russia.geojson', '17_Vietnam.geojson', '18_Cambodia.geojson', '19_United Arab Emirates.geojson', '1_Malaysia.geojson', '20_Georgia.geojson', '21_Azerbaijan.geojson', '22_Turkey.geojson', '23_Laos.geojson', '24_Kyrgyzstan.geojson', '25_Armenia.geojson', '26_Iraq.geojson', '27_Iran.geojson', '28_Qatar.geojson', '29_Saudi Arabia.geojson', '2_Cyprus.geojson', '30_Pakistan.geojson', '31_Thailand.geojson', '32_Kuwait.geojson', '33_Timor-Leste.geojson', '34_Brunei.geojson', '35_Myanmar.geojson', '36_Bangladesh.geojson', '37_Afghanistan.geojson', '38_Turkmenistan.geojson', '39_Jordan.geojson', '3_India.geojson', '40_Nepal.geojson', '41_Yemen.geojson', '42_N. Cyprus.geojson', '43_Cyprus U.N. Buffer Zone.geojson', '44_Siachen Glacier.geojson', '45_Philippines.geojson', '46_Sri Lanka.geojson', '47_Taiwan.geojson', '48_Japan.geojson',

In [15]:
geojson_names_2 = geojson_names

id_name_dic = {int(re.search(r'^\d+', k)[0]): re.search(r'\_.+\.', v)[0][1:-1]
                for k, v in zip(geojson_names, geojson_names_2)}
id_name_ser = pd.Series(id_name_dic)
id_name_df = pd.DataFrame(id_name_ser, columns=["name"])
id_name_df = id_name_df.reset_index().rename(columns={"index": "id"})
id_name_df["id"] = id_name_df["id"].astype(int)
# id_name_df


In [16]:
id_name_df


Unnamed: 0,id,name
0,0,Indonesia
1,10,Bhutan
2,11,Oman
3,12,Uzbekistan
4,13,Kazakhstan
5,14,Tajikistan
6,15,Mongolia
7,16,Russia
8,17,Vietnam
9,18,Cambodia


In [17]:
# merge df with region names based on ids
ns_df = ns_df.merge(id_name_df, how="left", left_on="feature",
               right_on="id")
ns_df.head(20)


Unnamed: 0,feature,source,timestamp,value,year,value_km,id,name
0,0,remainder,2018-01-01T00:00:00Z,412573500.0,2018,412573.5,0,Indonesia
1,0,remainder,2019-01-01T00:00:00Z,729335700.0,2019,729335.7,0,Indonesia
2,0,remainder,2020-01-01T00:00:00Z,1096717000.0,2020,1096717.0,0,Indonesia
3,0,remainder,2021-01-01T00:00:00Z,1206910000.0,2021,1206910.0,0,Indonesia
4,0,remainder,2022-01-01T00:00:00Z,1246754000.0,2022,1246754.0,0,Indonesia
5,0,remainder,2023-01-01T00:00:00Z,1255846000.0,2023,1255846.0,0,Indonesia
6,0,source=maxar,2018-01-01T00:00:00Z,0.0,2018,0.0,0,Indonesia
7,0,source=maxar,2019-01-01T00:00:00Z,0.0,2019,0.0,0,Indonesia
8,0,source=maxar,2020-01-01T00:00:00Z,8588.36,2020,8.58836,0,Indonesia
9,0,source=maxar,2021-01-01T00:00:00Z,34164.54,2021,34.16454,0,Indonesia


In [18]:
ns_df.loc[ns_df.isna().any(axis=1),:]

Unnamed: 0,feature,source,timestamp,value,year,value_km,id,name


In [19]:
# ns_df.sort_values(by=["value"], ascending=False, inplace=False).head(20)


#### Data preparation and filtering

In [20]:
# get the number of AI Microsoft (source=microsoft/BuildingFootprints) / manually mapped (remainder) buildings for every year
grouped = ns_df.groupby(["name", "year"])["value"].sum()
# grouped

In [21]:
# merge the DataFrame with groupBy DataFrame and compute the percentage
merged_data = ns_df.merge(grouped, on=["name", 'year'], how='left', )
merged_data = merged_data.rename(columns={"value_x":"len_of_roads", "value_y":"total_roads"})
merged_data["percentage"] = (merged_data["len_of_roads"] / merged_data["total_roads"]) * 100
# merged_data.head(10)


In [22]:
# percentage verification
merged_data.groupby(["name", "year"])["percentage"].sum()


name         year
Afghanistan  2018    100.0
             2019    100.0
             2020    100.0
             2021    100.0
             2022    100.0
                     ...  
Yemen        2019    100.0
             2020    100.0
             2021    100.0
             2022    100.0
             2023    100.0
Name: percentage, Length: 318, dtype: float64

In [23]:
ns_df_rem = merged_data.loc[merged_data["source"] == "remainder", :]
ns_df_ai = merged_data.loc[merged_data["source"] == "source=maxar", :]


In [24]:
ns_df_rem_sel = ns_df_rem.groupby("name")[["len_of_roads", "year", "percentage", "total_roads"]].last().reset_index()
ns_df_ai_sel = ns_df_ai.groupby("name")[["len_of_roads", "year", "percentage", "total_roads"]].last().reset_index()
ns_df_ai_sel.head()

Unnamed: 0,name,len_of_roads,year,percentage,total_roads
0,Afghanistan,16340195.25,2023,8.026059,203589300.0
1,Armenia,9618.81,2023,0.01612,59670880.0
2,Azerbaijan,315971.85,2023,0.333903,94629970.0
3,Bahrain,322.18,2023,0.005814,5541255.0
4,Bangladesh,5615375.52,2023,2.931745,191537000.0


In [25]:
ns_df_rem_sel.head()

Unnamed: 0,name,len_of_roads,year,percentage,total_roads
0,Afghanistan,187249100.0,2023,91.973941,203589300.0
1,Armenia,59661260.0,2023,99.98388,59670880.0
2,Azerbaijan,94314000.0,2023,99.666097,94629970.0
3,Bahrain,5540933.0,2023,99.994186,5541255.0
4,Bangladesh,185921600.0,2023,97.068255,191537000.0


In [26]:
# pd.set_option('display.max_rows', None)
# pd.set_option('display.max_columns', None)


In [27]:
s_df.head(2)

Unnamed: 0,SOVEREIGNT,LEVEL,TYPE,ADMIN,NAME,CONTINENT,NAME_EN,geometry
0,Indonesia,2,Sovereign country,Indonesia,Indonesia,Asia,Indonesia,"MULTIPOLYGON (((117.70361 4.16341, 117.70361 4..."
1,Malaysia,2,Sovereign country,Malaysia,Malaysia,Asia,Malaysia,"MULTIPOLYGON (((117.70361 4.16341, 117.69711 4..."


In [28]:
merged_d_rem = s_df.merge(ns_df_rem_sel, how="left",
                          left_on="NAME", right_on="name").rename(columns={"name_y": "added_name"})  # change left_on
merged_d_ai = s_df.merge(ns_df_ai_sel, how="left",
                         left_on="NAME", right_on="name").rename(columns={"name_y": "added_name"})  # change left_on
merged_d_rem.head(2)


Unnamed: 0,SOVEREIGNT,LEVEL,TYPE,ADMIN,NAME,CONTINENT,NAME_EN,geometry,name,len_of_roads,year,percentage,total_roads
0,Indonesia,2,Sovereign country,Indonesia,Indonesia,Asia,Indonesia,"MULTIPOLYGON (((117.70361 4.16341, 117.70361 4...",Indonesia,1255846000.0,2023.0,99.756857,1258907000.0
1,Malaysia,2,Sovereign country,Malaysia,Malaysia,Asia,Malaysia,"MULTIPOLYGON (((117.70361 4.16341, 117.69711 4...",Malaysia,325645900.0,2023.0,99.796276,326310600.0


In [29]:
# verify the merge is successful
merged_d_rem.loc[merged_d_rem.isna().any(axis=1),:]

Unnamed: 0,SOVEREIGNT,LEVEL,TYPE,ADMIN,NAME,CONTINENT,NAME_EN,geometry,name,len_of_roads,year,percentage,total_roads
52,Scarborough Reef,2,Indeterminate,Scarborough Reef,Scarborough Reef,Asia,Scarborough Shoal,"POLYGON ((117.75389 15.15437, 117.75569 15.151...",,,,,


In [30]:
merged_d_ai.head(2)

Unnamed: 0,SOVEREIGNT,LEVEL,TYPE,ADMIN,NAME,CONTINENT,NAME_EN,geometry,name,len_of_roads,year,percentage,total_roads
0,Indonesia,2,Sovereign country,Indonesia,Indonesia,Asia,Indonesia,"MULTIPOLYGON (((117.70361 4.16341, 117.70361 4...",Indonesia,3060945.91,2023,0.243143,1258907000.0
1,Malaysia,2,Sovereign country,Malaysia,Malaysia,Asia,Malaysia,"MULTIPOLYGON (((117.70361 4.16341, 117.69711 4...",Malaysia,664772.09,2023,0.203724,326310600.0


In [31]:
print(merged_d_rem.columns)
print(merged_d_ai.columns)


Index(['SOVEREIGNT', 'LEVEL', 'TYPE', 'ADMIN', 'NAME', 'CONTINENT', 'NAME_EN',
       'geometry', 'name', 'len_of_roads', 'year', 'percentage',
       'total_roads'],
      dtype='object')
Index(['SOVEREIGNT', 'LEVEL', 'TYPE', 'ADMIN', 'NAME', 'CONTINENT', 'NAME_EN',
       'geometry', 'name', 'len_of_roads', 'year', 'percentage',
       'total_roads'],
      dtype='object')


#### Export the data into layers

In [32]:
os.getcwd()

'c:\\Users\\milan\\OneDrive - MUNI\\VŠ\\PhD\\Zahraniční stáž\\Work\\HeiGIT_notebooks\\analysis\\ai-assisted-osm-mapping-stats'

In [35]:
# change name of the layer
merged_d_rem.to_file(f"maps/shp/mm_roads_{geojson_folder_dic[user_json_spec][8:]}_18-21.shp", driver='ESRI Shapefile')


  merged_d_rem.to_file(f"maps/shp/mm_roads_{geojson_folder_dic[user_json_spec][8:]}_18-21.shp", driver='ESRI Shapefile')


In [36]:
# change name of the layer
merged_d_ai.to_file(f"maps/shp/ai_roads_{geojson_folder_dic[user_json_spec][8:]}_18-21.shp", driver='ESRI Shapefile')


  merged_d_ai.to_file(f"maps/shp/ai_roads_{geojson_folder_dic[user_json_spec][8:]}_18-21.shp", driver='ESRI Shapefile')
