In [1]:
import os
import json
import pandas as pd
import geopandas as gpd
import re
import numpy as np

### Definitions

In [2]:
# dictionary definitions

json_data_dic = {
            0: "roads_continent_data_18-23_ai.json",
            1: "roads_africa_states_18-23_ai.json",
            2: "roads_northamerica_states_18-23_ai.json",
            3: "roads_europe_states_18-23_ai.json",
            4: "roads_asia_states_18-23_ai.json",
            }
shp_dic = {
            0: "contionents-borders.shp",
            1: "africa-states-borders.shp",
            2: "northamerica-states-borders.shp",
            3: "europe-states-borders.shp",
            4: "asia-states-borders.shp",
                      }

geojson_folder_dic = {
                    0: "geojson-continent-states",
                    1: "geojson-africa-states",
                    2: "geojson-northamerica-states",
                    3: "geojson-europe-states",
                    4: "geojson-asia-states",
                      }

In [3]:
# user input definition - json data

while True:
    try:
        user_json_spec = int(input("Which data do you want to join?\
                            Enter a number:\
                            0 for roads by continents (2018–2023),\
                            1 for roads by Africa states (2018–2023),\
                            2 for roads by North-Central America states (2018–2023),\
                            3 for roads by Europe states (2018–2023),\
                            4 for roads by Asia states (2018–2023),\
                                :"
                                ))
    except ValueError:
        print("Please enter a number.")
        continue
    if user_json_spec not in range(len(json_data_dic)):
        print(f"Please enter one of these numbers: {json_data_dic.keys()}.")
        continue
    break


In [4]:
# user input definition - spatial data to conect

while True:
    try:
        user_spatial = int(input("Which data do you want to plot?\
                            Enter 0 for stats by continents (2018–2023),\
                            1 for stats by Africa states (2018–2023),\
                            2 for stats by North-Central America states (2018–2023),\
                            3 for stats by Europe states (2018–2023),\
                            4 for stats by Asia states (2018–2023):"
                                ))
    except ValueError:
        print("Please enter a number.")
        continue
    if user_spatial not in range(len(shp_dic)):
        print(f"Please enter one of these numbers: {shp_dic.keys()}.")
        continue
    break


#### Read the downloaded/nonspatial and spatial data

In [5]:

os.chdir('..')
home_dir = os.getcwd()
ns_d_path = os.path.join(home_dir, "downloaded-data", "")
s_d_path = os.path.join(home_dir, "shp", "")


In [6]:
# replace... blds_continents_data_18-23_ai.json; blds_africa_states_18-23_ai.json; blds_northamerica_states_18-23_ai.json
with open(f"{ns_d_path}{json_data_dic[user_json_spec]}", "r") as f:
    ns_d = json.load(f)
# ns_d



In [7]:
ns_d


[[{'groupByObject': [0, 'remainder'],
   'result': [{'timestamp': '2018-01-01T00:00:00Z', 'value': 742852532.54},
    {'timestamp': '2019-01-01T00:00:00Z', 'value': 760699085.41},
    {'timestamp': '2020-01-01T00:00:00Z', 'value': 783185595.45},
    {'timestamp': '2021-01-01T00:00:00Z', 'value': 814566574.98},
    {'timestamp': '2022-01-01T00:00:00Z', 'value': 846346134.27},
    {'timestamp': '2023-01-01T00:00:00Z', 'value': 869777221.85}]},
  {'groupByObject': [0, 'source=maxar'],
   'result': [{'timestamp': '2018-01-01T00:00:00Z', 'value': 0.0},
    {'timestamp': '2019-01-01T00:00:00Z', 'value': 0.0},
    {'timestamp': '2020-01-01T00:00:00Z', 'value': 26099.16},
    {'timestamp': '2021-01-01T00:00:00Z', 'value': 508688.76},
    {'timestamp': '2022-01-01T00:00:00Z', 'value': 557589.33},
    {'timestamp': '2023-01-01T00:00:00Z', 'value': 644999.82}]}],
 [{'groupByObject': [10, 'remainder'],
   'result': [{'timestamp': '2018-01-01T00:00:00Z', 'value': 212846445.12},
    {'timestamp': '2

In [8]:
os.getcwd()

'c:\\Users\\milan\\OneDrive - MUNI\\VŠ\\PhD\\Zahraniční stáž\\Work\\HeiGIT_notebooks\\analysis\\ai-assisted-osm-mapping-stats'

In [9]:

# change name of the layer - africa-states-borders.shp; northamerica-states-borders.shp

s_df = gpd.read_file(f"{s_d_path}{shp_dic[user_spatial]}")
# s_df["s_id"] = s_df.index
s_df.head()


Unnamed: 0,SOVEREIGNT,LEVEL,TYPE,ADMIN,NAME,NAME_EN,geometry
0,United Kingdom,1,Sovereignty,United Kingdom,United Kingdom,United Kingdom,"MULTIPOLYGON (((33.78094 34.97635, 33.76043 34..."
1,France,1,Sovereignty,France,France,France,"MULTIPOLYGON (((-54.11153 2.11427, -54.13491 2..."
2,Ukraine,2,Sovereign country,Ukraine,Ukraine,Ukraine,"MULTIPOLYGON (((31.76434 52.10057, 31.82584 52..."
3,Belarus,2,Sovereign country,Belarus,Belarus,Belarus,"POLYGON ((23.60624 51.51740, 23.60231 51.53078..."
4,Netherlands,1,Sovereignty,Netherlands,Netherlands,Netherlands,"MULTIPOLYGON (((-63.10700 18.06212, -63.08589 ..."


#### Edit the obtained data into desired shape to create a DataFrame

In [10]:
d = {
    "feature": [],
    "source": [],
    "timestamp": [],
    "value": []
}


In [11]:
for region in ns_d:
    for dic1 in region:
        feature, source = dic1["groupByObject"][0], dic1["groupByObject"][1]
        for res in dic1["result"]:
            d["feature"].append(feature)
            d["source"].append(source)
            d["timestamp"].append(res["timestamp"])
            d["value"].append(res["value"])

ns_df = pd.DataFrame(d)
# ns_df

In [12]:
# extract only the year
ns_df["year"] = pd.to_datetime(ns_df["timestamp"]).dt.year

# convert meters to kilometers
ns_df["value"] = ns_df["value"] / 1000

In [13]:
ns_df

Unnamed: 0,feature,source,timestamp,value,year
0,0,remainder,2018-01-01T00:00:00Z,742852.53254,2018
1,0,remainder,2019-01-01T00:00:00Z,760699.08541,2019
2,0,remainder,2020-01-01T00:00:00Z,783185.59545,2020
3,0,remainder,2021-01-01T00:00:00Z,814566.57498,2021
4,0,remainder,2022-01-01T00:00:00Z,846346.13427,2022
...,...,...,...,...,...
523,9,source=maxar,2019-01-01T00:00:00Z,0.00000,2019
524,9,source=maxar,2020-01-01T00:00:00Z,0.00000,2020
525,9,source=maxar,2021-01-01T00:00:00Z,19.13492,2021
526,9,source=maxar,2022-01-01T00:00:00Z,35.05722,2022


#### Append the names of the regions and numbers to the DataFrame

In [14]:

# replace... geojson-continent-borders, geojson-africa-states; geojson-northamerica-states
geojson_dir = os.path.join(home_dir, f"{geojson_folder_dic[user_spatial]}", "")

geojson_names = []
for (dir_path, dir_names, file_names) in os.walk(geojson_dir):  # give the path
    geojson_names.extend(file_names)
print(geojson_names)
print(os.getcwd())


['0_United Kingdom.geojson', '10_Norway.geojson', '11_Sweden.geojson', '12_Finland.geojson', '13_Luxembourg.geojson', '14_Belgium.geojson', '15_North Macedonia.geojson', '16_Albania.geojson', '17_Kosovo.geojson', '18_Spain.geojson', '19_Denmark.geojson', '1_France.geojson', '20_Romania.geojson', '21_Hungary.geojson', '22_Slovakia.geojson', '23_Poland.geojson', '24_Ireland.geojson', '25_Greece.geojson', '26_Austria.geojson', '27_Italy.geojson', '28_Switzerland.geojson', '29_Liechtenstein.geojson', '2_Ukraine.geojson', '30_Serbia.geojson', '31_Croatia.geojson', '32_Slovenia.geojson', '33_Bulgaria.geojson', '34_San Marino.geojson', '35_Monaco.geojson', '36_Andorra.geojson', '37_Montenegro.geojson', '38_Bosnia and Herz..geojson', '39_Portugal.geojson', '3_Belarus.geojson', '40_Moldova.geojson', '41_Vatican.geojson', '42_Iceland.geojson', '43_Malta.geojson', '4_Netherlands.geojson', '5_Lithuania.geojson', '6_Czechia.geojson', '7_Germany.geojson', '8_Estonia.geojson', '9_Latvia.geojson']
c:\

In [15]:
geojson_names_2 = geojson_names

id_name_dic = {int(re.search(r'^\d+', k)[0]): re.search(r'\_.+\.', v)[0][1:-1]
                for k, v in zip(geojson_names, geojson_names_2)}
id_name_ser = pd.Series(id_name_dic)
id_name_df = pd.DataFrame(id_name_ser, columns=["name"])
id_name_df = id_name_df.reset_index().rename(columns={"index": "id"})
id_name_df["id"] = id_name_df["id"].astype(int)
# id_name_df


In [16]:
id_name_df


Unnamed: 0,id,name
0,0,United Kingdom
1,10,Norway
2,11,Sweden
3,12,Finland
4,13,Luxembourg
5,14,Belgium
6,15,North Macedonia
7,16,Albania
8,17,Kosovo
9,18,Spain


In [17]:
# merge df with region names based on ids
ns_df = ns_df.merge(id_name_df, how="left", left_on="feature",
               right_on="id")
ns_df.head(20)


Unnamed: 0,feature,source,timestamp,value,year,id,name
0,0,remainder,2018-01-01T00:00:00Z,742852.53254,2018,0,United Kingdom
1,0,remainder,2019-01-01T00:00:00Z,760699.08541,2019,0,United Kingdom
2,0,remainder,2020-01-01T00:00:00Z,783185.59545,2020,0,United Kingdom
3,0,remainder,2021-01-01T00:00:00Z,814566.57498,2021,0,United Kingdom
4,0,remainder,2022-01-01T00:00:00Z,846346.13427,2022,0,United Kingdom
5,0,remainder,2023-01-01T00:00:00Z,869777.22185,2023,0,United Kingdom
6,0,source=maxar,2018-01-01T00:00:00Z,0.0,2018,0,United Kingdom
7,0,source=maxar,2019-01-01T00:00:00Z,0.0,2019,0,United Kingdom
8,0,source=maxar,2020-01-01T00:00:00Z,26.09916,2020,0,United Kingdom
9,0,source=maxar,2021-01-01T00:00:00Z,508.68876,2021,0,United Kingdom


In [18]:
ns_df.loc[ns_df.isna().any(axis=1),:]

Unnamed: 0,feature,source,timestamp,value,year,id,name


In [19]:
# ns_df.sort_values(by=["value"], ascending=False, inplace=False).head(20)


#### Data preparation and filtering

In [20]:
# get the number of AI Microsoft (source=microsoft/BuildingFootprints) / manually mapped (remainder) buildings for every year
grouped = ns_df.groupby(["source", "year"])["value"].sum()
# grouped

In [21]:
# merge the DataFrame with groupBy DataFrame and compute the percentage
merged_data = ns_df.merge(grouped, on=["source", 'year'], how='left', )
merged_data = merged_data.rename(columns={"value_x":"len_of_roads", "value_y":"total_roads"})
merged_data["percentage"] = (merged_data["len_of_roads"] / merged_data["total_roads"]) * 100
# merged_data.head(10)


In [22]:
# percentage verification
merged_data.groupby(["source", "year"])["percentage"].sum()


source        year
remainder     2018    100.0
              2019    100.0
              2020    100.0
              2021    100.0
              2022    100.0
              2023    100.0
source=maxar  2018      0.0
              2019      0.0
              2020    100.0
              2021    100.0
              2022    100.0
              2023    100.0
Name: percentage, dtype: float64

In [23]:
ns_df_rem = merged_data.loc[merged_data["source"] == "remainder", :]
ns_df_ai = merged_data.loc[merged_data["source"] == "source=maxar", :]


In [24]:
ns_df_rem_sel = ns_df_rem.groupby("name")[["len_of_roads", "year", "percentage", "total_roads"]].last().reset_index()
ns_df_ai_sel = ns_df_ai.groupby("name")[["len_of_roads", "year", "percentage", "total_roads"]].last().reset_index()
ns_df_ai_sel.head()

Unnamed: 0,name,len_of_roads,year,percentage,total_roads
0,Albania,40.85834,2023,0.4371,9347.5901
1,Andorra,0.0,2023,0.0,9347.5901
2,Austria,1.76416,2023,0.018873,9347.5901
3,Belarus,12.98063,2023,0.138866,9347.5901
4,Belgium,0.45483,2023,0.004866,9347.5901


In [25]:
ns_df_rem_sel.head()

Unnamed: 0,name,len_of_roads,year,percentage,total_roads
0,Albania,48914.42838,2023,0.298112,16408060.0
1,Andorra,1635.76901,2023,0.009969,16408060.0
2,Austria,436386.41305,2023,2.659585,16408060.0
3,Belarus,303412.40723,2023,1.849166,16408060.0
4,Belgium,194746.62977,2023,1.186896,16408060.0


In [26]:
# pd.set_option('display.max_rows', None)
# pd.set_option('display.max_columns', None)


In [27]:
s_df.head(2)

Unnamed: 0,SOVEREIGNT,LEVEL,TYPE,ADMIN,NAME,NAME_EN,geometry
0,United Kingdom,1,Sovereignty,United Kingdom,United Kingdom,United Kingdom,"MULTIPOLYGON (((33.78094 34.97635, 33.76043 34..."
1,France,1,Sovereignty,France,France,France,"MULTIPOLYGON (((-54.11153 2.11427, -54.13491 2..."


In [28]:
merged_d_rem = s_df.merge(ns_df_rem_sel, how="left",
                          left_on="NAME", right_on="name").rename(columns={"name_y": "added_name"})  # change left_on
merged_d_ai = s_df.merge(ns_df_ai_sel, how="left",
                         left_on="NAME", right_on="name").rename(columns={"name_y": "added_name"})  # change left_on
merged_d_rem.head(2)


Unnamed: 0,SOVEREIGNT,LEVEL,TYPE,ADMIN,NAME,NAME_EN,geometry,name,len_of_roads,year,percentage,total_roads
0,United Kingdom,1,Sovereignty,United Kingdom,United Kingdom,United Kingdom,"MULTIPOLYGON (((33.78094 34.97635, 33.76043 34...",United Kingdom,869777.2,2023,5.300913,16408060.0
1,France,1,Sovereignty,France,France,France,"MULTIPOLYGON (((-54.11153 2.11427, -54.13491 2...",France,2323392.0,2023,14.160064,16408060.0


In [29]:
# verify the merge is successful
merged_d_rem.loc[merged_d_rem.isna().any(axis=1),:]

Unnamed: 0,SOVEREIGNT,LEVEL,TYPE,ADMIN,NAME,NAME_EN,geometry,name,len_of_roads,year,percentage,total_roads


In [30]:
merged_d_ai.head(2)

Unnamed: 0,SOVEREIGNT,LEVEL,TYPE,ADMIN,NAME,NAME_EN,geometry,name,len_of_roads,year,percentage,total_roads
0,United Kingdom,1,Sovereignty,United Kingdom,United Kingdom,United Kingdom,"MULTIPOLYGON (((33.78094 34.97635, 33.76043 34...",United Kingdom,644.99982,2023,6.900172,9347.5901
1,France,1,Sovereignty,France,France,France,"MULTIPOLYGON (((-54.11153 2.11427, -54.13491 2...",France,104.11323,2023,1.113798,9347.5901


In [31]:
print(merged_d_rem.columns)
print(merged_d_ai.columns)


Index(['SOVEREIGNT', 'LEVEL', 'TYPE', 'ADMIN', 'NAME', 'NAME_EN', 'geometry',
       'name', 'len_of_roads', 'year', 'percentage', 'total_roads'],
      dtype='object')
Index(['SOVEREIGNT', 'LEVEL', 'TYPE', 'ADMIN', 'NAME', 'NAME_EN', 'geometry',
       'name', 'len_of_roads', 'year', 'percentage', 'total_roads'],
      dtype='object')


#### Export the data into layers

In [32]:
os.getcwd()

'c:\\Users\\milan\\OneDrive - MUNI\\VŠ\\PhD\\Zahraniční stáž\\Work\\HeiGIT_notebooks\\analysis\\ai-assisted-osm-mapping-stats'

In [33]:
# change name of the layer
merged_d_rem.to_file(
    f"maps/shp/mm_roads_{geojson_folder_dic[user_json_spec][8:]}.shp", driver='ESRI Shapefile')


  merged_d_rem.to_file(


In [34]:
# change name of the layer
merged_d_ai.to_file(
    f"maps/shp/ai_roads_{geojson_folder_dic[user_json_spec][8:]}.shp", driver='ESRI Shapefile')


  merged_d_ai.to_file(
