In [None]:
from typing import Union

import numpy as np
import pandas as pd

import utils.dataset_processing_utils as dputil

SEED = 42
np.random.seed(SEED)

In [2]:
borders_df = dputil.load_dataset("cia_factbook.csv", encoding="utf-8")

NameError: name 'dputil' is not defined

### CIA FactBook for border data

figure (a) world map with internal borders in red and coastlines in blue



<img src="world_border_map.png" alt="world map with internal borders in red and coastlines in blue" width="800" height="600">

#### data manipulation:

In this section our aim is to convert both border length and coastline length from textual variable to numerical ones, by omitting the unit values (km) and any other non-numerical value in the description.

In [44]:
column_names = {
    "Country": "country",
    "Geography: Land boundaries - total": "border_length_km",
    "Geography: Coastline": "coastline_length_km"
}
borders_df_edited = borders_df.rename(columns=column_names)

# for "border_length_km" remove all non-numerical values
def handle_textual_borders(x: Union[str, float]) -> Union[list, str]:
    if pd.isna(x):
        return "-1"
    else:
        x = x.replace(",", "")
        numerics = [y for y in x.split(' ') if y.isnumeric()]
        if len(numerics) < 1:
            return "-1"
        else:
            return numerics[0]
borders_df_edited["border_length_km"] = borders_df_edited["border_length_km"].apply(handle_textual_borders).astype(int)
borders_df_edited["coastline_length_km"] = borders_df_edited["coastline_length_km"].apply(handle_textual_borders).astype(int)
borders_df_edited.sample(10, random_state=SEED)

Unnamed: 0,country,border_length_km,coastline_length_km
66,Djibouti,528,314
45,Central African Republic,5920,0
9,Antigua and Barbuda,0,153
73,Eritrea,1840,2234
136,Liberia,1667,579
104,Hong Kong,33,733
205,Senegal,2684,531
247,Vanuatu,0,2528
125,Kingman Reef,0,-1
211,Slovakia,1587,0


as we can see, there are some territories that are not countries (like `Kingman Reef`), therefore they're irrelevant in our analysis. We'll remove such values before we merge this dataset with the others.

There are also some countries with missing information, that we'll add manually.

In [45]:
borders_df_edited.query("coastline_length_km < 0").head(50)
# borders_df_edited.query("border_length_km < 0 or coastline_length_km < 0").head(50)

Unnamed: 0,country,border_length_km,coastline_length_km
1,Akrotiri,-1,-1
12,Aruba,0,-1
13,Ashmore and Cartier Islands,0,-1
19,Baker Island,0,-1
23,Belgium,1297,-1
31,Bouvet Island,0,-1
49,Christmas Island,0,-1
50,Clipperton Island,0,-1
65,Dhekelia,-1,-1
83,French Southern and Antarctic Lands,0,-1


In [None]:
# The data was extracted with the help of ChatGPT, with our manual validation of selected values
inland_border_manual_insert = {
    "Botswana": 4347,
    "Gibraltar": 1,
    "Italy": 1836,
    "Spain": 1928,
    "Morocco": 2019,
    "Zambia": 5664,
}
coastline_length_manual_insert = {
    "Belgium": 65,
    "Montenegro": 294,
    "Slovenia": 47,
    "Taiwan": 1566
}
borders_df_edited["border_length_km"] = borders_df_edited["border_length_km"].apply(lambda x: inland_border_manual_insert[x] if x in inland_border_manual_insert.keys() else x)
borders_df_edited["coastline_length_km"] = borders_df_edited["coastline_length_km"].apply(lambda x: coastline_length_manual_insert[x] if x in coastline_length_manual_insert.keys() else x)