## Preparing and cleaning files necessary for (country-level) capital stock projection workflow

## Importing necessary modules and functions

In [1]:
%load_ext autoreload
%autoreload 2

In [4]:
import re
from io import StringIO
from pathlib import Path
from tempfile import TemporaryDirectory
from zipfile import ZipFile

import dask_geopandas as dgp
import numpy as np
import pandas as pd

import pyreadr
import rioxarray
import sliiders.settings as sset
import xarray as xr

from PyPDF2 import PdfFileReader
from sliiders.cia_wfb_clean import organize_gather_cia_wfb_2000_2022
from sliiders.country_level_ypk import ppp_conversion_specific_year
from sliiders.io import read_shapefile, save, save_geoparquet
from sliiders.spatial import grid_val_to_ix
from tqdm import tqdm

## Clean GADM

In [5]:
# Load the GADM level-0 (country) layer and declare its CRS as EPSG:4326.
adm0 = read_shapefile(sset.PATH_GADM, layer=0).set_crs(epsg=4326)

# handle autonomous regions which are given separate iso3 code but we will consider part
# of associated sovereign
# (selected as GID_0 values that start with "Z" and whose second character is a digit)
autonomous = adm0[
    adm0.GID_0.str.startswith("Z") & adm0.GID_0.str[1].isin(list("1234567890"))
]
# Look up a country code for each of these records by its COUNTRY name. After the
# rename, `GID_0` holds the looked-up code and `parent` holds the original "Z..." code.
# NOTE(review): sset.PATH_HIST_CCODE_MAPPING is written by the "Get country codes from
# GADM" section further down in this notebook -- confirm the file already exists when
# this cell is run from a fresh state.
autonomous = (
    autonomous.join(
        pd.read_parquet(sset.PATH_HIST_CCODE_MAPPING).ccode, on="COUNTRY", how="left"
    )
    .drop(columns="COUNTRY")
    .rename(columns={"ccode": "GID_0", "GID_0": "parent"})
)
adm0 = adm0.drop(columns="COUNTRY")
# Stack the existing polygons of the looked-up codes with the "Z..." polygons and
# dissolve them so that each code ends up with a single merged geometry.
full = pd.concat(
    (
        adm0.loc[adm0.GID_0.isin(autonomous.GID_0.unique())],
        autonomous.drop(columns="parent"),
    )
).dissolve(by="GID_0")
# Drop both the pre-merge records and the "Z..." records, append the dissolved ones,
# and keep only the geometry column indexed (and sorted) by GID_0.
adm0 = pd.concat(
    (adm0.set_index("GID_0").drop(full.index.union(autonomous.parent)), full)
).geometry.sort_index()
assert adm0.index.is_unique

In [6]:
# Read the GADM level-1 layer, keep the identifier and geometry columns, and declare
# the CRS as EPSG:4326.
adm1_cols = ["GID_0", "GID_1", "geometry"]
adm1 = read_shapefile(sset.PATH_GADM, layer=1)[adm1_cols].set_crs(epsg=4326)

# Manual corrections to the raw records

# discard the record(s) whose country code is the literal string "NA"
adm1 = adm1[adm1.GID_0.ne("NA")]

# the Ukrainian unit whose GID_1 is "?" receives an explicit identifier
is_ukr_unnamed = adm1.GID_0.eq("UKR") & adm1.GID_1.eq("?")
adm1.loc[is_ukr_unnamed, "GID_1"] = "UKR.28_1"

# Ghanaian GID_1 values lack the "." separator: rebuild them as "GHA." followed by
# everything after the first three characters
is_gha = adm1.GID_0.eq("GHA")
adm1.loc[is_gha, "GID_1"] = "GHA." + adm1.loc[is_gha, "GID_1"].str[3:]

# Lookup from each original "Z..." code (`parent`) to the country code it was assigned.
mapper = autonomous.set_index("parent").GID_0.to_dict()
adm1["GID_0"] = adm1.GID_0.replace(mapper)
# Apply the same remapping to the country prefix of GID_1: split on ".", remap the first
# piece, and re-attach the second piece behind a ".".
parent = adm1.GID_1.str.split(".").str
adm1["GID_1"] = pd.Series(parent[0]).replace(mapper).values + ("." + parent[1])
adm1 = adm1.set_index("GID_1")
# Rows that share a GID_1 after the remapping are dissolved into one record each and
# swapped in for the duplicated rows.
dups = adm1[adm1.index.duplicated(keep=False)]
agg = dups.reset_index().dissolve(by="GID_1")
adm1 = pd.concat((adm1.drop(agg.index), agg)).sort_index()
assert adm1.index.is_unique

In [16]:
# HKG and MAC are handled as adm0 units of their own, so their polygons are lifted out
# of the adm1 table and appended to the adm0 geometries under their own codes
hkg_mac_gid1 = ["CHN.HKG", "CHN.MAC"]
hkg_mac_geoms = (
    adm1.loc[hkg_mac_gid1].set_index(pd.Index(["HKG", "MAC"], name="GID_0")).geometry
)
adm0 = pd.concat((adm0, hkg_mac_geoms)).sort_index()

# the two records no longer belong in the adm1 table
adm1 = adm1.drop(hkg_mac_gid1).sort_index()

In [28]:
# Write the cleaned boundaries to their intermediate locations. `adm0` is a
# geometry-only series at this point, so it is wrapped in a one-column frame first.
save_geoparquet(adm0.to_frame("geometry"), sset.PATH_GADM_ADM0_INT)
save_geoparquet(adm1, sset.PATH_GADM_ADM1_INT)


This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

  obj.to_parquet(_path, **kwargs)

This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

  obj.to_parquet(_path, **kwargs)


## Get country codes from GADM

In [29]:
# Build a lookup series from country name (index, "name") to country code ("ccode")
# using the COUNTRY and GID_0 columns of the GADM level-0 layer.
ccode_mapping = (
    read_shapefile(sset.PATH_GADM, layer=0)
    .set_index("COUNTRY")
    .GID_0.rename("ccode")
    .rename_axis("name")
)
# drop numerical china/india/pakistan GID_0's
# (i.e. codes whose second character is "0")
ccode_mapping = ccode_mapping[ccode_mapping.str[1] != "0"]

# add on manually added segments (which account for uninhabited areas not in GADM)
ccode_mapping = pd.concat(
    (ccode_mapping, pd.read_parquet(sset.PATH_SEG_PTS_MANUAL).ccode)
)

# add some manual mappers
# Netherlands Antilles in CIA WFB corresponds to these three (not ABW)
# (work on a copy so the object held in the settings module is not modified)
manual = sset.CCODE_MANUAL.copy()
manual["Netherlands Antilles"] = "BES+CUW+SXM"
ccode_mapping = pd.concat([ccode_mapping, manual]).sort_index()

# Handle no-accent names
# (NFKD-normalize each name, then drop every character that cannot be encoded as ASCII)
alt_index = (
    ccode_mapping.index.str.normalize("NFKD")
    .str.encode("ascii", errors="ignore")
    .astype(str)
)
alt = pd.Series(ccode_mapping.values, index=alt_index, name="ccode")
# Combine the original and accent-free names, removing exact (name, ccode) repeats.
ccode_mapping = (
    pd.concat((ccode_mapping, alt))
    .reset_index()
    .drop_duplicates()
    .set_index("name")
    .ccode.sort_index()
)

# Valid ccodes = every code in the name lookup plus the codes listed in the values of
# sset.PPP_CCODE_IF_MSNG (previously uncaptured mixtures, i.e. France + overseas
# depts), minus the excluded ISOs
ppp_fill_ccodes = [
    ccode for group in sset.PPP_CCODE_IF_MSNG.values() for ccode in group
]
candidate_ccodes = np.unique(np.concatenate((ccode_mapping.unique(), ppp_fill_ccodes)))
valid_ccodes = np.setdiff1d(candidate_ccodes, sset.EXCLUDED_ISOS)

In [12]:
# Persist the list of valid country codes and the name -> ccode lookup built above.
save(pd.DataFrame({"ccode": valid_ccodes}), sset.PATH_ALL_VALID_HIST_CCODES)
save(ccode_mapping.to_frame(), sset.PATH_HIST_CCODE_MAPPING)

## UN WPP: overall populations data

In [123]:
# Descriptive columns of the WPP sheets that are discarded once the country rows have
# been selected
wpp_unused_cols = [
    "Variant",
    "Region, subregion, country or area *",
    "Notes",
    "Location code",
    "ISO2 Alpha-code",
    "Type",
    "SDMX code**",
    "Parent code",
]

# one workbook per sex; the file number in the name is the list position plus 2
un_parts = []
for ix, sex in enumerate(["MALE", "FEMALE"]):
    wpp_path = (
        sset.DIR_UN_WPP_RAW
        / f"WPP2022_POP_F02_{ix+2}_POPULATION_5-YEAR_AGE_GROUPS_{sex}.xlsx"
    )
    wpp_raw = pd.read_excel(
        wpp_path, sheet_name="Estimates", skiprows=16, index_col=0
    )
    wpp_countries = wpp_raw[wpp_raw.Type.eq("Country/Area")].drop(
        columns=wpp_unused_cols
    )
    wpp_countries = (
        wpp_countries.rename(columns={"ISO3 Alpha-code": "ccode", "Year": "year"})
        .astype({"year": int})
        .assign(sex=sex.lower())
    )
    un_parts.append(wpp_countries)

# stack both sexes; Kosovo's code is renamed to match gadm
un_df = pd.concat(un_parts).replace({"ccode": {"XKX": "XKO"}})
un_df = un_df.set_index(["ccode", "year", "sex"])

# raw values are in thousands of people
un_df = un_df * 1000

# every country code present must be one of the valid ccodes
un_ccodes = un_df.index.get_level_values("ccode").unique()
assert un_ccodes.isin(valid_ccodes).all()

In [124]:
# Persist the cleaned UN WPP population table.
save(un_df, sset.PATH_UN_WPP_INT)

In [125]:
# Release the reference to the UN WPP table now that it has been saved.
del un_df

## GEG-15

In [17]:
# Read the GEG-15 points lazily in 8 partitions and expose their coordinates as plain
# lon/lat columns before materializing the table
geg_points = dgp.read_file(sset.PATH_GEG15_RAW, npartitions=8)
geg_lonlat = geg_points.assign(lon=geg_points.geometry.x, lat=geg_points.geometry.y)
geg_table = geg_lonlat.drop(columns="geometry")[["iso3", "tot_val", "lon", "lat"]]
df_geg = geg_table.compute().set_index(["lon", "lat", "iso3"])

# every iso3 code present must be one of the valid ccodes
geg_ccodes = df_geg.index.get_level_values("iso3").unique()
assert np.isin(geg_ccodes, valid_ccodes).all()

# values are reported in millions; convert to ones
df_geg["tot_val"] = df_geg["tot_val"] * 1e6

In [18]:
# Persist the cleaned GEG-15 table.
save(df_geg, sset.PATH_GEG15_INT)

In [19]:
# Release the reference to the GEG-15 table now that it has been saved.
del df_geg

## Landscan

In [4]:
# Open the LandScan raster lazily (masked and scaled, automatically chunked) and drop
# its singleton dimensions
landscan = rioxarray.open_rasterio(
    sset.PATH_LANDSCAN_RAW, mask_and_scale=True, chunks="auto"
).squeeze()

# keep strictly positive cells only, as a one-column "population" table
positive_cells = landscan.where(landscan > 0).to_series().dropna()
df = positive_cells.rename("population").to_frame()

# translate the x/y cell coordinates into grid indices and index the table by them
cell_xy = df.reset_index()[["x", "y"]].values
df[["x_ix", "y_ix"]] = grid_val_to_ix(
    cell_xy, sset.POP_GRID_WIDTH, lon_mask=[True, False]
)
df = df.set_index(["x_ix", "y_ix"])

In [10]:
# Write the gridded population table to sset.PATH_EXPOSURE_POP_INT (passed as a string)
# using the storage options configured in the settings module.
df.to_parquet(
    str(sset.PATH_EXPOSURE_POP_INT), storage_options=sset.STORAGE_OPTIONS
)

In [7]:
# Release the reference to the gridded population table now that it has been written.
del df

## CIA World Factbook organization

Here, the following are carried out:
1. Clean each yearly version into `pandas.DataFrame` format
2. Attach ISO-3166 alpha-3 codes for easier merging
3. Merge the different versions into one dataset; update older data with newer data whenever possible
4. For GDP and GDP per capita, make sure that they are in constant 2017 PPP USD terms, as the raw dataset has varying PPP USD years

### Clean the yearly versions and attach country codes

In [None]:
# Parse the raw CIA World Factbook releases found under sset.DIR_CIA_RAW into three
# tables: population, GDP, and GDP per capita. `ccode_mapping` (name -> ccode) is handed
# to the helper, presumably so it can attach country codes -- see
# sliiders.cia_wfb_clean for the details.
cia_wfb_pop, cia_wfb_gdp, cia_wfb_gdppc = organize_gather_cia_wfb_2000_2022(
    sset.DIR_CIA_RAW, ccode_mapping
)

  0%|          | 0/23 [00:00<?, ?it/s]

In [None]:
# Populations entered by hand (too difficult to parse from the html)

# Akrotiri and Dhekelia
xad_years = pd.Index([2004, 2008, 2020], name="year", dtype="uint64")
xad_df = pd.Series([8500, 15700, 18195], index=xad_years, name="pop")

# British Indian Ocean Territories
iot_years = pd.Index(
    [1960, 1995, 2001, 2004, 2014, 2020], name="year", dtype="uint64"
)
iot_df = pd.Series([1200, 3200, 2500, 4000, 3000, 3000], index=iot_years, name="pop")

# tag each series with its country code, stack them, and mark the rows with the
# placeholder WFB year 9999
manual_pop_frames = [
    xad_df.to_frame().assign(ccode="XAD"),
    iot_df.to_frame().assign(ccode="IOT"),
]
extra_pops = pd.concat(manual_pop_frames).assign(wfb_year=9999)
extra_pops = extra_pops.reset_index().set_index(["ccode", "year"])

# the parsed table must not already contain these two codes
wfb_pop_ccodes = cia_wfb_pop.index.get_level_values("ccode")
assert not wfb_pop_ccodes.isin(["XAD", "IOT"]).any()
cia_wfb_pop = pd.concat([cia_wfb_pop, extra_pops]).sort_index()

### Turning into constant 2017 PPP USD terms for GDP and GDP per capita

In [None]:
# PPP conversion factors to 2017 terms, computed from the PWT raw file
ppp_to_17 = ppp_conversion_specific_year(
    2017,
    sset.PATH_PWT_RAW,
    to=True,
    extrap_sim=True,
    fill_msng_ctries=sset.PPP_CCODE_IF_MSNG,
)

# report the WFB country codes (GDP or GDP per capita) that have no conversion factor
wfb_ccodes = cia_wfb_gdp.index.get_level_values("ccode").union(
    cia_wfb_gdppc.index.get_level_values("ccode")
)
ccodes_without_ppp = wfb_ccodes.difference(ppp_to_17.index.get_level_values("ccode"))
print("Missing from the PPP conversion table:\n", ccodes_without_ppp)

# re-index the table on (ccode, usd_year), where usd_year is the former 'year'
ppp_to_17 = ppp_to_17.reset_index().rename(columns={"year": "usd_year"})
ppp_to_17 = ppp_to_17.set_index(["ccode", "usd_year"])


# extrapolate to 2020 by carrying the 2019 conversion factors forward one year
last_ppp_year, extrap_year = 2019, 2020
ppp_xr = ppp_to_17.to_xarray()
ppp_xr_add = ppp_xr.sel(usd_year=last_ppp_year, drop=True).expand_dims(
    usd_year=[extrap_year]
)

# Flag the carried-forward rows in both fill columns. The label is built from the years
# actually used so that it cannot disagree with the data (only `extrap_year` is added).
# NOTE(review): if the intent was to extend the factors through 2021, a 2021 slice must
# be added here as well -- confirm against the WFB years that need conversion.
fill_val = xr.DataArray(
    f"{last_ppp_year} value held fixed to {extrap_year}"
).broadcast_like(ppp_xr_add)
ppp_xr_add["conv_fill"] = fill_val
ppp_xr_add["pl_gdpo_fill"] = fill_val
ppp_to_17 = xr.concat((ppp_xr, ppp_xr_add), dim="usd_year").to_dataframe()

In [None]:
# USD GDP deflators: start from the PWT table indexed by (countrycode, usd_year)
pwt_by_year = (
    pd.read_excel(sset.PATH_PWT_RAW)
    .rename(columns={"year": "usd_year"})
    .set_index(["countrycode", "usd_year"])
)

# keep the USA price level only, indexed by usd_year
usa_price_level = (
    pwt_by_year.loc[(["USA"], slice(None)), ["pl_gdpo"]]
    .reset_index()
    .drop(["countrycode"], axis=1)
    .set_index(["usd_year"])
)

# deflator = 2017 price level relative to each year's price level
usa_price_level["gdp_defla"] = (
    usa_price_level.loc[2017, "pl_gdpo"] / usa_price_level["pl_gdpo"]
)
defla_to_17 = usa_price_level.drop(["pl_gdpo"], axis=1)

# PWT ends in 2019. A 2020 entry is appended by scaling the 2019 deflator, based on WB
# USA deflation data (https://data.worldbank.org/indicator/NY.GDP.DEFL.KD.ZG?locations=US)
defla_2020 = defla_to_17.loc[2019, "gdp_defla"] * pd.DataFrame(
    {"gdp_defla": [0.988]},
    index=pd.Index([2020], name="usd_year"),
)
defla_to_17 = pd.concat((defla_to_17, defla_2020))

# attach the deflator to every row of the PPP conversion table
ppp_to_17 = ppp_to_17.merge(defla_to_17, left_index=True, right_index=True, how="left")

In [None]:
# Put the GDP-per-capita and GDP tables side by side (columns suffixed _x and _y) to see
# where their USD years disagree; disagreements are resolved by hand from the WFB
# versions (some USD years are assumed from their years)
usd_years_side_by_side = cia_wfb_gdppc.merge(
    cia_wfb_gdp, how="outer", left_index=True, right_index=True
)
usd_years_differ = (
    usd_years_side_by_side.usd_year_y != usd_years_side_by_side.usd_year_x
)
both_usd_years_known = (
    usd_years_side_by_side.usd_year_y.notnull()
    & usd_years_side_by_side.usd_year_x.notnull()
)
check_usd_year = usd_years_side_by_side.loc[usd_years_differ & both_usd_years_known]

mismatched_usd_year = np.unique(check_usd_year.index.get_level_values("ccode"))
print("Manually check the following countries:\n", mismatched_usd_year)

In [None]:
# manual cleansing for USD years
# Each entry is a (ccode, [years]) selector: for those rows the GDP-per-capita table
# takes its USD year from the GDP table.
take_usd_year_from_gdp = [
    ("AND", [2010, 2011, 2013, 2014, 2015]),
    ("ASM", [2014, 2015]),
    ("GGY", [2014]),
    ("GNQ", [2011, 2012]),
    ("GRL", [2013, 2014]),
    ("JEY", [2015]),
    ("MAC", [2006, 2008, 2014, 2016]),
    ("MCO", [2006, 2009, 2011, 2013, 2014]),
    ("MHL", [2008]),
    ("MNP", [2014, 2015, 2016]),
    ("PLW", [2008]),
    ("PSE", [2012, 2013]),
    ("SOM", [2013, 2009, 2008]),
    ("SSD", [2010]),
    ("TUV", [2010]),
    ("VIR", [2011, 2012, 2014, 2015, 2016]),
]

# Each entry is a ([ccodes], year) selector: for those rows the GDP table takes its USD
# year from the GDP-per-capita table.
take_usd_year_from_gdppc = [
    (["FSM", "NRU", "PLW"], 2013),
]

# `i` is used directly as a two-level .loc selector on both tables
for i in take_usd_year_from_gdp:
    cia_wfb_gdppc.loc[i, "usd_year"] = cia_wfb_gdp.loc[i, "usd_year"]

for i in take_usd_year_from_gdppc:
    cia_wfb_gdp.loc[i, "usd_year"] = cia_wfb_gdppc.loc[i, "usd_year"]

In [None]:
def adjust_dollars(df, name):
    """Express a CIA WFB value column in 2017 USD and constant 2017 PPP USD.

    The table is joined to the notebook-level ``ppp_to_17`` table on
    ``(ccode, usd_year)``. Rows without a conversion factor get ``conv = 1`` and are
    flagged ``"neutral_assumption"`` in both fill columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a ``usd_year`` column and a column called ``name``; ``ccode`` must be
        available after ``reset_index()``.
    name : str
        ``"gdp"`` or ``"gdppc"``.

    Returns
    -------
    pandas.DataFrame
        The joined table (index reset) with ``f"{name}_usd_17"`` and either
        ``rgdpna_17`` or ``rgdpna_pc_17`` added.
    """
    # a column-based join is used because ccode-usd_year pairs are not unique in the
    # CIA WFB datasets
    out = df.reset_index().join(ppp_to_17, how="left", on=["ccode", "usd_year"])

    # rows with no PPP conversion factor: flag them, then use a neutral factor of 1
    no_conv = out.conv.isnull()
    out.loc[no_conv, ["conv_fill", "pl_gdpo_fill"]] = "neutral_assumption"
    out.loc[no_conv, "conv"] = 1

    # step 1: USD of the original year -> 2017 USD (the PPP base year is uncertain, so
    # only the deflator is applied here)
    usd_17_col = f"{name}_usd_17"
    out[usd_17_col] = out[[name, "gdp_defla"]].product(axis=1)

    # step 2: assuming PPP year = USD year, convert to constant 2017 PPP USD
    ppp_col = {"gdp": "rgdpna_17", "gdppc": "rgdpna_pc_17"}[name]
    out[ppp_col] = out[["conv", usd_17_col]].product(axis=1)
    return out


# Apply the dollar adjustment to the GDP-per-capita and GDP tables.
ppp_17_gdppc_df = adjust_dollars(cia_wfb_gdppc, "gdppc")
ppp_17_gdp_df = adjust_dollars(cia_wfb_gdp, "gdp")

### Merging population, GDP, and GDP per capita datasets altogether

In [None]:
def _prep_merge(df, name):
    return (
        df.rename(
            columns={
                k: f"{k}_{name}" for k in ["wfb_year", "conv_fill", "pl_gdpo_fill"]
            }
        )
        .rename(columns={"usd_year": f"orig_usd_year_{name}"})
        .drop(columns=["gdp_defla", "conv", name])
        .set_index(["ccode", "year"])
    )


# GDP and GDP-per-capita tables, renamed and indexed by (ccode, year)
gdp_merge_ready = _prep_merge(ppp_17_gdp_df, "gdp")
gdppc_merge_ready = _prep_merge(ppp_17_gdppc_df, "gdppc")

# outer-join both onto the population table so that no (ccode, year) pair is lost
pop_merge_ready = cia_wfb_pop.rename(columns={"wfb_year": "wfb_year_pop"})
all_merged = pop_merge_ready.join(
    [gdp_merge_ready, gdppc_merge_ready], how="outer"
).sort_index()

### Save

In [None]:
# Column order for cleaner viewing: values first, then WFB edition years, original USD
# years, and finally the fill flags
ordering = [
    "pop",
    "gdp_usd_17",
    "rgdpna_17",
    "gdppc_usd_17",
    "rgdpna_pc_17",
    "wfb_year_pop",
    "wfb_year_gdp",
    "wfb_year_gdppc",
    "orig_usd_year_gdp",
    "orig_usd_year_gdppc",
    "conv_fill_gdp",
    "conv_fill_gdppc",
    "pl_gdpo_fill_gdp",
    "pl_gdpo_fill_gdppc",
]
all_merged = all_merged[ordering]

# no PPP factors exist beyond the last usd_year of the conversion table, so later
# years are cut off
max_year = ppp_to_17.index.get_level_values("usd_year").max()
all_merged = all_merged.loc[pd.IndexSlice[:, :max_year], :]

In [None]:
# Each (ccode, year) pair must appear at most once in the merged table.
assert all_merged.index.is_unique

In [None]:
# Persist the merged CIA World Factbook table.
save(all_merged, sset.PATH_CIA_INT)

## Credit Suisse Global Wealth Databook (GWDB)

In [None]:
# Region labels used in the GWDB tables. They are matched as a regex alternation when
# parsing the country-to-region table and are also excluded (together with "World")
# from the check that every GWDB country received a ccode. Note that "China" and
# "India" appear both as regions and as countries.
GWDB_REG = [
    "Africa",
    "Asia-Pacific",
    "China",
    "Europe",
    "India",
    "Latin America",
    "North America",
]

In [None]:
def parse_region_mapping(page_text):
    """Extract (country, GWDB region) pairs from the text of a "Country details" page.

    Parameters
    ----------
    page_text : str
        Text extracted from one PDF page.

    Returns
    -------
    pandas.DataFrame
        Columns ``country`` and ``gwdb_region``, one row per table line.

    Raises
    ------
    AssertionError
        If the page does not have exactly one header line (a line containing both
        "USD bn" and "%"), or if a table line does not mention exactly one distinct
        region from ``GWDB_REG``.
    """
    # manual cleanup: re-join a country name that is wrapped across two lines
    cleaned = page_text.replace(
        "St. Vincent and the \nGrenadines", "St. Vincent and the Grenadines"
    )

    # keep non-blank lines that are not a "Source: " footer
    rows = [
        row
        for row in cleaned.split("\n")
        if row.strip() and not row.startswith("Source: ")
    ]

    # the table body starts right after the single header line
    header_rows = [pos for pos, row in enumerate(rows) if "USD bn" in row and "%" in row]
    assert len(header_rows) == 1
    first_data_row = header_rows[0] + 1

    region_pattern = "|".join(GWDB_REG)
    records = []
    for line in rows[first_data_row:]:
        if line.startswith("Sources:"):
            continue
        c = line.replace(" -", "-")
        found = re.findall(region_pattern, c)
        assert len(set(found)) == 1, line
        region = found[0]
        # the country is whatever precedes the last occurrence of the region label
        country = c[: c.rfind(region) - 1].strip()
        records.append((country, region))
    return pd.DataFrame(records, columns=["country", "gwdb_region"])


def parse_page(page_text):
    """Parse one "Wealth estimates by country" page into a tidy table.

    Parameters
    ----------
    page_text : str
        Text extracted from one PDF page. It must contain exactly one "(end...-YYYY)"
        marker and exactly one header line containing "thousand", "USD bn" and "%".

    Returns
    -------
    pandas.DataFrame
        One row per table line with the country name, nine numeric columns, the
        estimation method (lower-cased), and the ``year`` read from the marker.
        ``n_adults`` is converted from thousands to persons and ``total_wealth`` from
        USD billions to USD.

    Raises
    ------
    AssertionError
        If the year marker or the header line is missing or not unique.
    """
    # Raw strings keep the regex escapes (\( and \d) out of Python's own string-escape
    # processing, which warns about such invalid escape sequences.
    year_info = re.findall(r"\(end.*-.*\)", page_text)
    assert len(year_info) == 1
    # the four characters before the closing parenthesis are the year
    year = int(year_info[0][-5:-1])

    # manual cleanup: re-join a country name that is hyphenated across two lines
    page_text = page_text.replace(
        "St. Vincent and the Grena-\ndines", "St. Vincent and the Grenadines"
    )
    page_text_info = page_text.split("\n")
    page_text_info = [
        i for i in page_text_info if len(i.strip()) and (not i.startswith("Source: "))
    ]

    # the table body starts right after the single header line
    st = [
        ix
        for ix, i in enumerate(page_text_info)
        if "thousand" in i and "USD bn" in i and "%" in i
    ]
    assert len(st) == 1
    st = st[0] + 1

    # Rebuild every line as "<country> <numbers> <method>" where the country and the
    # method contain no whitespace, so the lines can be split on whitespace below.
    out = []
    for line in page_text_info[st:]:
        # everything before the first digit is the country name
        start = re.search(r"[0-9]", line).start()
        c = (
            line[:start]
            .rstrip()
            .replace(" -", "-")
            .replace("- ", "-")
            .replace(" ", "_")
        )
        # everything after the last digit is the estimation method
        end = re.search(r"(\d)[^\d]*$", line).start()
        s = line[end + 1 :].strip().replace(" ", "_")
        # thousands separators are removed from the numeric part
        out.append(c + " " + line[start : end + 1].replace(",", "") + " " + s)

    # `sep=r"\s+"` is the documented equivalent of the deprecated
    # `delim_whitespace=True`.
    data = pd.read_csv(
        StringIO("\n".join(out)),
        sep=r"\s+",
        names=[
            "country",
            "n_adults",
            "pct_adults",
            "total_wealth",
            "pct_wealth",
            "wealth_per_adult",
            "financial_wealth_per_adult",
            "nonfinancial_wealth_per_adult",
            "debt_per_adult",
            "median_wealth_per_adult",
            "estimation_method",
        ],
    )
    # undo the whitespace protection applied above
    data["country"] = data.country.str.replace("_", " ")
    data["estimation_method"] = data.estimation_method.str.replace("_", " ").str.lower()

    # pop is in thousands
    data["n_adults"] *= 1000

    # wealth is in billions
    data["total_wealth"] *= 1e9
    data["year"] = year
    return data

In [None]:
# reading in the file
gwdb_df = []
region_mapping = []
# The PDF is opened in a `with` block so the file handle is closed once every page has
# been read; the page text must be extracted while the stream is still open.
with sset.PATH_GWDB_RAW.open("rb") as gwdb_file:
    GWDB = PdfFileReader(gwdb_file)
    for lx in tqdm(range(len(GWDB.pages))):
        text = GWDB.getPage(lx).extractText()
        # pages mentioning "Contents" are skipped (they also contain the table titles)
        if "Contents" in text:
            continue
        # titles are matched with all spaces removed because the extracted text has
        # inconsistent spacing
        compact_text = text.replace(" ", "")
        if "Table2-2:Wealthestimatesbycountry" in compact_text:
            gwdb_df.append(parse_page(text))
        elif "Table2-1:Countrydetails" in compact_text:
            region_mapping.append(parse_region_mapping(text))

# country -> GWDB region lookup
region_mapping = pd.concat(region_mapping).set_index("country").gwdb_region

# India and China appear twice per year because they are both regions and countries;
# only the first occurrence of each (country, year) pair is kept
gwdb_df = pd.concat(gwdb_df).set_index(["country", "year"])
is_repeated = gwdb_df.index.duplicated(keep="first")
gwdb_df = gwdb_df[~is_repeated]

gwdb_df = gwdb_df.join(region_mapping, on="country", how="left")

# rows left without a region must be the regional aggregates themselves or "World"
has_no_region = gwdb_df.gwdb_region.isnull()
missing_reg = gwdb_df[has_no_region].index.get_level_values("country").unique()
aggregate_names = np.concatenate((region_mapping.unique(), ["World"]))
assert np.isin(missing_reg, aggregate_names).all()

# attach country codes by name; only the aggregates may remain without one
gwdb_df = gwdb_df.join(ccode_mapping.rename_axis("country"), how="left").reset_index()
lacks_ccode = gwdb_df.ccode.isnull() & ~gwdb_df.country.isin(GWDB_REG + ["World"])
assert not len(gwdb_df.loc[lacks_ccode, "country"].unique())

# aggregates use their own name in place of a ccode
gwdb_df["ccode"] = gwdb_df.ccode.fillna(gwdb_df.country)
gwdb_df = gwdb_df.set_index(["ccode", "year"]).sort_index()

In [None]:
# Persist the cleaned GWDB table.
save(gwdb_df, sset.PATH_GWDB_INT)

## Fariss et al. (2022, Journal of Conflict Resolution) GDP, GDPpc, and population dataset

We attach three-letter ISO 3166-1 alpha-3 codes (as listed in `sset.ALL_ISOS`) in place of the Gleditsch and Ward (GW) country codes used in the raw data.

In [38]:
# Gleditsch and Ward code to ISO codes for those that do not match with one another.
# Each entry is [GW code, ISO code]; every GW code appears exactly once so that a merge
# on the GW code cannot duplicate rows.
# NOTE(review): GW codes without an entry here are used unchanged as the country code
# when the ISO codes are attached. Please confirm that none of the unlisted GW
# abbreviations (e.g. "SWZ", "SWA", "BOT", "MAC") differs from, or collides with,
# an ISO alpha-3 code.
GLEDITSCH_WARD_TO_ISO = [
    ["BHM", "BHS"],
    ["AAB", "ATG"],
    ["ALG", "DZA"],
    ["ANG", "AGO"],
    ["AUS", "AUT"],
    ["AUL", "AUS"],
    ["MNG", "MNE"],
    ["SLV", "SVN"],
    ["BAH", "BHR"],
    ["CAM", "KHM"],
    ["BUL", "BGR"],
    ["BUI", "BDI"],
    ["BAR", "BRB"],
    ["BFO", "BFA"],
    ["BHU", "BTN"],
    ["BNG", "BGD"],
    ["BOS", "BIH"],
    ["BRU", "BRN"],
    ["CAO", "CMR"],
    ["CAP", "CPV"],
    ["CDI", "CIV"],
    ["CEN", "CAF"],
    ["CHA", "TCD"],
    ["CON", "COG"],
    ["COS", "CRI"],
    ["CRO", "HRV"],
    ["CZR", "CZE"],
    ["DEN", "DNK"],
    ["DRC", "COD"],
    ["DRV", "VNM"],
    ["EQG", "GNQ"],
    ["ETM", "TLS"],
    ["FRN", "FRA"],
    ["GAM", "GMB"],
    ["GFR", "DEU"],
    ["GRG", "GEO"],
    ["GRN", "GRD"],
    ["GUA", "GTM"],
    ["GUI", "GIN"],
    ["HAI", "HTI"],
    ["HON", "HND"],
    ["ICE", "ISL"],
    ["INS", "IDN"],
    ["IRE", "IRL"],
    ["KBI", "KIR"],
    ["KOS", "XKO"],
    ["KUW", "KWT"],
    ["KYR", "KGZ"],
    ["KZK", "KAZ"],
    ["LAT", "LVA"],
    ["LEB", "LBN"],
    ["LES", "LSO"],
    ["LIB", "LBY"],
    ["LIT", "LTU"],
    ["MAA", "MRT"],
    ["MAD", "MDV"],
    ["MAG", "MDG"],
    ["MAL", "MYS"],
    ["MAS", "MUS"],
    ["MAW", "MWI"],
    ["MLD", "MDA"],
    ["MNC", "MCO"],
    ["MON", "MNG"],
    ["MOR", "MAR"],
    ["MSI", "MHL"],
    ["MYA", "MMR"],
    ["MZM", "MOZ"],
    ["NAU", "NRU"],
    ["NEP", "NPL"],
    ["NEW", "NZL"],
    ["NIG", "NGA"],
    ["NIR", "NER"],
    ["NTH", "NLD"],
    ["OMA", "OMN"],
    ["PAL", "PLW"],
    ["PAR", "PRY"],
    ["PHI", "PHL"],
    ["POR", "PRT"],
    ["ROK", "KOR"],
    ["RUM", "ROU"],
    ["SAF", "ZAF"],
    ["SAL", "SLV"],
    ["SER", "SRB"],
    ["SEY", "SYC"],
    ["SIE", "SLE"],
    ["SIN", "SGP"],
    ["SKN", "KNA"],
    ["SLO", "SVK"],
    ["SLU", "LCA"],
    ["SNM", "SMR"],
    ["SOL", "SLB"],
    ["SPN", "ESP"],
    ["SRI", "LKA"],
    ["SUD", "SDN"],
    ["SVG", "VCT"],
    ["SWD", "SWE"],
    ["TAJ", "TJK"],
    ["TAW", "TWN"],
    ["TAZ", "TZA"],
    ["THI", "THA"],
    ["TOG", "TGO"],
    ["TRI", "TTO"],
    ["UAE", "ARE"],
    ["UKG", "GBR"],
    ["URU", "URY"],
    ["VAN", "VUT"],
    ["ZAM", "ZMB"],
    ["ZIM", "ZWE"],
]

In [39]:
# GW abbreviation -> ISO code lookup as a two-column table
gw_to_iso = pd.DataFrame(GLEDITSCH_WARD_TO_ISO, columns=["gwc", "ccode"])

# GW state table: keep the number, name, abbreviation and microstate flag, attach the
# ISO codes, and remove any repeated rows
gwstates_raw = pyreadr.read_r(sset.DIR_YPK_RAW / "gwstates.rda")["gwstates"]
gwstates = (
    gwstates_raw.rename(columns={"gwcode": "gwno", "country_name": "country"})[
        ["gwno", "country", "gwc", "microstate"]
    ]
    .merge(gw_to_iso, on=["gwc"], how="left")
    .drop_duplicates()
)

# where no ISO code was listed, the GW abbreviation itself is used as the code
lacks_iso = pd.isnull(gwstates.ccode)
gwstates.loc[lacks_iso, "ccode"] = gwstates.loc[lacks_iso, "gwc"].values

# unzipping the raw file, and attaching the necessary ISO codes
PATH_FARISS = sset.DIR_YPK_RAW / "Fariss_JCR_2022.zip"
# Use the archive path configured in the settings module when it defines PATH_FARISS;
# otherwise fall back to the location defined just above, so that this cell does not
# fail with an AttributeError.
fariss_zip_path = getattr(sset, "PATH_FARISS", PATH_FARISS)
with TemporaryDirectory() as tl:
    temp_loc = Path(tl)
    with ZipFile(fariss_zip_path, mode="r") as z:
        z.extractall(temp_loc)

    # sorted, so GDP, GDPpc, and population
    # (the extracted files are paired with the output names purely by sorted position)
    filenames = ["fariss_gdp.parquet", "fariss_gdppc.parquet", "fariss_pop.parquet"]
    templst = np.sort(list(temp_loc.glob("*")))
    for j, extracted_path in enumerate(templst):
        fname = filenames[j]
        # attach the ISO code of each GW state number
        fariss = pyreadr.read_r(extracted_path)[None].merge(
            gwstates[["gwno", "ccode"]], on="gwno", how="left"
        )
        # making sure the country codes line up with PWT
        fariss.loc[fariss.ccode == "FRA", "ccode"] = sset.FRA_OVERSEAS_DEPT
        fariss.loc[(fariss.ccode == "CYP") & (fariss.year <= 1973), "ccode"] = "CYP+ZNC"
        fariss.loc[(fariss.ccode == "SRB") & (fariss.year <= 1999), "ccode"] = "SRB+XKO"
        # rows whose GW number found no code are dropped
        fariss = (
            fariss.loc[~pd.isnull(fariss.ccode), :]
            .set_index(["indicator", "ccode", "year"])
            .sort_index()
        )
        save(fariss, sset.DIR_FARISS_INT / fname)

## UN Data

In [45]:
def _load_un_ama(filename, value_col, new_value_col):
    """Read one UN AMA csv, drop its Unit column and incomplete rows, rename columns."""
    return (
        pd.read_csv(sset.DIR_UN_AMA_RAW / filename, na_values="...")
        .drop(columns="Unit")
        .dropna(how="any")
        .rename(
            columns={
                "Country/Area": "name",
                "Year": "year",
                value_col: new_value_col,
            }
        )
    )


# nominal GDP per capita and nominal GDP
un_ypc = _load_un_ama(
    "un_snaama_nom_gdppc.csv",
    "GDP, Per Capita GDP - US Dollars",
    "gdppc_nom_current",
)
un_y = _load_un_ama(
    "un_snaama_nom_gdp.csv",
    "GDP, at current prices - US Dollars",
    "gdp_nom_current",
)

# append the last column of the per-capita table to the GDP table
# NOTE(review): this join matches rows by their row labels, so it relies on both csv
# files listing the same country-years in the same order -- confirm.
un_y = un_y.join(un_ypc.iloc[:, -1])

# adjust for tanzania being split between mainland and zanzibar: combine both parts
# into a single record per year
is_tanzania = un_y.name.str.contains("Tanzania")
tanzania_parts = un_y[is_tanzania]
tanzania = tanzania_parts.groupby(["year"], as_index=False).sum()
tanzania["name"] = "United Republic of Tanzania"

# GDP adds up across the two parts, but GDP per capita does not. The combined value is
# total GDP divided by total population, where each part's population is implied by its
# own GDP / GDP per capita. `min_count=1` leaves the result missing, rather than
# dividing by zero, when no part has a usable population for that year.
implied_pop = (
    (tanzania_parts.gdp_nom_current / tanzania_parts.gdppc_nom_current)
    .groupby(tanzania_parts.year)
    .sum(min_count=1)
)
tanzania["gdppc_nom_current"] = tanzania.gdp_nom_current / tanzania.year.map(
    implied_pop
)
un_y = pd.concat((un_y[~is_tanzania], tanzania))

# Country -> (sub)region table: read only the four relevant columns and give them
# short names
un_region_cols = {
    "Country or Area": "name",
    "ISO-alpha3 Code": "ccode",
    "Sub-region Name": "subregion",
    "Region Name": "region",
}
un_regions = pd.read_csv(
    sset.PATH_UN_REGION_DATA_RAW, sep=";", usecols=list(un_region_cols)
).rename(columns=un_region_cols)

# a few old countries / extra territories entered by hand so that they match the gdp
# dataset: (name, subregion, region, ccode)
manual = [
    (
        "Former Netherlands Antilles",
        "Latin America and the Caribbean",
        "Americas",
        "BES+CUW+SXM",
    ),
    ("Former Sudan", "Sub-Saharan Africa", "Africa", "SDN+SSD"),
    ("Kosovo", "Eastern Europe", "Europe", "XKO"),
    ("Cyprus/Northern Cyprus", "Western Asia", "Asia", "CYP+ZNC"),
    ("Akrotiri and Dhekelia", "Western Asia", "Asia", "XAD"),
    ("Paracel Islands", "Eastern Asia", "Asia", "XPI"),
]
manual_regions = pd.DataFrame(
    manual, columns=["name", "subregion", "region", "ccode"]
)
un_regions = pd.concat((un_regions, manual_regions))

# Replace names in the region table with the spellings listed on the right-hand side
mapper_dict = {
    "China, Hong Kong Special Administrative Region": "China, Hong Kong SAR",
    "Côte d’Ivoire": "Côte d'Ivoire",
    "Iran (Islamic Republic of)": "Iran, Islamic Republic of",
    "Eswatini": "Kingdom of Eswatini",
    "North Macedonia": "Republic of North Macedonia",
    "Türkiye": "Turkey",
    "United States of America": "United States",
    "China": "China (mainland)",
}
# one subregion label differs only in capitalization
mapper_subregion = {"South-eastern Asia": "South-Eastern Asia"}

un_regions = un_regions.assign(
    name=un_regions["name"].replace(mapper_dict),
    subregion=un_regions["subregion"].replace(mapper_subregion),
).set_index("name")

# attach ccode / subregion / region to every GDP row by country name
un_merged = un_y.join(un_regions, on="name", how="left")

# names from the historical period that found no subregion must all be regional
# aggregates (or one of the names explicitly ignored below)
lacks_subregion = un_merged.subregion.isnull()
in_historical_period = un_merged.year >= sset.HISTORICAL_YEARS[0]
mismatched = un_merged[lacks_subregion & in_historical_period]["name"].unique()
ignore = [
    "Caribbean",
    "Central America",
    "China (mainland)",
    "Eastern Africa",
    "Middle Africa",
    "South America",
    "South-Eastern Asia",
    "Southern Africa",
    "Western Africa",
    "World",
]
region_names = np.concatenate(
    (un_regions.region.unique(), un_regions.subregion.unique(), ignore)
)
assert np.isin(mismatched, region_names).all()

# every ccode that was attached must be one of the valid ccodes
ccodes = un_merged.ccode.dropna().unique()
assert np.isin(ccodes, valid_ccodes).all()

# final layout: indexed and sorted by (name, year)
un_merged = un_merged.set_index(["name", "year"]).sort_index()

In [46]:
# Persist the harmonized region lookup and the merged UN GDP table.
save(un_regions, sset.PATH_UN_REGION_DATA_INT)
save(un_merged, sset.PATH_UN_AMA_INT)