## Preparing and cleaning files necessary for (country-level) capital stock projection workflow

## Importing necessary modules and functions

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import json
import shutil
from operator import itemgetter

import dask.dataframe as ddf
import dask.delayed as delayed
import fiona
import numpy as np
import pandas as pd
import pycountry as pyctry
from dask_gateway import Gateway
from py7zr import unpack_7zarchive
from tqdm.auto import tqdm

from sliiders.country_level_ypk import ppp_conversion_specific_year
from sliiders import settings as sset
from sliiders import spatial
from sliiders.cia_wfb_clean import (
    organize_gather_cia_wfb_2000_2020,
    wfb_merge_year_by_year,
)

# Dask Gateway configuration: the worker image name comes from the project settings,
# and the gateway handle is what the GEG-15 section uses to start its cluster
image_name = sset.DASK_IMAGE
gateway = Gateway()

In [None]:
# make sure the output directories written to by this notebook exist
for out_dir in (sset.DIR_YPK_INT, sset.PATH_CIA_INT.parent):
    out_dir.mkdir(parents=True, exist_ok=True)

## Maddison Project: scale change

In [None]:
# the raw Maddison Project file is an Excel spreadsheet; read it as-is
madd = pd.read_excel(sset.PATH_MPD_RAW)

# unit changes:
# - population: thousands of people -> millions of people
# - GDP: (USD per person) x (millions of people) = millions of USD
# (`gdp` is computed from the already-rescaled `pop`, as kwargs are applied in order)
madd = madd.assign(
    pop=lambda d: d["pop"] / 1000,
    gdp=lambda d: d["gdppc"] * d["pop"],
)

# store the units alongside the data, then index by (ccode, year) and export
madd = madd.rename(columns={"countrycode": "ccode"}).assign(
    gdppc_unit="ones of USD (constant 2011 PPP USD)",
    gdp_unit="millions of USD (constant 2011 PPP USD)",
    pop_unit="millions of people",
)
madd = madd.set_index(["ccode", "year"])
madd.to_parquet(sset.DIR_YPK_INT / "maddison_project.parquet")

## UN WPP: overall populations data

### Assign country (ISO) codes: initial try with obvious cases

In [None]:
# load the UN WPP total-population table
un_df = pd.read_csv(sset.DIR_UN_WPP_RAW / "UN_WPP2019_TotalPopulation.csv")

# list the locations that bundle a sovereign state with its dependencies; matching
# on "ependenc" ignores the capitalization of the leading "d"
for location in set(un_df.Location):
    if "ependenc" not in location:
        continue
    print(location)

In [None]:
def _lookup_alpha3(location):
    """Return the alpha-3 code pycountry lists for ``location``, or ``None``.

    The lookup by ``name`` takes precedence; the lookup by ``official_name`` is
    only used when the former finds nothing.
    """
    by_name = pyctry.countries.get(name=location)
    by_official_name = pyctry.countries.get(official_name=location)
    match = by_official_name if by_name is None else by_name
    return None if match is None else match.alpha_3


# first pass: map every UN location name to an ISO code wherever pycountry knows it
countryname_to_iso = {loc: _lookup_alpha3(loc) for loc in set(un_df.Location)}

# locations left without a code require mandatory manual clean-up; the list is too
# long to print by default, but can be inspected with print(no_isos)
no_isos = [loc for loc, iso in countryname_to_iso.items() if iso is None]

In [None]:
# after examining the no_isos list, I conduct the following (manual) clean-up:
# - individual locations are assigned their three-letter country codes by hand
# - "(and dependencies)" aggregates get the sovereign's code with a "+D" suffix
# - "Channel Islands" gets the combined code "GGY+JEY"
to_update = {
    "Micronesia (Fed. States of)": "FSM",
    "State of Palestine": "PSE",
    "China (and dependencies)": "CHN+D",
    "China, Macao SAR": "MAC",
    "China, Hong Kong SAR": "HKG",
    "Bolivia (Plurinational State of)": "BOL",
    "Saint Helena": "SHN",
    "Holy See": "VAT",
    "Venezuela (Bolivarian Republic of)": "VEN",
    "Iran (Islamic Republic of)": "IRN",
    "United Kingdom (and dependencies)": "GBR+D",
    "New Zealand (and dependencies)": "NZL+D",
    "Dem. People's Republic of Korea": "PRK",
    "China, Taiwan Province of China": "TWN",
    "Democratic Republic of the Congo": "COD",
    "Republic of Korea": "KOR",
    "United States Virgin Islands": "VIR",
    "Denmark (and dependencies)": "DNK+D",
    "France (and dependencies)": "FRA+D",
    "United States of America (and dependencies)": "USA+D",
    "Wallis and Futuna Islands": "WLF",
    "Channel Islands": "GGY+JEY",
    "Netherlands (and dependencies)": "NLD+D",
}

# updating the ISO codes; entries already present in the mapping are overwritten
countryname_to_iso.update(to_update)

### Detecting cases spanning multiple regions

We do not want to keep aggregate entries such as "Europe", which span multiple countries, territories, or sovereignties. We therefore assign the code `WIDE` to these multi-region entries.

In [None]:
# every location still lacking a code at this point (again too many to print) is a
# multi-region aggregate, so all of them are labelled "WIDE"
no_isos_2 = [loc for loc, iso in countryname_to_iso.items() if iso is None]
countryname_to_iso.update(dict.fromkeys(no_isos_2, "WIDE"))

# attach the (ISO) country codes to the population table
un_df["ccode"] = un_df["Location"].map(countryname_to_iso)

### Exporting

In [None]:
# index by (ccode, year) and write the cleaned total-population table
un_df = un_df.rename(columns={"Time": "year"}).set_index(["ccode", "year"])
un_df.to_parquet(sset.DIR_YPK_INT / "un_population.parquet")

## UN WPP: population-by-age-group

In [None]:
## load the UN WPP population-by-age table
by_age = pd.read_csv(sset.DIR_UN_WPP_RAW / "UN_WPP2019_Population_by_Age.csv")

## reuse the location -> ccode pairs already attached to the total-population table
un_df_dic = dict(zip(un_df["Location"], un_df.index.get_level_values("ccode")))
by_age["ccode"] = by_age["Location"].map(un_df_dic)

## sanity check: number of rows whose location did not receive a country code
n_missing_ccode = int(by_age["ccode"].isnull().sum())
print("The missing-ccode rows are:", n_missing_ccode)

## index by country code and export
by_age = by_age.set_index(["ccode"])
by_age.to_parquet(sset.DIR_YPK_INT / "un_population_by_age.parquet")

## GEG-15

In [None]:
# cluster setup: start a new cluster through the gateway with the project's worker
# image and the "micro" profile, connect a client, then scale to N_CLUSTER workers
N_CLUSTER = 20
cluster = gateway.new_cluster(worker_image=image_name, profile="micro")
client = cluster.get_client()
cluster.scale(N_CLUSTER)
# leaving `cluster` as the last expression of the cell displays it in the notebook
cluster

In [None]:
@delayed
def clean_chunk(start, num, shp_path):
    """Read one slice of features from a shapefile into a flat table.

    Parameters
    ----------
    start : int
        Position of the first feature to read.
    num : int
        Number of features requested; the slice ``start:start + num`` is taken
        from the opened collection.
    shp_path : str
        Path of the shapefile, opened read-only with ``fiona``.

    Returns
    -------
    pandas.DataFrame
        ``lon`` and ``lat`` columns built from each feature's geometry
        coordinates, followed by the columns of the feature properties.

    Notes
    -----
    Assumes every geometry carries exactly two coordinates (i.e. 2-D points) --
    TODO confirm against the shapefile being read.
    """
    with fiona.open(shp_path, "r") as src:
        features = src[start : start + num]

    # attribute table first, then the coordinate pairs, both in feature order
    attrs = pd.DataFrame([feat["properties"] for feat in features])
    lonlat = pd.DataFrame(
        [feat["geometry"]["coordinates"] for feat in features],
        columns=["lon", "lat"],
    )

    # both frames share the same positional index, so this pairs rows one-to-one
    return lonlat.merge(attrs, left_index=True, right_index=True)

In [None]:
# location of the raw shapefile
DIR_GAR = sset.DIR_GEG15_RAW / "gar-exp"
gar_shp_path = DIR_GAR / "gar_exp.shp"

# number of features in the file, needed to cut it into fixed-size chunks
with fiona.open(gar_shp_path) as shp:
    num_geoms = len(shp)

# one delayed `clean_chunk` task per run of 1000 features
chunk_size = 1000
data_chunked = [
    clean_chunk(first, chunk_size, str(gar_shp_path))
    for first in range(0, num_geoms, chunk_size)
]

In [None]:
# assemble the delayed per-chunk tables into a single dask dataframe
df = ddf.from_delayed(data_chunked)

In [None]:
# regroup the per-chunk pieces into 16 partitions, then persist the result
df = df.repartition(npartitions=16)
df = df.persist()

In [None]:
# write the table to the intermediate GEG-15 directory, creating it if needed
geg15_int_dir = sset.DIR_GEG15_INT
geg15_int_dir.mkdir(exist_ok=True, parents=True)
df.to_parquet(geg15_int_dir / "gar_exp.parquet")

In [None]:
# release the dask resources: scale down to zero workers, close the client, then
# close and shut down the cluster
cluster.scale(0)
client.close()
cluster.close()
# NOTE(review): shutdown() right after close() looks redundant -- confirm against
# the dask-gateway GatewayCluster documentation before removing either call
cluster.shutdown()

## Unzip and process Landscan

In [None]:
# the zip archive and the raw directory are both named after the Landscan version
landscan_version = sset.LANDSCAN_VERS
landscan_raw_root = sset.DIR_LANDSCAN_RAW

spatial.process_landscan(
    landscan_zip=landscan_raw_root / f"{landscan_version}.zip",
    dir_landscan_raw=landscan_raw_root / landscan_version,
    dir_landscan_int=sset.DIR_LANDSCAN_INT,
    landscan_year=sset.LANDSCAN_YEAR,
)

## CIA World Factbook organization

Here, the following are carried out:
1. Clean each yearly version into `pandas.DataFrame` format
2. Attach ISO-3166 alpha-3 codes for easier merging
3. Merge the different versions into one dataset; update older data with newer data whenever possible
4. For GDP and GDP per capita, make sure that they are in constant 2017 PPP USD terms, as the raw dataset has varying PPP USD years

### Cleaning the yearly versions and attaching country codes

In [None]:
# yearly versions, 2000 to 2020: population, GDP, and GDP per capita collections
# (presumably one table per World Factbook version -- verify in
# sliiders.cia_wfb_clean; the three are walked in parallel by position later on)
cia_wfb_pop, cia_wfb_gdp, cia_wfb_gdppc = organize_gather_cia_wfb_2000_2020()

### Merge the yearly datasets, updating the previous version with a newer version

In [None]:
# merging year-by-year, separately for GDP, GDPpc, and population: start from the
# first version and let every later version update what has been accumulated so far
for i, gdp_df in enumerate(cia_wfb_gdp):
    pop_df = cia_wfb_pop[i]
    gdppc_df = cia_wfb_gdppc[i]
    if i == 0:
        updated_gdp_df = gdp_df.copy()
        updated_gdppc_df = gdppc_df.copy()
        updated_pop_df = pop_df.copy()
    else:
        updated_gdp_df = wfb_merge_year_by_year(updated_gdp_df, gdp_df, "gdp")
        updated_gdppc_df = wfb_merge_year_by_year(updated_gdppc_df, gdppc_df, "gdppc")
        updated_pop_df = wfb_merge_year_by_year(updated_pop_df, pop_df, "pop")

# we will only use positively-valued GDP and GDPpc datasets; both tables receive the
# same filter and index reset, since the following cells read `ccode` and `year` as
# ordinary columns in each of them
updated_gdp_df = updated_gdp_df.loc[updated_gdp_df.gdp > 0, :].reset_index()
updated_gdppc_df = updated_gdppc_df.loc[updated_gdppc_df.gdppc > 0, :].reset_index()

### Turning into constant 2017 PPP USD terms for GDP and GDP per capita

In [None]:
def _copy_ppp_rates(ppp, source_ccode, target_ccode):
    """Return the ``source_ccode`` rows of ``ppp`` relabelled as ``target_ccode``.

    Both fill-flag columns are set to ``copy_from_<source_ccode>`` so that the
    borrowed rates remain identifiable.
    """
    copied = ppp.loc[(source_ccode, slice(None)), :].reset_index()
    copied["ccode"] = target_ccode
    copied = copied.set_index(["ccode", "year"])
    flag = f"copy_from_{source_ccode}"
    copied["conv_fill"] = flag
    copied["pl_gdpo_fill"] = flag
    return copied


# PPP conversion factors for the year 2017
ppp_to_17 = ppp_conversion_specific_year(2017, to=True, extrap_sim=True)

# Netherlands Antilles (BES+CUW+SXM, as defined in CIA WFB) borrows the Curacao
# (CUW) PPP conversion rates; CUW has the largest economy, based on GDP
ppp_neth_antil_17 = _copy_ppp_rates(ppp_to_17, "CUW", "BES+CUW+SXM")

# Serbia and Montenegro (SRB+MNE) borrows the Montenegro (MNE) rates; the choice is
# arbitrary, but for the years 1995-2019 there is not much difference in the
# conversion rates
ppp_srbmnt_17 = _copy_ppp_rates(ppp_to_17, "MNE", "SRB+MNE")

# append the borrowed rows to the main table
ppp_to_17 = pd.concat([ppp_to_17, ppp_neth_antil_17, ppp_srbmnt_17], axis=0)
ppp_to_17 = ppp_to_17.sort_index()

# report the CIA WFB country codes that have no entry in `ppp_to_17`
wfb_ccodes = np.union1d(
    updated_gdp_df["ccode"].unique(), updated_gdppc_df["ccode"].unique()
)
ppp_ccodes = ppp_to_17.index.get_level_values("ccode").unique()
print()
print(
    "Missing from the PPP conversion table:\n",
    np.setdiff1d(wfb_ccodes, ppp_ccodes),
)

# the 'year' index level is renamed to 'usd_year'
ppp_to_17 = ppp_to_17.reset_index().rename(columns={"year": "usd_year"})
ppp_to_17 = ppp_to_17.set_index(["ccode", "usd_year"])

In [None]:
# USD GDP deflators: the USA `pl_gdpo` series from the PWT file, indexed by USD year
pwt = pd.read_excel(sset.PATH_PWT_RAW)
usa_pl = (
    pwt.loc[pwt["countrycode"] == "USA", ["year", "pl_gdpo"]]
    .rename(columns={"year": "usd_year"})
    .set_index("usd_year")
)

# ratio of the 2017 level to each year's level; only this ratio is kept
usa_pl["gdp_defla"] = usa_pl.loc[2017, "pl_gdpo"] / usa_pl["pl_gdpo"]
defla_to_17 = usa_pl.drop(columns=["pl_gdpo"])

# attach the deflator to the PPP conversion rates via the shared `usd_year` level
ppp_to_17 = ppp_to_17.merge(defla_to_17, left_index=True, right_index=True, how="left")

In [None]:
# put the GDPpc (suffix `_x`) and GDP (suffix `_y`) records side by side and keep
# the (ccode, year) rows where both report a USD year and the two disagree; those
# are resolved by hand from the WFB versions (some USD years are assumed from their
# years)
gdppc_by_ccode_year = updated_gdppc_df.set_index(["ccode", "year"])
gdp_by_ccode_year = updated_gdp_df.set_index(["ccode", "year"])
check_usd_year = gdppc_by_ccode_year.merge(
    gdp_by_ccode_year, how="outer", left_index=True, right_index=True
)

both_reported = (
    check_usd_year.usd_year_y.notna() & check_usd_year.usd_year_x.notna()
)
disagree = check_usd_year.usd_year_y.ne(check_usd_year.usd_year_x)
check_usd_year = check_usd_year.loc[disagree & both_reported]

flagged_ccodes = check_usd_year.index.get_level_values("ccode").unique().values
print("Manually check the following countries:\n", flagged_ccodes)

In [None]:
# manual USD-year corrections, keyed by country code(s) and year(s)

# (ccode, years) rows where the GDPpc table takes its USD year from the GDP table
take_usd_year_from_gdp = [
    ("AND", [2010, 2011, 2013, 2014, 2015]),
    ("ASM", [2014, 2015]),
    ("GGY", [2014]),
    ("GNQ", [2011, 2012]),
    ("GRL", [2013, 2014]),
    ("JEY", [2015]),
    ("MAC", [2006, 2008, 2014, 2016]),
    ("MCO", [2006, 2009, 2011, 2013, 2014]),
    ("MHL", [2008]),
    ("MNP", [2014, 2015, 2016]),
    ("PLW", [2008]),
    ("PSE", [2012, 2013]),
    ("SOM", [2013, 2009, 2008]),
    ("SSD", [2010]),
    ("TUV", [2010]),
    ("VIR", [2011, 2012, 2014, 2015, 2016]),
]

# (ccodes, year) rows where the GDP table takes its USD year from the GDPpc table
take_usd_year_from_gdppc = [
    (["FSM", "NRU", "PLW"], 2013),
]

# index both tables by (ccode, year) so the rows above can be addressed directly
updated_gdppc_df = updated_gdppc_df.set_index(["ccode", "year"])
updated_gdp_df = updated_gdp_df.set_index(["ccode", "year"])

for ccode, years in take_usd_year_from_gdp:
    rows = (ccode, years)
    updated_gdppc_df.loc[rows, "usd_year"] = updated_gdp_df.loc[rows, "usd_year"].values

for ccodes, year in take_usd_year_from_gdppc:
    rows = (ccodes, year)
    updated_gdp_df.loc[rows, "usd_year"] = updated_gdppc_df.loc[rows, "usd_year"].values

# back to flat tables for the column-based merges that follow
updated_gdppc_df = updated_gdppc_df.reset_index()
updated_gdp_df = updated_gdp_df.reset_index()

In [None]:
def _convert_to_2017(wfb_df, ppp, value_col, usd_17_col, ppp_17_col):
    """Attach conversion rates to ``wfb_df`` and add its 2017-terms columns.

    Rates are merged on the ``ccode`` and ``usd_year`` columns rather than on the
    index, because ccode-usd_year pairs are not unique in the CIA WFB datasets.
    Rows without a PPP conversion rate get ``conv = 1`` and are flagged with
    ``"neutral_assumption"`` in both fill columns.
    """
    out = wfb_df.merge(ppp.reset_index(), how="left", on=["ccode", "usd_year"])

    no_rate = out["conv"].isnull()
    out.loc[no_rate, ["conv_fill", "pl_gdpo_fill"]] = "neutral_assumption"
    out.loc[no_rate, "conv"] = 1

    # only turning USD values to 2017 USD values, as we aren't too sure about the
    # PPP base year
    out[usd_17_col] = out[[value_col, "gdp_defla"]].product(axis=1)

    # assuming PPP year = USD year, turning to constant 2017 PPP USD terms
    out[ppp_17_col] = out[["conv", usd_17_col]].product(axis=1)
    return out


# GDP per capita
ppp_17_gdppc_df = _convert_to_2017(
    updated_gdppc_df, ppp_to_17, "gdppc", "gdppc_usd_17", "rgdpna_pc_17"
)

# same process for GDP
ppp_17_gdp_df = _convert_to_2017(
    updated_gdp_df, ppp_to_17, "gdp", "gdp_usd_17", "rgdpna_17"
)

### Merging population, GDP, and GDP per capita datasets altogether

In [None]:
def _tagged_column_names(tag):
    """Map the bookkeeping columns to names suffixed with ``tag``."""
    return {
        "usd_year": f"orig_usd_year_{tag}",
        "wfb_year": f"wfb_year_{tag}",
        "conv_fill": f"conv_fill_{tag}",
        "pl_gdpo_fill": f"pl_gdpo_fill_{tag}",
    }


# GDP and GDPpc share their bookkeeping column names, so each set is tagged with its
# variable before merging; the conversion inputs and raw values are dropped
gdp_rename = _tagged_column_names("gdp")
gdp_merge_ready = ppp_17_gdp_df.drop(columns=["gdp_defla", "conv", "gdp"]).rename(
    columns=gdp_rename
)

gdppc_rename = _tagged_column_names("gdppc")
gdppc_merge_ready = ppp_17_gdppc_df.drop(
    columns=["gdp_defla", "conv", "gdppc"]
).rename(columns=gdppc_rename)

# outer merge of GDP and GDPpc on (ccode, year)
gdp_by_ccode_year = gdp_merge_ready.set_index(["ccode", "year"])
gdppc_by_ccode_year = gdppc_merge_ready.set_index(["ccode", "year"])
gdp_and_gdppc_merge_ready = gdp_by_ccode_year.merge(
    gdppc_by_ccode_year, left_index=True, right_index=True, how="outer"
)

# outer merge of population with GDP + GDPpc
pop_by_ccode_year = updated_pop_df.set_index(["ccode", "year"]).rename(
    columns={"wfb_year": "wfb_year_pop"}
)
all_merged = pop_by_ccode_year.merge(
    gdp_and_gdppc_merge_ready, left_index=True, right_index=True, how="outer"
)

### Exporting

In [None]:
# column order of the exported table: values first, then source versions, original
# USD years, and the fill flags
ordering = [
    "pop",
    "gdp_usd_17",
    "rgdpna_17",
    "gdppc_usd_17",
    "rgdpna_pc_17",
    "wfb_year_pop",
    "wfb_year_gdp",
    "wfb_year_gdppc",
    "orig_usd_year_gdp",
    "orig_usd_year_gdppc",
    "conv_fill_gdp",
    "conv_fill_gdppc",
    "pl_gdpo_fill_gdp",
    "pl_gdpo_fill_gdppc",
]

# keep only the ordered columns, store `year` as int64, and sort by (ccode, year)
all_merged = (
    all_merged[ordering]
    .reset_index()
    .astype({"year": "int64"})
    .set_index(["ccode", "year"])
    .sort_index()
)

# exporting
all_merged.to_parquet(sset.PATH_CIA_INT)