## Organizing historical data (GDP, GDPpc, and population for 2000-2020) in long-panel format, converting to current and constant PPP terms, taking care of missing data

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from itertools import product as lstprod

import numpy as np
import pandas as pd
import statsmodels.api as sm
import xarray as xr
from sliiders import country_level_ypk as ypk_fn
from sliiders import settings as sset
from sliiders.io import save
from statsmodels.formula.api import ols

# every country/region code accepted for the historical data (first column of the file)
ALL_ISOS = pd.read_parquet(sset.PATH_ALL_VALID_HIST_CCODES).iloc[:, 0].values
# single-territory codes only: any code containing "+" (e.g. "CYP+ZNC") is excluded
FINAL_ISOS = [i for i in ALL_ISOS if "+" not in i]

# row filter used when reading parquet files: keep only the accepted country codes
CCODE_FILTER = ("ccode", "in", ALL_ISOS)

In [3]:
def check_missing_isos(df):
    """Return the `ccode` index values of `df` that are not listed in `ALL_ISOS`.

    `df` must carry an index level named "ccode". The result keeps the order in
    which the codes first appear and is empty when every code is recognized.
    """
    ccodes = df.index.get_level_values("ccode").unique()
    is_known = np.isin(ccodes, ALL_ISOS)
    return ccodes[~is_known]

## Importing all raw data, and creating a merged, long-panel version

### PWT

In [4]:
# read the raw PWT workbook and keep only identifiers, population and GDP series
pwt100 = pd.read_excel(sset.PATH_PWT_RAW)
pwt100.rename(columns={"countrycode": "ccode"}, inplace=True)
pwt_gdp_pop = ["ccode", "year", "pop", "rgdpo", "rgdpna", "cgdpo"]
gdp_pop_df = pwt100[pwt_gdp_pop].copy()

# fixing for FRA, CYP, and SRB as these countries actually account for more than their
# ISO code designation in PWT 10.0; FRA: FRA+GLP+GUF+MTQ+MYT+REU
# CYP: CYP+ZNC (1950-1973), CYP (1974-2019); SRB: SRB+XKO (1950-1999), SRB (2000-2019)
# NOTE(review): the SRB year ranges above are stated to match the `year < 2000`
# relabelling below -- confirm against the PWT 10.0 documentation
gdp_pop_df.loc[gdp_pop_df.ccode == "FRA", "ccode"] = sset.FRA_OVERSEAS_DEPT
gdp_pop_df.loc[
    (gdp_pop_df.ccode == "CYP") & (gdp_pop_df.year <= 1973), "ccode"
] = "CYP+ZNC"
gdp_pop_df.loc[
    (gdp_pop_df.ccode == "SRB") & (gdp_pop_df.year < 2000), "ccode"
] = "SRB+XKO"

# creating GDP per capita variables (done before the rescaling below; GDP and
# population are multiplied by the same factor, so the ratio is unaffected)
for i in ["rgdpo", "rgdpna", "cgdpo"]:
    gdp_pop_df[f"{i}_pc"] = gdp_pop_df[i].div(gdp_pop_df["pop"])
gdp_pop_df.set_index(["ccode", "year"], inplace=True)

# convert from millions to ones (assumes PWT reports these series in millions --
# TODO confirm)
gdp_pop_df[["pop", "rgdpo", "rgdpna", "cgdpo"]] *= 1e6

### UN population data (UN WPP)

In [5]:
# UN WPP rows for the accepted country codes up to 2020: values are summed across
# the columns of each row, then summed again within each (ccode, year)
# NOTE(review): presumably the columns/rows are demographic breakdowns (e.g. age or
# sex groups) -- confirm against the intermediate file
unpop = (
    pd.read_parquet(
        sset.PATH_UN_WPP_INT,
        filters=[CCODE_FILTER, ("year", "<=", 2020)],
    )
    .sum(axis=1)
    .groupby(["ccode", "year"])
    .sum()
    .rename("un_pop")
    .reset_index()
)

# CYP indicates CYP+ZNC in this dataset and SRB indicates SRB+XKO
unpop["ccode"] = unpop.ccode.replace({"CYP": "CYP+ZNC", "SRB": "SRB+XKO"})
unpop = unpop.set_index(["ccode", "year"]).un_pop

# merging (outer join keeps country-years that appear in only one of the sources)
gdp_pop_df = gdp_pop_df.join(unpop, how="outer").sort_index()

### WB WDI

In [6]:
# WB WDI: indicator code -> column name used in this notebook
wdi_rename_dict = {
    "SP.POP.TOTL": "wb_pop",
    "NY.GDP.MKTP.PP.KD": "wb_rgdpna",
    "NY.GDP.PCAP.PP.KD": "wb_rgdpna_pc",
    "NY.GDP.MKTP.KD": "wb_gdp_nom",
    "NY.GDP.PCAP.KD": "wb_gdppc_nom",
}

# Unifying the country code conventions for Kosovo, Channel Islands, CYP (indicates
# CYP+ZNC) and FRA (indicates FRA+GLP+GUF+MTQ+MYT+REU)
wdi_ccode_fix = {
    "XKX": "XKO",
    "CHI": "GGY+JEY",
    "CYP": "CYP+ZNC",
    "FRA": sset.FRA_OVERSEAS_DEPT,
}

wb_wdi = (
    pd.read_parquet(
        sset.DIR_WB_WDI_RAW / "wdi_pop_iy_gdp.parquet",
        columns=[*wdi_rename_dict, "country"],
    )
    .rename(columns=wdi_rename_dict)
    .reset_index()
)
wb_wdi["ccode"] = wb_wdi["ccode"].replace(wdi_ccode_fix)

# keep only accepted country codes and index by country-year
wb_wdi = wb_wdi.loc[wb_wdi["ccode"].isin(ALL_ISOS)].set_index(["ccode", "year"])

In [7]:
# merging only the renamed WDI indicator columns (the `country` column is left out)
gdp_pop_df = gdp_pop_df.join(wb_wdi[list(wdi_rename_dict.values())], how="outer")

### Fariss et al. (2022, JCR) dataset

In [8]:
DIR_FARISS = sset.DIR_YPK_INT / "Fariss_JCR_2022"
# Fariss et al. indicator name -> column name used in this notebook
indicators = {
    "PWT100_gdp_ppp_bc_2017": "fariss_rgdpna",
    "PWT100_gdp_ppp_bt_2017": "fariss_cgdpe",
}

# mean GDP estimates, one column per indicator, rescaled by 1e6
# NOTE(review): `drop_duplicates` on a Series compares values only (not the index),
# so two distinct country-years with an identical `mean` would collapse into one --
# confirm this is the intended de-duplication
far_gdp = (
    pd.read_parquet(
        DIR_FARISS / "fariss_gdp.parquet",
        filters=[CCODE_FILTER, ("indicator", "in", indicators.keys())],
        columns=["mean"],
    )["mean"]
    .drop_duplicates()
    .unstack("indicator")
    .rename(columns=indicators)
    .sort_index()
    * 1e6
)
# mean population estimates for the "PWT100_pop" indicator, rescaled by 1e6
far_pop = (
    pd.read_parquet(DIR_FARISS / "fariss_pop.parquet", filters=[CCODE_FILTER])
    .drop_duplicates()
    .loc["PWT100_pop", "mean"]
    .rename("fariss_pop")
    .sort_index()
    * 1e6
)

# based on the rgdpna-cgdpe relationship in PWT 10.0, 2017 values must be similar,
# but Fariss et al. (2022) doesn't seem to take care of this. Based on inspection,
# we will create `fariss_cgdpo` based on `fariss_rgdpna` and use the growth rates wrt
# 2017 `fariss_cgdpe`.
far_gdp_2017 = far_gdp.loc[(slice(None), 2017), :].droplevel("year")
far_gdp["fariss_cgdpo"] = (
    far_gdp.fariss_cgdpe * far_gdp_2017.fariss_rgdpna / far_gdp_2017.fariss_cgdpe
)
far_gdp = far_gdp.drop(columns="fariss_cgdpe")

# Some country-level information needs cleaning, in terms of scales
div_by_10000 = ["SOM", "SSD", "LIE", "MCO", "AND", "ERI"]
div_by_1000 = ["CUB", "PRK"]
far_gdp.loc[div_by_10000] /= 1e4
far_gdp.loc[div_by_1000] /= 1e3

# add per capita versions of every GDP column (suffix "_pc") and the population itself
far = far_gdp.join(
    far_gdp.div(far_pop, axis=0).rename(columns=lambda x: x + "_pc")
).join(far_pop, how="outer")

In [9]:
# merging the Fariss et al. columns into the panel
gdp_pop_df = gdp_pop_df.join(far, how="outer")

### Maddison Project Database (MPD)

In [10]:
# MPD workbook indexed by country-year; the `country` column and the rows for
# the codes CSK, SUN and YUG are dropped
mpd = (
    pd.read_excel(sset.PATH_MPD_RAW)
    .rename(
        columns={"countrycode": "ccode", "gdppc": "mpd_rgdpna_pc", "pop": "mpd_pop"}
    )
    .set_index(["ccode", "year"])
    .drop(columns="country", index=["CSK", "SUN", "YUG"])
    .sort_index()
)
assert len(check_missing_isos(mpd)) == 0

# population is in 1000s
mpd["mpd_pop"] *= 1000

# some minor interpolation for the case of North Korea: interpolate in logs along
# the year index, then use the result only to fill values that are missing in `mpd`
mpd = mpd.fillna(
    np.exp(
        np.log(mpd.loc[["PRK"], ["mpd_rgdpna_pc", "mpd_pop"]])
        .reset_index(level="ccode")
        .interpolate(method="index")
        .set_index("ccode", append=True)
    )
).reset_index()

# fixing CYP (indicates CYP+ZNC) and FRA (indicates FRA+GLP+GUF+MTQ+MYT+REU)
mpd.loc[mpd.ccode == "CYP", "ccode"] = "CYP+ZNC"
mpd.loc[mpd.ccode == "FRA", "ccode"] = sset.FRA_OVERSEAS_DEPT

mpd = mpd.set_index(["ccode", "year"])

In [11]:
# merging the MPD columns into the panel
gdp_pop_df = gdp_pop_df.join(mpd, how="outer")

### IMF

In [12]:
# IMF WEO workbook indexed by ISO code; "n/a" and "--" are read as missing
imf = (
    pd.read_excel(sset.PATH_IMF_WEO_RAW, na_values=["n/a", "--"], index_col="ISO")
    .rename(columns={"Subject Descriptor": "subject"})
    .rename_axis("ccode")
)
imf = imf.loc[imf.index.get_level_values("ccode").isin(ALL_ISOS)]

# renaming the subjects
imf_rename = {
    "Gross domestic product per capita, constant prices": "imf_rgdpna_pc",
    "Gross domestic product per capita, current prices": "imf_gdppc_nom",
    "Gross domestic product, current prices": "imf_gdp_nom",
    "Population": "imf_pop",
    "Total investment": "imf_iy_ratio",
}
imf["subject"] = imf.subject.replace(imf_rename)
# the year columns are renamed from strings ("1980".."2020") to integers
v_names = {str(k): k for k in range(1980, 2021)}
imf = imf[imf.subject.isin(imf_rename.values())].rename(columns=v_names)

# one multiplier per subject, derived from the `Scale` column; a missing scale maps
# to 0.01
# NOTE(review): presumably the missing-scale subject is a percentage being turned
# into a fraction -- confirm
multipliers = (
    imf[["subject", "Scale"]]
    .drop_duplicates()
    .set_index("subject")
    .Scale.replace({"Billions": 1e9, "Units": 1, "Millions": 1e6, np.nan: 0.01})
)

# reshape and get in units of ones: long by year, then one column per subject, each
# column scaled by its multiplier; country-years with no data at all are dropped
imf = (
    imf.melt(
        id_vars="subject",
        var_name="year",
        value_vars=v_names.values(),
        ignore_index=False,
    )
    .set_index(["subject", "year"], append=True)
    .value.unstack("subject")
    .rename_axis(columns=None)
    * multipliers
).dropna(how="all")

In [13]:
# merging the IMF columns into the panel
gdp_pop_df = gdp_pop_df.join(imf, how="outer")

### OECD regional data

Among the relevant countries and regions we want to observe, only the five French overseas departments (Martinique, Mayotte, Guadeloupe, French Guiana, and La Réunion) are available in OECD regional data.

In [14]:
# first look at the OECD regional demography file
# NOTE(review): the following cell reads the same file with the same columns again
# and overwrites `regpop`, so this read appears to be redundant
regpop = pd.read_csv(
    sset.DIR_OECD_REGIONS_RAW / "REGION_DEMOGR.csv",
    usecols=[
        "Territory Level and Typology",
        "TIME",
        "VAR",
        "Gender",
        "Value",
        "Region",
    ],
).rename(
    columns={
        "Territory Level and Typology": "terrtype",
        "TIME": "year",
        "Value": "oecd_pop",
        "Region": "ccode",
    }
)

In [15]:
# mapping region names and ISO codes ("France" maps to the combined
# mainland + overseas departments code)
fra_regions = [
    "France",
    "Martinique",
    "Mayotte",
    "Guadeloupe",
    "French Guiana",
    "La Réunion",
]
fra_isos = [sset.FRA_OVERSEAS_DEPT, "MTQ", "MYT", "GLP", "GUF", "REU"]
fra_map = pd.Series(fra_isos, index=pd.Index(fra_regions, name="Region"), name="ccode")

# reading in the OECD data for population and gdp; region names in `ccode` are
# replaced by their codes via `fra_map`
regpop = (
    pd.read_csv(
        sset.DIR_OECD_REGIONS_RAW / "REGION_DEMOGR.csv",
        usecols=[
            "Territory Level and Typology",
            "TIME",
            "VAR",
            "Gender",
            "Value",
            "Region",
        ],
    )
    .rename(
        columns={
            "Territory Level and Typology": "terrtype",
            "TIME": "year",
            "Value": "oecd_pop",
            "Region": "ccode",
        }
    )
    .replace({"ccode": fra_map})
)
# keep rows with VAR == "T" and Gender == "Total" for the mapped French codes only
regpop = (
    regpop.loc[
        regpop.VAR.eq("T")
        & regpop.Gender.eq("Total")
        & regpop.ccode.isin(fra_map.values),
        ["ccode", "year", "oecd_pop"],
    ]
    .set_index(["ccode", "year"])
    .oecd_pop
)

regecon = (
    pd.read_csv(
        sset.DIR_OECD_REGIONS_RAW / "REGION_ECONOM.csv",
        usecols=[
            "Territory Level and Typology",
            "TIME",
            "MEAS",
            "Value",
            "Region",
        ],
    )
    .rename(
        columns={
            "Territory Level and Typology": "terrtype",
            "TIME": "year",
            "Region": "ccode",
        }
    )
    .replace({"ccode": fra_map})
)
# one Series per measure: (MEAS code, output name, multiplier applied to the values)
regecon = [
    (
        regecon.loc[
            regecon.MEAS.eq(k) & regecon.ccode.isin(fra_map.values),
            ["ccode", "year", "Value"],
        ]
        .set_index(["ccode", "year"])
        .Value.rename(v)
    )
    * mult
    for k, v, mult in [
        ("USD_PPP", "oecd_rgdpna", 1e6),
        ("PC_USD_PPP", "oecd_rgdpna_pc", 1),
    ]
]
# population and the two GDP series side by side, indexed by (ccode, year)
fra_detect = regpop.to_frame().join(regecon, how="outer")

In [16]:
# merging the OECD regional columns into the panel
gdp_pop_df = gdp_pop_df.join(fra_detect, how="outer")

### CIA World Factbook

CIA information has been pre-cleaned to be in 2017 PPP USD, part of which has used some extrapolation for PPP conversion rates.

In [17]:
# CIA World Factbook column -> column name used in this notebook
cia_rename = {
    "rgdpna_17": "cia_rgdpna",
    "rgdpna_pc_17": "cia_rgdpna_pc",
    "pop": "cia_pop",
}
cia = pd.read_parquet(
    sset.PATH_CIA_INT, filters=[CCODE_FILTER], columns=list(cia_rename)
)
cia = cia.rename(columns=cia_rename).sort_index().dropna(how="all")

# drop rows whose population is exactly 0; rows with missing population are kept
# (a missing value compares as "not equal to 0")
cia = cia.loc[cia["cia_pop"].ne(0)]

assert not len(check_missing_isos(cia))

In [18]:
# merging the CIA columns into the panel
gdp_pop_df = gdp_pop_df.join(cia, how="outer")

### UN SNA AMA information

In [19]:
# UN AMA per capita GDP (`gdppc_nom_current`), keeping only rows that have a
# country code, as a Series indexed by (ccode, year)
un_nom_y = (
    pd.read_parquet(sset.PATH_UN_AMA_INT, columns=["gdppc_nom_current", "ccode"])
    .dropna(subset="ccode")
    .reset_index()
    .set_index(["ccode", "year"])
    .gdppc_nom_current.rename("un_gdppc_nom")
)
# every code in this source must be one of the accepted codes
assert len(check_missing_isos(un_nom_y)) == 0
gdp_pop_df = gdp_pop_df.join(un_nom_y, how="outer")

### Information from various disaggregated sources, for smaller regions, territories and countries

This includes national account reports and approximations from organizational reports or academic papers. We use the dataset that has been pre-cleaned in the notebook `ypk1_prep_clean.ipynb`.

In [20]:
# auxiliary income/population dataset, merged as-is
# (assumes it is already indexed by (ccode, year) -- TODO confirm)
various_sources = pd.read_parquet(sset.PATH_INC_POP_AUX)
gdp_pop_df = gdp_pop_df.join(various_sources, how="outer")

### Uninhabited areas (no population and no economic activity)

These are: the French Southern and Antarctic Lands (`ATF`), Bouvet Island (`BVT`), Clipperton Island (`XCL`), Heard and McDonald Islands (`HMD`), and South Georgia and the South Sandwich Islands (`SGS`). Also, `UMI` is uninhabited post-2000.
- `ATF`: According to the CIA World Factbook website (link [here](https://www.cia.gov/the-world-factbook/countries/french-southern-and-antarctic-lands/)), `ATF` does not have permanent population and therefore we will record this as having no population and no economic activity.
- `BVT`: uninhabited to protect nature reserve
- `XCL`: is an atoll with no permanent inhabitants since 1945.
- `HMD`: is an Australian external territory near Antarctica.
- `SGS`: is uninhabited
- `XSP`: Spratly Islands
- `UMI`: has some population up to 2000, but post-2000, has no indigenous population or is generally uninhabited.

In [21]:
# assigning 0 population and 0 GDP
# The FRA rows from 1950 onward only serve as a template for the year index; their
# values are blanked out first and then overwritten with zeros.
atf_pop = (
    (gdp_pop_df.loc[pd.IndexSlice["FRA", 1950:], "pop"].copy() * np.nan)
    .rename("noecon_pop")
    .reset_index()
)
# the zeros must be written to `noecon_pop`, the column that is extracted and merged
# below; writing them to any other column would leave the merged values as NaN
atf_pop["ccode"], atf_pop["noecon_pop"] = "ATF", 0
atf_pop.set_index(["ccode", "year"], inplace=True)
atf_gdp = atf_pop.copy().rename(columns={"noecon_pop": "noecon_rgdpna"})
atf_pop, atf_gdp = atf_pop.noecon_pop, atf_gdp.noecon_rgdpna

# merging all
gdp_pop_df = gdp_pop_df.join(atf_pop, how="outer")
gdp_pop_df = gdp_pop_df.join(atf_gdp, how="outer")

# The other uninhabited areas (all except ATF itself and UMI) receive a copy of the
# ATF rows under their own code. The ATF rows are extracted once and all copies are
# appended in a single concat below instead of growing the frame inside a loop.
atf_rows = gdp_pop_df.loc[(["ATF"], slice(None)), :].reset_index()
other_uninhabited = [
    atf_rows.assign(ccode=i).set_index(["ccode", "year"])
    for i in np.setdiff1d(sset.UNINHABITED_ISOS, ["ATF", "UMI"])
]

# UMI, 2001-2020 (up to the last year present in the panel)
other_umi = pd.DataFrame(
    {"noecon_pop": 0, "noecon_rgdpna": 0},
    index=pd.MultiIndex.from_product(
        (["UMI"], np.arange(2001, gdp_pop_df.index.get_level_values("year").max() + 1)),
        names=["ccode", "year"],
    ),
)
gdp_pop_df = pd.concat([gdp_pop_df, *other_uninhabited, other_umi], axis=0)

### Exporting the intermediate result

In [22]:
# each (ccode, year) pair must appear exactly once before the panel is exported
assert gdp_pop_df.index.is_unique
gdp_pop_df = gdp_pop_df.sort_index()

In [23]:
# write the merged multi-source panel to the intermediate output path
save(gdp_pop_df, sset.PATH_INC_POP_AGG)

## Historical population (2000-2020), creating a single sequence

### Re-reading in aggregated sources population and GDP data, in case necessary

In [24]:
# reload the exported panel so the sections below can run without the cells above
gdp_pop_df = pd.read_parquet(sset.PATH_INC_POP_AGG)

### Splitting out population data

In [25]:
# every column whose name contains "pop"; rows with no population value from any
# source are dropped
popraw_df = gdp_pop_df.filter(like="pop", axis="columns").dropna(how="all")

# Fill missing values of a target column for 2020 onward by applying an auxiliary
# source's growth relative to 2019:
#     target[2019] * aux[year] / aux[2019]
# Each pair is (auxiliary source prefix, target column); pairs are applied in order
# and only fill values that are still missing, so for `pop` the UN-based estimate
# takes precedence over the IMF-based one.
for aux_src, aux_trg in [
    ("ggy_gov", "un_pop"),
    ("aland", "un_pop"),
    ("un", "pop"),
    ("imf", "pop"),
]:
    estimated_2020 = (
        popraw_df.loc[(slice(None), 2019), aux_trg].droplevel("year")
        * popraw_df.loc[(slice(None), slice(2020, None)), f"{aux_src}_pop"]
        / popraw_df.loc[(slice(None), 2019), f"{aux_src}_pop"].droplevel("year")
    ).rename(aux_trg)
    popraw_df[aux_trg] = popraw_df[aux_trg].fillna(estimated_2020)
popraw_df = popraw_df.sort_index()

### Helper funcs

In [26]:
def get_iso_pop(ccode, trg_yrs=sset.HISTORICAL_YEARS):
    """Return the `pop` column of `popraw_df` for `ccode` over `trg_yrs`.

    The result has columns `pop` and `pop_source`; the source is "PWT" for every
    year except 2020, which is labelled "PWT_ratio_UN".
    """
    out = popraw_df.loc[(ccode, trg_yrs), ["pop"]].assign(pop_source="PWT")
    out.loc[(ccode, 2020), "pop_source"] = "PWT_ratio_UN"
    return out


def interp_extrap_terr(
    iso, parent, pop_name, pop_source, trg_yrs=sset.HISTORICAL_YEARS
):
    """Build a population series for territory `iso` covering `trg_yrs`.

    Values are read from column `pop_name` of `popraw_df`. Years between the first
    and last year available for `iso` are interpolated linearly in logs; target
    years that are still missing afterwards are filled with the parent's `pop`
    multiplied by the territory/parent ratio taken from the nearest available year.

    Parameters
    ----------
    iso : code of the territory to fill
    parent : code whose `pop` column in `popraw_df` drives the extrapolation
    pop_name : column of `popraw_df` holding the territory's population
    pop_source : label recorded in `pop_source` for the territory's own data years
    trg_yrs : years to return

    Returns
    -------
    DataFrame indexed by (ccode, year) with columns `pop` and `pop_source`, where
    `pop_source` is `pop_source` for the territory's own data years, "interp" for
    interpolated years, and "PWT_{parent}_ratio_{pop_source}_{iso}" for
    extrapolated years.
    """
    df = popraw_df.loc[iso, [pop_name]].rename(columns={pop_name: "pop"})

    # years present in the index for this territory
    # NOTE(review): rows are not filtered on `pop_name` being non-null, so this
    # assumes every indexed year actually has a value in `pop_name` -- confirm
    cen_yrs = df.index.values

    # interpolate (in logs, over every year between the first and last data year)
    out = np.exp(
        np.log(df.reindex(range(df.index.min(), df.index.max() + 1))).interpolate(
            "index"
        )
    )

    out["pop_source"] = "interp"
    out.loc[cen_yrs, "pop_source"] = pop_source

    # country-territory ratios for extrapolation
    ratio = (
        out.loc[cen_yrs, "pop"]
        / popraw_df.loc[(parent, cen_yrs), "pop"].droplevel("ccode")
    ).rename("ratio")

    # each target year gets the ratio of the nearest data year; multiplying the two
    # merged columns gives parent population x ratio
    extrapolated = pd.merge_asof(
        popraw_df.loc[(parent, trg_yrs), "pop"].droplevel("ccode"),
        ratio,
        left_index=True,
        right_index=True,
        direction="nearest",
    ).product(axis=1)
    # observed/interpolated values take precedence; extrapolated values only fill gaps
    return (
        out.reindex(extrapolated.index)
        .fillna(
            extrapolated.to_frame("pop").assign(
                pop_source=f"PWT_{parent}_ratio_{pop_source}_{iso}"
            )
        )
        .reset_index()
        .assign(ccode=iso)
        .set_index(["ccode", "year"])
    )

### Cleaning country-specific cases

#### Australia (`AUS`) and its territories

- Mainland Australia (`AUS`): use PWT 10.0 (2000-2019) and UN growth rates (2019-2020)
- External Territories: Use CIA WFB and interpolate/extrapolate using growth rates from mainland AUS

In [27]:
# mainland AUS first, then each external territory filled from CIA data with AUS
# as the parent for extrapolation
aus_frames = [get_iso_pop("AUS")]
for aus_terr in ("CCK", "CXR", "NFK"):
    aus_frames.append(interp_extrap_terr(aus_terr, "AUS", "cia_pop", "CIA"))

pop_cleaned = pd.concat(aus_frames).sort_index()

#### Cyprus (`CYP`) and Northern Cyprus (`ZNC`)

- `CYP`: use PWT 10.0 (2000-2019) and IMF growth rates (2019-2020)
- `ZNC`: use UN population, which has `CYP+ZNC` information, then subtract `CYP` from these values

In [28]:
# CYP: the `pop` column as-is; the 2020 value is labelled as a growth-rate fill
cyp_pop = popraw_df.loc[["CYP"], ["pop"]]
cyp_pop["pop_source"] = "PWT"
cyp_pop.loc[("CYP", 2020), "pop_source"] = "PWT + IMF_gr"

# ZNC: UN population of CYP+ZNC minus the CYP population, on the years available
# for CYP
XNC_pop = (
    (
        popraw_df.loc["CYP+ZNC", "un_pop"].reindex(
            index=cyp_pop.droplevel("ccode").index
        )
        - cyp_pop["pop"].droplevel("ccode")
    )
    .to_frame(name="pop")
    .assign(ccode="ZNC")
    .reset_index()
    .set_index(["ccode", "year"])
)
XNC_pop["pop_source"] = "UN:CYP+ZNC + PWT:CYP_subtract"
XNC_pop.loc[("ZNC", 2020), "pop_source"] = "UN:CYP+ZNC + PWT+IMF_gr:CYP_subtract"

In [29]:
# appending CYP and ZNC to the cleaned population table
pop_cleaned = pd.concat([pop_cleaned, cyp_pop, XNC_pop])

#### Danish and Finnish territories

Mainland Denmark (`DNK`) and Finland (`FIN`) will be added later.

- Danish territories (`FRO`, `GRL`): use UN population (2000-2020)
- Aland (`ALA`, Finnish): use information from Åland Statistics

In [30]:
# FRO and GRL: UN population
fro_grl_pop = (
    popraw_df.loc[["FRO", "GRL"], ["un_pop"]]
    .rename(columns={"un_pop": "pop"})
    .assign(pop_source="UN")
)

# Aland: population from the `aland_pop` column
ala_pop = (
    popraw_df.loc[["ALA"], ["aland_pop"]]
    .rename(columns={"aland_pop": "pop"})
    .assign(pop_source="ALA-GOV")
)

In [31]:
# appending FRO, GRL and ALA to the cleaned population table
pop_cleaned = pd.concat([pop_cleaned, fro_grl_pop, ala_pop])

#### France (`FRA`) and its territories 

We use UN population as its `FRA` information does not include the Overseas Departments.
- Mainland France: `FRA`
- Overseas Departments: `GLP`, `GUF`, `MTQ`, `MYT`, `REU`
- Overseas Collectivities: `PYF`, `BLM`, `SPM`, `MAF`, `WLF`
- Other: `NCL`

In [32]:
fra_ov_dept = ["GLP", "GUF", "MTQ", "MYT", "REU"]
fra_coll_other = ["PYF", "BLM", "SPM", "MAF", "WLF", "NCL"]

# UN population for mainland France, the overseas departments, and the
# collectivities / other territories
fra_codes = ["FRA", *fra_ov_dept, *fra_coll_other]
fra_df = (
    popraw_df.loc[fra_codes, ["un_pop"]]
    .rename(columns={"un_pop": "pop"})
    .assign(pop_source="UN")
)

In [33]:
# appending France and its territories to the cleaned population table
pop_cleaned = pd.concat([pop_cleaned, fra_df])

#### British territories

Mainland UK (`GBR`) will be added later.

These include:
- Overseas Territories
  - `PCN`, `IOT`: use CIA WFB, interpolating when necessary
  - `AIA`, `BMU`, `VGB`, `CYM`, `FLK`, `GIB`, `MSR`, `SHN`, `TCA`: use UN information
- Crown dependencies:
  - `GGY`: use information from the Government of Guernsey, interpolating when necessary
  - `JEY`: use information from the Government of Jersey, but for 2020 population use the 2018-2020 growth rate from CIA
  - `IMN`: use UN information

In [34]:
# IMN and other overseas territories: UN population
gbr_ov_terr = ["AIA", "BMU", "VGB", "CYM", "FLK", "GIB", "MSR", "SHN", "TCA"]
gbr_imn_ov_terr_pop = popraw_df.loc[gbr_ov_terr + ["IMN"], ["un_pop"]].rename(
    columns={"un_pop": "pop"}
)
gbr_imn_ov_terr_pop["pop_source"] = "UN"

# PCN and IOT: CIA population on a gap-free year axis, interpolated in logs
arr = popraw_df.loc[["PCN", "IOT"], "cia_pop"].to_xarray()
arr = arr.reindex(year=np.arange(arr.year.min(), arr.year.max() + 1))

pcniot_pop = (
    (
        np.exp(
            np.log(arr)
            .interpolate_na(dim="year")
            .rename("pop")
            .sel(year=sset.HISTORICAL_YEARS)
        )
    )
    .to_series()
    .to_frame()
    .assign(pop_source="CIA_interp")
)
# Country-years with an actual CIA observation are labelled "CIA"; everything else
# keeps the "CIA_interp" label. The check is made on `cia_pop` itself rather than on
# mere presence in `popraw_df`, since a row can exist there because of another source.
cia_observed = popraw_df["cia_pop"].reindex(pcniot_pop.index).notnull().to_numpy()
pcniot_pop.loc[cia_observed, "pop_source"] = "CIA"

# GGY: UN population; the 2020 value is labelled as a growth-rate fill
ggy_pop = popraw_df.loc[["GGY"], ["un_pop"]].rename(columns={"un_pop": "pop"})
ggy_pop["pop_source"] = "UN"
ggy_pop.loc[("GGY", 2020), "pop_source"] = "UN + GGY-GOV_gr"

# JEY: UN population
jey_pop = popraw_df.loc[["JEY"], ["un_pop"]].rename(columns={"un_pop": "pop"})
jey_pop["pop_source"] = "UN"

In [35]:
# appending the British overseas territories and crown dependencies
pop_cleaned = pd.concat(
    [
        pop_cleaned,
        gbr_imn_ov_terr_pop,
        pcniot_pop,
        ggy_pop,
        jey_pop,
    ]
)

#### New Zealand territories

Mainland New Zealand (`NZL`) will be added later.
- External territories (`TKL`, `NIU`, `COK`): use UN information (2000-2020)

In [36]:
# TKL, NIU, COK: UN population
tkl_niu_cok = (
    popraw_df.loc[["TKL", "NIU", "COK"], ["un_pop"]]
    .rename(columns={"un_pop": "pop"})
    .assign(pop_source="UN")
)

In [37]:
# appending the New Zealand territories to the cleaned population table
pop_cleaned = pd.concat([pop_cleaned, tkl_niu_cok])

#### Norway (`NOR`) and its territories

Mainland Norway (`NOR`) will be added later.

- `SJM`: use Statistics Norway information (2010-2020), and use growth rates from CIA WFB (2000-2009)

In [38]:
# SJM: `stat_nor_pop` where available; missing years are filled with the CIA series
# rescaled so that it matches `stat_nor_pop` in the first year that has a value
sjm_pop = gdp_pop_df.loc["SJM", "stat_nor_pop"]
st_year = sjm_pop.dropna().index.min()
sjm_pop_alt = popraw_df.loc["SJM", "cia_pop"]
sjm_pop = sjm_pop.fillna(sjm_pop_alt * sjm_pop[st_year] / sjm_pop_alt[st_year])
sjm_pop = sjm_pop.reset_index().assign(ccode="SJM").set_index(["ccode", "year"])
# label rows by whether the original `stat_nor_pop` value was present
sjm_pop["pop_source"] = "NOR-GOV"
sjm_pop["pop_source"] = sjm_pop.pop_source.where(
    popraw_df.loc[["SJM"], "stat_nor_pop"].notnull(), "NOR-GOV + CIA_gr"
)
sjm_pop = sjm_pop.rename(columns={"stat_nor_pop": "pop"})

In [39]:
# appending SJM to the cleaned population table
pop_cleaned = pd.concat([pop_cleaned, sjm_pop])

#### Serbia (`SRB`) and Kosovo (`XKO`)

- `XKO`: use WB WDI population
- `SRB`: use PWT population

In [40]:
# XKO: WB WDI population
ko_pop = (
    popraw_df.loc[["XKO"], "wb_pop"].rename("pop").to_frame().assign(pop_source="WB")
)

# SRB: the `pop` column, with the 2020 value labelled as a growth-rate fill
srb_pop = popraw_df.loc[["SRB"], ["pop"]].assign(pop_source="PWT")
srb_pop.loc[("SRB", 2020), "pop_source"] = "PWT + IMF_gr"

In [41]:
# appending XKO and SRB to the cleaned population table
pop_cleaned = pd.concat([pop_cleaned, ko_pop, srb_pop])

#### Paracel Islands

Use China growth to extrapolate single population estimate (2014) from CIA WFB

In [42]:
# XPI: CIA population, extended to all target years with CHN as the parent
pop_cleaned = pd.concat(
    (pop_cleaned, interp_extrap_terr("XPI", "CHN", "cia_pop", "CIA"))
)

#### Akrotiri and Dhekelia

In [43]:
# XAD: CIA population, extended to all target years with CYP as the parent
pop_cleaned = pd.concat(
    (pop_cleaned, interp_extrap_terr("XAD", "CYP", "cia_pop", "CIA"))
)

#### United States territories

It seems that there is some arbitrariness when it comes to including territories or excluding them, when calculating for the population (even within the same dataset, it is unclear whether the population for some year includes or excludes the territories). We will take the PWT10.0 population as the U.S. population without the territories and use UN growth rate when necessary; this will be added later.

US territories considered are:
- `UMI`: use US Census information for 2000, but will record as uninhabited otherwise
- `PRI`, `VIR`, `GUM`, `ASM`, `MNP`: use UN information

In [44]:
# UMI: `noecon_pop` where available, otherwise the US Census value
umi_df = (
    popraw_df.loc[["UMI"], "noecon_pop"]
    .fillna(popraw_df.loc[["UMI"], "us_census_pop"])
    .rename("pop")
    .to_frame()
)
# years up to and including 2000 are labelled as US government data, later years as
# uninhabited
umi_df["pop_source"] = "uninhabited"
umi_df.loc[(slice(None), slice(None, 2000)), "pop_source"] = "USA-GOV"

# PRI, VIR, GUM, ASM, MNP: UN population (rows with missing values dropped)
us_terr_df = (
    popraw_df.loc[["PRI", "VIR", "GUM", "ASM", "MNP"], ["un_pop"]]
    .dropna()
    .rename(columns={"un_pop": "pop"})
)
us_terr_df["pop_source"] = "UN"

In [45]:
# appending the US territories to the cleaned population table
pop_cleaned = pd.concat([pop_cleaned, umi_df, us_terr_df], axis=0)

#### Vatican

In [46]:
# VAT: CIA population on the historical years, gaps interpolated in logs; every
# row is labelled "CIA", whether observed or interpolated
vat_df = (
    np.exp(
        np.log(popraw_df.loc["VAT", "cia_pop"].reindex(sset.HISTORICAL_YEARS))
        .interpolate()
        .rename("pop")
    )
    .reset_index()
    .assign(ccode="VAT", pop_source="CIA")
    .set_index(["ccode", "year"])
)

pop_cleaned = pd.concat([pop_cleaned, vat_df])

#### Uninhabited areas

We exclude `UMI` here due to it having some population in 2000.

In [47]:
# zero population for every uninhabited code except UMI, for all historical years
uninhabited_index = pd.MultiIndex.from_product(
    (np.setdiff1d(sset.UNINHABITED_ISOS, ["UMI"]), sset.HISTORICAL_YEARS),
    names=["ccode", "year"],
)
uninhabited_pop = pd.DataFrame(
    {"pop": 0, "pop_source": "uninhabited"}, index=uninhabited_index
)
pop_cleaned = pd.concat([pop_cleaned, uninhabited_pop])

### PWT 10.0 population (and using UN growth rates when necessary)

In [48]:
# PWT country codes (as they appear in the raw workbook) not yet in `pop_cleaned`
remaining = np.setdiff1d(
    pwt100.ccode.unique(), pop_cleaned.index.get_level_values("ccode").unique()
)

In [49]:
# make sure we're not missing any pop data for an iso
codes_with_pop = popraw_df.index.get_level_values("ccode").unique()
assert np.isin(remaining, codes_with_pop).all()

# make sure all PWT isos are ones we want
assert not np.setdiff1d(remaining, ALL_ISOS).size

In [50]:
# we exclude the country codes that have already been cleaned
pwt_pop_clean = xr.Dataset.from_dataframe(
    popraw_df.loc[remaining, ["pop", "un_pop"]]
).sortby("year")
# fill `pop` using `un_pop` along the year dimension
# NOTE(review): the source labels below suggest `smooth_fill` applies UN growth
# rates to extend the PWT series -- confirm against `ypk_fn.smooth_fill`
pwt_pop_clean = ypk_fn.smooth_fill(
    pwt_pop_clean["pop"],
    pwt_pop_clean["un_pop"],
    time_dim="year",
    other_dim="ccode",
).to_dataframe()
# rows whose original `pop` value was missing are labelled as growth-rate fills
pwt_pop_clean["pop_source"] = "PWT"
pwt_pop_clean = pwt_pop_clean.join(popraw_df["pop"].rename("old"), how="left")
pwt_pop_clean.loc[pd.isnull(pwt_pop_clean["old"]), "pop_source"] = "PWT + UN_gr"
pwt_pop_clean = pwt_pop_clean.loc[
    (slice(None), sset.HISTORICAL_YEARS), ["pop", "pop_source"]
]

In [51]:
# appending the remaining PWT countries to the cleaned population table
pop_cleaned = pd.concat((pop_cleaned, pwt_pop_clean))

### UN population, for remaining countries and regions

In [52]:
# final codes that still have no non-missing population in `pop_cleaned`
remaining = np.setdiff1d(
    FINAL_ISOS,
    pop_cleaned.dropna(subset="pop").index.get_level_values("ccode").unique(),
)

In [53]:
# UN population for every remaining code over the historical years
other_from_un_df = gdp_pop_df.loc[
    (remaining, sset.HISTORICAL_YEARS), ["un_pop"]
].rename(columns={"un_pop": "pop"})
other_from_un_df["pop_source"] = "UN"

# converting to a dense (ccode, year) grid exposes absent country-years as nulls,
# so this checks both missing values and missing rows
assert other_from_un_df.to_xarray().notnull().all().to_array().all()

In [54]:
# appending the UN-only countries and regions to the cleaned population table
pop_cleaned = pd.concat([pop_cleaned, other_from_un_df])

In [55]:
# keep only the historical years for every country code
pop_cleaned = pop_cleaned.loc[(slice(None), sset.HISTORICAL_YEARS), :]

In [56]:
# Every (ccode, year) combination must have both a population value and a source.
# `.all()` on a Dataset returns another Dataset, whose truth value only reflects
# whether it holds any data variables, so it has to be reduced to a single boolean
# with `.to_array().all()` for the assertion to actually test the data.
assert pop_cleaned.to_xarray().notnull().all().to_array().all()

### Rounding to ones of people and attaching units

In [57]:
# population as whole numbers of people
pop_cleaned["pop"] = pop_cleaned["pop"].round().astype(int)

## Historical GDPpc (2000-2020), constant 2017 PPP USD, creating a single sequence

We will first work with the constant 2017 PPP USD (i.e., ones with the `rgdpna` in their names), then create current PPP 2017 USD versions appropriately (in accordance with `cgdpo`). We will work with **per capita** GDP when doing any form of extrapolation, whether that is based on growth rates or regression analysis.

### Making UN AMA current, non-PPP GDP per capita to constant 2017 USD, non-PPP GDP per capita

These values are not directly used, but will be useful in acquiring constant PPP-equivalent values later on.

In [58]:
# reload the exported multi-source panel for the GDP per capita section
gdp_pop_df = pd.read_parquet(sset.PATH_INC_POP_AGG)

In [59]:
# PWT `pl_gdpo` series, indexed by (ccode, year)
pl = (
    pd.read_excel(sset.PATH_PWT_RAW, usecols=["countrycode", "year", "pl_gdpo"])
    .rename(columns={"countrycode": "ccode"})
    .set_index(["ccode", "year"])
    .pl_gdpo
)

# deflator = USA `pl_gdpo` by year, with the 2019 value repeated for 2020
defla = pd.concat(
    [
        pl.loc["USA"].reset_index(),
        pd.DataFrame(data={"year": [2020], "pl_gdpo": [pl.loc["USA"].loc[2019]]}),
    ],
    axis=0,
).set_index("year")
# every country-year receives the USA value of its year, then the nominal UN series
# is divided by it
gdp_pop_df = gdp_pop_df.join(defla, how="left")
gdp_pop_df["un_gdppc_const"] = gdp_pop_df.un_gdppc_nom / gdp_pop_df.pl_gdpo

### Using primary sources that are in constant PPP terms

#### Starting with PWT 10.0 and filling missing values with Fariss et al. (2022)

Note that Fariss et al. (2022) have a variable that we renamed `fariss_rgdpna_pc`, which is imputed per capita `rgdpna` (i.e., `rgdpna` divided by `pop` in PWT 10.0, which we named `rgdpna_pc`)

In [60]:
# fetching PWT information; the source label is blanked wherever the value is missing
y_df = gdp_pop_df.loc[(slice(None), sset.HISTORICAL_YEARS), ["rgdpna_pc"]]
y_df["gdppc_source"] = "PWT"
y_df.loc[pd.isnull(y_df.rgdpna_pc), "gdppc_source"] = np.nan

# attaching Fariss et al. (2022)
# NOTE(review): the last two arguments look like the source labels for values
# copied directly ("FAMB") and values filled via growth rates ("PWT + FAMB_gr") --
# confirm against `ypk_fn.smooth_fill_aux_source`
y_df = ypk_fn.smooth_fill_aux_source(
    y_df, gdp_pop_df, "fariss_rgdpna_pc", "FAMB", "PWT + FAMB_gr"
)

# codes with at least one country-year labelled "FAMB"
famb_copy = y_df.loc[y_df.gdppc_source == "FAMB", :].reset_index().ccode.unique()
print("Countries copying information from Fariss et al. are:")
print(famb_copy)

Countries copying information from Fariss et al. are:
['AFG' 'AND' 'CUB' 'ERI' 'FSM' 'KIR' 'LBY' 'LIE' 'MCO' 'MHL' 'NRU' 'PLW'
 'PNG' 'PRK' 'SLB' 'SMR' 'SOM' 'SSD' 'TLS' 'TON' 'TUV' 'VUT' 'WSM' 'XKO']


#### Attaching WB WDI (PPP terms, not nominal)

In [61]:
# attaching WB WDI per capita GDP (`wb_rgdpna_pc`) as the next auxiliary source
y_df = ypk_fn.smooth_fill_aux_source(
    y_df, gdp_pop_df, "wb_rgdpna_pc", "WB", "PWT/FAMB + WB_gr"
)

In [62]:
# manually fixing source information: for CUW the generic "PWT/FAMB" prefix is
# replaced by "PWT"
y_df.loc[
    (y_df.gdppc_source == "PWT/FAMB + WB_gr")
    & (y_df.index.get_level_values("ccode") == "CUW"),
    "gdppc_source",
] = "PWT + WB_gr"

# for the other affected codes, the "PWT/FAMB" part of the 2020 label is replaced
# by the 2019 source label of the same country
gr_codes = (
    y_df.loc[y_df.gdppc_source == "PWT/FAMB + WB_gr", :].reset_index().ccode.unique()
)
for i in gr_codes:
    prev = y_df.loc[(i, 2019), "gdppc_source"]
    y_df.loc[(i, 2020), "gdppc_source"] = y_df.loc[(i, 2020), "gdppc_source"].replace(
        "PWT/FAMB", prev
    )

#### Attaching MPD

We convert constant 2011 PPP USD  values in MPD to constant 2017 PPP USD (using PWT 10.0 information), and apply the neutral assumption when PPP conversion rates are missing.

In our workflow, there are no country-years using MPD information directly since all independent countries in MPD have been considered in Fariss et al. (2022), where `rgdpna_pc`-equivalent values can be acquired from.

In [63]:
# from constant 2011 PPP USD to constant 2017 PPP USD
ppp_to_2017 = ypk_fn.ppp_conversion_specific_year(
    2017, sset.PATH_PWT_RAW, extrap_sim=True, fill_msng_ctries=sset.PPP_CCODE_IF_MSNG
)
# one conversion rate per country: the 2011 row of the conversion table
ppp_11_to_17 = ppp_to_2017.loc[(slice(None), 2011), :].reset_index()
ppp_11_to_17.set_index(["ccode"], inplace=True)
mpd_df = gdp_pop_df[["mpd_rgdpna_pc"]].join(ppp_11_to_17.conv, how="left")

# neutral assumption: a conversion rate of 1 is used only where the rate is missing;
# rates that are available must be left untouched
mpd_df["conv"] = mpd_df["conv"].fillna(1)
mpd_df["mpd_rgdpna_pc_17"] = (
    mpd_df["mpd_rgdpna_pc"].mul(mpd_df["conv"]) / defla.loc[2011, "pl_gdpo"]
)

y_df = ypk_fn.smooth_fill_aux_source(
    y_df, mpd_df, "mpd_rgdpna_pc_17", "MPD", "PWT/FAMB/WB + MPD_gr"
)

# number of country-years whose value comes from MPD (directly or via growth rates)
mpd_n = y_df.loc[y_df.gdppc_source.isin(["MPD", "PWT/FAMB/WB + MPD_gr"]), :].shape[0]
print(f"Country-years using MPD information directly are {mpd_n}.")

Fetching information from PWT...


  0%|          | 0/128 [00:00<?, ?it/s]

Filling in the missing countries...
...done
Country-years using MPD information directly are 0.


#### Attaching IMF WEO

In [64]:
# attaching IMF WEO per capita GDP (`imf_rgdpna_pc`) as the next auxiliary source
y_df = ypk_fn.smooth_fill_aux_source(
    y_df, gdp_pop_df, "imf_rgdpna_pc", "IMF", "PWT/FAMB/WB + IMF_gr"
)

In [65]:
# manually fixing source information: for every country with an IMF growth-rate
# fill, the 2020 label becomes the country's own 2019 source label + " + IMF_gr"
is_imf_gr = y_df.gdppc_source == "PWT/FAMB/WB + IMF_gr"
imf_gr_ccodes = (
    y_df.loc[is_imf_gr].index.get_level_values("ccode").unique().to_numpy()
)
for i in imf_gr_ccodes:
    prev = y_df.loc[(i, 2019), "gdppc_source"]
    y_df.loc[(i, 2020), "gdppc_source"] = f"{prev} + IMF_gr"

### Using secondary sources that are not necessarily in constant PPP terms

#### Individual countries requiring attention

**Copy `NOR` to fill in `SJM`**

`SJM` has no reliable information (there is some information in CIA WFB, but cannot be used).

In [66]:
# SJM takes NOR's per capita GDP year by year; `.values` strips the NOR index so the
# assignment is positional rather than aligned on (ccode, year)
y_df.loc[("SJM", sset.HISTORICAL_YEARS), "rgdpna_pc"] = y_df.loc[
    ("NOR", sset.HISTORICAL_YEARS), "rgdpna_pc"
].values
y_df.loc[("SJM", sset.HISTORICAL_YEARS), "gdppc_source"] = "copy_NOR"

**North Korea (`PRK`), 2019-2020 GDP growth rates from the Bank of Korea**

In [67]:
# 2019-2020 values are missing, and we use 2018-19, 2019-20 growth rates
# NOTE(review): only the 2020 value is computed here, from the 2019 value and the
# 2020 growth rate -- confirm whether 2019 is filled elsewhere
# 2020 GDPpc = (2019 GDPpc x 2019 pop) x (1 + growth in percent / 100) / 2020 pop
y_df.loc[("PRK", 2020), "rgdpna_pc"] = (
    y_df.loc[("PRK", 2019), "rgdpna_pc"]
    * pop_cleaned.loc[("PRK", 2019), "pop"]
    * (gdp_pop_df.loc[("PRK", 2020), "bok_prk_real_gdp_gr"] / 100 + 1)
    / pop_cleaned.loc[("PRK", 2020), "pop"]
)
y_df.loc[("PRK", 2020), "gdppc_source"] = "FAMB + BOK_gr"

#### Attaching WB WDI in non-PPP constant 2017 dollars

Note that we will not copy these values for missing country-years but use them with `ypk_fn.smooth_fill` to utilize their growth rates.

In [68]:
# using WB WDI nominal values
y_df = ypk_fn.smooth_fill_aux_source(
    y_df, gdp_pop_df, "wb_gdppc_nom", "WB(not_PPP)", "FAMB + WB(not_PPP)_gr"
)

# making sure we don't copy these non-PPP values: rows labelled as coming straight
# from this source are blanked again (value and label)
y_df.loc[(y_df.gdppc_source == "WB(not_PPP)"), ["rgdpna_pc", "gdppc_source"]] = np.nan

#### Attaching UN AMA in non-PPP constant 2017 dollars

Note that we will not copy these values for missing country-years but use them with `ypk_fn.smooth_fill` to utilize their growth rates.

In [69]:
# attaching the deflated UN per capita series (`un_gdppc_const`)
y_df = ypk_fn.smooth_fill_aux_source(
    y_df, gdp_pop_df, "un_gdppc_const", "UN(not_PPP)", "PWT + UN(not_PPP)_gr"
)

# making sure we don't copy these non-PPP values
y_df.loc[(y_df.gdppc_source == "UN(not_PPP)"), ["rgdpna_pc", "gdppc_source"]] = np.nan

# manual clean-up for source: for LIE the label prefix is "FAMB" rather than "PWT"
y_df.loc[
    (y_df.index.get_level_values("ccode") == "LIE")
    & (y_df.gdppc_source == "PWT + UN(not_PPP)_gr"),
    "gdppc_source",
] = "FAMB + UN(not_PPP)_gr"

### Taking care of uninhabited regions

We will leave `UMI`'s 2000 value as `np.nan` for now as it is technically inhabited in 2000.

In [70]:
# drop any existing rows for the uninhabited codes before rebuilding them
y_df = y_df.loc[
    ~y_df.index.get_level_values("ccode").isin(sset.UNINHABITED_ISOS), :
].copy()

# one block of rows per uninhabited code: the USA rows provide the set of years,
# with the value set to 0 and the source set to "uninhabited"
uninhabited = []
for i in list(sset.UNINHABITED_ISOS):
    i_df = y_df.loc["USA", :].reset_index()
    i_df["ccode"], i_df["rgdpna_pc"], i_df["gdppc_source"] = i, 0, "uninhabited"
    i_df.set_index(["ccode", "year"], inplace=True)
    uninhabited.append(i_df)

y_df = pd.concat([y_df] + uninhabited, axis=0).sort_index()

# cleaning UMI: the 2000 value is blanked
# NOTE(review): the 2000 source label is left as "uninhabited" by this cell, and the
# second assignment re-applies a label the loop above already set -- confirm intent
y_df.loc[("UMI", 2000), "rgdpna_pc"] = np.nan
y_df.loc[
    ("UMI", [x for x in sset.HISTORICAL_YEARS if x != 2000]), "gdppc_source"
] = "uninhabited"

### Territories with no info at all

IOT has just US and British military personnel, will assign GBR gdppc

In [71]:
def adjust_terrs(y_df, terr, sov):
    """Overwrite the rows of territory `terr` with a copy of the rows of `sov`.

    Parameters
    ----------
    y_df : pandas.DataFrame
        Frame indexed by (`ccode`, `year`) that has a `gdppc_source` column. It is
        modified in place; rows for `terr` are expected to exist already.
    terr : str
        `ccode` of the territory whose rows are overwritten.
    sov : str
        `ccode` of the country whose rows are copied.

    Returns
    -------
    pandas.DataFrame
        The same `y_df` object, with the `terr` rows carrying the `sov` values and
        `gdppc_source` set to ``f"copy_{sov}"``.
    """
    sov_rows = y_df.loc[[sov]].reset_index()
    # relabel the copied rows so that they index as `terr` and record their origin
    relabeled = sov_rows.assign(gdppc_source=f"copy_{sov}", ccode=terr)
    y_df.loc[[terr]] = relabeled.set_index(["ccode", "year"])
    return y_df


# territory -> country whose GDPpc series is copied for it
copy_from = {"IOT": "GBR", "XPI": "CHN", "XAD": "CYP"}
for terr, sov in copy_from.items():
    y_df = adjust_terrs(y_df, terr, sov)

### Current and former territories and disputed regions

For convenience, we will call
- **Group X**: countries or regions that are current or former sovereigns, claimant, or nearby metropolitan country to those in **Group Y** (e.g., `GBR`)
- **Group Y**: countries or regions that are current or former territories, disputed areas, or nearby small state to those in **Group X** (e.g., `GGY`)

We exclude `SJM` (and its relationship to `NOR`) since there is not any reliable GDPpc information; there is *some* information from CIA WFB, but using this would overstate `SJM`'s GDPpc (to be something in the order of millions of USD). For this reason, we will copy `NOR` GDPpc for `SJM` until a more reliable data source is found. 

#### Organizing the relationships between Groups X and Y

In general, we assign country-to-sovereign ratios rather than ratios between similar countries, based on the argument in [**Bertram (World Development, 2004)**](https://www.sciencedirect.com/science/article/abs/pii/S0305750X03002134) that territories (island economies in the paper, to be more specific) seem to converge to trend with their metropolitan patrons more so than with similar territory (island) economies.

Following relationships in `GROUPS_X_Y` are considered.

In [72]:
# Local alias for the Group X / Group Y relationship table kept in `ypk_fn`; the cells
# below look it up by `ccode` and read its `x_ccode`, `disputed`, and `indep_yr` columns.
GROUPS_X_Y = ypk_fn.GROUPS_X_Y

#### Cleaning Mainland France and 5 Overseas Departments

We have previously recognized that major sources like PWT 10.0, WB WDI, and Fariss et al. (2022) have the information of France as Mainland France plus the 5 overseas departments; we will separate them here.

Based on our investigation, there seems to be a conversion rate between the PWT 10.0 `rgdpna_pc` values and OECD `oecd_rgdpna_pc` values for Mainland France + 5 overseas departments. We find that rate by minimizing the SSE and apply those so that OECD GDP values can be turned into PWT `rgdpna`-equivalents.

In [73]:
# Split the aggregate "mainland France + 5 overseas departments" (FRA+OV) series into
# per-capita series for the overseas departments (`ov_gdppc`) and for mainland France
# (`fra_gdppc`); `fra_all_y` stacks the aggregate and both pieces.

# getting the entire stream of FRA + Overseas Dept.s (FRA+OV)
fra_ov = gdp_pop_df.loc[
    (sset.FRA_OVERSEAS_DEPT, range(1950, 2021)), ["rgdpna_pc", "pop", "rgdpna"]
]

# population of 2020 for FRA+OV is extrapolated simply using WB population
# (2019 level scaled by the 2019-to-2020 change in `wb_pop`)
fra_ov.loc[(sset.FRA_OVERSEAS_DEPT, 2020), "pop"] = (
    fra_ov.loc[(sset.FRA_OVERSEAS_DEPT, 2019), "pop"]
    * gdp_pop_df.loc[(sset.FRA_OVERSEAS_DEPT, 2020), "wb_pop"]
    / gdp_pop_df.loc[(sset.FRA_OVERSEAS_DEPT, 2019), "wb_pop"]
)

# take the FRA+OV 2020 GDPpc that we calculated above
fra_ov.loc[(sset.FRA_OVERSEAS_DEPT, 2020), "rgdpna_pc"] = y_df.loc[
    (sset.FRA_OVERSEAS_DEPT, 2020), "rgdpna_pc"
].item()

# imputing 2020 rgdpna of FRA+OV via rgdpna_pc * pop
fra_ov.loc[(sset.FRA_OVERSEAS_DEPT, 2020), "rgdpna"] = (
    fra_ov.loc[(sset.FRA_OVERSEAS_DEPT, 2020), "rgdpna_pc"]
    * fra_ov.loc[(sset.FRA_OVERSEAS_DEPT, 2020), "pop"]
)
# every year is labelled "PWT" except the imputed 2020 row
fra_ov["gdppc_source"] = "PWT"
fra_ov.loc[(sset.FRA_OVERSEAS_DEPT, 2020), "gdppc_source"] = "PWT + WB_gr"

# set up dataset for regression: OECD columns for all ccodes (rows with no OECD value
# at all are dropped), then FRA+OV OECD values matched by year with the PWT-based ones
oecd_fra_ov = gdp_pop_df[["oecd_rgdpna_pc", "oecd_rgdpna", "oecd_pop"]].dropna(
    how="all"
)
reg_df = oecd_fra_ov.loc[sset.FRA_OVERSEAS_DEPT].join(
    fra_ov.droplevel("ccode"), how="inner"
)

# get coefficient for x-walk: OLS of PWT GDPpc on OECD GDPpc without an intercept
# (the `- 1` in the formula), i.e. a single multiplicative conversion rate
coef = ols("rgdpna_pc ~ oecd_rgdpna_pc - 1", data=reg_df).fit().params["oecd_rgdpna_pc"]

# predict PWT-consistent GDP pc for every ccode with OECD GDPpc other than FRA+OV
ov_dept = (
    (
        oecd_fra_ov.drop(sset.FRA_OVERSEAS_DEPT, level="ccode").oecd_rgdpna_pc.dropna()
        * coef
    )
    .rename("rgdpna_pc")
    .to_frame()
    .assign(gdppc_source="OECD + PWT-OECD_ratio")
)

# smooth-fill with france growth rates
# NOTE(review): `src_da` is read from `gdp_pop_df`, not from `fra_ov`, so the 2020
# value imputed above is only part of it if `gdp_pop_df` already holds one - confirm.
ov_da = ov_dept.rgdpna_pc.to_xarray().reindex(year=sset.HISTORICAL_YEARS)
src_da = (
    gdp_pop_df.loc[sset.FRA_OVERSEAS_DEPT, "rgdpna_pc"]
    .to_xarray()
    .reindex(year=sset.HISTORICAL_YEARS)
    .expand_dims(ccode=ov_da.ccode)
)
# country-years without a direct OECD-based value get the "FRA-interp" source label
ov_gdppc = (
    ypk_fn.smooth_fill(ov_da, src_da, time_dim="year", other_dim="ccode")
    .to_series()
    .to_frame()
    .join(ov_dept.gdppc_source, how="outer")
    .fillna({"gdppc_source": "OECD + PWT-OECD_ratio + FRA-interp"})
)

# mainland France GDPpc over `sset.HISTORICAL_YEARS`, taken from the IMF column; it is
# also used to fill the missing `rgdpna_pc` entries of `gdp_pop_df` for convenience
fra_gdppc = (
    gdp_pop_df.loc["FRA", "imf_rgdpna_pc"]
    .loc[sset.HISTORICAL_YEARS]
    .rename("rgdpna_pc")
    .reset_index()
    .assign(ccode="FRA", gdppc_source="IMF")
    .set_index(["ccode", "year"])
)
gdp_pop_df["rgdpna_pc"] = gdp_pop_df["rgdpna_pc"].fillna(fra_gdppc.rgdpna_pc)

# stack: FRA+OV aggregate, mainland France, and the individual overseas departments
fra_all_y = pd.concat([fra_ov[["rgdpna_pc", "gdppc_source"]], fra_gdppc, ov_gdppc])

In [74]:
# fill only the still-missing entries of `y_df` with the overseas-department estimates;
# values already present in `y_df` are kept
y_df = y_df.fillna(ov_gdppc)

#### Åland (`ALA`) from Åland Statistics

We use the ratio of Åland to Finland GDPpc (from Åland Statistics) with Finland constant PPP GDPpc to get `ALA` constant PPP GDPpc.

In [75]:
# retrieving 1995-2019 ALA-to-FIN ratio
aland = gdp_pop_df.loc["ALA", "aland_cgdpo_pc"].dropna()
aland_fin_src = gdp_pop_df.loc["FIN", "aland_cgdpo_pc"].dropna()

# The ratio below is an element-by-element division of raw arrays, so the two series
# must cover exactly the same years; comparing lengths alone would let two equally
# long but differently dated series through and silently pair the wrong years.
assert aland.index.equals(
    aland_fin_src.index
), "ALA and FIN `aland_cgdpo_pc` series must cover the same years"
aland_val, aland_yrs = aland.values, aland.reset_index().year.values
fin_val = aland_fin_src.values

# attaching FIN rgdpna_pc values to get ALA rgdpna_pc values
aland_fin = (
    pd.DataFrame(
        data={
            "ccode": ["ALA"] * len(fin_val),
            "ratio": aland_val / fin_val,
            "year": aland_yrs,
        }
    )
    .set_index(["ccode", "year"])
    .join(gdp_pop_df.loc["FIN", ["rgdpna", "pop"]], how="left")
)
# FIN GDPpc (Group X value) and the implied ALA GDPpc = FIN GDPpc * ALA/FIN ratio
aland_fin["x_rgdpna_pc"] = aland_fin["rgdpna"].div(aland_fin["pop"])
aland_fin["rgdpna_pc"] = aland_fin[["x_rgdpna_pc", "ratio"]].prod(axis=1)
aland_fin["x_ccode"] = "FIN"
aland_fin["gdppc_source"] = "Aland-GOV:ALA/FIN_ratio + PWT:FIN_prod"

# common column layout of every Group Y table built in the following cells
terr_columns = ["x_ccode", "x_rgdpna_pc", "rgdpna_pc", "ratio", "gdppc_source"]
aland_fin = aland_fin[terr_columns]

#### `BES` from Statistics Netherlands

We use the ratio of BES islands (from Statistics Netherlands) to `NLD` (from UN AMA) current, non-PPP GDPpc to get `BES` constant PPP GDPpc.

In [76]:
# BES nominal GDP joined, by year, with NLD nominal GDPpc (`un_gdppc_nom`) and with the
# BES population
bes = (
    gdp_pop_df.loc[["BES"], ["bes_gov_gdp_nom"]]
    .dropna()
    .join(gdp_pop_df.loc["NLD", "un_gdppc_nom"])
    .join(pop_cleaned.loc["BES", "pop"])
)
# ratio = BES GDP / (NLD GDPpc * BES pop), i.e. the BES-to-NLD nominal GDPpc ratio
bes["ratio"] = bes["bes_gov_gdp_nom"].div(bes[["un_gdppc_nom", "pop"]].prod(axis=1))
bes["x_ccode"] = "NLD"
# BES GDPpc = NLD `rgdpna_pc` * ratio
# NOTE(review): `prod(axis=1)` skips NaN by default, so a row with one factor missing
# gets the other factor instead of NaN - confirm this is intended.
bes = bes.join(gdp_pop_df.loc["NLD", "rgdpna_pc"].rename("x_rgdpna_pc"))
bes["rgdpna_pc"] = bes[["x_rgdpna_pc", "ratio"]].prod(axis=1)
bes["gdppc_source"] = "(NLD-GOV:BES + UN(not_PPP):NLD_ratio) + PWT:NLD_prod"
bes = bes[terr_columns]

# merging: this starts the running Group Y table (`group_y`) extended by later cells
group_y = pd.concat([aland_fin, bes])

#### Saint Barthélemy from CEROM (current, non-PPP GDPpc)

We use the ratio of `BLM` (from CEROM) to `FRA` with 5 Overseas Department (not PPP, from WB WDI), then multiply this with PWT 10.0 values of `FRA` with 5 Overseas Department.

In [77]:
# turn these values to constant 2017, non-PPP USD
# (CEROM nominal GDPpc divided by the `pl_gdpo` column of `defla`)
blm = (
    gdp_pop_df.loc[(["BLM"], slice(None)), ["cerom_gdppc_nom"]]
    .dropna()
    .join(defla, how="left")
)
blm["cerom_const_gdppc"] = blm["cerom_gdppc_nom"].div(blm["pl_gdpo"])

# creating ratios between BLM and FRA+, among constant 2017 non-PPP USD values
# (FRA+OV rows are restricted to the BLM years and re-indexed by year for the join)
blm = blm.join(
    gdp_pop_df.loc[
        (sset.FRA_OVERSEAS_DEPT, blm.reset_index()["year"].values),
        ["wb_gdppc_nom", "rgdpna", "pop"],
    ]
    .reset_index()
    .drop(["ccode"], axis=1)
    .set_index(["year"])
)
blm["ratio"] = blm["cerom_const_gdppc"].div(blm["wb_gdppc_nom"])

# multiplying the ratio to PPP values for FRA+ to get PPP equivalent for BLM
blm["x_rgdpna_pc"] = blm["rgdpna"].div(blm["pop"])
blm["x_ccode"] = sset.FRA_OVERSEAS_DEPT
blm["rgdpna_pc"] = blm[["x_rgdpna_pc", "ratio"]].prod(axis=1)
blm["gdppc_source"] = "(CEROM:BLM + WB(not_PPP):FRA+OV)_ratio + PWT:FRA+OV_prod"
blm = blm[terr_columns]

# merging
group_y = pd.concat([group_y, blm])

#### Cocos (Keeling) Islands (`CCK`) and Christmas Island (`CXR`) from Australian Parliament (2010 current, non-PPP GDP)

Similarly, we use the ratio of `CCK` or `CXR` (from Australian Parliament) to `AUS` (not PPP, from WB WDI), then multiply this with PWT 10.0 values of `AUS`.

In [78]:
# changing from 2010 US dollars to 2017 US dollars
# (2010 GDP per person, divided by the 2010 `pl_gdpo` value of `defla`)
cck_cxr = gdp_pop_df.loc[(["CCK", "CXR"], [2010]), ["aus_parl_gdp_nom"]].join(
    pop_cleaned["pop"], how="left"
)
cck_cxr["aus_parl_gdppc_nom"] = (
    cck_cxr["aus_parl_gdp_nom"].div(cck_cxr["pop"]) / defla.loc[2010, "pl_gdpo"]
)
cck_cxr.drop(["pop"], inplace=True, axis=1)

# attaching AUS values: `rgdpna_pc` from `y_df` and the WB non-PPP GDPpc
cck_cxr = cck_cxr.join(y_df.loc["AUS", "rgdpna_pc"].rename("x_rgdpna_pc")).join(
    gdp_pop_df.loc["AUS", "wb_gdppc_nom"]
)
# territory-to-AUS ratio in non-PPP terms, applied to the AUS `rgdpna_pc`
cck_cxr["ratio"] = cck_cxr["aus_parl_gdppc_nom"] / cck_cxr["wb_gdppc_nom"]
cck_cxr["rgdpna_pc"] = cck_cxr[["x_rgdpna_pc", "ratio"]].prod(axis=1)
cck_cxr["x_ccode"] = "AUS"
# same source label for both territories, with the territory code swapped for CXR
cck_source = "(AUS-GOV:CCK + WB(not_PPP):AUS)_ratio + PWT:AUS_prod"
cck_cxr["gdppc_source"] = cck_source
cck_cxr.loc["CXR", "gdppc_source"] = cck_source.replace("CCK", "CXR")
cck_cxr = cck_cxr[terr_columns]

# merging
group_y = pd.concat([group_y, cck_cxr])

#### Cook Islands (`COK`) and Niue (`NIU`) from Asian Development Bank (current, non-PPP GDP)

We use the ratio of `COK` or `NIU` to `NZL` (both from ADB and current, non-PPP GDP), then multiply this with PWT 10.0 + WB WDI values of `NZL`.

In [79]:
# Per-capita GDP rows of the ADB workbook, one sheet per country, stacked into a single
# Series indexed by (ccode, year). "..." and "…" cells are read as missing.
info = pd.concat(
    [
        pd.read_excel(
            sset.PATH_ADB_RAW,
            sheet_name=v,
            na_values=["...", "…"],
            skiprows=list(range(0, 6)),
            usecols="B:W",
            index_col=0,
        )
        .filter(like="Per capita GDP", axis=0)
        .T.rename(columns=lambda x: "gdppc")
        .rename_axis("year")
        .reset_index()
        .assign(ccode=k)
        .set_index(["ccode", "year"])
        .gdppc
        for k, v in [("COK", "coo"), ("NIU", "niu"), ("NZL", "nzl")]
    ]
)

# COK/NZL and NIU/NZL ratios by year. Gaps are forward-filled *within each country*:
# a plain `.ffill()` on the stacked series would carry the last COK ratio into any
# leading gap of NIU. Leading gaps therefore stay missing instead of borrowing another
# country's ratio.
ratios = (info.drop("NZL") / info.loc["NZL"]).groupby(level="ccode").ffill()

# apply the ratios to the NZL `rgdpna_pc` series of `y_df`
cok_niu_adb = (
    (ratios * y_df.loc["NZL", "rgdpna_pc"])
    .to_frame("rgdpna_pc")
    .assign(gdppc_source="ADB+NZL_ratio + PWT:NZL_prod", x_ccode="NZL")
)

In [80]:
# append the ADB-based COK/NIU estimates to the running Group Y table; they carry only
# `rgdpna_pc`, `gdppc_source`, and `x_ccode`, so the other columns are NaN for them
group_y = pd.concat([group_y, cok_niu_adb])

#### Falkland (`FLK`), Gibraltar (`GIB`), Guernsey (`GGY`), and Jersey (`JEY`) from their government reports (all non-PPP)

In [81]:
# GBR rgdpna_pc for multiplying with ratio
gbr_info = y_df.loc["GBR", "rgdpna_pc"].rename("x_rgdpna_pc")
# GBR nominal GDPpc (`un_gdppc_nom`), the denominator of the ratios below
gbr_nom_curr = gdp_pop_df.loc["GBR", "un_gdppc_nom"]

# FLK, GIB, and GGY
# FLK reports total GDP, so it is divided by population; GIB already reports GDPpc
flk = gdp_pop_df.loc[(["FLK"], slice(None)), ["flk_gov_gdp_curr"]].join(
    pop_cleaned["pop"]
)
flk["gdppc_curr"] = flk["flk_gov_gdp_curr"].div(flk["pop"])
gib = gdp_pop_df.loc[["GIB"], "gib_gov_gdppc_curr"].rename("gdppc_curr").to_frame()

# GGY requires smooth_fill
# both GGY GDP series are turned into per-capita columns named `gdppc_curr` and
# `gdppc_alt`, which are then passed to `ypk_fn.smooth_fill`
ggy_co = ["ggy_gov_gdp_curr", "ggy_gov_gdp_alt"]
ggy = gdp_pop_df.loc[["GGY"], ggy_co].dropna(how="all").join(pop_cleaned["pop"])
for i in ggy_co:
    ggy[i.replace("ggy_gov_", "").replace("gdp", "gdppc")] = ggy[i].div(ggy["pop"])
ggy = xr.Dataset.from_dataframe(ggy[["gdppc_curr", "gdppc_alt"]])
ggy = ypk_fn.smooth_fill(
    ggy["gdppc_curr"], ggy["gdppc_alt"], other_dim="ccode", time_dim="year"
).to_dataframe()

# cleaning with UN nominal, current GDPpc
# `fggj` lists the codes in the same order as the frames looped over below (JEY is
# handled in the next cell); `fggj_source` is the source-label template
flk_gib_ggy, fggj = [], ["FLK", "GIB", "GGY", "JEY"]
fggj_source = "({}-GOV + UN(not_PPP):GBR)_ratio + PWT:GBR"
for k, i in enumerate([flk, gib, ggy]):
    # territory-to-GBR nominal ratio, applied to the GBR `rgdpna_pc`
    i = i[["gdppc_curr"]].dropna().join(gbr_nom_curr).join(gbr_info)
    i["ratio"] = i["gdppc_curr"].div(i["un_gdppc_nom"])
    i["rgdpna_pc"] = i[["x_rgdpna_pc", "ratio"]].prod(axis=1)
    sor = fggj_source.format(fggj[k])
    i["gdppc_source"], i["x_ccode"] = sor, "GBR"
    # the 2020 row, when present, gets a "PWT+WB_gr:" label instead of "PWT:"
    if 2020 in i.reset_index()["year"].unique():
        i.loc[(fggj[k], 2020), "gdppc_source"] = sor.replace("PWT:", "PWT+WB_gr:")
    flk_gib_ggy.append(i)
flk_gib_ggy = pd.concat(flk_gib_ggy)[terr_columns]

In [82]:
# JEY
# JEY reports total GDP (`jey_gov_gdp_const`), so it is divided by population
jey = gdp_pop_df.loc[["JEY"], ["jey_gov_gdp_const"]].dropna().join(pop_cleaned["pop"])
jey["gdppc_const"] = jey["jey_gov_gdp_const"].div(jey["pop"])
# attach GBR `rgdpna_pc` (from `y_df`) and the GBR `wb_gdppc_nom` series
jey = jey.join(y_df.loc["GBR", "rgdpna_pc"].rename("x_rgdpna_pc")).join(
    gdp_pop_df.loc["GBR", "wb_gdppc_nom"]
)
# NOTE(review): the ratio is taken against the WB series (`wb_gdppc_nom`), while the
# source label built from `fggj_source` says "UN(not_PPP):GBR" - confirm which is meant.
jey["ratio"], jey["x_ccode"] = jey["gdppc_const"].div(jey["wb_gdppc_nom"]), "GBR"
jey["rgdpna_pc"] = jey[["x_rgdpna_pc", "ratio"]].prod(axis=1)
jey["gdppc_source"] = fggj_source.format("JEY")
# the 2020 row gets a "PWT+WB_gr:" label instead of "PWT:"
jey.loc[("JEY", 2020), "gdppc_source"] = fggj_source.format("JEY").replace(
    "PWT:", "PWT+WB_gr:"
)

# merging with the rest
# (from here on `fggj` is the combined FLK/GIB/GGY/JEY table, no longer a code list)
fggj = pd.concat([flk_gib_ggy, jey[terr_columns]])

# merging
group_y = pd.concat([group_y, fggj])

#### Norfolk Island (`NFK`) information from Treadgold reports

Ratios from Treadgold reports have already been cleaned with Australia `rgdpna_pc` values from PWT.

In [83]:
# NFK values from the Treadgold reports are already expressed as `rgdpna_pc`; they are
# paired by year with the AUS `rgdpna_pc` of `gdp_pop_df` so that the implied NFK/AUS
# ratio can be recorded alongside.
nfk_own = gdp_pop_df.loc[["NFK"], ["treadgold_rgdpna_pc"]].dropna()
aus_pwt = gdp_pop_df.loc["AUS", "rgdpna_pc"].rename("x_rgdpna_pc")
nfk = nfk_own.rename(columns={"treadgold_rgdpna_pc": "rgdpna_pc"}).join(aus_pwt)
nfk = nfk.assign(
    ratio=nfk["rgdpna_pc"] / nfk["x_rgdpna_pc"],
    x_ccode="AUS",
    gdppc_source="Treadgold_ratio + PWT:AUS_prod",
)[terr_columns]

# append to the running Group Y table
group_y = pd.concat([group_y, nfk])

#### Pitcairn (`PCN`) and St. Helena (`SHN`) information from their respective Governments

In [84]:
# GBR nominal GDPpc (ratio denominator) and GBR `rgdpna_pc` (Group X value)
gbr_info = gdp_pop_df.loc["GBR", ["un_gdppc_nom", "rgdpna_pc"]]
gbr_for_join = gbr_info.rename(columns={"rgdpna_pc": "x_rgdpna_pc"})

# PCN: the government figure is total nominal GDP, so divide by population first
pcn = (
    gdp_pop_df.loc[["PCN"], ["pcn_gov_gdp_nom"]]
    .dropna()
    .join(gbr_for_join)
    .join(pop_cleaned.loc["PCN", "pop"])
)
pcn["gdppc_nom"] = pcn["pcn_gov_gdp_nom"] / pcn["pop"]
pcn["x_ccode"] = "GBR"
pcn["ratio"] = pcn["gdppc_nom"] / pcn["un_gdppc_nom"]
pcn["rgdpna_pc"] = pcn[["x_rgdpna_pc", "ratio"]].prod(axis=1)
pcn["gdppc_source"] = "(PCN-GOV + UN(not_PPP):GBR)_ratio + PWT:GBR"
pcn = pcn[terr_columns]

# SHN: the government figure is already nominal GDP per capita
shn = (
    gdp_pop_df.loc[["SHN"], ["st_helena_gov_gdppc_nom"]].dropna().join(gbr_for_join)
)
shn["ratio"] = shn["st_helena_gov_gdppc_nom"] / shn["un_gdppc_nom"]
shn["rgdpna_pc"] = shn[["x_rgdpna_pc", "ratio"]].prod(axis=1)
shn["x_ccode"] = "GBR"
shn["gdppc_source"] = "(SHN-GOV + UN(not_PPP):GBR)_ratio + PWT:GBR"
shn = shn[terr_columns]

pcn_shn = pd.concat([pcn, shn], axis=0)

# append to the running Group Y table
group_y = pd.concat([group_y, pcn_shn])

#### Vatican (`VAT`) information from Olga Kuznetsova (2002)

From the Worldmark Encyclopedia of National Economies, link [here](https://www.researchgate.net/publication/313448599_Vatican_City); see p.6 for the nominal GDP (21 million) and the nominal GDPpc (21198), both in nominal USD for 1999.

In [85]:
# ITA nominal GDPpc (ratio denominator) and ITA `rgdpna_pc` (Group X value)
ita_info = gdp_pop_df.loc["ITA", ["un_gdppc_nom", "rgdpna_pc"]].rename(
    columns={"rgdpna_pc": "x_rgdpna_pc"}
)

# single observation: VAT nominal GDPpc of 21198 in 1999
vat = (
    pd.DataFrame(data={"ccode": ["VAT"], "year": [1999], "gdppc_nom": 21198})
    .set_index(["ccode", "year"])
    .join(ita_info)
)
vat["ratio"] = vat["gdppc_nom"] / vat["un_gdppc_nom"]
vat["rgdpna_pc"] = vat[["x_rgdpna_pc", "ratio"]].prod(axis=1)
vat["x_ccode"] = "ITA"
vat["gdppc_source"] = "(Kuznetsova:VAT + UN(not_PPP):ITA)_ratio + PWT:ITA"
vat = vat[terr_columns]

# append to the running Group Y table
group_y = pd.concat([group_y, vat])

#### Group Y countries having information in WB WDI (as nominal GDPpc)

In [86]:
# Group Y codes that also have at least one WB WDI nominal GDPpc value
wb_y_candidates = GROUPS_X_Y.reset_index()["ccode"].unique()
wb_covered = gdp_pop_df["wb_gdppc_nom"].dropna().reset_index()["ccode"].unique()
wb_group_y_ctries = np.intersect1d(wb_y_candidates, wb_covered)

# Matching Group X code for each of them; "FRA" is swapped for the FRA+Overseas Dept
# code because that is what France represents in WB WDI.
wb_group_x_ctries = []
for y_code in wb_group_y_ctries:
    x_code = GROUPS_X_Y.loc[y_code, "x_ccode"]
    wb_group_x_ctries.append(sset.FRA_OVERSEAS_DEPT if x_code == "FRA" else x_code)

wb_yiso_df = ypk_fn.fill_ratio_nominal_gdppc(
    wb_group_y_ctries, wb_group_x_ctries, gdp_pop_df, y_df
)

#### Group Y countries having information in UN AMA (as nominal GDPpc)

In [87]:
# Group Y codes that also have at least one UN AMA nominal GDPpc value
un_y_candidates = GROUPS_X_Y.reset_index()["ccode"].unique()
un_covered = gdp_pop_df["un_gdppc_nom"].dropna().reset_index()["ccode"].unique()
un_group_y_ctries = np.intersect1d(un_y_candidates, un_covered)

# Matching Group X code for each of them; "FRA" is swapped for the FRA+Overseas Dept
# code because that is what France represents in UN AMA.
un_group_x_ctries = []
for y_code in un_group_y_ctries:
    x_code = GROUPS_X_Y.loc[y_code, "x_ccode"]
    un_group_x_ctries.append(sset.FRA_OVERSEAS_DEPT if x_code == "FRA" else x_code)

un_yiso_df = ypk_fn.fill_ratio_nominal_gdppc(
    un_group_y_ctries, un_group_x_ctries, gdp_pop_df, y_df, "un_gdppc_nom"
)

#### Group Y countries having information in CIA WFB

CIA WFB is sometimes unreliable, so further manual inspection was done before working with the below countries.

In [88]:
# Group Y codes that also have at least one CIA WFB value (`cia_rgdpna_pc`)
cia_group_y_ctries = np.intersect1d(
    GROUPS_X_Y.reset_index().ccode.unique(),
    gdp_pop_df["cia_rgdpna_pc"].dropna().reset_index().ccode.unique(),
)
# matching Group X codes
# NOTE(review): unlike the WB and UN cells, "FRA" is not swapped for the FRA+Overseas
# Dept code here - presumably intended for CIA WFB; confirm.
cia_group_x_ctries = [GROUPS_X_Y.loc[x, "x_ccode"] for x in cia_group_y_ctries]
cia_yiso_df = ypk_fn.fill_ratio_nominal_gdppc(
    cia_group_y_ctries, cia_group_x_ctries, gdp_pop_df, y_df, "cia_rgdpna_pc"
)

Let us gather `wb_yiso_df`, `un_yiso_df`, and `cia_yiso_df` together.

In [89]:
# columns shared by the WB-, UN-, and CIA-based tables; the UN and CIA copies get a
# suffix so that all three can sit side by side after the outer joins
wuc_cols = ["rgdpna_pc", "x_rgdpna_pc", "gdppc_source", "x_ccode"]
wuc_cols_un = {x: x + "_un" for x in wuc_cols}
wuc_cols_cia = {x: x + "_cia" for x in wuc_cols}
wuc_yiso_df = wb_yiso_df.join(
    un_yiso_df[wuc_cols].rename(columns=wuc_cols_un), how="outer"
).join(cia_yiso_df[wuc_cols].rename(columns=wuc_cols_cia), how="outer")

# Priority is WB, then UN, then CIA: each pass only fills entries that are still null,
# column by column.

# filling with UN nominal ratios
for k, v in wuc_cols_un.items():
    wuc_yiso_df.loc[pd.isnull(wuc_yiso_df[k]), k] = wuc_yiso_df.loc[
        pd.isnull(wuc_yiso_df[k]), v
    ]

# filling with CIA ratios
for k, v in wuc_cols_cia.items():
    wuc_yiso_df.loc[pd.isnull(wuc_yiso_df[k]), k] = wuc_yiso_df.loc[
        pd.isnull(wuc_yiso_df[k]), v
    ]

# dropping unnecessary columns
wuc_yiso_df.drop(
    list(wuc_cols_un.values()) + list(wuc_cols_cia.values()), axis=1, inplace=True
)

#### Gathering all information on Group Y countries

In [90]:
# Combine the individually built Group Y tables with the France overseas departments
# and, side by side with a "_wuc" suffix, the WB/UN/CIA-based table.
wuc_cols_df = {x: x + "_wuc" for x in wuc_cols}
group_y = pd.concat(
    (
        group_y,
        # add france overseas departments
        # (every code after the first in the "+"-separated FRA+OV string)
        fra_all_y.loc[sset.FRA_OVERSEAS_DEPT.split("+")[1:], :].assign(x_ccode="FRA"),
    )
).join(wuc_yiso_df[wuc_cols].rename(columns=wuc_cols_df), how="outer")
# values already in `group_y` take priority; the WB/UN/CIA values only fill nulls
for k, v in wuc_cols_df.items():
    group_y.loc[pd.isnull(group_y[k]), k] = group_y.loc[pd.isnull(group_y[k]), v]
group_y.drop(list(wuc_cols_df.values()), axis=1, inplace=True)

# ensure no missing
assert group_y[["x_ccode", "rgdpna_pc"]].notnull().all().all()

Since many of the country-years in `group_y` actually have PPP GDPpc information elsewhere, let us try to gather these.

In [91]:
# first and last historical year
st = sset.HISTORICAL_YEARS[0]
end = sset.HISTORICAL_YEARS[-1]

# Group Y codes that already have some PPP GDPpc value in `y_df`
ppp_y_codes = np.intersect1d(
    GROUPS_X_Y.reset_index().ccode.unique(),
    y_df["rgdpna_pc"].dropna().reset_index().ccode.unique(),
)
group_y_ppp = []
for i in ppp_y_codes:
    cnd = y_df.loc[(i, [st, end]), "rgdpna_pc"].isnull()
    # `cnd` is indexed by (ccode, year), so the two flags are read by position with
    # `.iloc`; plain integer keys (`cnd[0]`) rely on a positional fallback that pandas
    # has deprecated.
    st_missing, end_missing = bool(cnd.iloc[0]), bool(cnd.iloc[1])

    if st_missing:
        # no value in the first historical year: take the rows of `y_df` as they are
        i_df = y_df.loc[[i], :].copy()
    else:
        # otherwise re-read the series from the dataset named in the source labels
        # (WB first, then FAMB, PWT as the fallback)
        i_source = y_df.loc[i, "gdppc_source"].dropna().unique()
        if "WB" in i_source:
            i_df = gdp_pop_df.loc[[i], "wb_rgdpna_pc"].rename("rgdpna_pc").to_frame()
            i_df["gdppc_source"] = "WB"
        elif "FAMB" in i_source:
            i_df = (
                gdp_pop_df.loc[[i], "fariss_rgdpna_pc"].rename("rgdpna_pc").to_frame()
            )
            i_df["gdppc_source"] = "FAMB"
        else:
            i_df = gdp_pop_df.loc[[i], "rgdpna_pc"].to_frame()
            i_df["gdppc_source"] = "PWT"
    xiso = GROUPS_X_Y.loc[i, "x_ccode"]
    i_df["x_ccode"] = xiso

    # the last historical year always takes the value and label held in `y_df`
    if (not end_missing) and (not st_missing):
        i_df.loc[(i, end), "rgdpna_pc"] = y_df.loc[(i, end), "rgdpna_pc"]
        i_df.loc[(i, end), "gdppc_source"] = y_df.loc[(i, end), "gdppc_source"]

    # Group X values come from `gdp_pop_df`, except for the last historical year,
    # which is read from `y_df`
    i_df = i_df.join(gdp_pop_df.loc[xiso, "rgdpna_pc"].rename("x_rgdpna_pc"))
    if not end_missing:
        i_df.loc[(i, end), "x_rgdpna_pc"] = y_df.loc[(xiso, end), "rgdpna_pc"]

    group_y_ppp.append(i_df)

# keep only complete rows
group_y_ppp = pd.concat(group_y_ppp).dropna()

# merging with the rest: PPP-based values take priority, and the ratio-based
# `group_y` values (suffixed "_wuc") only fill what is still null
wuc_dict = {x: x + "_wuc" for x in wuc_cols}
group_y_all = group_y_ppp.join(group_y.rename(columns=wuc_dict), how="outer")
for k, v in wuc_dict.items():
    group_y_all.loc[pd.isnull(group_y_all[k]), k] = group_y_all.loc[
        pd.isnull(group_y_all[k]), v
    ]
group_y_all.drop(list(wuc_dict.values()), axis=1, inplace=True)

In [92]:
# For consistency, rows whose Group X is France (either "FRA" or the FRA+Overseas Dept
# code) are all re-attached to mainland France ("FRA") and its GDPpc from `fra_all_y`.
sov_is_france = group_y_all["x_ccode"].isin([sset.FRA_OVERSEAS_DEPT, "FRA"])
group_y_not_fraov = group_y_all.loc[~sov_is_france, :].copy()

fra_main_gdppc = fra_all_y.loc["FRA", "rgdpna_pc"].rename("x_rgdpna_pc")
group_y_fraov = (
    group_y_all.loc[sov_is_france, :]
    .drop(columns=["x_rgdpna_pc"])
    .join(fra_main_gdppc, how="left")
    .assign(x_ccode="FRA")
)

# Recombine, keep only rows where both GDPpc values are known, and recompute the
# Group Y to Group X ratio.
group_y_postfra = pd.concat([group_y_not_fraov, group_y_fraov])
both_known = group_y_postfra[["x_rgdpna_pc", "rgdpna_pc"]].notnull().all(axis=1)
group_y_postfra = group_y_postfra.loc[both_known, :].copy()
group_y_postfra["ratio"] = (
    group_y_postfra["rgdpna_pc"] / group_y_postfra["x_rgdpna_pc"]
)

#### Regression method from [Bertram (2004, World Development)](https://www.researchgate.net/publication/222432177_On_the_Convergence_of_Small_Island_Economies_with_Their_Metropolitan_Patrons)

We first organize the dataset for fitting the regression (as `bertram_prep` below).

In [93]:
# Regression dataset: Group Y rows with the `disputed` flag and independence year of
# each code attached, flattened to columns, and without the `ratio` column.
bertram_prep = (
    group_y_postfra.join(GROUPS_X_Y[["disputed", "indep_yr"]], how="left")
    .reset_index()
    .drop(columns=["ratio"])
)

# `indep` is 1 from the independence year onwards and 0 otherwise (including when no
# independence year is recorded); `grpx_x_indep` is its interaction with Group X GDPpc.
is_indep = bertram_prep["indep_yr"].le(bertram_prep["year"])
bertram_prep["indep"] = is_indep.astype("int64")
bertram_prep["grpx_x_indep"] = bertram_prep["indep"] * bertram_prep["x_rgdpna_pc"]

We use $k$-folds cross-validation (with $k=10$ in our case) to test which of the following specifications would be best.

Baseline specification that we test is (from Table 2 of Bertram 2004):
$$ y_{i, t} = \alpha + \beta_1 y_{sov(i), t} + \beta_2 IND_{i, t} + \varepsilon_{i, t} $$
where
- $y$: GDPpc in constant 2017 PPP USD terms
- $sov(i)$: former or current sovereignty of $i$ or country having territory disputes with $i$
- $IND_{i, t}$: whether $i$ is independent at time $t$
- $i$, $t$: country $i$ in Group Y and year $t$

Alternative specifications are combinations of doing the following:
1. Adding Group Y country fixed effects $\alpha_i$
2. Adding Group X country fixed effects $\gamma_{sov(i)}$
3. Adding decade fixed effects $\phi_{decade(t)}$ where $decade(t)$ is 10-year range that year $t$ belongs to (i.e., 1950-1959, 1960-1969, ..., 2000-2009, and 2010-2020)
4. Adding $DIS_{i}$ as a variable for whether $i$ has territorial disputes
5. Adding the interaction term $y_{sov(i), t}\times IND_{i, t}$

In [94]:
# creating decade variables: 0 for 1950-1959, 1 for 1960-1969, ...; 2020 is folded into
# the 2010s bin (6)
bertram_prep["decade"] = (bertram_prep["year"] - 1950) // 10
bertram_prep.loc[bertram_prep.year == 2020, "decade"] = 6

# creating dummies
# The dtype is pinned to unsigned integers (the pre-2.0 pandas default). Newer pandas
# returns boolean dummies, and a frame mixing boolean and float columns turns into an
# object array when handed to statsmodels.
deca_dum = pd.get_dummies(
    bertram_prep.decade, "dec_fe", drop_first=True, dtype=np.uint8
)
x_dum = pd.get_dummies(bertram_prep.x_ccode, "x_fe", drop_first=True, dtype=np.uint8)
y_dum = pd.get_dummies(bertram_prep.ccode, "y_fe", drop_first=True, dtype=np.uint8)
bertram_regdf = pd.concat([bertram_prep, deca_dum, x_dum, y_dum], axis=1)

# we will export the Bertram regression dataset for potential future uses
bertram_regdf.reset_index(drop=True, inplace=True)
save(bertram_regdf, sset.DIR_YPK_INT / "bertram_regression_data.parquet")

In [95]:
# Building blocks of the candidate specifications: the baseline regressors plus the
# three families of fixed-effect dummy columns found in `bertram_regdf`.
bertram_baseline = ["x_rgdpna_pc", "indep"]
bertram_y_fe = [col for col in bertram_regdf.columns if "y_fe" in col]
bertram_x_fe = [col for col in bertram_regdf.columns if "x_fe" in col]
bertram_dec_fe = [col for col in bertram_regdf.columns if "dec_fe" in col]

# Each optional group is switched off/on independently, giving 2**5 = 32 candidate
# specifications; the order of the groups fixes the order of the columns in each one.
optional_terms = [
    bertram_y_fe,
    bertram_x_fe,
    bertram_dec_fe,
    ["disputed"],
    ["grpx_x_indep"],
]
bertram_specs = []
for switches in lstprod([0, 1], repeat=len(optional_terms)):
    spec = list(bertram_baseline)
    for switched_on, terms in zip(switches, optional_terms):
        if switched_on == 1:
            spec.extend(terms)
    bertram_specs.append(spec)

In [96]:
# finding the best specification out of the ones in `bertram_specs`
# (`best_spec` is used below as a list of column names of `bertram_regdf`)
best_spec = ypk_fn.bertram_k_fold(bertram_regdf, bertram_specs)

  0%|          | 0/32 [00:00<?, ?it/s]

We now use the best specification (in terms of minimized $k$-folds SSE) to fill in the missing values.

In [97]:
# fitting with the best specification
best_model = sm.OLS(
    bertram_regdf["rgdpna_pc"], sm.add_constant(bertram_regdf[best_spec])
).fit()

# creating the RHS dataframe for prediction: one block of rows per Group Y code, built
# from the `y_df` rows of its Group X country
bertram_predict = []
for iso in GROUPS_X_Y.reset_index().ccode.unique():
    xiso = GROUPS_X_Y.loc[iso, "x_ccode"]
    indep = GROUPS_X_Y.loc[iso, "indep_yr"]
    iso_df = (
        y_df.loc[[xiso], ["rgdpna_pc"]]
        .reset_index()
        .rename(columns={"rgdpna_pc": "x_rgdpna_pc", "ccode": "x_ccode"})
    )
    iso_df["ccode"] = iso
    # independence dummy: 1 from the independence year onwards, if there is one
    iso_df["indep"] = 0
    iso_df = iso_df.join(GROUPS_X_Y.disputed, how="left", on="ccode")
    if not pd.isnull(indep):
        iso_df.loc[iso_df.year >= indep, "indep"] = 1
    iso_df["grpx_x_indep"] = iso_df[["x_rgdpna_pc", "indep"]].prod(axis=1)
    bertram_predict.append(iso_df)
bertram_predict = pd.concat(bertram_predict, axis=0)

# decade bins: 5 for years up to 2009, 6 afterwards; the dummies for bins 1-4 are all
# zero here and are added by hand so that every `dec_fe_*` column exists
bertram_predict["decade"] = 6
bertram_predict.loc[bertram_predict.year <= 2009, "decade"] = 5

# The dummy dtype is pinned to unsigned integers (the pre-2.0 pandas default). Newer
# pandas returns boolean dummies, and a frame mixing boolean and numeric columns turns
# into an object array when handed to statsmodels.
pred_deca_dum = pd.get_dummies(
    bertram_predict.decade, "dec_fe", drop_first=False, dtype=np.uint8
)
pred_x_dum = pd.get_dummies(
    bertram_predict.x_ccode, "x_fe", drop_first=True, dtype=np.uint8
)
pred_y_dum = pd.get_dummies(
    bertram_predict.ccode, "y_fe", drop_first=True, dtype=np.uint8
)
for i in range(1, 5):
    bertram_predict[f"dec_fe_{i}"] = 0
bertram_predict = pd.concat(
    [bertram_predict, pred_deca_dum, pred_x_dum, pred_y_dum], axis=1
)

# predicting with the above model
pred_Y_rgdpna_pc = best_model.predict(sm.add_constant(bertram_predict[best_spec]))
bertram_predict["rgdpna_pc_predicted"] = pred_Y_rgdpna_pc
bertram_predict.set_index(["ccode", "year"], inplace=True)
# attach the values that are already known for Group Y
bertram_predict = bertram_predict.join(
    group_y_postfra[["rgdpna_pc", "gdppc_source"]], how="left"
)

# filling in missing values with predicted values
# (the source label is set first, while the null mask still marks the same rows)
bertram_predict.loc[
    pd.isnull(bertram_predict.rgdpna_pc), "gdppc_source"
] = "bertram_regression"
bertram_predict.loc[
    pd.isnull(bertram_predict.rgdpna_pc), "rgdpna_pc"
] = bertram_predict.loc[
    pd.isnull(bertram_predict.rgdpna_pc), "rgdpna_pc_predicted"
].values

Unfortunately, there are some countries with negative values in their predicted GDPpc (most likely due to little information to begin with). For these countries, we will have to use the average GDPpc ratio between that country and corresponding Group X country.

In [98]:
# codes with at least one negative regression prediction in any year
pred_negative = (
    bertram_predict.loc[bertram_predict.rgdpna_pc_predicted < 0, :]
    .reset_index()
    .ccode.unique()
)
print("Countries with negative predicted values from Bertram regression:")
print(pred_negative)

# For each such code, build an alternative series: the Group X GDPpc multiplied by the
# mean of the known Group Y to Group X GDPpc ratios of that code.
ratio_iso_df = []
for i in pred_negative:
    iso_df = group_y_postfra.loc[i, :].copy()
    xiso = GROUPS_X_Y.loc[i, "x_ccode"]
    mean_ratio = np.mean(iso_df["rgdpna_pc"].div(iso_df["x_rgdpna_pc"]))
    # the values are paired with `sset.HISTORICAL_YEARS` by position
    iso_y = bertram_predict.loc[i, "x_rgdpna_pc"].values * mean_ratio
    iso_df = pd.DataFrame(
        data={"year": sset.HISTORICAL_YEARS, "rgdpna_pc_ratio": iso_y}
    )
    iso_df["ccode"], iso_df["gdppc_source_ratio"] = i, f"{i}+{xiso}_existing_avg_ratio"
    ratio_iso_df.append(iso_df.set_index(["ccode", "year"]))
ratio_iso_df = pd.concat(ratio_iso_df, axis=0)

Countries with negative predicted values from Bertram regression:
['PCN' 'TKL' 'SSD']


In [99]:
# For the codes with negative predictions, discard every regression-filled value
# (not only the negative ones) ...
bertram_predict.loc[
    bertram_predict.index.get_level_values("ccode").isin(pred_negative)
    & (bertram_predict.gdppc_source == "bertram_regression"),
    ["gdppc_source", "rgdpna_pc"],
] = np.nan

# ... and fill the resulting gaps from the average-ratio series (`*_ratio` columns)
bertram_predict = bertram_predict.join(ratio_iso_df, how="left")
for col in ["gdppc_source", "rgdpna_pc"]:
    bertram_predict.loc[
        bertram_predict.index.get_level_values("ccode").isin(pred_negative)
        & pd.isnull(bertram_predict[col]),
        col,
    ] = bertram_predict.loc[
        bertram_predict.index.get_level_values("ccode").isin(pred_negative)
        & pd.isnull(bertram_predict[col]),
        (col + "_ratio"),
    ].values

Let us merge these results with the rest of the `y_df`.

In [100]:
# attach the Group Y results next to `y_df` under "_bert"-suffixed names, use them only
# where `y_df` is still null, then drop the helper columns again
cols = ["gdppc_source", "rgdpna_pc"]
bertram_results = bertram_predict[cols].rename(columns={x: x + "_bert" for x in cols})
y_df = y_df.join(bertram_results, how="outer")
for i in cols:
    y_df.loc[pd.isnull(y_df[i]), i] = y_df.loc[pd.isnull(y_df[i]), i + "_bert"].values
y_df.drop([x + "_bert" for x in cols], axis=1, inplace=True)

### Other individual countries still missing information

#### Copy `MNP` to fill in `UMI`, only for the year 2000

In [101]:
# UMI's 2000 value was left missing earlier; it takes the MNP value of the same year
y_df.loc[("UMI", 2000), "rgdpna_pc"] = y_df.loc[("MNP", 2000), "rgdpna_pc"].item()
y_df.loc[("UMI", 2000), "gdppc_source"] = "copy_MNP"

### Finalizing `rgdpna_pc` series (`y_df`), merging with population, and creating `rgdpna` series

In [102]:
# keep only the codes in `ALL_ISOS` and drop rows with any missing value
y_df = y_df.loc[ALL_ISOS, :].dropna()
# after reshaping to a (ccode, year) grid there must be no holes, i.e. every remaining
# code has a value for every remaining year
assert y_df.to_xarray().notnull().to_array().all()
# attach population and derive total GDP = GDPpc * population
yp_df = y_df.join(pop_cleaned, how="left")
yp_df["rgdpna"] = yp_df[["rgdpna_pc", "pop"]].prod(axis=1)

## Filling in the missing values for the `cgdpo` (current PPP 2017 USD) series

### Transforming the `rgdpna_pc` series to `cgdpo_pc` equivalents via PPP conversion rates

In [103]:
# attach the PPP conversion rates; country-years without one use a rate of 1
yp_df = yp_df.join(ppp_to_2017.conv, how="left")
yp_df.loc[pd.isnull(yp_df.conv), "conv"] = 1

# copying the 2019 conversion to 2020 conversion
# (assigned by position, so the 2019 and 2020 rows must list the same codes in the
# same order)
yp_df.loc[(slice(None), 2020), "conv"] = yp_df.loc[(slice(None), 2019), "conv"].values
yp_df["cgdpo_pc_equiv"] = yp_df["rgdpna_pc"].div(yp_df["conv"])

### Attaching the actual `cgdpo_pc`

We accomplish this by creating ratio between `cgdpo_pc` and `rgdpna_pc` (`c_to_r`); for 2020, we copy `c_to_r` of 2019. In doing so, we will again try to clean up for the issue with French mainland.

For countries that are not in PWT 10.0 but have Fariss et al. (2022) information, create `c_to_r` by using the ratio between `fariss_cgdpe` and `fariss_rgdpna` (note that `cgdpe` is slightly different from `cgdpo` as it does not take import/export prices into account, but is the closest thing).

In [104]:
# creating mainland France cgdpo_pc from FRA+OV cgdpo-to-rgdpna ratio
gdp_pop_df["cgdpo_pc"] = gdp_pop_df["cgdpo"].div(gdp_pop_df["pop"])
gdp_pop_df["c_to_r"] = gdp_pop_df["cgdpo_pc"].div(gdp_pop_df["rgdpna_pc"])
fra_main = fra_all_y.loc[["FRA"], ["rgdpna_pc"]]
fra_main_yrs = fra_main.reset_index()["year"]
fra_main_ratio = gdp_pop_df.loc[([sset.FRA_OVERSEAS_DEPT], fra_main_yrs), ["c_to_r"]]
fra_main_ratio.loc[(sset.FRA_OVERSEAS_DEPT, 2020), "c_to_r"] = fra_main_ratio.loc[
    (sset.FRA_OVERSEAS_DEPT, 2019), "c_to_r"
]
fra_main = fra_main.join(fra_main_ratio.loc[sset.FRA_OVERSEAS_DEPT, "c_to_r"])
fra_main["cgdpo_pc"] = fra_main[["rgdpna_pc", "c_to_r"]].prod(axis=1)
fra_main["c_to_r_source"] = "PWT"

# creating the rest of the cgdpo_pc
c_to_r = gdp_pop_df.loc[
    ~gdp_pop_df.index.get_level_values("ccode").isin(["FRA", sset.FRA_OVERSEAS_DEPT]),
    ["c_to_r"],
].dropna()
c_to_r["c_to_r_source"] = "PWT"
famb_c_to_r = gdp_pop_df.loc[
    gdp_pop_df.index.get_level_values("ccode") != sset.FRA_OVERSEAS_DEPT,
    ["fariss_rgdpna", "fariss_cgdpo", "fariss_pop"],
].dropna()
famb_c_to_r["c_to_r"] = famb_c_to_r["fariss_cgdpo"].div(famb_c_to_r["fariss_rgdpna"])
famb_c_to_r = famb_c_to_r.loc[
    ~famb_c_to_r.index.get_level_values("ccode").isin(
        c_to_r.reset_index().ccode.unique()
    )
    & famb_c_to_r.index.get_level_values("year").isin(sset.HISTORICAL_YEARS),
    ["c_to_r"],
]
famb_c_to_r["c_to_r_source"] = "FAMB"
c_to_r = pd.concat([c_to_r, famb_c_to_r], axis=0).sort_index()

c_to_r_2020 = c_to_r.loc[(slice(None), [2019]), :].reset_index()
c_to_r_2020["year"] = 2020
c_to_r_2020.set_index(["ccode", "year"], inplace=True)
c_to_r = pd.concat([c_to_r, c_to_r_2020, fra_main[["c_to_r", "c_to_r_source"]]])

# attach the c_to_r ratios to yp_df and derive cgdpo_pc = rgdpna_pc * c_to_r
# (skipna=False keeps cgdpo_pc missing wherever either factor is missing)
yp_df = yp_df.join(c_to_r)
yp_df["cgdpo_pc"] = yp_df[["rgdpna_pc", "c_to_r"]].prod(axis=1, skipna=False)

# rows that received no ratio fall back to cgdpo_pc_equiv and are labeled accordingly
no_ratio = yp_df["c_to_r_source"].isnull()
yp_df.loc[no_ratio, "cgdpo_pc"] = yp_df.loc[no_ratio, "cgdpo_pc_equiv"].values
yp_df.loc[no_ratio, "c_to_r_source"] = "conv_rgdpna_pc"

# total cgdpo = cgdpo_pc * pop; here prod() uses its default and skips NaN factors
yp_df["cgdpo"] = yp_df[["cgdpo_pc", "pop"]].prod(axis=1)

# helper columns are no longer needed
yp_df.drop(columns=["c_to_r", "conv", "cgdpo_pc_equiv"], inplace=True)

## Creating the current PPP-2019 USD `cgdpo` and constant 2019 PPP USD `rgdpna`

### Creating `cgdpo_19` and `cgdpo_pc_19`

In [105]:
# tag the existing GDP columns with a `_17` suffix
cols_to_suffix = ["rgdpna_pc", "rgdpna", "cgdpo_pc", "cgdpo"]
yp_df.rename(columns={col: col + "_17" for col in cols_to_suffix}, inplace=True)

# ratio of the 2019 to the 2017 `pl_gdpo` value in `defla`
infla_1719 = defla.loc[2019, "pl_gdpo"] / defla.loc[2017, "pl_gdpo"]

# scale cgdpo_17 by that ratio, then divide by population for the per-capita version
yp_df["cgdpo_19"] = infla_1719 * yp_df["cgdpo_17"]
yp_df["cgdpo_pc_19"] = yp_df["cgdpo_19"].div(yp_df["pop"])

### Creating `rgdpna_19` and `rgdpna_pc_19`

In [106]:
# rgdpna_17 relative to its own 2019 level: the 2019 value of each ccode is broadcast
# to all of that ccode's years through a join on the `ccode` level
yp_df = yp_df.join(yp_df["rgdpna_17"].xs(2019, level="year").rename("r_2019"))
yp_df["r_gr"] = yp_df["rgdpna_17"] / yp_df["r_2019"]

# anchor the series on the 2019 value of cgdpo_19, broadcast the same way
yp_df = yp_df.join(yp_df["cgdpo_19"].xs(2019, level="year").rename("c_2019"))
# prod() skips NaN factors by default, so a row with a NaN r_gr takes the value c_2019
yp_df["rgdpna_19"] = yp_df[["r_gr", "c_2019"]].prod(axis=1)

# remove the helper columns, then derive the per-capita version
yp_df.drop(columns=["r_gr", "r_2019", "c_2019"], inplace=True)
yp_df["rgdpna_pc_19"] = yp_df["rgdpna_19"] / yp_df["pop"]

# per-capita `nan`s (which are from zero population) are replaced with zeros
for pc_col in ("rgdpna_pc_19", "cgdpo_pc_19"):
    yp_df[pc_col] = yp_df[pc_col].fillna(0)

# UMI, year 2000, is peculiar: its rgdpna values are overwritten with the matching
# cgdpo values
umi_2000 = ("UMI", 2000)
for r_col, c_col in [("rgdpna_19", "cgdpo_19"), ("rgdpna_pc_19", "cgdpo_pc_19")]:
    yp_df.loc[umi_2000, r_col] = yp_df.loc[umi_2000, c_col]

## Exporting

In [107]:
# final column order: source labels, population, then the rgdpna and cgdpo series
source_columns = ["pop_source", "gdppc_source", "c_to_r_source"]
rgdpna_columns = ["rgdpna_pc_17", "rgdpna_17", "rgdpna_pc_19", "rgdpna_19"]
cgdpo_columns = ["cgdpo_pc_17", "cgdpo_17", "cgdpo_pc_19", "cgdpo_19"]
gp_columns = source_columns + ["pop"] + rgdpna_columns + cgdpo_columns
yp_df = yp_df.loc[:, gp_columns]

# write the cleaned income and population panel to sset.PATH_INC_POP_CLEANED
save(yp_df, sset.PATH_INC_POP_CLEANED)