## Reorganizing raw data (GDP, GDPpc, and population) in long-panel format, converting to current and constant PPP terms, taking care of missing data

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
import xarray as xr
from tqdm.auto import tqdm

from sliiders import country_level_ypk as ypk_fn
from sliiders import settings as sset

## Importing all raw data, and creating a merged, long-panel version

### PWT

In [None]:
# PWT: keep the population and GDP series, keyed by (ccode, year)
pwt_gdp_pop = ["ccode", "year", "pop", "rgdpo", "rgdpna", "cgdpo"]
pwt100 = pd.read_excel(sset.PATH_PWT_RAW).rename(columns={"countrycode": "ccode"})
gdp_pop_df = pwt100.loc[:, pwt_gdp_pop].copy().set_index(["ccode", "year"])

### WB WDI

In [None]:
# WB WDI: indicator codes -> short column names used throughout this notebook
wdi_rename_dict = {
    "SP.POP.TOTL": "wb_pop",
    "NY.GDP.MKTP.PP.KD": "wb_rgdpna",
    "NY.GDP.PCAP.PP.KD": "wb_rgdpna_pc",
    "NY.GDP.MKTP.KD": "wb_gdp_nom",
    "NY.GDP.PCAP.KD": "wb_gdp_nom_pc",
}
wb_wdi = pd.read_parquet(sset.DIR_WB_WDI_RAW / "wdi_pop_iy_gdp.parquet")
wb_wdi = wb_wdi.rename(columns=wdi_rename_dict)

# keep only the country codes listed in `sset.ALL_ISOS_EXTENDED`
wdi_in_scope = wb_wdi.index.get_level_values("ccode").isin(sset.ALL_ISOS_EXTENDED)
wb_wdi = wb_wdi.loc[wdi_in_scope, :].reset_index()

# Unifying the country code conventions for Kosovo and Channel Islands
for wb_code, unified_code in [("XKX", "KO-"), ("CHI", "GGY+JEY")]:
    wb_wdi.loc[wb_wdi.ccode == wb_code, "ccode"] = unified_code
wb_wdi = wb_wdi.set_index(["ccode", "year"])

# re-scaling; currently in ones, but to have them in PWT scales (millions)
wdi_level_cols = ["wb_rgdpna", "wb_gdp_nom", "wb_pop"]
wb_wdi[wdi_level_cols] = wb_wdi[wdi_level_cols] / 1000000

# merging
gdp_pop_df = gdp_pop_df.join(wb_wdi, how="outer")

### IMF

In [None]:
# the code below prevents the "ArrowInvalid" error
imf = pd.read_excel(sset.PATH_IMF_WEO_RAW, na_values=["n/a", "--"])
imf = imf.rename(columns={"ISO": "ccode", "Subject Descriptor": "subject"})
imf = imf.loc[imf.ccode.isin(sset.ALL_ISOS_EXTENDED), :]

# renaming the subjects
imf_rename = {
    "Gross domestic product per capita, constant prices": "imf_rgdpna_pc",
    "Gross domestic product per capita, current prices": "imf_gdppc_nom",
    "Gross domestic product, current prices": "imf_gdp_nom",
    "Population": "imf_pop",
}
for key, nam in imf_rename.items():
    imf.loc[imf.subject == key, "subject"] = nam
imf = imf.loc[imf.subject.isin(list(imf_rename.values())), :].copy()

# the year columns (1980-2020) get a "v_" prefix
v_names = {yr: "v_" + str(yr) for yr in range(1980, 2021)}
imf = imf.rename(columns=v_names)

# organizing this in vertical format, one subject at a time, and
# outer-merging the per-subject frames on their index
imf_reorg = None
for nam in imf_rename.values():
    imf_sub = imf.loc[imf.subject == nam, ["ccode"] + list(v_names.values())]
    imf_sub = ypk_fn.organize_hor_to_ver(
        imf_sub.set_index(["ccode"]), "ccode", None, nam, "v_", range(1980, 2021)
    )
    imf_sub[nam] = imf_sub[nam].astype("float64")
    if imf_reorg is None:
        imf_reorg = imf_sub.copy()
    else:
        imf_reorg = imf_reorg.merge(
            imf_sub, how="outer", left_index=True, right_index=True
        )

gdp_pop_df = gdp_pop_df.merge(imf_reorg, how="outer", left_index=True, right_index=True)

### Maddison Project Database (MPD)

In [None]:
# MPD: keep only the country codes in `sset.ALL_ISOS_EXTENDED` and the GDP-per-capita
# and population columns, renamed with an `mpd_` prefix
mpd = pd.read_parquet(sset.DIR_YPK_INT / "maddison_project.parquet")
mpd = mpd.loc[
    mpd.index.get_level_values("ccode").isin(sset.ALL_ISOS_EXTENDED), ["gdppc", "pop"]
].rename(columns=dict(zip(["gdppc", "pop"], ["mpd_rgdpna_pc", "mpd_pop"])))

# separating North Korea (PRK) and non-PRK
mpd_no_prk = mpd.loc[mpd.index.get_level_values("ccode") != "PRK", :].sort_index()
mpd_prk = mpd.loc[["PRK"], :].sort_index()

# some minor interpolation for the case of North Korea:
# start from an empty PRK frame with one row per year, 1950-2020 (71 years)
mpd_prk_interped = pd.DataFrame(data={"ccode": ["PRK"] * 71, "year": range(1950, 2021)})
mpd_prk_interped.set_index(["ccode", "year"], inplace=True)
for i in ["mpd_rgdpna_pc", "mpd_pop"]:
    # years for which PRK has a non-missing value of this variable
    i_yrs = (
        mpd_prk.loc[~pd.isnull(mpd_prk[i]), :].index.get_level_values("year").unique()
    )
    vals = mpd_prk.loc[("PRK", i_yrs), i].values
    # only 1950 through the last observed year is filled; later years remain NaN
    interp_yrs = list(range(1950, i_yrs.max() + 1))
    # linear interpolation on the log scale, then back to levels; for years before
    # the first observation np.interp returns the first observed value
    vals_interp = np.exp(np.interp(interp_yrs, i_yrs, np.log(vals)))
    mpd_prk_interped[i] = np.nan
    mpd_prk_interped.loc[("PRK", interp_yrs), i] = vals_interp

# merge: stack the untouched countries with the interpolated PRK rows, then
# outer-join onto the running panel
mpd = pd.concat([mpd_no_prk, mpd_prk_interped], axis=0)
gdp_pop_df = gdp_pop_df.join(mpd, how="outer")

### UN population data (UN WPP)

In [None]:
# UN WPP: read only the rows of the "Medium" variant
unpop = pd.read_parquet(
    sset.DIR_YPK_INT / "un_population.parquet",
    filters=[("Variant", "==", "Medium")],
)

# keep the country codes in `sset.ALL_ISOS_EXTENDED`, years up to and including 2020
unpop_in_scope = unpop.index.get_level_values("ccode").isin(sset.ALL_ISOS_EXTENDED)
unpop_hist = unpop.index.get_level_values("year") <= 2020
unpop = unpop.loc[unpop_in_scope & unpop_hist, ["PopTotal"]].rename(
    columns={"PopTotal": "un_pop"}
)

# re-organizing from thousands to millions
unpop["un_pop"] = unpop["un_pop"] / 1000

# merging
gdp_pop_df = gdp_pop_df.join(unpop["un_pop"], how="outer").sort_index()

### OECD regional data

Among the relevant countries and regions we want to observe, only the five French overseas departments (Martinique, Mayotte, Guadeloupe, French Guiana, and La Réunion) are available in OECD regional data.

In [None]:
# mapping region names and ISO codes
fra_regions = ["Martinique", "Mayotte", "Guadeloupe", "French Guiana", "La Réunion"]
fra_isos = ["MTQ", "MYT", "GLP", "GUF", "REU"]
fra_map = pd.DataFrame(data={"Region": fra_regions, "ccode": fra_isos})


def _read_oecd_regional(file_name, value_name):
    """Read one OECD regional CSV and attach the French overseas `ccode`s.

    Rows at the "Country" territory level and rows with a missing `Value` are
    dropped; `Value` is renamed to `value_name`. Regions not listed in
    `fra_map` are kept, with a missing `ccode`.
    """
    raw = pd.read_csv(sset.DIR_OECD_REGIONS_RAW / file_name).rename(
        columns={"Territory Level and Typology": "terrtype", "TIME": "year"}
    )
    subnational_with_value = (raw.terrtype != "Country") & ~pd.isnull(raw.Value)
    return (
        raw.loc[subnational_with_value, :]
        .merge(fra_map, on=["Region"], how="left")
        .sort_values(["ccode", "year"])
        .rename(columns={"Value": value_name})
    )


# reading in the OECD data for population and gdp
regpop = _read_oecd_regional("REGION_DEMOGR.csv", "oecd_pop")
regecon = _read_oecd_regional("REGION_ECONOM.csv", "oecd_rgdpna")

In [None]:
# subsetting for the total population information
# (rows with a matched ccode, VAR == "T", Gender == "Total")
fra_pop_rows = (
    ~pd.isnull(regpop.ccode) & (regpop.VAR == "T") & (regpop.Gender == "Total")
)
fra_pop_detect = regpop.loc[fra_pop_rows, :].set_index(["ccode", "year"])
# divide by one million (presumably ones -> millions of people, as for the other sources)
fra_pop_detect["oecd_pop"] = fra_pop_detect["oecd_pop"] / 1000000

# subsetting for the total GDP information
# (rows with a matched ccode, measured in USD_PPP, no later than 2020)
fra_Y_rows = (
    ~pd.isnull(regecon.ccode) & (regecon.MEAS == "USD_PPP") & (regecon.year <= 2020)
)
fra_Y_detect = regecon.loc[fra_Y_rows, :].set_index(["ccode", "year"])

# merging with the original dataset
oecd_series = [fra_pop_detect["oecd_pop"], fra_Y_detect["oecd_rgdpna"]]
gdp_pop_df = gdp_pop_df.join(oecd_series, how="outer")

### CIA World Factbook

CIA information has been pre-cleaned to be in 2017 PPP USD, part of which has used extrapolation for PPP conversion rates.

In [None]:
# CIA World Factbook: keep the GDP and GDP-per-capita columns under a `cia_` prefix
cia_rename = {"rgdpna_17": "cia_rgdpna", "rgdpna_pc_17": "cia_rgdpna_pc"}
cia = pd.read_parquet(sset.PATH_CIA_INT)[list(cia_rename)].rename(columns=cia_rename)

# outer merge on the index
gdp_pop_df = gdp_pop_df.merge(
    cia[list(cia_rename.values())], how="outer", left_index=True, right_index=True
)

### UN SNA AMA information

In [None]:
# matching country/region names and country/region codes:
# one row per distinct (country name, ccode) pair found in the WB WDI data
wb_country_matching = wb_wdi.reset_index()[["country", "ccode"]].drop_duplicates()

# additional [name, ccode] pairs, appended below to the WB pairs
un_country_dict_additional = [
    ["Anguilla", "AIA"],
    ["Bolivia (Plurinational State of)", "BOL"],
    ["China, Hong Kong SAR", "HKG"],
    ["China, Macao Special Administrative Region", "MAC"],
    ["China, People's Republic of", "CHN"],
    ["Congo", "COG"],
    ["Cook Islands", "COK"],
    ["Curaçao", "CUW"],
    ["Czechia", "CZE"],
    ["Côte d'Ivoire", "CIV"],
    ["Democratic People's Republic of Korea", "PRK"],
    ["Democratic Republic of the Congo", "COD"],
    ["Egypt", "EGY"],
    ["Gambia", "GMB"],
    ["Iran, Islamic Republic of", "IRN"],
    ["Kingdom of Eswatini", "SWZ"],
    ["Kyrgyzstan", "KGZ"],
    ["Lao People's Democratic Republic", "LAO"],
    ["Micronesia (Federated States of)", "FSM"],
    ["Montserrat", "MSR"],
    ["Republic of Korea", "KOR"],
    ["Republic of Moldova", "MDA"],
    ["Republic of North Macedonia", "MKD"],
    ["Saint Kitts and Nevis", "KNA"],
    ["Saint Lucia", "LCA"],
    ["Saint Vincent and the Grenadines", "VCT"],
    ["Slovakia", "SVK"],
    ["State of Palestine", "PSE"],
    ["Venezuela (Bolivarian Republic of)", "VEN"],
    ["Viet Nam", "VNM"],
    ["Yemen", "YEM"],
    ["United Kingdom of Great Britain and Northern Ireland", "GBR"],
]
un_additional_df = pd.DataFrame(
    un_country_dict_additional, columns=["country", "ccode"]
)

# WB pairs first, then the additional ones, on a fresh 0..n-1 index
country_matching_additional = pd.concat(
    [wb_country_matching, un_additional_df], axis=0, ignore_index=True
)

In [None]:
# UN SNA AMA nominal GDP per capita, keyed by the UN's country/area names
UN_NOM_y = pd.read_csv(sset.DIR_UN_AMA_RAW / "un_snaama_nom_gdppc.csv").rename(
    columns={
        "Country/Area": "country",
        "Year": "year",
        "GDP, Per Capita GDP - US Dollars": "un_nom_gdppc",
    }
)
UN_NOM_y.drop(["Unit"], axis=1, inplace=True)

# Attach country codes using the WB names PLUS the additional spellings gathered in
# `country_matching_additional`. Matching on the WB names alone leaves every country
# that only appears under the additional spelling without a code, and it is then
# dropped below. One code per name is kept so the merge cannot duplicate rows.
un_name_to_ccode = country_matching_additional.drop_duplicates(subset=["country"])
UN_NOM_y = UN_NOM_y.merge(un_name_to_ccode, on=["country"], how="left")

# names that still have no code are outside the scope of this dataset
UN_NOM_y = UN_NOM_y.loc[~pd.isnull(UN_NOM_y.ccode), :]
UN_NOM_y = UN_NOM_y.set_index(["ccode", "year"]).drop(["country"], axis=1).un_nom_gdppc
gdp_pop_df = gdp_pop_df.join(UN_NOM_y, how="outer")
gdp_pop_df.sort_index(inplace=True)

### Information from various disaggregated sources, for smaller regions, territories and countries

This includes national account reports and approximations from organizational reports or academic papers.

#### Åland Islands (`ALA`; GDP per capita and population)

- GDP per capita: information available from Statistics and Research Åland (link [here](https://www.asub.ax/en/statistics/national-accounts/gross-domestic-product), see the link "GDP per capita 1995-2018 in current prices, PPS euro"). Since this is in PPP Euro (or Finnish Purchasing Power Standard exchange rate), we will use the nominal Euro-to-USD conversion to clean this, assuming that these are current PPP.
- Population: information available from Statistics and Research Åland (link [here](https://www.asub.ax/en/statistics/population/size-and-structure-population), see the link "Åland, the Faroe Islands and Greenland"). This is in ones of people, so we divide by 1 million to keep the population in millions of people.

In [None]:
# aland islands information, GDP
# (the first, unnamed spreadsheet column holds the row labels; the remaining
# column labels are used as years below)
ala_gdp = (
    pd.read_excel(sset.DIR_ALAND_STATISTICS_RAW / "aland_gdp.xlsx")
    .rename(columns={"Unnamed: 0": "country"})
    .set_index(["country"])
)
ala_years = ala_gdp.columns.values
ala_95_18_cgdpo_pc = ala_gdp.loc["Åland", :]

# exchange rate; EMU only has down to 1999, so for convenience's sake
# for 1995-1998, we will use 1999 rates
wdi_xrate = (
    pd.read_parquet(sset.DIR_WB_WDI_RAW / "wdi_xr.parquet")
    .loc[("EMU", list(range(1999, ala_years.max() + 1))), "xrate"]
    .values
)
# pad the front with the 1999 rate, once per year before 1999
ala_xrate = np.hstack([[wdi_xrate[0]] * (1999 - ala_years.min()), wdi_xrate])
# NOTE(review): multiplying assumes `xrate` is expressed as USD per local currency
# unit -- confirm against how wdi_xr.parquet is built
ala_95_18_cgdpo_pc = ala_95_18_cgdpo_pc * ala_xrate

# creating the Aland islands column for cgdpo_pc:
# borrow Finland's (ccode, year) rows from 1950 on as an all-NaN template,
# relabel them as ALA, then fill in the years that have data
ala = (
    (gdp_pop_df.loc[pd.IndexSlice["FIN", 1950:], "cgdpo"].copy() * np.nan)
    .rename("ala_cgdpo_pc")
    .reset_index()
)
ala["ccode"] = "ALA"
ala = ala.set_index(["ccode", "year"]).ala_cgdpo_pc
ala.loc["ALA", list(ala_years)] = ala_95_18_cgdpo_pc.values

# aland islands information, population
# (`ala_pop_link` records where the spreadsheet was obtained; it is not used in this cell)
ala_pop_link = (
    "https://www.asub.ax/sites/www.asub.ax/files/attachments/page/alv01_aland_faroe"
    "_islands_and_greenland_-_an_overview_with_comparable_data.xlsx"
)
ala_pop = pd.read_excel(sset.DIR_ALAND_STATISTICS_RAW / "aland_pop.xlsx").rename(
    columns={"Unnamed: 0": "category"}
)
# second data row, without its first and last cells; assigned to 2000-2020 below
# NOTE(review): relies on the spreadsheet layout (row position, 21 yearly values) -- confirm
ala_pop_00_20 = ala_pop.iloc[1].values[1:-1]
# same all-NaN template approach as for GDP, this time from Finland's `pop` rows
ala_pop = (
    (gdp_pop_df.loc[pd.IndexSlice["FIN", 1950:], "pop"].copy() * np.nan)
    .rename("ala_pop")
    .reset_index()
)
ala_pop["ccode"] = "ALA"
ala_pop = ala_pop.set_index(["ccode", "year"]).ala_pop
# ones of people -> millions of people
ala_pop.loc["ALA", list(range(2000, 2021))] = ala_pop_00_20 / 1000000

# merging all
gdp_pop_df = gdp_pop_df.join(ala, how="outer")
gdp_pop_df = gdp_pop_df.join(ala_pop, how="outer")
gdp_pop_df.sort_index(inplace=True)

#### Norfolk Island (`NFK`)

- GDP: GDPpc as a percentage of the Australian level for the years 1951-52 are shown in [Treadgold (Asia Pacific Viewpoint, 1999)](https://doi.org/10.1111/1467-8373.00095) and similar percentage for 1995-96 are shown in [Treadgold (Pacific Economic Bulletin, 1998)](https://openresearch-repository.anu.edu.au/handle/1885/157535).
- Population: The Australian Census has information for Norfolk in the years 2001, 2011, and 2016: www.infrastructure.gov.au/territories-regions-cities/territories/norfolk-island

In [None]:
# population:
# borrow Australia's (ccode, year) rows from 1950 on as an all-NaN template,
# relabel them as NFK, then fill in the three census years (ones -> millions)
nfk = (
    (gdp_pop_df.loc[pd.IndexSlice["AUS", 1950:], "un_pop"].copy() * np.nan)
    .rename("aus_census_pop")
    .reset_index()
)
nfk["ccode"] = "NFK"
nfk = nfk.set_index(["ccode", "year"]).aus_census_pop
nfk.loc["NFK", [2001, 2011, 2016]] = np.array([2601, 1796, 1748]) / 1000000
gdp_pop_df = gdp_pop_df.join(nfk, how="outer")

# GDP: all-NaN NFK templates for the constant-price (rgdpna) and
# current-price (cgdpo) per-capita series
nfk_gdp = (
    (gdp_pop_df.loc[pd.IndexSlice["AUS", 1950:], "rgdpna"].copy() * np.nan)
    .rename("treadgold_rgdpna_pc")
    .reset_index()
)
nfk_gdp["ccode"] = "NFK"
nfk_cgdpo = nfk_gdp.copy().rename(columns={"treadgold_rgdpna_pc": "treadgold_cgdpo_pc"})

# getting the Australian GDPpc
# (`rgdpna` / `pop` for Australia) and scaling it by the Norfolk-to-Australia
# GDPpc ratios: 0.39 for 1951-52 and 1.12 for 1995-96
nfk_yrs = [1951, 1952, 1995, 1996]
nfk_ratios = np.array([0.39, 0.39, 1.12, 1.12])
nfk_rgdpna_pc = (
    gdp_pop_df.loc[pd.IndexSlice["AUS", nfk_yrs], "rgdpna"].values
    / gdp_pop_df.loc[pd.IndexSlice["AUS", nfk_yrs], "pop"].values
    * nfk_ratios
)
nfk_gdp = nfk_gdp.set_index(["ccode", "year"]).treadgold_rgdpna_pc
nfk_gdp.loc["NFK", nfk_yrs] = nfk_rgdpna_pc
gdp_pop_df = gdp_pop_df.join(nfk_gdp, how="outer")

# same calculation with the current-price series (`cgdpo` / `pop`)
nfk_cgdpo_pc = (
    gdp_pop_df.loc[pd.IndexSlice["AUS", nfk_yrs], "cgdpo"].values
    / gdp_pop_df.loc[pd.IndexSlice["AUS", nfk_yrs], "pop"].values
    * nfk_ratios
)
nfk_cgdpo = nfk_cgdpo.set_index(["ccode", "year"]).treadgold_cgdpo_pc
nfk_cgdpo.loc["NFK", nfk_yrs] = nfk_cgdpo_pc
gdp_pop_df = gdp_pop_df.join(nfk_cgdpo, how="outer")

#### Cocos (Keeling) Islands (`CCK`)

- GDP: Information available from the House of Representative Committees of Parliament of Australia (link [here](https://www.aph.gov.au/parliamentary_business/committees/House_of_Representatives_Committees?url=ncet/economicenvironment/report/index.htm)). Please see **Chapter 3 The economic environment of the Indian Ocean Territories** page 23; the units are in Australian dollars (nominal), and the value corresponds to 2010's GDP.
- Population: from the Australian Bureau of Statistics (ABS); information for [2016](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/90102), [2011](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2011/quickstat/90102?opendocument), [2006](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2006/quickstat/910053009?opendocument), and [2001](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2001/quickstat/910053009?opendocument) available from the ABS Quickstat pages

In [None]:
# GDP: all-NaN template on Australia's years (1950 on), relabelled as CCK
cck_gdp = (
    (gdp_pop_df.loc[pd.IndexSlice["AUS", 1950:], "rgdpna"] * np.nan)
    .rename("aus_census_nom_gdp")
    .reset_index()
    .assign(ccode="CCK")
    .set_index(["ccode", "year"])["aus_census_nom_gdp"]
)

# 2010 GDP of 15 million AUD, expressed in millions and multiplied by the
# 2010 AUS exchange rate
aus_xrate_2010 = (
    pd.read_parquet(sset.DIR_WB_WDI_RAW / "wdi_xr.parquet")
    .loc[("AUS", [2010]), "xrate"]
    .values
)
cck_gdp.loc["CCK", [2010]] = (15000000 / 1000000) * aus_xrate_2010

gdp_pop_df = gdp_pop_df.join(cck_gdp, how="outer")

# population (ones -> millions) for the four census years
cck_census_yrs = [2001, 2006, 2011, 2016]
gdp_pop_df.loc[("CCK", cck_census_yrs), "aus_census_pop"] = (
    np.array([621, 572, 550, 544]) / 1000000
)
gdp_pop_df = gdp_pop_df.sort_index()

#### Christmas Island (`CXR`)
- GDP: Information available from the House of Representative Committees of Parliament of Australia (link [here](https://www.aph.gov.au/parliamentary_business/committees/House_of_Representatives_Committees?url=ncet/economicenvironment/report/index.htm)). Please see **Chapter 3 The economic environment of the Indian Ocean Territories** page 23; the units are in Australian dollars (nominal), and the value corresponds to 2010's GDP.
- Population: from the Australian Bureau of Statistics (ABS); information for [2016](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/90101?opendocument), [2011](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2011/quickstat/910052009?opendocument), [2006](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2006/quickstat/910052009?opendocument&navpos=220), and [2001](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2001/quickstat/910052009?opendocument&navpos=220) available from the ABS Quickstat pages

In [None]:
# population: reuse the CCK rows as a (ccode, year) template for CXR
# NOTE(review): every non-null CCK value, in any column, is carried over to CXR
# unless it is overwritten below (the four census populations and the 2010 GDP
# are) -- confirm that CCK holds no other data at this point
cxr_gdp_pop = gdp_pop_df.loc[["CCK"]].reset_index()
cxr_gdp_pop["ccode"] = "CXR"
cxr_gdp_pop.set_index(["ccode", "year"], inplace=True)
# census counts in ones of people -> millions of people
cxr_gdp_pop.loc[("CXR", [2001, 2006, 2011, 2016]), "aus_census_pop"] = (
    np.array([1446, 1349, 2072, 1843]) / 1000000
)

# GDP: 2010 GDP of 71 million AUD, in millions, times the 2010 AUS exchange rate.
# The year is selected with a list so that `.loc` always returns a Series; with a
# scalar year on a unique (ccode, year) index it returns a bare float, which has
# no `.values`. The CCK cell selects the rate the same way.
xrate_val = (
    pd.read_parquet(sset.DIR_WB_WDI_RAW / "wdi_xr.parquet")
    .loc[("AUS", [2010]), "xrate"]
    .values[0]
)
cxr_gdp_pop.loc[("CXR", 2010), "aus_census_nom_gdp"] = xrate_val * 71000000 / 1000000

# merging
gdp_pop_df = pd.concat([gdp_pop_df, cxr_gdp_pop], axis=0).sort_index()

#### Pitcairn Island (`PCN`)

- GDP: estimate of approximately 217,000 New Zealand dollars (from [this link](https://web.archive.org/web/20150705134639/http://www.government.pn/policies/Pitcairn%20Island%20SDP%202012-2016.pdf#page=4) for a WayBackMachine Archive of the Government of Pitcairn's "Pitcairn Islands Strategic Development Plan")
- Population: from the Pitcairn Island [government website](http://www.immigration.gov.pn/community/the_people/index.html), 233 people in 1937 and 49 people in 2017. Also, according to CIA World Factbook ([link here](https://www.cia.gov/the-world-factbook/countries/pitcairn-islands/#people-and-society)), its population was 50 in 2021 (will interpolate and round up to whole numbers).

In [None]:
# Pitcairn islands population: log-linear interpolation for 1950-2020 through the
# known counts (233 in 1937, 49 in 2017, 50 in 2021), rounded to whole people
pcn_50_20 = np.round(
    np.exp(
        np.interp(list(range(1950, 2021)), [1937, 2017, 2021], np.log([233, 49, 50]))
    ),
    0,
)
# all-NaN template on New Zealand's years (1950 on), relabelled as PCN; the same
# template is reused for the population and the GDP series
pcn_gdp = (
    (gdp_pop_df.loc[pd.IndexSlice["NZL", 1950:], "pop"].copy() * np.nan)
    .rename("pcn_pop")
    .reset_index()
)
pcn_gdp["ccode"] = "PCN"
pcn_pop = pcn_gdp.copy()
pcn_pop = pcn_pop.set_index(["ccode", "year"]).pcn_pop
# ones of people -> millions of people
pcn_pop.loc["PCN", list(range(1950, 2021))] = pcn_50_20 / 1000000

# Pitcairn GDP: 217,000 NZD, in millions, times the 2006 NZL exchange rate
pcn_gdp.rename(columns={"pcn_pop": "pcn_nom_gdp"}, inplace=True)
pcn_gdp = pcn_gdp.set_index(["ccode", "year"]).pcn_nom_gdp
# The year is selected with a list so that `.loc` always returns a Series; with a
# scalar year on a unique (ccode, year) index it returns a bare float, which has
# no `.values`.
wdi_xrate = (
    pd.read_parquet(sset.DIR_WB_WDI_RAW / "wdi_xr.parquet")
    .loc[("NZL", [2006]), "xrate"]
    .values[0]
)
pcn_gdp.loc["PCN", 2006] = (217000 / 1000000) * wdi_xrate

# merging all
gdp_pop_df = gdp_pop_df.join(pcn_pop, how="outer")
gdp_pop_df = gdp_pop_df.join(pcn_gdp, how="outer")
gdp_pop_df.sort_index(inplace=True)

#### Svalbard and Jan Mayen (`SJM`)

Jan Mayen is uninhabited, but Svalbard has population and economic activity.
- GDP: Unfortunately, we will have to use the GDP per capita of Norway itself as we do not have a reliable GDP estimate of Svalbard and `SJM` is a part of Norway.
- Population: From the Statistikkbanken (Statistics Norway, link [here](https://www.ssb.no/en/statbank/table/07429)), select all half years (2009-2021).

In [None]:
## Svalbard and Jan Mayen (no population for Jan Mayen)
# two counts per year for 2009-2020 (presumably first and second half-year),
# averaged, rounded to whole people and expressed in millions
h1 = [2085, 2052, 2017, 2115, 2158, 2100, 2185, 2152, 2145, 2214, 2258, 2428]
h2 = [2140, 2071, 2140, 2195, 2195, 2118, 2189, 2162, 2210, 2310, 2379, 2417]
sjm_yearly_mean = (np.array(h1) + np.array(h2)) / 2
sjm_pop_09_20 = np.round(sjm_yearly_mean, 0) / 1000000

# all-NaN template on Norway's years (1950 on), relabelled as SJM
sjm_pop = (
    (gdp_pop_df.loc[pd.IndexSlice["NOR", 1950:], "pop"] * np.nan)
    .rename("nor_census_pop")
    .reset_index()
    .assign(ccode="SJM")
    .set_index(["ccode", "year"])["nor_census_pop"]
)
sjm_pop.loc["SJM", list(range(2009, 2021))] = sjm_pop_09_20

# merging all
gdp_pop_df = gdp_pop_df.join(sjm_pop, how="outer").sort_index()

#### Saint Helena, Ascension and Tristan da Cunha (`SHN`)

We have data from the [St. Helena Government](https://www.sainthelena.gov.sh/wp-content/uploads/2020/07/SEDP-EOY-Progress-Report-Final-160720.pdf) about the estimated GDP per capita (in 2019 prices, non-PPP) of 2018 and 2019. We will combine this later (by comparing ratios with the UK GDP) to get the approximate GDP per capita values in PPP.
- Note that this is *not* the average GDPpc of Saint Helena, Ascension and Tristan da Cunha but rather just Saint Helena (so we are using Saint Helena to proxy for the three areas, which are represented by the country-code `SHN`).

In [None]:
# GBR exchange rates for the two years with a Saint Helena estimate
shn_yrs = [2018, 2019]
wdi_xrate = (
    pd.read_parquet(sset.DIR_WB_WDI_RAW / "wdi_xr.parquet")
    .loc[("GBR", shn_yrs), "xrate"]
    .values
)

# all-NaN template on the UK's years (1950 on), relabelled as SHN
shn = (
    (gdp_pop_df.loc[pd.IndexSlice["GBR", 1950:], "rgdpna"] * np.nan)
    .rename("shn_gov_gdppc")
    .reset_index()
    .assign(ccode="SHN")
    .set_index(["ccode", "year"])["shn_gov_gdppc"]
)
# GDP per capita estimates multiplied by the exchange rate of the same year
shn.loc["SHN", shn_yrs] = np.array([8490, 8230]) * wdi_xrate

# merging all
gdp_pop_df = gdp_pop_df.join(shn, how="outer").sort_index()

#### Saint Barthélemy (`BLM`)

Saint Barthélemy's 2010 and 1999 (nominal) GDP per capita shown in the CEROM document: [link here](https://www.cerom-outremer.fr/guadeloupe/publications/etudes-cerom/estimation-du-pib-par-habitant-de-st-barthelemy.html).

In [None]:
# all-NaN template relabelled as BLM (the UK rows are only borrowed for their years)
blm = (
    (gdp_pop_df.loc[pd.IndexSlice["GBR", 1950:], "rgdpna"] * np.nan)
    .rename("cerom_gdppc")
    .reset_index()
    .assign(ccode="BLM")
    .set_index(["ccode", "year"])["cerom_gdppc"]
)

# EMU exchange rates for the two years with a CEROM estimate
blm_yrs = [1999, 2010]
wdi_xrate = (
    pd.read_parquet(sset.DIR_WB_WDI_RAW / "wdi_xr.parquet")
    .loc[("EMU", blm_yrs), "xrate"]
    .values
)
# GDP per capita estimates multiplied by the exchange rate of the same year
blm.loc["BLM", blm_yrs] = np.array([26000, 35700]) * wdi_xrate

# merging all
gdp_pop_df = gdp_pop_df.join(blm, how="outer").sort_index()

#### United States Minor Outlying Islands (`UMI`)

- Population: from the U.S. Census ([link here](https://www.census.gov/history/pdf/2000-minoroutlyingislands.pdf)), years 1980, 1990, 2000
- GDP: will use GDPpc from `MNP` (Northern Mariana Islands)

In [None]:
# US Minor Outlying Islands
# (despite its name, `umi_gdp` holds the census *population* series)
# all-NaN template on the USA's years (1950 on), relabelled as UMI
umi_gdp = (
    (gdp_pop_df.loc[pd.IndexSlice["USA", 1950:], "pop"] * np.nan)
    .rename("us_census_pop")
    .reset_index()
    .assign(ccode="UMI")
    .set_index(["ccode", "year"])["us_census_pop"]
)
# census counts in ones of people -> millions of people
umi_gdp.loc["UMI", [1980, 1990, 2000]] = np.array([1082, 193, 316]) / 1000000

# merging all
gdp_pop_df = gdp_pop_df.join(umi_gdp, how="outer").sort_index()

### Uninhabited areas (no population and no economic activity)

These are: the French Southern and Antarctic Lands (`ATF`), Bouvet Island (`BVT`), Clipperton Island (`CL-`), Heard and McDonald Islands (`HMD`), British Indian Ocean Territory (`IOT`), South Georgia and the South Sandwich Islands (`SGS`).
- `ATF`: According to the CIA World Factbook website (link [here](https://www.cia.gov/the-world-factbook/countries/french-southern-and-antarctic-lands/)), `ATF` does not have permanent population and therefore we will record this as having no population and no economic activity.
- `BVT`: uninhabited to protect nature reserve
- `CL-`: is an atoll with no permanent inhabitants since 1945.
- `HMD`: is an Australian external territory near Antarctica.
- `IOT`: is mostly composed of U.S.-U.K. military facilities with no permanent population.
- `SGS`: is uninhabited

In [None]:
# assigning 0 population and 0 GDP
# France's (ccode, year) rows from 1950 on are borrowed as a template and
# relabelled as ATF
atf_pop = (
    (gdp_pop_df.loc[pd.IndexSlice["FRA", 1950:], "pop"].copy() * np.nan)
    .rename("noecon_pop")
    .reset_index()
)
# The zeros must go into `noecon_pop` itself: that is the column extracted and
# joined below. Writing them to a separate helper column leaves the joined
# `noecon_pop` / `noecon_rgdpna` series entirely NaN.
atf_pop["ccode"], atf_pop["noecon_pop"] = "ATF", 0.0
atf_pop.set_index(["ccode", "year"], inplace=True)
atf_gdp = atf_pop.copy().rename(columns={"noecon_pop": "noecon_rgdpna"})
atf_pop, atf_gdp = atf_pop.noecon_pop, atf_gdp.noecon_rgdpna

# merging all
gdp_pop_df = gdp_pop_df.join(atf_pop, how="outer")
gdp_pop_df = gdp_pop_df.join(atf_gdp, how="outer")

In [None]:
# Attaching the other uninhabited areas' information:
# each one receives a copy of the ATF rows under its own country code
atf_rows = gdp_pop_df.loc[(["ATF"], slice(None)), :].reset_index()
other_uninh_dfs = []
for i in np.setdiff1d(sset.UNINHABITED_ISOS, ["ATF"]):
    i_df = atf_rows.copy()
    i_df["ccode"] = i
    i_df.set_index(["ccode", "year"], inplace=True)
    other_uninh_dfs.append(i_df)
gdp_pop_df = pd.concat([gdp_pop_df] + other_uninh_dfs, axis=0).sort_index()

### Exporting the intermediate result

In [None]:
# NE.GDI.FTOT.ZS: Gross fixed cap. formation rate in WB WDI; will be dealt with later
# drop it and the country-name column, whichever of the two are present
col_to_drop = [
    col for col in ["country", "NE.GDI.FTOT.ZS"] if col in gdp_pop_df.columns
]
gdp_pop_df = gdp_pop_df.drop(columns=col_to_drop)

raw_out_path = sset.DIR_YPK_RAW / "gdp_gdppc_pop_raw_multiple_sources.parquet"
gdp_pop_df.to_parquet(raw_out_path)

## Historical population (1950-2019), creating a single sequence

Before we go on further, note that when we refer to country-level population for those with overseas territories, it excludes the population of such territories (e.g., `USA` population noted below does not include `PRI` [Puerto Rico]). One big exception is that `FRA` (France) from PWT actually includes the following territories or "overseas region" into its population value calculation: `GLP`, `MYT`, `MTQ`, `GUF`, and `REU`. So we will actually use the `FRA+OV` designation for PWT variables and clean up along the way so that `FRA` only represents mainland France.

### Re-reading in the raw-organized data

If worried about memory issues (due to reasons outside of this notebook), restart the kernel, run the first and second cells, and run the following cells.

In [None]:
gp_df = pd.read_parquet(sset.DIR_YPK_RAW / "gdp_gdppc_pop_raw_multiple_sources.parquet")

# duplicate France's PWT columns under the code "FRA+OV"
fraov_cols = ["pop", "cgdpo", "rgdpna", "rgdpo"]
fraov = gp_df.loc[("FRA", slice(None)), fraov_cols].reset_index()
fraov["ccode"] = "FRA+OV"
gp_df = pd.concat([gp_df, fraov.set_index(["ccode", "year"])], axis=0)

# just setting apart the population data (every column with "pop" in its name)
pop_cols = [col for col in gp_df.columns if "pop" in col]
popraw_df = gp_df.loc[:, pop_cols]

### Cleaning for the uninhabited areas

In [None]:
# `inh` for inhabited
# uninhabited areas: population is zero by definition, tagged with its own source
pop_uninh = (
    popraw_df.loc[sset.UNINHABITED_ISOS, :]
    .sort_index()
    .assign(pop=0, pop_source="uninhabited", pop_unit="millions (of people)")
    .loc[:, ["pop_unit", "pop_source", "pop"]]
)

# everything else is cleaned source by source in the sections that follow
is_uninh = popraw_df.index.get_level_values("ccode").isin(sset.UNINHABITED_ISOS)
pop_inh = popraw_df.loc[~is_uninh, :].sort_index()

### Cleaning for Serbia and Kosovo

It has been noted that Kosovo's population was included in the Serbian population from UN (for all times) and that from PWT (before 1999). We do have the Kosovan population from World Bank (1960-2020) so we will subtract this from the Serbian population, and for 1950-1959, we will use the mean ratio between Kosovo and Serbia.

In [None]:
# 1960-2020
# years present for KO- in the index (WB Kosovo population)
kosovo_pop = pop_inh.loc["KO-", "wb_pop"]
kosovo_yrs = kosovo_pop.index.values
# Serbia = UN Serbia minus WB Kosovo; note this overwrites the `pop` column
# of the SRB rows in `pop_inh` itself
pop_inh.loc[("SRB", kosovo_yrs), "pop"] = (
    pop_inh.loc[("SRB", kosovo_yrs), "un_pop"].values - kosovo_pop.values
)
# build a two-country frame: the SRB rows plus a KO- copy of the same years,
# whose `pop` starts out empty and is filled from the WB series
srb_ko_pop = pop_inh.loc[["SRB"], ["pop"]].sort_index()
ko_pop = srb_ko_pop.reset_index()
ko_pop["ccode"], ko_pop["pop"] = "KO-", np.nan
srb_ko_pop = pd.concat(
    [srb_ko_pop, ko_pop.set_index(["ccode", "year"])], axis=0
).sort_index()
srb_ko_pop.loc[("KO-", kosovo_yrs), "pop"] = kosovo_pop.values

# taking care of the 1950-1959 (mean ratio of KO- to KO- + SRB)
ratio = (kosovo_pop.values / pop_inh.loc[("SRB", kosovo_yrs), "un_pop"].values).mean()
# every SRB year that has no KO- row in the WB data
pre_kosovo_yrs = np.sort(
    np.setdiff1d(srb_ko_pop.index.get_level_values("year").unique(), kosovo_yrs)
)
# split the UN Serbian population of those years using the mean ratio
kosovo_pre_pop = pop_inh.loc[("SRB", pre_kosovo_yrs), "un_pop"].values * ratio
srb_ko_pop.loc[("SRB", pre_kosovo_yrs), "pop"] = (
    pop_inh.loc[("SRB", pre_kosovo_yrs), "un_pop"].values - kosovo_pre_pop
)
srb_ko_pop.loc[("KO-", pre_kosovo_yrs), "pop"] = kosovo_pre_pop
srb_ko_pop["pop_unit"] = "millions (of people)"
# source labels: the default applies to SRB in the WB-covered years; KO- in those
# years is plain WB; ratio-based years are labelled as such for both countries
srb_ko_pop["pop_source"] = "UN_SRB_minus_WB_XKX"
srb_ko_pop.loc[("KO-", kosovo_yrs), "pop_source"] = "WB"
srb_ko_pop.loc[("SRB", pre_kosovo_yrs), "pop_source"] = "UN_SRB_ratio_WB_XKX"
srb_ko_pop.loc[("KO-", pre_kosovo_yrs), "pop_source"] = "UN_SRB_ratio_WB_XKX"

## merging with the uninhabited to create the "clean" dataset
pop_cleaned = pd.concat([pop_uninh, srb_ko_pop], axis=0).sort_index()

### Cleaning for the U.S. and U.S. Territories

It seems that there is some arbitrariness when it comes to including territories or excluding them, when calculating for the population (even within the same dataset, it is unclear whether the population for some year includes or excludes the territories). We will take the PWT10.0 population as the U.S. population without territories included, and attach population numbers of the territories appropriately (from relevant, filled data sources). For the missing years we extrapolate using the known ratio between the respective territory and the U.S. mainland.

In [None]:
# USA; use PWT10.0 as the base
# since it's missing 2020, use the 2019-to-2020 growth rate from un_pop
us_gr = pop_inh.loc[("USA", 2020), "un_pop"] / pop_inh.loc[("USA", 2019), "un_pop"]
us_2020 = pop_inh.loc[("USA", 2019), "pop"] * us_gr
us_df = pop_inh.loc[(["USA"], range(1950, 2021)), ["pop"]].sort_index()
us_df.loc[("USA", 2020), "pop"] = us_2020
# every year is plain PWT except the extrapolated 2020 value
us_df["pop_source"] = "PWT"
us_df.loc[("USA", 2020), "pop_source"] = "PWT_ratio_UN"

In [None]:
# UMI
# years with a non-missing US Census count (taken from any row of `pop_inh` that
# has `us_census_pop`; presumably only UMI rows do -- confirm)
umi_yrs = (
    pop_inh.loc[~pd.isnull(pop_inh.us_census_pop), :]
    .index.get_level_values("year")
    .values
)
# empty UMI frame on the USA's 1950-2020 rows
umi_df = pop_inh.loc[("USA", range(1950, 2021)), ["pop"]].reset_index()
umi_df["ccode"], umi_df["pop"] = "UMI", np.nan
umi_df.set_index(["ccode", "year"], inplace=True)
umi_vals = pop_inh.loc[("UMI", umi_yrs), "us_census_pop"].values
umi_df.loc[("UMI", umi_yrs), "pop"] = umi_vals
# mean UMI-to-USA population ratio over the census years (USA from the `pop` column)
umi_ratio = (
    umi_df.loc[("UMI", umi_yrs), "pop"].values
    / pop_inh.loc[("USA", umi_yrs), "pop"].values
).mean()
# between the first and last census year: log-linear interpolation
umi_interp_yrs = list(range(umi_yrs.min(), umi_yrs.max() + 1))
umi_df.loc[("UMI", umi_interp_yrs), "pop"] = np.exp(
    np.interp(umi_interp_yrs, umi_yrs, np.log(umi_vals))
)
# default label first, then the census years are relabelled
umi_df["pop_source"] = "interp"
umi_df.loc[("UMI", umi_yrs), "pop_source"] = "US_CENSUS"

# ratio with the US
# years outside the census range: USA population (from `us_df`) times the mean ratio
umi_remaining = np.setdiff1d(list(range(1950, 2021)), umi_interp_yrs)
umi_df.loc[("UMI", umi_remaining), "pop"] = (
    us_df.loc[("USA", umi_remaining), "pop"].values * umi_ratio
)
umi_df.loc[("UMI", umi_remaining), "pop_source"] = "US_CENSUS_UMI_ratio_PWT_USA"

In [None]:
# PRI, VIR, GUM, ASM, MNP: use UN population
other_us_terr = ["PRI", "VIR", "GUM", "ASM", "MNP"]
other_us_terr_df = (
    pop_inh.loc[(other_us_terr, range(1950, 2021)), ["un_pop"]]
    .sort_index()
    .rename(columns={"un_pop": "pop"})
    .assign(pop_source="UN")
)

In [None]:
# merging the cleaned information (USA, UMI and the other US territories) into
# the running clean dataset; the unit is the same for every row
us_pieces = [pop_cleaned, us_df, umi_df, other_us_terr_df]
pop_cleaned = (
    pd.concat(us_pieces).sort_index().assign(pop_unit="millions (of people)")
)

### Cleaning for the French territories

There are (excluding the no-population ones):
- Five overseas departments: `GUF`, `GLP`, `MTQ`, `MYT`, `REU`
- Overseas collectivities: `PYF`, `BLM`, `SPM`, `MAF`, `WLF`
- Other: `NCL`

Including that for France, we will simply use UN population (since its French population doesn't include the departments).

In [None]:
# overseas departments, overseas collectivities, New Caledonia and France itself:
# all taken from the UN population series
french = [
    "GUF", "GLP", "MTQ", "MYT", "REU",
    "PYF", "BLM", "SPM", "MAF", "WLF",
    "NCL", "FRA",
]
french_pop = (
    pop_inh.loc[(french, range(1950, 2021)), ["un_pop"]]
    .rename(columns={"un_pop": "pop"})
    .assign(pop_source="UN", pop_unit="millions (of people)")
)

pop_cleaned = pd.concat([french_pop, pop_cleaned], axis=0).sort_index()

### Cleaning for the British territories

These are (excluding the no-population ones):
- Overseas Territories: `AIA`, `BMU`, `VGB`, `CYM`, `FLK`, `GIB`, `MSR`, `PCN`, `SHN`, `TCA`
- Crown dependencies: `GGY`, `JEY`, `IMN`

1. For all Overseas Territories *except `PCN`*, we take from UN (and for `PCN`, from interpolated data using CIA WF and `PCN` Government data).

2. For `GGY` and `JEY`, their merged information is in `GGY+JEY` (for `un_pop`). Guernsey Annual Electronic Report (link [here](https://gov.gg/CHttpHandler.ashx?id=123156&p=0#:~:text=At%20the%20end%20of%20March%202019%2C%20Guernsey's%20population%20was%2062%2C792.&text=There%20was%20a%20natural%20decrease,of%20459%20people%20(0.7%25).)) shows `GGY` (Guernsey)'s population from 2009 to 2019. Take the average ratio of `GGY` population to `GGY+JEY` between these years, and use this ratio to extrapolate for the missing years (of `GGY` and `JEY`, separately).

3. For `GBR`, take from PWT10.0. For `IMN`, take from UN.

In [None]:
# Pitcairn: the `pcn_pop` column is used as the population series; every year is
# labelled as interpolated except for the two years singled out below
pcn_pop = (
    pop_inh.loc[["PCN"], ["pcn_pop"]]
    .rename(columns={"pcn_pop": "pop"})
    .assign(pop_source="interp")
)
pcn_pop.loc[("PCN", 2017), "pop_source"] = "PCN_GOV"
pcn_pop.loc[("PCN", 2020), "pop_source"] = "PCN_GOV_interp_CIA"

In [None]:
# Guernsey and Jersey
# Guernsey's reported head counts, one value per year in `yr0919` (2009-2019)
ggy_09_19 = [62274, 62431, 62915, 63085, 62732, 62341, 62234, 62208, 62106]
ggy_09_19 += [62333, 62792]
# head counts -> millions of people
ggy_09_19 = np.array(ggy_09_19) / 1000000
yr0919 = list(range(2009, 2020))

# combined Guernsey + Jersey series (UN), the base for splitting the two islands
ggy_jey_df = pop_inh.loc[["GGY+JEY"], ["un_pop"]].rename(columns={"un_pop": "pop"})
# Guernsey frame: same rows as the combined frame, reported values in 2009-2019
ggy_df = ggy_jey_df.reset_index()
ggy_df["ccode"] = "GGY"
ggy_df["pop"] = np.nan
ggy_df.loc[ggy_df.year.isin(yr0919), "pop"] = ggy_09_19
ggy_df.set_index(["ccode", "year"], inplace=True)

# average Guernsey share of the combined population over 2009-2019, applied to
# the combined series for the remaining years of 1950-2020
ggy_ratio = (ggy_09_19 / ggy_jey_df.loc[("GGY+JEY", yr0919), "pop"].values).mean()
not_0919 = np.setdiff1d(list(range(1950, 2021)), yr0919)
ggy_df.loc[("GGY", not_0919), "pop"] = (
    ggy_jey_df.loc[("GGY+JEY", not_0919), "pop"].values * ggy_ratio
)
# Jersey is the remainder: combined minus Guernsey (positional, row by row)
jey_df = ggy_jey_df.reset_index()
jey_df["ccode"], jey_df["pop"] = "JEY", ggy_jey_df["pop"].values - ggy_df["pop"].values
jey_df.set_index(["ccode", "year"], inplace=True)

# sources
ggy_df["pop_source"] = "GGY_REPORT"
ggy_df.loc[("GGY", not_0919), "pop_source"] = "UN_ratio_GGY_REPORT"
jey_df["pop_source"] = "UN_ratio_GGY_REPORT"
jey_df.loc[("JEY", yr0919), "pop_source"] = "UN_minus_GGY_REPORT"

In [None]:
# GBR itself: PWT population for 1950-2020.
# An explicit copy is taken (as is done for `aus_df`) so that the assignments below
# unambiguously modify `gbr_df` and never a view/slice of `pop_inh`.
gbr_df = pop_inh.loc[(["GBR"], range(1950, 2021)), ["pop"]].copy()
# 2020 value: the 2019 PWT value scaled by the UN 2020/2019 population ratio
gbr_df.loc[("GBR", 2020), "pop"] = (
    gbr_df.loc[("GBR", 2019), "pop"]
    * pop_inh.loc[("GBR", 2020), "un_pop"]
    / pop_inh.loc[("GBR", 2019), "un_pop"]
)
gbr_df["pop_source"] = "PWT"
gbr_df.loc[("GBR", 2020), "pop_source"] = "PWT_ratio_UN"

# remaining British territories (and the Isle of Man): UN population as is
other_gbr_terr_df = (
    pop_inh.loc[
        ["AIA", "BMU", "VGB", "CYM", "FLK", "GIB", "MSR", "SHN", "TCA", "IMN"],
        ["un_pop"],
    ]
    .rename(columns={"un_pop": "pop"})
    .assign(pop_source="UN")
)

# gathering all GBR-related territories, tagged with the common unit
gbr_rel_df = pd.concat(
    [pcn_pop, ggy_df, jey_df, gbr_df, other_gbr_terr_df], axis=0
).assign(pop_unit="millions (of people)")

# merging into the running cleaned panel
pop_cleaned = pd.concat([gbr_rel_df, pop_cleaned], axis=0).sort_index()

### Cleaning for the Australian territories

These are (excluding the no-population ones):
- External Territories: `CXR`, `CCK`, `NFK`

In [None]:
# getting the AUS 2020 value in: the 2019 `pop` value is scaled by the UN
# 2020/2019 population ratio. Note that this writes into `pop_inh` itself, since
# the territory cells below read AUS values from `pop_inh`.
pop_inh.loc[("AUS", 2020), "pop"] = (
    pop_inh.loc[("AUS", 2020), "un_pop"]
    / pop_inh.loc[("AUS", 2019), "un_pop"]
    * pop_inh.loc[("AUS", 2019), "pop"]
)

In [None]:
# Christmas Island: census values in `aus_cen_yrs`, interpolated in between,
# and extrapolated elsewhere using the ratio to Australia's population
cxr_df = pop_inh.loc[["CXR"], ["aus_census_pop"]].rename(
    columns={"aus_census_pop": "pop"}
)
aus_cen_yrs = [2001, 2006, 2011, 2016]
cxr_df["pop_source"] = "interp"
cxr_df.loc[("CXR", aus_cen_yrs), "pop_source"] = "AUS_CENSUS"
# every year from the first to the last census year (also reused by the CCK and
# NFK cells below)
cxr_interp_yrs = range(np.min(aus_cen_yrs), np.max(aus_cen_yrs) + 1)
# interpolation is linear in logs, i.e., constant growth rate between censuses
cxr_df.loc[("CXR", list(cxr_interp_yrs)), "pop"] = np.exp(
    np.interp(
        cxr_interp_yrs,
        aus_cen_yrs,
        np.log(cxr_df.loc[("CXR", aus_cen_yrs), "pop"].values),
    )
)

# country-territory ratios for extrapolation
# (average CXR/AUS population ratio over the census years)
cxr_ratio = (
    cxr_df.loc[("CXR", aus_cen_yrs), "pop"].values
    / pop_inh.loc[("AUS", aus_cen_yrs), "pop"].values
).mean()
# years of 1950-2020 outside the interpolated window (reused for CCK and NFK)
non_cen_yrs = np.setdiff1d(list(range(1950, 2021)), cxr_interp_yrs)
cxr_df.loc[("CXR", non_cen_yrs), "pop"] = (
    cxr_ratio * pop_inh.loc[("AUS", non_cen_yrs), "pop"].values
)
cxr_df.loc[("CXR", non_cen_yrs), "pop_source"] = "PWT_AUS_ratio_AUS_CENSUS_CXR"

In [None]:
# Cocos (Keeling) Islands: same procedure as for Christmas Island; relies on
# `aus_cen_yrs`, `cxr_interp_yrs`, and `non_cen_yrs` defined in the CXR cell
cck_df = pop_inh.loc[["CCK"], ["aus_census_pop"]].rename(
    columns={"aus_census_pop": "pop"}
)
cck_df["pop_source"] = "interp"
cck_df.loc[("CCK", aus_cen_yrs), "pop_source"] = "AUS_CENSUS"
# log-linear interpolation between the census years
cck_df.loc[("CCK", list(cxr_interp_yrs)), "pop"] = np.exp(
    np.interp(
        cxr_interp_yrs,
        aus_cen_yrs,
        np.log(cck_df.loc[("CCK", aus_cen_yrs), "pop"].values),
    )
)

# country-territory ratios for extrapolation
# (average CCK/AUS population ratio over the census years)
cck_ratio = (
    cck_df.loc[("CCK", aus_cen_yrs), "pop"].values
    / pop_inh.loc[("AUS", aus_cen_yrs), "pop"].values
).mean()
cck_df.loc[("CCK", non_cen_yrs), "pop"] = (
    cck_ratio * pop_inh.loc[("AUS", non_cen_yrs), "pop"].values
)
cck_df.loc[("CCK", non_cen_yrs), "pop_source"] = "PWT_AUS_ratio_AUS_CENSUS_CCK"

In [None]:
## Norfolk Island: same procedure as for CXR and CCK, except that only the
## 2001, 2011, and 2016 census values are used. Relies on `cxr_interp_yrs` and
## `non_cen_yrs` defined in the CXR cell.
nfk_cen_yrs = [2001, 2011, 2016]
nfk_df = pop_inh.loc[["NFK"], ["aus_census_pop"]].rename(
    columns={"aus_census_pop": "pop"}
)
nfk_df["pop_source"] = "interp"
nfk_df.loc[("NFK", nfk_cen_yrs), "pop_source"] = "AUS_CENSUS"

# log-linear interpolation between the census years
nfk_log_cen = np.log(nfk_df.loc[("NFK", nfk_cen_yrs), "pop"].values)
nfk_df.loc[("NFK", list(cxr_interp_yrs)), "pop"] = np.exp(
    np.interp(cxr_interp_yrs, nfk_cen_yrs, nfk_log_cen)
)

# country-territory ratio (averaged over the census years) for extrapolation
nfk_cen_pop = nfk_df.loc[("NFK", nfk_cen_yrs), "pop"].values
aus_cen_pop = pop_inh.loc[("AUS", nfk_cen_yrs), "pop"].values
nfk_ratio = (nfk_cen_pop / aus_cen_pop).mean()
nfk_df.loc[("NFK", non_cen_yrs), "pop"] = (
    nfk_ratio * pop_inh.loc[("AUS", non_cen_yrs), "pop"].values
)
nfk_df.loc[("NFK", non_cen_yrs), "pop_source"] = "PWT_AUS_ratio_AUS_CENSUS_NFK"

In [None]:
# Australia itself: PWT population, with the 2020 value extended via the UN ratio
aus_df = pop_inh.loc[(["AUS"], range(1950, 2021)), ["pop"]].assign(pop_source="PWT")
aus_df.loc[("AUS", 2020), "pop_source"] = "PWT_ratio_UN"

# merging Australia and its territories into the running cleaned panel
pop_cleaned = (
    pd.concat([aus_df, pop_cleaned, nfk_df, cck_df, cxr_df], axis=0)
    .sort_index()
    .assign(pop_unit="millions (of people)")
)

### Cleaning for the New Zealand territories

These are (excluding the no-population ones):
- External Territories: `TKL`, `NIU`, `COK`

In [None]:
# New Zealand: PWT population for 1950-2020
nzl_df = pop_inh.loc[("NZL", range(1950, 2021)), ["pop"]].sort_index()
# 2020 value: the 2019 value scaled by the UN 2020/2019 population ratio
nzl_df.loc[("NZL", 2020), "pop"] = (
    nzl_df.loc[("NZL", 2019), "pop"]
    * pop_inh.loc[("NZL", 2020), "un_pop"]
    / pop_inh.loc[("NZL", 2019), "un_pop"]
)
nzl_df["pop_source"] = "PWT"
nzl_df.loc[("NZL", 2020), "pop_source"] = "PWT_ratio_UN"

In [None]:
# New Zealand territories: UN population as is
nzl_terr_df = (
    pop_inh.loc[(["NIU", "COK", "TKL"], list(range(1950, 2021))), ["un_pop"]]
    .rename(columns={"un_pop": "pop"})
    .assign(pop_source="UN")
)

# merging into the running cleaned panel
pop_cleaned = (
    pd.concat([pop_cleaned, nzl_df, nzl_terr_df], axis=0)
    .sort_index()
    .assign(pop_unit="millions (of people)")
)

### Cleaning for the Danish territories

These are (excluding the no-population ones):
- `GRL`, `FRO`

In [None]:
# Denmark: PWT population for 1950-2020
dnk_df = pop_inh.loc[("DNK", range(1950, 2021)), ["pop"]].sort_index()
# 2020 value: the 2019 value scaled by the UN 2020/2019 population ratio
dnk_df.loc[("DNK", 2020), "pop"] = (
    dnk_df.loc[("DNK", 2019), "pop"]
    * pop_inh.loc[("DNK", 2020), "un_pop"]
    / pop_inh.loc[("DNK", 2019), "un_pop"]
)
dnk_df["pop_source"] = "PWT"
dnk_df.loc[("DNK", 2020), "pop_source"] = "PWT_ratio_UN"

In [None]:
# Danish territories: UN population as is
dnk_terr_df = (
    pop_inh.loc[(["GRL", "FRO"], list(range(1950, 2021))), ["un_pop"]]
    .rename(columns={"un_pop": "pop"})
    .assign(pop_source="UN")
)

# merging into the running cleaned panel
pop_cleaned = (
    pd.concat([pop_cleaned, dnk_df, dnk_terr_df], axis=0)
    .sort_index()
    .assign(pop_unit="millions (of people)")
)

### Cleaning for the Finnish territories

These are (excluding the no-population ones):
- `ALA`

In [None]:
# Finland: PWT population for 1950-2020
fin_df = pop_inh.loc[("FIN", range(1950, 2021)), ["pop"]].sort_index()
# 2020 value: the 2019 value scaled by the UN 2020/2019 population ratio
fin_df.loc[("FIN", 2020), "pop"] = (
    fin_df.loc[("FIN", 2019), "pop"]
    * pop_inh.loc[("FIN", 2020), "un_pop"]
    / pop_inh.loc[("FIN", 2019), "un_pop"]
)
fin_df["pop_source"] = "PWT"
fin_df.loc[("FIN", 2020), "pop_source"] = "PWT_ratio_UN"

In [None]:
# Aland Islands: use the `ala_pop` series where available, and the average
# ALA/FIN population ratio times Finland's population elsewhere
ala_df = pop_inh.loc[(["ALA"], range(1950, 2021)), ["ala_pop"]].rename(
    columns={"ala_pop": "pop"}
)
ala_df["pop_source"] = "ALA_STAT"
# years with an observed (non-missing) ALA value
ala_yrs = (
    ala_df.loc[~pd.isnull(ala_df["pop"]), :].index.get_level_values("year").unique()
)
# years of 1950-2020 without an observed ALA value
ala_noyrs = np.setdiff1d(list(range(1950, 2021)), ala_yrs)
# average ALA/FIN ratio over the observed years
ala_ratio = (
    ala_df.loc[("ALA", ala_yrs), "pop"].values
    / fin_df.loc[("FIN", ala_yrs), "pop"].values
).mean()
ala_df.loc[("ALA", ala_noyrs), "pop"] = (
    ala_ratio * fin_df.loc[("FIN", ala_noyrs), "pop"].values
)
ala_df.loc[("ALA", ala_noyrs), "pop_source"] = "PWT_FIN_ratio_ALA_STAT_ALA"

In [None]:
# merging Finland and Aland into the running cleaned panel
pop_cleaned = (
    pd.concat([pop_cleaned, fin_df, ala_df], axis=0)
    .sort_index()
    .assign(pop_unit="millions (of people)")
)

### Cleaning for the Norwegian territories

These are (excluding the no-population ones):
- `SJM`

In [None]:
# Norway: PWT population for 1950-2020
nor_df = pop_inh.loc[("NOR", range(1950, 2021)), ["pop"]].sort_index()
# 2020 value: the 2019 value scaled by the UN 2020/2019 population ratio
nor_df.loc[("NOR", 2020), "pop"] = (
    nor_df.loc[("NOR", 2019), "pop"]
    * pop_inh.loc[("NOR", 2020), "un_pop"]
    / pop_inh.loc[("NOR", 2019), "un_pop"]
)
nor_df["pop_source"] = "PWT"
nor_df.loc[("NOR", 2020), "pop_source"] = "PWT_ratio_UN"

In [None]:
# Svalbard and Jan Mayen: use the `nor_census_pop` series where available, and
# the average SJM/NOR population ratio times Norway's population elsewhere
sjm_df = pop_inh.loc[(["SJM"], range(1950, 2021)), ["nor_census_pop"]].rename(
    columns={"nor_census_pop": "pop"}
)
sjm_df["pop_source"] = "NOR_CENSUS"
# years with an observed (non-missing) SJM value
sjm_yrs = (
    sjm_df.loc[~pd.isnull(sjm_df["pop"]), :].index.get_level_values("year").unique()
)
# years of 1950-2020 without an observed SJM value
sjm_noyrs = np.setdiff1d(list(range(1950, 2021)), sjm_yrs)
# average SJM/NOR ratio over the observed years
sjm_ratio = (
    sjm_df.loc[("SJM", sjm_yrs), "pop"].values
    / nor_df.loc[("NOR", sjm_yrs), "pop"].values
).mean()
sjm_df.loc[("SJM", sjm_noyrs), "pop"] = (
    sjm_ratio * nor_df.loc[("NOR", sjm_noyrs), "pop"].values
)
sjm_df.loc[("SJM", sjm_noyrs), "pop_source"] = "PWT_NOR_ratio_NOR_CENSUS_SJM"

In [None]:
# merging Norway and Svalbard/Jan Mayen into the running cleaned panel
pop_cleaned = (
    pd.concat([pop_cleaned, nor_df, sjm_df], axis=0)
    .sort_index()
    .assign(pop_unit="millions (of people)")
)

### For the rest, filling in missing population info from UN populations data to PWT data

In [None]:
# creating a xr.Dataset with the rest
# all country codes that have rows in 1950-2020 ...
pop_inh_remain = (
    pop_inh.loc[(slice(None), range(1950, 2021)), :]
    .index.get_level_values("ccode")
    .unique()
)
# ... minus those already handled in `pop_cleaned` ...
pop_inh_remain = np.setdiff1d(
    pop_inh_remain, pop_cleaned.index.get_level_values("ccode").unique()
)
# ... minus the aggregate / alternative codes that should not appear as rows
pop_inh_remain = np.setdiff1d(pop_inh_remain, ["CHI", "FRA+OV", "XKX", "GGY+JEY"])
pop_inh_rem = pop_inh.loc[(pop_inh_remain, range(1950, 2021)), ["pop", "un_pop"]].copy()
pop_inh_rem_ds = xr.Dataset.from_dataframe(pop_inh_rem)

# smooth_fill
# (fills the gaps of the PWT `pop` series using the UN `un_pop` series; see
# `ypk_fn.smooth_fill` for the exact method)
pop_inh_rem_filled = ypk_fn.smooth_fill(
    pop_inh_rem_ds["pop"], pop_inh_rem_ds["un_pop"], time_dim="year", other_dim="ccode"
).to_dataframe()

# Bring in the raw (pre-fill) PWT values under the name `pop_source`; at this point
# the column is float64 and only serves to tell which rows had a PWT observation.
pop_inh_rem_filled = pop_inh_rem_filled.merge(
    pop_inh_rem[["pop"]].rename(columns={"pop": "pop_source"}),
    how="left",
    left_index=True,
    right_index=True,
)
# Replace the whole column at once with the labels: "PWT" where a raw PWT value
# existed, "UN" otherwise. Writing strings into the float64 column through `.loc`
# relies on silent upcasting, which pandas has deprecated (PDEP-6).
pop_inh_rem_filled["pop_source"] = np.where(
    pop_inh_rem_filled["pop_source"].notnull(), "PWT", "UN"
)

In [None]:
# appending the remaining countries to the running cleaned panel
pop_cleaned = (
    pd.concat([pop_cleaned, pop_inh_rem_filled], axis=0)
    .sort_index()
    .assign(pop_unit="millions (of people)")
)

## Historical GDPpc (1950-2019), constant 2017 PPP USD, creating a single sequence

We will first work with the constant 2017 PPP USD series (i.e., the ones with `rgdpna` in their names), then create current PPP 2017 USD versions appropriately (in accordance with `cgdpo`). We will work with the **per capita** versions for extrapolation. Before working on the entire set of countries, however, we will first set aside France (due to the territory-inclusion problem mentioned above), work with the other countries (including the 5 French overseas territories), and then return to the French case to subtract the sum of the GDP values of the said territories to acquire the mainland French GDP (and GDPpc).

### Setting aside the no-population cases

In [None]:
# uninhabited areas: GDP per capita is set to zero for every year in 1950-2020
y_uninh = (
    gp_df.loc[(sset.UNINHABITED_ISOS, range(1950, 2021)), ["rgdpna"]]
    .rename(columns={"rgdpna": "rgdpna_pc"})
    .assign(rgdpna_pc=0, gdp_source="uninhabited")
)

### Using World Bank information to fill in for PWT

Both `rgdpna_pc` and `wb_rgdpna_pc` are in ones of constant 2017 PPP USD; we will use the `smooth_fill` function from `sliiders.country_level_ypk` (imported above as `ypk_fn`).

In [None]:
# PWT GDP per capita (note: this adds the `rgdpna_pc` column to `gp_df` itself)
gp_df["rgdpna_pc"] = gp_df["rgdpna"] / gp_df["pop"]
# inhabited areas only, PWT and World Bank series side by side
y_xr_pwt_wb = xr.Dataset.from_dataframe(
    gp_df.loc[
        ~gp_df.index.get_level_values("ccode").isin(sset.UNINHABITED_ISOS),
        ["rgdpna_pc", "wb_rgdpna_pc"],
    ]
)

# filling in rgdpna_pc using wb_rgdpna_pc
y_pwt_wb = ypk_fn.smooth_fill(
    y_xr_pwt_wb["rgdpna_pc"],
    y_xr_pwt_wb["wb_rgdpna_pc"],
    time_dim="year",
    other_dim="ccode",
).to_dataframe()
# keep 1950-2020 only
y_pwt_clean = y_pwt_wb.loc[
    y_pwt_wb.index.get_level_values("year").isin(range(1950, 2021)), :
].copy()

# filling in the source information
# Bring in the raw (pre-fill) PWT values under the name `gdp_source`, next to the
# raw World Bank values; both are float64 and only used to see what was observed.
y_pwt_clean = y_pwt_clean.merge(
    gp_df[["rgdpna_pc", "wb_rgdpna_pc"]].rename(columns={"rgdpna_pc": "gdp_source"}),
    left_index=True,
    right_index=True,
    how="left",
)
pwt_observed = y_pwt_clean["gdp_source"].notnull().to_numpy()
wb_observed = y_pwt_clean["wb_rgdpna_pc"].notnull().to_numpy()

# Build the labels in an object array and replace the column in one go: "PWT" where
# a raw PWT value existed, "WB" where only a World Bank value existed, and missing
# otherwise (to be labelled by the later filling steps). Writing strings into the
# float64 column through `.loc` relies on silent upcasting, which pandas has
# deprecated (PDEP-6).
gdp_source = np.full(len(y_pwt_clean), np.nan, dtype=object)
gdp_source[pwt_observed] = "PWT"
gdp_source[~pwt_observed & wb_observed] = "WB"
y_pwt_clean["gdp_source"] = gdp_source
y_pwt_clean.drop(["wb_rgdpna_pc"], axis=1, inplace=True)

### Using IMF information to fill in for PWT

According to this [link](https://www.imf.org/external/pubs/ft/weo/faq.htm#q4d) talking about the PPP used by IMF, it is said that IMF WEO's PPP rates are calculated based on ICP's 2017 report. Since the variable description says that this is constant PPP, we will interpret `imf_rgdpna_pc` as being in ones of constant 2017 PPP USD.

In [None]:
# attach the IMF series to the current state of the cleaned panel
y_pwt_clean = y_pwt_clean.merge(
    gp_df[["imf_rgdpna_pc"]], how="left", left_index=True, right_index=True
)
y_xr_pwt_imf = xr.Dataset.from_dataframe(y_pwt_clean[["imf_rgdpna_pc", "rgdpna_pc"]])

# smooth_fill
y_pwt_imf = ypk_fn.smooth_fill(
    y_xr_pwt_imf["rgdpna_pc"],
    y_xr_pwt_imf["imf_rgdpna_pc"],
    time_dim="year",
    other_dim="ccode",
).to_dataframe()

# merging
# (the pre-fill values are kept as `rgdpna_pc_prev` so that newly filled rows can
# be identified; `prev_dict` is reused by the later filling steps)
prev_dict = {"rgdpna_pc": "rgdpna_pc_prev"}
y_pwt_clean = y_pwt_clean.rename(columns=prev_dict).merge(
    y_pwt_imf[["rgdpna_pc"]],
    left_index=True,
    right_index=True,
    how="left",
)
# rows that were missing before this step and are filled now are attributed to IMF
y_pwt_clean.loc[
    ~pd.isnull(y_pwt_clean.rgdpna_pc) & pd.isnull(y_pwt_clean.rgdpna_pc_prev),
    "gdp_source",
] = "IMF"
y_pwt_clean.drop(["rgdpna_pc_prev", "imf_rgdpna_pc"], inplace=True, axis=1)

### Using MPD (Maddison) information to fill in for PWT

MPD data in `mpd_rgdpna_pc` are in **constant 2011 PPP USD**, so we will have to use the PPP conversion rates to change them into constant 2017 PPP USD before using the function `smooth_fill`.

In [None]:
# from constant 2011 PPP USD to constant 2017 PPP USD
# (`ppp_to_2017` is reused below for the OECD and ALA conversions)
ppp_to_2017 = ypk_fn.ppp_conversion_specific_year(2017, extrap_sim=True)
# keep the 2011 rows only and index them by country code
ppp_11_to_17 = ppp_to_2017.loc[(slice(None), 2011), :].reset_index()
ppp_11_to_17.set_index(["ccode"], inplace=True)

We will use the "neutral assumption" (i.e., using the conversion rate of 1) and not use the WB conversion rates (and **only use PWT conversion rates**) due to there being a big discrepancy between the two measures.

In [None]:
# applying the conversion
# (the per-country 2011 -> 2017 rate is attached to every MPD row of that country)
y_pwt_clean = y_pwt_clean.merge(
    gp_df[["mpd_rgdpna_pc"]].merge(
        ppp_11_to_17[["conv"]], left_index=True, right_index=True, how="left"
    ),
    left_index=True,
    right_index=True,
    how="left",
)
# "neutral assumption": a conversion rate of 1 where no rate is available
y_pwt_clean.loc[pd.isnull(y_pwt_clean.conv), "conv"] = 1
y_pwt_clean["mpd_rgdpna_pc"] *= y_pwt_clean["conv"]

In [None]:
# smooth_fill
# (fill the remaining gaps of `rgdpna_pc` using the converted MPD series)
y_xr_pwt_mpd = xr.Dataset.from_dataframe(y_pwt_clean[["mpd_rgdpna_pc", "rgdpna_pc"]])
y_pwt_mpd = ypk_fn.smooth_fill(
    y_xr_pwt_mpd["rgdpna_pc"],
    y_xr_pwt_mpd["mpd_rgdpna_pc"],
    time_dim="year",
    other_dim="ccode",
).to_dataframe()

# merging
# (`prev_dict` comes from the IMF cell; pre-fill values are kept as
# `rgdpna_pc_prev` to identify the rows filled in this step)
y_pwt_clean = y_pwt_clean.rename(columns=prev_dict).merge(
    y_pwt_mpd[["rgdpna_pc"]],
    left_index=True,
    right_index=True,
    how="left",
)
# rows that were missing before this step and are filled now are attributed to MPD
y_pwt_clean.loc[
    ~pd.isnull(y_pwt_clean.rgdpna_pc) & pd.isnull(y_pwt_clean.rgdpna_pc_prev),
    "gdp_source",
] = "MPD"
y_pwt_clean.drop(["rgdpna_pc_prev", "mpd_rgdpna_pc", "conv"], inplace=True, axis=1)

### Using OECD regional data to fill in for PWT

OECD information is in constant 2015 PPP USD, so we will change accordingly before using `smooth_fill`.

In [None]:
# from constant 2015 PPP USD to constant 2017 PPP USD
# (2015 rows of `ppp_to_2017`, indexed by country code)
ppp_15_to_17 = ppp_to_2017.loc[(slice(None), 2015), :].reset_index()
ppp_15_to_17.set_index(["ccode"], inplace=True)

# applying the conversion
y_pwt_clean = y_pwt_clean.merge(
    gp_df[["oecd_rgdpna"]].merge(
        ppp_15_to_17[["conv"]], left_index=True, right_index=True, how="left"
    ),
    left_index=True,
    right_index=True,
    how="left",
)
# the cleaned population is attached to turn total GDP into per capita terms;
# the `pop` column stays in `y_pwt_clean` and is used again by later cells
y_pwt_clean = y_pwt_clean.merge(
    pop_cleaned[["pop"]], left_index=True, right_index=True, how="left"
)
# conversion rate of 1 where no rate is available
y_pwt_clean.loc[pd.isnull(y_pwt_clean.conv), "conv"] = 1
y_pwt_clean["oecd_rgdpna_pc"] = (
    y_pwt_clean["oecd_rgdpna"] / y_pwt_clean["pop"] * y_pwt_clean["conv"]
)

In [None]:
# smooth_fill
# (fill the remaining gaps of `rgdpna_pc` using the converted OECD series)
y_xr_pwt_oecd = xr.Dataset.from_dataframe(y_pwt_clean[["oecd_rgdpna_pc", "rgdpna_pc"]])
y_pwt_oecd = ypk_fn.smooth_fill(
    y_xr_pwt_oecd["rgdpna_pc"],
    y_xr_pwt_oecd["oecd_rgdpna_pc"],
    time_dim="year",
    other_dim="ccode",
).to_dataframe()

# merging
# (`prev_dict` comes from the IMF cell; pre-fill values are kept as
# `rgdpna_pc_prev` to identify the rows filled in this step)
y_pwt_clean = y_pwt_clean.rename(columns=prev_dict).merge(
    y_pwt_oecd[["rgdpna_pc"]],
    left_index=True,
    right_index=True,
    how="left",
)
# rows that were missing before this step and are filled now are attributed to OECD
y_pwt_clean.loc[
    ~pd.isnull(y_pwt_clean.rgdpna_pc) & pd.isnull(y_pwt_clean.rgdpna_pc_prev),
    "gdp_source",
] = "OECD"
y_pwt_clean.drop(
    ["rgdpna_pc_prev", "oecd_rgdpna_pc", "oecd_rgdpna", "conv"], inplace=True, axis=1
)

### Using CIA information to fill in for PWT

CIA information (`cia_rgdpna`) is in constant 2017 PPP USD, so we will use this as is.

In [None]:
# CIA rgdpna
y_pwt_clean = y_pwt_clean.merge(
    gp_df[["cia_rgdpna"]], left_index=True, right_index=True, how="left"
)
# total GDP -> per capita, using the `pop` column attached in the OECD step
y_pwt_clean["cia_rgdpna"] /= y_pwt_clean["pop"]
y_pwt_clean.rename(columns={"cia_rgdpna": "cia_rgdpna_pc"}, inplace=True)

# interpolating
# countries with at least one CIA observation
cia_y_ccode = (
    y_pwt_clean.loc[~pd.isnull(y_pwt_clean.cia_rgdpna_pc), :]
    .index.get_level_values("ccode")
    .unique()
)
cc_dfs = []
for cc in tqdm(cia_y_ccode):
    cc_df = y_pwt_clean.loc[cc, ["cia_rgdpna_pc"]]
    # years in which this country has a CIA observation
    cc_yrs = (
        cc_df.loc[~pd.isnull(cc_df.cia_rgdpna_pc), :]
        .index.get_level_values("year")
        .unique()
    )
    # every year between the first and the last observation; the interpolation is
    # linear in logs
    cc_filled = range(cc_yrs.min(), cc_yrs.max() + 1)
    cc_filled_vals = np.exp(
        np.interp(cc_filled, cc_yrs, np.log(cc_df.loc[cc_yrs, "cia_rgdpna_pc"].values))
    )
    cc_dfs.append(
        pd.DataFrame(
            data={
                "ccode": [cc] * len(cc_filled),
                "year": cc_filled,
                "cia_rgdpna_pc_interp": cc_filled_vals,
            }
        )
    )
# one long frame of interpolated CIA values, indexed like `y_pwt_clean`
cc_dfs = pd.concat(cc_dfs, axis=0).set_index(["ccode", "year"])

In [None]:
# smooth_fill
# (attach the interpolated CIA series, then fill the remaining gaps of
# `rgdpna_pc` with it)
y_pwt_clean = y_pwt_clean.merge(cc_dfs, left_index=True, right_index=True, how="outer")
y_xr_pwt_cia = xr.Dataset.from_dataframe(
    y_pwt_clean[["cia_rgdpna_pc_interp", "rgdpna_pc"]]
)
y_pwt_cia = ypk_fn.smooth_fill(
    y_xr_pwt_cia["rgdpna_pc"],
    y_xr_pwt_cia["cia_rgdpna_pc_interp"],
    time_dim="year",
    other_dim="ccode",
).to_dataframe()

# merging
# (`prev_dict` comes from the IMF cell; pre-fill values are kept as
# `rgdpna_pc_prev` to identify the rows filled in this step)
y_pwt_clean = y_pwt_clean.rename(columns=prev_dict).merge(
    y_pwt_cia[["rgdpna_pc"]],
    left_index=True,
    right_index=True,
    how="left",
)
# previously missing rows with an actual CIA observation are attributed to CIA ...
y_pwt_clean.loc[
    ~pd.isnull(y_pwt_clean.cia_rgdpna_pc) & pd.isnull(y_pwt_clean.rgdpna_pc_prev),
    "gdp_source",
] = "CIA"
# ... and those with only an interpolated CIA value to CIA_interp
y_pwt_clean.loc[
    ~pd.isnull(y_pwt_clean.cia_rgdpna_pc_interp)
    & pd.isnull(y_pwt_clean.rgdpna_pc_prev)
    & (y_pwt_clean.gdp_source != "CIA"),
    "gdp_source",
] = "CIA_interp"

y_pwt_clean.drop(
    ["rgdpna_pc_prev", "cia_rgdpna_pc", "cia_rgdpna_pc_interp"], inplace=True, axis=1
)

### Aland Statistics (for `ALA`)

We will extrapolate for the missing years as well, by creating ratios with Finland.

In [None]:
# rows with an Aland Statistics value
# NOTE(review): the rows are not filtered by country code; this presumes that
# `ala_cgdpo_pc` is only populated for ALA -- confirm
ala_cgdpo_pc = gp_df.loc[~pd.isnull(gp_df.ala_cgdpo_pc), ["ala_cgdpo_pc"]]
# note: `ala_yrs` and `ala_noyrs` overwrite the variables of the same name from
# the population section
ala_yrs = ala_cgdpo_pc.index.get_level_values("year").unique()
# year-specific conversion rates to 2017 terms for ALA
ala_ppp_val = ppp_to_2017.loc[("ALA", ala_yrs), "conv"].values
y_pwt_clean.loc[("ALA", ala_yrs), "rgdpna_pc"] = (
    ala_ppp_val * ala_cgdpo_pc.ala_cgdpo_pc.values
)
y_pwt_clean.loc[("ALA", ala_yrs), "gdp_source"] = "ALAND_STAT"

# ratio wrt Finland
# (averaged over the observed years, then applied to Finland's GDPpc for the
# remaining years of 1950-2020)
ala_fin_ratio = (
    y_pwt_clean.loc[("ALA", ala_yrs), "rgdpna_pc"].values
    / y_pwt_clean.loc[("FIN", ala_yrs), "rgdpna_pc"].values
).mean()
ala_noyrs = np.setdiff1d(list(range(1950, 2021)), ala_yrs)
y_pwt_clean.loc[("ALA", ala_noyrs), "rgdpna_pc"] = (
    ala_fin_ratio * y_pwt_clean.loc[("FIN", ala_noyrs), "rgdpna_pc"].values
)
y_pwt_clean.loc[("ALA", ala_noyrs), "gdp_source"] = "FIN_extrap"

### Treadgold Reports on Norfolk Island (for `NFK`)

We will extrapolate for the missing years as well, by creating ratios with Australia (`AUS`).

In [None]:
# rows with a Treadgold value
# NOTE(review): the rows are not filtered by country code; this presumes that
# `treadgold_rgdpna_pc` is only populated for NFK -- confirm
tgold_rgdpna_pc = gp_df.loc[
    ~pd.isnull(gp_df.treadgold_rgdpna_pc), ["treadgold_rgdpna_pc"]
]
tgold_yrs = tgold_rgdpna_pc.index.get_level_values("year").unique()
# every year from the first to the last Treadgold year; interpolation is linear
# in logs
tgold_interp_yrs = list(range(tgold_yrs.min(), tgold_yrs.max() + 1))
tgold_interp_vals = np.exp(
    np.interp(
        tgold_interp_yrs, tgold_yrs, np.log(tgold_rgdpna_pc.treadgold_rgdpna_pc.values)
    )
)
y_pwt_clean.loc[("NFK", tgold_interp_yrs), "rgdpna_pc"] = tgold_interp_vals
y_pwt_clean.loc[("NFK", tgold_interp_yrs), "gdp_source"] = "Treadgold_ratio_PWT"

# calculating the ratios separately
# (NFK/AUS ratio averaged over the first two interpolated years ...)
nfk_aus_ratio_early = (
    y_pwt_clean.loc[("NFK", tgold_interp_yrs[0:2]), "rgdpna_pc"].values
    / y_pwt_clean.loc[("AUS", tgold_interp_yrs[0:2]), "rgdpna_pc"].values
).mean()

# (... and over the last two interpolated years)
nfk_aus_ratio_later = (
    y_pwt_clean.loc[("NFK", tgold_interp_yrs[-2:]), "rgdpna_pc"].values
    / y_pwt_clean.loc[("AUS", tgold_interp_yrs[-2:]), "rgdpna_pc"].values
).mean()

# 1950 uses the early ratio; every other year of 1951-2020 outside the
# interpolated window uses the later ratio
# NOTE(review): this presumes the Treadgold window starts in 1951, so that no
# year before the window other than 1950 receives the later ratio -- confirm
y_pwt_clean.loc[("NFK", 1950), "rgdpna_pc"] = (
    nfk_aus_ratio_early * y_pwt_clean.loc[("AUS", 1950), "rgdpna_pc"]
)
nfk_noyrs = np.setdiff1d(list(range(1951, 2021)), tgold_interp_yrs)
y_pwt_clean.loc[("NFK", nfk_noyrs), "rgdpna_pc"] = (
    nfk_aus_ratio_later * y_pwt_clean.loc[("AUS", nfk_noyrs), "rgdpna_pc"].values
)
y_pwt_clean.loc[("NFK", [1950] + list(nfk_noyrs)), "gdp_source"] = "AUS_extrap"

### Statistics Netherlands information on `BES`

Nominal GDP values for Bonaire, Saba, and Sint Eustatius are shown separately (for the years 2012 and 2017, in this [Statistics Netherlands file](https://www.cbs.nl/en-gb/publication/2020/41/trends-in-the-caribbean-netherlands-2020)). We will add them together, interpolate, and create ratios with the Netherlands information to fill in the missing pieces as well.

In [None]:
# numbers correspond to GDP for Bonaire, Saba, and Eustatius (in mil. of nominal USD)
bes_yrs = list(range(2012, 2018))
bes_2012, bes_2017 = 372 + 42 + 101, 428 + 47 + 108
# nominal GDP per capita in 2012 and 2017 (millions of USD / millions of people)
bes_pc_12_17 = (
    np.array([bes_2012, bes_2017])
    / pop_cleaned.loc[("BES", [2012, 2017]), "pop"].values
)

# BES/NLD ratio of nominal GDPpc in 2012 and 2017, and its average
bes_ratio = bes_pc_12_17 / gp_df.loc[("NLD", [2012, 2017]), "wb_gdp_nom_pc"].values
bes_avg_ratio = bes_ratio.mean()
# the nominal ratios are applied to the Dutch constant PPP GDPpc to get BES values
# in the same terms for 2012 and 2017 ...
bes_ppp_pc_12_17 = (
    bes_ratio * y_pwt_clean.loc[("NLD", [2012, 2017]), "rgdpna_pc"].values
)
# ... which are interpolated (linearly in logs) for 2012-2017
bes_ppp_pc_12_17_interp = np.exp(
    np.interp(bes_yrs, [2012, 2017], np.log(bes_ppp_pc_12_17))
)
# remaining years of 1950-2020: average ratio times the Dutch GDPpc
bes_noyrs = np.setdiff1d(list(range(1950, 2021)), bes_yrs)
bes_ppp_pc_not12_17 = (
    y_pwt_clean.loc[("NLD", bes_noyrs), "rgdpna_pc"].values * bes_avg_ratio
)
y_pwt_clean.loc[("BES", bes_yrs), "rgdpna_pc"] = bes_ppp_pc_12_17_interp
y_pwt_clean.loc[("BES", bes_yrs), "gdp_source"] = "NLD_STAT"
y_pwt_clean.loc[("BES", bes_noyrs), "rgdpna_pc"] = bes_ppp_pc_not12_17
y_pwt_clean.loc[("BES", bes_noyrs), "gdp_source"] = "NLD_extrap"

###  Information from CEROM, Saint Barthelemy (for `BLM`)

In [None]:
# getting the nominal values
# (years and values of the non-missing CEROM observations for BLM)
cerom = gp_df.loc["BLM", ["cerom_gdppc"]]
cerom_yrs = cerom.loc[~pd.isnull(cerom.cerom_gdppc), :].index.values
cerom_vals = cerom.loc[~pd.isnull(cerom.cerom_gdppc), "cerom_gdppc"].values

# ratio with the nominal, french gdppc
# (the ratios are then applied to the French constant PPP GDPpc and interpolated,
# linearly in logs, between the first and the last CEROM year)
blm_ratio = cerom_vals / gp_df.loc[("FRA", cerom_yrs), "wb_gdp_nom_pc"].values
blm_y_vals = blm_ratio * y_pwt_clean.loc[("FRA", cerom_yrs), "rgdpna_pc"].values
blm_interp_yrs = list(range(cerom_yrs.min(), cerom_yrs.max() + 1))
y_pwt_clean.loc[("BLM", blm_interp_yrs), "rgdpna_pc"] = np.exp(
    np.interp(blm_interp_yrs, cerom_yrs, np.log(blm_y_vals))
)

# rest of the years
# (average ratio times the French GDPpc)
blm_noyrs = np.setdiff1d(list(range(1950, 2021)), blm_interp_yrs)
y_pwt_clean.loc[("BLM", blm_noyrs), "rgdpna_pc"] = (
    y_pwt_clean.loc[("FRA", blm_noyrs), "rgdpna_pc"].values * blm_ratio.mean()
)

y_pwt_clean.loc[("BLM", blm_noyrs), "gdp_source"] = "FRA_extrap"
y_pwt_clean.loc[("BLM", blm_interp_yrs), "gdp_source"] = "CEROM"

### Information from the Australian Census (for `CCK` and `CXR`)

Again, we only have the GDP of these areas for the year 2010, and only in nominal terms. Therefore, we will again rely on the ratio with the relevant sovereign country (here, `AUS`).

In [None]:
# ratios in 2010
# (nominal GDP per capita of each territory relative to Australia's)
aus_2010 = gp_df.loc[("AUS", 2010), "wb_gdp_nom_pc"]
cxr_2010 = (
    gp_df.loc[("CXR", 2010), "aus_census_nom_gdp"]
    / pop_cleaned.loc[("CXR", 2010), "pop"]
)
cck_2010 = (
    gp_df.loc[("CCK", 2010), "aus_census_nom_gdp"]
    / pop_cleaned.loc[("CCK", 2010), "pop"]
)
cxr_aus_r, cck_aus_r = cxr_2010 / aus_2010, cck_2010 / aus_2010

# apply the ratios
# (the single 2010 ratio scales Australia's GDPpc for every year in 1950-2020)
cxr_vals = y_pwt_clean.loc[("AUS", range(1950, 2021)), "rgdpna_pc"].values * cxr_aus_r
cck_vals = y_pwt_clean.loc[("AUS", range(1950, 2021)), "rgdpna_pc"].values * cck_aus_r
y_pwt_clean.loc[("CXR", list(range(1950, 2021))), "rgdpna_pc"] = cxr_vals
y_pwt_clean.loc[("CCK", list(range(1950, 2021))), "rgdpna_pc"] = cck_vals

# sources
# (all years are extrapolations, except 2010 which carries its own label)
y_pwt_clean.loc[("CXR", list(range(1950, 2021))), "gdp_source"] = "AUS_extrap"
y_pwt_clean.loc[("CCK", list(range(1950, 2021))), "gdp_source"] = "AUS_extrap"
y_pwt_clean.loc[("CCK", 2010), "gdp_source"] = "AUS_parliament"
y_pwt_clean.loc[("CXR", 2010), "gdp_source"] = "AUS_parliament"

### Svalbard and Jan Mayen (`SJM`)

As mentioned above, we do not have reliable metrics for GDPpc or GDP for `SJM`; so we will copy Norway's information.

In [None]:
# SJM: copy Norway's GDP per capita for 1950-2020, position by position
nor_rgdpna_pc = y_pwt_clean.loc[("NOR", list(range(1950, 2021))), "rgdpna_pc"].values
y_pwt_clean.loc[("SJM", list(range(1950, 2021))), "rgdpna_pc"] = nor_rgdpna_pc
# every SJM row is labelled as a copy of Norway
y_pwt_clean.loc["SJM", "gdp_source"] = "NOR_copy"

### United States Minor Outlying Islands (`UMI`)

We will use the most similar U.S. territory, which is the Northern Mariana Islands (`MNP`).

In [None]:
# UMI: copy the MNP values for the years in which MNP has a non-missing value;
# the other years of UMI are left untouched by this cell
mnp = y_pwt_clean.loc["MNP", :].copy()
mnpyrs = np.unique(mnp.loc[~pd.isnull(mnp.rgdpna_pc), :].index.get_level_values("year"))
y_pwt_clean.loc[("UMI", mnpyrs), "rgdpna_pc"] = y_pwt_clean.loc[
    ("MNP", mnpyrs), "rgdpna_pc"
].values
y_pwt_clean.loc[("UMI", mnpyrs), "gdp_source"] = "MNP_copy"

### Pitcairn Island (`PCN`): take the ratio with `GBR`

In [None]:
# PCN/GBR ratio of nominal GDP per capita in 2006; the population comes from the
# `pop` column that was attached to `y_pwt_clean` in the OECD step
pcn_ratio = (
    gp_df.loc[("PCN", 2006), "pcn_nom_gdp"]
    / y_pwt_clean.loc[("PCN", 2006), "pop"]
    / gp_df.loc[("GBR", 2006), "wb_gdp_nom_pc"]
)
# the single 2006 ratio scales the British GDPpc for every year in 1950-2020
pcn_rgdpna_pc = (
    pcn_ratio * y_pwt_clean.loc[("GBR", list(range(1950, 2021))), "rgdpna_pc"].values
)
y_pwt_clean.loc[("PCN", list(range(1950, 2021))), "rgdpna_pc"] = pcn_rgdpna_pc
y_pwt_clean.loc["PCN", "gdp_source"] = "GBR_ratio"

### Cleaning up for other territories (current and former)

In general, the reason for assigning country-sovereignty ratios instead of ratios between similar countries is based on the argument in [**Bertram (World Development, 2003)**](https://www.sciencedirect.com/science/article/abs/pii/S0305750X03002134) that territories (or island economies in the paper, to be more specific) seem to converge to trend with their metropolitan patrons more so than with similar territory (island) economies. Based on this idea, we will fill in the missing years' GDPpc data using the nearest 5-year average ratio (e.g., if years before 2000 are missing, use the country-sovereignty GDPpc ratio from 2000-2004 [averaged] to extrapolate for the missing years).

The following territory-sovereignty relationships are considered (excluding the ones dealt with above, but including *previous* territory-sovereignty relationships mentioned in Bertram (2003)). Note that if a territory's series is already completely filled (1950-2020), it will not be further extrapolated; for the former territories, the year in parentheses is that of gaining independence:
- Current `GBR`: `IMN`, `JEY`, `GGY`, `AIA`, `BMU`, `IOT`, `VGB`, `CYM`, `FLK`, `GIB`, `MSR`, `SHN`, `TCA`
- Former `GBR`: `VCT` (1979), `DMA` (1976), `GRD` (1974), `KNA` (1983), `ATG` (1981), `BHS` (1973), `MDV` (1966), `KIR` (1979), `TUV` (1978), `SLB` (1978), `TON` (1970), `FJI` (1970), `VUT` (1980; also managed by `FRA`)
- Current `FRA`: `GUF`, `GLP`, `MTQ`, `MYT`, `REU`, `SPM`, `MAF`, `BLM`, `PYF`, `WLF`, `NCL`
- Former `FRA`: `COM` (1975)
- Current `NLD`: `SXM`, `ABW`, `BES`, `CUW`
- Current `DNK`: `GRL`, `FRO`
- Current `NZL`: `NIU`, `WSM`, `COK`, `TKL`
- Current `USA`: `VIR`, `GUM`, `UMI`, `ASM`
- Former `USA` (including Free Association): `MHL`, `FSM`, `PLW`
- Former `AUS`: `NRU`

In [None]:
def fill_using_simple_ratio(
    terr_code, sov_code, df, col="rgdpna_pc", source_col="gdp_source"
):
    """Extrapolate the leading and/or trailing missing values of `col` for the
    country `terr_code`, using the average ratio between `terr_code` and `sov_code`
    over the nearest known years (at most 5). This is based on the observation in
    Bertram (World Development, 2003) that GDPpc of (island) territories tend to
    converge to sovereign GDPpc.

    Only gaps touching the first (1950) or the last (2020) year are filled; missing
    values in the interior of the series are left untouched. `df` is modified in
    place and also returned.

    Parameters
    ----------
    terr_code : str
        country code for country/region that belongs to or was associated with the
        country/region represented by `sov_code`
    sov_code : str
        country code for country/region that had or still has legal control over the
        country/region represented by `terr_code`
    df : pandas.DataFrame
        that contains the country/region-level information; should have the columns
        `col` and `source_col`, and be multi-indexed by `ccode` and `year` (denoting
        country code and year)
    col : str
        column containing data that needs imputation/extrapolation (by using the ratio
        of values from the two countries)
    source_col : str
        column containing source data; years that are extrapolated here are labeled
        as "{sov_code}_extrap"

    Returns
    -------
    df : pandas.DataFrame
        containing the original information from the pre-modified `df` and the newly
        imputed/extrapolated information. It is returned unchanged if `terr_code`
        has no missing values, or has no known values to build a ratio from.

    """
    ALL_YRS = list(range(1950, 2021))
    is_terr = df.index.get_level_values("ccode") == terr_code
    msng = df.loc[pd.isnull(df[col]) & is_terr, :].index.get_level_values("year").unique()

    if len(msng) == 0:
        return df

    filled = np.sort(np.setdiff1d(ALL_YRS, msng))

    # without a single known value there is no ratio to anchor on
    if len(filled) == 0:
        return df

    # (years to average the ratio over, years to extrapolate) for each open end
    edges = []
    if ALL_YRS[-1] in msng:
        edges.append((filled[-5:], msng[msng > filled[-1]]))
    if ALL_YRS[0] in msng:
        edges.append((filled[0:5], msng[msng < filled[0]]))

    extrapolated = []
    for years, to_fill in edges:
        years, to_fill = list(years), list(to_fill)
        avg_ratio = (
            df.loc[(terr_code, years), col].values
            / df.loc[(sov_code, years), col].values
        ).mean()
        df.loc[(terr_code, to_fill), col] = (
            avg_ratio * df.loc[(sov_code, to_fill), col].values
        )
        extrapolated += to_fill

    # label only the years that were extrapolated above; interior gaps are not
    # filled by this function and therefore keep their existing source label
    if extrapolated:
        df.loc[(terr_code, extrapolated), source_col] = "{}_extrap".format(sov_code)

    return df

#### `GBR` Territories (former and current)

In [None]:
# Territories (current first, then former) whose GDPpc gaps are extrapolated
# from the `GBR` series
current_terr_gbr = (
    ["IMN", "JEY", "GGY", "AIA", "BMU", "VGB"]
    + ["CYM", "FLK", "GIB", "MSR", "SHN", "TCA"]
)
former_terr_gbr = (
    ["VCT", "DMA", "GRD", "KNA", "ATG", "BHS", "MDV"]
    + ["KIR", "TUV", "SLB", "TON", "FJI", "VUT"]
)

# former territories are processed before the current ones
for terr in former_terr_gbr + current_terr_gbr:
    y_pwt_clean = fill_using_simple_ratio(terr, "GBR", y_pwt_clean)

#### `FRA` Territories (former and current)

In [None]:
# Territories whose GDPpc gaps are extrapolated from the `FRA` series
former_terr_fra = ["COM"]
current_terr_fra = (
    ["GUF", "GLP", "MTQ", "MYT", "REU"]
    + ["SPM", "MAF", "BLM", "PYF", "WLF", "NCL"]
)

# former territories are processed before the current ones
for terr in former_terr_fra + current_terr_fra:
    y_pwt_clean = fill_using_simple_ratio(terr, "FRA", y_pwt_clean)

#### `NLD` Territories (current)

In [None]:
# Territories whose GDPpc gaps are extrapolated from the `NLD` series
current_terr_nld = "SXM ABW BES CUW".split()
for terr in current_terr_nld:
    y_pwt_clean = fill_using_simple_ratio(terr, "NLD", y_pwt_clean)

#### `DNK` Territories (current)

In [None]:
# Territories whose GDPpc gaps are extrapolated from the `DNK` series
# (named like the sibling `current_terr_*` lists; these are current Danish
# territories)
current_terr_dnk = ["GRL", "FRO"]
for i in current_terr_dnk:
    y_pwt_clean = fill_using_simple_ratio(i, "DNK", y_pwt_clean)

#### `NZL` Territories (current)

In [None]:
# Territories whose GDPpc gaps are extrapolated from the `NZL` series
current_terr_nzl = "NIU WSM COK TKL".split()
for terr in current_terr_nzl:
    y_pwt_clean = fill_using_simple_ratio(terr, "NZL", y_pwt_clean)

#### `USA` Territories (former and current)

In [None]:
# Territories whose GDPpc gaps are extrapolated from the `USA` series
former_terr_usa = "MHL FSM PLW".split()
current_terr_usa = "VIR GUM UMI ASM".split()

# former territories are processed before the current ones
for terr in former_terr_usa + current_terr_usa:
    y_pwt_clean = fill_using_simple_ratio(terr, "USA", y_pwt_clean)

#### `AUS` Territories (former)

In [None]:
# Territories whose GDPpc gaps are extrapolated from the `AUS` series
former_terr_aus = ["NRU"]
for terr in former_terr_aus:
    y_pwt_clean = fill_using_simple_ratio(terr, "AUS", y_pwt_clean)

### City-states, city-territories, or microstates

Will use a similar tactic as above. We match it as follows:
- `AND`: `FRA`
- `MCO`: `FRA`
- `MAC`: `HKG`
- `VAT`: `ITA`

In [None]:
# Microstates / city-territories, paired position-by-position with the country
# whose series is used for the extrapolation
micro_to_extrap = ["AND", "MCO", "MAC", "VAT"]
micro_from_extrap = ["FRA", "FRA", "HKG", "ITA"]
for micro, reference in zip(micro_to_extrap, micro_from_extrap):
    y_pwt_clean = fill_using_simple_ratio(micro, reference, y_pwt_clean)

### Extrapolating based on similar trends

For the remaining countries, we will detect similar-trending countries and use their average trends to fill in the missing pieces. Note that we do not use trends from countries whose information has already been extrapolated using other countries' trends (e.g., through a territory-sovereignty relationship).

In [None]:
## splitting the country codes into extrapolation targets (`to_extrap`) and
## reference countries (`from_extrap`); everything else ends up in `exclude_extrap`
combined_ccodes = ["GGY+JEY", "CHI", "FRA+OV"]
all_ccodes = y_pwt_clean.index.get_level_values("ccode").unique()

to_extrap, from_extrap = [], []
for i in all_ccodes:
    # any missing GDPpc value makes this country a target, never a reference
    i_sum = pd.isnull(y_pwt_clean.loc[i, "rgdpna_pc"].values).sum()
    if i_sum > 0:
        if i not in combined_ccodes:
            to_extrap.append(i)
        continue

    # complete series: usable as a reference only if none of its values carries
    # a ratio-, extrapolation- or copy-based source label
    i_source = np.unique(y_pwt_clean.loc[i, "gdp_source"].values)
    i_no_extraped = not any(
        (sour == "GBR_ratio") or ("extrap" in sour) or ("copy" in sour)
        for sour in i_source
    )
    if i_no_extraped:
        from_extrap.append(i)

exclude_extrap = np.setdiff1d(all_ccodes, to_extrap + from_extrap)

In [None]:
## extrapolation process
yrs_extrap = range(1950, 2021)

# `rgdpna_pc` reorganized from long (vertical) to wide (horizontal) format
gdppc_wide = ypk_fn.organize_ver_to_hor(
    y_pwt_clean.copy(), "rgdpna_pc", "year", "ccode", yrs_extrap
)

# codes that are kept out of the extrapolation altogether
never_used = list(exclude_extrap) + ["GGY+JEY", "CHI", "FRA+OV"]
gdppc_extrap = ypk_fn.extrap_using_closest(
    to_extrap, gdppc_wide, begin_end=[1950, 2020], exclude_these=never_used
)

# back to long format, stored under the name `rgdpna_pc_extrap`
gdppc_extrap = ypk_fn.organize_hor_to_ver(
    gdppc_extrap, "ccode", None, "rgdpna_pc_extrap", yrs=yrs_extrap
)

In [None]:
## merging this back to the original dataframe
y_pwt_clean = y_pwt_clean.merge(
    gdppc_extrap, how="left", left_index=True, right_index=True
)

# rows without a source label take the label stored in `msng_fill`
no_source = y_pwt_clean["gdp_source"].isnull()
y_pwt_clean.loc[no_source, "gdp_source"] = y_pwt_clean.loc[
    no_source, "msng_fill"
].values

# rows without a GDPpc value take the extrapolated one
no_gdppc = y_pwt_clean["rgdpna_pc"].isnull()
y_pwt_clean.loc[no_gdppc, "rgdpna_pc"] = y_pwt_clean.loc[
    no_gdppc, "rgdpna_pc_extrap"
].values

In [None]:
## finalizing the rgdpna_pc series
# the combined codes are dropped, together with the helper columns
is_combined = y_pwt_clean.index.get_level_values("ccode").isin(
    ["FRA+OV", "GGY+JEY", "CHI"]
)
y_clean = y_pwt_clean.loc[~is_combined, :].sort_index()
y_clean = y_clean.drop(["rgdpna_pc_extrap", "msng_fill"], axis=1)

# appending the rows of `y_uninh`, keeping only the value and source columns
y_clean = pd.concat([y_clean, y_uninh])
y_clean = y_clean[["rgdpna_pc", "gdp_source"]].sort_index()

### Cleaning up for Mainland France

Currently, the values recorded in `rgdpna_pc` for `FRA` are per capita of the overall French population (including the 5 overseas departments). Therefore, in order to keep things consistent, we will 1) multiply the `rgdpna_pc` values of overall France and of each overseas department by their respective populations to get `rgdpna` values, 2) subtract the five overseas departments' `rgdpna` values from the overall French `rgdpna`, and 3) divide by the mainland French population to get the mainland-specific `rgdpna_pc` values.

In [None]:
# France plus its five overseas departments, with the `pop` column of `gp_df`
# attached for `FRA`
fra_terr = ["GUF", "GLP", "MTQ", "MYT", "REU"]
fra_terr_dfs = (
    y_clean.loc[["FRA"] + fra_terr, :]
    .copy()
    .merge(gp_df.loc[["FRA"], ["pop"]], left_index=True, right_index=True, how="left")
)

# 2020 population: the 2019 value scaled by the 2019-to-2020 change in `pop_cleaned`
fra_pop_2019 = fra_terr_dfs.loc[("FRA", 2019), "pop"]
fra_clean_2020 = pop_cleaned.loc[("FRA", 2020), "pop"]
fra_clean_2019 = pop_cleaned.loc[("FRA", 2019), "pop"]
fra_terr_dfs.loc[("FRA", 2020), "pop"] = fra_pop_2019 * fra_clean_2020 / fra_clean_2019

In [None]:
# preview only the first rows instead of rendering the whole frame
fra_terr_dfs.head()

In [None]:
# NOTE(review): the cell that follows starts with these same statements, so this
# cell looks redundant -- confirm before removing
fra_terr = ["GUF", "GLP", "MTQ", "MYT", "REU"]
fra_terr_dfs = (
    y_clean.loc[["FRA"] + fra_terr, :]
    .copy()
    .merge(gp_df.loc[["FRA"], ["pop"]], left_index=True, right_index=True, how="left")
)

In [None]:
# cleaning up the mainland france
fra_terr = ["GUF", "GLP", "MTQ", "MYT", "REU"]
yrs_tgt = list(range(1950, 2021))
fra_all_yrs = ("FRA", yrs_tgt)

# France and its overseas departments, with the `pop` column of `gp_df` for FRA
fra_terr_dfs = (
    y_clean.loc[["FRA"] + fra_terr, :]
    .copy()
    .merge(gp_df.loc[["FRA"], ["pop"]], left_index=True, right_index=True, how="left")
)

# 2020 population: the 2019 value scaled by the 2019-to-2020 change in `pop_cleaned`
fra_pop_2019 = fra_terr_dfs.loc[("FRA", 2019), "pop"]
fra_clean_2020 = pop_cleaned.loc[("FRA", 2020), "pop"]
fra_clean_2019 = pop_cleaned.loc[("FRA", 2019), "pop"]
fra_terr_dfs.loc[("FRA", 2020), "pop"] = fra_pop_2019 * fra_clean_2020 / fra_clean_2019

# overseas departments take their populations from `pop_cleaned`
for terr in fra_terr:
    terr_all_yrs = (terr, yrs_tgt)
    fra_terr_dfs.loc[terr_all_yrs, "pop"] = pop_cleaned.loc[terr_all_yrs, "pop"].values

# total GDP = per-capita GDP x population, then FRA minus the overseas total
fra_terr_dfs["rgdpna"] = fra_terr_dfs["rgdpna_pc"] * fra_terr_dfs["pop"]
overseas_rgdpna = (
    fra_terr_dfs.loc[fra_terr, "rgdpna"].groupby(level="year").sum().values
)
fra_terr_dfs.loc[fra_all_yrs, "rgdpna"] = (
    fra_terr_dfs.loc[fra_all_yrs, "rgdpna"].values - overseas_rgdpna
)

# only now is the FRA population replaced by the `pop_cleaned` one, which is
# then used as the denominator of the per-capita value
fra_terr_dfs.loc[fra_all_yrs, "pop"] = pop_cleaned.loc[fra_all_yrs, "pop"].values
fra_terr_dfs.loc[fra_all_yrs, "rgdpna_pc"] = (
    fra_terr_dfs.loc[fra_all_yrs, "rgdpna"].values
    / fra_terr_dfs.loc[fra_all_yrs, "pop"].values
)

# re-attaching with the cleaned GDPpc dataset
y_clean.loc[fra_all_yrs, "rgdpna_pc"] = fra_terr_dfs.loc[
    fra_all_yrs, "rgdpna_pc"
].values

### Creating GDP (`rgdpna`) values

This will be simpler to execute, by appending the cleaned population dataset.

In [None]:
# attach the cleaned population, add unit labels, and derive total GDP as
# per-capita GDP x population
y_clean = y_clean.merge(
    pop_cleaned, how="left", left_index=True, right_index=True
).assign(
    gdp_unit="millions (PPP USD)",
    gdppc_unit="ones (PPP USD)",
    rgdpna=lambda d: d["rgdpna_pc"] * d["pop"],
)

## Filling in the missing values for the `cgdpo` (current PPP 2017 USD) series

### Transforming the `rgdpna_pc` series to `cgdpo_pc` equivalents

In [None]:
y_clean = y_clean.merge(
    ppp_to_2017[["conv"]], left_index=True, right_index=True, how="left"
)

## neutral assumption: a missing conversion rate is treated as 1
y_clean["conv"] = y_clean["conv"].fillna(1)

## copying the 2019 conversion to 2020 conversion
by_year = pd.IndexSlice
y_clean.loc[by_year[:, 2020], "conv"] = y_clean.loc[by_year[:, 2019], "conv"].values

y_clean["cgdpo_pc_equiv"] = y_clean["rgdpna_pc"].div(y_clean["conv"])

### Attaching the actual `cgdpo` values from PWT, and creating `cgdpo_pc`

In doing so, we will again try to clean up for the issue with French mainland.

In [None]:
# `cgdpo` and `pop` of `gp_df`, suffixed with `_pwt` so that they do not clash
# with the columns already in `y_clean`
col_dict = {"cgdpo": "cgdpo_pwt", "pop": "pop_pwt"}
pwt_cols = gp_df[list(col_dict)].rename(columns=col_dict)
y_clean = y_clean.merge(pwt_cols, how="left", left_index=True, right_index=True)

Briefly separating out the French territories, and dealing with their numbers first

In [None]:
## gathering the French overseas department values
y_clean_fra_terr = y_clean.loc[fra_terr, :].copy()

# `cgdpo_pc_equiv` is per capita, so it has to be multiplied by population
# before it can be subtracted from the *total* French `cgdpo_pwt` below (same
# logic as for `rgdpna` in the mainland France clean-up)
y_clean_fra_terr["cgdpo"] = (
    y_clean_fra_terr["cgdpo_pc_equiv"] * y_clean_fra_terr["pop"]
)

# yearly total over the overseas departments; only the numeric column is summed
fra_terr_cgdpo = y_clean_fra_terr.groupby(level="year")["cgdpo"].sum().values

## subtracting this from the `cgdpo_pwt` values in `y_clean` (for FRA)
y_clean.loc[("FRA", yrs_tgt), "cgdpo_pwt"] = (
    y_clean.loc[("FRA", yrs_tgt), "cgdpo_pwt"].values - fra_terr_cgdpo
)

Creating `cgdpo_pc` in PWT version (`cgdpo_pc_pwt`) but with the cleaned population

In [None]:
# PWT total divided by the cleaned population
y_clean["cgdpo_pc_pwt"] = y_clean["cgdpo_pwt"].div(y_clean["pop"])

### Smooth-filling the missing values

In [None]:
## creating a xr.Dataset with the cgdpo_pc variables
cgdpo_vars = ["cgdpo_pc_pwt", "cgdpo_pc_equiv"]
cgdpo_ds = xr.Dataset.from_dataframe(y_clean[cgdpo_vars])

## smooth_fill, then back to a dataframe
cgdpo_filled = ypk_fn.smooth_fill(
    cgdpo_ds["cgdpo_pc_pwt"],
    cgdpo_ds["cgdpo_pc_equiv"],
    time_dim="year",
    other_dim="ccode",
)
cgdpo_clean_up = cgdpo_filled.to_dataframe()

## attaching the result under the name `cgdpo_pc`
y_clean = y_clean.merge(
    cgdpo_clean_up.rename(columns={"cgdpo_pc_pwt": "cgdpo_pc"}),
    how="left",
    left_index=True,
    right_index=True,
)

## creating cgdpo values
y_clean["cgdpo"] = y_clean["cgdpo_pc"].mul(y_clean["pop"])

## Creating the current PPP-2019 USD `cgdpo` and constant PPP-2019 USD `rgdpna`

In [None]:
## some minor clean-up of names
# helper columns are no longer needed
y_clean = y_clean.drop(
    columns=["cgdpo_pwt", "cgdpo_pc_pwt", "pop_pwt", "cgdpo_pc_equiv", "conv"]
)

# the GDP and GDPpc columns get a `_17` suffix
ren_names = ["cgdpo_pc", "cgdpo", "rgdpna_pc", "rgdpna"]
y_clean = y_clean.rename(columns={name: name + "_17" for name in ren_names})

Creating `cgdpo_19` and `cgdpo_pc_19`

In [None]:
# PWT indexed by (ccode, year)
pwt100 = (
    pd.read_excel(sset.PATH_PWT_RAW)
    .rename(columns={"countrycode": "ccode"})
    .set_index(["ccode", "year"])
)

# ratio of the USA `pl_gdpo` values of 2019 and 2017
usa_pl_2019 = pwt100.loc[("USA", 2019), "pl_gdpo"]
usa_pl_2017 = pwt100.loc[("USA", 2017), "pl_gdpo"]
infla_1719 = usa_pl_2019 / usa_pl_2017

y_clean["cgdpo_19"] = y_clean["cgdpo_17"].mul(infla_1719)
y_clean["cgdpo_pc_19"] = y_clean["cgdpo_19"].div(y_clean["pop"])

Creating `rgdpna_19` and `rgdpna_pc_19`

In [None]:
ccodes = y_clean.index.get_level_values("ccode").unique()

# float (not integer) zero, so that the float series written below match the
# column's dtype
y_clean["rgdpna_19"] = 0.0
for cc in tqdm(ccodes):
    cc_17 = y_clean.loc[(cc, 2019), "rgdpna_17"]
    # a zero 2019 value cannot be used as the base of the rescaling; the
    # series is left at zero in that case
    if cc_17 == 0:
        continue
    # the `rgdpna_17` series relative to its 2019 value, times `cgdpo_19` of 2019
    cc_vals = (
        y_clean.loc[(cc, yrs_tgt), "rgdpna_17"].values
        / cc_17
        * y_clean.loc[(cc, 2019), "cgdpo_19"]
    )
    y_clean.loc[(cc, yrs_tgt), "rgdpna_19"] = cc_vals

y_clean["rgdpna_pc_19"] = y_clean["rgdpna_19"] / y_clean["pop"]

Filling the `nan`s in with zeros (which are from zero population)

In [None]:
# missing per-capita values are set to zero
for pc_col in ["rgdpna_pc_19", "cgdpo_pc_19"]:
    y_clean[pc_col] = y_clean[pc_col].fillna(0)

## I-Y (investment to GDP) ratios and `delta` (depreciation rate) cleanup

We do not extrapolate for all the missingness of I-Y ratio here since that will be done by notebooks to follow.

In [None]:
## from PWT: `csh_i` (stored as `iy_ratio`) and `delta`
pwt_iy_delta = pwt100[["csh_i", "delta"]].rename(columns={"csh_i": "iy_ratio"})
y_clean = y_clean.merge(
    pwt_iy_delta, left_index=True, right_index=True, how="left"
)

## from WB WDI: `NE.GDI.FTOT.ZS` (stored as `wb_iy_ratio`)
wb_wdi = pd.read_parquet(sset.DIR_WB_WDI_RAW / "wdi_pop_iy_gdp.parquet")
wb_iy = wb_wdi[["NE.GDI.FTOT.ZS"]].rename(
    columns={"NE.GDI.FTOT.ZS": "wb_iy_ratio"}
)
y_clean = y_clean.merge(wb_iy, left_index=True, right_index=True, how="left")

## these values are in percentages, so change accordingly
y_clean["wb_iy_ratio"] = y_clean["wb_iy_ratio"].div(100)

In [None]:
# adding in the IMF iy ratios, while trying to avoid the ArrowInvalid error
imf = pd.read_excel(sset.PATH_IMF_WEO_RAW, na_values=["n/a", "--"]).rename(
    columns={"ISO": "ccode", "Subject Descriptor": "subject"}
)
imf = imf.loc[imf.ccode.isin(sset.ALL_ISOS_EXTENDED), :]

# renaming and organizing in vertical format
imf.loc[imf.subject == "Total investment", "subject"] = "imf_iy_ratio"
imf_yrs = range(1980, 2021)
v_names = {yr: "v_" + str(yr) for yr in imf_yrs}
imf = imf.rename(columns=v_names)

# one row per country, one `v_<year>` column per year
is_iy_row = imf.subject == "imf_iy_ratio"
imf_reorg = imf.loc[is_iy_row, ["ccode"] + list(v_names.values())]
imf_reorg = imf_reorg.set_index(["ccode"])
imf_reorg = ypk_fn.organize_hor_to_ver(
    imf_reorg, "ccode", None, "imf_iy_ratio", "v_", imf_yrs
)

# divided by 100, as done for the WB ratio
imf_reorg["imf_iy_ratio"] = imf_reorg["imf_iy_ratio"] / 100

# merging
# NOTE(review): this is an outer join while the other merges into `y_clean` are
# left joins; index entries present only in `imf_reorg` would be added as new
# rows -- confirm this is intended
y_clean = y_clean.merge(imf_reorg, left_index=True, right_index=True, how="outer")

In [None]:
## IY ratio fill-in, not smoothly; PWT -> WB -> IMF
has_pwt = y_clean["iy_ratio"].notna().to_numpy()
has_wb = y_clean["wb_iy_ratio"].notna().to_numpy()
has_imf = y_clean["imf_iy_ratio"].notna().to_numpy()

## source labels: the first available source in the order PWT -> WB -> IMF,
## or "-" when none of the three has a value. The column is built as strings in
## one go (rather than starting from a float NaN column and writing strings
## into it)
y_clean["iy_ratio_source"] = np.select(
    [has_pwt, has_wb, has_imf], ["PWT", "WB", "IMF"], default="-"
)

## filling in with WB values first, and with IMF values where still missing
y_clean["iy_ratio"] = (
    y_clean["iy_ratio"]
    .fillna(y_clean["wb_iy_ratio"])
    .fillna(y_clean["imf_iy_ratio"])
)
y_clean = y_clean.drop(columns=["wb_iy_ratio", "imf_iy_ratio"])

In [None]:
## filling in the delta information
y_clean["delta_source"] = "-"
y_clean.loc[~pd.isnull(y_clean.delta), "delta_source"] = "PWT"

### 2020 information is missing entirely, so just use 2019's values
y_clean.loc[(slice(None), 2020), "delta"] = y_clean.loc[
    (slice(None), 2019), "delta"
].values
y_clean.loc[
    (~pd.isnull(y_clean.delta)) & (y_clean.index.get_level_values("year") == 2020),
    "delta_source",
] = "PWT_copy_2019"

### using annual global averages when values are missing;
### the mean is taken over the `delta` column only: averaging the whole frame
### would also try to average the string columns, which pandas >= 2.0 rejects
gp_yrly_delta = y_clean.groupby(level="year")[["delta"]].mean()
y_clean = y_clean.merge(
    gp_yrly_delta.rename(columns={"delta": "delta_yr_avg"}),
    left_index=True,
    right_index=True,
    how="left",
)
y_clean.loc[pd.isnull(y_clean.delta), "delta"] = y_clean.loc[
    pd.isnull(y_clean.delta), "delta_yr_avg"
].values
y_clean.loc[y_clean.delta_source == "-", "delta_source"] = "yearly_global_avg"

## Capital ratio by category

In the case of missing data, we will again use the yearly global average and mark those values as such.

In [None]:
## capital information
PWT_RAW_DIR = Path(os.path.dirname(sset.PATH_PWT_RAW))
pwt_capital = pd.read_excel(PWT_RAW_DIR / "pwt_K_detail_100.xlsx").rename(
    columns={"countrycode": "ccode"}
)
capital_vals = ["Nc_Struc", "Nc_Mach", "Nc_TraEq", "Nc_Other"]
pwt_capital = pwt_capital.set_index(["ccode", "year"])[capital_vals].astype(
    "float64"
)

## ratio of capital in each category to total capital
pwt_capital["total_cap"] = pwt_capital[capital_vals].sum(axis=1)
newnames = []
for i in capital_vals:
    # e.g. "Nc_Struc" -> "struc_ratio_prep"
    newname = i.split("_")[-1].lower() + "_ratio_prep"
    newnames.append(newname)
    pwt_capital[newname] = pwt_capital[i] / pwt_capital["total_cap"]

y_clean = y_clean.merge(
    pwt_capital[newnames], left_index=True, right_index=True, how="left"
)

## the source label is keyed off the first ratio column
y_clean["k_ratio_source"] = "-"
y_clean.loc[~pd.isnull(y_clean[newnames[0]]), "k_ratio_source"] = "PWT"

## copying 2019 value into 2020 ones
for i in newnames:
    y_clean.loc[(slice(None), 2020), i] = y_clean.loc[(slice(None), 2019), i].values

## the 2020 label uses the same reference column as the "PWT" label above,
## instead of whichever column the loop variable happened to end on
y_clean.loc[
    (~pd.isnull(y_clean[newnames[0]]))
    & (y_clean.index.get_level_values("year") == 2020),
    "k_ratio_source",
] = "PWT_copy_2019"

k_ratio_names = []
for i in newnames:
    # e.g. "struc_ratio_prep" -> "struc_ratio"
    stem = i[0:-5]

    # yearly global average of this ratio, attached to every row of that year;
    # only column `i` is averaged, since averaging the whole frame would also
    # try to average the string columns, which pandas >= 2.0 rejects
    yr_avg = stem + "_yr_avg"
    y_clean[yr_avg] = y_clean.groupby(level="year")[i].transform("mean")

    # final ratio: the PWT-based value where available, the yearly average elsewhere
    name = "k_" + stem
    k_ratio_names.append(name)
    y_clean[name] = y_clean[i].fillna(y_clean[yr_avg])

    # the source label is keyed off the first ratio column only
    if i == newnames[0]:
        y_clean.loc[pd.isnull(y_clean[i]), "k_ratio_source"] = "yearly_global_avg"

We also add the "movable capital ratio" (`k_movable_ratio`), which is the sum of `k_mach_ratio`, `k_traeq_ratio`, and `k_other_ratio`.

In [None]:
# machinery + transport equipment + other
y_clean["k_movable_ratio"] = (
    y_clean["k_mach_ratio"]
    .add(y_clean["k_traeq_ratio"])
    .add(y_clean["k_other_ratio"])
)

## Exporting

Let us also clarify the current PPP, USD 2017 and constant 2017 PPP USD variables' names (by marking them with `_17`) to signal what units they are in.

In [None]:
## working on a copy of `y_clean` (due to pandas error), with the unit labels set
unit_labels = {
    "pop_unit": "millions (of people)",
    "gdp_unit": "millions (of USD)",
    "gdppc_unit": "ones (of USD)",
}
y_clean_cop = y_clean.assign(**unit_labels)

## reorganizing the columns: units, sources, then the values
unit_columns = ["pop_unit", "gdppc_unit", "gdp_unit"]
source_columns = [
    "pop_source",
    "gdp_source",
    "iy_ratio_source",
    "k_ratio_source",
    "delta_source",
]
value_columns = [
    "pop",
    "rgdpna_pc_17",
    "rgdpna_17",
    "rgdpna_pc_19",
    "rgdpna_19",
    "cgdpo_pc_17",
    "cgdpo_17",
    "cgdpo_pc_19",
    "cgdpo_19",
    "iy_ratio",
]
gp_columns = (
    unit_columns
    + source_columns
    + value_columns
    + k_ratio_names
    + ["k_movable_ratio", "delta"]
)
y_clean_cop = y_clean_cop[gp_columns].copy()

## Exporting
y_clean_cop.to_parquet(
    sset.DIR_YPK_INT / "gdp_gdppc_pop_capital_1950_2020_post_ypk3.parquet"
)