## Reorganizing raw data (GDP, GDPpc, and population) into long-panel format, converting to current and constant PPP terms, and handling missing data

Note that this notebook deals with historical data for the **years 2000-2020**.

In [1]:
# Enable IPython's autoreload extension in mode 2, so that imported modules
# are reloaded before each cell executes and edits to local packages are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
import xarray as xr
from tqdm.auto import tqdm

from sliiders import country_level_ypk as ypk_fn
from sliiders import settings as sset

## Importing all raw data, and creating a merged, long-panel version

### PWT 10.0

In [11]:
# Years covered by this notebook: 2000 through 2020, inclusive.
sliider_years = list(range(2000, 2021))

# Read the raw PWT table and expose the country identifier under the name `ccode`.
pwt100 = pd.read_excel(sset.PATH_PWT_RAW).rename(columns={"countrycode": "ccode"})

# Columns carried forward from PWT: the two identifiers plus the population and
# GDP series.
pwt_gdp_pop = ["ccode", "year", "pop", "rgdpo", "rgdpna", "cgdpo"]

# Long-panel layout: one row per (ccode, year) pair.
gdp_pop_df = pwt100[pwt_gdp_pop].set_index(["ccode", "year"])

# Restrict the panel to the years of interest.
in_sliider_years = gdp_pop_df.index.get_level_values("year").isin(sliider_years)
gdp_pop_df = gdp_pop_df.loc[in_sliider_years, :]

### WB WDI

In [12]:
# WB WDI
# WDI indicator code -> column name used in the rest of this notebook.
wdi_rename_dict = {
    "SP.POP.TOTL": "wb_pop",
    "NY.GDP.MKTP.PP.KD": "wb_rgdpna",
    "NY.GDP.PCAP.PP.KD": "wb_rgdpna_pc",
    "NY.GDP.MKTP.KD": "wb_gdp_nom",
    "NY.GDP.PCAP.KD": "wb_gdp_nom_pc",
}
wb_wdi = pd.read_parquet(sset.DIR_YPK_RAW / "wdi_pop_iy_gdp.parquet").rename(
    columns=wdi_rename_dict
)

# Keep only the rows whose country code and year are both in scope.
wb_in_scope = wb_wdi.index.get_level_values("ccode").isin(
    sset.ALL_ISOS_EXTENDED
) & wb_wdi.index.get_level_values("year").isin(sliider_years)
wb_wdi = wb_wdi.loc[wb_in_scope, :].reset_index()

# Unifying the country code conventions for Kosovo and Channel Islands
# NOTE(review): this recoding only has an effect if "XKX" / "CHI" pass the
# ALL_ISOS_EXTENDED filter above -- confirm against the settings module.
wb_wdi.loc[wb_wdi.ccode == "XKX", "ccode"] = "KO-"
wb_wdi.loc[wb_wdi.ccode == "CHI", "ccode"] = "GGY+JEY"
wb_wdi = wb_wdi.set_index(["ccode", "year"])

# re-scaling; the level series are currently in ones, so divide by one million
# to have them in PWT scales (the per-capita series are left untouched)
wb_wdi[["wb_rgdpna", "wb_gdp_nom", "wb_pop"]] /= 1000000

# merging
# `gdp_pop_df` is reassigned from itself, so on a second run of this cell it
# already carries the WDI columns and a plain `join` would raise ValueError
# (overlapping columns, no suffix). Drop the columns left by an earlier run
# first so that the cell is idempotent. PWT's own columns are excluded from the
# drop: a genuine name collision with PWT still raises instead of being
# silently overwritten.
stale_wb_cols = gdp_pop_df.columns.intersection(wb_wdi.columns).difference(
    pwt_gdp_pop
)
gdp_pop_df = gdp_pop_df.drop(columns=stale_wb_cols).join(wb_wdi, how="outer")