## Projecting capital stock values (2010-2100) according to Dellink et al. (2017)

In [None]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported automatically before each cell is executed.
%load_ext autoreload
%autoreload 2

In [None]:
from pathlib import Path
import os
import zipfile

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.api as sm
import xarray as xr
from dask_gateway import Gateway
from tqdm.auto import tqdm

## settings and utility functions for SLIIDERS
from sliiders import __file__ as slfile
from sliiders import country_level_ypk as ypk_fn
from sliiders import settings as sset

# dask gateway setup
# `gateway` is the handle used to request a cluster later in the notebook;
# `image_name` is the worker image name read from the SLIIDERS settings module.
gateway = Gateway()
image_name = sset.DASK_IMAGE

## Capital projection

We incorporate historical 2010 capital stock values and projected GDP, GDPpc, and population values.

### Importing and merging capital 2010 values

In [None]:
## historical series, plus projected gdp, gdppc, and population
hist_df = pd.read_parquet(
    sset.DIR_YPK_FINAL / "gdp_gdppc_pop_capital_1950_2020.parquet"
)
proj_yp_df = pd.read_parquet(
    sset.DIR_YPK_INT / "gdp_gdppc_pop_proj_2010_2100_post_ypk6.parquet"
)

## 2010 capital stock (`rnna_19`), renamed to `capital` and left-joined on the index
capital_2010 = hist_df.loc[(slice(None), 2010), ["rnna_19"]].rename(
    columns={"rnna_19": "capital"}
)
proj_ypk_df = proj_yp_df.merge(
    capital_2010, how="left", left_index=True, right_index=True
)

# readjusting the values to ones (of dollars and people): each listed column is
# multiplied by one million and gets a companion `<column>_unit` label
unit_labels = {
    "gdp": "ones (of USD)",
    "pop": "ones (of people)",
    "capital": "ones (of USD)",
}
for col, label in unit_labels.items():
    proj_ypk_df[col] *= 1000000
    proj_ypk_df[f"{col}_unit"] = label

### Getting the overall GDP elasticity with respect to capital

We first need to calculate the overall GDP elasticity w.r.t. capital. Here we assume a simple Cobb-Douglas production function, with population serving as an approximation of the labor force. Alternatively, we may use the IIASA approximation of this elasticity (from Crespo Cuaresma, 2017), which is approximately 0.326.

In [None]:
## 2010 cross-section of the projection dataset
k2010 = proj_ypk_df.loc[(slice(None), 2010), :]

# 2010 values are the same across all SSPs (but could differ across IAMs), so
# the SSP2 rows with strictly positive GDP are used for the elasticity estimate
is_ssp2 = k2010.index.get_level_values("ssp") == "SSP2"
gdp_is_positive = k2010.gdp > 0
k2010_pos_y = k2010.loc[gdp_is_positive & is_ssp2, :].sort_index()

# log-log OLS: ln(gdp) on a constant, ln(pop), and ln(capital)
log_regressors = sm.add_constant(np.log(k2010_pos_y[["pop", "capital"]]))
overall_elas_ols = sm.OLS(np.log(k2010_pos_y["gdp"]), log_regressors).fit()

# our estimate is the coefficient on ln(capital); the IIASA value is fixed
OVERALL_E = overall_elas_ols.params["capital"]
OVERALL_E_IIASA = 0.326

In [None]:
# for seeing the regression summary
# (bare last expression, so the fitted model's summary is rendered as the cell output)
overall_elas_ols.summary()

#### Calculating the initial marginal product of capital (${MPK}_{r, t_0}$, with $t_0 = 2010$) and appending other necessary information

**Four options of calculating MPK**

If we assume a simple, Cobb-Douglas form for the production function (i.e., $Y = AK^\alpha L^{1-\alpha}$), the marginal product of capital (MPK) can be written as:
$$ \frac{\partial Y}{\partial K} = \alpha \cdot \underbrace{A{K}^\alpha{L}^{1-\alpha}}_{=Y}\cdot \frac{1}{K} = \alpha \frac{Y}{K} = \alpha \frac{Y/L}{K/L} $$
and similarly if we are going to assume some form like $Y = AK^\alpha$, we can write:
$$ \frac{\partial Y}{\partial K} = \alpha \cdot \underbrace{AK^{\alpha}}_{=Y} \cdot \frac{1}{K} = \alpha \frac{Y}{K} $$
so essentially the MPK can be written as the ratio of GDP ($Y$) and capital ($K$) multiplied by the GDP elasticity w.r.t. capital ($\alpha$).

We have acquired two different estimates (one ours, one IIASA's) of $\alpha$ from above, but we can further look at calculating $\alpha$ for each country by fitting either a Cobb-Douglas function or a capital-only function. So there are four options for calculating a country's MPK:
1. Use $\alpha$ from IIASA
2. Use $\alpha$ from our estimation
3. Use $\alpha$ from fitting a Cobb-Douglas function
4. Use $\alpha$ from fitting a capital-only function

and we can multiply the value of $\frac{Y}{K}$ (in the year 2010) afterwards.

In [None]:
# single-argument wrapper: only the country code varies between calls
MPK_init_calc_lamb = lambda x: ypk_fn.MPK_init_calc(
    x, hist_df, k2010, [OVERALL_E, OVERALL_E_IIASA]
)

# inhabited areas: one result per country code, stacked row-wise
inhabited_isos = np.setdiff1d(sset.ALL_ISOS, sset.UNINHABITED_ISOS)
mpk_calc = pd.concat(
    [MPK_init_calc_lamb(iso) for iso in tqdm(inhabited_isos)], axis=0
)

# country-specific MPKs that are exactly 0 (per the original note, cases where the
# per-country optimization did not find a solution) are replaced by the smallest
# strictly positive value found across `mpk_our`, `mpk_iiasa`, and the column itself
for mpk_col in ["mpk_ctry_cd", "mpk_ctry_co"]:
    candidates = mpk_calc[["mpk_our", "mpk_iiasa", mpk_col]].values
    smallest_positive = candidates[candidates > 0].min()
    mpk_calc.loc[mpk_calc[mpk_col] == 0, mpk_col] = smallest_positive

# uninhabited areas: keep their 2010 gdp / capital / pop and set the Y/K ratio
# and every MPK variant to 0, then append them to the inhabited results
zeroed_cols = ["yk", "mpk_our", "mpk_iiasa", "mpk_ctry_cd", "mpk_ctry_co"]
uninhabited_rows = (sset.UNINHABITED_ISOS, slice(None), slice(None))
mpk_calc_uninhabited = (
    k2010.reset_index()
    .set_index(["ccode", "ssp", "iam"])
    .loc[uninhabited_rows, ["gdp", "capital", "pop"]]
    .assign(**{col: 0 for col in zeroed_cols})
)
mpk_calc = pd.concat([mpk_calc, mpk_calc_uninhabited], axis=0).sort_index()

### Using the perpetual inventory method (PIM) with the dynamic parameter equations specified in Dellink et al. (2017)

The method in Dellink et al. (2017) is basically a PIM, but its parameters are dynamic (and evolve on their own) so that they converge to specific long-term values. Below is an application (with `dask` parallelization) of the Dellink et al. (2017) methodology using the MPKs (calculated with 4 different methods) that we have obtained above for each country.

First, we load the 2010 historical values (some estimated) of capital stock into our projection dataset. Also, we calculate the by-country average depreciation rates and the overall average depreciation rate (the average of the by-country rates), both from PWT 10.0, which are used in the PIM process. If a country is missing from the PWT 10.0 dataset, we simply use the overall average depreciation rate as its country-specific value.

In [None]:
## initial (2010) I/Y ratios and depreciation rates, re-keyed by ccode only
iy_org = (
    hist_df.loc[(slice(None), [2010]), ["iy_ratio_fit", "delta"]]
    .reset_index()
    .drop(columns=["year"])
    .rename(columns={"iy_ratio_fit": "iy_ratio", "delta": "delta_c"})
    .set_index(["ccode"])
)

## AFG has the average delta value, so its row supplies the overall rate
delta_overall = iy_org.loc["AFG", "delta_c"]

## attach both to the 2010 (starting point) dataset; `delta` is constant across rows
mpk_calc = mpk_calc.merge(
    iy_org, how="left", left_index=True, right_index=True
).assign(delta=delta_overall)

In [None]:
## cluster setup
# request a new cluster from the gateway with the configured worker image,
# obtain a client for it, and ask it to scale to N_CLUSTER
N_CLUSTER = 20
cluster = gateway.new_cluster(worker_image=image_name, profile="micro")
client = cluster.get_client()
cluster.scale(N_CLUSTER)
# bare expression: the cluster object is rendered as the cell output
cluster

In [None]:
## country codes having at least one row with positive GDP, and one DF per code
all_ccodes = proj_ypk_df.index.get_level_values("ccode")
pos_gdp_mask = (proj_ypk_df.gdp > 0).to_numpy()
ccodes_pos_y = all_ccodes[pos_gdp_mask].unique()
ccodes_dfs = [proj_ypk_df.loc[[cc], :].copy() for cc in ccodes_pos_y]

## uninhabited ones set aside (codes that never show positive GDP)
in_pos_y = all_ccodes.isin(ccodes_pos_y)
cc_dfs_uninh = proj_ypk_df.loc[~in_pos_y, :].sort_index()

In [None]:
# making sure SLIIDERS functions are compatible with Dask workflow
# run this when all the workers are available
#
# The installed `sliiders` package directory is zipped into "sliiders.zip" in the
# working directory and the archive is uploaded through the Dask client.
sliiders_dir = Path(slfile).parent

# archive names are made relative to the package's parent directory, so the
# top-level folder inside the zip is the package directory itself
archive_root = os.path.join(sliiders_dir, "..")

# the context manager finalizes and closes the archive even if a write fails,
# so a half-written zip handle is never left open
with zipfile.ZipFile("sliiders.zip", "w", zipfile.ZIP_DEFLATED) as zipf:
    for root, dirs, files in os.walk(sliiders_dir):
        for file in files:
            file_path = os.path.join(root, file)
            zipf.write(file_path, os.path.relpath(file_path, archive_root))

# upload only after the archive has been fully written and closed
client.upload_file("sliiders.zip")

In [None]:
# six runs in total: the three MPK variants with our elasticity estimate first,
# then the same three variants with the IIASA elasticity
MPK_var_cases = ["mpk_our", "mpk_ctry_cd", "mpk_ctry_co"] * 2
MPK_case_len = len(MPK_var_cases)
n_first_half = MPK_case_len // 2
all_cases = []
for j, case in enumerate(MPK_var_cases, start=1):
    # steps 1..n_first_half use OVERALL_E; the remaining steps use OVERALL_E_IIASA
    elas = OVERALL_E if j <= n_first_half else OVERALL_E_IIASA
    pim_lamb = lambda x: ypk_fn.pim_single_ctry(x, mpk_calc, elas, case)

    # map over the per-country frames, gather within the same iteration,
    # and stack the per-country results into one frame for this case
    futures = client.map(pim_lamb, ccodes_dfs)
    all_cases.append(pd.concat(client.gather(futures), axis=0))
    print(f"Step {j}/{MPK_case_len} done")

In [None]:
# shutting down cluster
# order: scale to zero, close the client, then close and shut down the cluster
cluster.scale(0)
client.close()
cluster.close()
cluster.shutdown()

# removing the .zip file that's been uploaded to Dask
# (the path is relative, so this deletes "sliiders.zip" from the working directory)
os.remove("sliiders.zip")

### Checking against the Dellink et al. (2017)'s Figure 6 (capital intensity plots)

We examine our 6 options below. After examining the graphs as well as the SSE values, it seems that the case utilizing the by-country MPK, the **capital-only** production function, and the IIASA overall elasticity performs the best, at least for the four countries whose information is available.

However, since the SSEs are very similar between the two cases (which differ only in using **capital-and-labor** versus **capital-only** production) and because the capital-only one has been used previously to produce capital stock estimates, we will use estimates from `all_cases[-1]` as our main capital stock estimates and those from `all_cases[-2]` as alternative estimates.

In [None]:
# one Figure-6 comparison result per projected case, in the same order as `all_cases`
all_cases_sse = [ypk_fn.examine_against_fig6(case_df) for case_df in all_cases]

As a sanity check, we will also graph the top ten and bottom ten cases of capital stock (in natural logarithm) for a specified SSP (SSP3 below) and year (2100 below).

In [None]:
# last case in `all_cases`: `mpk_ctry_co` run with `OVERALL_E_IIASA`
ypk_fn.top_bottom_10(all_cases[-1])

In [None]:
# second-to-last case in `all_cases`: `mpk_ctry_cd` run with `OVERALL_E_IIASA`
ypk_fn.top_bottom_10(all_cases[-2])

## Re-organizing the dataset and exporting

### Data re-organization

In [None]:
# capital stock estimates: the last case is the main one, the second-to-last
# is kept as the alternative
pim_dfs_iiasa_co = all_cases[-1].copy()
pim_dfs_iiasa_cd = all_cases[-2].copy()

# changing the names to be matched, dropping the merged-in `capital` column,
# and dividing population and GDP by one million again
output_df = proj_ypk_df.drop(["capital"], axis=1).rename(
    columns={"gdp": "rgdpna_19", "gdppc": "rgdpna_pc_19"}
)
for col in ["pop", "rgdpna_19"]:
    output_df[col] /= 1000000

## attaching the capital stock estimates: main estimate (with MPK, IY, KY) first,
## then the alternative estimate; each capital column is divided by one million
necess_cols = ["capital_estim", "MPK", "IY", "KY"]
alt_name = "rnna_19_alternative"
main_estim = pim_dfs_iiasa_co[necess_cols].rename(
    columns={"capital_estim": "rnna_19"}
)
alt_estim = pim_dfs_iiasa_cd[["capital_estim"]].rename(
    columns={"capital_estim": alt_name}
)
for estim_df, capital_col in [(main_estim, "rnna_19"), (alt_estim, alt_name)]:
    output_df = output_df.merge(
        estim_df, how="left", left_index=True, right_index=True
    )
    output_df[capital_col] /= 1000000

# rows left without a match by the left joins get 0 in the merged-in columns
fill_cols = necess_cols[1:] + [alt_name, "rnna_19"]
output_df = output_df.fillna(dict.fromkeys(fill_cols, 0))

## adding the unit information and sorting by index
output_df = output_df.assign(
    gdp_capital_unit="millions (of USD)",
    gdppc_unit="ones (of USD)",
    pop_unit="millions (of people)",
).sort_index()

### Scale creation with respect to historical 2019 values of population and current-PPP (2019 USD) capital stock

In [None]:
## 2019 historical capital (`cn_19`) and population, re-keyed by ccode only
hist_gp = (
    hist_df.loc[(slice(None), 2019), ["cn_19", "pop"]]
    .rename(columns={"cn_19": "cn_19_2019", "pop": "pop_2019"})
    .reset_index()
    .drop(columns=["year"])
    .set_index(["ccode"])
)

## merge, then express each projected series as a ratio to its 2019 value
output_df = output_df.merge(hist_gp, how="left", left_index=True, right_index=True)
scale_specs = {
    "pop_scale": ("pop", "pop_2019"),
    "rnna_19_scale": ("rnna_19", "cn_19_2019"),
    "rnna_19_alternative_scale": ("rnna_19_alternative", "cn_19_2019"),
}
for scale_col, (numer_col, denom_col) in scale_specs.items():
    output_df[scale_col] = output_df[numer_col] / output_df[denom_col]

### Exporting: historical 2019 values

In [None]:
# columns kept in the 2019 historical export, in output order
hist2019_cols = [
    "gdp_capital_unit",
    "gdppc_unit",
    "pop_unit",
    "cgdpo_19",
    "cgdpo_pc_19",
    "pop",
    "cn_19",
]

# 2019 rows only, re-keyed by ccode (the `year` level is dropped)
hist2019 = (
    hist_df.loc[(slice(None), 2019), hist2019_cols]
    .reset_index()
    .drop(columns=["year"])
    .set_index(["ccode"])
)
hist2019.to_parquet(sset.DIR_YPK_FINAL / "gdp_gdppc_pop_capital_hist2019.parquet")

### Exporting: projected values (2010-2100)

In [None]:
# columns kept in the projection export, in output order
col_ordering = [
    "gdp_capital_unit",
    "gdppc_unit",
    "pop_unit",
    "rgdpna_19",
    "rgdpna_pc_19",
    "rnna_19",
    "rnna_19_scale",
    "rnna_19_alternative",
    "rnna_19_alternative_scale",
    "cn_19_2019",
    "pop",
    "pop_scale",
    "pop_2019",
    "MPK",
    "IY",
    "KY",
]

## select and sort, then fill the nan's with 0 (for uninhabited areas) in the
## per-capita and scale columns only
zero_fill_cols = [
    "rgdpna_pc_19",
    "rnna_19_scale",
    "rnna_19_alternative_scale",
    "pop_scale",
]
output_df = (
    output_df[col_ordering].sort_index().fillna(dict.fromkeys(zero_fill_cols, 0))
)

output_df.to_parquet(
    sset.DIR_YPK_FINAL / "gdp_gdppc_pop_capital_proj_2010_2100.parquet"
)