## Clean up and impute missing projected (2010-2100) GDPpc, GDP, and population values

In [None]:
# IPython autoreload extension in mode 2: imported modules are re-imported
# before each cell execution, so edits to the project code are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import os
import shutil
import warnings
from itertools import product as lstprod
from pathlib import Path

import dask.dataframe as ddf
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.api as sm
import xarray as xr
from dask_gateway import Gateway
from scipy.optimize import minimize as opt_min
from shapely.geometry import MultiPolygon, Point, Polygon
from tqdm.auto import tqdm

from sliiders.spatial import iso_poly_box_getter
from sliiders import country_level_ypk as ypk_fn
from sliiders import settings as sset

# dask gateway setup: `gateway` is used further below to start the cluster, and
# `cluster_name` (taken from the project settings) is passed as its worker image
gateway = Gateway()
cluster_name = sset.DASK_IMAGE

## Importing and cleaning SSP-IAM projections

### Raw data re-formatting

In [4]:
# Read the raw country-level SSP database export and split it into the
# population rows and the GDP (PPP) rows, each ordered by scenario/model/region.
iiasa_raw_df = pd.read_csv(sset.DIR_YPK_RAW / "SspDb_country_data_2013-06-12.csv")

raw_sort_keys = ["SCENARIO", "MODEL", "REGION"]
is_pop_row = iiasa_raw_df["VARIABLE"] == "Population"
is_gdp_row = iiasa_raw_df["VARIABLE"] == "GDP|PPP"

iiasa_pop = iiasa_raw_df[is_pop_row].sort_values(raw_sort_keys)
iiasa_gdp = iiasa_raw_df[is_gdp_row].sort_values(raw_sort_keys)

### Population

We will only take IIASA projections, with the exception of countries whose information is in the OECD projections but not in IIASA's.

#### Basic cleaning

In [6]:
# Simplify the scenario/model labels and order the rows by country, SSP and IAM
ii_pop_clean = ypk_fn.ssp_and_model_simplify(
    "SCENARIO", "MODEL", iiasa_pop
).sort_values(["ccode", "ssp", "iam"])

# Sanity check: wherever a country has both an IIASA and an IIASA-WiC track,
# print its code if the two tracks are not identical in every 5-year column.
v_ = [str(y) for y in np.arange(2010, 2105, 5)]
both_iiasa_tracks = {"IIASA", "IIASA-WiC"}
for ccode_chk in set(ii_pop_clean.ccode):
    ctry_rows = ii_pop_clean[ii_pop_clean.ccode == ccode_chk]
    if not both_iiasa_tracks.issubset(set(ctry_rows.iam)):
        continue
    vals_iiasa = ctry_rows.loc[ctry_rows.iam == "IIASA", v_].values
    vals_wic = ctry_rows.loc[ctry_rows.iam == "IIASA-WiC", v_].values
    if not (vals_iiasa == vals_wic).all():
        print(ccode_chk)

In [7]:
# cleaning up by gathering only two population IAMs per country:
# IIASA (or equivalently, IIASA-WiC), and OECD
ii_pop = pd.DataFrame(ii_pop_clean[["ccode", "ssp", "iam"] + v_])
new_v_ = ["v_" + str(y) for y in v_]

# One small frame per country, collected in a list and concatenated once at the
# end (instead of re-concatenating the growing frame on every iteration).
per_country = []
for ccode in set(ii_pop_clean.ccode):
    case = ii_pop[ii_pop.ccode == ccode]
    case_iams = set(case.iam)

    # OECD first (if existing), then only one of IIASA OR IIASA-WiC
    keep_iams = []
    if "OECD" in case_iams:
        keep_iams.append("OECD")
    if "IIASA" in case_iams:
        keep_iams.append("IIASA")
    elif "IIASA-WiC" in case_iams:
        keep_iams.append("IIASA-WiC")

    indiv_df = pd.DataFrame(
        np.vstack([case[case.iam == iam].values for iam in keep_iams]),
        columns=["ccode", "ssp", "iam"] + new_v_,
    )
    # number of tracks kept for this country (1 or 2); read by the next cell
    indiv_df["howmany_iam"] = len(keep_iams)
    per_country.append(indiv_df)

agg_df = pd.concat(per_country, axis=0)
agg_df["unit"] = "millions"

In [8]:
# brief clean-ups
ii_pop = agg_df.copy()
ii_pop["iam_fill"] = "-"
ii_pop.loc[ii_pop.iam == "IIASA-WiC", "iam"] = "IIASA"

# adding the extra rows for missing iams: a country with a single track gets a
# copy of that track relabelled as the other IAM, with `iam_fill` recording
# which track the values were copied from
filled_cases = []
for ccode in set(ii_pop.ccode):
    case = ii_pop[ii_pop.ccode == ccode]
    # `howmany_iam` is constant within a country; read it by position, since
    # the row labels inherited from `agg_df` are not unique across countries
    if case["howmany_iam"].iloc[0] == 1:
        copy_case = case.copy()
        if set(["OECD"]) == set(copy_case.iam):
            copy_case["iam"], copy_case["iam_fill"] = "IIASA", "OECD"
        elif set(["IIASA"]) == set(copy_case.iam):
            copy_case["iam"], copy_case["iam_fill"] = "OECD", "IIASA"
        filled_cases.append(copy_case)
ii_pop = pd.concat([ii_pop] + filled_cases, axis=0)

## further re-ordering cleanups
ii_pop = ii_pop.sort_values(["ccode", "ssp", "iam"]).set_index(
    ["ccode", "ssp", "iam"]
)

#### Cleaning up for the case of France

In the French case, IIASA's version **excludes** the 5 overseas departments (i.e., `MYT`, `MTQ`, `GUF`, `GLP`, and `REU`) when it calculates the French population. The OECD's version of the French population differs, since it seems to **include** these overseas departments. This can be confirmed below: the sum of IIASA's populations for `MYT`, `MTQ`, `GUF`, `GLP`, `REU` and `FRA` is approximately the same as the OECD's French population.

From here on, the French case for both IIASA and OECD will **exclude** the five overseas departments and keep them separately logged.

In [9]:
## checking: for every SSP, print the sum of squared differences between the
## OECD value for FRA and the IIASA total over FRA plus its overseas departments
fra_dept = ["FRA", "MYT", "MTQ", "GUF", "GLP", "REU"]
v_fut_5 = [x for x in ii_pop.columns if "v_" in x]
for ssp_no in range(1, 6):
    ssp = "SSP{}".format(ssp_no)

    ## OECD case
    oecd_val = ii_pop.loc[("FRA", ssp, "OECD"), v_fut_5].values

    ## IIASA case (summed over the six ISO codes)
    iiasa_val = np.sum(ii_pop.loc[(fra_dept, ssp, "IIASA"), v_fut_5].values, axis=0)

    sq_gap = np.sum((oecd_val - iiasa_val) ** 2)
    print(ssp + ": " + str(round(sq_gap, 4)))

SSP1: 0.0
SSP2: 0.0
SSP3: 0.0
SSP4: 0.0
SSP5: 0.0


The above confirms that OECD cases do include all of the five overseas departments when calculating their population. So we will subtract these values to get the "mainland France" population values.

In [10]:
# Work on a copy so `ii_pop` itself keeps the unadjusted OECD values for FRA
ii_pop_fra = ii_pop.copy()
csi = ["ccode", "ssp", "iam"]
overseas_depts = fra_dept[1:]  # every entry of `fra_dept` except "FRA" itself
for ssp_no in range(1, 6):
    ssp = "SSP{}".format(ssp_no)
    fra_key = ("FRA", ssp, "OECD")

    # OECD-track total over the overseas departments, per 5-year column
    fra_dept_oecd = np.sum(
        ii_pop.loc[(overseas_depts, ssp, "OECD"), v_fut_5].values, axis=0
    )
    fra_overall_oecd = ii_pop.loc[fra_key, v_fut_5].values

    # FRA (OECD) minus the overseas departments
    ii_pop_fra.loc[fra_key, v_fut_5] = fra_overall_oecd - fra_dept_oecd

#### Interpolating, turning into a long-panel format, and taking only the IIASA cases

Projections are given every five years, so we will use interpolation to fill in the missing years' information. We will assume that, between any two adjacent known years (e.g., 2015 and 2020), the values grow log-linearly.

In [11]:
# interpolate log-linearly, then turn the result into a long-panel format
pop_interpolated = ypk_fn.log_lin_interpolate(ii_pop_fra)
ii_pop = ypk_fn.organize_hor_to_ver(
    pop_interpolated, "ccode", ["ssp", "iam"], "pop", yrs=list(range(2010, 2101))
)

# selecting only the IIASA cases (fourth index level), then dropping the
# bookkeeping columns and re-indexing by country, year and SSP
iiasa_rows = ii_pop.loc[pd.IndexSlice[:, :, :, "IIASA"], :]
ii_pop = (
    iiasa_rows.reset_index()
    .drop(columns=["howmany_iam", "iam_fill", "iam"])
    .set_index(["ccode", "year", "ssp"])
)

#### Detecting those ISOs that are missing, and getting the country-level population estimates for these ISOs (from LandScan 2019)

In [14]:
# cluster setup: start a new gateway cluster with the configured worker image,
# get a client connected to it, and ask for 20 workers
cluster = gateway.new_cluster(worker_image=cluster_name, profile="micro")
client = cluster.get_client()
cluster.scale(20)
# bare last expression so the notebook displays the cluster widget
cluster

VBox(children=(HTML(value='<h2>GatewayCluster</h2>'), HBox(children=(HTML(value='\n<div>\n<style scoped>\n    …

In [15]:
# detecting which ISOs are missing from the population projections
isos_pop_wproj = ii_pop.index.get_level_values("ccode").unique()
need_landscan = np.sort(np.setdiff1d(sset.ALL_ISOS, isos_pop_wproj))

# landscan and (raw) coordinates, repartitioned and kept in cluster memory
ls19 = (
    ddf.read_parquet(sset.DIR_LANDSCAN_INT / "population_with_xy.parquet")
    .repartition(npartitions=20)
    .persist()
)

# shapefiles requiring information from LandScan
ctries_shp = gpd.read_parquet(sset.PATH_GADM_ADM1)

# fixing ISO codes to be consistent with our convention
for shp_code, our_code in (("XKO", "KO-"), ("XCL", "CL-")):
    ctries_shp.loc[ctries_shp.GID_0 == shp_code, "GID_0"] = our_code

# subsetting the shapefiles for those missing projections
ctries_shp = ctries_shp.set_index(["GID_0"]).sort_index().loc[need_landscan]

Note that the current shapefile information we are using often has more than one MultiPolygon per ISO code, so we will create a shapefile dataset with one MultiPolygon per ISO code.

In [16]:
# Merge all geometries stored under one ISO code into a single MultiPolygon
ctries_shp_lst = []
for iso in tqdm(need_landscan):
    iso_lst = []
    for geom in ctries_shp.loc[[iso], "geometry"].values:
        if isinstance(geom, MultiPolygon):
            iso_lst.extend(geom.geoms)
        elif isinstance(geom, Polygon):
            iso_lst.append(geom)
        else:
            # Fail loudly: silently carrying on would re-use the polygons of
            # the previously processed geometry for this one.
            raise TypeError(
                f"Unsupported geometry type for {iso}: {type(geom).__name__}"
            )
    ctries_shp_lst.append(MultiPolygon(iso_lst))

ctries_shp_df = gpd.GeoDataFrame(
    data={"ccode": need_landscan, "geometry": ctries_shp_lst}
).set_index(["ccode"])

  0%|          | 0/56 [00:00<?, ?it/s]

Based on the ISO-relevant shapefiles and grid-level population in LandScan 2019, let us find the country-level population information.

In [17]:
def subset_grid_find_pop(iso, shp_df=ctries_shp_df, ls_df=ls19):
    """Sum the gridded population whose cell coordinates fall inside one ISO's geometry.

    Parameters
    ----------
    iso : str
        Index label in `shp_df` of the geometry to aggregate over.
    shp_df : geopandas.GeoDataFrame
        Frame indexed by ISO code with a "geometry" column. The default is bound
        to `ctries_shp_df` as it exists when this function is defined.
    ls_df : dask.dataframe.DataFrame
        Gridded population with columns "x", "y", "x_ix", "y_ix" and
        "population". The default is bound to `ls19` at definition time.

    Returns
    -------
    number
        Sum of "population" over the grid cells whose (x, y) point is contained
        in the geometry; 0 when no cell qualifies.
    """
    # each entry is unpacked as (x_min, x_max, y_min, y_max)
    poly_bounds = iso_poly_box_getter(iso, shp_df)
    geom = shp_df.loc[iso, "geometry"]

    # Cheap pre-filter: only pull the cells strictly inside each bounding box
    # into local memory before running the exact point-in-polygon test.
    sub_dfs = []
    for x_mn, x_mx, y_mn, y_mx in poly_bounds:
        in_box = (
            (ls_df.x > x_mn) & (ls_df.y > y_mn) & (ls_df.x < x_mx) & (ls_df.y < y_mx)
        )
        sub_dfs.append(ls_df.loc[in_box, :].compute())

    # no bounding boxes at all: nothing to sum (pd.concat would raise on [])
    if not sub_dfs:
        return 0

    # boxes may overlap, so keep each grid cell only once
    sub_df = pd.concat(sub_dfs, axis=0).drop_duplicates(["x_ix", "y_ix"])
    if sub_df.shape[0] == 0:
        return 0

    # iterate over the column arrays directly rather than building a row Series
    # via `.iloc` three times per grid cell
    pop = 0
    for x, y, cell_pop in zip(
        sub_df["x"].values, sub_df["y"].values, sub_df["population"].values
    ):
        if geom.contains(Point(x, y)):
            pop += cell_pop

    return pop

In [None]:
# this may take a while: one LandScan aggregation per ISO lacking projections
ls_msng_pop = [subset_grid_find_pop(iso) for iso in tqdm(need_landscan)]

# store the result so later cells can read it back from disk
msng_from_proj_pop = pd.DataFrame(data={"pop": ls_msng_pop, "ccode": need_landscan})
msng_from_proj_pop.to_parquet(sset.DIR_YPK_INT / "msng_from_iiasa_proj_pop.parquet")

In [18]:
# release the dask resources: scale down to zero workers, then close the
# cluster and the client and shut the cluster down
cluster.scale(0)
cluster.close()
client.close()
cluster.shutdown()

#### Attaching LandScan 2019 values to the overall population projections

In [19]:
msng_from_proj_pop = pd.read_parquet(
    sset.DIR_YPK_INT / "msng_from_iiasa_proj_pop.parquet"
).set_index(["ccode"])

# The USA rows serve only as a template for the (year, ssp) layout: for each
# missing ISO the country code is swapped in and every row gets that ISO's
# stored population, converted to millions of people.
usa_shell = ii_pop.loc[["USA"], :].reset_index()
pop_from_landscan = [
    usa_shell.assign(
        ccode=code, pop=msng_from_proj_pop.loc[code, "pop"] / 1000000
    ).set_index(ii_pop.index.names)
    for code in msng_from_proj_pop.index.get_level_values("ccode")
]

ii_pop = pd.concat([ii_pop] + pop_from_landscan, axis=0).sort_index()

### GDPpc and GDP

We will use IAMs `IIASA` and `OECD`.

#### Basic cleaning

In [20]:
## cleaning the imported dataset
csi = ["ccode", "ssp", "iam"]
ii_gdp_clean = (
    ypk_fn.ssp_and_model_simplify("SCENARIO", "MODEL", iiasa_gdp)
    .set_index(csi)
    .sort_index(axis=0)
)

num_v = [str(x) for x in np.arange(2010, 2105, 5)]
v_name = ["v_" + str(v) for v in num_v]

## keep only the (renamed) 5-year value columns, and change the values from
## billions of dollars to millions of dollars
ii_gdp_clean = ii_gdp_clean.rename(columns=dict(zip(num_v, v_name)))[v_name] * 1000

## double-checking if IIASA and IIASA-WiC values are same
## (a country code is printed only when the two tracks differ)
for i in set(ii_gdp_clean.index.get_level_values("ccode")):
    row = ii_gdp_clean.loc[(i, slice(None), slice(None)), :]
    iams = set(row.index.get_level_values("iam"))
    if ("IIASA" in iams) and ("IIASA-WiC" in iams):
        w1 = row.loc[(slice(None), slice(None), "IIASA"), v_name].values
        w2 = row.loc[(slice(None), slice(None), "IIASA-WiC"), v_name].values
        if not (w1 == w2).all():
            print(i)

## getting only IIASA and OECD cases; per-country frames are collected in a
## list and concatenated once at the end
clean_ccodes = ii_gdp_clean.index.get_level_values("ccode")
per_country = []
for ccode in set(clean_ccodes):
    case = ii_gdp_clean.loc[(ccode, slice(None), slice(None)), :]
    case_iams = set(case.index.get_level_values("iam"))

    ## OECD first (if existing), then only one of IIASA OR IIASA-WiC
    keep_iams = []
    if "OECD" in case_iams:
        keep_iams.append("OECD")
    if "IIASA" in case_iams:
        keep_iams.append("IIASA")
    elif "IIASA-WiC" in case_iams:
        keep_iams.append("IIASA-WiC")

    indiv_df = pd.concat(
        [case.loc[(slice(None), slice(None), iam), :] for iam in keep_iams], axis=0
    )
    ## number of tracks kept for this country (1 or 2)
    indiv_df["howmany_iam"] = len(keep_iams)
    per_country.append(indiv_df)
agg_df = pd.concat(per_country, axis=0)

ii_gdp = agg_df.reset_index()
ii_gdp["iam_fill"] = "-"
ii_gdp.loc[ii_gdp.iam == "IIASA-WiC", "iam"] = "IIASA"
ii_gdp = ii_gdp.set_index(csi)

## If either OECD or IIASA track is missing, fill in using the other track
filled_cases = []
for ccode in set(ii_gdp.index.get_level_values("ccode")):
    case = ii_gdp.loc[(ccode, slice(None), slice(None)), :]
    ## `howmany_iam` is constant within a country; read it by position, since
    ## an integer key is not a label of this (ccode, ssp, iam) index
    if case["howmany_iam"].iloc[0] == 1:
        copy_case = case.reset_index()
        if set(["OECD"]) == set(copy_case.iam):
            copy_case["iam"], copy_case["iam_fill"] = "IIASA", "OECD"
        elif set(["IIASA"]) == set(copy_case.iam):
            copy_case["iam"], copy_case["iam_fill"] = "OECD", "IIASA"
        filled_cases.append(copy_case.set_index(csi))
ii_gdp = pd.concat([ii_gdp] + filled_cases, axis=0)

ii_gdp = ypk_fn.organize_hor_to_ver(
    ii_gdp.sort_index(axis=0), "ccode", ["ssp", "iam"], "gdp", yrs=range(2010, 2101)
).drop(["howmany_iam"], axis=1)
ii_gdp["unit"] = "millions"

#### Attaching the population values, creating GDPpc, and log-linearly interpolating

In [21]:
# GDP per capita from the GDP and population tracks
ii_gdppc = ii_gdp.merge(ii_pop[["pop"]], how="left", left_index=True, right_index=True)
ii_gdppc["gdppc"] = ii_gdppc["gdp"] / ii_gdppc["pop"]

# every (SSP, IAM) combination: SSP1..SSP5 crossed with IIASA and OECD
scenarios = list(lstprod([f"SSP{x}" for x in range(1, 6)], ["IIASA", "OECD"]))
proj_years = range(2010, 2101)

# per scenario: long -> wide, interpolate, then tag the rows with the scenario
scen_dfs = []
for ssp, iam in tqdm(scenarios):
    scen_long = (
        ii_gdppc.loc[(slice(None), slice(None), ssp, iam), ["gdppc"]]
        .reset_index()
        .drop(["ssp", "iam"], axis=1)
        .set_index(["ccode", "year"])
    )
    scen_wide = ypk_fn.organize_ver_to_hor(
        scen_long, "gdppc", "year", "ccode", proj_years
    )
    scen_df = ypk_fn.log_lin_interpolate(scen_wide).reset_index()
    scen_df["ssp"], scen_df["iam"] = ssp, iam
    scen_dfs.append(scen_df.set_index(["ccode", "ssp", "iam"]))

# back to the long format for all scenarios together
ii_gdppc = ypk_fn.organize_hor_to_ver(
    pd.concat(scen_dfs, axis=0), "ccode", ["ssp", "iam"], "gdppc", yrs=proj_years
)
ii_gdppc["unit"] = "ones"

  0%|          | 0/10 [00:00<?, ?it/s]

#### Getting the by-scenario global GDPpc

In [22]:
# GDP implied by the interpolated GDPpc and the population
ii_gdppc_w_pop = ii_gdppc.merge(
    ii_pop[["pop"]], how="left", left_index=True, right_index=True
)
ii_gdppc_w_pop["gdp"] = ii_gdppc_w_pop["pop"] * ii_gdppc_w_pop["gdppc"]

# per scenario, total population and GDP over all countries for each year
scen_agg_dfs = []
for ssp, iam in tqdm(scenarios):
    scen_agg_df = (
        ii_gdppc_w_pop.loc[(slice(None), slice(None), ssp, iam), :]
        .reset_index()
        # select the two numeric columns BEFORE summing, so the string columns
        # (country code, scenario labels, unit) are never aggregated
        .groupby(["year"])[["pop", "gdp"]]
        .sum()
        .reset_index()
    )
    scen_agg_df["ssp"], scen_agg_df["iam"] = ssp, iam
    scen_agg_dfs.append(scen_agg_df.set_index(["year", "ssp", "iam"]))

global_df = pd.concat(scen_agg_dfs, axis=0).sort_index()
global_df["gdppc"] = global_df["gdp"] / global_df["pop"]

  0%|          | 0/10 [00:00<?, ?it/s]

#### GDPpc for countries that are not in the current projections (subbing in the global GDPpc), and attaching it with the existing projections

In [23]:
# ISOs with and without a GDPpc projection
gdppc_yesproj = np.sort(ii_gdppc.index.get_level_values("ccode").unique())
gdppc_noproj = np.setdiff1d(sset.ALL_ISOS, gdppc_yesproj)

# every ISO without a projection receives the global by-scenario GDPpc
global_long = global_df.reset_index()
missing_gdps = pd.concat(
    [
        global_long.assign(ccode=iso, unit="ones").set_index(
            ["ccode", "year", "ssp", "iam"]
        )[["gdppc", "unit"]]
        for iso in tqdm(gdppc_noproj)
    ],
    axis=0,
).sort_index()

ii_gdppc = pd.concat([ii_gdppc, missing_gdps], axis=0).sort_index()

  0%|          | 0/65 [00:00<?, ?it/s]

In [24]:
# Attach the population columns to the GDPpc projections; both inputs carry a
# "unit" column, which the merge suffixes as unit_x / unit_y. Those two are
# replaced by explicit per-variable unit columns.
ii_yp = ii_gdppc.merge(ii_pop, left_index=True, right_index=True, how="left")
ii_yp["pop_unit"] = "millions (of people)"
ii_yp["gdppc_unit"] = "ones (of USD)"
ii_yp["gdp_unit"] = "millions (of USD)"
ii_yp = ii_yp.drop(columns=["unit_x", "unit_y"])
ii_yp["gdp"] = ii_yp["gdppc"] * ii_yp["pop"]

## if population is 0, then GDPpc and GDP should also be 0 (no economic activity)
## (GDP is reset explicitly as well: a NaN or infinite GDPpc times a zero
## population would otherwise leave it as NaN)
ii_yp.loc[ii_yp["pop"] == 0, ["gdppc", "gdp"]] = 0

#### Turning the GDP and GDPpc values to 2019 USD

In [25]:
## inflator from 2005 to 2019: ratio of the USA `pl_gdpo` values for 2019 and
## 2005 in the Penn World Table workbook
pwt_raw = pd.read_excel(sset.DIR_YPK_RAW / "pwt_100.xlsx")
pwt = pwt_raw.rename(columns={"countrycode": "ccode"}).set_index(["ccode", "year"])

usa_pl_2019 = pwt.loc[("USA", 2019), "pl_gdpo"]
usa_pl_2005 = pwt.loc[("USA", 2005), "pl_gdpo"]
infla = usa_pl_2019 / usa_pl_2005

## rescale both monetary columns
for money_col in ("gdp", "gdppc"):
    ii_yp[money_col] = ii_yp[money_col] * infla

#### Organizing and exporting

In [26]:
# keep the value columns and their unit columns, in this order, and export
export_cols = ["gdp", "gdppc", "pop", "gdp_unit", "gdppc_unit", "pop_unit"]
ii_yp = ii_yp[export_cols].copy()

export_path = sset.DIR_YPK_INT / "gdp_gdppc_pop_proj_2010_2100_post_ypk6.parquet"
ii_yp.to_parquet(export_path)