# Country-level Analysis of beyond-GDP Metrics

In [None]:
# Stdlib imports
from pathlib import Path
from itertools import combinations

# 3rd party imports
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

## WISE data analysis with Focus on Switzerland

In [None]:
# Locate the WISE database folder relative to the notebook's working directory
data_root = Path("../../data")
wisedb_path = data_root.joinpath("WISE/Data/WISE_Database/")

In [None]:
# Read the required WISE sheets into memory.
# Passing the list of sheet names lets pandas open and parse the workbook once
# and return a dict of DataFrames keyed by sheet name, instead of re-opening
# the file for every sheet.
wise_sheets = ["Content", "Metrics Info", "C Data", "CG Data", "Metrics C&CG", "C&CG Code"]
wise_db = pd.read_excel(wisedb_path / "WISE_Database.xlsx", sheet_name=wise_sheets)

In [None]:
# Keep only the Swiss rows (ISO3 == "CHE") of the country-level sheet
country_data = wise_db["C Data"]
is_switzerland = country_data["ISO3"].values == "CHE"
wise_ch = country_data.loc[is_switzerland]

In [None]:
# Display the Switzerland subset for visual inspection
wise_ch

In [None]:
# Build one dataframe per indicator, keyed by the indicator's acronym
wise_ch_ind__raw = {
    acronym: wise_ch.loc[wise_ch["Acronym"] == acronym]
    for acronym in wise_ch["Acronym"].unique()
}

# Current number of indicators
len(wise_ch_ind__raw)

### Data Cleaning

#### Step 1: Ignore indicators that have been recorded in fewer than 10 years (i.e. fewer than 10 data points)

In [None]:
# Keep only the indicators that have at least 10 rows
wise_ch_ind__statrel = {}
for acronym, frame in wise_ch_ind__raw.items():
    if len(frame["Year"]) >= 10:
        wise_ch_ind__statrel[acronym] = frame

# Current number of indicators
len(wise_ch_ind__statrel)

In [None]:
# Indicators dropped by the minimum-length filter
set(wise_ch_ind__raw).difference(wise_ch_ind__statrel)

#### Step 2: Ignore constant indicators

In [None]:
# Keep only the indicators whose values actually vary (standard deviation > 0)
wise_ch_ind__nonconst = {}
for acronym, frame in wise_ch_ind__statrel.items():
    if frame["Value"].std() > 0:
        wise_ch_ind__nonconst[acronym] = frame

# Current number of indicators
len(wise_ch_ind__nonconst)

In [None]:
# Indicators dropped by the constant-value filter
set(wise_ch_ind__statrel).difference(wise_ch_ind__nonconst)

#### Step 3: Split by capital

In [None]:
# Map each capital to the acronyms of the metrics flagged with "X" in the
# corresponding column of the "Metrics Info" sheet
capital_map = {
    capital: list(
        wise_db["Metrics Info"]
        .loc[wise_db["Metrics Info"][flag_column] == "X", "Acronym"]
        .values
    )
    for capital, flag_column in (
        ("Human", "Wellbeing"),
        ("Social", "Inclusion"),
        ("Natural", "Sustainability"),
        ("Economic", "Economy and Society"),
    )
}

In [None]:
# Group the cleaned Swiss indicators by the capital they belong to
wise_ch_by_capitals = {}

for cap in ("Human", "Social", "Natural", "Economic"):
    wise_ch_by_capitals[cap] = {}
    for acronym, frame in wise_ch_ind__nonconst.items():
        if acronym in capital_map[cap]:
            wise_ch_by_capitals[cap][acronym] = frame

In [None]:
# Find minimal and maximal recording year per capital
year_ranges = dict()

for cap in ["Human", "Social", "Natural", "Economic"]:
    indicator_frames = list(wise_ch_by_capitals[cap].values())

    # A capital without any indicator has no years to span; use an empty range
    # instead of failing on min()/max() of an empty sequence
    if not indicator_frames:
        year_ranges[cap] = range(0)
        continue

    first_year = min(df["Year"].min() for df in indicator_frames)
    last_year = max(df["Year"].max() for df in indicator_frames)

    # int(): range() rejects float bounds, which is what a float-typed "Year"
    # column would produce
    year_ranges[cap] = range(int(first_year), int(last_year) + 1)

In [None]:
# Build one indicator-by-year table per capital (rows: indicators, columns: years).
# Years without a recorded value stay NaN.
wise_capitals_ch = dict()

for cap in ["Human", "Social", "Natural", "Economic"]:
    years = list(year_ranges[cap])

    # dtype=float: a frame created without data defaults to object dtype, which
    # DataFrame.interpolate refuses to process
    df = pd.DataFrame(index=list(wise_ch_by_capitals[cap].keys()), columns=years, dtype=float)

    for index, current in wise_ch_by_capitals[cap].items():
        # One value per year; if a year occurs more than once, keep the first row
        value_by_year = current.drop_duplicates(subset="Year").set_index("Year")["Value"]
        # Fill the whole row at once instead of masking the frame for every cell
        df.loc[index] = value_by_year.reindex(years).to_numpy(dtype=float)

    wise_capitals_ch[cap] = df

#### Next steps
- In all four capitals, drop columns where all values are NULL
- Interpolate the remaining NULL values
- Look at correlations
- Compute PCA

#### Step 4: Find indicators that can be compared because they were measured in the same years

In [None]:
# Classify every pair of indicators by the number of years in which both were
# recorded: 0 -> impossible, 1-5 -> bad, 6-10 -> medium, 11-30 -> good, >30 -> great
impossible = []
bad = []
medium = []
good = []
great = []

# Build each indicator's set of recorded years once instead of once per pair
years_by_indicator = {ind: set(df["Year"]) for ind, df in wise_ch_ind__nonconst.items()}

combis = list(combinations(wise_ch_ind__nonconst.keys(), 2))
n_combis = len(combis)
for ind1, ind2 in combis:
    years_1 = years_by_indicator[ind1]
    years_2 = years_by_indicator[ind2]
    common_years = years_1.intersection(years_2)
    n_common = len(common_years)
    if n_common == 0:
        impossible.append([ind1, ind2])
    elif n_common <= 5:
        bad.append([ind1, ind2, n_common])
    elif n_common <= 10:
        medium.append([ind1, ind2, n_common])
    elif n_common <= 30:
        good.append([ind1, ind2, n_common])
    else:
        great.append([ind1, ind2, n_common])

# Summary table: number of pairs per class and their share of all pairs
overlap_classes = {
    "impossible": impossible,
    "bad": bad,
    "medium": medium,
    "good": good,
    "great": great,
}
pd.DataFrame(
    {
        "count": [len(pairs) for pairs in overlap_classes.values()],
        # With fewer than two indicators there are no pairs; report 0 % rather
        # than dividing by zero
        "%": [
            np.round(100 * len(pairs) / n_combis, 2) if n_combis else 0.0
            for pairs in overlap_classes.values()
        ],
    },
    index=list(overlap_classes),
).astype({"count": int})

In [None]:
# One entry per indicator: (acronym, set of recorded years, number of rows),
# ordered from the most to the fewest rows
sorted_years_list = [
    (ind, set(df["Year"].values), len(df["Year"]))
    for ind, df in wise_ch_ind__nonconst.items()
]
sorted_years_list.sort(key=lambda entry: entry[2])
# Reversing an ascending sort (rather than sorting descending) keeps the
# relative order of equally long series exactly as before
sorted_years_list.reverse()

In [None]:
# Running intersection of recorded years while adding indicators one by one,
# in the order of sorted_years_list; track its size after each addition
intersection = sorted_years_list[0][1]
size_intersection = [len(intersection)]
for entry in sorted_years_list[1:]:
    # Build a new set each time so the stored year sets are never modified
    intersection = intersection & entry[1]
    size_intersection.append(len(intersection))

In [None]:
# Number of years shared by all indicators included so far;
# the dashed black line marks 30 shared years
plt.plot(size_intersection)
plt.axhline(30, linestyle="--", color="k")