# MONET2030 - Exploratory Data Analysis

In [None]:
# Stdlib imports
import re
from pathlib import Path

# 3rd party imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Local imports
from pymonet import monet_scraper as scraper
from pymonet import monet_consts as const
from sipi_da_utils import plot

## 1) Load data

In [None]:
# Trend-direction table (its "metric_id" / "desired_trend" columns are used in
# the ranking section below). The "Unnamed: 0" column is dropped right after
# loading -- presumably a saved row index; confirm against the CSV.
directions = pd.read_csv(const.trend_directions)
directions = directions.drop(columns="Unnamed: 0")
directions.head()

In [None]:
# Key-indicator table; its "id" column is used further below to flag which
# indicators are key indicators.
key_inds = pd.read_csv(const.key_indicators)
key_inds.head()

In [None]:
# Indicator -> capital mapping (joined below on its "id" column; provides
# "Capital - Primary"). The "Unnamed: 0" column is dropped right after
# loading -- presumably a saved row index; confirm against the CSV.
capmap = pd.read_csv(const.capmap_path)
capmap = capmap.drop(columns="Unnamed: 0")
capmap.head()

In [None]:
# Meta-info table; passed to the DataFileLoader and later joined on "dam_id"
# to obtain "indicator_id", "sdg", "topic" and "indicator".
mitl = pd.read_csv(const.metainfo_table_path)
mitl.head()

In [None]:
# Build the project's DataFileLoader from the meta-info table and the raw /
# processed data directories.
# NOTE(review): get_data() presumably populates dfl.processed_data_list, which
# is read in section 2 -- confirm in pymonet.monet_scraper.
dfl = scraper.DataFileLoader(mitl, const.raw_dir, const.processed_dir)
dfl.get_data()

In [None]:
# Load the cleaned data set and index it by "Year"; the remaining columns are
# later selected by metric id.
clean_csv_path = const.processed_dir / "stage_4_clean" / "monet_2030_clean.csv"
monet_data = pd.read_csv(clean_csv_path).set_index("Year")

In [None]:
# Preview the first rows of the year-indexed data set.
monet_data.head()

## 2) Consolidate data into single DataFrame metrics_meta_df

In [None]:
# Collect the descriptive fields of every stage-2 entry held by the loader ...
meta_fields = ["metric_id", "dam_id", "observable", "description"]
stage2_records = []
for entry in dfl.processed_data_list["stage2"]:
    stage2_records.append({field: entry[field] for field in meta_fields})
metrics_meta_df = pd.DataFrame(stage2_records)

# ... and keep only the rows whose metric id ends in "metr".
has_metr_suffix = metrics_meta_df["metric_id"].str.endswith("metr")
metrics_meta_df = metrics_meta_df[has_metr_suffix]

In [None]:
# Attach indicator-level information from the meta-info table via "dam_id".
mitl_fields = ["dam_id", "indicator_id", "sdg", "topic", "indicator"]
metrics_meta_df = pd.merge(metrics_meta_df, mitl[mitl_fields], on="dam_id", how="left")

In [None]:
# Outer-join the capital mapping onto the metric meta data and print the row
# counts before and after to make the effect of the join visible.
print(len(metrics_meta_df), len(capmap))
metrics_meta_df = pd.merge(
    metrics_meta_df,
    capmap,
    how="outer",
    left_on="indicator_id",
    right_on="id",
)
print(len(metrics_meta_df))

In [None]:
# Reduce to the columns needed downstream and to the metrics that actually
# exist as columns in the cleaned data set.
kept_columns = [
    "metric_id",
    "dam_id",
    "indicator_id",
    "sdg",
    "Capital - Primary",
    "observable",
    "description",
]
in_clean_data = metrics_meta_df["metric_id"].isin(monet_data.columns)
metrics_meta_df = metrics_meta_df.loc[in_clean_data, kept_columns]

In [None]:
# Preview the consolidated metric meta data.
metrics_meta_df.head()

## 3) Split by Capital

In [None]:
# Distinct primary capitals in order of first appearance, missing values
# excluded. An explicit dropna() replaces the `cap == cap` NaN trick, which
# lets None through and raises on pd.NA.
four_caps = list(metrics_meta_df["Capital - Primary"].dropna().unique())
four_caps

In [None]:
# Map every capital to the slice of the meta table that belongs to it.
primary_capital = metrics_meta_df["Capital - Primary"]
caps_dict = dict(
    (cap, metrics_meta_df[primary_capital == cap])
    for cap in four_caps
)

## 4) EDA by Capital

In [None]:
# Per capital: the columns of the data set whose ids are listed for that
# capital in the meta table.
metrics = {
    cap: monet_data.loc[:, caps_dict[cap]["metric_id"]]
    for cap in four_caps
}

In [None]:
# Sanity check: each capital's data slice has exactly as many columns as the
# meta table has rows for that capital.
all(metrics[cap].shape[1] == caps_dict[cap].shape[0] for cap in four_caps)

In [None]:
# Print the number of metric columns per capital.
for cap in four_caps:
    n_columns = metrics[cap].shape[1]
    print(cap, n_columns)

### 4.1) Temporal data availability

In [None]:
# One panel per capital on a 2x2 grid; each capital's frame is transposed
# before it is handed to the project's availability plot helper.
fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(17, 8), sharex=True)
for position, cap in enumerate(four_caps):
    grid_row, grid_col = divmod(position, 2)
    plot.visualize_data_availability(
        metrics[cap].T,
        title=f"{cap.title()} Metrics",
        x_label="Years",
        y_label="Metric IDs",
        ax=axs[grid_row, grid_col],
    )
fig.suptitle("Temporal Availability of MONET2030 Metrics by Capital")
plt.tight_layout()
plt.show()

### 4.2) Data amount

In [None]:
# Bar chart of the number of metric columns per capital.
column_counts = {cap: metrics[cap].shape[1] for cap in four_caps}
n_metrics_per_cap = pd.Series(column_counts)

fig, ax = plt.subplots()
sns.barplot(data=n_metrics_per_cap, ax=ax)
ax.set_title("Number of Metrics per Capital")
ax.set_ylabel("Count")
ax.grid(True)
plt.show()

In [None]:
# Bar chart of the total number of non-null cells per capital
# (summed over all metrics and years).
nonnull_counts = pd.Series(
    {cap: metrics[cap].notna().sum().sum() for cap in four_caps}
)

fig, ax = plt.subplots()
sns.barplot(data=nonnull_counts, ax=ax)
ax.set_title("Amount of Non-Null Data Points per Capital")
ax.set_ylabel("Count")
ax.grid(True)
plt.show()

In [None]:
# Histogram (bin edges 10, 15, ..., 55) of the number of non-null data points
# per metric, one panel per capital on a shared-axis 2x2 grid.
bin_edges = list(range(10, 60, 5))
fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(8, 5), sharex=True, sharey=True)
for position, cap in enumerate(four_caps):
    grid_row, grid_col = divmod(position, 2)
    ax = axs[grid_row, grid_col]
    points_per_metric = metrics[cap].notna().sum()
    points_per_metric.plot(kind="hist", ax=ax, bins=bin_edges)
    ax.set_title(cap)
    ax.set_ylabel("count")
    ax.grid(True)
fig.suptitle(
    "Distribution of Number of Data Points per MONET2030 Metrics by Capital\n"
    "(e.g. How many social metrics have between 15 and 20 data points? -> 11)"
)
plt.tight_layout()
plt.show()

### 4.3) Analysis of value distributions across all metrics (split by capital)

In [None]:
# Horizontal box plots (log-scaled value axis) of every metric in the
# "Social" capital.
cap = "Social"
cap_frame = metrics[cap]
fig, ax = plt.subplots(figsize=(17, 5))
cap_frame.boxplot(ax=ax, orientation="horizontal")
ax.set_yticklabels(cap_frame.columns)
ax.set_xscale("log")
ax.set_title(f"Value distributions for MONET2030 in {cap} capital")
plt.show()

In [None]:
# Horizontal box plots (log-scaled value axis) of every metric in the
# "Human" capital.
cap = "Human"
cap_frame = metrics[cap]
fig, ax = plt.subplots(figsize=(17, 10))
cap_frame.boxplot(ax=ax, orientation="horizontal")
ax.set_yticklabels(cap_frame.columns)
ax.set_title(f"Value distributions for MONET2030 in {cap} capital")
ax.set_xscale("log")
plt.show()

In [None]:
# Horizontal box plots (log-scaled value axis) of every metric in the
# "Natural" capital.
cap = "Natural"
cap_frame = metrics[cap]
fig, ax = plt.subplots(figsize=(17, 8))
cap_frame.boxplot(ax=ax, orientation="horizontal")
ax.set_yticklabels(cap_frame.columns)
ax.set_title(f"Value distributions for MONET2030 in {cap} capital")
ax.set_xscale("log")
plt.show()

In [None]:
# Horizontal box plots (log-scaled value axis) of every metric in the
# "Economic" capital.
cap = "Economic"
cap_frame = metrics[cap]
fig, ax = plt.subplots(figsize=(17, 4))
cap_frame.boxplot(ax=ax, orientation="horizontal")
ax.set_yticklabels(cap_frame.columns)
ax.set_title(f"Value distributions for MONET2030 in {cap} capital")
ax.set_xscale("log")
plt.show()

In [None]:
# Per capital: the raw time series on a linear axis (left, no legend) and on
# a logarithmic axis (right, legend placed below the panels).
for cap in four_caps:
    cap_frame = metrics[cap]
    fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(17, 3))

    linear_ax = axs[0]
    sns.lineplot(data=cap_frame, ax=linear_ax, legend=False)
    linear_ax.grid(True)

    ax = axs[1]
    sns.lineplot(data=cap_frame, ax=ax)
    ax.set_yscale("log")
    ax.grid(True)
    ax.legend(ncol=7, bbox_to_anchor=[-0.1, -0.5], loc="center")

    fig.suptitle(f"Temporal evolution of MONET 2030 metrics ({cap} capital)")
    plt.show()

In [None]:
# Normalise every metric by its first available value so that all curves of a
# capital start at 1 and can be compared on one axis.
normalized_metrics = dict()
for cap, df in metrics.items():
    # Back-filling and taking the first row yields each column's first
    # non-NaN value; columns without any data simply get NaN instead of
    # raising (first_valid_index() would return None for them).
    first_valid_values = df.bfill().iloc[0]
    # A zero baseline cannot be used for normalisation (it would produce
    # inf/NaN curves), so such metrics are masked out.
    first_valid_values = first_valid_values.replace(0, np.nan)
    normalized = df.div(first_valid_values)
    normalized_metrics[cap] = normalized

    fig, ax = plt.subplots(figsize=(17, 2))
    sns.lineplot(data=normalized, ax=ax)
    ax.grid(True)

    ax.legend(ncol=7, bbox_to_anchor=[0.5, -0.5], loc='center')
    fig.suptitle(f"Temporal evolution of normalized MONET 2030 metrics ({cap} capital)")
    plt.show()

### 4.4) Best & Worst Performing Metrics

In [None]:
# Ranking skeleton: one row per metric id with empty "slope" / "slope_norm"
# placeholders, plus the desired trend taken from the directions table.
ranking_columns = ["metric_id", "slope", "desired_trend", "slope_norm"]
performer_ranking = (
    metrics_meta_df[["metric_id"]]
    .drop_duplicates()
    .assign(slope=None, slope_norm=None)
    .merge(directions, on="metric_id", how="left")
    .loc[:, ranking_columns]
    .set_index("metric_id")
)
performer_ranking.head()

In [None]:
def _normalized_slope(series):
    """Return the linear-trend slope of *series* relative to its first value.

    The series is stripped of NaNs, divided by its first remaining value and
    fitted with a degree-1 polynomial against its index (the years). NaN is
    returned when no meaningful slope exists: fewer than two data points, or
    a first value of zero (normalisation would divide by zero).
    """
    values = series.dropna()
    if len(values) < 2 or values.iloc[0] == 0:
        return np.nan
    years = values.index.to_numpy(dtype=float)
    relative = values.to_numpy(dtype=float) / values.iloc[0]
    return np.polyfit(years, relative, 1)[0]


# Slope of every ranked metric (computed once per distinct metric id).
slopes = pd.Series(
    {mid: _normalized_slope(monet_data[mid]) for mid in performer_ranking.index.unique()},
    dtype=float,
)
performer_ranking["slope"] = slopes.reindex(performer_ranking.index).to_numpy()

# Orient the slope so that "larger is better": keep it for metrics that should
# go up, flip it for metrics that should go down, and leave NaN for any other
# (or missing) desired trend.
trend_sign = performer_ranking["desired_trend"].map({"up": 1.0, "down": -1.0})
performer_ranking["slope_norm"] = performer_ranking["slope"] * trend_sign

# Metrics without a usable oriented slope are dropped; best performers first.
performer_ranking = performer_ranking.dropna(subset=["slope_norm"]).sort_values(by="slope_norm", ascending=False)
performer_ranking

In [None]:
# Enrich the ranking with the metric meta data and expose the primary capital
# under the shorter column name "capital".
performer_ranking_incl_cap = (
    pd.merge(performer_ranking.reset_index(), metrics_meta_df, on="metric_id", how="left")
    .set_index("metric_id")
    .rename(columns={"Capital - Primary": "capital"})
)

In [None]:
# Add key indicator info
performer_ranking_incl_cap.loc[:, "is_key"] = False
performer_ranking_incl_cap.loc[performer_ranking_incl_cap["indicator_id"].isin(key_inds["id"]), "is_key"] = True

#### 4.4.1) Overall

In [None]:
# The ranking is sorted best-first, so the first three rows are the overall
# top performers; they are exported to Excel and displayed.
top3 = performer_ranking_incl_cap.iloc[:3]
top3.to_excel(const.top3_metrics_file)
top3

In [None]:
# The ranking is sorted best-first, so the last three rows are the overall
# worst performers; they are exported to Excel and displayed.
bottom3 = performer_ranking_incl_cap.iloc[-3:]
bottom3.to_excel(const.bottom3_metrics_file)
bottom3

#### 4.4.2) Per capital

In [None]:
# For every capital: show its three best-ranked metrics, write them to their
# own Excel sheet, and count how many of them are key indicators.
columns = ["indicator_id", "sdg", "observable", "description", "capital", "is_key", "slope", "slope_norm", "desired_trend"]
n_top_key = 0
n_top_tot = 0

print("TOP 3 PERFORMERS PER CAPITAL")
print("============================")
with pd.ExcelWriter(const.top3_metrics_per_cap_file) as writer:
    for cap in four_caps:
        print(cap)
        print("-" * len(cap))
        in_cap = performer_ranking_incl_cap["capital"] == cap
        perf_cap = performer_ranking_incl_cap.loc[in_cap, columns]
        top3 = perf_cap.head(3)
        key_flags = top3["is_key"]
        n_top_key += key_flags.sum()
        n_top_tot += key_flags.count()
        display(top3)
        top3.to_excel(writer, sheet_name=cap)

In [None]:
# For every capital: show its three worst-ranked metrics, write them to their
# own Excel sheet, and count how many of them are key indicators.
columns = ["indicator_id", "sdg", "observable", "description", "capital", "is_key", "slope", "slope_norm", "desired_trend"]
n_bottom_key = 0
n_bottom_tot = 0

print("BOTTOM 3 PERFORMERS PER CAPITAL")
print("============================")
with pd.ExcelWriter(const.bottom3_metrics_per_cap_file) as writer:
    for cap in four_caps:
        print(cap)
        print("-" * len(cap))
        in_cap = performer_ranking_incl_cap["capital"] == cap
        perf_cap = performer_ranking_incl_cap.loc[in_cap, columns]
        bottom3 = perf_cap.tail(3)
        display(bottom3)
        key_flags = bottom3["is_key"]
        n_bottom_key += key_flags.sum()
        n_bottom_tot += key_flags.count()
        bottom3.to_excel(writer, sheet_name=cap)

In [None]:
# Number of key indicators among the per-capital bottom-3 metrics.
n_bottom_key

In [None]:
# Bar chart comparing how many key indicators appear among the per-capital
# best and worst performers; the figure is also saved to disk.
xs = np.array([1, 2])
n_key = np.array([n_top_key, n_bottom_key])
n_tot = np.array([n_top_tot, n_bottom_tot])

fig, ax = plt.subplots()
ax.bar(xs, n_key, facecolor="blue", label="key indicators")
ax.set_xticks(xs, ["best-performing", "worst-performing"])
ax.set_title("Number of key indicators within best- & worst-performing indicators\n(summed over all capitals)")
ax.grid(True)
fig.savefig(const.n_key_indicators_per_performance_plot)
plt.show()

#### 4.4.3) Best & Worst Performing Key Indicators 

In [None]:
# Rank the key indicators by their oriented slope (best first), drop those
# without a slope, number them from 1 and export the table to Excel.
trend_columns = ["id", "topic", "indicator", "dam_id", "capital", "slope_norm"]
key_inds_with_ranking = key_inds.merge(
    performer_ranking_incl_cap,
    left_on="id",
    right_on="indicator_id",
    how="left",
)
key_ind_trend = key_inds_with_ranking[trend_columns]
key_ind_trend = key_ind_trend.sort_values(by="slope_norm", ascending=False)
key_ind_trend = key_ind_trend.dropna(subset="slope_norm")
key_ind_trend["rank"] = range(1, len(key_ind_trend) + 1)
key_ind_trend.to_excel(const.key_indicator_performance_file)
key_ind_trend