# MONET2030 - Analysis

In [None]:
# Stdlib imports
import re
from pathlib import Path
from collections import namedtuple

# 3rd party imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import colormaps as cm

# Local imports
from pymonet import monet_scraper as scraper
from pymonet import monet_processor as processor
from pymonet import monet_consts as const
from pymonet import monet_analysis as analysis
from sipi_da_utils import utils, tsa, plot

## 1) Get Web Data

In [None]:
# Instantiate the loader from pymonet's scraper module and fetch the raw data.
# `load()` is awaited at top level, so this cell needs an environment that
# supports top-level await (e.g. a Jupyter/IPython kernel).
scraper_pipeline = scraper.MonetLoader()
raw_data = await scraper_pipeline.load()

## 2) Process/Transform Data

In [None]:
# Build the transformation pipeline from the scraped raw data and the two
# metatables exposed by the loader.
pipeline = processor.TransformationPipeline(raw_data,
                                            scraper_pipeline.indicators_metatable,
                                            scraper_pipeline.observables_metatable
                                           )
# Run the pipeline; its return value is reused below as the input of the
# correlation analysis.
final_output = pipeline.run()

In [None]:
# Collect the pipeline results; they are indexed by key further down
# (e.g. results["clean"]) and handed to the analyzer.
results = pipeline.collect_results()

## 3) Visual inspection

In [None]:
# Create all inspection plots with write=True
# (presumably this persists them to disk - confirm in pymonet).
figsaxes = pipeline.create_inspection_plots(create='all', write=True)
# Alternative call kept for quick checks: only the 'clean vs raw' plots, write=False.
#figsaxes = pipeline.create_inspection_plots(create=['clean vs raw'], write=False)

## 4) Analysis

### 4.1) Setup & Preparations

In [None]:
# Single analyzer instance, built from the collected pipeline results and
# reused by the availability/coverage analyses below.
analyzer = analysis.MonetAnalyzer(results)

### 4.2) Data Availability & Coverage Analysis

#### 4.2.1) Analysis of capitals

Question: How many metrics per capital are there?

In [None]:
# Stand-alone analysis object operating on the "clean" entry of the results.
n_metrics_per_capital = analysis.nMetricsPerCapital(results["clean"])
# As the last expression of the cell, the return value of analyze() is displayed.
n_metrics_per_capital.analyze()

In [None]:
# Metrics-per-capital analysis via the shared analyzer; the two paths are
# presumably the output locations for the table and the figure (confirm in pymonet).
analyzer.number_of_metrics_per_capital(
    data_fpath=const.n_metrics_per_cap_fpath,
    plot_fpath=const.n_metrics_per_cap_plot_fpath,
)

#### 4.2.2) Analysis of sparse data

Question: How many metrics per capital are too sparse (i.e. have fewer than 10 data points)?

In [None]:
# Sparse-metrics analysis via the shared analyzer; the two paths are
# presumably the output locations for the table and the figure (confirm in pymonet).
analyzer.number_of_sparse_metrics_per_capital(
    data_fpath=const.sparse_metrics_analysis_fpath,
    plot_fpath=const.n_sparse_by_capital_plot_fpath,
)

#### 4.2.3) Analysis of irrelevant data

Question: How many metrics per capital are irrelevant to agenda2030?

In [None]:
# Irrelevant-metrics analysis via the shared analyzer; the two paths are
# presumably the output locations for the table and the figure (confirm in pymonet).
analyzer.number_of_irrelevant_metrics_per_capital(
    data_fpath=const.irrelevant_metrics_analysis_fpath,
    plot_fpath=const.n_irrelevant_by_capital_plot_fpath,
)

### 4.3) Analysis of raw data availability

Goal: Visually represent how many data points are available for each metric. Add additional information such as when that data is available and what capital the metric belongs to. 

In [None]:
# Raw-data availability analysis via the shared analyzer; the two paths are
# presumably the output locations for the table and the chart (confirm in pymonet).
analyzer.raw_data_availability(
    data_fpath=const.data_availability_fpath,
    plot_fpath=const.data_availability_chart_fpath,
)

### 4.4) Correlation Analysis

Question: Which metrics are redundant?

In [None]:
# Correlation analysis on the output of the transformation pipeline.
monet_ca = utils.CorrelationAnalysis(final_output)
# Cross-correlation matrix at lag 0.
zero_lag_corrmat = monet_ca.cross_corr(lag=0)
# Second matrix from max_abs_corr(); presumably the maximum absolute
# correlation over several lags - confirm in sipi_da_utils.
agg_corrmat = monet_ca.max_abs_corr()

In [None]:
# For several correlation thresholds, determine which metrics remain once
# strongly correlated ones are dropped: once based on the zero-lag matrix
# ("monochron" file) and once based on the aggregated matrix ("diachron" file).
# After the loop, the `to_keep_*` / `corr_xlsx_*` names hold the results of
# the last threshold only.
# NOTE(review): `metric2capital_map` is not defined in the visible part of
# this notebook; it has to exist in the session before this cell runs.
for th in [0.8, 0.85, 0.9, 0.95, 0.99]:
    # round() rather than int(): th * 100 is a float product and int()
    # truncates, so e.g. 0.57 * 100 == 56.99999999999999 would be labelled 56.
    th_label = round(th * 100)
    monochronic_fpath = const.corra_dir / f"keep_monochron_corr_th{th_label}.xlsx"
    diachronic_fpath = const.corra_dir / f"keep_diachron_corr_th{th_label}.xlsx"

    to_keep_zero_lag, corr_xlsx_zero_lag = monet_ca.drop_strong_correlations(
        zero_lag_corrmat,
        threshold=th,
        id2name_map=metric2capital_map,
        fpath_corr=monochronic_fpath,
    )

    to_keep_agg, corr_xlsx_agg = monet_ca.drop_strong_correlations(
        agg_corrmat,
        threshold=th,
        id2name_map=metric2capital_map,
        fpath_corr=diachronic_fpath,
    )

In [None]:
# Count, per capital, how many metrics survive the redundancy filter for a
# range of correlation thresholds (0.80, 0.82, ..., 0.98, 0.99, 0.999).
# NOTE(review): `metric2capital_map` is not defined in the visible part of
# this notebook; it has to exist in the session before this cell runs.
th_vec = [th / 100 for th in range(80, 100, 2)] + [0.99, 0.999]

counts = pd.DataFrame(index=["Social", "Human", "Natural", "Economic"], columns=[])

for th in th_vec:
    to_keep, _ = monet_ca.drop_strong_correlations(agg_corrmat, threshold=th)
    kept_metrics = metric2capital_map[metric2capital_map.index.isin(to_keep)]
    # Count only `metric_name`: counting every column would add the other
    # columns of the map to `counts` as well and make the next join fail on
    # overlapping column names.
    counts_per_cap = (
        kept_metrics.groupby("capital - primary")["metric_name"].count().rename(th)
    )
    # The outer join keeps capitals that are missing from either side.
    counts = counts.join(counts_per_cap, how="outer")

# Extra row with the column-wise sum over all capitals.
counts.loc["Total", :] = counts.sum()
counts

In [None]:
# Line chart of the per-capital (and total) counts against the threshold,
# saved as a PDF in the correlation-analysis directory.
count_ticks = range(1, 42, 2)

fig, ax = plt.subplots(figsize=(10, 4))
counts.T.plot(kind="line", ax=ax)

# One tick per evaluated threshold / every second count value.
ax.set_xticks(th_vec, th_vec, rotation=60)
ax.set_yticks(count_ticks, count_ticks)

ax.set_xlabel("correlation threshold")
ax.set_ylabel("Number of non-redundant metrics")
ax.set_title("Counting non-redundant metrics in dependency of\ncorrelation threshold")
ax.grid()

plt.tight_layout()
output_pdf = const.corra_dir / "n_nonredundant_per_threshold.pdf"
fig.savefig(output_pdf, bbox_inches="tight")

In [None]:
# Per-capital count of the metrics listed in `to_keep_agg`, i.e. the result of
# the last threshold processed by the export loop above.
metric2capital_map[metric2capital_map.index.isin(to_keep_agg)].groupby("capital - primary").agg("count")

In [None]:
# Histograms of the zero-lag correlation values: signed values (with KDE) on
# the left, absolute values on the right.
zero_lag_flat = zero_lag_corrmat.unstack()
panels = (
    (zero_lag_flat.values, {"kde": True, "bins": 40},
     "Distribution of correlation values", "corr"),
    (zero_lag_flat.abs().values, {"bins": 20},
     "Distribution of abs(correlation) values", "abs(corr)"),
)

fig, axs = plt.subplots(1, 2, figsize=(10, 4))
for ax, (values, hist_kwargs, title, xlabel) in zip(axs, panels):
    sns.histplot(values, ax=ax, **hist_kwargs)
    ax.set_title(title)
    ax.set_xlabel(xlabel)

plt.tight_layout()
plt.show()

In [None]:
# Histograms of the aggregated correlation values: signed values (with KDE)
# on the left, absolute values on the right.
agg_flat = agg_corrmat.unstack()
panels = (
    (agg_flat.values, {"kde": True, "bins": 40},
     "Distribution of correlation values", "corr"),
    (agg_flat.abs().values, {"bins": 20},
     "Distribution of abs(correlation) values", "abs(corr)"),
)

fig, axs = plt.subplots(1, 2, figsize=(10, 4))
for ax, (values, hist_kwargs, title, xlabel) in zip(axs, panels):
    sns.histplot(values, ax=ax, **hist_kwargs)
    ax.set_title(title)
    ax.set_xlabel(xlabel)

plt.tight_layout()
plt.show()

### 4.5) Trend & Performance Analysis

#### 4.5.1) Analysis of trend statistics per capital
Question: How well do the individual capitals do in terms of evolving in the right direction?

In [None]:
# Get a table with all metrics including their name and desired_trend and
# prepare two columns for slope information: slope and slope_norm.
# NOTE(review): `metric2capital_map` is not defined in the visible part of
# this notebook; it has to exist in the session before this cell runs.
directions = pd.read_csv(const.trend_directions).set_index("metric_id")

# Numeric id: the part of metric_id before the first "_" with its last
# character stripped.
directions["dam_id"] = [int(metr_id.split("_")[0][:-1]) for metr_id in directions.index]
directions_with_names = directions.join(metric2capital_map[["metric_name"]])

# NaN placeholders (instead of None) keep both columns float-typed, so the
# later sorting, aggregation and plotting operate on numeric data rather than
# on object columns.
performer_ranking = directions_with_names.assign(slope=np.nan, slope_norm=np.nan)
performer_ranking.head()

In [None]:
# Fill in the slope and slope_norm columns
#
# REMARK: slope_norm is introduced to put all metrics
# on a common scale. Depending on the desired_trend, for
# some metrics a higher (i.e. more positive) slope is
# better, for other metrics a lower (i.e. more negative)
# slope is better. The column slope_norm simplifies this
# as a higher (more positive) slope_norm is always better.
#
# NOTE(review): `monet_data` and `metric2capital_map` are not defined in the
# visible part of this notebook; they have to exist in the session.

# Factor applied to the raw slope per desired trend; any other trend value
# leaves slope_norm undefined (NaN) and the row is dropped below.
trend_sign = {"up": 1.0, "down": -1.0}

for mid in performer_ranking.index:
    data = monet_data[mid].dropna()
    x = list(data.keys())
    y = data.values

    if len(y) < 2 or y[0] == 0:
        # Fewer than two points cannot define a trend line, and a zero
        # baseline makes the normalisation y / y[0] undefined.
        slope = np.nan
    else:
        # Slope of a straight-line fit to the series scaled by its first value.
        slope = np.polyfit(x, y / y[0], 1)[0]

    performer_ranking.loc[mid, "slope"] = slope
    sign = trend_sign.get(performer_ranking.loc[mid, "desired_trend"], np.nan)
    performer_ranking.loc[mid, "slope_norm"] = sign * slope

# Make sure both slope columns are numeric even if they were created with
# non-numeric placeholders.
performer_ranking = performer_ranking.astype({"slope": float, "slope_norm": float})

performer_ranking = performer_ranking.join(metric2capital_map[["capital - primary"]])
# Best performers first; metrics without a usable slope_norm are removed.
performer_ranking = (
    performer_ranking.dropna(subset=["slope_norm"])
    .sort_values(by="slope_norm", ascending=False)
)
performer_ranking

In [None]:
# Get summary statistics about the slope_norm:
# mean, median and standard deviation per primary capital.
slope_norm_stats = performer_ranking.groupby("capital - primary").agg({"slope_norm": ["mean", "median", "std"]})
# Persist the table as CSV, then display it as the cell output.
slope_norm_stats.to_csv(const.slope_stats_fpath)
slope_norm_stats

In [None]:
# Box plot of slope_norm per capital. The y-axis is clipped to a fixed range;
# a red downward marker near the lower edge signals an outlier outside of it.
y_limits = [-0.05, 0.3]
outlier_x, outlier_y = 2, -0.045  # hard-coded position of the marker

fig, ax = plt.subplots(figsize=(10, 3))
sns.boxplot(
    data=performer_ranking,
    x="capital - primary",
    y="slope_norm",
    ax=ax,
)
ax.scatter(
    [outlier_x],
    [outlier_y],
    marker="v",
    color="red",
    label="direction of single extreme outlier",
)
ax.legend(loc="upper right")
ax.grid(True)
ax.set_ylim(y_limits)
ax.set_title("Distribution of normalized slopes per capital")

fig.savefig(const.slope_distro_plot_fpath)
plt.show()

#### 4.5.2) Best & Worst Performing Metrics
##### 4.5.2.a) Best & Worst Performing Metrics overall
Question: Which are the 3 best and worst performing metrics over all capitals?

In [None]:
# Enrich the ranking with metadata from the scraper's metatable (matched on
# dam_id) and bring the columns into a fixed order, indexed by metric_id.

# Metatable columns removed before the merge.
dropped_meta_columns = [
    "observable",
    "description",
    "capital - primary",
    "data_file_url",
    "hyperlink",
    "units",
    "id",
]
# Column order of the resulting table (metric_id becomes the index).
ranked_column_order = [
    "metric_id",
    "metric_name",
    "capital - primary",
    "sdg",
    "topic",
    "indicator_id",
    "observable",
    "description",
    "is_key",
    "slope_norm",
    "desired_trend",
    "slope",
]

reduced_metatable = scraper_pipeline.metatable.drop(dropped_meta_columns, axis=1)
ranked_metatable = performer_ranking.reset_index().merge(reduced_metatable, on="dam_id")
ranked_metatable = ranked_metatable[ranked_column_order].set_index("metric_id")
ranked_metatable

In [None]:
# Horizontal bar chart of slope_norm for every metric, coloured by capital.
df = ranked_metatable

# Work on a copy with metric_id as a regular column and give every row an
# explicit numeric position on the y axis (avoids categorical spacing issues).
df_plot = df.reset_index()
df_plot["y_pos"] = range(len(df_plot))

fig, ax = plt.subplots(figsize=(15, 30))

bar_options = {
    "x": "slope_norm",
    "y": "y_pos",
    "hue": "capital - primary",
    "dodge": False,  # keep all hues at the same bar position
    "orient": "horizontal",
    "legend": True,
}
sns.barplot(data=df_plot, ax=ax, **bar_options)

# One tick per bar, labelled with the metric name.
ax.set_yticks(df_plot["y_pos"])
ax.set_yticklabels(df_plot["metric_name"], fontsize=8)

# Labels and title
ax.set_xlabel("normalized slope (higher = better)")
ax.set_ylabel("metric names")
ax.set_title("Ranking of evolution over time of MONET2030 metrics")
ax.grid(True)

plt.tight_layout()
fig.savefig(const.performance_ranking_plot_fpath)
plt.show()

In [None]:
# Columns (and their order) used for the top/bottom performer tables in the
# cells below.
columns = ["metric_name", 
           "capital - primary", 
           "sdg", 
           "topic", 
           "indicator_id", 
           "observable", 
           "description", 
           "is_key", 
           "slope", 
           "slope_norm", 
           "desired_trend"]

In [None]:
# The three metrics with the highest slope_norm over all capitals, written to
# an Excel file and displayed.
ranked_descending = ranked_metatable.sort_values(by="slope_norm", ascending=False)
top3 = ranked_descending.head(3)[columns]
top3.to_excel(const.top3_metrics_fpath)
top3

In [None]:
# The three metrics with the lowest slope_norm over all capitals (last rows
# of the descending ranking), written to an Excel file and displayed.
ranked_descending = ranked_metatable.sort_values(by="slope_norm", ascending=False)
bottom3 = ranked_descending.tail(3)[columns]
bottom3.to_excel(const.bottom3_metrics_fpath)
bottom3

##### 4.5.2.b) Best & Worst Performing Metrics per capital
Question: Which are the 3 best and worst performing metrics within each capital?

In [None]:
# Print, display and export (one Excel sheet per capital) the three metrics
# with the highest slope_norm within each capital, and count how many of them
# are key indicators.
# NOTE(review): `four_caps` is not defined in the visible part of this
# notebook; it has to exist in the session before this cell runs.
heading = "TOP 3 PERFORMERS PER CAPITAL"
print(heading)
print("=" * len(heading))

# Sort explicitly instead of relying on the row order left behind by earlier
# cells; the stable sort keeps an already sorted table unchanged.
ranked_desc = ranked_metatable.sort_values(by="slope_norm", ascending=False, kind="stable")

n_top_key = 0  # key indicators among the selected rows
n_top_tot = 0  # selected rows with a non-missing is_key value
with pd.ExcelWriter(const.top3_metrics_per_cap_fpath) as writer:
    for cap in four_caps:
        print(cap)
        print(len(cap) * "-")
        perf_cap = ranked_desc[ranked_desc["capital - primary"] == cap][columns]
        top3 = perf_cap.head(3)
        n_top_key += top3["is_key"].sum()
        n_top_tot += top3["is_key"].count()
        display(top3)
        top3.to_excel(writer, sheet_name=cap)

In [None]:
# Print, display and export (one Excel sheet per capital) the three metrics
# with the lowest slope_norm within each capital, and count how many of them
# are key indicators.
# NOTE(review): `four_caps` is not defined in the visible part of this
# notebook; it has to exist in the session before this cell runs.
heading = "BOTTOM 3 PERFORMERS PER CAPITAL"
print(heading)
# Underline derived from the heading so that both always have the same length.
print("=" * len(heading))

# Sort explicitly instead of relying on the row order left behind by earlier
# cells; the stable sort keeps an already sorted table unchanged.
ranked_desc = ranked_metatable.sort_values(by="slope_norm", ascending=False, kind="stable")

n_bottom_key = 0  # key indicators among the selected rows
n_bottom_tot = 0  # selected rows with a non-missing is_key value
with pd.ExcelWriter(const.bottom3_metrics_per_cap_fpath) as writer:
    for cap in four_caps:
        print(cap)
        print(len(cap) * "-")
        perf_cap = ranked_desc[ranked_desc["capital - primary"] == cap][columns]
        bottom3 = perf_cap.tail(3)
        display(bottom3)
        n_bottom_key += bottom3["is_key"].sum()
        n_bottom_tot += bottom3["is_key"].count()
        bottom3.to_excel(writer, sheet_name=cap)

#### 4.5.3) Best & Worst Performing Key Indicators 
Question: How many key indicators are there among the groups of best and worst performing metrics, respectively?

In [None]:
# Bar chart comparing the number of key indicators among the best- and the
# worst-performing metrics (counts accumulated in the two cells above).
xs = np.array([1, 2])
n_key = np.array([n_top_key, n_bottom_key])
n_tot = np.array([n_top_tot, n_bottom_tot])  # computed here, not drawn below

group_names = ["best-performing", "worst-performing"]

fig, ax = plt.subplots()
ax.bar(xs, n_key, facecolor="blue", label="key indicators")
ax.set_xticks(xs, group_names)
ax.grid(True)
ax.set_title("Number of key indicators within best- & worst-performing indicators\n(summed over all capitals)")

fig.savefig(const.n_key_indicators_per_performance_plot_fpath)
plt.show()

Question: What is the ranking of only the key indicators with respect to their evolution over time?

In [None]:
# Rank the key indicators by slope_norm: attach the ranking information to the
# key-indicator table, keep the relevant columns, order best-first and drop
# indicators without a slope_norm.
merged_key_indicators = scraper_pipeline.key_indicators_df.merge(
    ranked_metatable,
    left_on="id",
    right_on="indicator_id",
    how="left",
)
key_ind_trend = merged_key_indicators[["id", "indicator", "capital - primary", "slope_norm"]]
key_ind_trend = key_ind_trend.sort_values(by="slope_norm", ascending=False)
key_ind_trend = key_ind_trend.dropna(subset="slope_norm")

# Rank 1 = highest slope_norm.
key_ind_trend["rank"] = range(1, len(key_ind_trend) + 1)
key_ind_trend.to_excel(const.key_indicator_performance_fpath)
key_ind_trend