# MONET2030 - Data Scraping/ETL

In [None]:
# Stdlib imports
import re
from pathlib import Path
from collections import namedtuple

# 3rd party imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import colormaps as cm

# Local imports
from pymonet import monet_scraper as scraper
from pymonet import monet_processor as processor
from pymonet import monet_consts as const
from sipi_da_utils import utils, tsa, plot

## 1) Get Web Data

In [None]:
# Build the scraper and fetch the raw web data.
# NOTE: the top-level `await` requires an async-aware front end (e.g. a
# Jupyter/IPython kernel); it is a syntax error in a plain Python script.
scraper_pipeline = scraper.Scraper()
raw_data = await scraper_pipeline.scrape()

## 2) Process/Transform Data

In [None]:
# Feed the scraped data together with the scraper's two metatables into the
# transformation pipeline.
pipeline = processor.TransformationPipeline(raw_data,
                                            scraper_pipeline.indicators_metatable,
                                            scraper_pipeline.observables_metatable
                                           )
# run() returns the final output; per-stage results are read from
# `pipeline.stages` in the cells further below.
final_output = pipeline.run()

## 3) Visual inspection

In [None]:
# Stages 2 and 3: copies of the stage outputs whose index holds year labels,
# parsed into datetimes with the "%Y" format.
monet_data = pipeline.stages[2].output.copy()
monet_clean = pipeline.stages[3].output.copy()
for _yearly in (monet_data, monet_clean):
    _yearly.index = pd.to_datetime(_yearly.index, format="%Y")

# Stage 4: output and its uncertainty envelopes are indexed by fractional
# years, converted with the project helper.
monet_interpolated = pipeline.stages[4].output.copy()
monet_envelopes = pipeline.stages[4].additional_results["uncertainty_envelopes"].copy()
for _fractional in (monet_interpolated, monet_envelopes):
    _fractional.index = tsa.fractional_years_to_datetime(_fractional.index)

# Stage 5: residuals and trends are used with their index as delivered.
monet_trends = pipeline.stages[5].additional_results["trends"].copy()
monet_residuals = pipeline.stages[5].output.copy()

monet_zscores = final_output  # equivalent to pipeline.stages[6].output

In [None]:
# Cleaned series as the main data, with the raw stage-2 table passed as the
# scatter overlay (see the title for the colour coding).
plot.plot_data(monet_clean, 
               title="Clean vs raw data (clean = red line, raw = black dots)", 
               scatter_df=monet_data, 
              )

In [None]:
# Interpolated series as the main data, cleaned data as the scatter overlay,
# and the stage-4 uncertainty envelopes passed as `error_df`.
plot.plot_data(monet_interpolated,
               title="GP-interpolated vs clean data (interpolated = red line, clean = black dots)",
               scatter_df=monet_clean,
               error_df=monet_envelopes,
              )

In [None]:
# Trend lines as the main data, with the interpolated series as the scatter
# overlay.
plot.plot_data(monet_trends,
               title="Trend lines (red) through GP-interpolated data (black dots)",
               scatter_df=monet_interpolated,
              )

In [None]:
# Stage-5 output on its own (no overlay).
plot.plot_data(monet_residuals, 
               title="Residuals after detrending GP-interpolated data",
              )

In [None]:
# Final pipeline output (`monet_zscores` is the same object as `final_output`).
plot.plot_data(monet_zscores,
               title="Normalized residuals of detrended data",
              )

## 4) Analysis

### 4.1) Setup & Preparations

In [None]:
# Lookup table keyed by metric id, holding the metric name and its primary
# capital; taken from the additional results of pipeline stage 1.
metric2capital_map = (
    pipeline.stages[1]
    .additional_results["metric_id2name_map"][
        ["metric_id", "metric_name", "capital - primary"]
    ]
    .set_index("metric_id")
)
metric2capital_map.head()

### 4.2) Data Availability & Coverage Analysis

In [None]:
# Metric id collections: everything in the stage-2 table, the metrics stage 3
# reported as irrelevant, and the metrics still present after stage 3.
all_metrics = list(monet_data.columns)
irrelevant_metrics = list(pipeline.stages[3].additional_results["irrelevant_metrics"].columns)
kept_metrics = list(monet_clean.columns)

# Metrics with fewer than 10 non-missing observations. The ids are left-padded
# with zeros to 14 characters -- presumably to match the id format used in
# `metric2capital_map`; TODO confirm.
sparse_metrics = [
    metric_id.zfill(14)
    for metric_id, n_points in monet_data.count().items()
    if n_points < 10
]
pruned_metrics = set(all_metrics).difference(kept_metrics)

# One slice of the metric -> capital map per collection.
irrelevant_metrics_df = metric2capital_map[metric2capital_map.index.isin(irrelevant_metrics)]
sparse_metrics_df = metric2capital_map[metric2capital_map.index.isin(sparse_metrics)]
kept_metrics_df = metric2capital_map[metric2capital_map.index.isin(kept_metrics)]
pruned_metrics_df = metric2capital_map[metric2capital_map.index.isin(pruned_metrics)]

#### 4.2.1) Analysis of sparse data

Question: How many metrics per capital are too sparse (i.e. have fewer than 10 data points)?

In [None]:
# Attach the number of non-missing observations per metric as a "count"
# column, order by it (largest first) and persist the table.
# NOTE: the cell overwrites `sparse_metrics_df`; running it a second time
# fails because the "count" column already exists.
sparse_metrics_df = (
    sparse_metrics_df
    .join(monet_data.count(axis=0).rename("count"), how="left")
    .sort_values(by="count", ascending=False)
)
sparse_metrics_df.to_csv(const.sparse_metrics_analysis_fpath)
sparse_metrics_df.head()

In [None]:
# Number of sparse metrics per primary capital (non-missing "count" entries).
sparse_metrics_per_capital = sparse_metrics_df.groupby("capital - primary")[["count"]].count()

fig, ax = plt.subplots(figsize=(8, 3))
sparse_metrics_per_capital.plot.bar(ax=ax, legend=False)
ax.grid(True)
ax.set(
    xlabel="capital",
    ylabel="count",
    title="Number of removed metrics per capital due to insufficient data availability",
)
plt.xticks(rotation=0)  # keep the capital names horizontal
plt.tight_layout()
fig.savefig(const.n_sparse_by_capital_plot_fpath)
plt.show()

#### 4.2.2) Analysis of irrelevant data

Question: How many metrics per capital are irrelevant to agenda2030?

In [None]:
# Attach the number of non-missing observations per metric as a "count"
# column, order by it (largest first) and persist the table.
# NOTE: the cell overwrites `irrelevant_metrics_df`; running it a second time
# fails because the "count" column already exists.
irrelevant_metrics_df = (
    irrelevant_metrics_df
    .join(monet_data.count(axis=0).rename("count"), how="left")
    .sort_values(by="count", ascending=False)
)
irrelevant_metrics_df.to_csv(const.irrelevant_metrics_analysis_fpath)
irrelevant_metrics_df.head()

In [None]:
# Number of irrelevant metrics per primary capital (non-missing "count" entries).
irrelevant_metrics_per_capital = irrelevant_metrics_df.groupby("capital - primary")[["count"]].count()

fig, ax = plt.subplots(figsize=(8, 3))
irrelevant_metrics_per_capital.plot.bar(ax=ax, legend=False)
ax.grid(True)
ax.set(
    xlabel="capital",
    ylabel="count",
    title="Number of metrics irrelevant to agenda2030 (per capital)",
)
plt.xticks(rotation=0)  # keep the capital names horizontal
plt.tight_layout()
fig.savefig(const.n_irrelevant_by_capital_plot_fpath)
plt.show()

### 4.3) Analysis of raw data availability

Goal: Visually represent how many data points are available for each metric. Also show when that data is available and which capital each metric belongs to.

In [None]:
# Non-missing observations per metric, largest first, as a one-column frame.
datapoint_counts = (
    monet_data.count()
    .sort_values(ascending=False)
    .to_frame(name="count")
)

# Add metric name / capital and drop metrics without a primary capital.
metric_availability = (
    datapoint_counts
    .join(metric2capital_map)
    .dropna(subset=["capital - primary"])
)

In [None]:
# Tall single-axes figure for the per-metric bar chart.
fig, ax = plt.subplots(figsize=(17,30))
# NOTE(review): the first text argument is "counts" here, whereas the same
# positional argument in the combined figure further below is
# "Number of measured data points per metric" -- confirm which is intended.
plot.raw_data_availability_barchart(metric_availability,
                                    "counts",
                                    "Metric Name",
                                    "Number of measured data points per metric",
                                    ax=ax)
plt.tight_layout()
plt.show()

In [None]:
# Transpose so that rows are metrics and columns are the time axis; drop
# repeated column labels (first occurrence wins).
monet_trp = monet_data.transpose()
monet_trp = monet_trp.loc[:, ~monet_trp.columns.duplicated()].copy()

# `monet_data.index` was converted with pd.to_datetime(..., format="%Y"), so
# the columns here are Timestamps. The year-range arithmetic below needs plain
# integers (range() and `+ 1` raise TypeError on Timestamps), hence the
# reduction to calendar years.
if isinstance(monet_trp.columns, pd.DatetimeIndex):
    monet_trp.columns = monet_trp.columns.year
    # Distinct timestamps could map to the same year; keep the first one.
    monet_trp = monet_trp.loc[:, ~monet_trp.columns.duplicated()].copy()

# Years present in the data vs. the full contiguous range they span.
existing_years = list(monet_trp.columns)
full_year_range = list(range(min(existing_years), max(existing_years) + 1))

# Identify missing years
missing_years = [y for y in full_year_range if y not in existing_years]

# Insert the missing years as all-NaN columns and order all columns
# chronologically in a single step.
monet_trp = monet_trp.reindex(columns=full_year_range)

# Primary capital per metric. reindex() yields NaN for metrics that are absent
# from the map instead of raising KeyError like a list-based .loc lookup.
monet_trp["capital"] = metric2capital_map["capital - primary"].reindex(monet_trp.index)

In [None]:
# Restrict the transposed table to the metrics listed in `metric_availability`,
# in that table's row order (sorted by data point count, see above).
df_plot = monet_trp.loc[metric_availability.index,:]

# Two panels side by side sharing the y axis, with almost no gap between them.
fig,axs=plt.subplots(1,2, figsize=(17,30), sharey=True, gridspec_kw = {"wspace": 0.01})
# Left plot panel
plot.visualize_data_availability_colored(df_plot,
                                         "Year",
                                         "Metric Name",
                                         "MONET2030 metric data availability across time",
                                         ax=axs[0]
                                        )

# Right plot panel
plot.raw_data_availability_barchart(metric_availability,
                                    "Number of measured data points per metric",
                                    "Metric Name",
                                    "Number of measured data points per metric",
                                    ax=axs[1],
                                    show_legend=False
                                    )
plt.tight_layout()
# The figure title is added after tight_layout() and positioned manually (y=0.9).
fig.suptitle("Data availability", y=0.9, fontsize=18)
fig.savefig(const.data_availability_dir / "data_availability_all.pdf", bbox_inches="tight")
plt.show()

### 4.4) Correlation Analysis

Question: Which metrics are redundant?

In [None]:
# Correlation analysis on the final pipeline output.
monet_ca = utils.CorrelationAnalysis(final_output)
# Correlation matrix at lag 0.
zero_lag_corrmat = monet_ca.cross_corr(lag=0)
# Presumably the maximum absolute correlation across lags -- TODO confirm
# against utils.CorrelationAnalysis.
agg_corrmat = monet_ca.max_abs_corr()

In [None]:
# Correlation threshold passed to drop_strong_correlations().
th = 0.99
# Each call returns the metrics to keep plus a second result (bound to the
# corr_xlsx_* names; its contents are not visible in this notebook).
to_keep_zero_lag, corr_xlsx_zero_lag = monet_ca.drop_strong_correlations(zero_lag_corrmat, threshold=th)
to_keep_agg, corr_xlsx_agg = monet_ca.drop_strong_correlations(agg_corrmat, threshold=th)

In [None]:
# Thresholds to scan: 0.80, 0.82, ..., 0.98, plus 0.99 and 0.999.
th_vec = [pct / 100 for pct in range(80, 100, 2)] + [0.99, 0.999]

# One row per capital; one column per threshold is joined in below.
counts = pd.DataFrame(index=["Social", "Human", "Natural", "Economic"], columns=[])

# The loop variable is deliberately not named `th`: that would overwrite the
# notebook-level `th = 0.99` defined in an earlier cell and leave it at 0.999,
# silently changing the result of re-running that cell.
for threshold in th_vec:
    to_keep, _ = monet_ca.drop_strong_correlations(agg_corrmat, threshold=threshold)
    # Number of retained metrics per primary capital at this threshold.
    counts_per_cap = (
        metric2capital_map[metric2capital_map.index.isin(to_keep)]
        .groupby("capital - primary")
        .agg("count")
    )
    counts = counts.join(counts_per_cap.rename({"metric_name": threshold}, axis=1), how="outer")

# Column sums across all capitals.
counts.loc["Total", :] = counts.sum()
counts

In [None]:
# One line per row of `counts`, drawn against the scanned thresholds.
fig, ax = plt.subplots(figsize=(10, 4))
counts.T.plot.line(ax=ax)

# Tick at every scanned threshold; y ticks at the odd numbers 1..41.
ax.set_xticks(th_vec, th_vec, rotation=60)
ax.set_yticks(range(1, 42, 2), range(1, 42, 2))
ax.set(
    xlabel="correlation threshold",
    ylabel="Number of non-redundant metrics",
    title="Counting non-redundant metrics in dependency of\ncorrelation threshold",
)
ax.grid()
plt.tight_layout()
fig.savefig(const.corra_dir / "n_nonredundant_per_threshold.pdf", bbox_inches="tight")

In [None]:
# Metrics retained in `to_keep_agg`, counted per primary capital.
(
    metric2capital_map
    .loc[metric2capital_map.index.isin(to_keep_agg)]
    .groupby("capital - primary")
    .count()
)

In [None]:
# Flatten the lag-0 correlation matrix once; both panels draw from it.
_zero_lag_flat = zero_lag_corrmat.unstack()
fig, axs = plt.subplots(ncols=2, figsize=(10, 4))

# Left: signed correlation values with a KDE overlay.
ax = axs[0]
sns.histplot(_zero_lag_flat.values, bins=40, kde=True, ax=ax)
ax.set(title="Distribution of correlation values", xlabel="corr")

# Right: absolute correlation values.
ax = axs[1]
sns.histplot(_zero_lag_flat.abs().values, bins=20, ax=ax)
ax.set(title="Distribution of abs(correlation) values", xlabel="abs(corr)")

plt.tight_layout()
plt.show()

In [None]:
# Flatten the aggregated correlation matrix once; both panels draw from it.
_agg_flat = agg_corrmat.unstack()
fig, axs = plt.subplots(ncols=2, figsize=(10, 4))

# Left: correlation values with a KDE overlay.
ax = axs[0]
sns.histplot(_agg_flat.values, bins=40, kde=True, ax=ax)
ax.set(title="Distribution of correlation values", xlabel="corr")

# Right: absolute correlation values.
ax = axs[1]
sns.histplot(_agg_flat.abs().values, bins=20, ax=ax)
ax.set(title="Distribution of abs(correlation) values", xlabel="abs(corr)")

plt.tight_layout()
plt.show()

## 4) Visual inspection