# MONET2030 - Data Scraping/ETL

In [None]:
# Stdlib imports
import re
from pathlib import Path

# 3rd party imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Local imports
from pymonet import monet_scraper as scraper
from pymonet import monet_consts as const
from sipi_da_utils import utils

## 1) List of all MONET2030 indicators

First, let's scrape a list of all indicators and their meta information (e.g. the URLs pointing to the indicator-specific subpages). Let's write this info to a dataframe and store it to disk.

In [None]:
itl = scraper.IndicatorTableLoader(const.url_all_monet2030_indicators, 
                                   const.indicator_table_path
                                  )
await itl.get_table()

In [None]:
# Preview the first rows of the indicator table as a quick sanity check.
itl.table.head()

## 2) List of all data files for all MONET2030 indicators

Given the list of all subpages related to the MONET2030 indicators (see Step 1), we can now go one step further and scrape each of these subpages. Doing so yields a further set of URLs that point to the actual indicator-specific data files. The data in these files is what we are ultimately interested in.

In [None]:
mitl = scraper.MetaInfoTableLoader(itl.table,
                                   const.metainfo_table_path
                                  )
await mitl.get_table()

In [None]:
# Preview the first rows of the meta-info table as a quick sanity check.
mitl.table.head()

## 3) Download all the data files

In [None]:
# Set up the data-file loader with the meta-info table and the configured
# directories for raw and processed data.
dfl = scraper.DataFileLoader(
    mitl.table,
    const.raw_data_dir,
    const.processed_data_dir,
)

In [None]:
# Fetch the data files.
# NOTE(review): force_download=True presumably re-downloads files even if they
# already exist locally, so every run hits the remote server — confirm, and
# consider switching to False for repeated runs.
dfl.get_data(force_download=True)

## 4) Data Cleaning

In [None]:
# Pick the "metrics" entry of the "stage3" results held by the loader.
stage3_results = dfl.processed_data_list["stage3"]
monet_data = stage3_results["metrics"]

In [None]:
# Preview the first rows of the stage-3 metrics table.
monet_data.head()

### 4.1) Data set specific cleaning - Remove indicators not relevant for Agenda 2030

In [None]:
# Join the indicator table with the meta-info table (indicator "id" against
# "indicator_id"), then collect the dam_ids of every row flagged as not
# relevant for Agenda 2030 (agenda2030_relevant == 0).
merged = itl.table.merge(mitl.table, left_on="id", right_on="indicator_id")
not_agenda_relevant = merged["agenda2030_relevant"] == 0
unrelevant_observables = set(merged.loc[not_agenda_relevant, "dam_id"].values)

In [None]:
# Select the metric columns belonging to a non-relevant observable. The dam_id
# is parsed from the column name: take the text before the first underscore,
# drop its last character and convert the remainder to int.
unrelevant_metrics = [
    col
    for col in monet_data.columns
    if int(col.partition("_")[0][:-1]) in unrelevant_observables
]

In [None]:
# Consistency check: the dam_ids parsed back from the selected metric columns
# should be exactly the set of non-relevant observables (displays True/False).
{int(metric.partition("_")[0][:-1]) for metric in unrelevant_metrics} == set(unrelevant_observables)

In [None]:
# Keep only the metrics relevant for Agenda 2030 by dropping the columns
# identified above; work on an explicit copy from here on.
relevant_monet_df = monet_data.drop(columns=unrelevant_metrics).copy()
relevant_monet_df.head()

In [None]:
# Plot the data availability of the Agenda-2030-relevant metrics. The frame is
# transposed before plotting; per the axis labels this puts metric IDs on the
# y-axis and years on the x-axis.
# The result is bound to fig/ax so the notebook does not echo the return value.
fig, ax = utils.visualize_data_availability(
    relevant_monet_df.transpose(),
    title="Data availability for MONET2030",
    x_label="Years",
    y_label="Metric IDs",
)

### 4.2) Standard data cleaning

In [None]:
# Standard cleaning of the Agenda-2030-relevant metrics.
# NOTE(review): what each step does is inferred from the method names only —
# confirm against sipi_da_utils.
LATEST_YEAR = 2025   # passed as max_year to the time filter
MIN_NON_NULL = 4     # passed as n_notnull_min to the sparse-column filter

cleaner = utils.DataCleaner(relevant_monet_df, verbose=2)
cleaner.remove_constant_columns()
cleaner.apply_time_filter(max_year=LATEST_YEAR)
cleaner.drop_sparse_columns(n_notnull_min=MIN_NON_NULL)

In [None]:
# Take the data frame held by the cleaner after the cleaning steps above.
monet_clean = cleaner.df

## 5) Data Imputation 
Impute missing values through linear interpolation wherever possible.

In [None]:
# Fill gaps in the cleaned data via the project's interpolation helper
# (linear interpolation, per the section description above).
monet_interp = utils.interpolate_data(monet_clean)

In [None]:
# Plot the data availability again, now for the interpolated data. The frame
# is transposed before plotting; per the axis labels this puts metric IDs on
# the y-axis and years on the x-axis.
# NOTE(review): the title is the same as for the plot before cleaning and
# interpolation — consider distinguishing the two figures.
# The result is bound to fig/ax so the notebook does not echo the return value.
fig, ax = utils.visualize_data_availability(
    monet_interp.transpose(),
    title="Data availability for MONET2030",
    x_label="Years",
    y_label="Metric IDs",
)

## 6) Correlation analysis

In [None]:
# Correlation analysis on the interpolated metrics: build the analysis object,
# compute the correlations and show them as a heatmap (empty title).
# NOTE(review): the exact effect of timeseries=True is defined in
# sipi_da_utils — confirm there.
monet_ca = utils.CorrelationAnalysis(monet_interp, timeseries=True)
monet_ca.compute_correlation()
monet_ca.plot_corr_heatmap(title="")

In [None]:
# Obtain the metrics that remain after strong correlations are dropped at a
# threshold of 0.9, then display how many are left.
# NOTE(review): presumably the threshold applies to the absolute correlation
# coefficient — confirm in sipi_da_utils.
non_redundant_metrics = monet_ca.drop_strong_correlations(threshold=0.9)
len(non_redundant_metrics)

In [None]:
# Repeat the correlation analysis on the non-redundant metrics only: build a
# boolean column mask first, then analyse and plot the reduced frame.
keep_columns = monet_interp.columns.isin(non_redundant_metrics)
monet_ca2 = utils.CorrelationAnalysis(monet_interp.loc[:, keep_columns], timeseries=True)
monet_ca2.compute_correlation()
monet_ca2.plot_corr_heatmap(title="")

In [None]:
# Map non-redundant metrics to non-redundant observables/dam_ids. The dam_id is
# parsed from the metric name: take the text before the first underscore, drop
# its last character and convert the remainder to int.
non_redundant_dam_ids = {
    int(metric.partition("_")[0][:-1]) for metric in non_redundant_metrics
}
print(len(non_redundant_dam_ids))

# Show the meta-info rows belonging to those observables.
is_non_redundant = mitl.table["dam_id"].isin(non_redundant_dam_ids)
mitl.table[is_non_redundant]

In [None]:
# Map non-redundant observables/dam_ids to the (unique) indicator ids they
# belong to, then display how many indicators remain.
non_redundant_indicators = mitl.table.loc[mitl.table["dam_id"].isin(non_redundant_dam_ids), "indicator_id"].unique()
len(non_redundant_indicators)

In [None]:
# Show the indicator-table rows of the non-redundant indicators.
itl.table[itl.table["id"].isin(non_redundant_indicators)]