# MONET2030 - Data Scraping/ETL

In [None]:
# Stdlib imports
import re
from pathlib import Path
from datetime import datetime as dt

# 3rd party imports
import pandas as pd

# Local imports
from pymonet import monet_etl as etl
from pymonet import monet_consts as const

## 1) List of all MONET2030 indicators

First, let's scrape a list of all indicators together with their metadata (e.g. the URLs pointing to the indicator-specific subpages). We write this information to a dataframe and store it on disk.

In [None]:
async def scrape_indicator_table(indicator_table_url: str) -> pd.DataFrame:
    """
    Scrape the MONET2030 indicator table from the web and cache it on disk.

    The table is extracted and transformed by ``etl.ETL_MonetIndicatorList``
    and then written as CSV to ``const.indicator_table_path``.

    Parameters
    ----------
    indicator_table_url : str
        URL pointing to the indicator table

    Returns
    -------
    pd.DataFrame
        DataFrame containing the full list of
        MONET2030 indicators
    """
    print("Scraping...")
    # ETL process for Monet2030 indicator list; use the URL passed by the
    # caller rather than an undefined module-level name.
    etl_mil = etl.ETL_MonetIndicatorList(indicator_table_url)
    await etl_mil.extract()
    etl_mil.transform()
    # Write to the same path the calling cell checks with `.exists()`.
    etl_mil.df.to_csv(const.indicator_table_path)
    print("-> done!")

    return etl_mil.df

In [None]:
if not const.indicator_table_path.exists():  # Only scrape if the data is not already on disk
    monet_indicator_df = await scrape_indicator_table(const.url_all_monet2030_indicators)
else:  # Otherwise, read data from disk
    print("Reading from disk...")
    monet_indicator_df = pd.read_csv(const.indicator_table_path).set_index("ID")
    print("-> done!")

## 2) List of all data files for all MONET2030 indicators

Given the list of all subpages related to the MONET2030 indicators (see Step 1), we can now go a step further and scrape each of these subpages. Doing so, we find yet another set of URLs, which point to the actual indicator-specific data files. It is the data in these files that we are ultimately interested in.

In [None]:
if not const.summary_table_path.exists():
    df_list = []
    counter = 0
    n_indicators = len(monet_indicator_df)
    
    start = dt.now()
    print("Scraping...")
    for idx, indicator in monet_indicator_df.iterrows():
        counter += 1
        print(f"{counter}/{n_indicators}", end="\r")
    
        # ETL process for specific Monet2030 indicator
        etl_mii = etl.ETL_MonetIndicatorInfo(indicator["Hyperlink"])
        await etl_mii.extract()
        etl_mii.transform()
    
        # Augment data
        etl_mii.df["Indicator"] = indicator["Indicator"]
        etl_mii.df["SDG"] = indicator["SDG"]
        etl_mii.df["Topic"] = indicator["Topic"]
        df_list.append(etl_mii.df)
    end = dt.now()
    elapsed = end - start
    print("-> done!")
    print(f"Finished after {elapsed.seconds} seconds.")

    # Concatenate individual dfs
    summary_table = pd.concat(df_list, ignore_index=True)
    # Resort columns
    summary_table = summary_table[["SDG", "Topic", "Indicator", "Observable", "Description", "Units", "damid", "Data_url"]]
    # Write resulting table to file
    summary_table.to_csv(const.summary_table_path, index=False)
else:
    print("Reading from disk...")
    summary_table = pd.read_csv(const.summary_table_path)
    print("-> done!")

## 3) Download all the data files

In [None]:
# Download every data file listed in the summary table built in Step 2.
# `sheet_name=None` makes pandas load all sheets of each workbook, so every
# entry of `database` is a dict mapping sheet name -> DataFrame.
# NOTE(review): the original iterated over `complete_data_df`, which is not
# defined anywhere in this notebook; `summary_table` is the table that holds
# the "Data_url" column - confirm this is the intended source.
database = [
    pd.read_excel(href, sheet_name=None)
    for href in summary_table["Data_url"]
]

In [None]:
# Display how many entries were collected (one per URL in the "Data_url" column)
len(database)