In [115]:
# Stdlib imports
import re
from pathlib import Path
from datetime import datetime as dt

# 3rd party imports
# -- data analysis
import pandas as pd

# -- web scraping
import requests
import asyncio
from bs4 import BeautifulSoup
from playwright.async_api import async_playwright

# Local imports


In [192]:
# Create a list of all MONET2030 indicators as well as their mapping to SDGs

async def parse_dynamic_webpage(url, wait_ms=5000):
    """Render a JavaScript-driven page in headless Chromium and return its HTML.

    Parameters
    ----------
    url : str
        Address of the page to load.
    wait_ms : int, optional
        Time in milliseconds to wait after navigation so that client-side
        scripts can populate the page. Defaults to 5000.

    Returns
    -------
    str
        The page content as returned by Playwright after the wait.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            page = await browser.new_page()
            await page.goto(url)
            await page.wait_for_timeout(wait_ms)
            return await page.content()
        finally:
            # Always release the browser, even if navigation or rendering fails.
            await browser.close()


def create_monet_indicator_list(html):
    """Build a table of MONET 2030 indicators from the all-indicators page.

    Every ``<tr>`` of the first ``<tbody>`` is expected to hold two links:
    the first labelled ``"SDG <n>: <topic>"`` and the second labelled with
    the indicator name and pointing to the indicator page.

    Parameters
    ----------
    html : str
        HTML of the all-indicators overview page.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``ID`` (``"MI-<SDG>.<running number within the SDG>"``)
        with the columns ``SDG``, ``Topic``, ``Indicator``, ``Hyperlink``
        and ``Agenda2030_relevant`` (1 or 0).

    Raises
    ------
    ValueError
        If the HTML contains no ``<tbody>`` element.
    """
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    tbody = BeautifulSoup(html, "html.parser").find("tbody")
    if tbody is None:
        raise ValueError("No <tbody> element found: the indicator table is missing from the HTML.")

    # Organize the MONET indicators into a DataFrame
    base_url = "https://www.bfs.admin.ch"
    columns = ["SDG", "Topic", "Indicator", "Hyperlink", "Agenda2030_relevant"]
    records = []
    for row in tbody.find_all("tr"):
        links = row.find_all("a")
        # Split on the first colon only, so topics containing ":" stay intact.
        sdg_label, topic = links[0]["aria-label"].split(":", 1)
        relevance_icons = row.find_all("img", {"title": "Agenda 2030: relevant"})
        records.append({
            "SDG": int(sdg_label.split()[1]),
            "Topic": topic.strip(),
            "Indicator": links[1]["aria-label"],
            "Hyperlink": base_url + links[1]["href"].replace("content/", ""),
            "Agenda2030_relevant": 1 if len(relevance_icons) == 1 else 0,
        })

    sdg_df = pd.DataFrame(records, columns=columns)

    # Add a unique identifier to each indicator: running number within its SDG
    subtopic_id = sdg_df.groupby("SDG").cumcount().add(1).astype(str)
    sdg_df["ID"] = "MI-" + sdg_df["SDG"].astype(str) + "." + subtopic_id

    return sdg_df.set_index("ID")

In [193]:
indicator_table_path = Path("../results/indicator_table.csv")
if not indicator_table_path.exists():
    print("Scraping...")
    monet_soup = await parse_dynamic_webpage('https://www.bfs.admin.ch/bfs/en/home/statistics/sustainable-development/monet-2030/all-indicators.html')
    monet_indicator_df = create_monet_indicator_list(monet_soup)
    monet_indicator_df.to_csv(indicator_table_path)
    print("-> done!")
else:
    print("Reading from disk...")
    monet_indicator_df = pd.read_csv(indicator_table_path).set_index("ID")
    print("-> done!")

Scraping...
-> done!


In [195]:
# Preview the first ten rows of the indicator table.
monet_indicator_df.head(10)

Unnamed: 0_level_0,SDG,Topic,Indicator,Hyperlink,Agenda2030_relevant
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
MI-1.1,1,No poverty,Poverty rate,https://www.bfs.admin.ch/bfs/en/home/statistik...,1
MI-1.2,1,No poverty,Total social security expenditure,https://www.bfs.admin.ch/bfs/en/home/statistik...,1
MI-1.3,1,No poverty,Fatalities caused by natural events,https://www.bfs.admin.ch/bfs/en/home/statistik...,1
MI-2.1,2,Zero hunger,Fruit and vegetables consumption,https://www.bfs.admin.ch/bfs/en/home/statistik...,1
MI-2.2,2,Zero hunger,Nitrogen balance from agriculture,https://www.bfs.admin.ch/bfs/en/home/statistik...,1
MI-2.3,2,Zero hunger,Plant genetic resources for food and agriculture,https://www.bfs.admin.ch/bfs/en/home/statistik...,1
MI-2.4,2,Zero hunger,Organic farming,https://www.bfs.admin.ch/bfs/en/home/statistik...,1
MI-2.5,2,Zero hunger,Greenhouse gas emissions from agriculture,https://www.bfs.admin.ch/bfs/en/home/statistik...,1
MI-2.6,2,Zero hunger,Arable land,https://www.bfs.admin.ch/bfs/en/home/statistik...,0
MI-3.1,3,Good health and well-being,Measles immunisation coverage,https://www.bfs.admin.ch/bfs/en/home/statistik...,1


In [65]:
async def scrape_indicator_info(url):
    """Collect the ``<li>`` entries of the asset list on an indicator page.

    The page is rendered via ``parse_dynamic_webpage`` and searched for the
    ``<ul class="search-results-list" data-vue-component="asset-list">``
    element. An empty list is returned when that element is absent.
    """
    page_source = await parse_dynamic_webpage(url)
    asset_list = BeautifulSoup(page_source, 'html.parser').find(
        "ul", {"class": "search-results-list", "data-vue-component": "asset-list"}
    )
    return asset_list.find_all("li") if asset_list else []

In [102]:
def extract_all_data_files(data_file_elements):
    """Turn the asset-list entries of an indicator page into a table.

    Each element must contain exactly one ``damid="<digits>"`` attribute in
    its markup. The text of its ``<div class="card__title">`` is split on
    ``" - "`` into up to three fields:

    * one part    -> ``Observable`` only
    * two parts   -> ``Observable``, ``Description``
    * three parts -> ``Observable``, ``Description``, ``Units``
    * more parts  -> first is ``Observable``, last is ``Units`` and the
      parts in between are re-joined into ``Description``

    Parameters
    ----------
    data_file_elements : iterable
        Parsed list-item elements (objects supporting ``str()`` and
        ``find``), e.g. the result of ``scrape_indicator_info``.

    Returns
    -------
    pandas.DataFrame
        One row per element with the columns ``damid``, ``Data_url``,
        ``Observable``, ``Description`` and ``Units``. The columns are
        present even when the input is empty.

    Raises
    ------
    ValueError
        If an element does not contain exactly one ``damid`` attribute.
    """
    columns = ["damid", "Data_url", "Observable", "Description", "Units"]
    records = []
    for elm in data_file_elements:
        damids = re.findall(r'damid="(\d+)"', str(elm))
        if len(damids) != 1:
            raise ValueError(
                f"Expected exactly one damid per data file element, found {len(damids)}."
            )
        damid = damids[0]

        title = elm.find("div", {"class": "card__title"})
        title_text = title.get_text() if title is not None else ""

        # Assign all three fields on every iteration so that values from a
        # previous element can never leak into the current row.
        parts = title_text.split(" - ")
        observable = parts[0]
        if len(parts) >= 3:
            description = " - ".join(parts[1:-1])
            units = parts[-1]
        else:
            description = parts[1] if len(parts) == 2 else ""
            units = ""

        records.append({
            "damid": damid,
            "Data_url": f"https://dam-api.bfs.admin.ch/hub/api/dam/assets/{damid}/master",
            "Observable": observable,
            "Description": description,
            "Units": units,
        })
    return pd.DataFrame(records, columns=columns)

In [130]:
df_list = []
counter = 0
n_indicators = len(monet_indicator_df)

start = dt.now()
for idx, indicator in monet_indicator_df.iterrows():
    counter += 1
    print(f"{counter}/{n_indicators}", end="\r")
    data_elements = await scrape_indicator_info(indicator["Hyperlink"])
    df = extract_all_data_files(data_elements)
    df["Indicator"] = indicator["Indicator"]
    df["SDG"] = indicator["SDG"]
    df["Topic"] = indicator["Topic"]
    df_list.append(df)
end = dt.now()
elapsed = end - start
print(f"Finished after {elapsed.seconds} seconds.")

7/109

Future exception was never retrieved
future: <Future finished exception=TargetClosedError('Target page, context or browser has been closed')>
playwright._impl._errors.TargetClosedError: Target page, context or browser has been closed
Future exception was never retrieved
future: <Future finished exception=TargetClosedError('Target page, context or browser has been closed')>
playwright._impl._errors.TargetClosedError: Target page, context or browser has been closed


Finished after 927 seconds.


In [131]:
# Stack the per-indicator tables into one frame with a fresh 0..n-1 index and
# put the descriptive columns first.
complete_data_df = pd.concat(df_list, ignore_index=True)[["SDG", "Topic", "Indicator", "Observable", "Description", "Units", "damid", "Data_url"]]

In [134]:
# Persist the data file summary table; the positional index is not written.
complete_data_df.to_csv("../results/monet_datafile_summary_table.csv", index=False)

In [139]:
# Number of data file rows collected across all indicators.
len(complete_data_df)

131

In [136]:
# Read the workbook behind every data URL. sheet_name=None makes
# pd.read_excel return a dict of DataFrames keyed by sheet name, so
# `database` holds one such dict per data file, in table order.
database = []
for href in complete_data_df["Data_url"]:
    database.append(pd.read_excel(href, sheet_name=None))

  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")


In [138]:
# Number of workbooks loaded; one per row of complete_data_df.
len(database)

131

In [142]:
# Fetch the rendered HTML of the all-indicators overview page.
html = await parse_dynamic_webpage("https://www.bfs.admin.ch/bfs/en/home/statistics/sustainable-development/monet-2030/all-indicators.html")

In [149]:
# Table rows of the overview page, one per indicator. The parser is named
# explicitly so the result does not depend on which parsers are installed.
indicators = BeautifulSoup(html, "html.parser").find("tbody").find_all("tr")

In [158]:
# Raw href of the second link in the first table row. Index `indicators`
# directly instead of relying on a leftover `indicator` variable, and end the
# cell with the value so it is displayed.
href = indicators[0].find_all("a")[1]["href"]
href

'/content/bfs/en/home/statistiken/nachhaltige-entwicklung/monet-2030/indikatoren/armutsquote.html'

In [161]:
# aria-label of the first link in the first table row; it has the form
# "SDG <number>: <topic>".
indicators[0].find_all("a")[0]["aria-label"]

'SDG 1: No poverty'

In [182]:
# Look up the "Agenda 2030: relevant" icon(s) in the first table row.
indicators[0].find_all("img", {"title": "Agenda 2030: relevant"})

[<img alt="Agenda 2030: relevant" class="icon icon--xl" src="/content/dam/bfs/de/indicators/monet-agenda2030.svg" title="Agenda 2030: relevant"/>]

In [189]:
# Assemble one record per table row; the links of a row are looked up once
# and reused for all fields derived from them.
base_url = "https://www.bfs.admin.ch"
df = pd.DataFrame([
    {
        "SDG": links[0]["aria-label"].split(":")[0].split()[1].strip(),
        "Topic": links[0]["aria-label"].split(":")[1].strip(),
        "Indicator": links[1]["aria-label"],
        "Hyperlink": base_url + links[1]["href"].replace("content/", ""),
        "Agenda2030_relevant": int(len(row.find_all("img", {"title": "Agenda 2030: relevant"})) == 1),
    }
    for row in indicators
    for links in [row.find_all("a")]
])

In [191]:
# Display the frame built in the previous cell.
df

Unnamed: 0,SDG,Topic,Indicator,Hyperlink,Agenda2030_relevant
0,1,No poverty,Poverty rate,https://www.bfs.admin.ch/bfs/en/home/statistik...,1
1,1,No poverty,Total social security expenditure,https://www.bfs.admin.ch/bfs/en/home/statistik...,1
2,1,No poverty,Fatalities caused by natural events,https://www.bfs.admin.ch/bfs/en/home/statistik...,1
3,2,Zero hunger,Fruit and vegetables consumption,https://www.bfs.admin.ch/bfs/en/home/statistik...,1
4,2,Zero hunger,Nitrogen balance from agriculture,https://www.bfs.admin.ch/bfs/en/home/statistik...,1
...,...,...,...,...,...
104,16,"Peace, justice and strong institutions",Language use: multilingualism,https://www.bfs.admin.ch/bfs/en/home/statistik...,0
105,16,"Peace, justice and strong institutions",Participation in cultural activities,https://www.bfs.admin.ch/bfs/en/home/statistik...,0
106,16,"Peace, justice and strong institutions",Level of public debt,https://www.bfs.admin.ch/bfs/en/home/statistik...,0
107,17,Partnerships for the goals,Official Development Assistance,https://www.bfs.admin.ch/bfs/en/home/statistik...,1
