In [115]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
from pathlib import Path
import asyncio
from playwright.async_api import async_playwright
from datetime import datetime as dt
from io import StringIO
from html.parser import HTMLParser

In [112]:
# Create a list of all MONET2030 indicators as well as their mapping to SDGs

def read_monet_page():
    # Access webpage
    indicator_list_url = "https://www.bfs.admin.ch/bfs/en/home/statistics/sustainable-development/monet-2030/all-indicators.html"
    r = requests.get(indicator_list_url)

    # Read page content
    soup = BeautifulSoup(r.text)

    # Return
    return soup

def page_exists(url):
    response = requests.head(url, allow_redirects=True, timeout=10)
    return response.status_code == 200
    
def create_monet_indicator_list(soup):
    # Filter out relevant parts
    monet_table_str = str(soup.find("wgl-indicator-set"))
    pattern = r'"title":.*?},"indicatorLink":{"href":".*?","title":.*?}'
    titles = re.findall(pattern, str(monet_table_str))
    
    sdgs, links, indicators = zip(*[t.replace('"title":','')\
                                     .replace("}","")\
                                     .replace('"indicatorLink":{"href":','').split('","') for t in titles
                                   ]
                                 )
    sdg_ids, topics = zip(*[sdg.split(":") for sdg in sdgs])
    
    # Oragnize the MONET indicators into a DataFrame
    sdgs = []

    base_url = "https://www.bfs.admin.ch"
    for sdg_id, top, ind, href in zip(sdg_ids, topics, indicators, links):
        sdg_number = int(re.findall(r"\d+", sdg_id)[0])
        indicator_url = base_url+href.replace("content/","")
        if not page_exists(indicator_url):
            indicator_url = "unknown"
        sdgs.append({"SDG": sdg_number,
                     "Topic": top.strip(),
                     "Indicator": ind.replace('"',''),
                     "Hyperlink": indicator_url})
    
    sdg_df = pd.DataFrame(sdgs)

    # Add a unique identifier to each indicator
    sdg_df["SubtopicID"] = sdg_df.groupby("SDG").cumcount().add(1).astype(str)
    sdg_df["ID"] = sdg_df.apply(lambda x: "MI-" + str(x["SDG"]) + "." + x["SubtopicID"], axis=1)
    sdg_df.drop("SubtopicID", axis=1, inplace=True)
    sdg_df.set_index("ID", inplace=True)
    
    # Return
    return sdg_df

In [3]:
indicator_table_path = Path("../results/indicator_table.csv")
if not indicator_table_path.exists():
    print("Scraping...")
    monet_soup = read_monet_page()
    monet_indicator_df = create_monet_indicator_list(monet_soup)
    monet_indicator_df.to_csv(indicator_table_path)
    print("-> done!")
else:
    print("Reading from disk...")
    monet_indicator_df = pd.read_csv(indicator_table_path).set_index("ID")
    print("-> done!")

Reading from disk...
-> done!


In [4]:
monet_indicator_df.head()

Unnamed: 0_level_0,SDG,Topic,Indicator,Hyperlink
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
MI-1.1,1,No poverty,Poverty rate,https://www.bfs.admin.ch/bfs/en/home/statistik...
MI-1.2,1,No poverty,Total social security expenditure,https://www.bfs.admin.ch/bfs/en/home/statistik...
MI-1.3,1,No poverty,Fatalities caused by natural events,https://www.bfs.admin.ch/bfs/en/home/statistik...
MI-2.1,2,Zero hunger,Fruit and vegetables consumption,https://www.bfs.admin.ch/bfs/en/home/statistik...
MI-2.2,2,Zero hunger,Nitrogen balance from agriculture,https://www.bfs.admin.ch/bfs/en/home/statistik...


In [5]:
monet_indicator_df.iloc[0,3]

'https://www.bfs.admin.ch/bfs/en/home/statistiken/nachhaltige-entwicklung/monet-2030/indikatoren/armutsquote.html'

In [8]:
async def parse_dynamic_webpage(url):
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()
        await page.goto(url)
        await page.wait_for_timeout(5000)
        html = await page.content()
        await browser.close()
        return html

In [33]:


class MLStripper(HTMLParser):
    def __init__(self):
        super().__init__()
        self.reset()
        self.strict = False
        self.convert_charrefs= True
        self.text = StringIO()
    def handle_data(self, d):
        self.text.write(d)
    def get_data(self):
        return self.text.getvalue()

def strip_tags(html):
    s = MLStripper()
    s.feed(html)
    return s.get_data()

In [65]:
async def scrape_indicator_info(url):
    html = await parse_dynamic_webpage(url)
    soup = BeautifulSoup(html, 'html.parser')
    ul = soup.find("ul", {"class": "search-results-list", "data-vue-component": "asset-list"})
    if not ul:
        return []
    data_file_elements = ul.find_all("li")
    return data_file_elements

In [102]:
def extract_all_data_files(data_file_elements):
    data_file_info_list = []
    for elm in data_file_elements:
        damid_list = re.findall(r'damid="\d+"', str(elm))
        assert len(damid_list)==1
        damid = re.search(r'\d+', damid_list[0]).group(0)
        desc_string = strip_tags(str(elm.find("div", {"class": "card__title"})))
        data_info = desc_string.split(" - ")
        if len(data_info)==2:
            observable = data_info[0]
            description = data_info[1]
            units = ""
        elif len(data_info)==3:
            observable = data_info[0]
            description = data_info[1]
            units = data_info[2]
            
        file_dict = {"damid": damid, 
                     "Data_url": f"https://dam-api.bfs.admin.ch/hub/api/dam/assets/{damid}/master",
                     "Observable": observable, 
                     "Description": description,
                     "Units": units,
                    }
        data_file_info_list.append(file_dict)
    return pd.DataFrame(data_file_info_list)

In [130]:
df_list = []
counter = 0
n_indicators = len(monet_indicator_df)

start = dt.now()
for idx, indicator in monet_indicator_df.iterrows():
    counter += 1
    print(f"{counter}/{n_indicators}", end="\r")
    data_elements = await scrape_indicator_info(indicator["Hyperlink"])
    df = extract_all_data_files(data_elements)
    df["Indicator"] = indicator["Indicator"]
    df["SDG"] = indicator["SDG"]
    df["Topic"] = indicator["Topic"]
    df_list.append(df)
end = dt.now()
elapsed = end - start
print(f"Finished after {elapsed.seconds} seconds.")

7/109

Future exception was never retrieved
future: <Future finished exception=TargetClosedError('Target page, context or browser has been closed')>
playwright._impl._errors.TargetClosedError: Target page, context or browser has been closed
Future exception was never retrieved
future: <Future finished exception=TargetClosedError('Target page, context or browser has been closed')>
playwright._impl._errors.TargetClosedError: Target page, context or browser has been closed


Finished after 927 seconds.


In [131]:
complete_data_df = pd.concat(df_list, ignore_index=True)[["SDG", "Topic", "Indicator", "Observable", "Description", "Units", "damid", "Data_url"]]

In [134]:
complete_data_df.to_csv("../results/monet_datafile_summary_table.csv", index=False)

In [126]:
elapsed.seconds

3