In [49]:
import random
import re

import aiohttp
import bs4
from bs4 import BeautifulSoup


class Skip(Exception):
    """Control-flow signal: stop processing the current country without recording an error."""


class BadEmpty(Exception):
    """Raised when a collection that is expected to hold something is empty."""

    @classmethod
    def check(cls, iter, name, cont):
        """Raise ``cls`` when ``iter`` yields no elements.

        ``iter`` is fully materialised, so a one-shot iterator is consumed.
        ``name`` and ``cont`` are only used to build the error message
        (what was empty, and where).
        """
        if len([*iter]) > 0:
            return
        raise cls(f'{name} in {cont} has no elements')

class NotUnique(Exception):
    """Raised when a collection does not hold exactly the expected number of elements."""

    @classmethod
    def check(cls, iter, name, cont, n=1):
        """Raise ``cls`` unless ``iter`` yields exactly ``n`` elements.

        ``iter`` is consumed completely while counting. ``name`` and ``cont``
        are only used to build the error message (what was counted, and where).
        """
        count = sum(1 for _ in iter)
        if count == n:
            return
        raise cls(f'{name} in {cont} has {count} elements, not {n}')

# Matches strings that are empty or consist solely of whitespace.
blanks = re.compile(r'^\s*$')

def noblanks(iter):
    """Yield the items of ``iter``, dropping whitespace-only strings (" ", "\\n", ...).

    Every ``str`` instance is tested against ``blanks``. That covers plain
    strings as well as BeautifulSoup ``NavigableString`` nodes, which are
    ``str`` subclasses, so subclasses of either are filtered too (an exact
    ``type(...) ==`` comparison would let them through). Items that are not
    strings (e.g. tags) are always yielded unchanged.
    """
    for item in iter:
        if isinstance(item, str) and blanks.match(item):
            continue

        yield item


# Site root; crawl_country() prepends it to the href of each country link.
domain = 'https://www.citypopulation.de'

# One index page per continent; the crawl loop walks these in order.
continent_links = [
    f'{domain}/en/{continent_slug}/'
    for continent_slug in ('africa', 'america', 'asia', 'europe', 'oceania')
]

# Module-level accumulators filled in while crawling.
countries_scrapped, scraping_errors = [], []
stat_options, first_stat = set(), set()

no_cindex, decoderrs = [], []

# Captures the path segment that follows "/en/" in a country href.
getcountry = re.compile(r"/en/([^/]+)/")

async def crawl_country(href, p=1):
    """Fetch one country page and record the ``<h3>`` headings of its ``mcolboxes`` block.

    Results are written to module-level accumulators rather than returned:

    * ``countries_scrapped`` receives the country segment of ``href``
      (appended before the random skip, so skipped countries are listed too);
    * ``decoderrs`` receives any ``UnicodeDecodeError`` hit while decoding the page;
    * ``no_cindex`` receives countries whose page has no usable ``.cindex`` element;
    * ``first_stat`` receives the first heading, ``stat_options`` all headings.

    Parameters
    ----------
    href : str
        URL path of the country page (must contain ``/en/<country>/``);
        it is appended to ``domain`` to build the request URL.
    p : float, optional
        Probability of being executed, by default 1

    Raises
    ------
    ValueError
        If no country can be extracted from ``href``.
    Skip
        If the random draw exceeds ``p``, or the page has no usable ``.cindex``.
    BadEmpty
        If no ``mcolboxes`` child exists or it contains no ``<h3>`` headings.
    """
    match = getcountry.search(href)
    if match is None:
        raise ValueError(f'cannot extract a country from href {href!r}')
    country = match.group(1)
    countries_scrapped.append(country)

    if random.random() > p:
        raise Skip()

    country_url = f'{domain}{href}'
    async with aiohttp.ClientSession() as session:
        async with session.get(country_url) as response:
            try:
                country_html = await response.text()
            except UnicodeDecodeError as e:
                decoderrs.append(e)
                # Decode again, replacing undecodable bytes. str() on the raw
                # bytes would yield the "b'...'" repr (escaped newlines and
                # all) instead of markup, and relied on a private attribute.
                country_html = await response.text(errors='replace')

    country_doc = BeautifulSoup(country_html, "html.parser")
    cindex = country_doc.select(".cindex")

    try:
        BadEmpty.check(cindex, 'stats_colboxes', country)
        indexes = list(noblanks(cindex[0].children))
        BadEmpty.check(indexes, 'stats_colboxes', country)
    except BadEmpty as e:
        # A page without a usable .cindex is noted and skipped, not reported as an error.
        no_cindex.append(country)
        raise Skip() from e

    # First child tag carrying the "mcolboxes" class; non-tag nodes
    # (e.g. stray text) have no attributes and are ignored.
    mcolboxes = next(
        (
            el for el in indexes
            if isinstance(el, bs4.element.Tag) and 'mcolboxes' in el.get('class', [])
        ),
        None,
    )
    if mcolboxes is None:
        raise BadEmpty(f'mcolboxes in {country} has no elements')

    options = [h3.get_text() for h3 in mcolboxes.find_all("h3")]

    BadEmpty.check(options, 'stat options', country)
    first_stat.add(options[0])
    stat_options.update(options)


async with aiohttp.ClientSession() as session:
    for continent in continent_links:
        async with session.get(continent) as response:
            continent_html = await response.text()

        continent_doc = BeautifulSoup(continent_html, "html.parser")
        country_colboxes = continent_doc.select(".mcolboxes")

        mcolboxes = country_colboxes[0]
        # Oceania has 2 .mcolboxes, but the second is for Antartica

        NotUnique.check(noblanks(mcolboxes.children), 'mcolboxes.children', continent)
        country_ul = list(noblanks(mcolboxes.children))[0]

        for i, country_li in enumerate(noblanks(country_ul.children)):
            country_li: bs4.element.Tag
            li_c = list(noblanks(country_li.children))
            if not li_c:
                continue

            NotUnique.check(li_c, 'country_li.children', f'{continent},{i}')
            a = li_c[0]
            a: bs4.element.Tag

            href = a.get('href')
            if type(href) == list: # Sometiems it's a list
                href = href[0]
            href: str

            try:
                await crawl_country(href)
            except Skip:
                continue

            except Exception as e:
                scraping_errors.append((e, country))


In [34]:
from pprint import pprint

# Dump what the crawl collected: all headings seen, the first heading of each
# country page, and the errors recorded by the crawl loop.
# NOTE(review): this cell's execution count (34) is lower than the crawl
# cell's (49), so any saved output may predate the current crawl code --
# re-run after the crawl cell to refresh it.
for collected in (stat_options, first_stat, scraping_errors):
    pprint(collected)
pprint(scraping_errors)

{' ',
 '&',
 ',',
 '-',
 '0',
 '1',
 'A',
 'B',
 'C',
 'D',
 'F',
 'G',
 'H',
 'I',
 'K',
 'L',
 'M',
 'N',
 'P',
 'R',
 'S',
 'T',
 'U',
 'V',
 'W',
 'Z',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'y',
 'z'}
{'Cities and Urban Centers',
 'Departments & Cities',
 'Departments & Major Cities',
 'Departments and Cities',
 'Districts & Cities',
 'Districts & Places',
 'Governorates & Major Cities',
 'Island & Places',
 'Islands & Major Localities',
 'Islands & Major Towns',
 'Islands & Urban Localities',
 'Islands, Capital & Districts',
 'LGAs, Cities & Settlements',
 'Major Cities',
 'Municipalities & Urban Communities',
 'News',
 'Parishes & Cities',
 'Prefectures and Cities',
 'Provinces & Agglomerations',
 'Provinces & Cities',
 'Provinces & Major Cities',
 'Provinces & Major Urban Areas',
 'Provinces & Urban Areas',
 'Provinces and Cities',
 'Provinces, Cities & Urban Places',
 'Regions & Citie