In [None]:
import tqdm
import re
import requests

import pandas as pd

from bs4 import BeautifulSoup
from unicodedata import normalize
from datetime import datetime

# ESN Automatic Section Counter

## Functions

In [None]:
def get_soup_from_url(url, timeout=30):
    """Download ``url`` and parse the response body with the lxml parser.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float or tuple, optional
        Seconds to wait for the server, forwarded to ``requests.get``.
        Without a timeout a stalled connection would block the notebook
        indefinitely.

    Returns
    -------
    BeautifulSoup
        Document parsed from the response text.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status code.
    requests.RequestException
        On connection problems or when the timeout expires.
    """
    page = requests.get(url, timeout=timeout)
    # Fail loudly on error pages instead of parsing them and tripping over
    # a missing element much later in the notebook.
    page.raise_for_status()
    return BeautifulSoup(page.text, 'lxml')


def get_global_counts(content):
    """Extract the network-wide section and country totals from ``content``.

    Parameters
    ----------
    content : object
        Parsed element exposing ``find('p')``; the text of the first
        paragraph found is searched for the summary sentence.

    Returns
    -------
    list of int
        ``[section_count, country_count]``, in the order the numbers appear
        in the sentence.

    Raises
    ------
    ValueError
        If no ``<p>`` is found or the summary sentence does not match.
    """
    # Raw string: "\d" inside a plain literal is an invalid escape sequence.
    # The final "." is still "any character", exactly as before.
    regex = r"The ESN network consists at this moment of (\d+) local sections in (\d+) countries."

    paragraph = content.find('p')
    if paragraph is None:
        raise ValueError("No paragraph found while looking for the global counts.")

    # NFKD applies compatibility decomposition (e.g. a non-breaking space
    # becomes a plain space) so the literal spaces in the regex can match.
    text = normalize("NFKD", paragraph.get_text())
    match = re.search(regex, text)
    if match is None:
        raise ValueError("Global counts sentence not found in: {!r}".format(text))

    return [int(elem) for elem in match.groups()]


def get_country_section_count(country_url):
    """Fetch a country page and read its organisation name and section count.

    Parameters
    ----------
    country_url : str
        URL of the country page, passed to ``get_soup_from_url``.

    Returns
    -------
    tuple
        ``(national_org_name, section_count)`` where the name is the text of
        the ``h1.page-header`` element and the count is the integer found
        after "Number of sections:" in the ``div.num_sections_country``
        element.

    Raises
    ------
    ValueError
        If either element is missing or the count text does not match.
    """
    soup = get_soup_from_url(country_url)

    header = soup.find('h1', {'class': 'page-header'})
    counter = soup.find('div', {'class': 'num_sections_country'})
    if header is None or counter is None:
        raise ValueError("Unexpected page layout at {}".format(country_url))

    # Raw string: "\d" inside a plain literal is an invalid escape sequence.
    section_count_regex = r"Number of sections: (\d+)"
    match = re.search(section_count_regex, counter.text)
    if match is None:
        raise ValueError("Section count not found at {}".format(country_url))

    return header.text, int(match.group(1))


def get_cells(row, tag):
    """Return the stripped text of every ``tag`` element found in ``row``.

    ``row`` must expose ``find_all(tag)``; each match contributes its
    ``.text`` with surrounding whitespace removed, in document order.
    """
    cells = []
    for cell in row.find_all(tag):
        cells.append(cell.text.strip())
    return cells

## Website scraping

### Global counts

In [None]:
# Page used below for both the global counts and the per-country links.
main_url = "https://www.esn.org/sections"
main_soup = get_soup_from_url(main_url)

# Walk down four successive first-<div> matches below #content-block.
main_content = (
    main_soup.find(id='content-block')
    .find('div')
    .find('div')
    .find('div')
    .find('div')
)

In [None]:
# get_global_counts returns [sections, countries]; unpack in that order.
(global_section_count, global_country_count) = get_global_counts(main_content)

# Show both totals as the cell output.
(global_section_count, global_country_count)

### Counts per country

In [None]:
# Each <div> below the first <div> of the main content is read as one
# country entry whose first link points at the country page.
country_divs = main_content.find('div').find_all('div')
country_urls = [country_div.find('a')['href'] for country_div in country_divs]

# One request per country page; tqdm reports progress over the URL list.
main_country_counts = (
    pd.Series(dict(get_country_section_count(url) for url in tqdm.tqdm(country_urls)))
    .rename('website')
    # Relabel the UK entry (presumably to match the wiki table's naming).
    .rename(index={'ESN UK': 'ESN United Kingdom'})
)

In [None]:
# Totals derived from the per-country pages rather than the summary sentence.
agg_section_count = main_country_counts.sum()
agg_country_count = main_country_counts.count()

# Show both aggregates as the cell output.
(agg_section_count, agg_country_count)

## Wiki scraping

In [None]:
wiki_url = "https://en.wikipedia.org/wiki/Erasmus_Student_Network"
wiki_table_candidates = get_soup_from_url(wiki_url).select('.wikitable.sortable')

# Exactly one sortable wikitable is expected; with any other number the
# parsing below would be reading the wrong table (or none at all).
if len(wiki_table_candidates) > 1:
    raise ValueError("There are multiple valid tags. Further clarification needed.")
elif len(wiki_table_candidates) == 0:
    raise ValueError("No valid tag detected.")

# Keep the raw <tr> tags under their own name so `rows` only ever holds
# the parsed cell lists.
row_tags = wiki_table_candidates[0].find_all('tr')
headers = get_cells(row_tags[0], tag='th')
# Skip the header row and the last row (presumably a totals/footer row;
# confirm against the article if the table layout changes).
rows = [get_cells(row_tag, tag='td') for row_tag in row_tags[1:-1]]

table = pd.DataFrame(rows, columns=headers)

# Keep only the text before the first "(" and drop the dagger/asterisk
# markers. `.str[0]` takes the first split piece directly, instead of
# building one Series per row with `.apply(pd.Series)`.
table['Name'] = (
    table['Name']
    .str.split('(').str[0]
    .str.replace('†', '', regex=False)
    .str.replace('*', '', regex=False)
    .str.strip()
)

# A "-" in the count column is read as zero; regex=False makes this a
# literal replacement, consistent with the replacements above.
table['Local sections'] = (
    table['Local sections']
    .str.replace('-', '0', regex=False)
    .astype(int)
)

# Series of section counts indexed by organisation name, labelled 'wiki'.
wiki_country_counts = table.set_index('Name')['Local sections'].rename('wiki')
wiki_country_counts.index.name = None

## Crossing datasets

In [None]:
# Outer join keeps organisations that appear in only one source; their
# missing count is filled with 0 so the comparison flags them.
counts_comparison = (
    wiki_country_counts.to_frame()
    .join(main_country_counts.to_frame(), how='outer')
    .fillna(0)
    .astype(int)
)

# Keep only the rows where the two sources disagree.
counts_comparison = counts_comparison.loc[
    counts_comparison['wiki'] != counts_comparison['website'],
    ['wiki', 'website'],
]

## Results

In [None]:
# Stamp the report with the local time at which this cell ran.
print(
    "Results as of",
    datetime.today().strftime("%B %d, %Y at %H:%M:%S"),
)

In [None]:
# Totals as stated in the summary sentence on the official website.
print(
    "The official website currently indicates ESN comprises",
    global_section_count,
    "sections in",
    global_country_count,
    "countries.",
)

In [None]:
# Compare the website's stated section total with the per-country sum.
if global_section_count != agg_section_count:
    print("The aggregate section count is however different at", agg_section_count)
else:
    print("The aggregate and global section counts are the same.")

In [None]:
# Compare the website's stated country total with the number of country pages counted.
if global_country_count != agg_country_count:
    print("The aggregate country count is however different at", agg_country_count)
else:
    print("The aggregate and global country counts are the same.")

In [None]:
# counts_comparison only holds rows where wiki and website disagree.
if len(counts_comparison) > 0:
    print("The following differences were detected between the website and the wiki:")
    print()
    print(counts_comparison)
else:
    print("No differences detected between the website and the wiki!")