In [1]:
# # # # Web scraping: Africa in International Math Olympiads # # # # 

In [2]:
# Importing packages

import requests,math
import numpy as np
from bs4 import BeautifulSoup

In [3]:
# # Data about performance of countries at the IMO

URL_imo = 'https://www.imo-official.org/results.aspx'
page_imo = requests.get(URL_imo)
imo = BeautifulSoup(page_imo.content, 'html.parser')
results_imo = imo.find(id='main')

countries_imo = results_imo.find_all('tr')
countries_imo.pop(0)
countries_imo.pop(0)

year = [results_imo.find_all('tr')[0]('th')[i]('a')[0]['title'][4:] for i in range(1,len(results_imo.find_all('tr')[0]('th'))-1)]
code = [0 for i in range(len(countries_imo))] # country codes
name = [0 for i in range(len(countries_imo))] # country names
data = [[float('nan') for y in year] for i in code] # country performance at IMO

for i in range(len(countries_imo)):
    code[i] = countries_imo[i].a.text
    name[i] = countries_imo[i].a['title']
    
for c in range(len(code)):
    for y in range(len(year)):
        if not (countries_imo[c].find_all('td')[y+1].a is None): # making sure <a> exists
            if ("team_r.aspx?" in countries_imo[c].find_all('td')[y+1].a["href"]): # if the country c participated the year y
                data[c][y] = int(countries_imo[c].find_all('td')[y+1].a.text)

In [4]:
# # African countries

URL_africa = 'https://allafrica.com/misc/sitemap/countries.html' # list of African countries
page_africa = requests.get(URL_africa)
africa = BeautifulSoup(page_africa.content, 'html.parser')
results_africa = africa.find(class_='category-grid')
countries_africa = [results_africa.find_all('a')[c].text.replace(u'\xa0', u' ') for c in range(1,len(results_africa.find_all('a')))]

In [5]:
# # Need to identify African countries (linking the two data bases)

#african_participant = [0 for c in name] # value 0 if African country never participated to IMO
african_participant = []

i=0
for c in range(len(name)):
    if name[c] in countries_africa:
        #african_participant[c] = 1 # value 1 if African country  participated to at least one IMO edition
        african_participant.append(c) # their position in IMO countries list ('name')
    i=i+1

africa_performance = [data[a] for a in african_participant] # performance of African countries

In [6]:
# # Some statistics

len(african_participant) # 19 African countries at the IMO

np.nanmin(africa_performance,axis=1) # best performance for each African country
name[african_participant[np.argmin(np.nanmin(africa_performance,axis=1))]],np.nanmin(africa_performance),year[np.nanargmin(data[african_participant[np.argmin(np.nanmin(africa_performance,axis=1))]])]
# Morocco achieved the best performance for an African country, 18, in 1982

np.nanmax(africa_performance),sum(sum(africa_performance[a]==np.nanmax(africa_performance) for a in range(len(africa_performance)-1))) # worst performance accross all African countries:110. Achieved 3 times

(110.0, 3)

In [8]:
# Relative performance: rank divided by number of participants

data_standardized = [[float('nan') for y in year] for i in code]

for i in range(len(code)):
    for y in range(len(year)):
        if not(math.isnan(data[i][y])):
            data_standardized[i][y] = np.divide(data[i][y],np.nanmax(data,axis=0)[y])
            
africa_performance_standardized = [data_standardized[a] for a in african_participant] # performance of African countries

In [9]:
np.nanmin(africa_performance_standardized,axis=1) # best relative performance for each African country
name[african_participant[np.argmin(np.nanmin(africa_performance_standardized,axis=1))]],np.nanmin(africa_performance_standardized),year[np.nanargmin(data[african_participant[np.argmin(np.nanmin(africa_performance_standardized,axis=1))]])]
# Morocco achieved the best relative performance for an African country in 1982

('Morocco', 0.3287671232876712, '1986')

In [None]:
# Plot: for each African country, evolution of its performance at IMO

In [None]:
# Map: for each African country, number of its participations at IMO

# Help 1: https://ramiro.org/notebook/basemap-choropleth/ # Not working properly

# Help 2: https://medium.com/using-specialist-business-databases/creating-a-choropleth-map-using-geopandas-and-financial-data-c76419258746

In [None]:
# Needed to install geopandas through Anaconda Prompt, by running the code "conda install -c conda-forge geopandas"

In [15]:
import pandas as pd #used to read in the revenue file 
import matplotlib.pyplot as plt #for plotting
#to read in shape file and provides high #level interface with #matplotlib library for making maps
import geopandas as gpd

#import world shape map externally, no need to have a saved file
world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))

Unable to open EPSG support file gcs.csv.  Try setting the GDAL_DATA environment variable to point to the directory containing EPSG csv files.


In [18]:
#view first 5 rows
world.head()

Unnamed: 0,pop_est,continent,name,iso_a3,gdp_md_est,geometry
0,920938,Oceania,Fiji,FJI,8374.0,"(POLYGON ((180 -16.06713266364245, 180 -16.555..."
1,53950935,Africa,Tanzania,TZA,150600.0,POLYGON ((33.90371119710453 -0.950000000000000...
2,603253,Africa,W. Sahara,ESH,906.5,POLYGON ((-8.665589565454809 27.65642588959236...
3,35623680,North America,Canada,CAN,1674000.0,"(POLYGON ((-122.84 49.00000000000011, -122.974..."
4,326625791,North America,United States of America,USA,18560000.0,"(POLYGON ((-122.84 49.00000000000011, -120 49...."


In [19]:
#inspect count
world.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 177 entries, 0 to 176
Data columns (total 6 columns):
pop_est       177 non-null int64
continent     177 non-null object
name          177 non-null object
iso_a3        177 non-null object
gdp_md_est    177 non-null float64
geometry      177 non-null geometry
dtypes: float64(1), geometry(1), int64(1), object(3)
memory usage: 8.4+ KB


In [12]:
#merge both data sets using country code/iso_a3 as unique identifiers
for_plotting = world.merge(revenue, left_on = 'iso_a3', right_on = 'Country Code')
#check the occurrence
for_plotting.info()

SyntaxError: invalid syntax (<ipython-input-12-cd3cd598c912>, line 1)