In [1]:
# import the needed modules
from bs4 import BeautifulSoup as bs
import pandas as pd
import requests as req
import numpy as np
from IPython.display import HTML

In [2]:
# Request the empty IS-Academia form with the Requests library
# (the request itself was identified using the Postman interceptor).
base_url = 'http://isa.epfl.ch/imoniteur_ISAP/!GEDPUBLICREPORTS.filter?ww_i_reportModel=133685247'
# A timeout keeps the notebook from hanging forever if the server does not answer;
# raise_for_status() stops right here on an HTTP error instead of letting the
# following cells parse an error page.
r = req.get(base_url, timeout=30)
r.raise_for_status()

In [3]:
# Parse the HTML of the form page with the lxml parser; `soup` is what the
# following cells query for <input>, <option> and <select> tags.
soup = bs(r.text, features='lxml')

# Parsing the HTML form

In the following, the parameters model, section, academic year, academic cycle and season are parsed, so that each value is linked to its corresponding numeric code through a dictionary. Thereafter, the parameter names are obtained in the form in which the IS-Academia form accepts them.

In [5]:
# parse the model codes: map each report-model field name to the value of its <input>
parse_model = soup.find_all('input')
html_model = 'ww_i_reportmodel'
xls_model = 'ww_i_reportModelXsl'
model = {}
for ind in parse_model:
    # .get() returns None for an <input> that has no name attribute, whereas
    # ind['name'] would raise KeyError and abort the whole cell.
    field = ind.get('name')
    if field in (html_model, xls_model):
        # a matching input without a value is still an error worth surfacing
        model[field] = ind['value']
model

{'ww_i_reportModelXsl': '133685271', 'ww_i_reportmodel': '133685247'}

In [6]:
# Locate the <option> whose text is 'Informatique' and keep its numeric code
informatique_option = soup.find('option', string='Informatique')
section = {'Informatique': informatique_option['value']}
section

{'Informatique': '249847'}

In [7]:
# parse the academic year - build the labels '2007-2008' ... '2016-2017' and map
# each one to the value of the <option> carrying that text
periods = ['{}-{}'.format(start, start + 1) for start in range(2007, 2017)]
academic_year = {label: soup.find('option', string=label)['value'] for label in periods}
academic_year

{'2007-2008': '978181',
 '2008-2009': '978187',
 '2009-2010': '978195',
 '2010-2011': '39486325',
 '2011-2012': '123455150',
 '2012-2013': '123456101',
 '2013-2014': '213637754',
 '2014-2015': '213637922',
 '2015-2016': '213638028',
 '2016-2017': '355925344'}

In [8]:
# parse the academic cycle - list the sought cycle labels (6 Bachelor and
# 4 Master semesters), then look up the code of each label as for the years
bachelor = ['Bachelor semestre ' + str(n) for n in range(1, 7)]
master = ['Master semestre ' + str(n) for n in range(1, 5)]
bachelor_master = bachelor + master

academic_cycle = {
    label: soup.find('option', string=label)['value']
    for label in bachelor_master
}
academic_cycle

{'Bachelor semestre 1': '249108',
 'Bachelor semestre 2': '249114',
 'Bachelor semestre 3': '942155',
 'Bachelor semestre 4': '942163',
 'Bachelor semestre 5': '942120',
 'Bachelor semestre 6': '942175',
 'Master semestre 1': '2230106',
 'Master semestre 2': '942192',
 'Master semestre 3': '2230128',
 'Master semestre 4': '2230140'}

In [9]:
# parse the academic season - map both season labels to their <option> codes
season_labels = ("Semestre d'automne", 'Semestre de printemps')
season = {label: soup.find('option', string=label)['value'] for label in season_labels}
season

{"Semestre d'automne": '2936286', 'Semestre de printemps': '2936295'}

In [10]:
# collect the name attribute of every <select> in the form, in document order;
# these are the parameter names used to fill in the IS-Academia form
args = soup.find_all('select')
variables = [select['name'] for select in args]
variables

['ww_x_UNITE_ACAD',
 'ww_x_PERIODE_ACAD',
 'ww_x_PERIODE_PEDAGO',
 'ww_x_HIVERETE']

# Extraction of the GPS identifier

Let us now proceed to the extraction of the html tables from IS-Academia. The previous dictionaries are used to pass the correct parameters to send the request. The crucial information to be retrieved is the parameter 'ww_x_GPS', which identifies the table to be downloaded.

In [11]:
# Query parameters for the search; the first three names are taken by position
# from `variables` (the <select> names in form order).
research_keys = {
    variables[0]: section['Informatique'],
    variables[1]: academic_year['2013-2014'],
    variables[2]: academic_cycle['Bachelor semestre 1'],
    'ww_b_list': 1,  # variable to get the tables through the GPS parameter
    'ww_x_HIVERETE': 'null',
    html_model: model[html_model],
    xls_model: model[xls_model],
}
# send the filter request and parse the resulting page
r0 = req.get(base_url, params=research_keys)
s0 = bs(r0.text, 'lxml')

At this point we define a suitable function to "hack" the value of the variable ww_x_GPS. Let us consider the following example:

`<a class="ww_x_GPS" href="javascript:void(0)" onclick="loadReport('ww_x_GPS=1378362120');return false;">Informatique, 2012-2013, Bachelor semestre 2</a>`

The GPS code appears inside the call to the JavaScript function 'loadReport', which is invoked when the dataset link is clicked. The function captureGPS looks for the equal sign '=' inside the round parentheses and stops right before the following apostrophe. Once the value of ww_x_GPS has been obtained, it can be used to send a request to the IS-Academia server with the proper base URL.

In [12]:
# function to detect the GPS code of the table
# strong assumption of the string's structure!
def captureGPS(string):
    """Return the GPS code embedded in an ``onclick`` attribute value.

    The code is the text between the first '=' in ``string`` and the first
    apostrophe that follows it, e.g.
    ``"loadReport('ww_x_GPS=1378362120');return false;"`` -> ``'1378362120'``.

    Parameters
    ----------
    string : str
        The text to scan.

    Returns
    -------
    str
        The characters between the first '=' and the next apostrophe; the
        value is returned as a string, not converted to a number. It is empty
        when the apostrophe immediately follows the '='.

    Raises
    ------
    ValueError
        If ``string`` contains no '=' or no apostrophe after that '='.
    """
    # str.partition returns an empty separator when the delimiter is absent,
    # which lets us detect a malformed string instead of running off its end.
    _, equals, tail = string.partition('=')
    code, quote, _ = tail.partition("'")
    if not equals or not quote:
        raise ValueError(
            "cannot find a GPS code (expected \"=<code>'\") in: {!r}".format(string)
        )
    return code

In [14]:
# Any (uniquely identified) search output is made up of 2 tables: 'Tous', which
# corresponds to ww_x_GPS = -1, and the table we actually want to extract.
# find_all('a') returns the wanted one second in the list, hence index 1.
GPS = 'ww_x_GPS'
request_url = 'http://isa.epfl.ch/imoniteur_ISAP/!GEDPUBLICREPORTS.html?'
table_link = s0.find_all('a')[1]
research_keys[GPS] = captureGPS(table_link['onclick'])
# NOTE: r0 and s0 are overwritten here with the report page, so this cell
# must be run right after the cell that performs the filter request.
r0 = req.get(request_url, params=research_keys)
s0 = bs(r0.text, 'lxml')

In [45]:
# get the attributes (column names) of the dataframe to be created:
# skip the first two <th> cells and keep the first child of each remaining one
attributes_list = s0.find_all('th')[2:]
attributes = [header.contents[0] for header in attributes_list]
attributes

['Civilité',
 'Nom Prénom',
 'Orientation Bachelor',
 'Orientation Master',
 'Spécialisation',
 'Filière opt.',
 'Mineur',
 'Statut',
 'Type Echange',
 'Ecole Echange',
 'No Sciper']

In [85]:
# build up the dataframe
# NOTE: read_html is given the URL, so the page is downloaded again here.
tables = pd.read_html(r0.url)  # list of dataframes; only the first one is used
# Keep the rows from position 3 onwards (i.e. drop the first 3 rows; the original
# comment said 2 - TODO confirm which is intended) and drop columns 11 and 12.
data = tables[0].iloc[3:].drop([11, 12], axis=1)
data.columns = attributes
data = data.set_index('Nom Prénom')