In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import dateutil

**Assumption:** no student starts Bachelor semester 1 (B1) or Master semester 1 (M1) in the spring semester. **TODO:** find the EPFL documentation that confirms this.

In [2]:
# Query-string parameter names sent to the ISA report pages.
# Each one is paired with a value read from the matching dropdown of the
# filter form (see the *_dict lookups built further down).
PARAM_GPS = 'ww_x_GPS'                # id pulled from the filter page's result links by extract_gps
PARAM_MAJ = 'ww_x_UNITE_ACAD'         # academic unit, e.g. 'Informatique'
PARAM_YEAR = 'ww_x_PERIODE_ACAD'      # academic period (year), e.g. '2015-2016'
PARAM_STATUS = 'ww_x_PERIODE_PEDAGO'  # pedagogical period, e.g. 'Master semestre 1'
PARAM_SEMESTER = 'ww_x_HIVERETE'      # semester type, e.g. "Semestre d'automne"

In [3]:
# Base URLs of the two ISA report endpoints; the PARAM_* values are appended
# as extra query parameters by requests.
# FILTER_BASE_URL: queried first, its response is scanned for the GPS id.
# GPS_BASE_URL: queried second with that GPS id, returns the HTML student table.
FILTER_BASE_URL = 'http://isa.epfl.ch/imoniteur_ISAP/!GEDPUBLICREPORTS.filter?ww_b_list=1&ww_i_reportmodel=133685247&ww_i_reportModelXsl=133685270'
GPS_BASE_URL = 'http://isa.epfl.ch/imoniteur_ISAP/!GEDPUBLICREPORTS.html?ww_i_reportModel=133685247&ww_i_reportModelXsl=133685270&ww_b_list=1'

# Create one DataFrame for all master records

In [4]:
# Download the report filter form and parse it; the <option> entries of its
# dropdowns are turned into label -> value lookups in the next cell.
# NOTE(review): the HTTP status is not checked here, so an error page would
# only surface later as a parsing failure.
r = requests.get("http://isa.epfl.ch/imoniteur_ISAP/!GEDPUBLICREPORTS.filter?ww_i_reportModel=133685247")
c = r.content
soup = BeautifulSoup(c, 'html.parser')

In [5]:
# create dict from dropdown menus
def create_dict(field_list):
    """Map each dropdown option's displayed label to its submitted value.

    Parameters
    ----------
    field_list : sequence of <option> tags
        Every element must expose a ``.string`` attribute (the label) and
        support ``element["value"]`` (the form value).

    Returns
    -------
    dict
        ``{label: value}`` for every option except the first one, which is
        always skipped (presumably the blank/placeholder entry of the
        dropdown -- TODO confirm against the page).
    """
    return {option.string: option["value"] for option in field_list[1:]}

# The "filtre" element holds one table row per dropdown, in this order:
# academic unit, academic period, pedagogical period, semester type.
filter_rows = soup.body.find(id="filtre").find_all('tr')

major_dict = create_dict(filter_rows[0].find_all('option'))
acad_yr_dict = create_dict(filter_rows[1].find_all('option'))
status_dict = create_dict(filter_rows[2].find_all('option'))
sem_dict = create_dict(filter_rows[3].find_all('option'))

In [6]:
# assuming we only get two results with the first being "Tous"
def is_valid_gps(gps):
    """Return True unless ``gps`` is the sentinel string "-1"."""
    sentinel = "-1"
    return not (gps == sentinel)

def extract_gps(content):
    """Return the first valid GPS id found in a filter-page response.

    Every ``<a class="ww_x_GPS">`` link is inspected in document order. The id
    is read from the link's ``onclick`` attribute: the text between the first
    pair of single quotes is taken, and the part after its first '=' is the
    candidate id.

    Parameters
    ----------
    content : str or bytes
        Raw HTML of the filter page response.

    Returns
    -------
    str or None
        The first candidate accepted by ``is_valid_gps``, or None when there
        is no such link.

    Raises
    ------
    KeyError, IndexError
        If a matching link has no ``onclick`` attribute or the attribute does
        not have the expected quoted ``key=value`` shape.
    """
    parsed = BeautifulSoup(content, 'html.parser')
    for link in parsed.find_all('a', class_='ww_x_GPS'):
        onclick = link.attrs['onclick']
        quoted_arg = onclick.split("'")[1]
        candidate = quoted_arg.split('=')[1]
        if not is_valid_gps(candidate):
            continue
        return candidate
    return None

In [7]:
# calculate the start date of a given academic year and semester type
def sem_start_date(academic_year, semester):
    """Return the 'YYYY-MM' start month of a semester.

    Parameters
    ----------
    academic_year : str
        Two years joined by a single dash, e.g. "2015-2016".
    semester : str
        Semester type label. "Semestre d'automne" selects September of the
        first year; any other value selects March of the second year.

    Returns
    -------
    str
        e.g. '2015-09' or '2016-03'.

    Raises
    ------
    ValueError
        If ``academic_year`` does not split into exactly two parts on '-'.
    """
    first_year, second_year = academic_year.split('-')
    is_autumn = semester == "Semestre d'automne"
    year, month = (first_year, '09') if is_autumn else (second_year, '03')
    return year + '-' + month
    
# Quick check: the autumn semester of 2015-2016 maps to September 2015.
sem_start_date("2015-2016", "Semestre d'automne")

'2015-09'

In [8]:
# Build one table per master status. For every (status, academic year,
# semester type) combination the ISA site is queried twice: once to find the
# GPS id of the matching report, once to download that report's student list.
# df_dict maps each status label to the accumulated rows for that status.
master_statuses = ['Master semestre 1', 'Master semestre 2', 'Master semestre 3']

df_dict = {}

# The academic unit is identical for every request, so look it up once.
informatique_id = major_dict['Informatique']

for status in master_statuses:
    for year in sorted(acad_yr_dict.keys()):
        for sem in sem_dict.keys():
            # obtain gps
            payload_filter = {PARAM_MAJ: informatique_id,
                              PARAM_YEAR: acad_yr_dict[year],
                              PARAM_STATUS: status_dict[status],
                              PARAM_SEMESTER: sem_dict[sem]}
            r_filt = requests.get(FILTER_BASE_URL, params=payload_filter)
            gps = extract_gps(r_filt.content)
            if gps is None:
                # no report exists for this combination
                continue

            # get list of students: same filters plus the GPS id
            payload_students = {PARAM_GPS: gps}
            payload_students.update(payload_filter)
            r_list = requests.get(GPS_BASE_URL, params=payload_students)

            table = pd.read_html(r_list.text, flavor='lxml', skiprows=1, header=0)[0]
            # Keep columns 0, 1 and 10 by position. `.ix` was removed in
            # pandas 1.0, so use `.iloc`; `.copy()` makes the column
            # assignment below act on an independent frame.
            table = table.iloc[:, [0, 1, 10]].copy()
            # Tag every row with the start month of this semester, stored in
            # a column named after the status.
            table[status] = sem_start_date(year, sem)
            if status in df_dict:
                # All columns are shared, so the outer merge accumulates rows.
                df_dict[status] = pd.merge(df_dict[status], table, how="outer")
            else:
                df_dict[status] = table


In [9]:
# Combine the semester 1 and semester 2 tables. With no explicit keys, pandas
# joins on the columns the two frames share; the inner join keeps only rows
# that match in both tables.
first_sem, second_sem = master_statuses[0], master_statuses[1]
ans = df_dict[first_sem].merge(df_dict[second_sem], how="inner")

# Replace the scraped headers with English names (positional, so this relies
# on the merged column order), then put SCIPER first.
ans.columns = ['Gender', 'Name', "SCIPER", "Master semestre 1", "Master semestre 2"]
ans = ans.loc[:, ['SCIPER', "Gender", "Name", "Master semestre 1", "Master semestre 2"]]
ans.head()

Unnamed: 0,SCIPER,Gender,Name,Master semestre 1,Master semestre 2
0,153066.0,Monsieur,Aeberhard François-Xavier,2007-09,2008-03
1,153066.0,Monsieur,Aeberhard François-Xavier,2007-09,2009-03
2,153066.0,Monsieur,Aeberhard François-Xavier,2007-09,2010-03
3,180027.0,Madame,Agarwal Megha,2007-09,2008-03
4,152232.0,Monsieur,Anagnostaras David,2007-09,2008-03
