In [1]:
from bs4 import BeautifulSoup
import requests

**Assumption:** no student starts Bachelor semester 1 (B1) or Master semester 1 (M1) in the spring semester. **TODO:** find the EPFL documentation that confirms this.

In [2]:
# parameter keys
# Query-string parameter names; they are used as the keys of the `params`
# dicts passed to requests.get() in the scraping cells below.
PARAM_GPS = 'ww_x_GPS'  # filled with the value returned by extract_gps()
PARAM_MAJ = 'ww_x_UNITE_ACAD'  # filled from major_dict (e.g. 'Informatique')
PARAM_YEAR = 'ww_x_PERIODE_ACAD'  # filled from acad_yr_dict
PARAM_STATUS = 'ww_x_PERIODE_PEDAGO'  # filled from status_dict (e.g. 'Bachelor semestre 1')
PARAM_SEMESTER = 'ww_x_HIVERETE'  # filled from sem_dict (e.g. "Semestre d'automne")

In [3]:
# base URLs
# FILTER_BASE_URL is requested with the filter parameters only; its response
# is handed to extract_gps() to obtain the GPS value.
FILTER_BASE_URL = 'http://isa.epfl.ch/imoniteur_ISAP/!GEDPUBLICREPORTS.filter?ww_b_list=1&ww_i_reportmodel=133685247&ww_i_reportModelXsl=133685270'
# GPS_BASE_URL is requested with the same filter parameters plus PARAM_GPS;
# the <tr> rows of its response are read as the student list.
GPS_BASE_URL = 'http://isa.epfl.ch/imoniteur_ISAP/!GEDPUBLICREPORTS.html?ww_i_reportModel=133685247&ww_i_reportModelXsl=133685270&ww_b_list=1'

In [4]:
# Download the report filter form; its drop-down menus are parsed further
# down to build the option dictionaries used as request parameters.
r = requests.get("http://isa.epfl.ch/imoniteur_ISAP/!GEDPUBLICREPORTS.filter?ww_i_reportModel=133685247")
# Fail here on an HTTP error instead of parsing an error page and hitting an
# unrelated AttributeError when the form is looked up later.
r.raise_for_status()
c = r.content
soup = BeautifulSoup(c, 'html.parser')

In [5]:
# build a lookup table from the options of one dropdown menu
def create_dict(field_list):
    """Map each option's displayed text to its ``value`` attribute.

    The first entry of ``field_list`` is always skipped.

    Args:
        field_list: list of elements exposing ``.string`` and ``["value"]``.

    Returns:
        dict of ``{option.string: option["value"]}`` for every entry but the
        first; a later duplicate label overwrites an earlier one.
    """
    return {option.string: option["value"] for option in field_list[1:]}

# The first four rows of the "filtre" form hold, in order, the drop-downs for:
# academic unit, academic period, pedagogic period, semester type.
filter_rows = soup.body.find(id="filtre").find_all('tr')
major_dict = create_dict(filter_rows[0].find_all('option'))
acad_yr_dict = create_dict(filter_rows[1].find_all('option'))
status_dict = create_dict(filter_rows[2].find_all('option'))
sem_dict = create_dict(filter_rows[3].find_all('option'))

In [6]:
# The result list is assumed to contain only two links, the first being "Tous".
def is_valid_gps(gps):
    """Return True unless ``gps`` equals the string ``"-1"``.

    ``"-1"`` is presumably the value carried by the "Tous" link (see the
    assumption above); any other value is accepted.
    """
    rejected_value = "-1"
    return gps != rejected_value

def extract_gps(content):
    """Return the first valid GPS value found in a filter-result page.

    Every ``<a class="ww_x_GPS">`` link is inspected in document order. The
    GPS value is read from its ``onclick`` attribute: the text between the
    first pair of single quotes, then the part after the first ``=``.

    Args:
        content: HTML of the page, as accepted by BeautifulSoup.

    Returns:
        The first value accepted by ``is_valid_gps``, or ``None`` when no
        link qualifies.
    """
    page = BeautifulSoup(content, 'html.parser')
    for link in page.find_all('a', class_='ww_x_GPS'):
        onclick = link.attrs['onclick']
        quoted_argument = onclick.split("'")[1]
        candidate = quoted_argument.split('=')[1]
        if not is_valid_gps(candidate):
            continue
        return candidate
    return None

In [7]:
# create dict of B1 students, assuming only start in Autumn
# b1_students maps SCIPER -> [column 0, column 7, academic year]; these are
# later labelled Gender, Status and Start_Year.
b1_students = {}
for year in sorted(acad_yr_dict.keys()):
    # filter shared by both requests: major, academic year, level, semester
    payload_filter = {PARAM_MAJ: major_dict['Informatique'],
                      PARAM_YEAR: acad_yr_dict[year],
                      PARAM_STATUS: status_dict['Bachelor semestre 1'],
                      PARAM_SEMESTER: sem_dict["Semestre d'automne"]}
    # obtain gps
    r_filt = requests.get(FILTER_BASE_URL, params=payload_filter)
    gps = extract_gps(r_filt.content)
    if gps is None:
        # No student list for this year. requests omits None-valued params,
        # so the request below would silently be sent without any GPS.
        continue
    # get list of students: same filter plus the GPS found above
    payload_students = {PARAM_GPS: gps}
    payload_students.update(payload_filter)
    r_list = requests.get(GPS_BASE_URL, params=payload_students)
    # parse with beautiful soup
    soup_students = BeautifulSoup(r_list.content, 'html.parser')
    # students are starting after two rows
    rows = soup_students.find_all('tr')
    for row in rows[2:]:
        test_student = row.find_all('td')
        sciper = test_student[10].string
        # keep earliest year in case a student repeated first semester of Bachelor year
        # (years are visited in sorted order, so the first record seen wins)
        if sciper not in b1_students:
            b1_students[sciper] = [test_student[0].string, test_student[7].string, year]

In [8]:
len(b1_students)

1323

In [9]:
# create dict of B6 students
# b6_students maps SCIPER -> [column 0, column 7, academic year, semester];
# these are later labelled Gender, Status, End_Year and End_Sem.
b6_students = {}
for year in sorted(acad_yr_dict.keys()):
    for sem in sem_dict:
        # filter shared by both requests: major, academic year, level, semester
        payload_filter = {PARAM_MAJ: major_dict['Informatique'],
                          PARAM_YEAR: acad_yr_dict[year],
                          PARAM_STATUS: status_dict['Bachelor semestre 6'],
                          PARAM_SEMESTER: sem_dict[sem]}
        # obtain gps
        r_filt = requests.get(FILTER_BASE_URL, params=payload_filter)
        gps = extract_gps(r_filt.content)
        if gps is None:
            # No student list for this (year, semester) pair. requests omits
            # None-valued params, so the request below would silently be sent
            # without any GPS.
            continue
        # get list of students: same filter plus the GPS found above
        payload_students = {PARAM_GPS: gps}
        payload_students.update(payload_filter)
        r_list = requests.get(GPS_BASE_URL, params=payload_students)
        # parse with beautiful soup
        soup_students = BeautifulSoup(r_list.content, 'html.parser')
        # students are starting after two rows
        rows = soup_students.find_all('tr')
        for row in rows[2:]:
            test_student = row.find_all('td')
            sciper = test_student[10].string
            # replace with latest year in case 6th semester was repeated
            # (years are visited in sorted order, so the last record seen wins)
            b6_students[sciper] = [test_student[0].string, test_student[7].string, year, sem]

In [10]:
len(b6_students)

516

In [11]:
import pandas as pd

In [12]:
# One row per B1 student, indexed by SCIPER.
df_b1 = pd.DataFrame.from_dict(b1_students, orient='index')
df_b1.columns = ['Gender', 'Status', 'Start_Year']
# Keep only students marked 'Présent' (removes students on leave/holiday);
# the Status column is not needed afterwards.
df_b1 = df_b1.loc[df_b1['Status'] == 'Présent'].drop('Status', axis=1)
df_b1.index.name = 'SCIPER'
print(df_b1.shape)
df_b1.head()

(1322, 2)


Unnamed: 0_level_0,Gender,Start_Year
SCIPER,Unnamed: 1_level_1,Unnamed: 2_level_1
237686,Monsieur,2013-2014
181298,Monsieur,2007-2008
202946,Monsieur,2010-2011
253656,Monsieur,2015-2016
259268,Monsieur,2016-2017


In [13]:
# One row per B6 student, indexed by SCIPER.
df_b6 = pd.DataFrame.from_dict(b6_students, orient='index')
df_b6.columns = ['Gender', 'Status', 'End_Year', 'End_Sem']
# Keep only students marked 'Présent' (removes students on leave/holiday).
df_b6 = df_b6.loc[df_b6['Status'] == 'Présent']
# Status: no longer needed once filtered.
# Gender: could already be left out before this point.
# End_Sem: no student had 6th semester in autumn.
df_b6 = df_b6.drop(['Status', 'Gender', 'End_Sem'], axis=1)
df_b6.index.name = 'SCIPER'
print(df_b6.shape)
df_b6.head()

(390, 1)


Unnamed: 0_level_0,End_Year
SCIPER,Unnamed: 1_level_1
228408,2015-2016
217500,2013-2014
203712,2012-2013
169795,2010-2011
217439,2013-2014


In [14]:
# SCIPERs that appear in both the B1 and the B6 tables.
common_scipers = df_b1.index.intersection(df_b6.index)
# NOTE(review): 7 is presumably the number of cohorts covered by both tables - confirm.
len(common_scipers) / 7  # around 40 students per year

40.857142857142854

In [15]:
# Inner join on the SCIPER index: keeps only the students present in both
# df_b1 and df_b6, with the columns of both tables side by side.
grads = df_b1.join(df_b6, how='inner')
grads.head()

Unnamed: 0_level_0,Gender,Start_Year,End_Year
SCIPER,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
228408,Monsieur,2012-2013,2015-2016
217500,Monsieur,2011-2012,2013-2014
203712,Monsieur,2010-2011,2012-2013
169795,Monsieur,2007-2008,2010-2011
217439,Monsieur,2011-2012,2013-2014
