In [1]:
import os
import time

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from tqdm.notebook import trange, tqdm


In [2]:
#Test-mode switch read by get_year(): True scrapes only the first 3 result pages
#of each year (quick smoke test), False scrapes every page.
testing = True

In [3]:
def get_course_ids(soup):
    """Collect the course IDs listed on one parsed result page.

    Every ``<tr class="slideout">`` row is searched for its first descendant
    carrying a ``data-id`` attribute; the value of that attribute is the ID.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed HTML of a result page.

    Returns
    -------
    list
        The ``data-id`` values in document order. Rows without any
        ``data-id`` descendant are skipped, so the list is empty when the
        page contains no usable rows.
    """
    course_ids = []
    for row in soup.find_all('tr', class_='slideout'):
        tagged = row.find(attrs={"data-id": True})
        #find() returns None when the row has no data-id element; indexing None
        #would abort the whole scrape with an opaque TypeError, so skip the row
        if tagged is not None:
            course_ids.append(tagged['data-id'])
    return course_ids

In [4]:
def init_webdriver(year):
    """Open Chrome on the UvA course search for one academic year and list all courses.

    Loads ``.../page/{year}-{year+1}/zoek-vak``, clicks the ``search-all``
    button and waits (up to 10 seconds each) until an element with class
    ``result`` is present.

    Parameters
    ----------
    year : int
        Starting calendar year of the academic year (2014 means 2014-2015).

    Returns
    -------
    selenium.webdriver.Chrome
        The live driver, positioned on the first result page. The caller owns
        it and is responsible for shutting it down.

    Raises
    ------
    Exception
        Whatever Selenium raises during setup (e.g. a timeout). The browser is
        quit before the exception propagates, so a failed setup leaks nothing.
    """
    driver = webdriver.Chrome()
    try:
        driver.get(f"https://studiegids.uva.nl/xmlpages/page/{year}-{year + 1}/zoek-vak")

        wait = WebDriverWait(driver, 10)

        #click on the button to show all courses; wait for it instead of assuming
        #it is already interactive right after the page load
        wait.until(EC.element_to_be_clickable((By.CLASS_NAME, 'search-all'))).click()

        #wait for the courses to load
        wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'result')))
    except BaseException:
        #the caller never receives the driver on failure (including a notebook
        #interrupt), so this is the only place that can shut the browser down
        driver.quit()
        raise

    return driver

In [5]:
def get_pages(driver, page_size=20):
    """Return how many result pages the search currently shown in `driver` spans.

    The course count is read as the first whitespace-separated token of the
    text of the ``<div class="compare-bar clearfix">`` element.
    NOTE(review): this presumes the bar text starts with the total number of
    courses - confirm against the live page.

    Parameters
    ----------
    driver : selenium WebDriver
        Driver whose ``page_source`` holds the result list.
    page_size : int, optional
        Courses shown per result page. Defaults to 20, the value that was
        previously hard-coded here.

    Returns
    -------
    int
        ``ceil(num_courses / page_size)``; 0 when there are no courses.

    Raises
    ------
    ValueError
        If the bar is missing or empty, or its first token is not an integer.
    """
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    compare_bar = soup.find('div', class_='compare-bar clearfix')
    if compare_bar is None:
        raise ValueError("course count not found: no 'compare-bar clearfix' div on the page")

    tokens = compare_bar.text.split()
    if not tokens:
        raise ValueError("course count not found: 'compare-bar clearfix' div is empty")
    num_courses = int(tokens[0])

    #ceiling division: `n // page_size + 1` reported one page too many whenever
    #the course count was an exact multiple of the page size
    return (num_courses + page_size - 1) // page_size

In [6]:
def _read_new_course_ids(driver, seen_ids, max_retries=6, retry_delay=5):
    """Parse the current page, retrying until it shows course IDs not yet in `seen_ids`.

    After clicking 'next' the browser may still show the previous page, so the
    page is re-parsed every `retry_delay` seconds, at most `max_retries` extra
    times. Returns the page's course IDs, or an empty list when the page stays
    empty or stale for all attempts.
    """
    for attempt in range(max_retries + 1):
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        course_ids = get_course_ids(soup)
        #a fresh page is recognised by its first course id not having been collected yet
        if course_ids and course_ids[0] not in seen_ids:
            return course_ids
        if attempt < max_retries:
            time.sleep(retry_delay)
    return []


def get_year(driver):
    """Scrape the course IDs of every result page reachable from `driver`.

    Walks the paginated result list page by page, clicking the element with
    class ``next`` between pages. When the module-level `testing` flag is
    true, at most the first 3 pages are scraped.

    Parameters
    ----------
    driver : selenium WebDriver
        Driver positioned on the first result page (see `init_webdriver`).

    Returns
    -------
    list
        Course IDs in the order they were encountered. Scraping stops early,
        keeping everything collected so far, when a page never shows new
        courses or the 'next' button does not become clickable.
    """
    #number of pages
    num_pages = get_pages(driver)
    print('Pages to scrape: ', num_pages)

    #test mode is capped at the real page count so short result lists still work
    pages_to_scrape = min(3, num_pages) if testing else num_pages

    #scraping
    year_course_ids = []
    seen_ids = set()  #set for O(1) "already collected" checks
    for page_index in trange(pages_to_scrape):

        course_ids = _read_new_course_ids(driver, seen_ids)
        if not course_ids:
            print('No new courses on page', page_index + 1, '- stopping early')
            break

        #add to all course ids list
        year_course_ids.extend(course_ids)
        seen_ids.update(course_ids)

        #the last page is scraped like any other, but there is nothing to navigate to afterwards
        if page_index == pages_to_scrape - 1:
            break

        #next page, wait for element to be clickable
        try:
            WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CLASS_NAME, 'next'))).click()
        except TimeoutException:
            #no usable 'next' button: treat this as the end of the list instead of losing the year
            print('No clickable next button after page', page_index + 1, '- stopping early')
            break

        #give the next page time to start loading before it is parsed
        time.sleep(3)

    return year_course_ids

In [8]:
#UvA's course catalogue is available from 2014-2015 onwards; scrape each academic
#year in turn and write its course IDs to one CSV per year.
#NOTE: the output file name does not depend on `testing`, so a test run
#overwrites the CSVs of a previous full run.
output_dir = 'data/course_id'
#create the target folder up front so a finished scrape is not lost to a missing directory
os.makedirs(output_dir, exist_ok=True)

for year in range(2014, 2024):
    print('Scraping course IDs for year:',str(year)+'-'+str(year+1))
    driver = init_webdriver(year)
    try:
        YEAR_COURSE_IDS = get_year(driver)
    finally:
        #quit() ends the whole browser session (close() only closes the window),
        #and the finally makes sure that happens even when scraping fails
        driver.quit()
    df = pd.DataFrame(YEAR_COURSE_IDS)
    df.to_csv(f'{output_dir}/course_ids_{year}_{year + 1}.csv', index=False)

Scraping course IDs for year: 2014-2015
Pages to scrape:  205


  0%|          | 0/3 [00:00<?, ?it/s]

Scraping course IDs for year: 2015-2016
Pages to scrape:  211


  0%|          | 0/3 [00:00<?, ?it/s]

Scraping course IDs for year: 2016-2017
Pages to scrape:  216


  0%|          | 0/3 [00:00<?, ?it/s]

Scraping course IDs for year: 2017-2018
Pages to scrape:  221


  0%|          | 0/3 [00:00<?, ?it/s]

Scraping course IDs for year: 2018-2019
Pages to scrape:  234


  0%|          | 0/3 [00:00<?, ?it/s]

Scraping course IDs for year: 2019-2020
Pages to scrape:  237


  0%|          | 0/3 [00:00<?, ?it/s]

Scraping course IDs for year: 2020-2021
Pages to scrape:  234


  0%|          | 0/3 [00:00<?, ?it/s]

Scraping course IDs for year: 2021-2022
Pages to scrape:  247


  0%|          | 0/3 [00:00<?, ?it/s]

Scraping course IDs for year: 2022-2023
Pages to scrape:  249


  0%|          | 0/3 [00:00<?, ?it/s]

Scraping course IDs for year: 2023-2024
Pages to scrape:  251


  0%|          | 0/3 [00:00<?, ?it/s]