In [13]:
import time
from pathlib import Path

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from tqdm.notebook import trange, tqdm


In [15]:
def init_webdriver(year, timeout=10000):
    """Open the course-search page for one academic year and load the results.

    Starts a Chrome session, opens the search page for the academic year
    ``year``-``year+1``, sets the ``uva-sgw-course-search-nl`` preference
    cookie (which carries ``p_fetch_size:10000``, presumably so that all
    courses are returned on a single page -- confirm against the site),
    reloads the page, clicks the element with class ``search-all`` and then
    blocks until an element with class ``result`` is present in the DOM.

    Parameters
    ----------
    year : int
        First calendar year of the academic year to open (``2018`` opens
        ``2018-2019``).
    timeout : int or float, optional
        Maximum number of seconds to wait for the first ``result`` element.
        Defaults to 10000, the value that was previously hard-coded.

    Returns
    -------
    selenium.webdriver.Chrome
        The live driver with the results page loaded. The caller owns it and
        is responsible for shutting it down.

    Raises
    ------
    Exception
        Any error raised by Selenium while loading the page (for example a
        missing ``search-all`` button or a wait timeout) is re-raised after
        the browser has been shut down.
    """
    driver = webdriver.Chrome()
    try:
        driver.get(f"https://studiegids.uva.nl/xmlpages/page/{year}-{year + 1}/zoek-vak")
        # The cookie can only be set once the browser is on the target domain,
        # hence the get() first and the refresh() afterwards to apply it.
        driver.add_cookie({'name': 'uva-sgw-course-search-nl', 'value': 'p_institute:,p_programme:,p_credits:,p_instr_lang:,p_open_course:,p_period_start:,p_fetch_size:10000,p_course_year:,p_page:1,p_search_inside:,p_searchwords:'})
        driver.refresh()

        print('driver initialized')
        # Click the button that shows all courses.
        show_all = driver.find_element(By.CLASS_NAME, 'search-all')
        show_all.click()
        print('show all clicked, waiting for courses to load')

        # Block until at least one result element has been rendered.
        WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located((By.CLASS_NAME, 'result'))
        )
    except BaseException:
        # Do not leak the browser/chromedriver process when setup fails or is
        # interrupted; the caller never receives a handle it could clean up.
        driver.quit()
        raise

    print('courses loaded')
    return driver

In [16]:
def get_course_ids(soup):
    """Collect the ``data-id`` value of every course row in a parsed page.

    Looks up all ``<tr class="slideout">`` rows in ``soup`` and, for each
    row, takes the first descendant element that has a ``data-id``
    attribute.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed HTML of the results page (any object exposing a compatible
        ``find_all`` works).

    Returns
    -------
    list
        The ``data-id`` attribute values in document order. Rows that
        contain no element with a ``data-id`` attribute are skipped, so the
        list may be shorter than the number of rows; it is empty when no
        rows match.
    """
    print('getting course id strings')
    rows = soup.find_all('tr', class_='slideout')

    print('extracting course ids')
    course_ids = []
    for row in rows:
        id_holder = row.find(attrs={"data-id": True})
        # find() returns None when nothing inside the row carries data-id;
        # skip such rows instead of failing the whole year with a TypeError.
        if id_holder is not None:
            course_ids.append(id_holder['data-id'])

    print('done')
    return course_ids

In [17]:
def get_year(driver):
    """Extract the course ids from the page currently shown by ``driver``.

    Parses ``driver.page_source`` with BeautifulSoup's ``lxml`` parser,
    passes the parsed page to ``get_course_ids`` and prints how many ids
    were found.

    Parameters
    ----------
    driver : selenium webdriver
        A driver whose current page is the loaded results page.

    Returns
    -------
    list
        A new list holding the ids returned by ``get_course_ids``.
    """
    print('getting soup')
    # Snapshot the HTML as currently rendered in the browser.
    page_html = driver.page_source
    parsed_page = BeautifulSoup(page_html, 'lxml')
    print('got soup')

    ids_on_page = get_course_ids(parsed_page)
    print(len(ids_on_page), 'courses scraped')

    # Hand back a fresh list rather than the helper's own object.
    return [*ids_on_page]

In [18]:
# Scrape the course ids of every academic year from 2018-2019 through
# 2023-2024 and write one CSV per year into data/course_id/.
output_dir = Path('data/course_id')
# Create the target directory up front so a long scrape is not lost to a
# missing-directory error when the CSV is written.
output_dir.mkdir(parents=True, exist_ok=True)

for year in range(2018, 2024):
    print('Starting:',str(year)+'-'+str(year+1))
    driver = init_webdriver(year)
    try:
        YEAR_COURSE_IDS = get_year(driver)
    finally:
        # quit() ends the whole session (browser and chromedriver process);
        # close() would only close the current window. The finally clause
        # releases the browser even when scraping this year fails.
        driver.quit()
    df = pd.DataFrame(YEAR_COURSE_IDS)
    df.to_csv(output_dir / f'course_ids_{year}_{year+1}.csv', index=False)

Starting: 2018-2019


driver initialized
show all clicked, waiting for courses to load
courses loaded
getting soup
got soup
getting course id strings
extracting course ids
done
4660 courses scraped
Starting: 2019-2020
driver initialized
show all clicked, waiting for courses to load
courses loaded
getting soup
got soup
getting course id strings
extracting course ids
done
4723 courses scraped
Starting: 2020-2021
driver initialized
show all clicked, waiting for courses to load
courses loaded
getting soup
got soup
getting course id strings
extracting course ids
done
4678 courses scraped
Starting: 2021-2022
driver initialized
show all clicked, waiting for courses to load
courses loaded
getting soup
got soup
getting course id strings
extracting course ids
done
4920 courses scraped
Starting: 2022-2023
driver initialized
show all clicked, waiting for courses to load
courses loaded
getting soup
got soup
getting course id strings
extracting course ids
done
4962 courses scraped
Starting: 2023-2024
driver initialized
s