In [1]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm

In [2]:
# Set to True to scrape only the first 10 course pages (quick test run);
# False scrapes every page.
testing = False

In [3]:
def build_url_list(course_ids_df, year):
    """Build the course-page URL for every course id of one academic year.

    Parameters
    ----------
    course_ids_df : pandas.DataFrame
        Frame whose column ``'0'`` holds the course ids.
    year : int
        Start year of the academic year; the URL uses ``<year>-<year+1>``.

    Returns
    -------
    list of str
        One URL per row of ``course_ids_df``, in row order.
    """
    academic_year = f'{year}-{year + 1}'
    base_url = f'https://studiegids.uva.nl/xmlpages/page/{academic_year}/zoek-vak/vak/'

    page_urls = []
    for raw_id in course_ids_df['0'].tolist():
        page_urls.append(base_url + str(raw_id))
    return page_urls


In [4]:
def _find_value_cell(soup, labels):
    """Return the first ``td`` after the element whose string is in ``labels``, else ``None``."""
    label = soup.find(True, string=labels)
    if label is None:
        return None
    return label.find_next('td')


def _cell_text(cell):
    """Return the stripped text of ``cell``, or ``None`` when the cell is missing."""
    return cell.text.strip() if cell is not None else None


def _cell_links(cell):
    """Return every ``a`` element inside ``cell``; an empty list when the cell is missing."""
    return cell.find_all('a') if cell is not None else []


def get_data(soup, year, url):
    """Extract the details of one course page into a flat dictionary.

    The metadata rows are located by their label, which may be in English or
    in Dutch. Missing parts of the page never raise: scalar fields become
    ``None`` and list fields become empty lists, so pages without the
    ``item-info`` block (for example error pages) still produce a row.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed HTML of a single course page.
    year : int
        Stored unchanged under the ``'year'`` key.
    url : str
        Stored unchanged under the ``'url'`` key.

    Returns
    -------
    dict
        Keys ``url``, ``year``, ``course_title``, ``language``, ``college``,
        ``lecturers_urls``, ``lecturers``, ``programmes`` and ``text``.
    """
    #title: the first h1 inside the article container
    container = soup.find('article')
    heading = container.find('h1') if container is not None else None
    title = heading.text.strip() if heading is not None else None

    #single-valued rows
    language = _cell_text(_find_value_cell(soup, ['Language of instruction', 'Voertaal']))
    college = _cell_text(_find_value_cell(soup, ['College/graduate', 'Onderwijsinstituut']))

    #lecturer row: names and urls come from the same links so they stay aligned;
    #a link without an href contributes None instead of raising KeyError
    lecturer_links = _cell_links(_find_value_cell(soup, ['Lecturer(s)', 'Docent(en)']))
    lecturer_urls = [link.get('href') for link in lecturer_links]
    lecturer_names = [link.text.strip() for link in lecturer_links]

    #part of programme row
    programme_links = _cell_links(_find_value_cell(soup, ['Is part of', 'Onderdeel van']))
    part_of_programme_urls = [link.get('href') for link in programme_links]

    #the descriptive text is every sibling element that follows the item-info table
    item_info = soup.find('div', class_='item-info')
    if item_info is not None:
        text = [section.text.strip() for section in item_info.find_next_siblings()]
    else:
        text = []

    #build a dictionary with all the data to be added as a row to the dataframe
    course_dict = {
        'url': url,
        'year': year,
        'course_title': title,
        'language': language,
        'college': college,
        'lecturers_urls': lecturer_urls,
        'lecturers': lecturer_names,
        'programmes': part_of_programme_urls,
        'text': text}

    return course_dict


In [7]:
import concurrent.futures

# FILEPATH: /c:/Users/Sahir/Documents/work with alex/UvA/retrieve_course_details.ipynb
# For each academic year: read the course ids, download every course page in a
# thread pool, parse each page with get_data and write one CSV per year.
# NOTE(review): requests.get is called without a timeout, so a stalled
# connection can block a worker indefinitely - consider passing timeout=...
for year in range(2023, 2024):
    academic_year = f'{year}_{year + 1}'
    course_id_file = f'data/course_id/course_ids_{academic_year}.csv'
    df_ids = pd.read_csv(course_id_file)
    urls = build_url_list(df_ids, year)
    print(f'Collecting data for year {year}-{year + 1}')

    YEAR_COURSE_DETAILS = []
    with concurrent.futures.ThreadPoolExecutor() as executor:
        # Remember which URL each request belongs to: as_completed yields
        # futures in completion order, not submission order, so the URL must
        # be looked up per future rather than taken from the submit loop.
        future_to_url = {
            executor.submit(requests.get, url): url
            for url in (urls[:10] if testing else urls)
        }

        # total= lets tqdm show real progress; as_completed is a generator
        # with no length of its own.
        completed = concurrent.futures.as_completed(future_to_url)
        for future in tqdm(completed, total=len(future_to_url)):
            response = future.result()
            soup = BeautifulSoup(response.content, 'lxml')
            data = get_data(soup, year, future_to_url[future])
            YEAR_COURSE_DETAILS.append(data)

    df = pd.DataFrame(YEAR_COURSE_DETAILS)
    df.to_csv(f'data/course_details/course_details_{academic_year}.csv', index=False)

Collecting data for year 2023-2024


0it [00:00, ?it/s]