In [1]:
import os

import requests
from bs4 import BeautifulSoup
import pandas as pd
from tqdm.notebook import tqdm

# Read the course_ID column from disk and keep every ID as a string
course_ids = [
    str(course_id)
    for course_id in pd.read_csv('data/course_IDs.csv')['course_ID'].tolist()
]


In [2]:
# Switch for trial runs; it is not referenced by any of the cells visible here.
TESTING = False

import random
random.seed(123)
# Fixed-seed sample of course IDs for quick trial runs. The sample size is
# capped at the number of available IDs because random.sample raises
# ValueError when asked for more items than the population contains.
course_ids_test = random.sample(course_ids, min(200, len(course_ids)))


In [3]:
def _stripped_text(element):
    """Return the stripped text of ``element``, or the string 'None' if it is missing."""
    if element is None:
        return 'None'
    return element.text.strip()


def _value_after_term(soup, label):
    """Return the element that follows the <dt> whose string matches ``label``, or None."""
    term = soup.find('dt', string=label)
    if term is None:
        return None
    return term.find_next_sibling()


def _hrefs(container):
    """Return the href of every <a> inside ``container``.

    Gives the string 'None' when the container is missing or when one of the
    anchors has no href attribute.
    """
    if container is None:
        return 'None'
    try:
        return [anchor['href'] for anchor in container.find_all('a')]
    except KeyError:
        return 'None'


def get_details(soup, url):
    """Extract the fields of one course page into a dictionary.

    Every field that cannot be located on the page is stored as the string
    'None' instead of raising, so a single incomplete or unexpected page
    (for example an error page without a title) does not abort a long scrape.

    Args:
        soup: Parsed page; only ``find`` is called on it directly.
        url: Address the page was downloaded from, stored unchanged.

    Returns:
        dict with the keys ``url``, ``year``, ``course_title``, ``language``,
        ``college``, ``lecturers_urls``, ``lecturers``, ``programmes`` and
        ``text``. ``lecturers`` is the teacher text split on newlines;
        ``lecturers_urls`` and ``programmes`` are lists of href values
        (``programmes`` may be an empty list, ``lecturers_urls`` becomes
        'None' when no link is present).
    """
    year = _stripped_text(soup.find('div', class_='edition'))
    course_title = _stripped_text(soup.find('h1'))

    language = _stripped_text(_value_after_term(soup, 'Language'))
    college = _stripped_text(_value_after_term(soup, 'Faculty'))

    # The label is singular or plural depending on the page, so match either.
    teachers = _value_after_term(soup, ['Teacher', 'Teachers'])
    if teachers is None:
        lecturers = 'None'
    else:
        lecturers = teachers.text.strip().split('\n')
    # An empty link list is reported as 'None', like a missing teacher entry.
    lecturers_urls = _hrefs(teachers) or 'None'

    # Programme links live in the <ul> that follows the "Part of" heading.
    part_of = soup.find('h2', string='Part of')
    programmes = _hrefs(None if part_of is None else part_of.find_next_sibling('ul'))

    wrapper = soup.find('div', class_='wrapper main clearfix')
    article = None if wrapper is None else wrapper.find('article', class_='content')
    text = _stripped_text(article)

    #create dictionary from course details
    return {
        'url': url,
        'year': year,
        'course_title': course_title,
        'language': language,
        'college': college,
        'lecturers_urls': lecturers_urls,
        'lecturers': lecturers,
        'programmes': programmes,
        'text': text,
    }

In [4]:
# Base address; a course ID is appended to it to form the page URL.
url = 'https://studiegids.universiteitleiden.nl/en/courses/'
import concurrent.futures


# Split the course IDs into slices of equal size so results can be saved
# chunk by chunk. When the total is not an exact multiple of the chunk size a
# shorter remainder chunk is appended, so there can be num_chunks + 1 slices.
num_chunks = 100
# Floor division gives 0 when there are fewer IDs than chunks, and range()
# rejects a step of 0, so fall back to one ID per chunk in that case.
chunk_size = max(1, len(course_ids) // num_chunks)
course_ids_chunks = [course_ids[i:i+chunk_size] for i in range(0, len(course_ids), chunk_size)]

#check that all course IDs are in the chunks
assert sum(len(chunk) for chunk in course_ids_chunks) == len(course_ids)



# loop over the course IDs 

def process_course(url_new, timeout=30):
    """Download one course page and extract its details.

    The request is retried without limit, sleeping 5 seconds between
    attempts, until the server returns a response. Only request failures are
    retried; anything else (including a kernel interrupt) propagates so the
    loop can be stopped. The HTTP status code is not inspected: whatever body
    comes back is parsed.

    Args:
        url_new: Full URL of the course page.
        timeout: Seconds to wait for the server before an attempt counts as
            failed. Without a timeout a stalled connection would block forever.

    Returns:
        The dictionary produced by get_details for the downloaded page.
    """
    import time

    response = None
    while response is None:
        try:
            response = requests.get(url_new, timeout=timeout)
        except requests.exceptions.RequestException:
            print('Connection refused by the server, waiting 5 seconds')

            time.sleep(5)
            print('Trying again...')
    soup = BeautifulSoup(response.text, 'html.parser')
    return get_details(soup, url_new)


# Scrape every course, chunk by chunk, writing one CSV per chunk so that an
# interrupted run keeps the chunks that were already finished.
# Create the output directory up front; otherwise the first to_csv call fails
# only after a whole chunk has been downloaded.
os.makedirs('data/chunks', exist_ok=True)

details = []  # every scraped course, across all chunks
for i, chunk_ids in enumerate(tqdm(course_ids_chunks)):
    # Collect this chunk separately so its file holds only its own courses
    # rather than everything scraped so far.
    chunk_details = []
    for course_id in tqdm(chunk_ids, leave=False):
        url_new = url + course_id
        chunk_details.append(process_course(url_new))

    chunk_details_df = pd.DataFrame(chunk_details)
    chunk_details_df.to_csv(f'data/chunks/course_details_{i}.csv', index=False)
    details.extend(chunk_details)



  0%|          | 0/101 [00:00<?, ?it/s]

  0%|          | 0/578 [00:00<?, ?it/s]