In [5]:
import requests
import pandas as pd
from tqdm.notebook import tqdm

# Define headers
headers = {
    'Accept': 'application/json, text/plain, */*',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'en-US,en;q=0.9,az;q=0.8',
    'Authorization': 'undefined undefined',
    'Cache-Control': 'no-cache, no-store, must-revalidate, private',
    'Client_type': 'web',
    'Content-Type': 'application/json',
    'Dnt': '1',
    'Manifest': '23.46_B253_86778e16',
    'Namens_studentnummer': '',
    'Pragma': 'no-cache',
    'Referer': 'https://osiris-student.uu.nl/onderwijscatalogus/extern/cursus',
    'Release_version': '86778e16b9c920e6044e0458fb619ca45ff3300e',
    'Sec-Ch-Ua': '"Google Chrome";v="119", "Chromium";v="119", "Not?A_Brand";v="24"',
    'Sec-Ch-Ua-Mobile': '?0',
    'Sec-Ch-Ua-Platform': '"Windows"',
    'Sec-Fetch-Dest': 'empty',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Site': 'same-origin',
    'Taal': 'EN',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36'
}

# Read course IDs from the file
with open('course_IDs.txt', 'r') as file:
    course_ids = [line.strip() for line in file]

# Initialize df
df = pd.DataFrame(columns=['url', 'year', 'course_title', 'language', 'college', 'lecturers_urls', 'lecturers', 'programmes', 'text'])

# Define the number of chunks
num_chunks = 50
chunk_size = len(course_ids) // num_chunks

# Loop through course IDs with tqdm for progress bar
for i in tqdm(range(0, len(course_ids), chunk_size), desc="Processing Courses in Chunks"):
    chunk_ids = course_ids[i:i+chunk_size]
    chunk_number = i // chunk_size

    #check if file already exists, if so skip
    try:
        pd.read_csv(f'course_details_{chunk_number}.csv')
        continue
    except:
        pass

    for course_id in tqdm(chunk_ids, desc="Processing Chunk"):
        url = f"https://osiris-student.uu.nl/student/osiris/owc/cursussen/{course_id}"

        # Make request with headers
        response = requests.get(url, headers=headers)

        # make dict
        data = response.json(strict=False)

        # initialize variables
        year = None
        course_title = None
        language = None
        college = None
        lecturers_urls = None
        lecturers = None
        programmes = None
        text = None

        # Extract information from the response
        try:
            for r in data['items']:
                if r['rubriek'] == 'rubriek-kop':
                    year = r['velden'][1]['waarde']
                    course_title = r['velden'][3]['waarde']

                if r['rubriek'] == 'rubriek-praktische-info':
                    language = r['velden'][0]['waarde']['velden'][4]['waarde']

                    
                if r['rubriek'] == 'rubriek-zoek':
                    college = r['velden'][0]['waarde']

                if r['rubriek'] == 'rubriek-docenten':
                    lecturers = []
                    lecturers_c = r['velden'][0]['waarde']
                    for l in lecturers_c:
                        try:
                            if l['omschrijving'] == 'Lecturer':
                                d = l['velden']
                                for i in d:
                                    lecturers.append(i['docent'])
                        except:
                            pass

                if r['rubriek'] == 'rubriek-inhoud':
                    text = ""
                    for i in r['velden']:
                        text += i['titel']
                        text += "\n"
                        text += i['waarde']
                        text += "\n"
        except:
            print(f"Error with course ID: {course_id}")
            pass

        data_dict = {
            'url': url,
            'year': year,
            'course_title': course_title,
            'language': language,
            'college': college,
            'lecturers_urls': lecturers_urls,
            'lecturers': lecturers,
            'programmes': programmes,
            'text': text
        }

        # Add dict to df
        df = df._append(data_dict, ignore_index=True)

    # Save the dataframe to a CSV file after processing each chunk
    df.to_csv(f'course_details_{chunk_number}.csv', index=False)

    # Clear the dataframe for the next chunk
    df = pd.DataFrame(columns=['url', 'year', 'course_title', 'language', 'college', 'lecturers_urls', 'lecturers', 'programmes', 'text'])

df


Processing Courses in Chunks:   0%|          | 0/51 [00:00<?, ?it/s]

Processing Chunk:   0%|          | 0/721 [00:00<?, ?it/s]

Processing Chunk:   0%|          | 0/721 [00:00<?, ?it/s]

Processing Chunk:   0%|          | 0/721 [00:00<?, ?it/s]

Processing Chunk:   0%|          | 0/721 [00:00<?, ?it/s]

Processing Chunk:   0%|          | 0/721 [00:00<?, ?it/s]

Processing Chunk:   0%|          | 0/721 [00:00<?, ?it/s]

Processing Chunk:   0%|          | 0/721 [00:00<?, ?it/s]

Processing Chunk:   0%|          | 0/721 [00:00<?, ?it/s]

Processing Chunk:   0%|          | 0/721 [00:00<?, ?it/s]

Processing Chunk:   0%|          | 0/721 [00:00<?, ?it/s]

Processing Chunk:   0%|          | 0/721 [00:00<?, ?it/s]

Processing Chunk:   0%|          | 0/721 [00:00<?, ?it/s]

Processing Chunk:   0%|          | 0/721 [00:00<?, ?it/s]

Processing Chunk:   0%|          | 0/721 [00:00<?, ?it/s]

Processing Chunk:   0%|          | 0/721 [00:00<?, ?it/s]

Processing Chunk:   0%|          | 0/721 [00:00<?, ?it/s]

Processing Chunk:   0%|          | 0/721 [00:00<?, ?it/s]

Processing Chunk:   0%|          | 0/721 [00:00<?, ?it/s]

Processing Chunk:   0%|          | 0/721 [00:00<?, ?it/s]

Processing Chunk:   0%|          | 0/721 [00:00<?, ?it/s]

Error with course ID: 204979


Processing Chunk:   0%|          | 0/721 [00:00<?, ?it/s]

Processing Chunk:   0%|          | 0/721 [00:00<?, ?it/s]

Processing Chunk:   0%|          | 0/721 [00:00<?, ?it/s]

Processing Chunk:   0%|          | 0/721 [00:00<?, ?it/s]

Processing Chunk:   0%|          | 0/721 [00:00<?, ?it/s]

Processing Chunk:   0%|          | 0/721 [00:00<?, ?it/s]

Error with course ID: 211554


Processing Chunk:   0%|          | 0/721 [00:00<?, ?it/s]

Processing Chunk:   0%|          | 0/721 [00:00<?, ?it/s]

Processing Chunk:   0%|          | 0/721 [00:00<?, ?it/s]

Processing Chunk:   0%|          | 0/721 [00:00<?, ?it/s]

Processing Chunk:   0%|          | 0/721 [00:00<?, ?it/s]

Processing Chunk:   0%|          | 0/721 [00:00<?, ?it/s]

Error with course ID: 218012


Processing Chunk:   0%|          | 0/721 [00:00<?, ?it/s]

Processing Chunk:   0%|          | 0/721 [00:00<?, ?it/s]

Processing Chunk:   0%|          | 0/721 [00:00<?, ?it/s]

Processing Chunk:   0%|          | 0/721 [00:00<?, ?it/s]

Processing Chunk:   0%|          | 0/721 [00:00<?, ?it/s]

Processing Chunk:   0%|          | 0/721 [00:00<?, ?it/s]

Processing Chunk:   0%|          | 0/721 [00:00<?, ?it/s]

Processing Chunk:   0%|          | 0/721 [00:00<?, ?it/s]

Processing Chunk:   0%|          | 0/721 [00:00<?, ?it/s]

Processing Chunk:   0%|          | 0/721 [00:00<?, ?it/s]

Processing Chunk:   0%|          | 0/721 [00:00<?, ?it/s]

Processing Chunk:   0%|          | 0/721 [00:00<?, ?it/s]

Processing Chunk:   0%|          | 0/721 [00:00<?, ?it/s]

Error with course ID: 231721


Processing Chunk:   0%|          | 0/721 [00:00<?, ?it/s]

Processing Chunk:   0%|          | 0/721 [00:00<?, ?it/s]

Processing Chunk:   0%|          | 0/721 [00:00<?, ?it/s]

Processing Chunk:   0%|          | 0/721 [00:00<?, ?it/s]

Processing Chunk:   0%|          | 0/721 [00:00<?, ?it/s]

Processing Chunk:   0%|          | 0/18 [00:00<?, ?it/s]

Unnamed: 0,url,year,course_title,language,college,lecturers_urls,lecturers,programmes,text


In [2]:
import pandas as pd
df = pd.read_csv('course_details_0.csv')
df

Unnamed: 0,url,year,course_title,language,college,lecturers_urls,lecturers,programmes,text
0,https://osiris-student.uu.nl/student/osiris/ow...,2016,Researching Intersectionally II,MCRMV16054,Faculty of Humanities,,['dr. C.L. Quinan'],,Course goals\n<html>\n\t<head>\n<meta http-equ...
1,https://osiris-student.uu.nl/student/osiris/ow...,2016,"""Seeing Like a State"": The Invention of Statis...",GE3V14039,Faculty of Humanities,,['dr. R.A. Krol'],,Course goals\n<html>\n<head>\n<meta http-equiv...
2,https://osiris-student.uu.nl/student/osiris/ow...,2016,The Western Way of War: War and Identity in Eu...,GE3V14021,Faculty of Humanities,,['dr. R. Strootman'],,"Course goals\n<html>\n\t<head><style type=""tex..."
3,https://osiris-student.uu.nl/student/osiris/ow...,2016,'Heritage': Dynamics of Collections,UCHUMHAR32,University College Utrecht,,"['dr. G.J. Vroege', 'dr. M.R. Bouquet', 'dr. T...",,"Course goals\n<html>\n\t<head><style type=""tex..."
4,https://osiris-student.uu.nl/student/osiris/ow...,2016,(In)tangible Heritage and Identity,GKMVD16009,Faculty of Humanities,,['dr. G.F.J. Plets'],,Course goals\n<html>\n\t<head>\n<meta http-equ...
...,...,...,...,...,...,...,...,...,...
716,https://osiris-student.uu.nl/student/osiris/ow...,2016,Consultancy Project Sustainable Business and I...,GEO4-2605,Faculty of Geosciences,,['dr. ir. S. Witjes'],,"Course goals\n<html>\n\t<head><style type=""tex..."
717,https://osiris-student.uu.nl/student/osiris/ow...,2016,Consumer Law and Regulation,ECB3CLR,"Faculty of Law, Economics and Governance",,['dr. J.M. Milo'],,"Content\n<html>\n<head>\n<meta http-equiv=""con..."
718,https://osiris-student.uu.nl/student/osiris/ow...,2016,Contemporary Cultural Theory,MCRMV16024,Faculty of Humanities,,['dr. K. Thiele'],,Course goals\n<html>\n\t<head>\n<meta http-equ...
719,https://osiris-student.uu.nl/student/osiris/ow...,2016,Contemporary Economics in Historical Perspective,ECB2GED,"Faculty of Law, Economics and Governance",,['dr. ir. M.J. Boumans'],,"Content\n<html>\n\t<head>\n<meta http-equiv=""c..."
