In [None]:
import requests
from lxml import html
import pandas as pd
from urllib.parse import urljoin
from bs4 import BeautifulSoup
!pip install -q beautifulsoup4 > /dev/null 2>&1
from bs4 import BeautifulSoup

base_url = "https://campus.tum.de/tumonline"

# 1. Step: Get the List of Degree Programmes
Scraped from: [TUM Degree Programmes](https://campus.tum.de/tumonline/wbstpportfolio.wbStpList?pOrgNr=1&pSort=&pLanguageCode=DE&pStpStatus=N&pSjNr=1621) 

----> degree_programs.csv

In [120]:
# URL of the page to scrape
degree_programs_url = base_url + "/wbstpportfolio.wbStpList?pOrgNr=1&pSort=&pLanguageCode=DE&pStpStatus=N&pSjNr=1621"
# Fetch the page
degree_programs_raw = requests.get(degree_programs_url)
#degree_programs_raw.content

In [121]:
# Parse the HTML content
degree_programs_html = html.fromstring(degree_programs_raw.content)
# Show the HTML structure
# print(html.tostring(degree_programs_html, pretty_print=True).decode())

In [122]:
# Convert lxml element to string before passing to BeautifulSoup
degree_programs_soup = BeautifulSoup(html.tostring(degree_programs_html), "html.parser")

rows = degree_programs_soup.select("form > table > tbody > tr")[1:]  # Skip header row if present

def build_link(base_url, href):
    if not href or href.strip().lower().startswith("javascript"):
        return ""
    if not href.startswith("/"):
        href = "/" + href
    if base_url.endswith("/"):
        base_url = base_url[:-1]
    return base_url + href

data = []
for row in rows:
    tds = row.find_all("td")
    if len(tds) < 6:
        continue  # skip malformed rows
    degree = tds[0].get_text(strip=True)
    id_ = tds[1].get_text(strip=True)
    curriculum_div = tds[2].find("div")
    curriculum = curriculum_div.get_text(strip=True) if curriculum_div else ""
    field_of_studies = tds[3].get_text(strip=True)
    ects_credits = tds[4].get_text(strip=True)
    semester = tds[5].get_text(strip=True)
    # Links (if present)
    spans = curriculum_div.find_all("span") if curriculum_div else []
    curriculum_link = ""
    modulhandbuch_link = ""
    if len(spans) > 0 and spans[0].find("a"):
        href = spans[0].find("a")["href"]
        curriculum_link = build_link(base_url, href)
    if len(spans) > 2 and spans[2].find("a"):
        href = spans[2].find("a")["href"]
        modulhandbuch_link = build_link(base_url, href)
    data.append({
        "Degree": degree,
        "ID": id_,
        "Curriculum": curriculum,
        "Field of studies": field_of_studies,
        "ECTS Credits": ects_credits,
        "Semester": semester,
        "Curriculum Link": curriculum_link,
        "Modulhandbuch Link": modulhandbuch_link,
    })

degree_programs_df = pd.DataFrame(data)
#degree_programs_df.head()
## Display the Curriculum Links where ID 121
#degree_programs_df = degree_programs_df[degree_programs_df['ID'] == '121']
## Print the Degree, Curriculum Link, and Modulhandbuch Link for each row
#for index, row in degree_programs_df.iterrows():
#    print(f"Degree: {row['Degree']}")
#    print(f"Curriculum Link: {row['Curriculum Link']}")
#    print(f"Modulhandbuch Link: {row['Modulhandbuch Link']}")
#    print("-" * 40)


In [123]:
# Save the DataFrame to a CSV file
degree_programs_df.to_csv('degree_programs.csv', index=False, encoding='utf-8-sig')

# 2. Step: Get each Curriculum (Only M.Sc. Information Systems for now) #TODO!!!!!!
Scraped from: [M.Sc. Information Systems / Curriculum](https://campus.tum.de/tumonline/wbstpcs.showSpoTree?pStStudiumNr=&pSJNr=1621&pStpStpNr=4997&pStartSemester=)

In [None]:
# Filter the degree_programs_df on ID = '121'
degree_programs_df_filtered = degree_programs_df[degree_programs_df['ID'] == '121']

for idx, prog_row in degree_programs_df_filtered.iterrows():
    curriculum_link = prog_row['Curriculum Link']
    if curriculum_link:
        curriculum_raw = requests.get(curriculum_link)
        curriculum_html = html.fromstring(curriculum_raw.content)
        curriculum_soup = BeautifulSoup(html.tostring(curriculum_html), "html.parser")

        data = []
        rows = curriculum_soup.select("form > table > tbody > tr")[1:]  # Skip header row

        current_lvl1 = ""
        current_credits = ""
        current_wf = ""

        for row in rows:
            tds = row.find_all("td")
            if len(tds) < 5:
                continue

            # Check if this row is lvl 1 (has credits and WF)
            credits = tds[3].select_one("div > span")
            wf = tds[4].select_one("div > span")
            a_tags = tds[0].select("div > span > a > span")

            if credits or wf:
                # Update current lvl 1 context
                if len(a_tags) > 0:
                    current_lvl1 = a_tags[0].get_text(strip=True)
                current_credits = credits.get_text(strip=True) if credits else ""
                current_wf = wf.get_text(strip=True) if wf else ""
            else:
                # This is a lvl 2 row
                if len(a_tags) > 0:
                    lvl2 = a_tags[0].get_text(strip=True)
                    data.append({
                        "ID": prog_row['ID'],
                        "Node filter-Name (1)": current_lvl1,
                        "Node filter-Name (2)": lvl2,
                        "Credits": current_credits,
                        "WF": current_wf
                    })

        curriculum_df = pd.DataFrame(data)
        display(curriculum_df.head())
    else:
        print(f"No Curriculum Link for ID {prog_row['ID']}")

In [None]:
# Filter the degree_programs_df on ID = '121'
degree_programs_df_filtered = degree_programs_df[degree_programs_df['ID'] == '121']

# Iterate over the filtered DataFrame
for idx, prog_row in degree_programs_df_filtered.iterrows():
    curriculum_link = prog_row['Curriculum Link']
    if curriculum_link:
        # Fetch the curriculum page
        curriculum_raw = requests.get(curriculum_link)
        curriculum_html = html.fromstring(curriculum_raw.content)
        
        # Parse the HTML content with BeautifulSoup
        curriculum_soup = BeautifulSoup(html.tostring(curriculum_html), "html.parser")

        # Scrape the content of the page
        data = []
        rows = curriculum_soup.select("form > table > tbody > tr")[1:]  # Skip header row

        for row in rows:
            tds = row.find_all("td")
            if len(tds) < 5:
                continue  # skip malformed rows

            # Node filter-Name lvl 1
            lvl1 = ""
            a_tags = tds[0].select("div > span > a > span")
            if len(a_tags) > 0:
                lvl1 = a_tags[0].get_text(strip=True)

            # Node filter-Name (2) from the next row if it exists
            lvl2 = ""
            if i + 1 < len(rows):
                next_tds = rows[i + 1].find_all("td")
                if len(next_tds) > 0:
                    next_a_tags = next_tds[0].select("div > span > a > span")
                    if len(next_a_tags) > 0:
                        lvl2 = next_a_tags[0].get_text(strip=True)

            # Credits
            credits = tds[3].select_one("div > span")
            credits = credits.get_text(strip=True) if credits else ""

            # WF
            wf = tds[4].select_one("div > span")
            wf = wf.get_text(strip=True) if wf else ""

            data.append({
                "ID": prog_row['ID'],
                "Node filter-Name (1)": lvl1,
                "Node filter-Name (2)": lvl2,
                "Credits": credits,
                "WF": wf
            })

        curriculum_df = pd.DataFrame(data)
        display(curriculum_df.head())
    else:
        print(f"No Curriculum Link for ID {prog_row['ID']}")

Unnamed: 0,ID,Node filter-Name (1),Node filter-Name (2),Credits,WF
0,121,Master's Thesis,,30,1
1,121,Pflichtmodule Informatik,,8,1
2,121,Pflichtmodule Wirtschaftsinformatik,,13,1
3,121,Wahlmodule Entwicklungspraktikum,,10,1
4,121,Übergreifende Wahlmodule,,53,1


In [None]:
rows = curriculum_page_content_html.xpath('//table[@id="tgt"]/tbody/tr')
data = []

i = 0
while i < len(rows):
    row = rows[i]
    tds = row.xpath('./td')
    if len(tds) >= 5:
        # Outer row values
        knotenfilter = tds[0].xpath('.//span/span[@class="KnotenText noBorder kt kt1 TextToolTip " or contains(@class, "KnotenText")]/text()')
        knotenfilter = knotenfilter[0].strip() if knotenfilter else tds[0].text_content().strip()
        empf_sem = tds[2].text_content().strip()
        ects_cr = tds[3].text_content().strip()
        gf = tds[4].text_content().strip()
        # Check for sub-rows (Level 2)
        next_row = row.getnext()
        has_subrow = False
        if next_row is not None:
            sub_tds = next_row.xpath('./td')
            if len(sub_tds) == 1:
                # There may be multiple Level 2 entries in the subrow
                level2_spans = sub_tds[0].xpath('./div/span/a/span') # TODO: Something is wrong here
                level2_links = sub_tds[0].xpath('./div/span/a/img') # TODO: Something is wrong here
                for idx, level2_elem in enumerate(level2_spans):
                    level2 = level2_elem.text_content().strip()
                    link = level2_links[idx].get('src') if idx < len(level2_links) else ""
                    data.append({
                        "Knotenfilter-Bezeichnung": knotenfilter,
                        "Knotenfilter-Bezeichnung Level 2": "",
                        "empf. Sem.": empf_sem,
                        "ECTS Cr.": ects_cr,
                        "GF": gf,
                        "Link": ""
                    })
                has_subrow = True
        if not has_subrow:
            data.append({
                "Knotenfilter-Bezeichnung": knotenfilter,
                "Knotenfilter-Bezeichnung Level 2": "",
                "empf. Sem.": empf_sem,
                "ECTS Cr.": ects_cr,
                "GF": gf,
                "Link": ""
            })
        i += 2 if has_subrow else 1
    else:
        i += 1

df_knoten = pd.DataFrame(data)
display(df_knoten)

Unnamed: 0,Knotenfilter-Bezeichnung,Knotenfilter-Bezeichnung Level 2,empf. Sem.,ECTS Cr.,GF,Link
0,Master's Thesis,,,30,1,
1,Pflichtmodule Informatik,,,8,1,
2,Pflichtmodule Wirtschaftsinformatik,,,13,1,
3,Wahlmodule Entwicklungspraktikum,,,10,1,
4,Übergreifende Wahlmodule,,,53,1,
5,Wahlmodule Überfachliche Grundlagen,,,6,1,


In [None]:
# Find all rows in the table body
rows = curriculum_page_content_html.xpath('//table[@id="tgt"]/tbody/tr')
data = []

for row in rows:
    tds = row.xpath('./td')
    # Only process rows with at least 5 columns (the relevant data rows)
    if len(tds) >= 5:
        # Knotenfilter-Bezeichnung (first <td>)
        knotenfilter = tds[0].xpath('.//span/span[@class="KnotenText noBorder kt kt1 TextToolTip " or contains(@class, "KnotenText")]/text()')
        knotenfilter = knotenfilter[0].strip() if knotenfilter else tds[0].text_content().strip()
        # empf. Sem. (third <td>)
        empf_sem = tds[2].text_content().strip()
        # ECTS Cr. (fourth <td>)
        ects_cr = tds[3].text_content().strip()
        # GF (fifth <td>)
        gf = tds[4].text_content().strip()
        data.append({
            "Knotenfilter-Bezeichnung": knotenfilter,
            "empf. Sem.": empf_sem,
            "ECTS Cr.": ects_cr,
            "GF": gf
        })

df_knoten = pd.DataFrame(data)
display(df_knoten)

Unnamed: 0,Knotenfilter-Bezeichnung,empf. Sem.,ECTS Cr.,GF
0,[20221] Information Systems,,120,1
1,Master's Thesis,,30,1
2,Pflichtmodule Informatik,,8,1
3,Pflichtmodule Wirtschaftsinformatik,,13,1
4,Wahlmodule Entwicklungspraktikum,,10,1
5,Übergreifende Wahlmodule,,53,1
6,Wahlmodule Überfachliche Grundlagen,,6,1


In [None]:
# Save the DataFrame to a CSV file
df_knoten.to_csv('msc_information_systems_areas.csv', index=False, encoding='utf-8-sig')

In [None]:
# Filter the degree_programs_df on ID = '121'
degree_programs_df_filtered = degree_programs_df[degree_programs_df['ID'] == '121']

# Iterate over the filtered DataFrame
for idx, prog_row in degree_programs_df_filtered.iterrows():
    curriculum_link = prog_row['Curriculum Link']
    if curriculum_link:
        # Fetch the curriculum page
        curriculum_raw = requests.get(curriculum_link)
        curriculum_html = html.fromstring(curriculum_raw.content)
        
        # Parse the HTML content with BeautifulSoup
        curriculum_soup = BeautifulSoup(html.tostring(curriculum_html), "html.parser")

        # Scrape the content of the page
        data = []
        rows = curriculum_soup.select("form > table > tbody > tr")[1:]  # Skip header row

        for row in rows:
            tds = row.find_all("td")
            if len(tds) < 5:
                continue  # skip malformed rows

            # Node filter-Name lvl 1
            lvl1 = ""
            a_tags = tds[0].select("div > span > a > span")
            if len(a_tags) > 0:
                lvl1 = a_tags[0].get_text(strip=True)

            # Node filter-Name (2) from the next row if it exists
            lvl2 = ""
            if i + 1 < len(rows):
                next_tds = rows[i + 1].find_all("td")
                if len(next_tds) > 0:
                    next_a_tags = next_tds[0].select("div > span > a > span")
                    if len(next_a_tags) > 0:
                        lvl2 = next_a_tags[0].get_text(strip=True)

            # Credits
            credits = tds[3].select_one("div > span")
            credits = credits.get_text(strip=True) if credits else ""

            # WF
            wf = tds[4].select_one("div > span")
            wf = wf.get_text(strip=True) if wf else ""

            data.append({
                "ID": prog_row['ID'],
                "Node filter-Name (1)": lvl1,
                "Node filter-Name (2)": lvl2,
                "Credits": credits,
                "WF": wf
            })

        curriculum_df = pd.DataFrame(data)
        display(curriculum_df.head())
    else:
        print(f"No Curriculum Link for ID {prog_row['ID']}")

Unnamed: 0,ID,Node filter-Name (1),Node filter-Name (2),Credits,WF
0,121,Master's Thesis,,30,1
1,121,Pflichtmodule Informatik,,8,1
2,121,Pflichtmodule Wirtschaftsinformatik,,13,1
3,121,Wahlmodule Entwicklungspraktikum,,10,1
4,121,Übergreifende Wahlmodule,,53,1


# 3. Step: Get the Course Details
Scraped from Courses Description e.g.: [Master's Thesis](https://campus.tum.de/tumonline/ee/ui/ca2/app/desktop/#/slc.cm.reg/student/modules/detail/light/458129/study-year/1621?$scrollTo=toc_modhb_beschreibung)
Via https://api.srv.nat.tum.de/public
and GET https://api.srv.nat.tum.de/api/v1/course

----> courses.csv


In [None]:
# This code fetches a all courses from the TUM API, iterate over each and flattens its JSON structure into a DataFrame.

courses_url = "https://api.srv.nat.tum.de/api/v1/course/"
courses_raw = requests.get(courses_url)
courses_json = courses_raw.json()  # This should be a list of course dicts

def flatten_course(course_json):
    flat = {
        'course_code': course_json.get('course_code'),
        'course_name': course_json.get('course_name'),
        'course_name_en': course_json.get('course_name_en'),
        'course_id': course_json.get('course_id'),
        'course_name_list': course_json.get('course_name_list'),
        'course_name_list_en': course_json.get('course_name_list_en'),
        'hoursperweek': course_json.get('hoursperweek'),
        'semester_tag': course_json.get('semester', {}).get('semester_tag'),
        'semester_title': course_json.get('semester', {}).get('semester_title'),
        'semester_title_en': course_json.get('semester', {}).get('semester_title_en'),
        'semester_key': course_json.get('semester', {}).get('semester_key'),
        'semester_period_start': course_json.get('semester', {}).get('semester_period_start'),
        'semester_period_end': course_json.get('semester', {}).get('semester_period_end'),
        'lecture_period_start': course_json.get('semester', {}).get('lecture_period_start'),
        'lecture_period_end': course_json.get('semester', {}).get('lecture_period_end'),
        'modified_tumonline': course_json.get('modified_tumonline'),
        'activity_id': course_json.get('activity', {}).get('activity_id'),
        'activity_name': course_json.get('activity', {}).get('activity_name'),
        'activity_name_en': course_json.get('activity', {}).get('activity_name_en'),
        'org_id': course_json.get('org', {}).get('org_id'),
        'org_code': course_json.get('org', {}).get('org_code'),
        'org_name': course_json.get('org', {}).get('org_name'),
        'org_name_en': course_json.get('org', {}).get('org_name_en'),
        'org_url': course_json.get('org', {}).get('org_url'),
        'org_type': course_json.get('org', {}).get('org_type'),
        'school_org_id': course_json.get('org', {}).get('school', {}).get('org_id') if course_json.get('org', {}).get('school') else None,
        'school_org_code': course_json.get('org', {}).get('school', {}).get('org_code') if course_json.get('org', {}).get('school') else None,
        'school_org_name': course_json.get('org', {}).get('school', {}).get('org_name') if course_json.get('org', {}).get('school') else None,
        'school_org_name_en': course_json.get('org', {}).get('school', {}).get('org_name_en') if course_json.get('org', {}).get('school') else None,
        'school_org_url': course_json.get('org', {}).get('school', {}).get('org_url') if course_json.get('org', {}).get('school') else None,
        'ghk': course_json.get('ghk'),
        'instruction_languages': ','.join(course_json.get('instruction_languages', [])),
        'description': course_json.get('description'),
        'description_en': course_json.get('description_en'),
        'tumonline_url': course_json.get('tumonline_url'),
    }
    return flat

# --- Place the code below here ---
all_courses = []
limit = 200  # maximum allowed by API
offset = 0

while True:
    params = {
        "limit": limit,
        "offset": offset
    }
    response = requests.get("https://api.srv.nat.tum.de/api/v1/course/", params=params)
    data = response.json()
    courses_batch = data['hits']
    if not courses_batch:
        break
    for course in courses_batch:
        all_courses.append(flatten_course(course))
    offset += limit
    if offset >= data['total_count']:
        break

courses_df = pd.DataFrame(all_courses)
display(courses_df.head())


Unnamed: 0,course_code,course_name,course_name_en,course_id,course_name_list,course_name_list_en,hoursperweek,semester_tag,semester_title,semester_title_en,...,school_org_id,school_org_code,school_org_name,school_org_name_en,school_org_url,ghk,instruction_languages,description,description_en,tumonline_url
0,3187,310 K,310 K,950800711,0000003187: 310 K (Fischer),0000003187: 310 K (Fischer),2.0,SS 2025,Sommersemester 2025,Summer Semester 2025,...,,,,,,90027189,DE,,,https://campus.tum.de/tumonline/ee/ui/ca2/app/...
1,4139,3D Scanning & Motion Capture (IN2354),3D Scanning & Motion Capture (IN2354),950801576,0000004139: 3D Scanning & Motion Capture (IN23...,0000004139: 3D Scanning & Motion Capture (IN23...,2.0,SS 2025,Sommersemester 2025,Summer Semester 2025,...,,,,,,90045507,EN,"3D reconstruction, RGB-D scanning (Kinect, Tan...","3D reconstruction, RGB-D scanning (Kinect, Tan...",https://campus.tum.de/tumonline/ee/ui/ca2/app/...
2,293,5G New Radio Communications: Physical Layer Ch...,5G New Radio Communications: Physical Layer Ch...,950803625,0000000293: 5G New Radio Communications: Physi...,0000000293: 5G New Radio Communications: Physi...,2.0,SS 2025,Sommersemester 2025,Summer Semester 2025,...,,,,,,90085722,EN,Die Integration neuartiger Bit- und Signalvera...,The integration of novel bit and signal proces...,https://campus.tum.de/tumonline/ee/ui/ca2/app/...
3,5544,Abiotischer Waldschutz,Abiotic forest protection,950802013,0000005544: Abiotischer Waldschutz (Menzel),0000005544: Abiotic forest protection (Menzel),0.5,SS 2025,Sommersemester 2025,Summer Semester 2025,...,,,,,,90063727,DE,,,https://campus.tum.de/tumonline/ee/ui/ca2/app/...
4,2854,Academic Prompt Engineering and Management: a ...,Academic Prompt Engineering and Management: a ...,950844553,0000002854: Academic Prompt Engineering and Ma...,0000002854: Academic Prompt Engineering and Ma...,1.0,SS 2025,Sommersemester 2025,Summer Semester 2025,...,,,,,,90101274,EN,This module provides students from all parts o...,This module builds on the lecture and provides...,https://campus.tum.de/tumonline/ee/ui/ca2/app/...


In [119]:
# Count the rows of the DataFrame
print(f"Total number of courses: {len(courses_df)}")

# Save the DataFrame to a CSV file
courses_df.to_csv('courses.csv', index=False, encoding='utf-8-sig')

Total number of courses: 6211


# 4. Step: Connect Area to Course