In [68]:
import requests
from lxml import html

# URL of the page to scrape
base_url = "https://campus.tum.de/tumonline"
study_programs_url = base_url + "/wbstpportfolio.wbStpList?pOrgNr=1&pSort=&pLanguageCode=DE&pStpStatus=N&pSjNr=1621"

# Fetch the page
study_programs = requests.get(study_programs_url)
study_programs.content



In [69]:
# Parse the HTML content
study_programs_content_html = html.fromstring(study_programs.content)
# Show the HTML structure
print(html.tostring(study_programs_content_html, pretty_print=True).decode())

<html lang="de" id="id-page-7027640" class="fwk-co app-str">
<head>

<meta http-equiv="" name="viewport" content="width=device-width, initial-scale=1">
<meta http-equiv="pragma" name="" content="no-cache">
<meta http-equiv="" name="Author" content="Technische Universit&#228;t M&#252;nchen">

<title>Studienangebot - TUMonline - Technische Universit&#228;t M&#252;nchen</title>

        <script type="text/javascript">
        window.caGlobalContext = {};
        if (window.CA_THEME_CACHE_TOKEN) {
          window.caGlobalContext.antiCacheToken = window.CA_THEME_CACHE_TOKEN;
        } else {
          window.caGlobalContext.antiCacheToken = '1510687599792';
        }

        var css =
            "<link rel='stylesheet' href='pages/co-default/css-variables.css?antiCache=" + window.caGlobalContext.antiCacheToken + "'>" +
            "<link id='id-custom-css-vars' rel='stylesheet' href='pages/co-user/css-variables.css?antiCache=" + window.caGlobalContext.antiCacheToken + "'>";
        conso

In [72]:
import pandas as pd
from urllib.parse import urljoin

rows = study_programs_content_html.xpath('//tr')
current_degree = None
programs = []

for row in rows:
    tds = row.xpath('./td')
    # Degree header: only one <td> with colspan
    if len(tds) == 1 and tds[0].get('colspan'):
        current_degree = tds[0].text_content().strip()
    # Program row: has class "coRow"
    elif 'coRow' in (row.get('class') or ''):
        program_id = row.get('id')
        if not program_id:
            continue
        if len(tds) < 7:
            continue
        # Field of studies from the 4th <td>
        field_of_studies = tds[3].text_content().strip()
        # Curriculum from the <a> in the 3rd <td>
        curriculum_link = tds[2].xpath('.//span/a[1]')
        curriculum = curriculum_link[0].text_content().strip() if curriculum_link else ""
        # Program link from the <a> in the 3rd <td>
        link = curriculum_link[0].get('href') if curriculum_link else ""
        full_link = urljoin(base_url, link) if link else ""
        # ECTS Credits from the 5th <td>
        ects_credits = tds[4].text_content().strip()
        # Semester from the 6th <td>
        semester = tds[5].text_content().strip()
        programs.append({
            'Degree': current_degree,
            'ID': program_id,
            'Curriculum': curriculum,
            'Field of studies': field_of_studies,
            'ECTS Credits': ects_credits,
            'Semester': semester,
            'Link': full_link,
        })

df = pd.DataFrame(programs)
display(df)

Unnamed: 0,Degree,ID,Curriculum,Field of studies,ECTS Credits,Semester,Link
0,04 TopMath Promotion,4857,Mathe. mit zusätzl. Promo.stg. Mathe. - Promot...,Mathematik,,8,javascript:alert('Details zum Curriculum sind ...
1,05 Teilpromotion,4094,Agrarwissenschaften (kA/0),Agrarwissenschaft/Landwirtschaft,6,6,https://campus.tum.de/wbstpcs.showSpoTree?pStS...
2,05 Teilpromotion,2550,Architektur (kA/0),Architektur,,6,https://campus.tum.de/wbstpcs.showSpoTree?pStS...
3,05 Teilpromotion,2722,Bauingenieurwesen (kA/0),Bauingenieurwesen/Ingenieurbau,,6,javascript:alert('Details zum Curriculum sind ...
4,05 Teilpromotion,4127,Biochemie (kA/0),Biochemie,,6,https://campus.tum.de/wbstpcs.showSpoTree?pStS...
...,...,...,...,...,...,...,...
551,98 Austauschprogramm (nicht studienbeitragspfl...,5315,Politikwissenschaft - Austausch (kA/20241),Politikwissenschaft,,,https://campus.tum.de/wbstpcs.showSpoTree?pStS...
552,98 Austauschprogramm (nicht studienbeitragspfl...,5323,Research on Teaching and Learning (kA/20241),Erziehungswissenschaft (Pädagogik),,,javascript:alert('Details zum Curriculum sind ...
553,98 Austauschprogramm (nicht studienbeitragspfl...,5331,Science and Technology Studies (kA/20241),ohne Angabe/ungeklärt,,,https://campus.tum.de/wbstpcs.showSpoTree?pStS...
554,98 Austauschprogramm (nicht studienbeitragspfl...,5310,Sportwissenschaft (kA/20241),Sportwissenschaft,,,https://campus.tum.de/wbstpcs.showSpoTree?pStS...


In [None]:
# Iterate over the dataframe and if the Curriculum is "Information Systems (kA/20221)" then store the Link in a variable. 
# We do this to access the website of the study program and scrape the information we need.

curriculum_to_scrape = "Information Systems (kA/20221)"
link_to_scrape = df.loc[df['Curriculum'] == curriculum_to_scrape, 'Link'].values[0]

# Cut the https://campus.tum.de/ from the link and replace it with https://campus.tum.de/tumonline
link_to_scrape = link_to_scrape.replace("https://campus.tum.de/", "https://campus.tum.de/tumonline/")

# Fetch the page for the specific curriculum
curriculum_page = requests.get(link_to_scrape)
# Parse the HTML content of the curriculum page
curriculum_page_content_html = html.fromstring(curriculum_page.content)
# Show the HTML structure of the curriculum page
print(html.tostring(curriculum_page_content_html, pretty_print=True).decode())


<html lang="de" id="id-page-studplan" class="fwk-co app-sts">
<head>

<meta http-equiv="" name="viewport" content="width=device-width, initial-scale=1">
<meta http-equiv="pragma" name="" content="no-cache">
<meta http-equiv="" name="Author" content="Technische Universit&#228;t M&#252;nchen">

<title>Studienplan - TUMonline - Technische Universit&#228;t M&#252;nchen</title>

        <script type="text/javascript">
        window.caGlobalContext = {};
        if (window.CA_THEME_CACHE_TOKEN) {
          window.caGlobalContext.antiCacheToken = window.CA_THEME_CACHE_TOKEN;
        } else {
          window.caGlobalContext.antiCacheToken = '1510687599792';
        }

        var css =
            "<link rel='stylesheet' href='pages/co-default/css-variables.css?antiCache=" + window.caGlobalContext.antiCacheToken + "'>" +
            "<link id='id-custom-css-vars' rel='stylesheet' href='pages/co-user/css-variables.css?antiCache=" + window.caGlobalContext.antiCacheToken + "'>";
        console

In [80]:
import pandas as pd

# Find all rows in the table body
rows = curriculum_page_content_html.xpath('//table[@id="tgt"]/tbody/tr')
data = []

for row in rows:
    tds = row.xpath('./td')
    # Only process rows with at least 5 columns (the relevant data rows)
    if len(tds) >= 5:
        # Knotenfilter-Bezeichnung (first <td>)
        knotenfilter = tds[0].xpath('.//span/span[@class="KnotenText noBorder kt kt1 TextToolTip " or contains(@class, "KnotenText")]/text()')
        knotenfilter = knotenfilter[0].strip() if knotenfilter else tds[0].text_content().strip()
        # empf. Sem. (third <td>)
        empf_sem = tds[2].text_content().strip()
        # ECTS Cr. (fourth <td>)
        ects_cr = tds[3].text_content().strip()
        # GF (fifth <td>)
        gf = tds[4].text_content().strip()
        data.append({
            "Knotenfilter-Bezeichnung": knotenfilter,
            "empf. Sem.": empf_sem,
            "ECTS Cr.": ects_cr,
            "GF": gf
        })

df_knoten = pd.DataFrame(data)
display(df_knoten)

Unnamed: 0,Knotenfilter-Bezeichnung,empf. Sem.,ECTS Cr.,GF
0,[20221] Information Systems,,120,1
1,Master's Thesis,,30,1
2,Pflichtmodule Informatik,,8,1
3,Pflichtmodule Wirtschaftsinformatik,,13,1
4,Wahlmodule Entwicklungspraktikum,,10,1
5,Übergreifende Wahlmodule,,53,1
6,Wahlmodule Überfachliche Grundlagen,,6,1


In [99]:
import pandas as pd

rows = curriculum_page_content_html.xpath('//table[@id="tgt"]/tbody/tr')
data = []

i = 0
while i < len(rows):
    row = rows[i]
    tds = row.xpath('./td')
    if len(tds) >= 5:
        # Outer row values
        knotenfilter = tds[0].xpath('.//span/span[@class="KnotenText noBorder kt kt1 TextToolTip " or contains(@class, "KnotenText")]/text()')
        knotenfilter = knotenfilter[0].strip() if knotenfilter else tds[0].text_content().strip()
        empf_sem = tds[2].text_content().strip()
        ects_cr = tds[3].text_content().strip()
        gf = tds[4].text_content().strip()
        # Check for sub-rows (Level 2)
        next_row = row.getnext()
        has_subrow = False
        if next_row is not None:
            sub_tds = next_row.xpath('./td')
            if len(sub_tds) == 1:
                # There may be multiple Level 2 entries in the subrow
                level2_spans = sub_tds[0].xpath('./div/span/a/span') # TODO: Something is wrong here
                level2_links = sub_tds[0].xpath('./div/span/a/img') # TODO: Something is wrong here
                for idx, level2_elem in enumerate(level2_spans):
                    level2 = level2_elem.text_content().strip()
                    link = level2_links[idx].get('src') if idx < len(level2_links) else ""
                    data.append({
                        "Knotenfilter-Bezeichnung": knotenfilter,
                        "Knotenfilter-Bezeichnung Level 2": "",
                        "empf. Sem.": empf_sem,
                        "ECTS Cr.": ects_cr,
                        "GF": gf,
                        "Link": ""
                    })
                has_subrow = True
        if not has_subrow:
            data.append({
                "Knotenfilter-Bezeichnung": knotenfilter,
                "Knotenfilter-Bezeichnung Level 2": "",
                "empf. Sem.": empf_sem,
                "ECTS Cr.": ects_cr,
                "GF": gf,
                "Link": ""
            })
        i += 2 if has_subrow else 1
    else:
        i += 1

df_knoten = pd.DataFrame(data)
display(df_knoten)

Unnamed: 0,Knotenfilter-Bezeichnung,Knotenfilter-Bezeichnung Level 2,empf. Sem.,ECTS Cr.,GF,Link
0,Master's Thesis,,,30,1,
1,Pflichtmodule Informatik,,,8,1,
2,Pflichtmodule Wirtschaftsinformatik,,,13,1,
3,Wahlmodule Entwicklungspraktikum,,,10,1,
4,Übergreifende Wahlmodule,,,53,1,
5,Wahlmodule Überfachliche Grundlagen,,,6,1,
