In [1]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd

### Start from the KUSSS course catalogue and navigate to the curriculum of one study program

In [2]:
def get_study_program_dropdown_element(base_url_kusss):
    """Fetch the course catalogue start page and return one of its forms.

    Args:
        base_url_kusss: Base URL to which "coursecatalogue-start.action" is appended.

    Returns:
        The ``<form>`` element at index 2 of all forms found on the fetched page.

    Raises:
        IndexError: If the page contains fewer than three ``<form>`` elements.
    """
    # entry point of the crawl: the course catalogue start page
    catalog_response = requests.get(base_url_kusss + "coursecatalogue-start.action")
    catalog_soup = BeautifulSoup(catalog_response.text, 'html.parser')

    # NOTE(review): index 2 is presumably the study-program dropdown form -- confirm against the live page
    all_forms = catalog_soup.find_all("form")
    return all_forms[2]

In [3]:
def build_base_request_url_study_program(base_url_kusss, html_studienpläne_dropdown):
    """Build the URL prefix used to request the page of a specific study program.

    The result has the shape
    ``<base><action>?<input name>=<input value>&<select name>=`` and is meant to
    have a study program id appended by the caller.

    Args:
        base_url_kusss: Base URL that the form's ``action`` is appended to.
        html_studienpläne_dropdown: Form element exposing ``action`` (and
            optionally ``method``) attributes, an ``<input>`` child with ``name``
            and ``value``, and a ``<select class="dropdown">`` child with ``name``.

    Returns:
        The URL prefix as a string, ending in ``=``.

    Raises:
        ValueError: If the form declares a method other than GET; its parameters
            could then not be expressed as a query string.
    """
    action = html_studienpläne_dropdown["action"]

    # The method attribute is case-insensitive in HTML and defaults to GET when absent.
    # Comparing against the exact string "get" left the separator undefined otherwise.
    form_method = (html_studienpläne_dropdown.get("method") or "get").strip().lower()
    if form_method != "get":
        raise ValueError(
            "Cannot build a query URL for a form with method " + repr(form_method)
        )

    # first <input> of the form supplies the fixed query parameter
    first_input = html_studienpläne_dropdown.input
    input_string = first_input["name"] + "=" + first_input["value"]

    select_name = html_studienpläne_dropdown.find("select", {"class": "dropdown"})["name"]

    return base_url_kusss + action + "?" + input_string + "&" + select_name + "="

In [4]:
def get_study_program_details(html_studienpläne_dropdown):
    """Map each study program name in the dropdown to its ``value`` attribute.

    Args:
        html_studienpläne_dropdown: Element whose ``<option class="dropdownentry">``
            descendants describe the study programs.

    Returns:
        dict mapping the whitespace-normalised option text to the option's
        ``value``. Options with identical text overwrite each other.
    """
    # find_all is the supported spelling (findAll is a deprecated alias) and
    # matches the calls used elsewhere in this notebook
    options = html_studienpläne_dropdown.find_all("option", {"class": "dropdownentry"})

    studiengang_dict = {}
    # the first option is skipped -- presumably a placeholder entry (TODO confirm)
    for studiengang in options[1:]:
        # collapse newlines, tabs and runs of spaces into single spaces
        words = [x for x in re.split("\n|\t| ", studiengang.get_text()) if x != ""]
        studiengang_dict[" ".join(words)] = studiengang["value"]

    return studiengang_dict

In [5]:
#base_url_kusss = "https://www.kusss.jku.at/kusss/"

#html_element_studienpläne_dropdown = get_study_program_dropdown_element(base_url_kusss)

#study_program_dict = get_study_program_details(html_element_studienpläne_dropdown)

#study_program_id = "201"

#request_url_studiengang = build_base_request_url_study_program(base_url_kusss,html_element_studienpläne_dropdown)

#request_url_studiengang += study_program_id

### Get all courses of one study program

In [6]:
def get_all_lva_names_urls(request_url_study_program):
    """Collect the names and relative URLs of all courses linked on a curriculum page.

    Args:
        request_url_study_program: Full URL of the curriculum page to fetch.

    Returns:
        dict mapping the link text (runs of spaces collapsed) to the link's
        ``href``. Links with identical text overwrite each other.
    """
    page_curriculum = requests.get(request_url_study_program)
    soup_studiengang = BeautifulSoup(page_curriculum.text, 'html.parser')

    # Match the literal substring. Unescaped, "." matched any character and the
    # trailing "?" made the final "n" optional instead of matching a question mark.
    course_link_pattern = re.compile(re.escape("coursecatalogue-get-courseclasses.action?"))
    lva_elements = soup_studiengang.find_all("a", href=course_link_pattern)

    # store all names and urls in a dictionary
    dict_lva = {}
    for lva in lva_elements:
        name = " ".join(x for x in re.split(" ", lva.get_text()) if x != "")
        dict_lva[name] = lva["href"]

    return dict_lva

#get_all_lva_names_urls(request_url_studiengang)

In [7]:
#name_lva = 'VL Mathematik 1' # 'VL Mathematik 2'
#url_lva = 'coursecatalogue-get-courseclasses.action?curId=201&segId=1&grpCode=43383' # 'coursecatalogue-get-courseclasses.action?curId=201&segId=1&grpCode=43385'

In [8]:
#base_url_kusss + url_lva

### Go to the registration page of the course and extract its information

In [9]:
def clean_string_lva_details(string, delimiters):
    """Split ``string`` on the regex ``delimiters`` and re-join the non-empty pieces.

    Args:
        string: Text to normalise.
        delimiters: Regular expression passed to ``re.split``.

    Returns:
        The non-empty pieces joined by single spaces.
    """
    kept_pieces = []
    for piece in re.split(delimiters, string):
        if piece != "":
            kept_pieces.append(piece)
    return " ".join(kept_pieces)
    
def clean_string_lva_dates(string, delimiter):
    """Split ``string`` on the regex ``delimiter``.

    Args:
        string: Text to split.
        delimiter: Regular expression describing the separators.

    Returns:
        list of all pieces, including empty strings between adjacent separators.
    """
    splitter = re.compile(delimiter)
    return splitter.split(string)

In [60]:
def get_lva_meta_data(base_url_kusss, lva_url):
    """Scrape type, number, capacity and enrolment of a course from its pages.

    Two pages are fetched: the course overview page at ``base_url_kusss + lva_url``
    and, if the course has at least one group, the registration page linked from it.

    Args:
        base_url_kusss: Base URL that relative links are appended to.
        lva_url: Relative URL of the course overview page.

    Returns:
        ``None`` if the overview page reports 0 groups, otherwise the tuple
        ``(lva_number, lva_type, max_number_students, enrolled_students, lva_dates_url)``.

    Raises:
        IndexError, TypeError, ValueError: If the pages do not have the table
            layout this function indexes into, or a count is not an integer.
    """
    # harvest all the information from the lva overview page
    lva_page = requests.get(base_url_kusss + lva_url)
    soup_lva_page = BeautifulSoup(lva_page.text, 'html.parser')

    # the cells of the last table on the page hold type, registration link and group count
    lva_info = soup_lva_page.find_all("table")[-1].find_all("td")

    lva_type = lva_info[0].get_text()
    lva_url_registration_page = lva_info[1].a["href"]
    lva_number_of_groups = int(lva_info[2].get_text())

    # a course without groups is not held in the current semester
    if lva_number_of_groups == 0:
        return None

    # harvest all the information from the lva registration page
    lva_registration_page = requests.get(base_url_kusss + lva_url_registration_page)
    soup_lva_registration_page = BeautifulSoup(lva_registration_page.text, 'html.parser')

    # NOTE(review): table index 3 is layout-dependent -- confirm against the live page
    lva_registration_table = soup_lva_registration_page.find_all("table")[3]
    lva_details = lva_registration_table.find_all("td", {"align": "center"})

    # extract all information from the lva details table, using the shared
    # whitespace-normalising helper for every cell
    whitespace = "\n|\t| "
    lva_number = clean_string_lva_details(lva_details[0].get_text(), whitespace)
    max_number_students = int(clean_string_lva_details(lva_details[1].get_text(), whitespace))
    enrolled_students = int(clean_string_lva_details(lva_details[-2].get_text(), whitespace))

    lva_dates_url = lva_details[0].a["href"]

    return lva_number, lva_type, max_number_students, enrolled_students, lva_dates_url


def get_lva_dates_details(base_url_kusss, lva_nr, lva_dates_url, study_program_name):
    """Scrape the individual dates of a course into a DataFrame.

    Rows of the dates table are consumed alternately: a row at an even position
    is a date entry, the row after it carries the remark for that date.

    Args:
        base_url_kusss: Base URL that ``lva_dates_url`` is appended to.
        lva_nr: Course number stored in the "LVA-Nummer" column of every row.
        lva_dates_url: Relative URL of the page listing the course dates.
        study_program_name: Value stored in the "Studiengang" column of every row.

    Returns:
        pandas.DataFrame with the columns "LVA-Nummer", "Wochentag", "Datum",
        "Startzeit", "Endzeit", "Ort", "Anmerkung", "Studiengang" and one row
        per date, indexed 0..n-1.
    """
    columns = ["LVA-Nummer", "Wochentag", "Datum", "Startzeit", "Endzeit", "Ort", "Anmerkung", "Studiengang"]
    remark_position = columns.index("Anmerkung")

    lva_details_page = requests.get(base_url_kusss + lva_dates_url)
    soup_lva_details_page = BeautifulSoup(lva_details_page.text, 'html.parser')

    # NOTE(review): table index 6 is layout-dependent -- confirm against the live page
    date_list_uncleaned = soup_lva_details_page.find_all("table")[6].find_all("tr")[1:-1]

    # Rows are collected in a plain list and turned into a DataFrame once. This
    # avoids growing the frame row by row and the chained assignment
    # df["Anmerkung"][i] = ..., which is a silent no-op under pandas Copy-on-Write.
    rows = []

    # NOTE(review): together with the [1:-1] slice above, "- 1" leaves the final
    # remaining row unprocessed; kept as in the original -- confirm it is intended.
    for idx in range(len(date_list_uncleaned) - 1):

        date_info = [x.strip() for x in clean_string_lva_dates(date_list_uncleaned[idx].get_text(), "\n|\t|–") if x != ""]

        if idx % 2 == 0:
            # pad short rows so that all five date fields exist
            date_info += [" "] * (5 - len(date_info))
            rows.append([lva_nr, *date_info[:5], "", study_program_name])
        elif date_info:
            # an odd row is the remark belonging to the date row directly before it
            rows[idx // 2][remark_position] = " ".join(date_info)

    return pd.DataFrame(rows, columns=columns)


In [61]:
# Crawl every study program of the course catalogue and persist the collected
# courses ("lva.csv") and their dates ("dates.csv").
base_url_kusss = "https://www.kusss.jku.at/kusss/"

html_element_studienpläne_dropdown = get_study_program_dropdown_element(base_url_kusss)

study_program_dict = get_study_program_details(html_element_studienpläne_dropdown)

# the URL prefix is identical for every program, so it is built once up front
base_request_url_studiengang = build_base_request_url_study_program(base_url_kusss, html_element_studienpläne_dropdown)

lva_columns = ["LVA-Nummer", "LVA-Name", "LVA-Typ", "Kapazität", "Anmeldungen", "URL", "Studiengang", "Studiengang Nummer Kusss"]
dates_columns = ["LVA-Nummer", "Wochentag", "Datum", "Startzeit", "Endzeit", "Ort", "Anmerkung", "Studiengang"]

lva_dataframe = pd.DataFrame(columns=lva_columns)
dates_dataframe = pd.DataFrame(columns=dates_columns)

# rows / frames are accumulated in lists and combined once per study program
lva_records = []
dates_frames = [dates_dataframe]

for study_program_name, study_program_id in study_program_dict.items():

    request_url_studiengang = base_request_url_studiengang + study_program_id

    lva_dict = get_all_lva_names_urls(request_url_studiengang)

    for lva_name, lva_url in lva_dict.items():
        res = get_lva_meta_data(base_url_kusss, lva_url)
        if res is None:
            # course is not held in the current semester
            continue

        # reuse the result instead of scraping the same pages a second time
        lva_number, lva_type, max_number_students, enrolled_students, lva_dates_url = res
        lva_records.append([lva_number, lva_name, lva_type, max_number_students, enrolled_students, lva_dates_url, study_program_name, study_program_id])

        dates_frames.append(get_lva_dates_details(base_url_kusss, lva_number, lva_dates_url, study_program_name))

    # checkpoint after every study program so an interrupted crawl keeps its progress
    lva_dataframe = pd.DataFrame(lva_records, columns=lva_columns)
    dates_dataframe = pd.concat(dates_frames)
    dates_frames = [dates_dataframe]

    dates_dataframe.to_csv("dates.csv")
    lva_dataframe.to_csv("lva.csv")