In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
Get all course hyperlinks shown by a kurser.ku.dk search.

Example
-------
search_url = (
    "https://kurser.ku.dk/#q=&education=&programme=&volume=&"
    "departments=DEPARTMENT_0011&faculty=FACULTY_0005&studyBlockId=&"
    "teachingLanguage=&period=&schedules=&studyId=&openUniversityInternational=0&searched=1"
)

links = get_course_links_from_search(search_url)
print(len(links), "links found")
print(links[:5])
"""

from __future__ import annotations
import re, html
from urllib.parse import urlparse, parse_qs, urljoin
from typing import List
import requests
from bs4 import BeautifulSoup as BS

BASE = "https://kurser.ku.dk"
HEADERS = {"User-Agent": "KU course link scraper (https://github.com/you)"}


def _hash_url_to_search_params(hash_url: str) -> dict:
    """
    Convert the fragment part (after '#') of the catalogue URL into the
    real querystring used by /search.
    """
    frag = urlparse(hash_url).fragment            # part after '#'
    qs = parse_qs(frag, keep_blank_values=True)

    # Mapping: hashed names → /search parameter names
    # (see submitSearchForm() in site JS)
    mapping = {
        "q": "q",
        "studyBlockId": "studyBlockId",
        "teachingLanguage": "teachingLanguage",
        "period": "period",
        "schedules": "schedules",
        "studyId": "studyId",
        "openUniversityInternational": "openUniversityInternational",
        "programme": "programme",
        "faculty": "faculty",
        "departments": "departments",
        "volume": "volume",
    }
    params = {dst: qs.get(src, [""])[0] for src, dst in mapping.items()}
    return params


def get_course_links_from_search(catalogue_url: str) -> List[str]:
    """
    Return a list of absolute course URLs for the supplied kurser.ku.dk
    *search page* (the URL with the long #fragment).
    """
    # 1. Convert hash-part into /search query parameters
    params = _hash_url_to_search_params(catalogue_url)

    # 2. Call the hidden JSON-less endpoint that delivers the table markup
    r = requests.get(urljoin(BASE, "/search"), params=params, headers=HEADERS, timeout=10)
    r.raise_for_status()

    soup = BS(r.text, "lxml")

    # 3. Extract every <a href="/course/…"> inside the results table
    links = [
        urljoin(BASE, a["href"])
        for a in soup.select("#searchresults a[href]")
        if re.match(r"^/course/", a["href"])
    ]

    return links


# ──────────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    search_url = (
        "https://kurser.ku.dk/#q=&education=&programme=&volume=&"
        "departments=DEPARTMENT_0011&faculty=FACULTY_0005&studyBlockId=&"
        "teachingLanguage=&period=&schedules=&studyId=&openUniversityInternational=0&searched=1"
    )

    links = get_course_links_from_search(search_url)
    print(f"{len(links)} links found")
    for url in links[:10]:
        print(" •", url)


192 links found
 • https://kurser.ku.dk/course/ndaa09023u/2024-2025
 • https://kurser.ku.dk/course/ndaa09023u/2025-2026
 • https://kurser.ku.dk/course/ndak15006u/2024-2025
 • https://kurser.ku.dk/course/ndak15006u/2025-2026
 • https://kurser.ku.dk/course/ndaa09013u/2024-2025
 • https://kurser.ku.dk/course/ndaa09013u/2025-2026
 • https://kurser.ku.dk/course/ndak24003u/2025-2026
 • https://kurser.ku.dk/course/ndak24003u/2024-2025
 • https://kurser.ku.dk/course/ndak15012u/2024-2025
 • https://kurser.ku.dk/course/ndak15012u/2025-2026


# Scrape Course Contents

In [3]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
Scrape a single course page from kurser.ku.dk.

Example
-------
url = "https://kurser.ku.dk/course/ndaa09023u/2024-2025"
data = scrape_course_page(url)
print(json.dumps(data, indent=2, ensure_ascii=False))
"""

from __future__ import annotations
import re, json, html, requests
from typing import Dict, Any, List, Optional
from bs4 import BeautifulSoup as BS

HEADERS = {"User-Agent": "KU course scraper (https://github.com/you)"}


# ──────────────────────────────────────────────────────────────────────────
def _norm(txt: str) -> str:
    """Collapse whitespace and strip."""
    return re.sub(r"\s+", " ", txt).strip()


def _section_text(div: BS) -> str:
    """Get plain-text content of a section (<div id="course-xxx"> …)."""
    return _norm(div.get_text(separator=" "))


def _parse_workload(div: Optional[BS]) -> Dict[str, int] | None:
    """
    Convert the “Workload” list to {'Lectures':36, 'Preparation':82, …}.
    Returns None if parsing fails.
    """
    if div is None:
        return None
    items = div.select("ul.list-unstyled.workload > li")
    if len(items) % 2:  # expect pairs: label/number
        return None
    out: Dict[str, int] = {}
    for i in range(0, len(items), 2):
        label = _norm(items[i + 0].text)
        val   = _norm(items[i + 1].text)
        try:
            out[label] = int(val)
        except ValueError:
            pass
    return out or None


def scrape_course_page(url: str) -> Dict[str, Any]:
    """Return a dict with the most useful course fields."""
    res: Dict[str, Any] = {
        "url": url,
        "title": None,
        "volume": None,
        "education": None,
        "content": None,
        "learning_outcome": None,
        "literature": None,
        "recommended_prereq": None,
        "teaching_methods": None,
        "workload": None,
        "feedback_form": None,
        "signup": None,
        "exam_html": None,
        # right-panel basics
        "language": None,
        "course_code": None,
        "ects": None,
        "level": None,
        "duration": None,
        "placement": None,
        "schedule": None,
        "capacity": None,
        "study_board": None,
        "department": None,
        "faculty": None,
        "course_coordinators": None,
        "last_modified": None,
    }

    html_text = requests.get(url, headers=HEADERS, timeout=10).text
    soup = BS(html_text, "lxml")

    # ───────── title, volume ─────────
    h1 = soup.select_one("h1.courseTitle")
    if h1:
        res["title"] = _norm(h1.text)
    vol = soup.select_one(".courseVolume")
    res["volume"] = _norm(vol.text) if vol else None

    # ───────── left-column sections ─────────
    sections = {
        "education":      "#course-name",
        "content":        "#course-content",
        "learning_outcome": "#course-description",
        "literature":     "#course-materials",
        "recommended_prereq": "#course-skills",
        "teaching_methods":  "#course-form",
        "workload_raw":      "#course-load",
        "feedback_form":     "#course-feedback",
        "signup":            "#course-signup",
        "exam_html":         "#course-exams1",
    }
    for key, sel in sections.items():
        div = soup.select_one(sel)
        if div:
            if key == "workload_raw":
                res["workload"] = _parse_workload(div)
            elif key == "exam_html":
                # keep basic HTML for later parsing if needed
                res[key] = div.decode()
            else:
                res[key] = _section_text(div)

    # ───────── right-panel “Course information” table ─────────
    info_dl = soup.select_one(".panel-body dl.dl-horizontal")
    if info_dl:
        dts = [ _norm(dt.text) for dt in info_dl.select("dt") ]
        dds = [ _norm(dd.text) for dd in info_dl.select("dd") ]
        mapping = dict(zip(dts, dds))
        res.update({
            "language":        mapping.get("Language"),
            "course_code":     mapping.get("Course code"),
            "ects":            mapping.get("Credit"),
            "level":           mapping.get("Level"),
            "duration":        mapping.get("Duration"),
            "placement":       mapping.get("Placement"),
            "schedule":        mapping.get("Schedule"),
            "capacity":        mapping.get("Course capacity"),
        })

    # study board / department / faculty / coordinators
    def _ul_text(sel: str) -> Optional[List[str]]:
        ul = soup.select_one(sel)
        if not ul: return None
        return [_norm(li.text) for li in ul.select("li")]

    res["study_board"]         = _ul_text("h5:contains('Study board') + ul")
    res["department"]          = _ul_text("h5:contains('Contracting department') + ul")
    res["faculty"]             = _ul_text("h5:contains('Contracting faculty') + ul")
    res["course_coordinators"] = _ul_text("h5:contains('Course Coordinators') + ul")

    # last-modified
    lm = soup.select_one(".last-modified")
    res["last_modified"] = _norm(lm.text) if lm else None

    return res


# ──────────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    import sys, re
    url = next((a for a in sys.argv[1:] if re.match(r"^https?://", a)), 
               "https://kurser.ku.dk/course/ndaa09023u/2024-2025")
    print(json.dumps(scrape_course_page(url), indent=2, ensure_ascii=False))



{
  "url": "https://kurser.ku.dk/course/ndaa09023u/2024-2025",
  "title": "NDAA09023U Advanced Algorithms and Data Structures (AADS)",
  "volume": "Volume 2024/2025",
  "education": "MSc Programme in Computer Science MSc Programme in Computer Science (part time) MSc Programme in Computer Science with a minor subject MSc Programme in Bioinformatics",
  "content": "Algorithms is about finding scalable solutions to computational problems, and the reliance is only increasing as we enter the world of Big Data. We want algorithms that solve problems efficiently relative to the input size. Exponential time is hopeless. We generally want polynomial time, and for large problems we need linear time. Sometimes we employ data structures that represent the input so that queries about it can be answered very efficiently. In this mandatory course, we will study the list of algorithmic topics below. Some of these topics are covered in more depth in more specialised elective courses.",
  "learning_outc



# Create DF for multiple courses

In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
Batch-scrape many kurser.ku.dk course pages, stream them to a CSV, and return a
DataFrame with the newly scraped rows.

Requires that `scrape_course_page()` (shown earlier) is in scope or imported.
"""

from __future__ import annotations
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import List, Tuple
import pandas as pd


# ──────────────────────────────────────────────────────────────────────────
def scrape_courses_to_df_csv(
    urls: List[str],
    csv_path: str,
    *,
    batch_size: int = 10,
    max_workers: int = 10,
    write_index: bool = False,
) -> pd.DataFrame:
    """
    • Scrapes every URL in `urls` (parallel inside each batch).  
    • After each batch finishes it *appends* rows to `csv_path`
      so nothing is lost if the run is interrupted.  
    • Returns a DataFrame with all newly scraped rows.

    Parameters
    ----------
    urls : list[str]
        Course URLs to scrape.
    csv_path : str
        Destination CSV file (created if absent, appended otherwise).
    batch_size : int, default 10
        How many URLs to process per batch.
    max_workers : int, default 10
        Thread-pool size (concurrency per batch).
    write_index : bool, default False
        Whether to include the DataFrame index in the CSV.

    Returns
    -------
    pd.DataFrame
        The rows scraped in *this* call (handy for immediate inspection).
    """
    header_needed = not os.path.isfile(csv_path)
    all_rows: list[dict] = []

    for i in range(0, len(urls), batch_size):
        batch = urls[i : i + batch_size]

        # -------- scrape one batch in parallel ----------
        rows: list[dict] = []
        with ThreadPoolExecutor(max_workers=max_workers) as ex:
            futs = {ex.submit(scrape_course_page, u): u for u in batch}
            for fut in as_completed(futs):
                rows.append(fut.result())

        # -------- append to cumulative list & CSV ----------
        all_rows.extend(rows)
        pd.DataFrame(rows).to_csv(
            csv_path,
            mode="a",
            header=header_needed,
            index=write_index,
            encoding="utf-8",
        )
        header_needed = False  # only for very first write

    return pd.DataFrame(all_rows)


# ──────────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    example_urls = [
        "https://kurser.ku.dk/course/ndaa09023u/2024-2025",
        "https://kurser.ku.dk/course/ndaa09023u/2025-2026",
        "https://kurser.ku.dk/course/ndak15006u/2024-2025",
        "https://kurser.ku.dk/course/ndak15006u/2025-2026",
        "https://kurser.ku.dk/course/ndaa09013u/2024-2025",
        "https://kurser.ku.dk/course/ndaa09013u/2025-2026",
    ]

    csv_file = "ku_courses.csv"

    df = scrape_courses_to_df_csv(example_urls, csv_file, batch_size=3, max_workers=6)
    print(df.head())


                                                url  \
0  https://kurser.ku.dk/course/ndak15006u/2024-2025   
1  https://kurser.ku.dk/course/ndaa09023u/2024-2025   
2  https://kurser.ku.dk/course/ndaa09023u/2025-2026   
3  https://kurser.ku.dk/course/ndak15006u/2025-2026   
4  https://kurser.ku.dk/course/ndaa09013u/2025-2026   

                                               title            volume  \
0         NDAK15006U Advanced Computer Systems (ACS)  Volume 2024/2025   
1  NDAA09023U Advanced Algorithms and Data Struct...  Volume 2024/2025   
2  NDAA09023U Advanced Algorithms and Data Struct...  Volume 2025/2026   
3         NDAK15006U Advanced Computer Systems (ACS)  Volume 2025/2026   
4               NDAA09013U Advanced Programming (AP)  Volume 2025/2026   

                                           education  \
0                  MSc Programme in Computer Science   
1  MSc Programme in Computer Science MSc Programm...   
2  MSc Programme in Computer Science MSc Programm...   


# Scrape data for all links

In [None]:
csv_file = "ku_courses.csv"

df = scrape_courses_to_df_csv(links, csv_file, batch_size=3, max_workers=6)
print(df.head())