In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
Scraper for KU grade-histogram pages.
Provides

    scrape_course_data(url)      -> dict   # one page, fault-tolerant
    scrape_courses_to_df(urls,n) -> DataFrame   # many pages (n workers)

If a field cannot be parsed it is set to None, so callers never see exceptions.
"""

from __future__ import annotations
import re
from typing import List, Dict, Any, Optional
from concurrent.futures import ThreadPoolExecutor, as_completed

import requests
import pandas as pd
from bs4 import BeautifulSoup as BS

# ──────────────────────────────────────────────────────────────────────────
HEADERS = {"User-Agent": "KU-stats demo scraper (https://github.com/you)"}

DK_GRADE_MAP = {
    "ej mødt":      "Absent",
    "ikke bestået": "Failed",
    "bestået":      "Passed",
}

clean     = lambda s: re.sub(r"\s+", " ", s).strip()
dk2en     = lambda s: DK_GRADE_MAP.get(s.lower(), s)
safe_get  = lambda seq, i: seq[i] if i < len(seq) else None

# ──────────────────────────────────────────────────────────────────────────
def _parse_grade_table(table: Optional[BS]) -> Optional[List[Dict[str, Any]]]:
    if table is None:
        return None
    try:
        parsed = []
        for tr in table.select("tr")[1:]:                 # skip header row
            cells = tr.select("td")
            if len(cells) < 2:
                continue
            parsed.append({
                "grade": dk2en(clean(cells[0].text)),
                "count": int(clean(cells[1].text)),
            })
        return parsed or None
    except Exception:
        return None

# ──────────────────────────────────────────────────────────────────────────
def scrape_course_data(url: str) -> Dict[str, Any]:
    """Scrape one histogram page, return dict with metadata + grade tables."""
    out: Dict[str, Any] = {
        "title":     None, "faculty": None, "institute": None,
        "term":      None, "ects": None,   "url": url,
        "exam":      None, "re_exam": None,
    }
    try:
        html = requests.get(url, headers=HEADERS, timeout=10).text
        soup = BS(html, "lxml")

        out["title"] = clean(soup.select_one("h2").text)

        outer = soup.select_one("form table")
        tds   = [clean(td.text) for td in outer.select(":scope > tr > td")]

        out["faculty"]   = safe_get(tds, 1)
        out["institute"] = safe_get(tds, 3)
        out["term"]      = safe_get(tds, 5)
        out["ects"]      = safe_get(tds, 7)

        grade_tbls: List[BS] = []
        for h3 in outer.find_all("h3"):
            if "resultater" in h3.get_text(strip=True).lower():
                nxt = h3.find_next_sibling("table")
                if nxt:
                    grade_tbls.append(nxt)

        out["exam"]    = _parse_grade_table(grade_tbls[0] if grade_tbls else None)
        out["re_exam"] = _parse_grade_table(grade_tbls[1] if len(grade_tbls) > 1 else None)

    except Exception:
        pass                                              # swallow all errors
    return out

# ──────────────────────────────────────────────────────────────────────────
def scrape_courses_to_df(urls: List[str], max_workers: int = 10) -> pd.DataFrame:
    """Scrape many URLs concurrently and return a pandas DataFrame."""
    results: List[Dict[str, Any]] = []
    with ThreadPoolExecutor(max_workers=max_workers) as ex:
        futs = {ex.submit(scrape_course_data, u): u for u in urls}
        for fut in as_completed(futs):
            results.append(fut.result())
    return pd.DataFrame(results)

# ──────────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    urls = [
        "http://karakterstatistik.stads.ku.dk/Histogram/ASTB17277E/Summer-2023",
        "http://karakterstatistik.stads.ku.dk/Histogram/NDAA09023E/Winter-2022",
    ]

    df = scrape_courses_to_df(urls, 10)
    print(df.head())


                                               title  \
0  "Os og dem?" - Indvandrings- og integrationspo...   
1            Advanced Algorithms and Data Structures   

                                    faculty                   institute term  \
0       Det Samfundsvidenskabelige Fakultet  Institut for Statskundskab  s23   
1  Det Natur- og Biovidenskabelige Fakultet         Datalogisk Institut  v22   

  ects                                                url  \
0  7,5  http://karakterstatistik.stads.ku.dk/Histogram...   
1  7,5  http://karakterstatistik.stads.ku.dk/Histogram...   

                                                exam  \
0                                               None   
1  [{'grade': '12', 'count': 30}, {'grade': '10',...   

                                             re_exam  
0                                               None  
1  [{'grade': '12', 'count': 3}, {'grade': '10', ...  


In [2]:
urls = [
    "http://karakterstatistik.stads.ku.dk/Histogram/ASTB17277E/Summer-2023",
    "http://karakterstatistik.stads.ku.dk/Histogram/NDAA09023E/Winter-2022",
]

df = scrape_courses_to_df(urls, 10)
df.head()


Unnamed: 0,title,faculty,institute,term,ects,url,exam,re_exam
0,"""Os og dem?"" - Indvandrings- og integrationspo...",Det Samfundsvidenskabelige Fakultet,Institut for Statskundskab,s23,75,http://karakterstatistik.stads.ku.dk/Histogram...,,
1,Advanced Algorithms and Data Structures,Det Natur- og Biovidenskabelige Fakultet,Datalogisk Institut,v22,75,http://karakterstatistik.stads.ku.dk/Histogram...,"[{'grade': '12', 'count': 30}, {'grade': '10',...","[{'grade': '12', 'count': 3}, {'grade': '10', ..."


In [3]:
# ──────────────────────────────────────────────────────────────────────────
def save_df_to_csv(df: pd.DataFrame, path: str, *, index: bool = False) -> None:
    """
    Save the DataFrame to CSV (UTF-8).  By default the index is not written.
    """
    df.to_csv(path, index=index, encoding="utf-8")


In [4]:
urls = [
    "http://karakterstatistik.stads.ku.dk/Histogram/ASTB17277E/Summer-2023",
    "http://karakterstatistik.stads.ku.dk/Histogram/NDAA09023E/Winter-2022",
]

df = scrape_courses_to_df(urls)
save_df_to_csv(df, "ku_grade_histograms.csv")


# Get Links

In [9]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import re
import pandas as pd

import requests, re, json, sys
from bs4 import BeautifulSoup as BS

# def scrape_links():
#     # Setup Chrome headless mode
#     options = Options()
#     options.add_argument("--headless")
#     options.add_argument("--disable-gpu")

#     # Start the browser
#     driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

#     # Open the page
#     #url = "https://karakterstatistik.stads.ku.dk/"
#     url = "https://karakterstatistik.stads.ku.dk/#searchText=&term=&block=&institute=1932&faculty=1868&searchingCourses=true&page=1"
#     driver.get(url)

#     # Click the "Søg" button
#     search_button = driver.find_element(By.XPATH, "//input[@type='submit' and @value='Søg']")
#     search_button.click()

#     # Wait up to 10 seconds for the first result link to appear
#     try:
#         WebDriverWait(driver, 10).until(
#             EC.presence_of_element_located((By.CSS_SELECTOR, "#searchResults a[href]"))
#         )
#     except Exception as e:
#         print("Timeout waiting for search results.")
#         driver.quit()
#         exit()

#     # Now extract all links in the search results section
#     links = driver.find_elements(By.CSS_SELECTOR, "#searchResults a[href]")

#     # Filter links that match the desired pattern
#     #pattern = re.compile(r"^http://karakterstatistik\.stads\.ku\.dk/Histogram/.+/Summer-\d{4}$")
#     pattern = re.compile(r"^http://karakterstatistik\.stads\.ku\.dk/Histogram/")
#     filtered_links = [link.get_attribute("href") for link in links if pattern.match(link.get_attribute("href"))]

#     # # Output results
#     # for href in filtered_links:
#     #     print(href)

#     driver.quit()
#     return filtered_links
def scrape_links():
    from selenium import webdriver
    from selenium.webdriver.chrome.service import Service
    from selenium.webdriver.common.by import By
    from selenium.webdriver.chrome.options import Options
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    from webdriver_manager.chrome import ChromeDriverManager
    import re

    # Setup Chrome in headless mode
    options = Options()
    options.add_argument("--headless")
    options.add_argument("--disable-gpu")
    options.add_argument("--no-sandbox")

    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

    try:
        url = "https://karakterstatistik.stads.ku.dk/#searchText=&term=&block=&institute=1932&faculty=1868&searchingCourses=true&page=1"
        driver.get(url)

        # Click "Søg"
        search_button = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.XPATH, "//input[@type='submit' and @value='Søg']"))
        )
        search_button.click()

        # Wait for new results to load after the click
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "#searchResults a[href]"))
        )

        # Re-fetch elements after DOM update
        page_links = driver.find_elements(By.CSS_SELECTOR, "#searchResults a[href]")

        pattern = re.compile(r"^http://karakterstatistik\.stads\.ku\.dk/Histogram/")
        filtered_links = []

        for link in page_links:
            try:
                href = link.get_attribute("href")
                if href and pattern.match(href):
                    filtered_links.append(href)
            except:
                continue  # silently skip stale elements

    except Exception as e:
        print("Error while waiting for or processing results:", e)
        filtered_links = []

    finally:
        driver.quit()

    return filtered_links


In [10]:
df_links = scrape_links()
print(df_links)

['http://karakterstatistik.stads.ku.dk/Histogram/NDAA09023E/Winter-2024/B1', 'http://karakterstatistik.stads.ku.dk/Histogram/NDAA09023E/Winter-2024/B2', 'http://karakterstatistik.stads.ku.dk/Histogram/NDAA09023E/Summer-2024/B4', 'http://karakterstatistik.stads.ku.dk/Histogram/NDAA09023E/Winter-2023/B2', 'http://karakterstatistik.stads.ku.dk/Histogram/NDAA09023E/Winter-2022/B2', 'http://karakterstatistik.stads.ku.dk/Histogram/NDAA09023E/Winter-2021/B1', 'http://karakterstatistik.stads.ku.dk/Histogram/NDAA09023E/Winter-2020/B1', 'http://karakterstatistik.stads.ku.dk/Histogram/NDAK15006E/Winter-2024/B2', 'http://karakterstatistik.stads.ku.dk/Histogram/NDAK15006E/Winter-2023/B2', 'http://karakterstatistik.stads.ku.dk/Histogram/NDAK15006E/Winter-2022/B2', 'http://karakterstatistik.stads.ku.dk/Histogram/NDAK15006E/Winter-2021/B2', 'http://karakterstatistik.stads.ku.dk/Histogram/NDAK15006E/Winter-2020/B2', 'http://karakterstatistik.stads.ku.dk/Histogram/NDAK22002E/Summer-2024/B4', 'http://kar

# Scrape all links

In [11]:

# df = scrape_courses_to_df(df_links, 10)
# save_df_to_csv(df, "ku_grade_histograms.csv")


# Bached scraping

In [11]:
import os
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import List, Tuple

# ──────────────────────────────────────────────────────────────────────────
def scrape_urls_to_csv(
    urls: List[str],
    csv_path: str,
    *,
    batch_size: int = 10,
    max_workers: int = 10,
    start_idx: int = 0,
    write_index: bool = False,
) -> Tuple[pd.DataFrame, int]:
    """
    Scrape `urls[start_idx:]` in batches of `batch_size`, append each finished
    batch to `csv_path`, and return:

        (df_all_new_rows, next_start_idx)

    • If the CSV already exists, new rows are *appended* (header written only
      for a fresh file).  
    • If you cancel the run, re-supply the returned `next_start_idx`
      to resume without re-scraping previous rows.
    """
    # Prepare CSV header logic
    file_exists = os.path.isfile(csv_path)
    header_needed = not file_exists and start_idx == 0

    all_rows: List[dict] = []
    next_idx = start_idx

    while next_idx < len(urls):
        batch = urls[next_idx : next_idx + batch_size]

        # ───── scrape one batch in parallel ─────
        batch_rows: List[dict] = []
        with ThreadPoolExecutor(max_workers=max_workers) as ex:
            futs = {ex.submit(scrape_course_data, u): u for u in batch}
            for fut in as_completed(futs):
                batch_rows.append(fut.result())

        # Append to running list (for return value)
        all_rows.extend(batch_rows)

        # Append to CSV on disk
        pd.DataFrame(batch_rows).to_csv(
            csv_path,
            mode="a",
            header=header_needed,
            index=write_index,
            encoding="utf-8",
        )
        header_needed = False            # header only for first write

        # Advance pointer
        next_idx += len(batch)

    return pd.DataFrame(all_rows), next_idx


In [12]:
urls = [
    "http://karakterstatistik.stads.ku.dk/Histogram/ASTB17277E/Summer-2023",
    "http://karakterstatistik.stads.ku.dk/Histogram/NDAA09023E/Winter-2022",
    # ... more links
]

csv_file = "ku_histograms2.csv"

# First run (start_idx defaults to 0)
df, next_i = scrape_urls_to_csv(df_links, csv_file, batch_size=10)
print(f"Finished up to index {next_i-1}")

# If you cancel halfway and later want to resume:
# df, next_i = scrape_urls_to_csv(urls, csv_file, start_idx=next_i)
df


Finished up to index 35


Unnamed: 0,title,faculty,institute,term,ects,url,exam,re_exam
0,Advanced Algorithms and Data Structures,Det Natur- og Biovidenskabelige Fakultet,Datalogisk Institut,s24/B4,75,http://karakterstatistik.stads.ku.dk/Histogram...,,
1,Advanced Algorithms and Data Structures,Det Natur- og Biovidenskabelige Fakultet,Datalogisk Institut,v23/B2,75,http://karakterstatistik.stads.ku.dk/Histogram...,"[{'grade': '12', 'count': 38}, {'grade': '10',...","[{'grade': '12', 'count': 3}, {'grade': '10', ..."
2,Advanced Computer Systems,Det Natur- og Biovidenskabelige Fakultet,Datalogisk Institut,v24/B2,75,http://karakterstatistik.stads.ku.dk/Histogram...,"[{'grade': '12', 'count': 5}, {'grade': '10', ...","[{'grade': '12', 'count': 2}, {'grade': '10', ..."
3,Advanced Algorithms and Data Structures,Det Natur- og Biovidenskabelige Fakultet,Datalogisk Institut,v20/B1,75,http://karakterstatistik.stads.ku.dk/Histogram...,"[{'grade': '12', 'count': 19}, {'grade': '10',...","[{'grade': '12', 'count': 3}, {'grade': '10', ..."
4,Advanced Algorithms and Data Structures,Det Natur- og Biovidenskabelige Fakultet,Datalogisk Institut,v24/B1,75,http://karakterstatistik.stads.ku.dk/Histogram...,,
5,Advanced Computer Systems,Det Natur- og Biovidenskabelige Fakultet,Datalogisk Institut,v23/B2,75,http://karakterstatistik.stads.ku.dk/Histogram...,"[{'grade': '12', 'count': 13}, {'grade': '10',...","[{'grade': '12', 'count': 0}, {'grade': '10', ..."
6,Advanced Computer Systems,Det Natur- og Biovidenskabelige Fakultet,Datalogisk Institut,v22/B2,75,http://karakterstatistik.stads.ku.dk/Histogram...,"[{'grade': '12', 'count': 12}, {'grade': '10',...","[{'grade': '12', 'count': 1}, {'grade': '10', ..."
7,Advanced Algorithms and Data Structures,Det Natur- og Biovidenskabelige Fakultet,Datalogisk Institut,v22/B2,75,http://karakterstatistik.stads.ku.dk/Histogram...,"[{'grade': '12', 'count': 30}, {'grade': '10',...","[{'grade': '12', 'count': 3}, {'grade': '10', ..."
8,Advanced Algorithms and Data Structures,Det Natur- og Biovidenskabelige Fakultet,Datalogisk Institut,v24/B2,75,http://karakterstatistik.stads.ku.dk/Histogram...,"[{'grade': '12', 'count': 21}, {'grade': '10',...","[{'grade': '12', 'count': 3}, {'grade': '10', ..."
9,Advanced Algorithms and Data Structures,Det Natur- og Biovidenskabelige Fakultet,Datalogisk Institut,v21/B1,75,http://karakterstatistik.stads.ku.dk/Histogram...,"[{'grade': '12', 'count': 28}, {'grade': '10',...","[{'grade': '12', 'count': 3}, {'grade': '10', ..."


# Load resulting table

In [13]:
import pandas as pd
import ast
from typing import List, Dict, Any

def load_histogram_csv(path: str) -> pd.DataFrame:
    """
    Read the CSV produced by `scrape_urls_to_csv`.

    The `exam` and `re_exam` columns were stored as stringified lists of dicts.
    This loader converts them back to Python objects so you can work with them
    directly.
    """
    # Read the raw CSV (everything is text at this point)
    df = pd.read_csv(path, dtype=str)

    # Convert columns that hold lists of dicts
    def _to_list_of_dicts(x: str) -> List[Dict[str, Any]] | None:
        x = x.strip()
        return ast.literal_eval(x) if x and x != "None" else None

    for col in ("exam", "re_exam"):
        if col in df.columns:
            df[col] = df[col].apply(_to_list_of_dicts)

    return df


In [15]:
df = pd.read_csv("ku_histograms.csv")
df.head()

Unnamed: 0,title,faculty,institute,term,ects,url,exam,re_exam
0,"""Os og dem?"" - Indvandrings- og integrationspo...",Det Samfundsvidenskabelige Fakultet,Institut for Statskundskab,s23,75,http://karakterstatistik.stads.ku.dk/Histogram...,,
1,Advanced Algorithms and Data Structures,Det Natur- og Biovidenskabelige Fakultet,Datalogisk Institut,v22,75,http://karakterstatistik.stads.ku.dk/Histogram...,"[{'grade': '12', 'count': 30}, {'grade': '10',...","[{'grade': '12', 'count': 3}, {'grade': '10', ..."
2,"""Os og dem?"" - Indvandrings- og integrationspo...",Det Samfundsvidenskabelige Fakultet,Institut for Statskundskab,s23,75,http://karakterstatistik.stads.ku.dk/Histogram...,,
3,Advanced Algorithms and Data Structures,Det Natur- og Biovidenskabelige Fakultet,Datalogisk Institut,v22,75,http://karakterstatistik.stads.ku.dk/Histogram...,"[{'grade': '12', 'count': 30}, {'grade': '10',...","[{'grade': '12', 'count': 3}, {'grade': '10', ..."
4,"""Os og dem?"" - Indvandrings- og integrationspo...",Det Samfundsvidenskabelige Fakultet,Institut for Statskundskab,s20,75,http://karakterstatistik.stads.ku.dk/Histogram...,"[{'grade': '12', 'count': 3}, {'grade': '10', ...",
