In [8]:
import pandas as pd
import os
import time
import random
import logging
from pathlib import Path
import zipfile
from pathlib import Path
import requests

In [9]:
# Load the corpus metadata table. Its index is used further down as the list
# of text IDs to download (see `cc_ids`).
# NOTE(review): absolute, machine-specific path – consider a configurable DATA_DIR.
cc_raw = pd.read_json("/srv/data/corpus-corporum/cc_metadata.json")

In [10]:
# Peek at the first five rows to check the columns and the index.
cc_raw.head(5)

Unnamed: 0,title,author,year,word_count,not_before,not_after,is_earlymodern
10,De facto Ungarie magne1236\n,Riccardus OFM,fl.1236,3340,1236.0,1236.0,False
100,De trinitate,Novatianus,fl. 260,44792,260.0,260.0,False
10000,De quadratura circuli,Franco Leodiensis,,1648,,,False
10001,Epistola ad Berengarium,Frollandus Sylvanectensis,,518,,,False
10002,Diplomata,Gervasius Remensis1055-1067,1055-1067,1532,1055.0,1067.0,False


In [11]:
# The frame's index holds the text IDs; keep them as a plain Python list.
cc_ids = cc_raw.index.tolist()

In [12]:
# Show the first ten IDs as a sanity check.
cc_ids[:10]

[10, 100, 10000, 10001, 10002, 10003, 10004, 10005, 10006, 10007]

In [14]:
# One-off probe: fetch a single archive to see what the endpoint returns
# before running the bulk download. (`requests` is imported in the first cell.)
test_id = 10045  # change if you want

BASE_URL = "https://mlat.uzh.ch/php_modules/download.php"

resp = requests.get(
    BASE_URL,
    params={"type": "file-pos-xml", "idno": str(test_id)},
    timeout=60,
)

# Display the facts the downloader later relies on: status, content type, size.
resp.status_code, resp.headers.get("Content-Type", ""), len(resp.content)


In [21]:
# ==== CONFIG ====

BASE_URL = "https://mlat.uzh.ch/php_modules/download.php"
OUTPUT_DIR = Path("../data/large_data/lemmatized-xmls/")   # directory to save files
# parents=True: the target is nested several levels deep, so create any missing
# intermediate directories instead of raising FileNotFoundError on a fresh checkout.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Max attempts per ID before giving up
MAX_RETRIES = 3

# Minimum acceptable size in bytes for a “real” archive.
# Kept at 1 KB on purpose: genuine archives can be as small as ~3–6 KB, while
# the error responses observed so far are only a few dozen bytes.
MIN_VALID_SIZE = 1_000

# ==== LOGGING ====

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
)


In [22]:
def download_pos_file(idno: int) -> None:
    """
    Download the POS-annotated archive for a given ID and save it strictly
    as ``OUTPUT_DIR/<idno>.zip``.

    Nothing is requested when ``<idno>.zip`` already exists, or when
    ``<idno>.xml`` exists (the notebook's unpacking step renames the extracted
    file to that name and deletes the archive).

    The request is attempted up to ``MAX_RETRIES`` times. Network errors,
    timeouts and transient server statuses (HTTP 429 / 5xx) are retried with a
    linearly growing back-off. Any other non-200 status, a body smaller than
    ``MIN_VALID_SIZE`` bytes, or a body that is not a ZIP archive is treated
    as permanent and skipped without retrying.

    Args:
        idno: Numeric text ID, sent to the server as the ``idno`` query
            parameter and used as the output file name.

    Returns:
        None. Progress and failures are reported with ``print``; a failed
        download does not raise.

    Raises:
        OSError: If the archive cannot be written to ``OUTPUT_DIR``.
    """
    import io  # local import: only needed for the in-memory ZIP check below

    params = {"type": "file-pos-xml", "idno": str(idno)}

    # We always save to:  <output_dir>/<idno>.zip
    out_path = OUTPUT_DIR / f"{idno}.zip"

    # Skip if already downloaded – either still zipped or already unpacked.
    for existing in (out_path, out_path.with_suffix(".xml")):
        if existing.exists():
            print(f"[{idno}] {existing.name} already exists – skipping.")
            return

    for attempt in range(1, MAX_RETRIES + 1):
        try:
            print(f"[{idno}] Attempt {attempt} – requesting…")

            resp = requests.get(
                BASE_URL,
                params=params,
                timeout=60,
                headers={"User-Agent": "PoliteCorpusDownloader/1.0"},
            )

            status = resp.status_code
            body = resp.content
            size = len(body)
            ctype = resp.headers.get("Content-Type", "")

            print(f"[{idno}] HTTP {status}, type={ctype}, size={size} bytes")

            # Rate limiting and server-side errors are transient: route them
            # through the retry/back-off path instead of giving up at once.
            if status == 429 or status >= 500:
                raise requests.exceptions.HTTPError(f"transient HTTP {status}")

            if status != 200:
                print(f"[{idno}] Non-200 status → skipping.")
                return

            if size < MIN_VALID_SIZE:
                print(f"[{idno}] Too small ({size} bytes) → likely error page. Skipping.")
                return

            # Size alone cannot tell a large HTML error page or a truncated
            # transfer from a real archive, so verify the ZIP structure too.
            if not zipfile.is_zipfile(io.BytesIO(body)):
                print(f"[{idno}] Not a valid ZIP archive (type={ctype}) → skipping.")
                return

            # Write to a temporary name and rename afterwards, so an
            # interrupted write never leaves a truncated <idno>.zip that the
            # "already exists" check above would then skip forever.
            part_path = out_path.with_name(out_path.name + ".part")
            try:
                with part_path.open("wb") as f:
                    f.write(body)
                part_path.replace(out_path)
            except BaseException:
                if part_path.exists():
                    part_path.unlink()
                raise

            print(f"[{idno}] Saved as {out_path} ({size} bytes).")
            return

        except requests.exceptions.RequestException as e:
            print(f"[{idno}] Error on attempt {attempt}: {e}")
            # No point in backing off after the final attempt.
            if attempt < MAX_RETRIES:
                sleep_sec = 5 * attempt
                print(f"[{idno}] Sleeping {sleep_sec}s before retry…")
                time.sleep(sleep_sec)

    print(f"[{idno}] FAILED after {MAX_RETRIES} attempts.")

In [24]:
def run_downloader(ids):
    """
    Download the POS archive for every ID in ``ids``, pausing politely
    between requests.

    A randomized 1.0–3.5 s pause follows each ID that may have hit the server,
    and a 60–120 s break is taken after every 200 such IDs. IDs whose
    ``<idno>.zip`` is already on disk are skipped by ``download_pos_file``
    without a request, so no pause is spent on them. No pause follows the
    last ID.

    Args:
        ids: Iterable of text IDs (list, pandas Index, generator, …).

    Returns:
        None. Progress is reported with ``print``. A ``KeyboardInterrupt``
        stops the run early and is not re-raised.
    """
    ids = list(ids)  # materialize so generators/iterables without len() work too
    total = len(ids)
    print(f"Starting download for {total} IDs…")

    requested = 0  # IDs for which a request may actually have been sent

    try:
        for idx, idno in enumerate(ids, start=1):
            print(f"\n=== [{idx}/{total}] ID {idno} ===")

            # download_pos_file returns without a request when the archive is
            # already present; remember that so we do not sleep for nothing.
            already_on_disk = (OUTPUT_DIR / f"{idno}.zip").exists()
            download_pos_file(idno)
            if already_on_disk:
                continue

            requested += 1
            if idx == total:
                break  # nothing left to be polite about

            # Gentle randomized pause
            pause = random.uniform(1.0, 3.5)
            print(f"[{idno}] Sleeping {pause:.1f}s…")
            time.sleep(pause)

            # Optional long break every 200 requested IDs
            if requested % 200 == 0:
                long_pause = random.uniform(60, 120)
                print(f"[batch] {idx} processed → long break {long_pause:.0f}s.")
                time.sleep(long_pause)

    except KeyboardInterrupt:
        print("Interrupted by user.")
        return  # do not report "Done." for a partial run

    print("Done.")

In [None]:
# Full crawl over every ID from the metadata. Long-running: the downloader
# pauses between IDs. IDs whose archive is already on disk are skipped, so the
# cell can be re-run after an interruption.
run_downloader(cc_ids)

Starting download for 7819 IDs…

=== [1/7819] ID 10 ===
[10] Attempt 1 – requesting…
[10] HTTP 200, type=application/octet-stream, size=21646 bytes
[10] Saved as ../data/large_data/lemmatized-xmls/10.zip (21646 bytes).
[10] Sleeping 3.0s…

=== [2/7819] ID 100 ===
[100] Attempt 1 – requesting…
[100] HTTP 200, type=application/octet-stream, size=192401 bytes
[100] Saved as ../data/large_data/lemmatized-xmls/100.zip (192401 bytes).
[100] Sleeping 2.7s…

=== [3/7819] ID 10000 ===
[10000] Attempt 1 – requesting…
[10000] HTTP 200, type=application/octet-stream, size=10753 bytes
[10000] Saved as ../data/large_data/lemmatized-xmls/10000.zip (10753 bytes).
[10000] Sleeping 1.2s…

=== [4/7819] ID 10001 ===
[10001] Attempt 1 – requesting…
[10001] HTTP 200, type=application/octet-stream, size=5018 bytes
[10001] Too small (5018 bytes) → likely error page. Skipping.
[10001] Sleeping 1.2s…

=== [5/7819] ID 10002 ===
[10002] Attempt 1 – requesting…
[10002] HTTP 200, type=application/octet-stream, size

In [6]:
# Number of entries currently in the output directory (every entry, not only .zip).
len(os.listdir(OUTPUT_DIR))

6224

In [16]:
# List the output directory once and keep the names in a set: the previous
# version re-read the directory and scanned the whole listing for every ID.
# An ID counts as present if its archive (<id>.zip) or the XML unpacked from
# it (<id>.xml) is on disk.
downloaded_stems = {
    entry.stem
    for entry in OUTPUT_DIR.iterdir()
    if entry.suffix in {".zip", ".xml"}
}
not_downloaded = [idno for idno in cc_ids if str(idno) not in downloaded_stems]

In [18]:
# How many IDs still have no archive on disk.
len(not_downloaded)

1595

In [25]:
# Sample of the missing IDs.
not_downloaded[:10]

[10001, 10009, 10010, 10012, 10016, 10017, 10018, 10025, 10027, 10034]

In [None]:
# Second pass, restricted to the IDs that are still missing.
run_downloader(not_downloaded)

Starting download for 1595 IDs…

=== [1/1595] ID 10001 ===
[10001] Attempt 1 – requesting…
[10001] HTTP 200, type=application/octet-stream, size=5018 bytes
[10001] Saved as ../data/large_data/lemmatized-xmls/10001.zip (5018 bytes).
[10001] Sleeping 1.9s…

=== [2/1595] ID 10009 ===
[10009] Attempt 1 – requesting…
[10009] HTTP 200, type=application/octet-stream, size=5850 bytes
[10009] Saved as ../data/large_data/lemmatized-xmls/10009.zip (5850 bytes).
[10009] Sleeping 3.4s…

=== [3/1595] ID 10010 ===
[10010] Attempt 1 – requesting…
[10010] HTTP 200, type=application/octet-stream, size=3428 bytes
[10010] Saved as ../data/large_data/lemmatized-xmls/10010.zip (3428 bytes).
[10010] Sleeping 3.2s…

=== [4/1595] ID 10012 ===
[10012] Attempt 1 – requesting…
[10012] HTTP 200, type=text/html; charset=UTF-8, size=21 bytes
[10012] Too small (21 bytes) → likely error page. Skipping.
[10012] Sleeping 3.1s…

=== [5/1595] ID 10016 ===
[10016] Attempt 1 – requesting…
[10016] HTTP 200, type=application/

In [33]:
# Unpack every downloaded archive in OUTPUT_DIR to <id>.xml and delete the
# archive once its content has been written successfully. Archives that cannot
# be processed are reported and left in place.
zip_files = sorted(OUTPUT_DIR.glob("*.zip"))

print(f"Found {len(zip_files)} ZIP archives.\n")

for zip_path in zip_files:
    print(f"→ Processing {zip_path.name}")

    # Extract ID from filename (strip extension)
    idno = zip_path.stem                     # e.g. "144" from "144.zip"
    new_name = f"{idno}.xml"                 # target filename
    final_path = OUTPUT_DIR / new_name

    try:
        with zipfile.ZipFile(zip_path, 'r') as z:
            # Real files only – directory entries end with "/"
            members = [m for m in z.namelist() if not m.endswith("/")]

            if not members:
                print(f"   ERROR: ZIP archive {zip_path.name} is empty.")
                continue

            # Expecting exactly one XML file inside; prefer an .xml member and
            # fall back to the first file if none carries that extension.
            xml_members = [m for m in members if m.lower().endswith(".xml")]
            internal_name = (xml_members or members)[0]

            if len(members) > 1:
                print(f"   WARNING: {len(members)} files in archive; using {internal_name}")

            print(f"   Extracting {internal_name} → {new_name}")

            # Stream the member straight to its final name. Extracting it under
            # its archive-internal name first could overwrite an unrelated file
            # in OUTPUT_DIR and leave stray subdirectories for nested members.
            with z.open(internal_name) as src, final_path.open("wb") as dst:
                for chunk in iter(lambda: src.read(1024 * 1024), b""):
                    dst.write(chunk)

        # Delete the .zip file only after the XML has been written completely
        zip_path.unlink()
        print(f"   Saved as {new_name} and removed {zip_path.name}")

    except zipfile.BadZipFile:
        print(f"   ERROR: Bad ZIP file {zip_path.name}")
    except Exception as e:
        # Deliberately broad: one broken archive must not stop the whole batch.
        print(f"   ERROR while processing {zip_path.name}: {e}")

print("\nDone.")

Found 9 ZIP archives.

→ Processing 10.zip
   Extracting Riccardus_De-facto-Ungarie-magne.POS.xml → 10.xml
   Saved as 10.xml and removed 10.zip
→ Processing 100.zip
   Extracting 003_Novatianus_De-trinitate.POS.xml → 100.xml
   Saved as 100.xml and removed 100.zip
→ Processing 10000.zip
   Extracting 143_Franco-Leodiensis_De-quadratura-circuli.POS.xml → 10000.xml
   Saved as 10000.xml and removed 10000.zip
→ Processing 10002.zip
   Extracting 143_Gervasius-Remensis_Diplomata.POS.xml → 10002.xml
   Saved as 10002.xml and removed 10002.zip
→ Processing 10003.zip
   Extracting 143_Gozechinus-Scholasticus_Epistola-ad-Valcherum.POS.xml → 10003.xml
   Saved as 10003.xml and removed 10003.zip
→ Processing 10004.zip
   Extracting 143_Hermannus-Contractus_Carmen-de-conflictu-ovis-et-lini.POS.xml → 10004.xml
   Saved as 10004.xml and removed 10004.zip
→ Processing 10005.zip
   Extracting 143_Hermannus-Contractus_Chronicon.POS.xml → 10005.xml
   Saved as 10005.xml and removed 10005.zip
→ Process