# LSJ Greek Headwords (Lemmas) Scraper

This notebook scrapes Greek lexicon headwords (lemmas) from Perseus (LSJ: Liddell–Scott–Jones).
It uses `requests` and `BeautifulSoup` to fetch LSJ index/letter pages, discover entry-group pages,
and extract headwords from LSJ entry links. Results can be saved to CSV or JSON.

References:
- LSJ Index: https://www.perseus.tufts.edu/hopper/text?doc=Perseus:text:1999.04.0057
- Example entry-group page: https://www.perseus.tufts.edu/hopper/text?doc=Perseus:text:1999.04.0057:alphabetic+letter=*a:entry+group=221


In [None]:
# If needed, install dependencies into the active environment
%pip -q install requests beautifulsoup4

In [None]:
import csv
import json
import sys
import time
from dataclasses import dataclass, asdict
from typing import List, Iterable, Optional
from urllib.parse import unquote

import requests
from bs4 import BeautifulSoup

# Root page of the LSJ text on Perseus.
BASE_INDEX_URL = "https://www.perseus.tufts.edu/hopper/text?doc=Perseus:text:1999.04.0057"
# Substring targets, matched against hrefs after URL-decoding.
# An href counts as an entry-group link only when it contains BOTH of the
# following two fragments.
ENTRY_GROUP_SUBSTR = "Perseus:text:1999.04.0057:alphabetic+letter="
ENTRY_GROUP_AND = ":entry+group="
# An href containing this fragment is treated as a link to a single entry.
ENTRY_PAGE_SUBSTR = "Perseus:text:1999.04.0057:entry="

# Headers sent with every request made by fetch(); the User-Agent identifies
# this scraper to the server.
HEADERS = {
    'User-Agent': 'AI-in-Classics LSJ Notebook Scraper (+https://github.com/ahulloli/AI-in-Classics)'
}

@dataclass
class Headword:
    """One scraped lexicon headword and the link it was taken from."""

    # Visible text of the entry link.
    lemma: str
    # Absolute URL of the entry the link points to.
    url: str
    # Name of the lexicon the headword comes from.
    source: str = "LSJ"
    # Language tag for the lemma ("grc" is presumably the ISO 639 code for
    # Ancient Greek).
    language: str = "grc"

def fetch(url: str, session: Optional[requests.Session] = None, timeout: int = 45) -> str:
    """Download *url* and return the response body as text.

    Args:
        url: Address to request with HTTP GET.
        session: Session to reuse for the request. When omitted, a temporary
            session is created for this one call and closed afterwards.
        timeout: Timeout in seconds passed to the request.

    Returns:
        The decoded response body (``Response.text``).

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: For connection problems and timeouts.
    """
    owns_session = session is None
    s = requests.Session() if owns_session else session
    try:
        r = s.get(url, headers=HEADERS, timeout=timeout)
        r.raise_for_status()
        return r.text
    finally:
        # Only close what we created; a caller-supplied session stays open
        # so it can keep reusing its connections.
        if owns_session:
            s.close()

def to_abs(href: str) -> str:
    """Turn an href found on a Perseus page into an absolute URL.

    Args:
        href: Link target as it appears in the page.

    Returns:
        ``href`` unchanged when it already starts with ``http://`` or
        ``https://``; otherwise a URL on ``www.perseus.tufts.edu``. Targets
        that start with ``/hopper/`` are appended to the site root, and
        anything else is treated as relative to ``/hopper/``.
    """
    site = 'https://www.perseus.tufts.edu'
    if href.startswith('/hopper/'):
        return f'{site}{href}'
    # Test for the full scheme prefix so that a relative name that merely
    # begins with "http" is still resolved against the site.
    if href.startswith(('http://', 'https://')):
        return href
    # Strip leading slashes outside the f-string: reusing the enclosing quote
    # character inside a replacement field is a syntax error before Python 3.12.
    relative = href.lstrip('/')
    return f'{site}/hopper/{relative}'

def extract_entry_group_links(html: str) -> List[str]:
    """Collect the entry-group links contained in a page.

    Args:
        html: Markup of the page to scan.

    Returns:
        Absolute URLs of every ``<a href>`` whose URL-decoded target contains
        both ``ENTRY_GROUP_SUBSTR`` and ``ENTRY_GROUP_AND``, in document order
        with repeated URLs removed.
    """
    document = BeautifulSoup(html, 'html.parser')
    matches: List[str] = []
    for anchor in document.find_all('a', href=True):
        raw = anchor['href']
        decoded = unquote(raw)
        if ENTRY_GROUP_SUBSTR not in decoded or ENTRY_GROUP_AND not in decoded:
            continue
        matches.append(to_abs(raw))
    # Dict keys keep insertion order, so this drops repeats while preserving
    # the order of first appearance.
    return list(dict.fromkeys(matches))

def generate_letter_urls() -> List[str]:
    """Build the URL of the LSJ page for each of the 24 letter codes.

    Returns:
        One URL per letter code, in the order the codes are listed below
        (the codes look like Beta Code capitals, alpha through omega).
    """
    prefix = "https://www.perseus.tufts.edu/hopper/text?doc=Perseus:text:1999.04.0057:alphabetic+letter="
    codes = "*a *b *g *d *e *z *h *q *i *k *l *m *n *c *o *p *r *s *t *u *f *x *y *w".split()
    urls = []
    for code in codes:
        urls.append(prefix + code)
    return urls

def extract_headwords_from_entry_group(html: str) -> List[Headword]:
    """Extract the headwords linked from an entry-group page.

    Args:
        html: Markup of the entry-group page.

    Returns:
        A ``Headword`` for every ``<a href>`` whose URL-decoded target
        contains ``ENTRY_PAGE_SUBSTR`` and whose stripped link text is not
        empty. The lemma is the link text and the URL is the absolute form of
        the href. Results are in document order and may contain repeats.
    """
    document = BeautifulSoup(html, 'html.parser')
    found: List[Headword] = []
    for anchor in document.find_all('a', href=True):
        raw = anchor['href']
        if ENTRY_PAGE_SUBSTR not in unquote(raw):
            continue
        label = anchor.get_text(strip=True)
        if label:
            found.append(Headword(lemma=label, url=to_abs(raw)))
    return found

def write_csv(path: str, items: Iterable[Headword]) -> None:
    """Write headwords to a UTF-8 CSV file, replacing any existing file.

    Args:
        path: Destination file path.
        items: Headwords to write, one row each, after a header row with the
            columns ``language``, ``source``, ``lemma``, ``url``.
    """
    header = ['language', 'source', 'lemma', 'url']
    # newline='' lets the csv module control line endings itself.
    with open(path, 'w', newline='', encoding='utf-8') as handle:
        writer = csv.writer(handle)
        writer.writerow(header)
        writer.writerows(
            [hw.language, hw.source, hw.lemma, hw.url] for hw in items
        )

def write_json(path: str, items: Iterable[Headword]) -> None:
    """Write headwords to a UTF-8 JSON file, replacing any existing file.

    Args:
        path: Destination file path.
        items: Headwords to serialise. The output is a JSON array with one
            object per headword, indented by two spaces, with non-ASCII
            characters written as-is rather than escaped.
    """
    with open(path, 'w', encoding='utf-8') as handle:
        records = list(map(asdict, items))
        json.dump(records, handle, ensure_ascii=False, indent=2)


## Collect LSJ entry-group links via letter pages
We generate the letter-page URLs deterministically (one per Greek letter) and extract the entry-group links from each page.

In [None]:
# One shared session so every request in the notebook reuses connections.
session = requests.Session()

# Fetch index (optional; we don't rely on it, but useful to ensure connectivity).
# This call is deliberately outside any try block: if the site is unreachable,
# the cell fails here instead of printing one warning per letter.
_ = fetch(BASE_INDEX_URL, session=session)

letter_urls = generate_letter_urls()
print(f'Letters: {len(letter_urls)}')

# entry_groups keeps first-seen order; seen_groups gives O(1) duplicate checks.
entry_groups = []
seen_groups = set()
for i, u in enumerate(letter_urls, 1):
    try:
        html = fetch(u, session=session)
        groups = extract_entry_group_links(html)
        print(f'[{i}/{len(letter_urls)}] {u} -> {len(groups)} groups')
        for g in groups:
            if g not in seen_groups:
                seen_groups.add(g)
                entry_groups.append(g)
    except Exception as e:
        # Best effort: report the failing letter page and carry on.
        print('WARN letter', u, e, file=sys.stderr)
    # Polite delay between requests. It sits outside the try block so it is
    # also applied after a failed request.
    time.sleep(0.4)

print('Total entry groups collected:', len(entry_groups))
# Last expression of the cell: displays the first few collected URLs.
entry_groups[:5]

## Extract headwords from entry-group pages
This step visits each entry-group page, collects the headwords (lemmas) from its entry links, and then deduplicates them by lemma, keeping the first occurrence.

In [None]:
# Every headword found, in crawl order; may contain repeated lemmas.
headwords: List[Headword] = []
for i, g in enumerate(entry_groups, 1):
    try:
        html = fetch(g, session=session)
        items = extract_headwords_from_entry_group(html)
        headwords.extend(items)
        if i % 10 == 0:
            print(f'Processed {i}/{len(entry_groups)} groups; total headwords so far: {len(headwords)}')
    except Exception as e:
        # Best effort: report the failing entry-group page and carry on.
        print('WARN group', g, e, file=sys.stderr)
    # Polite delay between requests. It sits outside the try block so it is
    # also applied after a failed request.
    time.sleep(0.4)

# Deduplicate by lemma (keep first)
seen = set()
unique_hw: List[Headword] = []
for hw in headwords:
    if hw.lemma in seen:
        continue
    seen.add(hw.lemma)
    unique_hw.append(hw)

# Last expression of the cell: displays the number of distinct lemmas.
len(unique_hw)

## Save results to CSV (and optionally JSON)

In [None]:
# Output locations. These are relative paths, so they resolve against the
# current working directory; the target directory must already exist because
# the writers open the files directly.
OUT_CSV = 'src/Lemmatizer-GRK/lsj_headwords.csv'
OUT_JSON = 'src/Lemmatizer-GRK/lsj_headwords.json'

# Write the deduplicated headwords (header row plus one row per lemma).
write_csv(OUT_CSV, unique_hw)
print('Wrote CSV:', OUT_CSV)
# Optionally also write JSON: uncomment the two lines below to produce the
# same data as a JSON array at OUT_JSON.
# write_json(OUT_JSON, unique_hw)
# print('Wrote JSON:', OUT_JSON)


## Preview a few headwords

In [None]:
# Display (lemma, url) pairs for the first 15 deduplicated headwords.
[(hw.lemma, hw.url) for hw in unique_hw[:15]]