# Perseus Atlas Greek Lemmas and Conjugations Scraper

This notebook scrapes Greek lemmas and their conjugated/inflected forms from the Perseus Atlas.
It uses `requests` and `BeautifulSoup` to fetch paginated lemma lists and individual lemma pages.

**References:**
- Atlas Lemma List: https://atlas.perseus.tufts.edu/lemmas/?lang=grc
- Individual lemma pages contain inflected forms and morphological data

**Data Collected:**
- Lemma headwords
- Atlas IDs and URLs
- Inflected forms (conjugations for verbs, declensions for nouns)
- Morphological information (only when the lemma page exposes it next to a form)

In [None]:
# If needed, install dependencies into the active environment.
# Installs the three third-party packages imported below (requests, bs4, pandas);
# -q keeps pip's output quiet.
%pip -q install requests beautifulsoup4 pandas

In [None]:
import csv
import json
import sys
import time
import re
from dataclasses import dataclass, asdict
from typing import List, Iterable, Optional, Dict
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
import pandas as pd

# Root of the Atlas site; to_abs() resolves relative hrefs against it.
ATLAS_BASE = "https://atlas.perseus.tufts.edu"
# Lemma listing filtered by lang=grc; scrape_lemma_list_page() appends "&page=N".
LEMMA_LIST_URL = f"{ATLAS_BASE}/lemmas/?lang=grc"

# Headers sent with every request issued through fetch(), so the scraper
# identifies itself to the server.
HEADERS = {
    'User-Agent': 'AI-in-Classics Atlas Scraper (+https://github.com/DataDrivenHumanities/AI-in-Classics)'
}

@dataclass
class Lemma:
    """One lemma entry collected from an Atlas lemma-list page."""
    lemma: str  # visible text of the lemma link
    atlas_id: str  # digits captured from the "/lemma/<id>/" part of the link href
    url: str  # absolute URL of the individual lemma page
    source: str = 'Atlas'  # constant provenance label written to the CSV/JSON output
    language: str = 'grc'  # language code, matching the lang=grc list filter

@dataclass
class InflectedForm:
    """One inflected form found on an individual lemma page."""
    lemma: str  # headword text copied from the parent Lemma
    atlas_id: str  # Atlas ID copied from the parent Lemma
    inflected_form: str  # cleaned text of the "/form/<id>/" link
    morphology: Optional[str] = None  # text of a nearby morphology tag, if one was found
    frequency: Optional[int] = None  # NOTE(review): never set by the scraping code in this notebook

def fetch(url: str, session: Optional[requests.Session] = None, timeout: int = 30) -> str:
    """Fetch *url* with the scraper's headers and return the response body as text.

    Args:
        url: Address to request.
        session: Session to reuse for the request. When omitted, a temporary
            session is created for this single call and closed afterwards.
        timeout: Timeout in seconds passed to the underlying GET request.

    Returns:
        The decoded response body.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (raised by ``raise_for_status``).
        requests.RequestException: For connection problems and timeouts.
    """
    owns_session = session is None
    active = requests.Session() if owns_session else session
    try:
        response = active.get(url, headers=HEADERS, timeout=timeout)
        response.raise_for_status()
        return response.text
    finally:
        # Only close what we created; a caller-supplied session stays open
        # so it can be reused for further requests.
        if owns_session:
            active.close()

def to_abs(href: str) -> str:
    """Return *href* as an absolute URL.

    Values that already start with ``http`` are returned untouched; anything
    else is resolved against ``ATLAS_BASE``.
    """
    already_absolute = href.startswith('http')
    return href if already_absolute else urljoin(ATLAS_BASE, href)

## Step 1: Scrape Lemma List Pages

The Atlas has paginated lemma lists. We'll scrape multiple pages to collect lemmas.

In [None]:
def scrape_lemma_list_page(page_num: int, session: requests.Session) -> List[Lemma]:
    """Collect every lemma linked from one page of the Atlas lemma list.

    Args:
        page_num: Page number appended to the list URL as ``&page=N``.
        session: Session used for the HTTP request.

    Returns:
        One ``Lemma`` per ``/lemma/<id>/`` link on the page, in document
        order, excluding links whose text is empty or a pagination label.
    """
    url = f"{LEMMA_LIST_URL}&page={page_num}"
    print(f'Fetching page {page_num}: {url}')

    soup = BeautifulSoup(fetch(url, session=session), 'html.parser')

    lemma_href = re.compile(r'/lemma/\d+/')
    id_pattern = re.compile(r'/lemma/(\d+)/')
    # Link texts that belong to the pager rather than to a lemma
    nav_labels = ['start', 'prev', 'next', 'end', '[', ']']

    found = []
    for anchor in soup.find_all('a', href=lemma_href):
        text = anchor.get_text(strip=True)
        if not text or text in nav_labels:
            continue

        href = anchor['href']

        # The Atlas ID is the numeric path segment of the link
        id_match = id_pattern.search(href)
        if id_match is None:
            continue

        found.append(Lemma(lemma=text, atlas_id=id_match.group(1), url=to_abs(href)))

    return found

In [None]:
# Scrape multiple pages
session = requests.Session()

# Configuration
START_PAGE = 1
NUM_PAGES = 5  # Change this to scrape more pages (total: ~19,673)
DELAY = 0.5  # seconds between requests

all_lemmas = []

for page in range(START_PAGE, START_PAGE + NUM_PAGES):
    try:
        lemmas = scrape_lemma_list_page(page, session)
    except Exception as e:
        # Best effort: report the failed page and carry on with the next one
        print(f'ERROR on page {page}: {e}', file=sys.stderr)
    else:
        all_lemmas.extend(lemmas)
        print(f'  -> Collected {len(lemmas)} lemmas (total: {len(all_lemmas)})')
    finally:
        # Pause after failures too, so a run of errors does not turn into
        # back-to-back requests against the server
        time.sleep(DELAY)

print(f'\nTotal lemmas collected: {len(all_lemmas)}')

In [None]:
# Show both ends of the collected list as a quick sanity check
preview_slices = (
    ('First 10 lemmas:', all_lemmas[:10]),
    ('\nLast 10 lemmas:', all_lemmas[-10:]),
)

for heading, chunk in preview_slices:
    print(heading)
    for lemma in chunk:
        print(f'  {lemma.lemma} (ID: {lemma.atlas_id})')

## Step 2: Scrape Individual Lemma Pages for Conjugations

Now we'll visit individual lemma pages to extract inflected forms (conjugations/declensions).

In [None]:
# Any character from the Greek and Coptic block (U+0370-U+03FF) or the Greek
# Extended block (U+1F00-U+1FFF). The extended block holds the precomposed
# polytonic letters (e.g. ὁ, ἡ, ὦ), which a plain α-ω / Α-Ω range misses.
_GREEK_CHAR_RE = re.compile(r'[\u0370-\u03ff\u1f00-\u1fff]')
# Links to individual form pages (format: /form/NUMBER/)
_FORM_HREF_RE = re.compile(r'/form/\d+/')
# Bracket-like characters stripped from the link text
_MARKUP_CHARS_RE = re.compile(r'[<>[\]{}]')
# Text made up solely of punctuation, digits and whitespace
_NOISE_ONLY_RE = re.compile(r'^[.,;:·\[\]\d\s]+$')
# CSS class fragments that mark a morphology description
_MORPH_CLASS_RE = re.compile(r'morph|parse|grammar')


def scrape_lemma_inflections(lemma: Lemma, session: requests.Session) -> List[InflectedForm]:
    """Scrape the inflected forms listed on an individual lemma page.

    Every ``/form/<id>/`` link on the page is treated as a candidate form.
    A candidate is dropped when, after cleaning, it is empty, equal to the
    lemma itself, already seen on this page, pure punctuation/digits, or at
    most two characters long without any Greek character.

    Args:
        lemma: Lemma whose ``url`` is fetched and whose ``lemma``/``atlas_id``
            are copied onto each result.
        session: Session used for the HTTP request.

    Returns:
        The distinct inflected forms in document order. Any error while
        fetching or parsing is reported on stderr and yields an empty list,
        so one bad page does not abort a whole scraping run.
    """
    try:
        html = fetch(lemma.url, session=session)
        soup = BeautifulSoup(html, 'html.parser')

        inflections = []
        seen_forms = set()  # Avoid duplicates

        for link in soup.find_all('a', href=_FORM_HREF_RE):
            # Clean up the form text (remove brackets, special chars from OCR errors)
            form_text = _MARKUP_CHARS_RE.sub('', link.get_text(strip=True)).strip()

            # Skip if empty, same as lemma, or already seen
            if not form_text or form_text == lemma.lemma or form_text in seen_forms:
                continue

            # Skip if it's just punctuation or numbers
            if _NOISE_ONLY_RE.match(form_text):
                continue

            # Skip very short non-Greek forms (OCR errors); short polytonic
            # words such as the article ὁ must survive this check
            if len(form_text) <= 2 and not _GREEK_CHAR_RE.search(form_text):
                continue

            seen_forms.add(form_text)

            # Try to find morphological info nearby: the closest enclosing
            # cell/div/list item is searched for a tag whose class looks
            # like a morphology label
            morph_info = None
            parent = link.find_parent(['td', 'div', 'li'])
            if parent:
                morph_tag = parent.find(['span', 'small'], class_=_MORPH_CLASS_RE)
                if morph_tag:
                    morph_info = morph_tag.get_text(strip=True)

            inflections.append(InflectedForm(
                lemma=lemma.lemma,
                atlas_id=lemma.atlas_id,
                inflected_form=form_text,
                morphology=morph_info
            ))

        return inflections

    except Exception as e:
        # Deliberate best-effort boundary: log and return nothing for this lemma
        print(f'ERROR scraping inflections for {lemma.lemma}: {e}', file=sys.stderr)
        return []

In [None]:
# Scrape inflections for a sample of lemmas
# (Set SAMPLE_SIZE to None to scrape all)
SAMPLE_SIZE = 10  # Start with just 10 for testing
INFLECTION_DELAY = 1.0  # Be more polite when fetching individual pages

all_inflections = []
# Slicing with None keeps every lemma, while 0 now selects none
# (a truthiness test would treat 0 like None and scrape everything).
sample_lemmas = all_lemmas[:SAMPLE_SIZE]

for i, lemma in enumerate(sample_lemmas, 1):
    print(f'[{i}/{len(sample_lemmas)}] Scraping inflections for: {lemma.lemma}')

    inflections = scrape_lemma_inflections(lemma, session)
    all_inflections.extend(inflections)

    print(f'  -> Found {len(inflections)} inflected forms')

    time.sleep(INFLECTION_DELAY)

print(f'\nTotal inflected forms collected: {len(all_inflections)}')

In [None]:
# Print up to 20 collected forms, or a hint when nothing was found
if not all_inflections:
    print('No inflections found. The Atlas page structure may have changed.')
    print('You may need to inspect the HTML manually and update the scraping logic.')
else:
    print('Sample inflected forms:')
    for infl in all_inflections[:20]:
        # Morphology is optional; append it in brackets only when present
        if infl.morphology:
            morph = f' [{infl.morphology}]'
        else:
            morph = ''
        print(f'  {infl.lemma} → {infl.inflected_form}{morph}')

## Step 3: Save Results to CSV

In [None]:
def write_lemmas_csv(path: str, items: Iterable[Lemma]) -> None:
    """Write *items* to a UTF-8 CSV file at *path*, replacing any existing file.

    The file starts with a header row, followed by one row per lemma with
    the columns language, source, lemma, atlas_id and url.
    """
    header = ['language', 'source', 'lemma', 'atlas_id', 'url']
    with open(path, 'w', newline='', encoding='utf-8') as handle:
        writer = csv.writer(handle)
        writer.writerow(header)
        writer.writerows(
            [entry.language, entry.source, entry.lemma, entry.atlas_id, entry.url]
            for entry in items
        )

def write_inflections_csv(path: str, items: Iterable[InflectedForm]) -> None:
    """Write inflected forms to a UTF-8 CSV file at *path*, replacing any existing file.

    The file starts with a header row, followed by one row per form with the
    columns lemma, atlas_id, inflected_form, morphology and frequency.
    Missing morphology or frequency values are written as empty cells.

    Args:
        path: Destination file path.
        items: Inflected forms to write, in output order.
    """
    with open(path, 'w', newline='', encoding='utf-8') as f:
        w = csv.writer(f)
        w.writerow(['lemma', 'atlas_id', 'inflected_form', 'morphology', 'frequency'])
        for it in items:
            # Test against None explicitly: a frequency of 0 is real data and
            # must not collapse into an empty cell.
            frequency = '' if it.frequency is None else it.frequency
            w.writerow([it.lemma, it.atlas_id, it.inflected_form, it.morphology or '', frequency])

def write_json(path: str, items: Iterable, item_type: str) -> None:
    """Serialise dataclass instances to a UTF-8 JSON array at *path*.

    Each item is converted with ``asdict``; non-ASCII characters are written
    as-is and the output is indented by two spaces. ``item_type`` is accepted
    for the callers' benefit but is not used.
    """
    with open(path, 'w', encoding='utf-8') as handle:
        records = list(map(asdict, items))
        json.dump(records, handle, ensure_ascii=False, indent=2)

In [None]:
# Save lemmas
import os

LEMMAS_CSV = 'database/atlas_lemmas.csv'
LEMMAS_JSON = 'database/atlas_lemmas.json'

# open(..., 'w') does not create missing parent directories, so make sure
# the output folder exists before writing.
for out_path in (LEMMAS_CSV, LEMMAS_JSON):
    os.makedirs(os.path.dirname(out_path) or '.', exist_ok=True)

write_lemmas_csv(LEMMAS_CSV, all_lemmas)
print(f'✓ Wrote {len(all_lemmas)} lemmas to {LEMMAS_CSV}')

write_json(LEMMAS_JSON, all_lemmas, 'lemmas')
print(f'✓ Wrote {len(all_lemmas)} lemmas to {LEMMAS_JSON}')

In [None]:
# Save inflections (skipped entirely when nothing was collected)
import os

if all_inflections:
    INFLECTIONS_CSV = 'database/atlas_inflections.csv'
    INFLECTIONS_JSON = 'database/atlas_inflections.json'

    # open(..., 'w') does not create missing parent directories, so make
    # sure the output folder exists before writing.
    for out_path in (INFLECTIONS_CSV, INFLECTIONS_JSON):
        os.makedirs(os.path.dirname(out_path) or '.', exist_ok=True)

    write_inflections_csv(INFLECTIONS_CSV, all_inflections)
    print(f'✓ Wrote {len(all_inflections)} inflected forms to {INFLECTIONS_CSV}')

    write_json(INFLECTIONS_JSON, all_inflections, 'inflections')
    print(f'✓ Wrote {len(all_inflections)} inflected forms to {INFLECTIONS_JSON}')
else:
    print('⚠ No inflections to save')

## Step 4: Create DataFrames for Analysis

In [None]:
# Build a DataFrame with one row per collected lemma
lemma_records = list(map(asdict, all_lemmas))
lemmas_df = pd.DataFrame(lemma_records)

print('Lemmas DataFrame:')
print(lemmas_df.head(10))
print(f'\nShape: {lemmas_df.shape}')

In [None]:
if all_inflections:
    # Build a DataFrame with one row per inflected form
    inflection_records = list(map(asdict, all_inflections))
    inflections_df = pd.DataFrame(inflection_records)

    print('Inflections DataFrame:')
    print(inflections_df.head(10))
    print(f'\nShape: {inflections_df.shape}')

    # Count forms per lemma and list the ten largest groups
    forms_per_lemma = inflections_df.groupby('lemma').size()
    top_lemmas = forms_per_lemma.sort_values(ascending=False).head(10)
    print('\nInflections per lemma:')
    print(top_lemmas)

## Summary Statistics

In [None]:
# Final report: counts from the run plus the files that were written
banner = '=' * 60

print(banner)
print('SCRAPING SUMMARY')
print(banner)
print(f'Pages scraped: {NUM_PAGES}')
print(f'Total lemmas collected: {len(all_lemmas)}')
print(f'Lemmas with inflections checked: {len(sample_lemmas)}')
print(f'Total inflected forms found: {len(all_inflections)}')
if all_inflections:
    average_forms = len(all_inflections) / len(sample_lemmas)
    print(f'Average forms per lemma: {average_forms:.1f}')

print('\nOutput files:')
for out_path in (LEMMAS_CSV, LEMMAS_JSON):
    print(f'  - {out_path}')
# The inflection paths only exist when the save cell actually wrote them
if all_inflections:
    for out_path in (INFLECTIONS_CSV, INFLECTIONS_JSON):
        print(f'  - {out_path}')
print(banner)