In [40]:
import requests
from bs4 import BeautifulSoup
import time
import urllib.parse


def is_valid_label(label):
    """
    Return True for clean gloss labels: no namespaces, unwanted terms, or citation numbers.
    """
    if not label or label.strip() == '':
        return False
    invalid_terms = {'infinitive'}
    if ':' in label or label.lower() in invalid_terms:
        return False
    if label.strip().startswith('[') and label.strip().endswith(']'):
        return False
    return True


def get_definitions_from_ol(ol_tag):
    """
    Collect all valid <a> tags from each <li> in the <ol>:
    - Checks both direct <a> children and those within spans
    - Special handling for inflection links
    """
    defs = []
    for li in ol_tag.find_all('li', recursive=False):
        # First check for mention spans (special case for "att")
        mention_span = li.find('span', class_='use-with-mention')
        if mention_span:
            a_tag = mention_span.find('a')
            if a_tag and a_tag.get('title'):
                defs.append(a_tag['title'])
                continue  # Skip other checks if we found a mention span
        
        # Then check for regular definition links
        a_tags = li.find_all('a', href=True)
        for a in a_tags:
            # Skip inflection links
            href = a.get('href', '')
            if href.startswith('/wiki/') and href.endswith('#Swedish'):
                continue
                
            title = a.get('title', a.text.strip())
            if is_valid_label(title):
                defs.append(title)
    return defs


def fetch_translations(word, delay=1.0):
    word_lower = word.lower()
    url = f"https://en.wiktionary.org/wiki/{urllib.parse.quote(word_lower)}"
    print(f"[INFO] Fetching page for '{word}': {url}")

    resp = requests.get(url)
    if resp.status_code != 200:
        print(f"[ERROR] HTTP {resp.status_code}")
        return []

    soup = BeautifulSoup(resp.text, 'html.parser')

    # locate Swedish section
    swedish_h2 = soup.find('h2', id='Swedish')
    if not swedish_h2:
        print(f"[WARN] No Swedish section for {word}")
        return []

    # get first definition list under Swedish
    first_ol = swedish_h2.find_next('ol')

    # inflection fallback - handle cases like "älskar" -> "älska"
    if first_ol:
        first_li = first_ol.find('li', recursive=False)
        if first_li:
            inf_link = first_li.find('a', href=lambda x: x and x.startswith('/wiki/') and x.endswith('#Swedish'))
            if inf_link:
                base_href = inf_link['href']
                base_url = f"https://en.wiktionary.org{base_href}"
                print(f"[INFO] Inflection detected: following to base form: {base_url}")
                time.sleep(delay)  # Be polite with delay between requests
                resp = requests.get(base_url)
                if resp.status_code == 200:
                    soup = BeautifulSoup(resp.text, 'html.parser')
                    swedish_h2 = soup.find('h2', id='Swedish')
                    if swedish_h2:
                        first_ol = swedish_h2.find_next('ol')
                else:
                    print(f"[WARN] Base form fetch failed: HTTP {resp.status_code}")

    # now collect definitions under POS
    translations = []
    pos_tags = ['Particle', 'Conjunction', 'Pronoun', 'Noun', 'Verb', 'Adjective', 'Adverb']
    for pos in pos_tags:
        header = swedish_h2.find_next(['h3', 'h4'], id=pos)
        if not header:
            continue
        ol = header.find_next_sibling('ol')
        if not ol:
            continue
        defs = get_definitions_from_ol(ol)
        if defs:
            translations.extend(defs)

    # fallback: use first_ol if no POS headers found
    if not translations and first_ol:
        translations = get_definitions_from_ol(first_ol)

    # dedupe
    seen = set()
    result = []
    for t in translations:
        if t not in seen:
            seen.add(t)
            result.append(t)

    print(f"[INFO] Final translations for '{word}': {result}")
    time.sleep(delay)
    return result


def translate_sentence(sentence):
    print("="*40)
    print(f"[INFO] Translating: '{sentence}'")
    print("="*40)
    final = {}
    for w in sentence.split():
        print(f"[INFO] Word: {w}")
        final[w] = fetch_translations(w)
    return final


if __name__ == '__main__':
    print(translate_sentence("den kvicka bruna räven hoppade över den lata hunden"))

[INFO] Translating: 'den kvicka bruna räven hoppade över den lata hunden'
[INFO] Word: den
[INFO] Fetching page for 'den': https://en.wiktionary.org/wiki/den
[INFO] Inflection detected: following to base form: https://en.wiktionary.org/wiki/tappat_bort#Swedish
[INFO] Final translations for 'den': ['Appendix:Glossary']
[INFO] Word: kvicka
[INFO] Fetching page for 'kvicka': https://en.wiktionary.org/wiki/kvicka
[INFO] Inflection detected: following to base form: https://en.wiktionary.org/wiki/kvick#Swedish
[INFO] Final translations for 'kvicka': ['quick', 'fast', 'witty']
[INFO] Word: bruna
[INFO] Fetching page for 'bruna': https://en.wiktionary.org/wiki/bruna
[INFO] Inflection detected: following to base form: https://en.wiktionary.org/wiki/brun#Swedish
[INFO] Final translations for 'bruna': ['brown', 'politics', 'derogatory', 'fascism', 'right-wing', 'nationalism']
[INFO] Word: räven
[INFO] Fetching page for 'räven': https://en.wiktionary.org/wiki/r%C3%A4ven
[INFO] Inflection detected: