In [29]:
import requests
from bs4 import BeautifulSoup
import time
import urllib.parse
import csv

In [30]:
# Local glossary loaded from words.csv: first column is the lookup key,
# second column is its replacement. Both are whitespace-trimmed.
# Rows with fewer than two columns are ignored; when a key repeats, the
# later row overwrites the earlier one.
word_map = {}
with open("words.csv", newline='', encoding='utf-8-sig') as glossary_file:
    word_map.update(
        (cells[0].strip(), cells[1].strip())
        for cells in csv.reader(glossary_file)
        if len(cells) >= 2
    )

In [31]:
def is_valid_label(label):
    """
    Return True for clean gloss labels: no namespaces, unwanted terms, or citation numbers.

    The label is whitespace-trimmed once and every check runs on the trimmed
    text, so padding around a label cannot bypass the blocklist.

    Args:
        label: Candidate label text (may be None or empty).

    Returns:
        False when the label is None/empty/whitespace-only, contains a ':',
        matches a blocklisted term case-insensitively, or is wrapped in
        square brackets (e.g. '[1]'); True otherwise.
    """
    if not label:
        return False

    text = label.strip()
    if not text:
        return False

    invalid_terms = {'infinitive'}
    # A colon marks a namespaced title such as "Appendix:...".
    if ':' in text or text.lower() in invalid_terms:
        return False
    # Bracketed text such as "[1]" is treated as a citation marker.
    if text.startswith('[') and text.endswith(']'):
        return False
    return True

In [32]:
def get_definitions_from_ol(ol_tag):
    """
    Gather gloss labels from the direct <li> children of a definition <ol>.

    For each <li>:
    - if it contains a <span class="use-with-mention">, only the first
      <a href> inside that span is considered and the rest of the <li>
      is ignored;
    - otherwise every <a href> inside the <li> is considered.

    A link contributes a label only when its href starts with '/wiki/',
    does not end with '#Swedish', its page name (the part before any '#')
    contains no ':', and its label passes is_valid_label. The label is the
    link's 'title' attribute, or its stripped text when no title is set.

    Returns:
        A list of labels in document order (duplicates are kept).
    """
    def label_of(anchor):
        """Return the label for an acceptable link, or None to skip it."""
        target = anchor['href']
        if not target.startswith('/wiki/'):
            return None
        if target.endswith('#Swedish'):
            return None
        page_name = target.split('/wiki/')[1].split('#')[0]
        if ':' in page_name:
            return None
        label = anchor.get('title', anchor.text.strip())
        return label if is_valid_label(label) else None

    collected = []
    for item in ol_tag.find_all('li', recursive=False):
        mention_span = item.find('span', class_='use-with-mention')
        if mention_span:
            # A mention span short-circuits the item: look at its first link only.
            first_link = mention_span.find('a', href=True)
            candidates = [first_link] if first_link else []
        else:
            candidates = item.find_all('a', href=True)

        for anchor in candidates:
            label = label_of(anchor)
            if label is not None:
                collected.append(label)
    return collected

In [33]:
def fetch_translations(word, delay=1.0, timeout=10.0):
    """
    Look up a word on English Wiktionary and return labels from its Swedish section.

    Steps:
    1. Lower-case and URL-quote the word, then fetch
       https://en.wiktionary.org/wiki/<word>.
    2. Locate <h2 id="Swedish"> and the first <ol> after it.
    3. If the first <li> of that list links to a '/wiki/...#Swedish' target
       (directly, or inside a 'form-of-definition' span), fetch that page and
       use its Swedish section instead. When that second fetch fails, or the
       fetched page has no Swedish heading, the original page is kept.
    4. For each part-of-speech heading id, collect labels from the <ol> that
       follows the heading; if none are found, fall back to the first <ol>.
    5. Remove duplicates, keeping first-seen order.

    Args:
        word: The word to look up.
        delay: Seconds to sleep before following an inflection link and
            before returning a result.
        timeout: Seconds passed to requests.get as the request timeout.

    Returns:
        A list of unique label strings; an empty list when the page cannot be
        fetched or has no Swedish section.
    """
    def is_swedish_wiki_href(href):
        """True for hrefs of the form '/wiki/...#Swedish'."""
        return bool(href) and href.startswith('/wiki/') and href.endswith('#Swedish')

    def find_inflection_link(li):
        """Return the href pointing at a base form, or None if there is none."""
        # A link that is a direct child of the <li>.
        direct = li.find('a', href=is_swedish_wiki_href, recursive=False)
        if direct:
            return direct['href']
        # A link nested in a form-of-definition span.
        form_span = li.find('span', class_='form-of-definition')
        if form_span:
            nested = form_span.find('a', href=is_swedish_wiki_href)
            if nested:
                return nested['href']
        return None

    word_lower = word.lower()
    url = f"https://en.wiktionary.org/wiki/{urllib.parse.quote(word_lower)}"
    print(f"[INFO] Fetching page for '{word}': {url}")

    # Without a timeout a stalled connection would block the notebook forever.
    try:
        resp = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        print(f"[ERROR] Request failed: {exc}")
        return []
    if resp.status_code != 200:
        print(f"[ERROR] HTTP {resp.status_code}")
        return []

    soup = BeautifulSoup(resp.text, 'html.parser')
    swedish_h2 = soup.find('h2', id='Swedish')
    if not swedish_h2:
        print(f"[WARN] No Swedish section for {word}")
        return []

    # First list under the Swedish heading.
    first_ol = swedish_h2.find_next('ol')

    if first_ol:
        first_li = first_ol.find('li', recursive=False)
        inf_href = find_inflection_link(first_li) if first_li else None
        if inf_href:
            base_url = f"https://en.wiktionary.org{inf_href}"
            print(f"[INFO] Inflection detected: following to base form: {base_url}")
            time.sleep(delay)
            try:
                base_resp = requests.get(base_url, timeout=timeout)
            except requests.RequestException as exc:
                print(f"[WARN] Base form fetch failed: {exc}")
                base_resp = None

            if base_resp is not None:
                if base_resp.status_code == 200:
                    base_soup = BeautifulSoup(base_resp.text, 'html.parser')
                    base_h2 = base_soup.find('h2', id='Swedish')
                    if base_h2:
                        swedish_h2 = base_h2
                        first_ol = base_h2.find_next('ol')
                    else:
                        # Keep the original page; replacing swedish_h2 with None
                        # would crash the heading scan below.
                        print(f"[WARN] Base form page has no Swedish section; using page for '{word}'")
                else:
                    print(f"[WARN] Base form fetch failed: HTTP {base_resp.status_code}")

    translations = []
    pos_tags = ['Particle', 'Conjunction', 'Pronoun', 'Noun', 'Verb', 'Adjective', 'Adverb']
    for pos in pos_tags:
        span = swedish_h2.find_next('span', id=pos)
        if not span:
            continue
        ol = span.parent.find_next_sibling('ol')
        if not ol:
            continue
        translations.extend(get_definitions_from_ol(ol))

    # Fallback to the first list under the Swedish heading.
    if not translations and first_ol:
        translations = get_definitions_from_ol(first_ol)

    # Deduplicate while preserving first-seen order.
    result = list(dict.fromkeys(translations))

    print(f"[INFO] Final translations for '{word}': {result}")
    time.sleep(delay)
    return result


In [34]:
def translate_sentence(sentence, first_word_only=None):
    """
    Translate a sentence word by word and return the words joined by spaces.

    Each whitespace-separated word is resolved in this order:
    1. exact key in word_map;
    2. case-insensitive key in word_map (the first such key in map order);
    3. the first label returned by fetch_translations(word);
    4. the word itself when nothing was found.

    Args:
        sentence: Text to translate; it is split on whitespace.
        first_word_only: When True, only the first word of a fetched label is
            used. When None (the default), the module-level
            USE_FIRST_TRANSLATION_ONLY is used if it is defined, otherwise
            True. Glossary hits from word_map are never shortened.

    Returns:
        The translated words joined with single spaces.
    """
    if first_word_only is None:
        # The flag may not have been assigned yet; a missing global must not
        # raise NameError in the middle of a translation.
        first_word_only = globals().get('USE_FIRST_TRANSLATION_ONLY', True)

    print("=" * 40)
    print(f"[INFO] Translating: '{sentence}'")
    print("=" * 40)

    # Lower-cased key -> first original key, built once instead of rescanning
    # the whole glossary for every word that misses the exact lookup.
    lower_index = {}
    for key in word_map:
        lower_index.setdefault(key.lower(), key)

    translated_words = []
    for word in sentence.strip().split():
        if word in word_map:
            print(f"[INFO] Found in CSV: '{word}' -> '{word_map[word]}'")
            translation = word_map[word]
        else:
            match_key = lower_index.get(word.lower())
            if match_key is not None:
                print(f"[INFO] Case-insensitive match used: '{match_key}' -> '{word_map[match_key]}'")
                translation = word_map[match_key]
            else:
                defs = fetch_translations(word)
                if not defs:
                    translation = word
                elif first_word_only:
                    parts = defs[0].split()
                    translation = parts[0] if parts else word
                else:
                    translation = defs[0]

        translated_words.append(translation)

    return " ".join(translated_words)


In [35]:
if __name__ == '__main__':
    # Module-level switch that translate_sentence consults when it has to
    # fall back to a fetched label; it must be set before the call below.
    USE_FIRST_TRANSLATION_ONLY = True

    # Demo run: print the translated sentence framed by two 40-character rules.
    print("\n".join((
        "=" * 40,
        translate_sentence("Den kvicka bruna räven hoppar över den lata hunden"),
        "=" * 40,
    )))


[INFO] Translating: 'Den kvicka bruna räven hoppar över den lata hunden'
[INFO] Case-insensitive match used: 'den' -> 'the'
[INFO] Fetching page for 'kvicka': https://en.wiktionary.org/wiki/kvicka
[INFO] Inflection detected: following to base form: https://en.wiktionary.org/wiki/kvick#Swedish
[INFO] Final translations for 'kvicka': ['quick', 'fast', 'witty']
[INFO] Fetching page for 'bruna': https://en.wiktionary.org/wiki/bruna
[INFO] Inflection detected: following to base form: https://en.wiktionary.org/wiki/brun#Swedish
[INFO] Final translations for 'bruna': ['brown', 'politics', 'derogatory', 'fascism', 'right-wing', 'nationalism']
[INFO] Fetching page for 'räven': https://en.wiktionary.org/wiki/r%C3%A4ven
[INFO] Inflection detected: following to base form: https://en.wiktionary.org/wiki/r%C3%A4v#Swedish
[INFO] Final translations for 'räven': ['fox', 'experienced', 'cunning', 'person']
[INFO] Fetching page for 'hoppar': https://en.wiktionary.org/wiki/hoppar
[INFO] Inflection detecte