In [229]:
import requests
from bs4 import BeautifulSoup
import time
import urllib.parse
import csv
from sentence_transformers import SentenceTransformer, util
import string

# Module-level sentence-embedding model; pick_best_definition calls its
# encode() on the sentence and on the candidate definitions.
embedding_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

In [230]:
def pick_best_definition(word, definitions, sentence):
    """
    Choose the candidate definition whose embedding is most similar to the sentence.

    Args:
        word: The word being translated; returned unchanged when there are
            no candidates to choose from.
        definitions: Sequence of candidate definition strings (may be empty or None).
        sentence: The full sentence providing the context for the word.

    Returns:
        The entry of `definitions` with the highest cosine similarity to
        `sentence`, or `word` itself when `definitions` is empty.
    """
    if definitions:
        context_vec = embedding_model.encode(sentence, convert_to_tensor=True)
        candidate_vecs = embedding_model.encode(definitions, convert_to_tensor=True)

        # cos_sim yields one row per query; there is a single query (the sentence).
        similarities = util.cos_sim(context_vec, candidate_vecs)[0]
        winner = similarities.argmax().item()
        return definitions[winner]

    return word

In [231]:
# Manual overrides read from words.csv: first column is the source word,
# second column is its translation. Rows with fewer than two columns are
# ignored; when a word appears more than once, the last row wins.
word_map = {}
with open("words.csv", newline='', encoding='utf-8') as csvfile:
    word_map.update(
        (row[0].strip(), row[1].strip())
        for row in csv.reader(csvfile)
        if len(row) >= 2
    )


In [232]:
def is_valid_label(label):
    """
    Return True for clean gloss labels: no namespaces, unwanted terms, or citation numbers.

    A label is rejected when it is:
    - empty, None, or whitespace only;
    - contains a ':' (namespace-style titles such as "Appendix:...");
    - equal (case-insensitively) to one of the unwanted terms;
    - wrapped in square brackets, e.g. "[1]".

    Surrounding whitespace is ignored for every check, so " infinitive " is
    rejected just like "infinitive".

    Args:
        label: Candidate label string, or None.

    Returns:
        bool: True when the label is usable as a definition.
    """
    if not label:
        return False

    # Normalise once so all checks below see the same text.
    cleaned = label.strip()
    if not cleaned:
        return False

    invalid_terms = {'infinitive'}
    if ':' in cleaned or cleaned.lower() in invalid_terms:
        return False
    if cleaned.startswith('[') and cleaned.endswith(']'):
        return False
    return True

def strip_punctuation(word):
    """Return `word` with every character listed in string.punctuation removed."""
    return "".join(ch for ch in word if ch not in string.punctuation)


In [233]:
def get_definitions_from_ol(ol_tag):
    """
    Gather definition labels from the direct <li> children of an <ol>.

    For each list item:
    - if it contains a <span class="use-with-mention">, only the first link
      inside that span is considered and the rest of the item is ignored;
    - otherwise every link in the item is considered.

    A link contributes its `title` attribute (or its stripped text when there
    is no title) provided that:
    - its href starts with /wiki/ and the page name has no ':' (no namespaces);
    - it is not a pointer to a #TEXT_SOURCE_LANGUAGE section while
      FILTER_BY_TARGET_LANGUAGE is enabled;
    - the resulting label passes is_valid_label.

    Returns:
        list: Labels in document order (duplicates are kept).
    """

    def label_for(anchor):
        # Returns the usable label for one <a>, or None when it must be skipped.
        href = anchor['href']
        if not href.startswith('/wiki/'):
            return None
        page_name = href.split('/wiki/')[1].split('#')[0]
        if ':' in page_name:
            return None
        if FILTER_BY_TARGET_LANGUAGE and href.endswith(f'#{TEXT_SOURCE_LANGUAGE}'):
            return None
        label = anchor.get('title', anchor.text.strip())
        return label if is_valid_label(label) else None

    collected = []
    for item in ol_tag.find_all('li', recursive=False):
        mention = item.find('span', class_='use-with-mention')
        if mention:
            # Mention spans take precedence: look at their first link only.
            first_link = mention.find('a', href=True)
            candidates = [first_link] if first_link else []
        else:
            candidates = item.find_all('a', href=True)

        for anchor in candidates:
            label = label_for(anchor)
            if label is not None:
                collected.append(label)

    return collected


In [234]:
def fetch_translations(word, delay=1.0, timeout=10.0):
    """
    Scrape candidate translations for `word` from Wiktionary.

    The Wiktionary edition is selected by TRANSLATION_TARGET_ISO639 and the
    language section by TEXT_SOURCE_LANGUAGE (both module-level settings).

    Steps:
    1. Download the entry page and locate the <h2> whose id is
       TEXT_SOURCE_LANGUAGE.
    2. If the first item of the first <ol> after that heading links to another
       entry's #TEXT_SOURCE_LANGUAGE section, treat `word` as an inflected
       form and load that entry instead. If that entry cannot be fetched or
       has no matching section, the inflected entry's own section is used.
    3. Collect labels from the <ol> following each part-of-speech heading,
       falling back to the first <ol> when none of them yields anything.

    Args:
        word: Entry title to look up (URL-quoted before the request).
        delay: Seconds to sleep before the base-form request and after a
            completed lookup.
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        list: De-duplicated labels in first-seen order; [] when the page
        cannot be fetched or has no TEXT_SOURCE_LANGUAGE section.
    """
    base_url = f"https://{TRANSLATION_TARGET_ISO639}.wiktionary.org"
    url = f"{base_url}/wiki/{urllib.parse.quote(word)}"
    print(f"[INFO] Fetching page for '{word}': {url}")

    # Network failures are reported the same way as HTTP errors: log and
    # return no translations so the caller can fall back to the original word.
    try:
        resp = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        print(f"[ERROR] Request failed: {exc}")
        return []
    if resp.status_code != 200:
        print(f"[ERROR] HTTP {resp.status_code}")
        return []

    soup = BeautifulSoup(resp.text, 'html.parser')
    lang_h2 = soup.find('h2', id=TEXT_SOURCE_LANGUAGE)
    if not lang_h2:
        print(f"[WARN] No {TEXT_SOURCE_LANGUAGE} section for {word}")
        return []

    first_ol = lang_h2.find_next('ol')

    def points_to_source_section(href):
        # True for /wiki/... links that target the #TEXT_SOURCE_LANGUAGE anchor.
        return href and href.startswith('/wiki/') and href.endswith(f'#{TEXT_SOURCE_LANGUAGE}')

    def find_inflection_link(li):
        # Prefer a link that is a direct child of the <li>; otherwise look
        # inside a "form-of-definition" span.
        a = li.find('a', href=points_to_source_section, recursive=False)
        if a:
            return a['href']
        span = li.find('span', class_='form-of-definition')
        if span:
            a2 = span.find('a', href=points_to_source_section)
            if a2:
                return a2['href']
        return None

    if first_ol:
        first_li = first_ol.find('li', recursive=False)
        if first_li:
            inf_href = find_inflection_link(first_li)
            if inf_href:
                base_form_url = f"{base_url}{inf_href}"
                print(f"[INFO] Inflection detected: following to base form: {base_form_url}")
                time.sleep(delay)
                try:
                    base_resp = requests.get(base_form_url, timeout=timeout)
                except requests.RequestException as exc:
                    print(f"[WARN] Base form fetch failed: {exc}")
                    base_resp = None

                if base_resp is None:
                    pass  # already logged; keep the inflected entry's section
                elif base_resp.status_code == 200:
                    base_soup = BeautifulSoup(base_resp.text, 'html.parser')
                    base_h2 = base_soup.find('h2', id=TEXT_SOURCE_LANGUAGE)
                    if base_h2:
                        lang_h2 = base_h2
                        first_ol = base_h2.find_next('ol')
                    else:
                        # Only switch pages when the base entry really has the
                        # section; otherwise lang_h2 would become None and the
                        # part-of-speech scan below would fail.
                        print(f"[WARN] No {TEXT_SOURCE_LANGUAGE} section on base form page; using inflected entry")
                else:
                    print(f"[WARN] Base form fetch failed: HTTP {base_resp.status_code}")

    translations = []
    pos_tags = ['Particle', 'Conjunction', 'Pronoun', 'Noun', 'Verb', 'Adjective', 'Adverb']
    for pos in pos_tags:
        span = lang_h2.find_next('span', id=pos)
        if not span:
            continue
        header = span.parent
        ol = header.find_next_sibling('ol')
        if not ol:
            continue
        defs = get_definitions_from_ol(ol)
        if defs:
            translations.extend(defs)

    # fallback to first_ol
    if not translations and first_ol:
        translations = get_definitions_from_ol(first_ol)

    # De-duplicate while preserving first-seen order.
    result = list(dict.fromkeys(translations))

    print(f"[INFO] Final translations for '{word}': {result}")
    time.sleep(delay)
    return result

In [235]:
def translate_sentence(sentence):
    """
    Translate a sentence word by word and return the joined result.

    Each whitespace-separated token is (optionally, per STRIP_PUNCTUATION)
    stripped of punctuation and then resolved in this order:
    1. exact match in word_map;
    2. case-insensitive match in word_map (first matching key wins);
    3. Wiktionary lookup via fetch_translations, taking either the first word
       of the first definition (USE_FIRST_TRANSLATION_ONLY) or the definition
       chosen by pick_best_definition.

    A token with no translation is kept as originally written. A token that
    consists solely of punctuation is also kept as written instead of
    triggering a lookup for an empty word.

    Args:
        sentence: Text to translate.

    Returns:
        str: The translated tokens joined by single spaces ("" for an empty
        sentence).
    """
    print("=" * 40)
    print(f"[INFO] Translating: '{sentence}'")
    print("=" * 40)

    # Built once per call instead of rescanning word_map for every token.
    # setdefault keeps the first key for each lowercase form, matching the
    # "first match wins" rule.
    lowercase_keys = {}
    for key in word_map:
        lowercase_keys.setdefault(key.lower(), key)

    translated_words = []

    for original_word in sentence.strip().split():
        word = original_word

        if STRIP_PUNCTUATION:
            word = strip_punctuation(word)

        if word in word_map:
            print(f"[INFO] Found in CSV: '{word}' -> '{word_map[word]}'")
            translation = word_map[word]
        elif word.lower() in lowercase_keys:
            match_key = lowercase_keys[word.lower()]
            print(f"[INFO] Case-insensitive match used: '{match_key}' -> '{word_map[match_key]}'")
            translation = word_map[match_key]
        elif not word:
            # Nothing left after stripping punctuation: an empty lookup would
            # request the bare /wiki/ URL, so keep the token unchanged.
            translation = original_word
        else:
            defs = fetch_translations(word)
            if USE_FIRST_TRANSLATION_ONLY:
                translation = defs[0].split()[0] if defs else original_word
            else:
                translation = pick_best_definition(word, defs, sentence) if defs else original_word

        translated_words.append(translation)

    return " ".join(translated_words)


In [None]:
if __name__ == '__main__':
    # --- Behaviour switches read by the functions above ---
    USE_FIRST_TRANSLATION_ONLY = True   # first definition's first word vs. embedding-based pick
    STRIP_PUNCTUATION = True            # strip punctuation from each token before lookup
    FILTER_BY_TARGET_LANGUAGE = True    # skip links pointing at #TEXT_SOURCE_LANGUAGE sections

    # --- Language settings ---
    TEXT_SOURCE_LANGUAGE = "Swedish"
    TRANSLATION_TARGET_LANGUAGE = "English"
    TRANSLATION_TARGET_ISO639 = "en"

    # Print the translation framed by two separator lines.
    print("=" * 40, translate_sentence(""), "=" * 40, sep="\n")


[INFO] Translating: 'hej världen. det här är min översättnings modell'
[INFO] Fetching page for 'hej': https://en.wiktionary.org/wiki/hej
[INFO] Final translations for 'hej': ['hi', 'hello', 'bye', 'hey']
[INFO] Fetching page for 'världen': https://en.wiktionary.org/wiki/v%C3%A4rlden
[INFO] Inflection detected: following to base form: https://en.wiktionary.org/wiki/v%C3%A4rld#Swedish
[INFO] Final translations for 'världen': ['world', 'the end of the world']
[INFO] Fetching page for 'det': https://en.wiktionary.org/wiki/det
[INFO] Final translations for 'det': ['it']
[INFO] Fetching page for 'här': https://en.wiktionary.org/wiki/h%C3%A4r
[INFO] Final translations for 'här': ['here']
[INFO] Found in CSV: 'är' -> 'is'
[INFO] Fetching page for 'min': https://en.wiktionary.org/wiki/min
[INFO] Final translations for 'min': ['minute']
[INFO] Fetching page for 'översättnings': https://en.wiktionary.org/wiki/%C3%B6vers%C3%A4ttnings
[INFO] Inflection detected: following to base form: https://en.