# Extraction

In [15]:
import json
import pandas as pd
from typing import List


def load_wiktextract(file_path: str) -> pd.DataFrame:
    """Loads JSONL file with wiktextract data (See: https://github.com/tatuylonen/wiktextract)

    Parameters
    ----------
    file_path : str
        Path to a JSON Lines file (one JSON object per line)

    Returns
    -------
    pd.DataFrame
        One row per JSON object; missing values are replaced with ""

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON
    """
    # Read as UTF-8 explicitly instead of relying on the platform default
    # encoding, which is not UTF-8 everywhere.
    with open(file_path, encoding="utf-8") as f:
        # Iterating the file splits on newlines only. str.splitlines() would
        # also split on characters such as U+2028, which may occur inside a
        # JSON string. Blank lines (e.g. a trailing one) are skipped.
        line_dicts: List[dict] = [json.loads(line) for line in f if line.strip()]

    df: pd.DataFrame = pd.DataFrame(line_dicts)

    return df.fillna("")

In [16]:
# NOTE(review): absolute, machine-specific path -- point it at your local copy of the dump before running.
dump_path = "/media/ducha/SSDSHARED/VN/wikt/kaikki.org-dictionary-Vietnamese.jsonl"
wikt_df = load_wiktextract(dump_path)

In [17]:
wikt_df.columns

Index(['pos', 'forms', 'word', 'lang', 'lang_code', 'senses', 'head_templates',
       'sounds', 'etymology_text', 'etymology_templates', 'etymology_number',
       'derived', 'wikipedia', 'related', 'synonyms', 'categories', 'antonyms',
       'descendants', 'coordinate_terms', 'meronyms', 'hypernyms'],
      dtype='object')

In [18]:
example = "với"

In [19]:
# Select all entries whose "word" is exactly `example` (an equality match, not a substring search)
ex_entries = wikt_df[wikt_df["word"] == example]

In [20]:
ex_entries

Unnamed: 0,pos,forms,word,lang,lang_code,senses,head_templates,sounds,etymology_text,etymology_templates,...,derived,wikipedia,related,synonyms,categories,antonyms,descendants,coordinate_terms,meronyms,hypernyms
7064,verb,"[{'form': '𢭲', 'tags': ['CJK']}]",với,Vietnamese,vi,"[{'links': [['reach', 'reach']], 'glosses': ['...","[{'name': 'head', 'args': {'1': 'vi', '2': 've...","[{'ipa': '[vəːj˧˦]', 'tags': ['Hà-Nội']}, {'ip...",,,...,,,,,,,,,,
7065,noun,"[{'form': '𢭲', 'tags': ['CJK']}]",với,Vietnamese,vi,"[{'links': [['stretch', 'stretch'], ['hand', '...","[{'name': 'head', 'args': {'1': 'vi', '2': 'no...","[{'ipa': '[vəːj˧˦]', 'tags': ['Hà-Nội']}, {'ip...",,,...,,,,,,,,,,
7066,prep,"[{'form': '众', 'tags': ['CJK']}, {'form': '喟',...",với,Vietnamese,vi,"[{'examples': [{'text': 'cộng 2 với 3', 'engli...","[{'name': 'head', 'args': {'1': 'vi', '2': 'pr...","[{'ipa': '[vəːj˧˦]', 'tags': ['Hà-Nội']}, {'ip...",Cognate with Muong Bi pỡi.\nProbably related a...,"[{'name': 'cog', 'args': {'1': 'mtq', '2': '-'...",...,,,"[{'english': '(to subtract) from', 'word': 'đi...","[{'tags': ['obsolete'], 'word': 'vuối', '_dis1...",,,,,,
7067,particle,,với,Vietnamese,vi,"[{'examples': [{'text': 'Cho con đi với.', 'en...","[{'name': 'head', 'args': {'1': 'vi', '2': 'pa...","[{'ipa': '[vəːj˧˦]', 'tags': ['Hà-Nội']}, {'ip...",Cognate with Muong Bi pỡi.\nProbably related a...,"[{'name': 'cog', 'args': {'1': 'mtq', '2': '-'...",...,,,,"[{'tags': ['obsolete'], 'word': 'vuối', '_dis1...",,,,,,


In [21]:
from typing import List
import pandas as pd


def get_entries(wikt_df: pd.DataFrame, word: str) -> pd.DataFrame:
    """Returns the rows of ``wikt_df`` whose "word" column equals ``word``.

    Parameters
    ----------
    wikt_df : pd.DataFrame
        Frame with a "word" column
    word : str
        The exact word to look up

    Returns
    -------
    pd.DataFrame
        The matching rows, with their original index labels
    """
    is_match = wikt_df["word"] == word
    return wikt_df.loc[is_match]

## Digression: glosses and raw_glosses analysis

In [22]:
# glosses_lens = []
# raw_glosses_lens = []
# for i, row in wikt_df.iterrows():
#     senses = row.get("senses", [])
#     for senses in senses:
#         if "glosses" in senses:
#             glosses_lens.append((i, len(senses["glosses"])))
#         if "raw_glosses" in senses:
#             raw_glosses_lens.append((i, len(senses["raw_glosses"])))
# max(glosses_lens, key=lambda x: x[1]), max(raw_glosses_lens, key=lambda x: x[1])

In [23]:
# Row 7066 is the "prep" entry for "với" in the table shown earlier; display its senses.
wikt_df.iloc[7066]["senses"]

[{'examples': [{'text': 'cộng 2 với 3',
    'english': 'to add 2 to 3',
    'type': 'example'},
   {'text': 'nhân tổng với hiệu',
    'english': 'to multiply this sum by this difference',
    'type': 'example'}],
  'links': [['with', 'with'],
   ['together with', 'together with'],
   ['along with', 'along with'],
   ['and', 'and'],
   ['arithmetic', 'arithmetic'],
   ['cộng', 'cộng#Vietnamese'],
   ['add', 'add#English'],
   ['nhân', 'nhân#Vietnamese'],
   ['multiply', 'multiply#English'],
   ['to', 'to#English'],
   ['by', 'by#English']],
  'raw_glosses': ['with; together with; along with; and',
   '(arithmetic, after cộng (“to add”) or nhân (“to multiply”)) to or by'],
  'glosses': ['with; together with; along with; and',
   '(arithmetic, after cộng (“to add”) or nhân (“to multiply”)) to or by',
   'to or by'],
  'topics': ['arithmetic'],
  'raw_tags': ['after cộng (“to add”) or nhân (“to multiply”)'],
  'id': 'en-với-vi-prep-~f1SuIAl',
  'categories': [{'name': 'Arithmetic',
    'ki

In [24]:
# For every sense of row 7066, take "raw_glosses" when present and fall back to "glosses" otherwise.
[
    e["raw_glosses"] if "raw_glosses" in e else e["glosses"]
    for e in wikt_df.iloc[7066]["senses"]
]

[['with; together with; along with; and',
  '(arithmetic, after cộng (“to add”) or nhân (“to multiply”)) to or by'],
 ['with; together with; along with; and',
  '(geometry, after kề (“adjacent”), bù (“supplementary”), song song (“parallel”), vuông góc (“perpendicular”), etc.) to'],
 ['Short for đối với (“to”).'],
 ['Ellipsis of với chẳng.']]

# Formatting

Notes:

- each element of `senses` is a dict that stands for one meaning of the word (for this part of speech):
  - [x] `glosses` are the quick translations
    - each meaning-pos combination might have multiple meanings
  - [x] sometimes `raw_glosses` are available with a more detailed explanation
  - [x] also use `examples` if available
  - [x] also use `synonyms` if available
  - [ ] also use `etymology` if available

In [25]:
ex_entries.iloc[0]

pos                                                                 verb
forms                                   [{'form': '𢭲', 'tags': ['CJK']}]
word                                                                 với
lang                                                          Vietnamese
lang_code                                                             vi
senses                 [{'links': [['reach', 'reach']], 'glosses': ['...
head_templates         [{'name': 'head', 'args': {'1': 'vi', '2': 've...
sounds                 [{'ipa': '[vəːj˧˦]', 'tags': ['Hà-Nội']}, {'ip...
etymology_text                                                          
etymology_templates                                                     
etymology_number                                                     1.0
derived                                                                 
wikipedia                                                               
related                                            

In [26]:
# Positional index 2 is the third "với" entry (the "prep" row in the table shown earlier); display its first sense.
ex_senses = ex_entries.iloc[2]["senses"]
ex_senses[0]

{'examples': [{'text': 'cộng 2 với 3',
   'english': 'to add 2 to 3',
   'type': 'example'},
  {'text': 'nhân tổng với hiệu',
   'english': 'to multiply this sum by this difference',
   'type': 'example'}],
 'links': [['with', 'with'],
  ['together with', 'together with'],
  ['along with', 'along with'],
  ['and', 'and'],
  ['arithmetic', 'arithmetic'],
  ['cộng', 'cộng#Vietnamese'],
  ['add', 'add#English'],
  ['nhân', 'nhân#Vietnamese'],
  ['multiply', 'multiply#English'],
  ['to', 'to#English'],
  ['by', 'by#English']],
 'raw_glosses': ['with; together with; along with; and',
  '(arithmetic, after cộng (“to add”) or nhân (“to multiply”)) to or by'],
 'glosses': ['with; together with; along with; and',
  '(arithmetic, after cộng (“to add”) or nhân (“to multiply”)) to or by',
  'to or by'],
 'topics': ['arithmetic'],
 'raw_tags': ['after cộng (“to add”) or nhân (“to multiply”)'],
 'id': 'en-với-vi-prep-~f1SuIAl',
 'categories': [{'name': 'Arithmetic',
   'kind': 'topical',
   'parents

In [27]:
def process_senses(senses: List[dict]) -> tuple[List[dict], str]:
    """Processes Wiktionary senses and extracts meanings and examples.

    Senses that share the same main meaning are merged into one processed
    sense whose examples are collected in input order. Senses without any
    usable gloss are skipped.

    Parameters
    ----------
    senses : List[dict]
        The senses extracted from Wiktionary. Only the keys "raw_glosses",
        "glosses" and "examples" are read.

    Returns
    -------
    tuple[List[dict], str]
        A list of processed senses (each a dict with "meaning" and, when the
        sense carried an "examples" key, "examples") and a string
        representation of the meanings joined with "; "
    """
    # Keyed by meaning so that a repeated meaning is merged rather than
    # appended a second time. Dicts keep insertion order, so the result
    # follows the order in which meanings were first seen.
    merged: dict = {}

    for sense in senses:
        # Prefer the more detailed raw glosses; fall back to the plain glosses
        # when they are missing or empty.
        glosses = sense.get("raw_glosses") or sense.get("glosses")
        if not glosses:
            continue

        meaning = glosses[0]  # Always take the first, main meaning
        sense_dict = merged.setdefault(meaning, {"meaning": meaning})

        if "examples" in sense:
            examples = sense_dict.setdefault("examples", [])
            for ex in sense["examples"]:
                if "english" not in ex:
                    examples.append(ex["text"])
                else:
                    examples.append(ex["text"] + " ― " + ex["english"])

    senses_processed = list(merged.values())
    senses_str = "; ".join(merged)
    return senses_processed, senses_str


def json_dump_entries(entries: pd.DataFrame) -> tuple[str, str]:
    """Converts Wiktionary entries to a JSON string and a short string representation (for the back field.)

    Entries without any processed meaning are left out of both results.

    Parameters
    ----------
    entries : pd.DataFrame
        Entries that were extracted from Wiktionary. The columns "pos" and
        "senses" are required; "synonyms" is optional.

    Returns
    -------
    tuple[str, str]
        Converted JSON string (a list of objects with "pos", optionally
        "synonyms", and "meanings") and a short string representation of the
        form "pos: meaning; meaning | pos: ..."
    """
    out_entries = []
    entries_short_str = []
    for _, row in entries.iterrows():
        cur_entry = {"pos": row["pos"]}

        # A missing cell is NaN (or "" after fillna). NaN is truthy, so a plain
        # truthiness check would let it through and fail on iteration.
        synonyms = row.get("synonyms")
        if isinstance(synonyms, list):
            synonym_words = [
                syn["word"]
                for syn in synonyms
                if isinstance(syn, dict) and "word" in syn
            ]
            if synonym_words:
                cur_entry["synonyms"] = synonym_words

        # Same guard for senses: anything that is not a list has no meanings.
        senses = row["senses"]
        if not isinstance(senses, list):
            continue
        cur_entry["meanings"], senses_string = process_senses(senses)
        if not cur_entry["meanings"]:
            continue
        out_entries.append(cur_entry)

        entries_short_str.append(f"{row['pos']}: {senses_string}")

    short_meanings = " | ".join(entries_short_str)

    # json.dumps escapes non-ASCII characters by default; the result is plain
    # ASCII but still decodes to the original text.
    return json.dumps(out_entries), short_meanings

In [29]:
# Serialize the example entries and write the JSON string to example.json (short_str is not used in this cell).
json_str, short_str = json_dump_entries(ex_entries)
with open("example.json", "w", encoding="utf-8") as f:
    f.write(json_str)

# Format to CSV