In [2]:
import gzip, json
from tqdm import tqdm

In [9]:
def shrink_wiktextract_entry(entry: dict) -> dict | None:
    """
    Keep only full keys relevant to etymological relationships.
    If the entry has no etymology/derivational info at all, return None.
    """

    # Main etymology-bearing keys
    target_keys = {
        "word",
        "lang",
        "lang_code",
        "pos",
        "etymology_number",
        "etymology_text",
        "etymology_templates",
        "derived",
        "descendants",
        "alt_of",
        "form_of",
        "categories",
        "redirects",
        "literal_meaning",
        "wikidata"
    }

    # Filter condition: does this entry have anything we care about?
    has_info = any(
        key in entry and entry[key]
        for key in ("etymology_text", "etymology_templates", "derived", "descendants")
    )

    if not has_info:
        return None  # discard entry entirely

    # Build output using full unmodified values
    out = {}
    for key in target_keys:
        if key in entry:
            out[key] = entry[key]  # keep full content of the key

    return out


In [10]:
# Input dump and destination for the slimmed entries (both gzip-compressed JSONL).
in_path  = "raw-wiktextract-data.jsonl.gz"
out_path = "filtered-etymology.jsonl.gz"

# Running tallies for this pass; initialised here so they are defined even
# when the input yields no lines.
count_keep = 0
count_skip = 0
lines_processed = 0

with (
    gzip.open(in_path, "rt", encoding="utf-8") as fin,
    gzip.open(out_path, "wt", encoding="utf-8") as fout,
):
    try:
        # enumerate(start=1) keeps lines_processed equal to the number of
        # lines read so far, including the one currently being handled.
        for lines_processed, line in enumerate(tqdm(fin), start=1):
            entry = json.loads(line)
            slim = shrink_wiktextract_entry(entry)

            if slim is None:
                count_skip += 1
                continue

            count_keep += 1
            # Compact separators and raw (non-escaped) Unicode, one record per line.
            serialized = json.dumps(slim, ensure_ascii=False, separators=(",", ":"))
            fout.write(serialized + "\n")

    except EOFError:
        # Best effort: keep whatever was written before the stream ended early.
        print(f"\nWARNING: truncated gzip: unexpected EOF after {lines_processed} lines.")

10343099it [07:48, 22089.42it/s]


In [11]:
# Report the tallies from the filtering pass and where the result was written.
summary_rows = (
    ("keep:", count_keep),
    ("skip:", count_skip),
    ("lines processed:", lines_processed),
    ("written to:", out_path),
)
for label, value in summary_rows:
    print(label, value)

keep: 2880555
skip: 7462544
lines processed: 10343099
written to: filtered-etymology.jsonl.gz


In [12]:
import gzip
import shutil

# Decompress the filtered dump to plain JSONL. The data is streamed in
# fixed-size chunks rather than read in one call, so peak memory stays small
# no matter how large the decompressed file is.
with gzip.open("filtered-etymology.jsonl.gz", "rb") as src, open("file.jsonl", "wb") as dst:
    shutil.copyfileobj(src, dst)
