In [1]:
import hunspell

import os
import pandas as pd
import re
import hunspell
from glob import glob

import json

In [2]:
# Initialize Hunspell with our Lithuanian dictionary
hobj = hunspell.HunSpell(
    "../data/lt_dictionary/lt_LT.dic", "../data/lt_dictionary/lt_LT.aff"
)

In [3]:
def tokenize(text):
    """Tokenize the text"""
    return re.findall(r"\b\w+\b", text.lower())


def get_non_lt_info(text):
    words = text.split()
    valid_words = [word for word in words if len(word) > 2 and word.isalpha()]

    non_lt_words = [word for word in valid_words if not hobj.spell(word)]

    if len(valid_words) == 0:
        return 0.0, []

    ratio = len(non_lt_words) / len(valid_words)
    return ratio, non_lt_words

In [4]:
summary_rows = []

# Find all .jsonl files in data/ and subfolders
jsonl_files = glob("../data/**/*.jsonl", recursive=True)

for file_path in jsonl_files:
    try:
        all_texts = []
        with open(file_path, "r", encoding="utf-8") as f:
            for line in f:
                try:
                    obj = json.loads(line)
                    if "text" in obj:
                        all_texts.append(obj["text"])
                except json.JSONDecodeError:
                    print(f"Skipping invalid JSON in {file_path}")

        if not all_texts:
            print(f"Skipping {file_path}: no 'text' entries found")
            continue

        combined_text = " ".join(all_texts)

        # Apply your analysis on the whole combined text
        non_lt_ratio, non_lt_words = get_non_lt_info(combined_text)

        dataset_name = os.path.relpath(file_path, "../data/")
        summary_rows.append(
            {
                "dataset_name": dataset_name,
                "non_lt_word_ratio": round(non_lt_ratio, 4) * 100,
                "non_lt_words": sorted(set(non_lt_words)),
            }
        )

        # print(f"Processed {dataset_name}")

    except Exception as e:
        print(f"Error processing {file_path}: {e}")

In [5]:
summary_df = pd.DataFrame(summary_rows)
summary_df.to_csv("../output/non_lt_proportion_of_word.csv", index=False)
summary_df

Unnamed: 0,dataset_name,non_lt_word_ratio,non_lt_words
0,lt_test/133386_lt.jsonl,16.9,"[Erik, Landesgericht, Lodewijk, Salzburg, Tane..."
1,lt_test/216087_lt.jsonl,3.68,"[Amministrativo, Antrepriză, Attuazione, Bonya..."
2,lt_test/228262_lt.jsonl,1.54,"[Alimentos, Açúcares, BRF, Barrett, Bennett, C..."
3,lt_test/243341_lt.jsonl,1.57,"[Anatomy, Anderson, Bundesgerichtshof, Christo..."
4,lt_test/244056_lt.jsonl,4.71,"[Ampuero, Parker, Russell, SESV]"
5,lt_test/254781_lt.jsonl,2.02,"[Acosta, Burke, ESS, Kaufmann, Kielatis, Konin..."
6,lt_test/264516_lt.jsonl,18.32,"[Administrativa, Aduaneira, Arbitragem, Arbitr..."
7,lt_test/264898_lt.jsonl,13.04,"[Azemovič, EUIPO, Kevin, Mudrovčič, Polyakov, ..."
8,lt_test/264939_lt.jsonl,13.08,"[ALMARA, Aceites, Almenara, Borisov, EUIPO, Fi..."
9,lt_test/66624_lt.jsonl,3.55,"[Bench, Court, Division, Goldsworthy, High, Ju..."


In [6]:
summary_df.iloc[0, 2]

['Erik', 'Landesgericht', 'Lodewijk', 'Salzburg', 'Taner', 'Ulrich']

In [8]:
test_avg = summary_df[summary_df["dataset_name"].str.startswith("lt_test")][
    "non_lt_word_ratio"
].mean()
train_avg = summary_df[summary_df["dataset_name"].str.startswith("lt_train")][
    "non_lt_word_ratio"
].mean()

print(f"Average non-Lithuanian word ratio (lt_test): {round(test_avg, 2)}%")
print(f"Average non-Lithuanian word ratio (lt_train): {round(train_avg, 2)}%")

Average non-Lithuanian word ratio (lt_test): 7.84%
Average non-Lithuanian word ratio (lt_train): 8.9%
