In [13]:
import os
import json
import hunspell
import pandas as pd
import re
from pathlib import Path

# Initialize Hunspell
hobj = hunspell.HunSpell(
    "../data/lt_dictionary/lt_LT.dic", "../data/lt_dictionary/lt_LT.aff"
)


def tokenize(text):
    return re.findall(r"\b[\wĄČĘĖĮŠŲŪŽąčęėįšųūž]+\b", text)


def classify_words(text):
    words = tokenize(text)
    valid_words = [w for w in words if len(w) > 2 and w.isalpha()]

    misspelled = []
    # foreign = []

    for word in valid_words:
        if not hobj.spell(word):
            suggestions = hobj.suggest(word)
            if suggestions:
                misspelled.append(word)  # probably a Lithuanian misspelling
            # else:
            #    foreign.append(word)     # likely a foreign word

    total_words = len(valid_words)
    return {
        "misspelled_words": misspelled,
        # "foreign_words": foreign,
        "misspelled_ratio": len(misspelled) / total_words if total_words > 0 else 0.0,
        # "foreign_ratio": len(foreign) / total_words if total_words > 0 else 0.0
    }

In [14]:
results = []

for file in Path("../data/").rglob("*.jsonl"):
    try:
        texts = []
        with open(file, "r", encoding="utf-8") as f:
            for line in f:
                try:
                    obj = json.loads(line)
                    if "text" in obj:
                        texts.append(obj["text"])
                except json.JSONDecodeError:
                    print(f"Skipping bad JSON in {file}")

        if not texts:
            continue

        combined_text = " ".join(texts)
        stats = classify_words(combined_text)

        results.append(
            {
                "dataset": str(file.relative_to("../data/")),
                "misspelled_ratio": round(stats["misspelled_ratio"], 4),
                # "foreign_ratio": round(stats["foreign_ratio"], 4),
                "misspelled_words": sorted(set(stats["misspelled_words"])),
                # "foreign_words": sorted(set(stats["foreign_words"]))
            }
        )

        print(f"Processed {file.name}")

    except Exception as e:
        print(f"Error processing {file}: {e}")

result_df = pd.DataFrame(results)

Processed 133386_lt.jsonl
Processed 216087_lt.jsonl
Processed 228262_lt.jsonl
Processed 243341_lt.jsonl
Processed 244056_lt.jsonl
Processed 254781_lt.jsonl
Processed 264516_lt.jsonl
Processed 264898_lt.jsonl
Processed 264939_lt.jsonl
Processed 66624_lt.jsonl
Processed 133386_lt-checkpoint.jsonl
Processed 132839_lt.jsonl
Processed 133868_lt.jsonl
Processed 133885_lt.jsonl
Processed 206292_lt.jsonl
Processed 219163_lt.jsonl
Processed 219207_lt.jsonl
Processed 219410_lt.jsonl
Processed 220609_lt.jsonl
Processed 222403_lt.jsonl
Processed 222829_lt.jsonl
Processed 223729_lt.jsonl
Processed 224632_lt.jsonl
Processed 224772_lt.jsonl
Processed 227242_lt.jsonl
Processed 227877_lt.jsonl
Processed 228216_lt.jsonl
Processed 230481_lt.jsonl
Processed 230565_lt.jsonl
Processed 231187_lt.jsonl
Processed 231801_lt.jsonl
Processed 232365_lt.jsonl
Processed 233946_lt.jsonl
Processed 236364_lt.jsonl
Processed 236373_lt.jsonl
Processed 236422_lt.jsonl
Processed 236903_lt.jsonl
Processed 237009_lt.jsonl
Pr

In [18]:
result_df["misspelled_ratio"] = result_df["misspelled_ratio"] * 100
result_df.iloc[:-2, :].to_csv(
    "../output/misspelled_proportion_of_words.csv", index=False
)
result_df.iloc[:-2, :]

Unnamed: 0,dataset,misspelled_ratio,misspelled_words
0,lt_test/133386_lt.jsonl,7.23,"[Erik, Landesgericht, Taner]"
1,lt_test/216087_lt.jsonl,4.19,"[Amministrativo, Antrepriză, Attuazione, BENJO..."
2,lt_test/228262_lt.jsonl,1.54,"[AFI, Alimentos, Assche, BRF, Barrett, Bennett..."
3,lt_test/243341_lt.jsonl,2.06,"[Anatomy, Anderson, Atlas, Campbell, Christof,..."
4,lt_test/244056_lt.jsonl,3.88,"[Ampuero, Parker, Russell, SESV]"
...,...,...,...
56,lt_train/254450_lt.jsonl,11.61,"[Busch, EUIPO, European, Nestlé, Société, Vevė..."
57,lt_train/259843_lt.jsonl,20.18,"[Brien, Davis, EUIPO, Hofmann, Larsson, Messer..."
58,lt_train/260370_lt.jsonl,4.30,"[Arslan, Luis, Santis]"
59,lt_train/262421_lt.jsonl,2.35,"[ASVG, Acosta, Aliev, Alvin, Atkinson, BGBl, B..."


In [20]:
test_avg = result_df.iloc[:-2, :][
    result_df.iloc[:-2, :]["dataset"].str.startswith("lt_test")
]["misspelled_ratio"].mean()
train_avg = result_df.iloc[:-2, :][
    result_df.iloc[:-2, :]["dataset"].str.startswith("lt_train")
]["misspelled_ratio"].mean()

print(f"Average non-Lithuanian word ratio (lt_test): {round(test_avg, 2)}%")
print(f"Average non-Lithuanian word ratio (lt_train): {round(train_avg, 2)}%")

Average non-Lithuanian word ratio (lt_test): 6.68%
Average non-Lithuanian word ratio (lt_train): 7.68%
