In [1]:
from datasets import load_dataset
from itertools import islice
import json
from datetime import datetime

In [3]:
import re
from nltk.tokenize import sent_tokenize
from tqdm import tqdm
from collections import defaultdict

In [4]:
def extract_valid_sentence(text, min_len=20, max_len=200):
    """Return the first sentence of *text* whose length is within bounds.

    The text is split on ASCII and full-width sentence terminators
    (``. ! ?`` and ``。 ？ ！``) together with any whitespace that follows
    them, so the returned sentence never includes its closing punctuation.

    Args:
        text: Raw input string, or ``None``.
        min_len: Minimum accepted sentence length in characters (inclusive).
        max_len: Maximum accepted sentence length in characters (inclusive).

    Returns:
        The first whitespace-stripped fragment whose length lies in
        ``[min_len, max_len]``, or ``None`` when no fragment qualifies or
        *text* is ``None``.
    """
    # A missing text value has no sentences; report that instead of failing
    # with AttributeError on ``None.strip()``.
    if text is None:
        return None

    # NOTE: the split is purely punctuation-based, so a period inside a
    # number or abbreviation ("3.14", "e.g.") also ends a fragment.
    for sentence in re.split(r'[.!?。？！]\s*', text.strip()):
        sentence = sentence.strip()
        if min_len <= len(sentence) <= max_len:
            return sentence
    return None

# Filtering parameters: accepted sentence length (in characters, inclusive)
# and the maximum number of sentences kept for each language group.
min_len = 20
max_len = 200
max_per_language = 1000

# Load every record from the JSON Lines input. Blank or whitespace-only lines
# are skipped so a stray empty line does not abort the run with
# json.JSONDecodeError.
with open("test_data.jsonl", "r", encoding="utf-8") as f:
    raw_data = [json.loads(line) for line in f if line.strip()]

seen_texts = set()                   # sentences already emitted (global de-duplication)
language_counts = defaultdict(int)   # number of sentences kept per language group
filtered_data = []

for item in raw_data:
    lang = item["lang_group"]

    # Once a language has reached its quota, ignore its remaining records.
    if language_counts[lang] >= max_per_language:
        continue

    # Keep only the first in-bounds sentence of each record, and only if the
    # exact same sentence has not been kept before (for any language).
    sentence = extract_valid_sentence(item["text"], min_len, max_len)
    if sentence and sentence not in seen_texts:
        seen_texts.add(sentence)
        language_counts[lang] += 1
        filtered_data.append({"lang": lang, "text": sentence})

# Write the result as JSON Lines; ensure_ascii=False keeps non-ASCII text
# readable in the output file instead of \u-escaping it.
with open("test_data_cleaned.jsonl", "w", encoding="utf-8") as f:
    for item in filtered_data:
        f.write(json.dumps(item, ensure_ascii=False) + "\n")

print(f"Saved {len(filtered_data)} filtered sentences")
for lang, count in language_counts.items():
    print(f"{lang}: {count}")

Saved 4000 filtered sentences
eng_Latn: 1000
bul_Cyrl: 1000
kor_Hang: 1000
jpn_Jpan: 1000
