In [None]:
from datasets import load_dataset
import re
from transliterate import translit
from bs4 import BeautifulSoup

## Dataset information

https://hplt-project.org/datasets/v2.0 \
https://oscar-project.org/ \
https://huggingface.co/datasets/jerteh/SrpKorNews 

In [None]:
# Open the "train" split of each corpus in streaming mode; "/data" is passed
# as the cache directory for all three.
hplt_dataset = load_dataset(
    "HPLT/HPLT2.0_cleaned",
    "srp_Cyrl",
    split="train",
    streaming=True,
    cache_dir="/data",
)
# NOTE(review): trust_remote_code=True lets the dataset repository's loading
# code run locally -- confirm the source is trusted.
oscar_dataset = load_dataset(
    "oscar-corpus/OSCAR-2201",
    "sr",
    split="train",
    streaming=True,
    trust_remote_code=True,
    cache_dir="/data",
)
srpkor_dataset = load_dataset(
    "jerteh/SrpKorNews",
    split="train",
    streaming=True,
    cache_dir="/data",
)

In [None]:
# used for hplt and oscar
def filter_cyrillic_text(text, min_length=20):
    '''
    Keeps only the lines of the input that look like Cyrillic prose and cleans them.

    A line is kept when both conditions hold for the raw (uncleaned) line:
    - It is at least `min_length` characters long
    - It contains at least 2 runs of Cyrillic characters (U+0400-U+04FF)

    Every kept line is then cleaned:
    - Bracketed numbers (e.g., [1], [23]) are removed from the line
      (the line itself is kept)
    - Surrounding whitespace is stripped
    - Leading digits and the characters ; # & : are stripped

    Parameters:
        text (str): The input multi-line text. Empty or None input yields "".
        min_length (int): The minimum length a raw line must have to be kept. Default is 20.

    Returns:
        str: The cleaned lines joined by newline characters ("" when no line qualifies).
    '''
    if not text:
        return ""

    cyrillic_regex = re.compile('[\u0400-\u04FF]+')
    brackets_regex = re.compile(r'\[\d+\]')
    leading_junk = "1234567890;#&:"

    filtered_lines = []
    for line in text.splitlines():
        if len(line) < min_length or len(cyrillic_regex.findall(line)) < 2:
            continue
        # Whitespace must be stripped *before* the leading-junk strip:
        # str.lstrip(chars) stops at the first character not in `chars`, so a
        # leading space (indentation, or the gap left behind by a removed
        # "[1]") would otherwise shield the digits/symbols that follow it.
        cleaned = brackets_regex.sub("", line).strip().lstrip(leading_junk).strip()
        filtered_lines.append(cleaned)

    return "\n".join(filtered_lines)


In [None]:
# used for srpkornews
def extract_and_transliterate_text(text, min_length=20):
    '''
    Parses and extracts visible text from HTML, splits it into sentence-like
    chunks and transliterates each chunk from Serbian Latin to Cyrillic.

    Note on direction: translit(value, "sr") is called without reversed=True,
    which converts Latin script INTO the target language script (Cyrillic).

    It also:
    - Skips chunks whose raw length is shorter than the minimum length
    - Removes bracketed number patterns (e.g., [1], [23])
    - Strips surrounding whitespace, then leading digits and the characters ; # & :

    Parameters:
        text (str): HTML content as a string. Empty or None input yields "".
        min_length (int): Minimum length a raw chunk must have to be included. Default is 20.

    Returns:
        str: Transliterated and cleaned text with chunks joined by newlines.
    '''
    if not text:
        return ""

    soup = BeautifulSoup(text, "html.parser")
    # Text nodes are joined with ". " so that element boundaries also act as
    # sentence boundaries for the split below.
    parsed_text = soup.get_text(separator=". ", strip=True)
    # Each chunk is a run of non-dot characters plus an optional trailing dot.
    lines = re.findall(r'[^.]+\.?', parsed_text)

    brackets_regex = re.compile(r'\[\d+\]')
    leading_junk = "1234567890;#&:"

    filtered_lines = []
    for line in lines:
        if len(line) < min_length:
            continue
        # Every chunk after the first starts with the space that follows the
        # previous dot. str.lstrip(chars) stops at the first character not in
        # `chars`, so whitespace has to be stripped first or the leading
        # digits/symbols would never be removed.
        cleaned = brackets_regex.sub("", line).strip().lstrip(leading_junk).strip()
        filtered_lines.append(translit(cleaned, "sr"))

    return "\n".join(filtered_lines)

In [None]:
# HPLT and OSCAR go through the Cyrillic line filter; SrpKorNews goes through
# HTML extraction + transliteration. Each map call produces a new "text" value
# computed from the example's existing "text" field.
processed_hplt = hplt_dataset.map(
    lambda example: {"text": filter_cyrillic_text(example["text"])}
)
processed_oscar = oscar_dataset.map(
    lambda example: {"text": filter_cyrillic_text(example["text"])}
)
processed_srpkor = srpkor_dataset.map(
    lambda example: {"text": extract_and_transliterate_text(example["text"])}
)

# Collected in a list so the three corpora can be iterated in this order.
processed_datasets = [
    processed_hplt,
    processed_oscar,
    processed_srpkor,
]

In [None]:
import json
from tqdm import tqdm

# Write every processed example as one JSON object per line (JSONL), with the
# datasets concatenated in the order they appear in `processed_datasets`.
output_jsonl = "concatenated_dataset.jsonl"
with open(output_jsonl, "w", encoding="utf-8") as f:
    for ds in processed_datasets:
        for example in tqdm(ds):
            # ensure_ascii=False keeps Cyrillic as real UTF-8 characters. The
            # default (True) would emit every non-ASCII character as a 6-byte
            # \uXXXX escape, roughly tripling the file size for Cyrillic text
            # and making it unreadable, even though the file is UTF-8.
            json_line = json.dumps({"text": example["text"]}, ensure_ascii=False)
            f.write(json_line + "\n")