In [None]:
from datasets import load_dataset

ds_en = load_dataset("wikimedia/wikipedia", "20231101.en", streaming=True, split="train")

Resolving data files:   0%|          | 0/41 [00:00<?, ?it/s]

In [2]:
import re
def simple_classify(text, keywords):
    return any(pattern.search(text) for pattern in keywords)

In [3]:
from itertools import islice
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor
def parallel_filter(ds, compiled_keywords, max_length=None, stride=1, num_threads=8):
    iterable = (x for i, x in enumerate(ds) if i % stride == 0)
    if max_length:
        iterable = islice(iterable, max_length)
    iterable = list(iterable)
    
    filtered = []

    def check(element):
        return element if simple_classify(element["text"], compiled_keywords) else None

    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        results = list(tqdm(executor.map(check, iterable), total=len(iterable)))
        
    filtered = [result for result in results if result is not None]
    return filtered

In [4]:
import pickle
keywords_en = [
    r"\bsport(s)?\b",
    r"\bathlete(s)?\b",
    r"\bcoach(es)?\b"
]

compiled_patterns = [re.compile(p, re.IGNORECASE) for p in keywords_en]

ds_sport_en = parallel_filter(ds_en, compiled_patterns, max_length=100000, stride=100,  num_threads=8)
print(len(ds_sport_en))
with open("../data/corpus/ds_sport_en.pkl", "wb") as f:
    pickle.dump(ds_sport_en, f)

100%|██████████| 64079/64079 [00:00<00:00, 489012.79it/s]


6552


In [5]:
print(ds_sport_en[0])

{'id': '1727', 'url': 'https://en.wikipedia.org/wiki/Amphipolis', 'title': 'Amphipolis', 'text': 'Amphipolis (; ) is a municipality in the Serres regional unit, Macedonia, Greece. The seat of the municipality is Rodolivos. It was an important ancient Greek polis (city), and later a Roman city, whose large remains can still be seen.\n\nAmphipolis was originally a colony of ancient Athenians and was the site of the battle between the Spartans and Athenians in 422 BC. It was later the place where Alexander the Great prepared for campaigns leading to his invasion of Asia in 335 BC. Alexander\'s three finest admirals, Nearchus, Androsthenes and Laomedon, resided in Amphipolis. After Alexander\'s death, his wife Roxana and their son Alexander IV were imprisoned and murdered in 311 BC.\n\nExcavations in and around the city have revealed important buildings, ancient walls and tombs. The finds are displayed at the archaeological museum of Amphipolis. At the nearby vast Kasta burial mound, an an

In [6]:
ds_fr = load_dataset("wikimedia/wikipedia", "20231101.fr", streaming=True, split="train")

keywords_fr = [
    r"\bsport(s)?\b",
    r"\bathl[ée]te(s)?\b",
    r"\bentra[îi]neur(s)?\b"
]
compiled_patterns_fr = [re.compile(p, re.IGNORECASE) for p in keywords_fr]

ds_sport_fr = parallel_filter(ds_fr, compiled_patterns_fr, max_length=100000, stride=100, num_threads=8)
print(len(ds_sport_fr))
with open("../data/corpus/ds_sport_fr.pkl", "wb") as f:
    pickle.dump(ds_sport_fr, f)

Resolving data files:   0%|          | 0/17 [00:00<?, ?it/s]

100%|██████████| 25647/25647 [00:00<00:00, 270199.58it/s]

1570





In [7]:
ds_zh = load_dataset("wikimedia/wikipedia", "20231101.zh", streaming=True, split="train")

In [8]:
keywords_zh = [
    r"运动",
    r"运动员",
    r"教练"
]
compiled_patterns_zh = [re.compile(p, re.IGNORECASE) for p in keywords_zh]
ds_sport_zh = parallel_filter(ds_zh, compiled_patterns_zh, max_length=100000, stride=100, num_threads=8)
print(len(ds_sport_zh))
with open("../data/corpus/ds_sport_zh.pkl", "wb") as f:
    pickle.dump(ds_sport_zh, f)

'HTTPSConnectionPool(host='cdn-lfs.hf.co', port=443): Read timed out.' thrown while requesting GET https://huggingface.co/datasets/wikimedia/wikipedia/resolve/b04c8d1ceb2f5cd4588862100d08de323dccfbaa/20231101.zh/train-00004-of-00006.parquet
Retrying in 1s [Retry 1/5].
100%|██████████| 13848/13848 [00:00<00:00, 135244.37it/s]

938





In [1]:
import os
import re
import pickle

def sanitize_filename(name):
    # remove illegal characters
    return re.sub(r'[\\/*?:"<>|]', "_", name).strip()

def export_dataset_to_txt(pickle_path, output_dir):
    os.makedirs(output_dir, exist_ok=True)

    with open(pickle_path, "rb") as f:
        dataset = pickle.load(f)

    for i, item in enumerate(dataset):
        title = item.get("title", f"untitled_{i}")
        text = item.get("text", "")

        safe_title = sanitize_filename(title)
        file_path = os.path.join(output_dir, f"{safe_title}.txt")

        with open(file_path, "w", encoding="utf-8") as out_f:
            out_f.write(text)

    print(f"Exported {len(dataset)} items to {output_dir}")

In [2]:
export_dataset_to_txt("../data/corpus/ds_sport_en.pkl", "../data/corpus")

Exported 6552 items to ../data/corpus


In [3]:
export_dataset_to_txt("../data/corpus/ds_sport_fr.pkl", "../data/corpus")

Exported 1570 items to ../data/corpus


In [4]:
export_dataset_to_txt("../data/corpus/ds_sport_zh.pkl", "../data/corpus")

Exported 938 items to ../data/corpus
