## Подготовка датасета Open RAG Benchmark

In [1]:
from datasets import load_dataset, Dataset, DatasetDict
from pathlib import Path
from tqdm import tqdm
import json, os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
CHUNK_MAX_CHARS = 1800   # примерный размер чанка
CHUNK_MIN_CHARS = 600
CHUNK_OVERLAP_CHARS = 200

Создание папок

In [9]:
from huggingface_hub import list_repo_files, hf_hub_download
from pathlib import Path
import json

DATA_ROOT = Path("data")
RAW_DIR = DATA_ROOT / "raw"
CORPUS_DIR = RAW_DIR / "corpus"

RAW_DIR.mkdir(parents=True, exist_ok=True)
CORPUS_DIR.mkdir(parents=True, exist_ok=True)

print("Folders ready:", RAW_DIR, CORPUS_DIR)

Folders ready: data\raw data\raw\corpus


Смотрим на файлы датасета https://huggingface.co/datasets/vectara/open_ragbench

In [10]:
files = list_repo_files("vectara/open_ragbench", repo_type="dataset")

print("Total files:", len(files))
for f in files[:20]:   # показать первые 20
    print(f)

Total files: 1006
.gitattributes
README.md
pdf/arxiv/answers.json
pdf/arxiv/corpus/2401.01872v2.json
pdf/arxiv/corpus/2401.02247v4.json
pdf/arxiv/corpus/2401.02564v2.json
pdf/arxiv/corpus/2401.03126v3.json
pdf/arxiv/corpus/2401.03305v2.json
pdf/arxiv/corpus/2401.03328v2.json
pdf/arxiv/corpus/2401.03345v2.json
pdf/arxiv/corpus/2401.03776v8.json
pdf/arxiv/corpus/2401.05657v5.json
pdf/arxiv/corpus/2401.05762v4.json
pdf/arxiv/corpus/2401.05851v4.json
pdf/arxiv/corpus/2401.06326v4.json
pdf/arxiv/corpus/2401.06740v2.json
pdf/arxiv/corpus/2401.06867v3.json
pdf/arxiv/corpus/2401.06959v2.json
pdf/arxiv/corpus/2401.06987v2.json
pdf/arxiv/corpus/2401.07152v3.json


Скачивание файлов с метаданными и Q&A

In [11]:
META_DIR = RAW_DIR

def download_file(path_in_repo, dest_dir=META_DIR):
    local_path = hf_hub_download(
        repo_id="vectara/open_ragbench",
        filename=path_in_repo,
        repo_type="dataset",
    )
    dest = dest_dir / Path(path_in_repo).name
    dest.write_bytes(Path(local_path).read_bytes())
    print("Saved:", dest)
    return dest

# Находим все meta-файлы
meta_files = [f for f in files if f.endswith(".json") and "/corpus/" not in f]

print("Meta files:", meta_files)

# Скачиваем
downloaded_meta_paths = []
for f in meta_files:
    downloaded_meta_paths.append(download_file(f, META_DIR))


Meta files: ['pdf/arxiv/answers.json', 'pdf/arxiv/pdf_urls.json', 'pdf/arxiv/qrels.json', 'pdf/arxiv/queries.json']
Saved: data\raw\answers.json
Saved: data\raw\pdf_urls.json
Saved: data\raw\qrels.json
Saved: data\raw\queries.json


Скачивание файлов статей

In [13]:
# ищем все файлы в папке corpus/
corpus_files = [f for f in files if "/corpus/" in f]

print("Corpus files:", len(corpus_files))

for f in tqdm(corpus_files):
    local_path = hf_hub_download(
        repo_id="vectara/open_ragbench",
        filename=f,
        repo_type="dataset",
    )
    dest = CORPUS_DIR / Path(f).name
    dest.write_bytes(Path(local_path).read_bytes())

Corpus files: 1000


100%|██████████| 1000/1000 [03:11<00:00,  5.23it/s]


Пример текста

In [14]:
example_doc = next(CORPUS_DIR.glob("*.json"))

with open(example_doc, "r", encoding="utf-8") as f:
    doc = json.load(f)

print("Document keys:", doc.keys())
print("Title:", doc.get("title"))
print("Number of sections:", len(doc.get("sections", [])))
print(doc["sections"][0]["text"][:500])

Document keys: dict_keys(['title', 'sections', 'id', 'authors', 'categories', 'abstract', 'updated', 'published'])
Title: Multiple Imputation of Hierarchical Nonlinear Time Series Data with an
  Application to School Enrollment Data
Number of sections: 9
#### Abstract

International comparisons of hierarchical time series data sets based on survey data, such as annual country-level estimates of school enrollment rates, can suffer from large amounts of missing data due to differing coverage of surveys across countries and across times. A popular approach to handling missing data in these settings is through multiple imputation, which can be especially effective when there is an auxiliary variable that is strongly predictive of and has a smaller a


In [15]:
from pathlib import Path

DATA_ROOT = Path("data")
RAW_DIR = DATA_ROOT / "raw"
CORPUS_DIR = RAW_DIR / "corpus"

PROCESSED_DIR = DATA_ROOT / "processed"
SAMPLES_DIR = DATA_ROOT / "samples"
BENCHMARK_DIR = DATA_ROOT / "benchmark"

for d in [PROCESSED_DIR, SAMPLES_DIR, BENCHMARK_DIR]:
    d.mkdir(parents=True, exist_ok=True)

print("Processed:", PROCESSED_DIR)
print("Samples:", SAMPLES_DIR)

Processed: data\processed
Samples: data\samples


Функция для чанкования

In [56]:
import re

def chunk_text(
    text: str,
    max_chars: int = 1800,
    min_chars: int = 800,
    overlap_chars: int = 200,
):
    """
    Балансированный чанкер для научного текста с LaTeX:
    - предпочитает резать по абзацам (\n\n),
    - дальше по границе предложения (. ? !),
    - дальше по пробелу,
    - старается не резать внутри формул ($...$, $$...$$),
    - использует overlap, но начало чанка сдвигает к границе слова,
    - не зацикливается.
    """
    if not text:
        return []

    text = text.strip()
    n = len(text)
    if n == 0:
        return []

    chunks = []
    start = 0
    sentence_punct = ".?!。！？"

    def is_inside_math(pos: int) -> bool:
        """Эвристика: нечётное количество $ до позиции → внутри формулы."""
        before = text[:pos]
        dollar_count = 0
        i = 0
        while i < len(before):
            if before[i] == "\\":
                i += 2  # пропускаем экранированные символы
                continue
            if before[i] == "$":
                dollar_count += 1
            i += 1
        return dollar_count % 2 == 1

    while start < n:
        # Хвост меньше max_chars — забираем целиком
        if n - start <= max_chars:
            chunk = text[start:].strip()
            if chunk:
                chunks.append(chunk)
            break

        window_end = min(n, start + max_chars)
        search_from = start + min_chars
        if search_from >= window_end:
            search_from = start

        split = None

        # 1) Абзацный разрыв \n\n
        para_pos = text.rfind("\n\n", search_from, window_end)
        if para_pos != -1 and para_pos > start:
            candidate = para_pos + 2
            if not is_inside_math(candidate):
                split = candidate

        # 2) Граница предложения
        if split is None:
            candidate = -1
            for i in range(window_end - 1, search_from - 1, -1):
                ch = text[i]
                if ch in sentence_punct:
                    j = i + 1
                    if j >= n or text[j].isspace():
                        candidate = i + 1
                        break
            if candidate != -1 and candidate > start and not is_inside_math(candidate):
                split = candidate

        # 3) Пробел
        if split is None:
            candidate = text.rfind(" ", search_from, window_end)
            if candidate != -1 and candidate > start:
                split = candidate

        # 4) Жёсткий разрез
        if split is None or split <= start:
            split = window_end

        # Сам чанк
        chunk = text[start:split].strip()
        if chunk:
            chunks.append(chunk)

        # ==== новый start с оверлапом, БЕЗ разрезания слов ====

        if overlap_chars <= 0:
            next_start = split
        else:
            raw_next = max(split - overlap_chars, 0)

            # Ищем ближайшую границу слова между raw_next и split:
            # сначала пробел/перенос вперёд от raw_next
            boundary = -1
            for i in range(raw_next, split):
                if text[i].isspace():
                    boundary = i + 1  # начало следующего слова
                    break

            if boundary != -1:
                next_start = boundary
            else:
                # если внутри overlap нет пробелов — начинаем с split
                next_start = split

        # Пропускаем начальные пробелы/переводы строк
        while next_start < n and text[next_start].isspace():
            next_start += 1

        # Гарантируем движение вперёд
        if next_start <= start:
            next_start = start + max(1, max_chars // 2)

        start = next_start

    return chunks



Создание чанков

In [57]:
import json
from tqdm import tqdm

chunk_records = []

corpus_files = sorted(CORPUS_DIR.glob("*.json"))

print("Corpus files:", len(corpus_files))

for doc_path in tqdm(corpus_files, desc="Chunking corpus"):
    with open(doc_path, "r", encoding="utf-8") as f:
        doc = json.load(f)

    doc_id = doc.get("id") or doc.get("doc_id") or doc_path.stem
    title = doc.get("title", "")

    sections = doc.get("sections") or []
    if not sections and "text" in doc:
        # fallback: весь текст одной секцией
        sections = [{"heading": "", "text": doc["text"]}]

    for sec_idx, sec in enumerate(sections):
        sec_heading = sec.get("heading") or sec.get("title") or ""
        sec_text = sec.get("text") or ""

        if not sec_text.strip():
            continue

        chunks = chunk_text(sec_text)
        for ch_idx, ch in enumerate(chunks):
            chunk_records.append({
                "chunk_id": f"{doc_id}_sec{sec_idx}_chunk{ch_idx}",
                "doc_id": doc_id,
                "section_index": sec_idx,
                "chunk_index": ch_idx,
                "title": title,
                "section_heading": sec_heading,
                "text": ch,
            })

print("Total chunks:", len(chunk_records))

Corpus files: 1000


Chunking corpus: 100%|██████████| 1000/1000 [00:23<00:00, 41.92it/s]

Total chunks: 66949





Запись чанков (всех и примеров) на локальный диск

In [58]:
import json
from pathlib import Path

DATA_ROOT = Path("data")
PROCESSED_DIR = DATA_ROOT / "processed"
SAMPLES_DIR = DATA_ROOT / "samples"

PROCESSED_DIR.mkdir(parents=True, exist_ok=True)
SAMPLES_DIR.mkdir(parents=True, exist_ok=True)

chunks_path = PROCESSED_DIR / "chunks.jsonl"
sample_path = SAMPLES_DIR / "chunks_sample.jsonl"

def save_jsonl(records, path: Path, limit: int | None = None):
    n = len(records) if limit is None else min(len(records), limit)
    with open(path, "w", encoding="utf-8") as f:
        for i in range(n):
            f.write(json.dumps(records[i], ensure_ascii=False) + "\n")
    print(f"Saved {n} records to {path}")

save_jsonl(chunk_records, chunks_path)          # все 60k
save_jsonl(chunk_records, sample_path, 50)      # сэмпл для GitHub/README

Saved 66949 records to data\processed\chunks.jsonl
Saved 50 records to data\samples\chunks_sample.jsonl


In [55]:
def mb(path: Path) -> float:
    return path.stat().st_size / (1024 * 1024)

print("Total chunks:", len(chunk_records))
print("chunks.jsonl size:", f"{mb(chunks_path):.2f} MB")

Total chunks: 67206
chunks.jsonl size: 107.41 MB


In [59]:
from dotenv import load_dotenv
import os
from huggingface_hub import login

# загружаем ключи из .env
load_dotenv()

hf_token = os.getenv("HF_TOKEN")
if hf_token is None:
    raise ValueError("HF_TOKEN not found in .env")

# 1. логин в HuggingFace
login(token=hf_token)

print("Logged in to HuggingFace!")

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


Logged in to HuggingFace!


Пушим чанки в качестве датасета на HuggingfaceHub

In [60]:
from datasets import Dataset
import json

# загружаем чанки
processed_chunks = []
with open("data/processed/chunks.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        processed_chunks.append(json.loads(line))

chunks_ds = Dataset.from_list(processed_chunks)

# пушим только корпус
chunks_ds.push_to_hub("Ilya-huggingface/open_ragbench_chunks")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]
Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s][A
Creating parquet from Arrow format: 100%|██████████| 2/2 [00:00<00:00,  5.69ba/s][A
Processing Files (0 / 0): |          |  0.00B /  0.00B            
Processing Files (0 / 1):   1%|▏         |  526kB / 39.9MB,  527kB/s  
Processing Files (0 / 1):   3%|▎         | 1.05MB / 39.9MB,  876kB/s  
Processing Files (0 / 1):   5%|▌         | 2.10MB / 39.9MB, 1.50MB/s  
Processing Files (0 / 1):   9%|▉         | 3.68MB / 39.9MB, 2.30MB/s  
Processing Files (0 / 1):  20%|█▉        | 7.89MB / 39.9MB, 4.39MB/s  
Processing Files (0 / 1):  25%|██▌       | 9.99MB / 39.9MB, 5.00MB/s  
Processing Files (0 / 1):  30%|███       | 12.1MB / 39.9MB, 5.50MB/s  
Processing Files (0 / 1):  34%|███▍      | 13.7MB / 39.9MB, 5.70MB/s  
Processing Files (0 / 1):  41%|████      | 16.3MB / 39.9MB, 6.28MB/s  
Processing Files (0 / 1):  44%|████▎     | 17.4MB / 39.9MB,

CommitInfo(commit_url='https://huggingface.co/datasets/Ilya-huggingface/open_ragbench_chunks/commit/d62422bf18dacab2788306ef5953a8966baac85b', commit_message='Upload dataset', commit_description='', oid='d62422bf18dacab2788306ef5953a8966baac85b', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/Ilya-huggingface/open_ragbench_chunks', endpoint='https://huggingface.co', repo_type='dataset', repo_id='Ilya-huggingface/open_ragbench_chunks'), pr_revision=None, pr_num=None)