# Big data? 🤗 Datasets to the rescue!

Install the Transformers, Datasets, and Evaluate libraries to run this notebook.

This notebook demonstrates how to work with very large datasets using streaming and memory-efficient techniques.

In [None]:
# Install required packages for big data processing
!uv pip install datasets evaluate transformers[sentencepiece]

In [None]:
# Install zstandard for handling compressed datasets
# The data files loaded in this notebook end in `.jsonl.zst`, i.e. they are
# zstd-compressed; many large datasets use zstd for its good compression ratios
!uv pip install zstandard

In [None]:
# Fetch the PubMed abstracts corpus (15+ million examples) as a regular,
# non-streamed dataset
# The compressed file is downloaded once and then served from the local cache
from datasets import load_dataset

# Expect a wait of a few minutes here, so go grab a tea or coffee :)
data_files = "https://huggingface.co/datasets/qualis2006/PUBMED_title_abstracts_2020_baseline/resolve/main/PUBMED_title_abstracts_2020_baseline.jsonl.zst"
pubmed_dataset = load_dataset("json", split="train", data_files=data_files)
# Bare last expression so the notebook displays the dataset summary
pubmed_dataset

In [None]:
# Examine the structure of a single example by indexing the first row
# Each example contains metadata (PMID, language) and text (title + abstract);
# the `text` field is what later cells pass to the tokenizer
pubmed_dataset[0]

In [None]:
# Install psutil to monitor memory usage
# It is used in the next cell to read the resident memory of this process,
# which shows the memory impact of loading a large dataset
!uv pip install psutil

In [None]:
# Report the resident memory (RSS) of the current Python process
import psutil

# memory_info().rss is a byte count; dividing by 1024**2 expresses it in megabytes
print(f"RAM used: {psutil.Process().memory_info().rss / 1024**2:.2f} MB")

In [None]:
# Check the size of the cached dataset
# Even though the dataset is large, it's efficiently stored using Apache Arrow
# `dataset_size` is a byte count (not a file count), which is why it is divided
# by 1024**3 below to express it in gigabytes
print(f"Dataset size in bytes : {pubmed_dataset.dataset_size}")
size_gb = pubmed_dataset.dataset_size / (1024**3)
print(f"Dataset size (cache file) : {size_gb:.2f} GB")

In [None]:
# Measure how long one full pass over the dataset takes
# The pass reads the dataset in slices of 1000 rows and discards each slice
import timeit

# Statement executed by the timer; it sees `pubmed_dataset` through globals()
code_snippet = """batch_size = 1000

for idx in range(0, len(pubmed_dataset), batch_size):
    _ = pubmed_dataset[idx:idx + batch_size]
"""

# A single run (number=1) is enough: we want wall-clock time for one full pass
time = timeit.Timer(stmt=code_snippet, globals=globals()).timeit(number=1)
print(
    f"Iterated over {len(pubmed_dataset)} examples (about {size_gb:.1f} GB) in "
    f"{time:.1f}s, i.e. {size_gb/time:.3f} GB/s"
)

In [None]:
# Re-open the same data files in streaming mode
# With streaming=True the examples are read on the fly instead of the whole
# dataset being loaded up front, which keeps memory usage low
pubmed_dataset_streamed = load_dataset(
    "json",
    streaming=True,
    split="train",
    data_files=data_files,
)

In [None]:
# Access the first example from the streamed dataset
# Uses an iterator pattern instead of direct indexing: iter() starts a pass
# over the stream and next() pulls just the first element from it
next(iter(pubmed_dataset_streamed))

In [None]:
# Tokenize the streamed examples on the fly
# map() on a streamed dataset is applied as examples are read, so the full
# dataset never has to be loaded to transform it
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
tokenized_dataset = pubmed_dataset_streamed.map(
    lambda example: tokenizer(example["text"])
)
# Pull one element to see what a tokenized example looks like
next(iter(tokenized_dataset))

In [None]:
# Randomize the order of the streamed examples for training
# buffer_size sets how many examples are held for shuffling at a time: a
# larger buffer randomizes better but costs more memory
# The fixed seed keeps the shuffled order reproducible across runs
shuffled_dataset = pubmed_dataset_streamed.shuffle(seed=42, buffer_size=10_000)
next(iter(shuffled_dataset))

In [None]:
# Take a small subset for inspection or testing
# Useful for getting a sample of a large streamed dataset
# take(5) keeps only the first 5 examples of the stream; list() then
# materializes them so the notebook can display them
dataset_head = pubmed_dataset_streamed.take(5)
list(dataset_head)

In [None]:
# Create train/validation splits from streamed data
# skip() and take() allow splitting without loading the full dataset
# Both calls must use the same count for the two splits to be disjoint, so the
# size is defined once instead of being repeated as a literal
VALIDATION_SIZE = 1_000
# Skip the first VALIDATION_SIZE examples and include the rest in the training set
train_dataset = shuffled_dataset.skip(VALIDATION_SIZE)
# Take the first VALIDATION_SIZE examples for the validation set
validation_dataset = shuffled_dataset.take(VALIDATION_SIZE)

In [None]:
# Stream a second large corpus: FreeLaw court opinions and legal documents
# NOTE(review): this URL points at third-party hosting; confirm it is still
# reachable before relying on this cell
law_dataset_streamed = load_dataset(
    "json",
    streaming=True,
    split="train",
    data_files="https://the-eye.eu/public/AI/pile_preliminary_components/FreeLaw_Opinions.jsonl.zst",
)
# Pull the first example to check the stream works and see its structure
next(iter(law_dataset_streamed))

In [None]:
# Combine multiple streamed datasets
# interleave_datasets alternates between datasets for mixed training
from itertools import islice
from datasets import interleave_datasets

# Merge the PubMed and FreeLaw streams into a single stream
combined_dataset = interleave_datasets([pubmed_dataset_streamed, law_dataset_streamed])
# islice stops after the first 2 elements, so only those are read and displayed
list(islice(combined_dataset, 2))

In [None]:
# Stream the complete Pile corpus, one of the largest open-source text datasets
# Shows how a dict of data files maps onto train/validation/test splits
# NOTE(review): this URL points at third-party hosting; confirm it is still
# reachable before relying on this cell
base_url = "https://the-eye.eu/public/AI/pile/"
# Note: this rebinds `data_files`, which earlier cells used for the PubMed URL
data_files = {
    # Training shards are numbered 00 through 29
    "train": [f"{base_url}train/{idx:02d}.jsonl.zst" for idx in range(30)],
    "validation": f"{base_url}val.jsonl.zst",
    "test": f"{base_url}test.jsonl.zst",
}
# No `split` argument here, so the result is indexed by split name below
pile_dataset = load_dataset("json", data_files=data_files, streaming=True)
next(iter(pile_dataset["train"]))