# What if my dataset isn't on the Hub?

Install the Transformers, Datasets, and Evaluate libraries to run this notebook.

This notebook demonstrates how to work with datasets that aren't available on the Hugging Face Hub, including loading local files and remote datasets.

In [None]:
# Install required packages for working with datasets
# - datasets: Core library for loading and processing datasets
# - evaluate: Library for evaluation metrics
# - transformers[sentencepiece]: Transformers library with SentencePiece tokenizer support
!uv pip install datasets evaluate transformers[sentencepiece]

In [None]:
# Download sample dataset files for demonstration
# Using wget to download Italian SQuAD dataset files from GitHub
# These are compressed JSON files containing question-answering data
!wget https://github.com/crux82/squad-it/raw/master/SQuAD_it-train.json.gz
!wget https://github.com/crux82/squad-it/raw/master/SQuAD_it-test.json.gz

In [None]:
# Extract the compressed JSON files
# -d: decompress, -k: keep original files, -v: verbose output
!gzip -dkv SQuAD_it-*.json.gz

In [None]:
# Read the extracted training file through the generic "json" loader.
# Arguments:
# - "json": the dataset format/loader to use
# - data_files: path of the local JSON file to read
# - field="data": the JSON field that holds the actual records
from datasets import load_dataset

squad_it_dataset = load_dataset(
    "json",
    data_files="SQuAD_it-train.json",
    field="data",
)

In [None]:
# Show the loaded object as the cell output.
# Its repr summarizes the structure: the available splits, their features
# (column names), and the number of rows in each.
squad_it_dataset

In [None]:
# Look at the first record of the "train" split (the split that holds the
# single file loaded above).
# The record shows the nested structure typical of SQuAD-format datasets:
# - title: article title
# - paragraphs: list containing context and question-answer pairs
squad_it_dataset["train"][0]

In [None]:
# Load both extracted files in one call to get train and test splits.
# data_files maps each split name to the file that backs it, so the result
# is a DatasetDict with a "train" and a "test" entry.
data_files = {
    "train": "SQuAD_it-train.json",
    "test": "SQuAD_it-test.json",
}
squad_it_dataset = load_dataset(
    "json",
    data_files=data_files,
    field="data",
)
squad_it_dataset

In [None]:
# Point the loader at the .gz archives directly: the datasets library
# decompresses them automatically, so the manual gzip step (and the extra
# disk space taken by the extracted copies) is not needed.
data_files = {
    "train": "SQuAD_it-train.json.gz",
    "test": "SQuAD_it-test.json.gz",
}
squad_it_dataset = load_dataset(
    "json",
    data_files=data_files,
    field="data",
)

In [None]:
# Load the same splits straight from their remote URLs.
# The library fetches the files itself, so no manual wget step is required;
# handy for large datasets or when working in cloud environments.
url = "https://github.com/crux82/squad-it/raw/master/"
data_files = {
    "train": f"{url}SQuAD_it-train.json.gz",
    "test": f"{url}SQuAD_it-test.json.gz",
}
squad_it_dataset = load_dataset(
    "json",
    data_files=data_files,
    field="data",
)