### Imports and Setup

In [1]:
import hashlib
import json
import os
from typing import List, Optional

from datasets import load_dataset
from huggingface_hub import list_datasets
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


### Download and Save the Datasets

In [4]:
def download_and_save_dataset(dataset_name: str, subset: Optional[str] = None, batch_size: int = 1000) -> None:
    """Stream the 'train' split of a dataset and write it to a JSON Lines file.

    Args:
        dataset_name: Dataset identifier passed to ``load_dataset``.
        subset: Optional configuration (subset) name, e.g. "20220301.en".
            A falsy value means "no configuration".
        batch_size: Number of serialized records buffered before they are
            written to the file. Values below 1 flush after every record.

    Side effects:
        Creates the parent directory of ``./data/{dataset_name}_dataset.jsonl``
        if needed, (re)writes that file with one JSON object per line, and
        prints the destination path when finished.
    """
    # `load_dataset` takes the configuration as its second parameter, `name`.
    # It has no `subset` parameter, so the configuration is passed as `name=`.
    dataset = load_dataset(dataset_name, name=subset or None, split='train', streaming=True)

    # Destination is relative to the current working directory.
    save_file = f"./data/{dataset_name}_dataset.jsonl"

    # Ensure the directory exists (also covers ids that contain "/").
    os.makedirs(os.path.dirname(save_file), exist_ok=True)

    # The stream always restarts from the first record, so the file is
    # truncated rather than appended to: re-running after an interrupted
    # download must not leave duplicated rows behind.
    with open(save_file, 'w', encoding='utf-8') as f:
        batch = []
        for example in dataset:
            batch.append(json.dumps(example) + "\n")
            if len(batch) >= batch_size:
                f.writelines(batch)
                batch.clear()

        # Flush whatever is left from the final, partially filled batch.
        f.writelines(batch)

    print(f"Dataset saved to {save_file}")
# Candidate corpora; the approximate download size is noted beside each entry.
"""
Dataset Descriptions:
- tiny_shakespeare (~450 KB): A very small dataset ideal for testing or experimentation.
- bookcorpus (~6 GB): Text extracted from books, used in LLM pretraining.
- wikipedia (subset: 20220301.en, ~16 GB): English Wikipedia dump as of March 2022.
- openwebtext (~40 GB): Open-source recreation of the WebText dataset.
- the_pile (~825 GB): A massive dataset designed for large language model training.
- c4 (subset: en, ~750 GB): Cleaned version of Common Crawl, frequently used for large-scale LLM training.
- oscar (subset: unshuffled_deduplicated_en, ~1.3 TB): A multilingual web corpus.
- common_crawl (Several TBs): Raw web crawl data; storage needs depend on the specific subset downloaded.
"""
# Only uncommented entries are downloaded; enable the others deliberately,
# since several of them are hundreds of gigabytes.
datasets_to_download = [
    # {"name": "tiny_shakespeare"},  # ~450 KB
    {"name": "bookcorpus"},  # ~6 GB
    # {"name": "wikipedia", "subset": "20220301.en"},  # ~16 GB
    # {"name": "openwebtext"},  # ~40 GB
    # {"name": "the_pile"},  # ~825 GB
    # {"name": "c4", "subset": "en"},  # ~750 GB
    # {"name": "oscar", "subset": "unshuffled_deduplicated_en"},  # ~1.3 TB
    # {"name": "common_crawl"},  # Several TBs
]

# Fetch each enabled entry; "subset" is optional and resolves to None when absent.
for dataset_info in datasets_to_download:
    name, subset = dataset_info["name"], dataset_info.get("subset")
    download_and_save_dataset(name, subset)


Downloading data:   8%|▊         | 94.5M/1.18G [06:23<1:13:26, 246kB/s] 
Downloading data:  35%|███▌      | 415M/1.18G [12:06<22:19, 571kB/s] 



KeyboardInterrupt: 

Downloading data:   0%|          | 1.23M/1.18G [00:04<54:12, 362kB/s][A



In [2]:


# List all available datasets
# NOTE(review): list(...) drains everything list_datasets() yields before the
# first print runs; presumably slow for the whole Hub — confirm, or bound it.
datasets = list(list_datasets())
print(f"Total datasets available: {len(datasets)}")
print("Some example datasets:")
# Show only the first ten ids as a sample of what is available.
for dataset in datasets[:10]:
    print(dataset.id)

# Prompt user to select a dataset
# input() blocks until something is typed, so this cell cannot run unattended
# (e.g. under "Restart Kernel -> Run All").
selected_dataset = input("Enter the ID of the dataset you want to download: ")

# Function to download the selected dataset
def download_selected_dataset(dataset_id: str):
    """Announce and load the dataset identified by ``dataset_id``.

    Prints a one-line progress message, then returns whatever
    ``load_dataset`` produces when called with the id alone (no split or
    configuration is passed).
    """
    print(f"Downloading dataset: {dataset_id}")
    return load_dataset(dataset_id)

# Download the selected dataset
# Uses the id typed at the prompt above; the helper passes only that id to
# load_dataset, so no split or configuration is selected here.
downloaded_dataset = download_selected_dataset(selected_dataset)

def save_dataset_incrementally(dataset_name: str, split: str, save_path: str, num_samples: int = 100,
                               config_name: Optional[str] = None):
    """Stream a dataset and save its first samples as individual JSON files.

    Each sample is written to its own file as soon as it arrives, so samples
    already on disk survive an error or interruption later in the stream.

    Args:
        dataset_name: Dataset identifier passed to ``load_dataset``.
        split: Split to stream (e.g. "train").
        save_path: Directory that receives the files; created if missing.
        num_samples: Maximum number of samples to take from the stream.
        config_name: Optional dataset configuration (subset) name, forwarded
            to ``load_dataset`` as ``name``. ``None`` keeps the previous
            behaviour of loading without a configuration.

    Side effects:
        Writes ``{dataset_name}_sample_{i}.json`` files (with any "/" in the
        dataset name replaced by "_") and prints one progress line per sample.
    """
    dataset = load_dataset(dataset_name, name=config_name, split=split, streaming=True)
    os.makedirs(save_path, exist_ok=True)

    # Ids of the form "org/name" contain a path separator; used verbatim in a
    # file name they would point into a subdirectory that does not exist.
    file_stem = dataset_name.replace("/", "_")

    for i, sample in enumerate(tqdm(dataset.take(num_samples), total=num_samples, desc=f"Saving {dataset_name}")):
        sample_file = os.path.join(save_path, f"{file_stem}_sample_{i}.json")
        with open(sample_file, "w", encoding="utf-8") as f:
            json.dump(sample, f)
        print(f"Saved {dataset_name} sample {i+1}/{num_samples}")

# Load datasets from data directory
# Root folder (relative to the working directory) for the per-dataset sample folders.
data_dir = os.path.join("data")

# Save BookCorpus dataset incrementally
books_path = os.path.join(data_dir, "bookcorpus")
save_dataset_incrementally("bookcorpus", "train", books_path)

# Save Wikipedia dataset incrementally
wiki_path = os.path.join(data_dir, "wikipedia")
# NOTE(review): the second positional argument of save_dataset_incrementally is
# `split`, but "20220301.en" is listed earlier in this notebook as a wikipedia
# *subset*; it looks like a configuration name is being passed as the split —
# confirm and pass a real split (e.g. "train") once the helper can take a config.
save_dataset_incrementally("wikipedia", "20220301.en", wiki_path)

KeyboardInterrupt: 

### Create the Dataset Hashes
This section computes a Merkle-tree root hash (SHA-256) for each dataset and saves the resulting hashes to `data/dataset_hashes.json`.

In [None]:
def hash_content(content: str) -> str:
    """Return the hexadecimal SHA-256 digest of ``content``.

    The string is encoded with ``str.encode()``'s default codec before hashing.
    """
    digest = hashlib.sha256()
    digest.update(content.encode())
    return digest.hexdigest()

def create_merkle_tree(data_list: List[str]) -> str:
    """Compute the Merkle root of ``data_list`` and return it as a hex string.

    Every item is converted with ``str()`` and hashed to form a leaf. Levels
    are then folded pairwise (left hash concatenated with right hash, hashed
    again) until one hash remains. A level with an odd number of nodes has its
    last node paired with a copy of itself. An empty input yields the hash of
    the empty string.
    """
    if not data_list:
        return hash_content("")

    # Leaves: one hash per input item.
    level = [hash_content(str(item)) for item in data_list]

    while len(level) > 1:
        # Pad odd-sized levels by repeating the final node.
        if len(level) & 1:
            level = level + [level[-1]]

        # Pair neighbours (0,1), (2,3), ... and hash each concatenation.
        level = [
            hash_content(left + right)
            for left, right in zip(level[0::2], level[1::2])
        ]

    return level[0]

# Load and hash Wikipedia dataset
# NOTE(review): load_dataset_from_directory has no definition in the visible
# part of this notebook; unless it is defined elsewhere this raises NameError.
# wiki_path / books_path come from the sample-saving cell above, so that cell
# must have run to completion first.
wiki_data = load_dataset_from_directory(wiki_path)
wiki_hash = create_merkle_tree(wiki_data)
print(f"Wikipedia dataset hash: {wiki_hash}")

# Load and hash BookCorpus dataset
books_data = load_dataset_from_directory(books_path)
books_hash = create_merkle_tree(books_data)
print(f"BookCorpus dataset hash: {books_hash}")

# Save dataset hashes to a JSON file
# Maps each dataset label to its Merkle root (hex string).
dataset_hashes = {
    "wikipedia": wiki_hash,
    "bookcorpus": books_hash
}

# Written next to the sample folders, under the relative "data" directory;
# opening with "w" replaces any previous hash file.
hash_file_path = os.path.join("data", "dataset_hashes.json")
with open(hash_file_path, "w") as f:
    json.dump(dataset_hashes, f, indent=4)

print(f"Dataset hashes saved to: {hash_file_path}")