### Imports and Setup

In [None]:
import os
import json
import hashlib
from typing import List

### Load the Datasets from Disk

In [None]:
def load_dataset_from_directory(directory_path: str) -> List[dict]:
    """Load every ``.json`` file in a directory into a list of parsed objects.

    Files are read in sorted filename order so the returned list is the same
    on every run and platform. ``os.listdir`` alone yields entries in
    arbitrary order, which would make any order-sensitive value derived from
    the result (such as a Merkle root hash) unstable.

    Args:
        directory_path: Directory to scan. Sub-directories are not traversed.

    Returns:
        The parsed content of each ``*.json`` file, ordered by filename.
        An empty list if ``directory_path`` is not an existing directory.

    Raises:
        json.JSONDecodeError: If a matching file does not contain valid JSON.
    """
    # Guard clause: a missing path (or a path that is a regular file) simply
    # yields an empty dataset.
    if not os.path.isdir(directory_path):
        return []

    dataset = []
    for filename in sorted(os.listdir(directory_path)):
        if not filename.endswith('.json'):
            continue
        file_path = os.path.join(directory_path, filename)
        # Decode explicitly as UTF-8 rather than relying on the platform's
        # locale-dependent default encoding.
        with open(file_path, 'r', encoding='utf-8') as f:
            dataset.append(json.load(f))
    return dataset

# Root folder that holds one sub-folder per corpus
data_dir = os.path.join("data")

# Resolve the location of each corpus up front
wiki_path = os.path.join(data_dir, "wikipedia")
books_path = os.path.join(data_dir, "bookcorpus")

# Read the Wikipedia articles and report how many were found
wiki_dataset = load_dataset_from_directory(wiki_path)
print(f"Loaded {len(wiki_dataset)} Wikipedia articles")

# Read the BookCorpus entries and report how many were found
books_dataset = load_dataset_from_directory(books_path)
print(f"Loaded {len(books_dataset)} books")

### Create the Dataset Hashes

In [None]:
def hash_content(content: str) -> str:
    """Return the hexadecimal SHA-256 digest of ``content``.

    The string is encoded with ``str.encode()``'s default (UTF-8) before
    being hashed.
    """
    hasher = hashlib.sha256()
    hasher.update(content.encode())
    return hasher.hexdigest()

def create_merkle_tree(data_list: List[str]) -> str:
    """Reduce a list of items to a single Merkle root hash.

    Each item is converted with ``str()`` and hashed to form a leaf. Adjacent
    hashes are then concatenated pairwise and re-hashed, level by level,
    until one hash remains. A level with an odd number of hashes has its
    last hash paired with itself.

    Args:
        data_list: Items to hash, in the order they should appear as leaves.

    Returns:
        The root hash as a hex string. For empty input, the hash of the
        empty string.
    """
    if not data_list:
        return hash_content("")

    # Leaf level: one hash per input item.
    level = [hash_content(str(item)) for item in data_list]

    # Collapse one level at a time until only the root is left.
    while len(level) > 1:
        if len(level) % 2:
            # Odd count: pair the trailing hash with a copy of itself.
            level.append(level[-1])
        level = [
            hash_content(left + right)
            for left, right in zip(level[0::2], level[1::2])
        ]

    return level[0]

# Read the Wikipedia corpus from disk and reduce it to one Merkle root
wiki_data = load_dataset_from_directory(wiki_path)
wiki_hash = create_merkle_tree(wiki_data)
print(f"Wikipedia dataset hash: {wiki_hash}")

# Same for the BookCorpus corpus
books_data = load_dataset_from_directory(books_path)
books_hash = create_merkle_tree(books_data)
print(f"BookCorpus dataset hash: {books_hash}")

# Collect both root hashes, keyed by dataset name
dataset_hashes = dict(wikipedia=wiki_hash, bookcorpus=books_hash)

# Write the hashes next to the datasets as indented JSON
hash_file_path = os.path.join("data", "dataset_hashes.json")
with open(hash_file_path, "w") as f:
    f.write(json.dumps(dataset_hashes, indent=4))

print(f"Dataset hashes saved to: {hash_file_path}")