# Initial Imports

In [1]:
from bigdata_a3_utils import *
import shutil
import os
from pathlib import Path
import logging
import tarfile
import zipfile

In [2]:
cache_dir = get_cache_directory()

[INFO] Current cache directory: C:\Users\saeed\.cache\huggingface\datasets
[NOTE] To use a custom cache directory, set HF_DATASETS_CACHE before importing datasets.
Example:
    import os
    os.environ['HF_DATASETS_CACHE'] = 'C:\\your\\custom\\path'
    from datasets import load_dataset



In [3]:
default_cache_dir = default_cache_path()

[INFO] Your default cache path: "C:\Users\saeed\.cache\huggingface\datasets"


# Checking availble disk space
- Files could also be stored on external drive
- Simply change file path in save_path variable

In [4]:
# Get the disk where your cache directory is located
cache_path = cache_dir
drive = os.path.splitdrive(cache_path)[0]

# Get disk usage statistics
total, used, free = shutil.disk_usage(drive)

# Convert to GB for easier reading
total_gb = total / (1024**3)
used_gb = used / (1024**3)
free_gb = free / (1024**3)

print(f"Drive {drive}:")
print(f"Total: {total_gb:.2f} GB")
print(f"Used: {used_gb:.2f} GB")
print(f"Free: {free_gb:.2f} GB")

# Check if you have enough space (e.g., 220GB to be safe)
required_space = 220  # GB
if free_gb > required_space:
    print(f"✅ You have enough space for the 200GB dataset (with some buffer)")
else:
    print(f"❌ Not enough space! You need at least {required_space} GB, but only have {free_gb:.2f} GB free")
    print(f"Consider setting a custom cache directory on a drive with more space")

Drive C::
Total: 474.72 GB
Used: 432.53 GB
Free: 42.19 GB
❌ Not enough space! You need at least 220 GB, but only have 42.19 GB free
Consider setting a custom cache directory on a drive with more space


# Division of categories into subsets per person:


In [5]:
subset_sizes = [11, 11, 12]  # Define the sizes of each subset

# Create subsets
subsets = []
start_index = 0
for size in subset_sizes:
    subsets.append(VALID_CATEGORIES[start_index:start_index + size])
    start_index += size

# Unpack the subsets into individual variables
kailash_subset, saeed_subset1, johnny_subset2 = subsets

print(f"Subset list: {subsets}")
print(f"Length: {len(VALID_CATEGORIES)}")
# Output the subsets


Subset list: [['All_Beauty', 'Amazon_Fashion', 'Appliances', 'Arts_Crafts_and_Sewing', 'Automotive', 'Baby_Products', 'Beauty_and_Personal_Care', 'Books', 'CDs_and_Vinyl', 'Cell_Phones_and_Accessories', 'Clothing_Shoes_and_Jewelry'], ['Digital_Music', 'Electronics', 'Gift_Cards', 'Grocery_and_Gourmet_Food', 'Handmade_Products', 'Health_and_Household', 'Health_and_Personal_Care', 'Home_and_Kitchen', 'Industrial_and_Scientific', 'Kindle_Store', 'Magazine_Subscriptions'], ['Movies_and_TV', 'Musical_Instruments', 'Office_Products', 'Patio_Lawn_and_Garden', 'Pet_Supplies', 'Software', 'Sports_and_Outdoors', 'Subscription_Boxes', 'Tools_and_Home_Improvement', 'Toys_and_Games', 'Video_Games', 'Unknown']]
Length: 34


# Setting up log directory and logging system
- Used for proof of file acquisition

In [6]:
log_dir = Path("logs")
log_dir.mkdir(parents=True, exist_ok=True)

logging.basicConfig(
    filename=log_dir / 'download_log.log',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)

# Error handling:
- In the event of an issue where the folder was not dowloaded properly
- In the event of a issue with preprocessing any particular category (OS error, Permissions error, Temp file being deleted before full extraction) it is likely the initial download was corrupted by some means. As such simply redownload via the last cell, again specifying the failed category

In [7]:
def verify_compressed_file(file_path):
    """
    Verifies if a compressed file (tar.gz, zip, etc.) is valid and not corrupted.
    
    Args:
        file_path: Path to the compressed file
        
    Returns:
        bool: True if the file is valid, False otherwise
    """
    file_path = Path(file_path)
    
    if not file_path.exists():
        print(f"Error: File does not exist: {file_path}")
        return False
        
    try:
        if str(file_path).endswith('.tar.gz') or str(file_path).endswith('.tgz'):
            # Test if tar.gz file is valid
            with tarfile.open(file_path, 'r:gz') as tar:
                # Just check the integrity by attempting to list contents
                tar.getnames()
                print(f"✓ Verified {file_path} is a valid tar.gz file")
                return True
                
        else:
            print(f"Error: Unsupported file format for {file_path}")
            return False
            
    except Exception as e:
        print(f"Error verifying {file_path}: {str(e)}")
        return False

# Downloading the folders and metadata
- Just change the file path in save_path accordingly to where you want it stored
- VALID_CATEGORIES will download all metadata and review data for all 34 categories, otherwise choose a subset from above to work with. 

In [11]:
save_path = Path(r"D:\BigData")
save_path.mkdir(parents=True, exist_ok=True)

for category in VALID_CATEGORIES:
    logging.info(f"n------Processing {category}------")
    print(f"\n------Processing {category}------")

    download_all_amazon_reviews(base_save_path=save_path, 
                                categories=[category], compress=False)

    review_folder = save_path / f"raw_review_{category}"
    metadata_folder = save_path / f"raw_meta_{category}"

    if review_folder.exists():
        compressed = compress_folder(review_folder, 
                                        compression_format="gz", 
                                        level=6)
        
        logging.info(f"Compressed reviews to: {compressed}")
        print(f"Compressed reviews to: {compressed}")

    if metadata_folder.exists():
        compressed = compress_folder(metadata_folder, 
                                        compression_format="gz", 
                                        level=6)
        
        logging.info(f"Compressed metadata to: {compressed}")
        print(f"Compressed metadata to: {compressed}")

    
    verify_compressed_file(review_folder)
    verify_compressed_file(metadata_folder) 


------Processing All_Beauty------
⠋ Processing All_Beauty 

README.md:   0%|          | 0.00/30.3k [00:00<?, ?B/s]

⠹ Processing All_Beauty 

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


⠦ Processing All_Beauty 

Amazon-Reviews-2023.py:   0%|          | 0.00/39.6k [00:00<?, ?B/s]

⠦ Processing All_Beauty 

All_Beauty.jsonl:   0%|          | 0.00/327M [00:00<?, ?B/s]

⠸ Processing All_Beauty 

Generating full split: 0 examples [00:00, ? examples/s]

⠼ Processing All_Beauty 

Saving the dataset (0/1 shards):   0%|          | 0/701528 [00:00<?, ? examples/s]

[DONE] raw_review_All_Beauty downloaded
⠸ Processing All_Beauty 

meta_All_Beauty.jsonl:   0%|          | 0.00/213M [00:00<?, ?B/s]

⠦ Processing All_Beauty 

Generating full split:   0%|          | 0/112590 [00:00<?, ? examples/s]

⠋ Processing All_Beauty 

Saving the dataset (0/1 shards):   0%|          | 0/112590 [00:00<?, ? examples/s]

[DONE] raw_meta_All_Beauty downloaded
✅ Processing All_Beauty

🎉 Download summary:
  - Successfully processed: 1/1 categories
Compressed reviews to: D:\BigData\raw_review_All_Beauty.tar.gz
Compressed metadata to: D:\BigData\raw_meta_All_Beauty.tar.gz
Error: File does not exist: D:\BigData\raw_review_All_Beauty
Error: File does not exist: D:\BigData\raw_meta_All_Beauty

------Processing Amazon_Fashion------
⠼ Processing Amazon_Fashion 

Amazon_Fashion.jsonl:   0%|          | 0.00/1.05G [00:00<?, ?B/s]

⠦ Processing Amazon_Fashion 

Generating full split: 0 examples [00:00, ? examples/s]

⠏ Processing Amazon_Fashion 

Saving the dataset (0/2 shards):   0%|          | 0/2500939 [00:00<?, ? examples/s]

[DONE] raw_review_Amazon_Fashion downloaded
⠙ Processing Amazon_Fashion 

meta_Amazon_Fashion.jsonl:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

⠦ Processing Amazon_Fashion 

Generating full split: 0 examples [00:00, ? examples/s]

⠦ Processing Amazon_Fashion 

Saving the dataset (0/3 shards):   0%|          | 0/826108 [00:00<?, ? examples/s]

[DONE] raw_meta_Amazon_Fashion downloaded
✅ Processing Amazon_Fashion

🎉 Download summary:
  - Successfully processed: 1/1 categories
Compressed reviews to: D:\BigData\raw_review_Amazon_Fashion.tar.gz
Compressed metadata to: D:\BigData\raw_meta_Amazon_Fashion.tar.gz
Error: File does not exist: D:\BigData\raw_review_Amazon_Fashion
Error: File does not exist: D:\BigData\raw_meta_Amazon_Fashion

------Processing Appliances------
⠼ Processing Appliances 

Appliances.jsonl:   0%|          | 0.00/929M [00:00<?, ?B/s]

⠇ Processing Appliances 

Generating full split: 0 examples [00:00, ? examples/s]

⠹ Processing Appliances 

Saving the dataset (0/2 shards):   0%|          | 0/2128605 [00:00<?, ? examples/s]

[DONE] raw_review_Appliances downloaded
⠸ Processing Appliances 

meta_Appliances.jsonl:   0%|          | 0.00/285M [00:00<?, ?B/s]

⠹ Processing Appliances 

Generating full split: 0 examples [00:00, ? examples/s]

⠹ Processing Appliances 

Saving the dataset (0/1 shards):   0%|          | 0/94327 [00:00<?, ? examples/s]

[DONE] raw_meta_Appliances downloaded
✅ Processing Appliances

🎉 Download summary:
  - Successfully processed: 1/1 categories
Compressed reviews to: D:\BigData\raw_review_Appliances.tar.gz
Compressed metadata to: D:\BigData\raw_meta_Appliances.tar.gz
Error: File does not exist: D:\BigData\raw_review_Appliances
Error: File does not exist: D:\BigData\raw_meta_Appliances

------Processing Arts_Crafts_and_Sewing------
⠙ Processing Arts_Crafts_and_Sewing 

Arts_Crafts_and_Sewing.jsonl:   0%|          | 0.00/3.95G [00:00<?, ?B/s]

⠏ Processing Arts_Crafts_and_Sewing 

Generating full split: 0 examples [00:00, ? examples/s]

⠋ Processing Arts_Crafts_and_Sewing 

Saving the dataset (0/6 shards):   0%|          | 0/8966758 [00:00<?, ? examples/s]

[DONE] raw_review_Arts_Crafts_and_Sewing downloaded
⠋ Processing Arts_Crafts_and_Sewing 

meta_Arts_Crafts_and_Sewing.jsonl:   0%|          | 0.00/2.17G [00:00<?, ?B/s]

⠦ Processing Arts_Crafts_and_Sewing 

Generating full split:   0%|          | 0/801446 [00:00<?, ? examples/s]

⠙ Processing Arts_Crafts_and_Sewing 

Saving the dataset (0/4 shards):   0%|          | 0/801446 [00:00<?, ? examples/s]

[DONE] raw_meta_Arts_Crafts_and_Sewing downloaded
✅ Processing Arts_Crafts_and_Sewing

🎉 Download summary:
  - Successfully processed: 1/1 categories
Compressed reviews to: D:\BigData\raw_review_Arts_Crafts_and_Sewing.tar.gz
Compressed metadata to: D:\BigData\raw_meta_Arts_Crafts_and_Sewing.tar.gz
Error: File does not exist: D:\BigData\raw_review_Arts_Crafts_and_Sewing
Error: File does not exist: D:\BigData\raw_meta_Arts_Crafts_and_Sewing

------Processing Automotive------
⠇ Processing Automotive 

Automotive.jsonl:   0%|          | 0.00/8.73G [00:00<?, ?B/s]

⠹ Processing Automotive 

Generating full split: 0 examples [00:00, ? examples/s]

⠏ Processing Automotive 

Saving the dataset (0/13 shards):   0%|          | 0/19955450 [00:00<?, ? examples/s]

[DONE] raw_review_Automotive downloaded
⠦ Processing Automotive 

meta_Automotive.jsonl:   0%|          | 0.00/5.35G [00:00<?, ?B/s]

⠼ Processing Automotive 

Generating full split: 0 examples [00:00, ? examples/s]

⠼ Processing Automotive 

Saving the dataset (0/10 shards):   0%|          | 0/2003129 [00:00<?, ? examples/s]

[DONE] raw_meta_Automotive downloaded
✅ Processing Automotive

🎉 Download summary:
  - Successfully processed: 1/1 categories
Compressed reviews to: D:\BigData\raw_review_Automotive.tar.gz
Compressed metadata to: D:\BigData\raw_meta_Automotive.tar.gz
Error: File does not exist: D:\BigData\raw_review_Automotive
Error: File does not exist: D:\BigData\raw_meta_Automotive

------Processing Baby_Products------
⠋ Processing Baby_Products 

Baby_Products.jsonl:   0%|          | 0.00/2.95G [00:00<?, ?B/s]

⠼ Processing Baby_Products 

Generating full split: 0 examples [00:00, ? examples/s]

⠇ Processing Baby_Products 

Saving the dataset (0/5 shards):   0%|          | 0/6028884 [00:00<?, ? examples/s]

[DONE] raw_review_Baby_Products downloaded
⠇ Processing Baby_Products 

meta_Baby_Products.jsonl:   0%|          | 0.00/691M [00:00<?, ?B/s]

⠸ Processing Baby_Products 

Generating full split: 0 examples [00:00, ? examples/s]

⠇ Processing Baby_Products 

Saving the dataset (0/2 shards):   0%|          | 0/217724 [00:00<?, ? examples/s]

[DONE] raw_meta_Baby_Products downloaded
✅ Processing Baby_Products

🎉 Download summary:
  - Successfully processed: 1/1 categories
Compressed reviews to: D:\BigData\raw_review_Baby_Products.tar.gz
Compressed metadata to: D:\BigData\raw_meta_Baby_Products.tar.gz
Error: File does not exist: D:\BigData\raw_review_Baby_Products
Error: File does not exist: D:\BigData\raw_meta_Baby_Products

------Processing Beauty_and_Personal_Care------
⠴ Processing Beauty_and_Personal_Care 

Beauty_and_Personal_Care.jsonl:   0%|          | 0.00/11.0G [00:00<?, ?B/s]

⠋ Processing Beauty_and_Personal_Care 

Generating full split: 0 examples [00:00, ? examples/s]

⠹ Processing Beauty_and_Personal_Care 

Saving the dataset (0/16 shards):   0%|          | 0/23911390 [00:00<?, ? examples/s]

[DONE] raw_review_Beauty_and_Personal_Care downloaded
💥 Processing Beauty_and_Personal_Care
Failed to process category 'Beauty_and_Personal_Care': (ReadTimeoutError("HTTPSConnectionPool(host='cdn-lfs-us-1.hf.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 8a0b3603-fc76-4cb4-8eac-ed1d106f99e2)')

🎉 Download summary:
  - Successfully processed: 0/1 categories
  - Failed: 1/1 categories
    - Beauty_and_Personal_Care: (ReadTimeoutError("HTTPSConnectionPool(host='cdn-lfs-us-1.hf.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 8a0b3603-fc76-4cb4-8eac-ed1d106f99e2)')
Compressed reviews to: D:\BigData\raw_review_Beauty_and_Personal_Care.tar.gz
Error: File does not exist: D:\BigData\raw_review_Beauty_and_Personal_Care
Error: File does not exist: D:\BigData\raw_meta_Beauty_and_Personal_Care

------Processing Books------
⠹ Processing Books 

Books.jsonl:   0%|          | 0.00/20.1G [00:00<?, ?B/s]

⠴ Processing Books 

Generating full split: 0 examples [00:00, ? examples/s]

⠧ Processing Books 

Loading dataset shards:   0%|          | 0/33 [00:00<?, ?it/s]

⠏ Processing Books 

Saving the dataset (0/33 shards):   0%|          | 0/29475453 [00:00<?, ? examples/s]

[DONE] raw_review_Books downloaded
💥 Processing Books
Failed to process category 'Books': (ReadTimeoutError("HTTPSConnectionPool(host='cdn-lfs-us-1.hf.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 61c09b40-fd95-4b41-84e6-3c79264802bd)')

🎉 Download summary:
  - Successfully processed: 0/1 categories
  - Failed: 1/1 categories
    - Books: (ReadTimeoutError("HTTPSConnectionPool(host='cdn-lfs-us-1.hf.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 61c09b40-fd95-4b41-84e6-3c79264802bd)')
Compressed reviews to: D:\BigData\raw_review_Books.tar.gz
Error: File does not exist: D:\BigData\raw_review_Books
Error: File does not exist: D:\BigData\raw_meta_Books

------Processing CDs_and_Vinyl------
⠹ Processing CDs_and_Vinyl 

CDs_and_Vinyl.jsonl:   0%|          | 0.00/3.29G [00:00<?, ?B/s]

⠋ Processing CDs_and_Vinyl 

Generating full split: 0 examples [00:00, ? examples/s]

⠹ Processing CDs_and_Vinyl 

Saving the dataset (0/6 shards):   0%|          | 0/4827273 [00:00<?, ? examples/s]

[DONE] raw_review_CDs_and_Vinyl downloaded
⠧ Processing CDs_and_Vinyl 

meta_CDs_and_Vinyl.jsonl:   0%|          | 0.00/949M [00:00<?, ?B/s]

⠴ Processing CDs_and_Vinyl 

Generating full split: 0 examples [00:00, ? examples/s]

⠹ Processing CDs_and_Vinyl 

Saving the dataset (0/2 shards):   0%|          | 0/701959 [00:00<?, ? examples/s]

[DONE] raw_meta_CDs_and_Vinyl downloaded
✅ Processing CDs_and_Vinyl

🎉 Download summary:
  - Successfully processed: 1/1 categories
Compressed reviews to: D:\BigData\raw_review_CDs_and_Vinyl.tar.gz
Compressed metadata to: D:\BigData\raw_meta_CDs_and_Vinyl.tar.gz
Error: File does not exist: D:\BigData\raw_review_CDs_and_Vinyl
Error: File does not exist: D:\BigData\raw_meta_CDs_and_Vinyl

------Processing Cell_Phones_and_Accessories------
⠙ Processing Cell_Phones_and_Accessories 

Cell_Phones_and_Accessories.jsonl:   0%|          | 0.00/9.34G [00:00<?, ?B/s]

⠸ Processing Cell_Phones_and_Accessories 

Generating full split: 0 examples [00:00, ? examples/s]

⠇ Processing Cell_Phones_and_Accessories 

Saving the dataset (0/14 shards):   0%|          | 0/20812945 [00:00<?, ? examples/s]

[DONE] raw_review_Cell_Phones_and_Accessories downloaded
💥 Processing Cell_Phones_and_Accessories
Failed to process category 'Cell_Phones_and_Accessories': (ReadTimeoutError("HTTPSConnectionPool(host='cdn-lfs-us-1.hf.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 98b0bd5d-4953-4ed3-bb4d-44c7b78880ed)')

🎉 Download summary:
  - Successfully processed: 0/1 categories
  - Failed: 1/1 categories
    - Cell_Phones_and_Accessories: (ReadTimeoutError("HTTPSConnectionPool(host='cdn-lfs-us-1.hf.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 98b0bd5d-4953-4ed3-bb4d-44c7b78880ed)')
Compressed reviews to: D:\BigData\raw_review_Cell_Phones_and_Accessories.tar.gz
Error: File does not exist: D:\BigData\raw_review_Cell_Phones_and_Accessories
Error: File does not exist: D:\BigData\raw_meta_Cell_Phones_and_Accessories

------Processing Clothing_Shoes_and_Jewelry------
⠙ Processing Clothing_Shoes_and_Jewelry 

Clothing_Shoes_and_Jewelry.jsonl:   0%|          | 0.00/27.8G [00:00<?, ?B/s]

⠦ Processing Clothing_Shoes_and_Jewelry 

Generating full split: 0 examples [00:00, ? examples/s]

⠋ Processing Clothing_Shoes_and_Jewelry 

Loading dataset shards:   0%|          | 0/38 [00:00<?, ?it/s]

⠧ Processing Clothing_Shoes_and_Jewelry 

Saving the dataset (0/38 shards):   0%|          | 0/66033346 [00:00<?, ? examples/s]

[DONE] raw_review_Clothing_Shoes_and_Jewelry downloaded
💥 Processing Clothing_Shoes_and_Jewelry
Failed to process category 'Clothing_Shoes_and_Jewelry': (ReadTimeoutError("HTTPSConnectionPool(host='cdn-lfs-us-1.hf.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: d5424aaf-eddb-4e2c-8a1f-49e884d50a30)')

🎉 Download summary:
  - Successfully processed: 0/1 categories
  - Failed: 1/1 categories
    - Clothing_Shoes_and_Jewelry: (ReadTimeoutError("HTTPSConnectionPool(host='cdn-lfs-us-1.hf.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: d5424aaf-eddb-4e2c-8a1f-49e884d50a30)')
Compressed reviews to: D:\BigData\raw_review_Clothing_Shoes_and_Jewelry.tar.gz
Error: File does not exist: D:\BigData\raw_review_Clothing_Shoes_and_Jewelry
Error: File does not exist: D:\BigData\raw_meta_Clothing_Shoes_and_Jewelry

------Processing Digital_Music------
⠸ Processing Digital_Music 

Digital_Music.jsonl:   0%|          | 0.00/78.8M [00:00<?, ?B/s]

⠹ Processing Digital_Music 

Generating full split: 0 examples [00:00, ? examples/s]

⠹ Processing Digital_Music 

Saving the dataset (0/1 shards):   0%|          | 0/130434 [00:00<?, ? examples/s]

[DONE] raw_review_Digital_Music downloaded
⠋ Processing Digital_Music 

meta_Digital_Music.jsonl:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

⠹ Processing Digital_Music 

Generating full split: 0 examples [00:00, ? examples/s]

⠙ Processing Digital_Music 

Saving the dataset (0/1 shards):   0%|          | 0/70537 [00:00<?, ? examples/s]

[DONE] raw_meta_Digital_Music downloaded
✅ Processing Digital_Music 

🎉 Download summary:
  - Successfully processed: 1/1 categories
Compressed reviews to: D:\BigData\raw_review_Digital_Music.tar.gz
Compressed metadata to: D:\BigData\raw_meta_Digital_Music.tar.gz
Error: File does not exist: D:\BigData\raw_review_Digital_Music
Error: File does not exist: D:\BigData\raw_meta_Digital_Music

------Processing Electronics------
⠹ Processing Electronics 

Electronics.jsonl:   0%|          | 0.00/22.6G [00:00<?, ?B/s]

⠼ Processing Electronics 

Generating full split: 0 examples [00:00, ? examples/s]

⠋ Processing Electronics 

Loading dataset shards:   0%|          | 0/34 [00:00<?, ?it/s]

⠋ Processing Electronics 

Saving the dataset (0/34 shards):   0%|          | 0/43886944 [00:00<?, ? examples/s]

[DONE] raw_review_Electronics downloaded
💥 Processing Electronics
Failed to process category 'Electronics': (ReadTimeoutError("HTTPSConnectionPool(host='cdn-lfs-us-1.hf.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: b77d8c16-1906-4c3d-890a-b78b076383e9)')

🎉 Download summary:
  - Successfully processed: 0/1 categories
  - Failed: 1/1 categories
    - Electronics: (ReadTimeoutError("HTTPSConnectionPool(host='cdn-lfs-us-1.hf.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: b77d8c16-1906-4c3d-890a-b78b076383e9)')
Compressed reviews to: D:\BigData\raw_review_Electronics.tar.gz
Error: File does not exist: D:\BigData\raw_review_Electronics
Error: File does not exist: D:\BigData\raw_meta_Electronics

------Processing Gift_Cards------
⠹ Processing Gift_Cards 

Gift_Cards.jsonl:   0%|          | 0.00/50.2M [00:00<?, ?B/s]

⠸ Processing Gift_Cards 

Generating full split: 0 examples [00:00, ? examples/s]

⠼ Processing Gift_Cards 

Saving the dataset (0/1 shards):   0%|          | 0/152410 [00:00<?, ? examples/s]

[DONE] raw_review_Gift_Cards downloaded
⠼ Processing Gift_Cards 

meta_Gift_Cards.jsonl:   0%|          | 0.00/2.04M [00:00<?, ?B/s]

⠦ Processing Gift_Cards 

Generating full split:   0%|          | 0/1137 [00:00<?, ? examples/s]

⠇ Processing Gift_Cards 

Saving the dataset (0/1 shards):   0%|          | 0/1137 [00:00<?, ? examples/s]

[DONE] raw_meta_Gift_Cards downloaded
✅ Processing Gift_Cards

🎉 Download summary:
  - Successfully processed: 1/1 categories
Compressed reviews to: D:\BigData\raw_review_Gift_Cards.tar.gz
Compressed metadata to: D:\BigData\raw_meta_Gift_Cards.tar.gz
Error: File does not exist: D:\BigData\raw_review_Gift_Cards
Error: File does not exist: D:\BigData\raw_meta_Gift_Cards

------Processing Grocery_and_Gourmet_Food------
⠦ Processing Grocery_and_Gourmet_Food 

Grocery_and_Gourmet_Food.jsonl:   0%|          | 0.00/5.97G [00:00<?, ?B/s]

⠇ Processing Grocery_and_Gourmet_Food 

Generating full split: 0 examples [00:00, ? examples/s]

⠋ Processing Grocery_and_Gourmet_Food 

Saving the dataset (0/9 shards):   0%|          | 0/14318520 [00:00<?, ? examples/s]

[DONE] raw_review_Grocery_and_Gourmet_Food downloaded
⠙ Processing Grocery_and_Gourmet_Food 

meta_Grocery_and_Gourmet_Food.jsonl:   0%|          | 0.00/1.38G [00:00<?, ?B/s]

⠧ Processing Grocery_and_Gourmet_Food 

Generating full split: 0 examples [00:00, ? examples/s]

⠧ Processing Grocery_and_Gourmet_Food 

Saving the dataset (0/3 shards):   0%|          | 0/603274 [00:00<?, ? examples/s]

[DONE] raw_meta_Grocery_and_Gourmet_Food downloaded
✅ Processing Grocery_and_Gourmet_Food

🎉 Download summary:
  - Successfully processed: 1/1 categories
Compressed reviews to: D:\BigData\raw_review_Grocery_and_Gourmet_Food.tar.gz
Compressed metadata to: D:\BigData\raw_meta_Grocery_and_Gourmet_Food.tar.gz
Error: File does not exist: D:\BigData\raw_review_Grocery_and_Gourmet_Food
Error: File does not exist: D:\BigData\raw_meta_Grocery_and_Gourmet_Food

------Processing Handmade_Products------
⠸ Processing Handmade_Products 

Handmade_Products.jsonl:   0%|          | 0.00/289M [00:00<?, ?B/s]

⠹ Processing Handmade_Products 

Generating full split: 0 examples [00:00, ? examples/s]

⠋ Processing Handmade_Products 

Saving the dataset (0/1 shards):   0%|          | 0/664162 [00:00<?, ? examples/s]

[DONE] raw_review_Handmade_Products downloaded
⠙ Processing Handmade_Products 

meta_Handmade_Products.jsonl:   0%|          | 0.00/399M [00:00<?, ?B/s]

⠧ Processing Handmade_Products 

Generating full split:   0%|          | 0/164817 [00:00<?, ? examples/s]

⠙ Processing Handmade_Products 

Saving the dataset (0/1 shards):   0%|          | 0/164817 [00:00<?, ? examples/s]

[DONE] raw_meta_Handmade_Products downloaded
✅ Processing Handmade_Products

🎉 Download summary:
  - Successfully processed: 1/1 categories
Compressed reviews to: D:\BigData\raw_review_Handmade_Products.tar.gz
Compressed metadata to: D:\BigData\raw_meta_Handmade_Products.tar.gz
Error: File does not exist: D:\BigData\raw_review_Handmade_Products
Error: File does not exist: D:\BigData\raw_meta_Handmade_Products

------Processing Health_and_Household------
⠹ Processing Health_and_Household 

Health_and_Household.jsonl:   0%|          | 0.00/11.4G [00:00<?, ?B/s]

⠸ Processing Health_and_Household 

Generating full split: 0 examples [00:00, ? examples/s]

⠇ Processing Health_and_Household 

Saving the dataset (0/16 shards):   0%|          | 0/25631345 [00:00<?, ? examples/s]

[DONE] raw_review_Health_and_Household downloaded
💥 Processing Health_and_Household
Failed to process category 'Health_and_Household': (ReadTimeoutError("HTTPSConnectionPool(host='cdn-lfs-us-1.hf.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: ea419311-f1da-4914-ac8e-9173340639fb)')

🎉 Download summary:
  - Successfully processed: 0/1 categories
  - Failed: 1/1 categories
    - Health_and_Household: (ReadTimeoutError("HTTPSConnectionPool(host='cdn-lfs-us-1.hf.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: ea419311-f1da-4914-ac8e-9173340639fb)')
Compressed reviews to: D:\BigData\raw_review_Health_and_Household.tar.gz
Error: File does not exist: D:\BigData\raw_review_Health_and_Household
Error: File does not exist: D:\BigData\raw_meta_Health_and_Household

------Processing Health_and_Personal_Care------
⠋ Processing Health_and_Personal_Care 

Health_and_Personal_Care.jsonl:   0%|          | 0.00/227M [00:00<?, ?B/s]

⠋ Processing Health_and_Personal_Care 

Generating full split: 0 examples [00:00, ? examples/s]

⠏ Processing Health_and_Personal_Care 

Saving the dataset (0/1 shards):   0%|          | 0/494121 [00:00<?, ? examples/s]

[DONE] raw_review_Health_and_Personal_Care downloaded
⠴ Processing Health_and_Personal_Care 

meta_Health_and_Personal_Care.jsonl:   0%|          | 0.00/118M [00:00<?, ?B/s]

⠼ Processing Health_and_Personal_Care 

Generating full split: 0 examples [00:00, ? examples/s]

⠏ Processing Health_and_Personal_Care 

Saving the dataset (0/1 shards):   0%|          | 0/60293 [00:00<?, ? examples/s]

[DONE] raw_meta_Health_and_Personal_Care downloaded
✅ Processing Health_and_Personal_Care

🎉 Download summary:
  - Successfully processed: 1/1 categories
Compressed reviews to: D:\BigData\raw_review_Health_and_Personal_Care.tar.gz
Compressed metadata to: D:\BigData\raw_meta_Health_and_Personal_Care.tar.gz
Error: File does not exist: D:\BigData\raw_review_Health_and_Personal_Care
Error: File does not exist: D:\BigData\raw_meta_Health_and_Personal_Care

------Processing Home_and_Kitchen------
⠹ Processing Home_and_Kitchen 

Home_and_Kitchen.jsonl:   0%|          | 0.00/31.4G [00:00<?, ?B/s]

⠙ Processing Home_and_Kitchen 

Generating full split: 0 examples [00:00, ? examples/s]

⠦ Processing Home_and_Kitchen 

Loading dataset shards:   0%|          | 0/45 [00:00<?, ?it/s]

⠦ Processing Home_and_Kitchen 

Saving the dataset (0/45 shards):   0%|          | 0/67409944 [00:00<?, ? examples/s]

[DONE] raw_review_Home_and_Kitchen downloaded
⠴ Processing Home_and_Kitchen 



💥 Processing Home_and_Kitchen
Failed to process category 'Home_and_Kitchen': (ReadTimeoutError("HTTPSConnectionPool(host='cdn-lfs-us-1.hf.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: e1f39c5d-8a39-4c58-a10a-9979876942d9)')

🎉 Download summary:
  - Successfully processed: 0/1 categories
  - Failed: 1/1 categories
    - Home_and_Kitchen: (ReadTimeoutError("HTTPSConnectionPool(host='cdn-lfs-us-1.hf.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: e1f39c5d-8a39-4c58-a10a-9979876942d9)')
Compressed reviews to: D:\BigData\raw_review_Home_and_Kitchen.tar.gz
Error: File does not exist: D:\BigData\raw_review_Home_and_Kitchen
Error: File does not exist: D:\BigData\raw_meta_Home_and_Kitchen

------Processing Industrial_and_Scientific------
⠼ Processing Industrial_and_Scientific 

Industrial_and_Scientific.jsonl:   0%|          | 0.00/2.35G [00:00<?, ?B/s]

⠙ Processing Industrial_and_Scientific 

Generating full split: 0 examples [00:00, ? examples/s]

⠧ Processing Industrial_and_Scientific 

Saving the dataset (0/4 shards):   0%|          | 0/5183005 [00:00<?, ? examples/s]

[DONE] raw_review_Industrial_and_Scientific downloaded
⠏ Processing Industrial_and_Scientific 

meta_Industrial_and_Scientific.jsonl:   0%|          | 0.00/1.13G [00:00<?, ?B/s]

⠋ Processing Industrial_and_Scientific 

Generating full split:   0%|          | 0/427564 [00:00<?, ? examples/s]

⠋ Processing Industrial_and_Scientific 

Saving the dataset (0/2 shards):   0%|          | 0/427564 [00:00<?, ? examples/s]

[DONE] raw_meta_Industrial_and_Scientific downloaded
✅ Processing Industrial_and_Scientific

🎉 Download summary:
  - Successfully processed: 1/1 categories
Compressed reviews to: D:\BigData\raw_review_Industrial_and_Scientific.tar.gz
Compressed metadata to: D:\BigData\raw_meta_Industrial_and_Scientific.tar.gz
Error: File does not exist: D:\BigData\raw_review_Industrial_and_Scientific
Error: File does not exist: D:\BigData\raw_meta_Industrial_and_Scientific

------Processing Kindle_Store------
⠏ Processing Kindle_Store 

Kindle_Store.jsonl:   0%|          | 0.00/15.8G [00:00<?, ?B/s]

⠦ Processing Kindle_Store 

Generating full split: 0 examples [00:00, ? examples/s]

⠸ Processing Kindle_Store 

Loading dataset shards:   0%|          | 0/25 [00:00<?, ?it/s]

⠹ Processing Kindle_Store 

Saving the dataset (0/25 shards):   0%|          | 0/25577616 [00:00<?, ? examples/s]

[DONE] raw_review_Kindle_Store downloaded
⠇ Processing Kindle_Store 



💥 Processing Kindle_Store
Failed to process category 'Kindle_Store': (ReadTimeoutError("HTTPSConnectionPool(host='cdn-lfs-us-1.hf.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 758adbf1-df57-4a90-bef7-47ad15eed5e7)')

🎉 Download summary:
  - Successfully processed: 0/1 categories
  - Failed: 1/1 categories
    - Kindle_Store: (ReadTimeoutError("HTTPSConnectionPool(host='cdn-lfs-us-1.hf.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 758adbf1-df57-4a90-bef7-47ad15eed5e7)')
Compressed reviews to: D:\BigData\raw_review_Kindle_Store.tar.gz
Error: File does not exist: D:\BigData\raw_review_Kindle_Store
Error: File does not exist: D:\BigData\raw_meta_Kindle_Store

------Processing Magazine_Subscriptions------
⠹ Processing Magazine_Subscriptions 

Magazine_Subscriptions.jsonl:   0%|          | 0.00/33.3M [00:00<?, ?B/s]

⠋ Processing Magazine_Subscriptions 

Generating full split: 0 examples [00:00, ? examples/s]

⠙ Processing Magazine_Subscriptions 

Saving the dataset (0/1 shards):   0%|          | 0/71497 [00:00<?, ? examples/s]

[DONE] raw_review_Magazine_Subscriptions downloaded
⠋ Processing Magazine_Subscriptions 

meta_Magazine_Subscriptions.jsonl:   0%|          | 0.00/4.10M [00:00<?, ?B/s]

⠧ Processing Magazine_Subscriptions 

Generating full split: 0 examples [00:00, ? examples/s]

⠙ Processing Magazine_Subscriptions 

Saving the dataset (0/1 shards):   0%|          | 0/3391 [00:00<?, ? examples/s]

[DONE] raw_meta_Magazine_Subscriptions downloaded
✅ Processing Magazine_Subscriptions

🎉 Download summary:
  - Successfully processed: 1/1 categories
Compressed reviews to: D:\BigData\raw_review_Magazine_Subscriptions.tar.gz
Compressed metadata to: D:\BigData\raw_meta_Magazine_Subscriptions.tar.gz
Error: File does not exist: D:\BigData\raw_review_Magazine_Subscriptions
Error: File does not exist: D:\BigData\raw_meta_Magazine_Subscriptions

------Processing Movies_and_TV------
⠋ Processing Movies_and_TV 

Movies_and_TV.jsonl:   0%|          | 0.00/8.39G [00:00<?, ?B/s]

⠹ Processing Movies_and_TV 

Generating full split: 0 examples [00:00, ? examples/s]

💥 Processing Movies_and_TV
Failed to process category 'Movies_and_TV': [WinError 32] The process cannot access the file because it is being used by another process: 'C:/Users/saeed/.cache/huggingface/datasets/McAuley-Lab___amazon-reviews-2023/raw_review_Movies_and_TV/0.0.0/16b76e0823d73bb8cff1e9c5e3e37dbc46ae3daee380417ae141f5e67d3ea8e8.incomplete\\amazon-reviews-2023-full-00000-00011-of-NNNNN.arrow'

🎉 Download summary:
  - Successfully processed: 0/1 categories
  - Failed: 1/1 categories
    - Movies_and_TV: [WinError 32] The process cannot access the file because it is being used by another process: 'C:/Users/saeed/.cache/huggingface/datasets/McAuley-Lab___amazon-reviews-2023/raw_review_Movies_and_TV/0.0.0/16b76e0823d73bb8cff1e9c5e3e37dbc46ae3daee380417ae141f5e67d3ea8e8.incomplete\\amazon-reviews-2023-full-00000-00011-of-NNNNN.arrow'
Error: File does not exist: D:\BigData\raw_review_Movies_and_TV
Error: File does not exist: D:\BigData\raw_meta_Movies_and_TV

------Processing Musical

Musical_Instruments.jsonl:   0%|          | 0.00/1.56G [00:00<?, ?B/s]

⠋ Processing Musical_Instruments 

Generating full split: 0 examples [00:00, ? examples/s]

⠦ Processing Musical_Instruments 

Saving the dataset (0/3 shards):   0%|          | 0/3017439 [00:00<?, ? examples/s]

[DONE] raw_review_Musical_Instruments downloaded
⠏ Processing Musical_Instruments 

meta_Musical_Instruments.jsonl:   0%|          | 0.00/632M [00:00<?, ?B/s]

⠴ Processing Musical_Instruments 

Generating full split:   0%|          | 0/213593 [00:00<?, ? examples/s]

⠋ Processing Musical_Instruments 

Saving the dataset (0/2 shards):   0%|          | 0/213593 [00:00<?, ? examples/s]

[DONE] raw_meta_Musical_Instruments downloaded
✅ Processing Musical_Instruments 

🎉 Download summary:
  - Successfully processed: 1/1 categories
Compressed reviews to: D:\BigData\raw_review_Musical_Instruments.tar.gz
Compressed metadata to: D:\BigData\raw_meta_Musical_Instruments.tar.gz
Error: File does not exist: D:\BigData\raw_review_Musical_Instruments
Error: File does not exist: D:\BigData\raw_meta_Musical_Instruments

------Processing Office_Products------
⠸ Processing Office_Products 



Office_Products.jsonl:   0%|          | 0.00/5.78G [00:00<?, ?B/s]

💥 Processing Office_Products
Failed to process category 'Office_Products': [Errno 28] No space left on device

🎉 Download summary:
  - Successfully processed: 0/1 categories
  - Failed: 1/1 categories
    - Office_Products: [Errno 28] No space left on device
Error: File does not exist: D:\BigData\raw_review_Office_Products
Error: File does not exist: D:\BigData\raw_meta_Office_Products

------Processing Patio_Lawn_and_Garden------
💥 Processing Patio_Lawn_and_Garden
Failed to process category 'Patio_Lawn_and_Garden': Not enough disk space. Needed: Unknown size (download: Unknown size, generated: Unknown size, post-processed: Unknown size)

🎉 Download summary:
  - Successfully processed: 0/1 categories
  - Failed: 1/1 categories
    - Patio_Lawn_and_Garden: Not enough disk space. Needed: Unknown size (download: Unknown size, generated: Unknown size, post-processed: Unknown size)
Error: File does not exist: D:\BigData\raw_review_Patio_Lawn_and_Garden
Error: File does not exist: D:\BigData

# Note
- the cell below is the one you would use to redownload any specific category that failed during initial download or if OS error, permissions error, file locking error

In [None]:
save_path = Path(r"D:\BigData")
save_path.mkdir(parents=True, exist_ok=True)
# category = ""
remaining= ["Electronics", "Kindle_Store", "Movies_and_TV", 
            "Sports_and_Outdoors", "Unknown"
            ]

for category in remaining:
    logging.info(f"n------Processing {category}------")
    print(f"\n------Processing {category}------")

    download_all_amazon_reviews(base_save_path=save_path, 
                                categories=[category], compress=False)

    review_folder = save_path / f"raw_review_{category}"
    metadata_folder = save_path / f"raw_meta_{category}"

    if review_folder.exists():
        compressed = compress_folder(review_folder, 
                                        compression_format="gz", 
                                        level=6)
        
        logging.info(f"Compressed reviews to: {compressed}")
        print(f"Compressed reviews to: {compressed}")

    if metadata_folder.exists():
        compressed = compress_folder(metadata_folder, 
                                        compression_format="gz", 
                                        level=6)
        
        logging.info(f"Compressed metadata to: {compressed}")
        print(f"Compressed metadata to: {compressed}")

    verify_compressed_file(review_folder)
    verify_compressed_file(metadata_folder) 


------Processing Beauty_and_Personal_Care------
[SKIP] raw_review_Beauty_and_Personal_Care already exists
⠧ Processing Beauty_and_Personal_Care 

README.md:   0%|          | 0.00/30.3k [00:00<?, ?B/s]

⠏ Processing Beauty_and_Personal_Care 

Amazon-Reviews-2023.py:   0%|          | 0.00/39.6k [00:00<?, ?B/s]

⠹ Processing Beauty_and_Personal_Care 

meta_Beauty_and_Personal_Care.jsonl:   0%|          | 0.00/2.84G [00:00<?, ?B/s]

⠧ Processing Beauty_and_Personal_Care 

Generating full split: 0 examples [00:00, ? examples/s]

⠧ Processing Beauty_and_Personal_Care 

Saving the dataset (0/5 shards):   0%|          | 0/1028914 [00:00<?, ? examples/s]

[DONE] raw_meta_Beauty_and_Personal_Care downloaded
✅ Processing Beauty_and_Personal_Care

🎉 Download summary:
  - Successfully processed: 1/1 categories
Compressed metadata to: D:\BigData\raw_meta_Beauty_and_Personal_Care.tar.gz
Error: File does not exist: D:\BigData\raw_review_Beauty_and_Personal_Care
Error: File does not exist: D:\BigData\raw_meta_Beauty_and_Personal_Care

------Processing Books------
[SKIP] raw_review_Books already exists
⠴ Processing Books 

meta_Books.jsonl:   0%|          | 0.00/14.7G [00:00<?, ?B/s]

⠇ Processing Books 

Generating full split: 0 examples [00:00, ? examples/s]

⠸ Processing Books 

Loading dataset shards:   0%|          | 0/28 [00:00<?, ?it/s]

⠋ Processing Books 

Saving the dataset (0/28 shards):   0%|          | 0/4448181 [00:00<?, ? examples/s]

[DONE] raw_meta_Books downloaded
✅ Processing Books

🎉 Download summary:
  - Successfully processed: 1/1 categories
Compressed metadata to: D:\BigData\raw_meta_Books.tar.gz
Error: File does not exist: D:\BigData\raw_review_Books
Error: File does not exist: D:\BigData\raw_meta_Books

------Processing Cell_Phones_and_Accessories------
[SKIP] raw_review_Cell_Phones_and_Accessories already exists
⠙ Processing Cell_Phones_and_Accessories 

meta_Cell_Phones_and_Accessories.jsonl:   0%|          | 0.00/4.02G [00:00<?, ?B/s]

⠴ Processing Cell_Phones_and_Accessories 

Generating full split:   0%|          | 0/1288490 [00:00<?, ? examples/s]

⠋ Processing Cell_Phones_and_Accessories 

Saving the dataset (0/7 shards):   0%|          | 0/1288490 [00:00<?, ? examples/s]

[DONE] raw_meta_Cell_Phones_and_Accessories downloaded
✅ Processing Cell_Phones_and_Accessories

🎉 Download summary:
  - Successfully processed: 1/1 categories
Compressed metadata to: D:\BigData\raw_meta_Cell_Phones_and_Accessories.tar.gz
Error: File does not exist: D:\BigData\raw_review_Cell_Phones_and_Accessories
Error: File does not exist: D:\BigData\raw_meta_Cell_Phones_and_Accessories

------Processing Clothing_Shoes_and_Jewelry------
[SKIP] raw_review_Clothing_Shoes_and_Jewelry already exists
⠴ Processing Clothing_Shoes_and_Jewelry 

meta_Clothing_Shoes_and_Jewelry.jsonl:   0%|          | 0.00/18.0G [00:00<?, ?B/s]

⠴ Processing Clothing_Shoes_and_Jewelry 

Generating full split: 0 examples [00:00, ? examples/s]

⠇ Processing Clothing_Shoes_and_Jewelry 

Loading dataset shards:   0%|          | 0/31 [00:00<?, ?it/s]

⠼ Processing Clothing_Shoes_and_Jewelry 

Saving the dataset (0/31 shards):   0%|          | 0/7218481 [00:00<?, ? examples/s]

[DONE] raw_meta_Clothing_Shoes_and_Jewelry downloaded
✅ Processing Clothing_Shoes_and_Jewelry

🎉 Download summary:
  - Successfully processed: 1/1 categories
Compressed metadata to: D:\BigData\raw_meta_Clothing_Shoes_and_Jewelry.tar.gz
Error: File does not exist: D:\BigData\raw_review_Clothing_Shoes_and_Jewelry
Error: File does not exist: D:\BigData\raw_meta_Clothing_Shoes_and_Jewelry

------Processing Electronics------
[SKIP] raw_review_Electronics already exists
💥 Processing Electronics
Failed to process category 'Electronics': (ProtocolError('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')), '(Request ID: 7a532ee0-2eec-428b-981a-91355da47b62)')

🎉 Download summary:
  - Successfully processed: 0/1 categories
  - Failed: 1/1 categories
    - Electronics: (ProtocolError('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')), '(Request ID: 7a532ee0-2eec-428b-981a-91355da47b62)')
Error: File does not exist: D:

meta_Health_and_Household.jsonl:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

⠙ Processing Health_and_Household 

Generating full split: 0 examples [00:00, ? examples/s]

⠸ Processing Health_and_Household 

Saving the dataset (0/5 shards):   0%|          | 0/797563 [00:00<?, ? examples/s]

[DONE] raw_meta_Health_and_Household downloaded
✅ Processing Health_and_Household

🎉 Download summary:
  - Successfully processed: 1/1 categories
Compressed metadata to: D:\BigData\raw_meta_Health_and_Household.tar.gz
Error: File does not exist: D:\BigData\raw_review_Health_and_Household
Error: File does not exist: D:\BigData\raw_meta_Health_and_Household

------Processing Home_and_Kitchen------
[SKIP] raw_review_Home_and_Kitchen already exists
⠹ Processing Home_and_Kitchen 

meta_Home_and_Kitchen.jsonl:   0%|          | 0.00/11.8G [00:00<?, ?B/s]

⠸ Processing Home_and_Kitchen 

Generating full split: 0 examples [00:00, ? examples/s]

⠙ Processing Home_and_Kitchen 

Loading dataset shards:   0%|          | 0/21 [00:00<?, ?it/s]

⠙ Processing Home_and_Kitchen 

Saving the dataset (0/21 shards):   0%|          | 0/3735584 [00:00<?, ? examples/s]

[DONE] raw_meta_Home_and_Kitchen downloaded
✅ Processing Home_and_Kitchen

🎉 Download summary:
  - Successfully processed: 1/1 categories
Compressed metadata to: D:\BigData\raw_meta_Home_and_Kitchen.tar.gz
Error: File does not exist: D:\BigData\raw_review_Home_and_Kitchen
Error: File does not exist: D:\BigData\raw_meta_Home_and_Kitchen

------Processing Kindle_Store------
[SKIP] raw_review_Kindle_Store already exists
💥 Processing Kindle_Store
Failed to process category 'Kindle_Store': (ReadTimeoutError("HTTPSConnectionPool(host='cdn-lfs-us-1.hf.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: d6ff46ff-5cb8-47a2-9c5c-f468419d4226)')

🎉 Download summary:
  - Successfully processed: 0/1 categories
  - Failed: 1/1 categories
    - Kindle_Store: (ReadTimeoutError("HTTPSConnectionPool(host='cdn-lfs-us-1.hf.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: d6ff46ff-5cb8-47a2-9c5c-f468419d4226)')
Error: File does not exist: D:\BigData\raw_review_Kindle_Store


Movies_and_TV.jsonl:   0%|          | 0.00/8.39G [00:00<?, ?B/s]

⠼ Processing Movies_and_TV 

Generating full split: 0 examples [00:00, ? examples/s]

⠧ Processing Movies_and_TV 

Saving the dataset (0/13 shards):   0%|          | 0/17328314 [00:00<?, ? examples/s]

[DONE] raw_review_Movies_and_TV downloaded
💥 Processing Movies_and_TV
Failed to process category 'Movies_and_TV': (ReadTimeoutError("HTTPSConnectionPool(host='cdn-lfs-us-1.hf.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: c53f70e2-aa8d-4b02-9bcc-47720a640892)')

🎉 Download summary:
  - Successfully processed: 0/1 categories
  - Failed: 1/1 categories
    - Movies_and_TV: (ReadTimeoutError("HTTPSConnectionPool(host='cdn-lfs-us-1.hf.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: c53f70e2-aa8d-4b02-9bcc-47720a640892)')
Compressed reviews to: D:\BigData\raw_review_Movies_and_TV.tar.gz
Error: File does not exist: D:\BigData\raw_review_Movies_and_TV
Error: File does not exist: D:\BigData\raw_meta_Movies_and_TV

------Processing Office_Products------
⠋ Processing Office_Products 

Office_Products.jsonl:   0%|          | 0.00/5.78G [00:00<?, ?B/s]

⠇ Processing Office_Products 

Generating full split: 0 examples [00:00, ? examples/s]

⠇ Processing Office_Products 

Saving the dataset (0/9 shards):   0%|          | 0/12845712 [00:00<?, ? examples/s]

[DONE] raw_review_Office_Products downloaded
⠏ Processing Office_Products 

meta_Office_Products.jsonl:   0%|          | 0.00/2.15G [00:00<?, ?B/s]

⠏ Processing Office_Products 

Generating full split: 0 examples [00:00, ? examples/s]

⠇ Processing Office_Products 

Saving the dataset (0/4 shards):   0%|          | 0/710503 [00:00<?, ? examples/s]

[DONE] raw_meta_Office_Products downloaded
✅ Processing Office_Products

🎉 Download summary:
  - Successfully processed: 1/1 categories
Compressed reviews to: D:\BigData\raw_review_Office_Products.tar.gz
Compressed metadata to: D:\BigData\raw_meta_Office_Products.tar.gz
Error: File does not exist: D:\BigData\raw_review_Office_Products
Error: File does not exist: D:\BigData\raw_meta_Office_Products

------Processing Patio_Lawn_and_Garden------
⠼ Processing Patio_Lawn_and_Garden 

Patio_Lawn_and_Garden.jsonl:   0%|          | 0.00/7.75G [00:00<?, ?B/s]

⠼ Processing Patio_Lawn_and_Garden 

Generating full split: 0 examples [00:00, ? examples/s]

⠇ Processing Patio_Lawn_and_Garden 

Saving the dataset (0/11 shards):   0%|          | 0/16490047 [00:00<?, ? examples/s]

[DONE] raw_review_Patio_Lawn_and_Garden downloaded
⠼ Processing Patio_Lawn_and_Garden 

meta_Patio_Lawn_and_Garden.jsonl:   0%|          | 0.00/2.71G [00:00<?, ?B/s]

⠧ Processing Patio_Lawn_and_Garden 

Generating full split: 0 examples [00:00, ? examples/s]

⠇ Processing Patio_Lawn_and_Garden 

Saving the dataset (0/5 shards):   0%|          | 0/851907 [00:00<?, ? examples/s]

[DONE] raw_meta_Patio_Lawn_and_Garden downloaded
✅ Processing Patio_Lawn_and_Garden

🎉 Download summary:
  - Successfully processed: 1/1 categories
Compressed reviews to: D:\BigData\raw_review_Patio_Lawn_and_Garden.tar.gz
Compressed metadata to: D:\BigData\raw_meta_Patio_Lawn_and_Garden.tar.gz
Error: File does not exist: D:\BigData\raw_review_Patio_Lawn_and_Garden
Error: File does not exist: D:\BigData\raw_meta_Patio_Lawn_and_Garden

------Processing Pet_Supplies------
⠸ Processing Pet_Supplies 

Pet_Supplies.jsonl:   0%|          | 0.00/8.35G [00:00<?, ?B/s]

⠏ Processing Pet_Supplies 

Generating full split: 0 examples [00:00, ? examples/s]

⠏ Processing Pet_Supplies 

Saving the dataset (0/13 shards):   0%|          | 0/16827862 [00:00<?, ? examples/s]

[DONE] raw_review_Pet_Supplies downloaded
⠦ Processing Pet_Supplies 

meta_Pet_Supplies.jsonl:   0%|          | 0.00/1.57G [00:00<?, ?B/s]

⠸ Processing Pet_Supplies 

Generating full split: 0 examples [00:00, ? examples/s]

⠏ Processing Pet_Supplies 

Saving the dataset (0/3 shards):   0%|          | 0/492798 [00:00<?, ? examples/s]

[DONE] raw_meta_Pet_Supplies downloaded
✅ Processing Pet_Supplies

🎉 Download summary:
  - Successfully processed: 1/1 categories
Compressed reviews to: D:\BigData\raw_review_Pet_Supplies.tar.gz
Compressed metadata to: D:\BigData\raw_meta_Pet_Supplies.tar.gz
Error: File does not exist: D:\BigData\raw_review_Pet_Supplies
Error: File does not exist: D:\BigData\raw_meta_Pet_Supplies

------Processing Software------
⠹ Processing Software 

Software.jsonl:   0%|          | 0.00/1.87G [00:00<?, ?B/s]

⠹ Processing Software 

Generating full split: 0 examples [00:00, ? examples/s]

⠼ Processing Software 

Saving the dataset (0/3 shards):   0%|          | 0/4880181 [00:00<?, ? examples/s]

[DONE] raw_review_Software downloaded
⠦ Processing Software 

meta_Software.jsonl:   0%|          | 0.00/256M [00:00<?, ?B/s]

⠼ Processing Software 

Generating full split: 0 examples [00:00, ? examples/s]

⠼ Processing Software 

Saving the dataset (0/1 shards):   0%|          | 0/89251 [00:00<?, ? examples/s]

[DONE] raw_meta_Software downloaded
✅ Processing Software

🎉 Download summary:
  - Successfully processed: 1/1 categories
Compressed reviews to: D:\BigData\raw_review_Software.tar.gz
Compressed metadata to: D:\BigData\raw_meta_Software.tar.gz
Error: File does not exist: D:\BigData\raw_review_Software
Error: File does not exist: D:\BigData\raw_meta_Software

------Processing Sports_and_Outdoors------
⠴ Processing Sports_and_Outdoors 

Sports_and_Outdoors.jsonl:   0%|          | 0.00/9.26G [00:00<?, ?B/s]

⠧ Processing Sports_and_Outdoors 

Generating full split: 0 examples [00:00, ? examples/s]

⠏ Processing Sports_and_Outdoors 

Saving the dataset (0/14 shards):   0%|          | 0/19595170 [00:00<?, ? examples/s]

[DONE] raw_review_Sports_and_Outdoors downloaded
💥 Processing Sports_and_Outdoors
Failed to process category 'Sports_and_Outdoors': (ReadTimeoutError("HTTPSConnectionPool(host='cdn-lfs-us-1.hf.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: e1b6f79a-c692-4781-8672-7be951ea5186)')

🎉 Download summary:
  - Successfully processed: 0/1 categories
  - Failed: 1/1 categories
    - Sports_and_Outdoors: (ReadTimeoutError("HTTPSConnectionPool(host='cdn-lfs-us-1.hf.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: e1b6f79a-c692-4781-8672-7be951ea5186)')
Compressed reviews to: D:\BigData\raw_review_Sports_and_Outdoors.tar.gz
Error: File does not exist: D:\BigData\raw_review_Sports_and_Outdoors
Error: File does not exist: D:\BigData\raw_meta_Sports_and_Outdoors

------Processing Subscription_Boxes------
⠇ Processing Subscription_Boxes 

Subscription_Boxes.jsonl:   0%|          | 0.00/8.95M [00:00<?, ?B/s]

⠇ Processing Subscription_Boxes 

Generating full split: 0 examples [00:00, ? examples/s]

⠙ Processing Subscription_Boxes 

Saving the dataset (0/1 shards):   0%|          | 0/16216 [00:00<?, ? examples/s]

[DONE] raw_review_Subscription_Boxes downloaded
⠼ Processing Subscription_Boxes 

meta_Subscription_Boxes.jsonl:   0%|          | 0.00/1.40M [00:00<?, ?B/s]

⠴ Processing Subscription_Boxes 

Generating full split: 0 examples [00:00, ? examples/s]

⠧ Processing Subscription_Boxes 

Saving the dataset (0/1 shards):   0%|          | 0/641 [00:00<?, ? examples/s]

[DONE] raw_meta_Subscription_Boxes downloaded
✅ Processing Subscription_Boxes

🎉 Download summary:
  - Successfully processed: 1/1 categories
Compressed reviews to: D:\BigData\raw_review_Subscription_Boxes.tar.gz
Compressed metadata to: D:\BigData\raw_meta_Subscription_Boxes.tar.gz
Error: File does not exist: D:\BigData\raw_review_Subscription_Boxes
Error: File does not exist: D:\BigData\raw_meta_Subscription_Boxes

------Processing Tools_and_Home_Improvement------
⠋ Processing Tools_and_Home_Improvement 

Tools_and_Home_Improvement.jsonl:   0%|          | 0.00/12.8G [00:00<?, ?B/s]

⠋ Processing Tools_and_Home_Improvement 

Generating full split: 0 examples [00:00, ? examples/s]

⠸ Processing Tools_and_Home_Improvement 

Loading dataset shards:   0%|          | 0/19 [00:00<?, ?it/s]

⠧ Processing Tools_and_Home_Improvement 

Saving the dataset (0/19 shards):   0%|          | 0/26982256 [00:00<?, ? examples/s]

[DONE] raw_review_Tools_and_Home_Improvement downloaded
⠋ Processing Tools_and_Home_Improvement 

meta_Tools_and_Home_Improvement.jsonl:   0%|          | 0.00/4.85G [00:00<?, ?B/s]

⠙ Processing Tools_and_Home_Improvement 

Generating full split: 0 examples [00:00, ? examples/s]

⠋ Processing Tools_and_Home_Improvement 

Saving the dataset (0/9 shards):   0%|          | 0/1473810 [00:00<?, ? examples/s]

[DONE] raw_meta_Tools_and_Home_Improvement downloaded
✅ Processing Tools_and_Home_Improvement

🎉 Download summary:
  - Successfully processed: 1/1 categories
Compressed reviews to: D:\BigData\raw_review_Tools_and_Home_Improvement.tar.gz
Compressed metadata to: D:\BigData\raw_meta_Tools_and_Home_Improvement.tar.gz
Error: File does not exist: D:\BigData\raw_review_Tools_and_Home_Improvement
Error: File does not exist: D:\BigData\raw_meta_Tools_and_Home_Improvement

------Processing Toys_and_Games------
⠙ Processing Toys_and_Games 

Toys_and_Games.jsonl:   0%|          | 0.00/7.32G [00:00<?, ?B/s]

⠹ Processing Toys_and_Games 

Generating full split: 0 examples [00:00, ? examples/s]

⠴ Processing Toys_and_Games 

Saving the dataset (0/11 shards):   0%|          | 0/16260406 [00:00<?, ? examples/s]

[DONE] raw_review_Toys_and_Games downloaded
⠋ Processing Toys_and_Games 

meta_Toys_and_Games.jsonl:   0%|          | 0.00/2.64G [00:00<?, ?B/s]

⠦ Processing Toys_and_Games 

Generating full split:   0%|          | 0/890874 [00:00<?, ? examples/s]

⠙ Processing Toys_and_Games 

Saving the dataset (0/5 shards):   0%|          | 0/890874 [00:00<?, ? examples/s]

[DONE] raw_meta_Toys_and_Games downloaded
✅ Processing Toys_and_Games

🎉 Download summary:
  - Successfully processed: 1/1 categories
Compressed reviews to: D:\BigData\raw_review_Toys_and_Games.tar.gz
Compressed metadata to: D:\BigData\raw_meta_Toys_and_Games.tar.gz
Error: File does not exist: D:\BigData\raw_review_Toys_and_Games
Error: File does not exist: D:\BigData\raw_meta_Toys_and_Games

------Processing Video_Games------
⠙ Processing Video_Games 

Video_Games.jsonl:   0%|          | 0.00/2.68G [00:00<?, ?B/s]

⠼ Processing Video_Games 

Generating full split: 0 examples [00:00, ? examples/s]

⠹ Processing Video_Games 

Saving the dataset (0/5 shards):   0%|          | 0/4624615 [00:00<?, ? examples/s]

[DONE] raw_review_Video_Games downloaded
⠋ Processing Video_Games 

meta_Video_Games.jsonl:   0%|          | 0.00/437M [00:00<?, ?B/s]

⠧ Processing Video_Games 

Generating full split: 0 examples [00:00, ? examples/s]

⠸ Processing Video_Games 

Saving the dataset (0/1 shards):   0%|          | 0/137269 [00:00<?, ? examples/s]

[DONE] raw_meta_Video_Games downloaded
✅ Processing Video_Games 

🎉 Download summary:
  - Successfully processed: 1/1 categories
Compressed reviews to: D:\BigData\raw_review_Video_Games.tar.gz
Compressed metadata to: D:\BigData\raw_meta_Video_Games.tar.gz
Error: File does not exist: D:\BigData\raw_review_Video_Games
Error: File does not exist: D:\BigData\raw_meta_Video_Games

------Processing Unknown------
⠹ Processing Unknown 

Unknown.jsonl:   0%|          | 0.00/29.9G [00:00<?, ?B/s]

⠼ Processing Unknown 

Generating full split: 0 examples [00:00, ? examples/s]

⠦ Processing Unknown 

Loading dataset shards:   0%|          | 0/43 [00:00<?, ?it/s]

⠸ Processing Unknown 

Saving the dataset (0/43 shards):   0%|          | 0/63814110 [00:00<?, ? examples/s]

[DONE] raw_review_Unknown downloaded
💥 Processing Unknown
Failed to process category 'Unknown': (ProtocolError('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')), '(Request ID: df372fc8-bad9-4e85-8f54-e60203ad1539)')

🎉 Download summary:
  - Successfully processed: 0/1 categories
  - Failed: 1/1 categories
    - Unknown: (ProtocolError('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')), '(Request ID: df372fc8-bad9-4e85-8f54-e60203ad1539)')
Compressed reviews to: D:\BigData\raw_review_Unknown.tar.gz
Error: File does not exist: D:\BigData\raw_review_Unknown
Error: File does not exist: D:\BigData\raw_meta_Unknown
