# Initial Imports

In [2]:
from bigdata_a3_utils import *
import shutil
import os
from pathlib import Path
import logging

In [None]:
# Cache directory as reported by get_cache_directory() (presumably supplied by
# the bigdata_a3_utils star import — confirm). The disk-space check further
# down reads this value.
cache_dir = get_cache_directory()

In [None]:
# Default cache location as reported by default_cache_path() (presumably from
# bigdata_a3_utils — confirm). Not referenced by the other cells visible here;
# kept for comparison with cache_dir.
default_cache_dir = default_cache_path()

# Checking available disk space
- Files could also be stored on an external drive
- Simply change the file path in the `save_path` variable

In [None]:
# Report the disk usage of the volume that holds the cache directory and
# check that enough free space is left for the dataset.
cache_path = cache_dir

# shutil.disk_usage() needs a path that exists, and os.path.splitdrive()
# returns an empty drive string on non-Windows systems. Probe the nearest
# existing ancestor of the cache directory instead of a bare drive letter so
# the check works on every platform and before the cache folder is created.
probe_path = Path(cache_path).expanduser().resolve()
while not probe_path.exists() and probe_path != probe_path.parent:
    probe_path = probe_path.parent

# `drive` is only used for display: the drive letter on Windows, otherwise
# the filesystem anchor (e.g. "/").
drive = os.path.splitdrive(str(probe_path))[0] or probe_path.anchor

# Get disk usage statistics (all values are in bytes)
total, used, free = shutil.disk_usage(str(probe_path))

# Convert to GB (1024**3 bytes) for easier reading
bytes_per_gb = 1024**3
total_gb = total / bytes_per_gb
used_gb = used / bytes_per_gb
free_gb = free / bytes_per_gb

print(f"Drive {drive}:")
print(f"Total: {total_gb:.2f} GB")
print(f"Used: {used_gb:.2f} GB")
print(f"Free: {free_gb:.2f} GB")

# Check if you have enough space (e.g., 220GB to be safe).
# ">=" matches the "at least" wording of the failure message.
required_space = 220  # GB
if free_gb >= required_space:
    print(f"✅ You have enough space for the 200GB dataset (with some buffer)")
else:
    print(f"❌ Not enough space! You need at least {required_space} GB, but only have {free_gb:.2f} GB free")
    print(f"Consider setting a custom cache directory on a drive with more space")

# Division of categories into subsets per person:


In [5]:
# How many categories each person receives, in assignment order.
subset_sizes = [11, 11, 12]

# Turn the sizes into consecutive slice boundaries over VALID_CATEGORIES:
# every slice begins where the previous one stopped.
slice_ends = [sum(subset_sizes[:count + 1]) for count in range(len(subset_sizes))]
slice_starts = [0] + slice_ends[:-1]
subsets = [VALID_CATEGORIES[lo:hi] for lo, hi in zip(slice_starts, slice_ends)]

# One named variable per person.
kailash_subset, tbd_subset1, tbd_subset2 = subsets

# Show the resulting split and the total number of categories.
print(f"Subset list: {subsets}")
print(f"Length: {len(VALID_CATEGORIES)}")


Subset list: [['All_Beauty', 'Amazon_Fashion', 'Appliances', 'Arts_Crafts_and_Sewing', 'Automotive', 'Baby_Products', 'Beauty_and_Personal_Care', 'Books', 'CDs_and_Vinyl', 'Cell_Phones_and_Accessories', 'Clothing_Shoes_and_Jewelry'], ['Digital_Music', 'Electronics', 'Gift_Cards', 'Grocery_and_Gourmet_Food', 'Handmade_Products', 'Health_and_Household', 'Health_and_Personal_Care', 'Home_and_Kitchen', 'Industrial_and_Scientific', 'Kindle_Store', 'Magazine_Subscriptions'], ['Movies_and_TV', 'Musical_Instruments', 'Office_Products', 'Patio_Lawn_and_Garden', 'Pet_Supplies', 'Software', 'Sports_and_Outdoors', 'Subscription_Boxes', 'Tools_and_Home_Improvement', 'Toys_and_Games', 'Video_Games', 'Unknown']]
Length: 34


# Setting up log directory and logging system
- Used for proof of file acquisition

In [None]:
# Keep the download log in a local "logs" folder, creating it when absent.
log_dir = Path("logs")
log_dir.mkdir(parents=True, exist_ok=True)

# Root-logger configuration: INFO and above, timestamped, written to file.
log_file = log_dir / 'download_log.log'
log_format = '%(asctime)s - %(levelname)s - %(message)s'
logging.basicConfig(filename=log_file, level=logging.INFO, format=log_format)

# Downloading the folders and metadata
- Just change the file path in `save_path` to wherever you want the data stored
- Also change `kailash_subset` to the subset variable assigned to you above

In [None]:
# Destination folder for the downloaded dataset; point this at another drive
# if the default one is short on space.
save_path = Path(r"C:\BigDataA3\A3_Dataset")
save_path.mkdir(parents=True, exist_ok=True)

# Only the first two categories of the assigned subset are processed per run.
categories_to_process = kailash_subset[:2]

for category in categories_to_process:
    # The log line carries no leading newline: the original message began
    # with a stray literal "n" (a "\n" that lost its backslash).
    logging.info(f"------Processing {category}------")
    print(f"\n------Processing {category}------")

    # Download uncompressed; each resulting folder is compressed below.
    download_all_amazon_reviews(base_save_path=save_path,
                                categories=[category], compress=False)

    raw_folders = (
        ("reviews", save_path / f"raw_review_{category}"),
        ("metadata", save_path / f"raw_metadata_{category}"),
    )
    for label, folder in raw_folders:
        if not folder.exists():
            # Record the gap so the log remains a reliable proof of what
            # was (and was not) acquired.
            logging.warning(f"Folder not found, skipping compression: {folder}")
            continue

        compressed = compress_folder(folder,
                                     compression_format="gz",
                                     level=6)
        logging.info(f"Compressed {label} to: {compressed}")