# Initial Imports

In [None]:
from bigdata_a3_utils import *
import shutil
import os
from pathlib import Path
import logging
import tarfile
import zipfile

In [None]:
# Cache directory as reported by the bigdata_a3_utils helper; the disk-space
# check further down inspects the volume that holds this path.
# NOTE(review): presumably the active (possibly user-overridden) cache
# location — confirm against bigdata_a3_utils.
cache_dir = get_cache_directory()

In [None]:
# NOTE(review): presumably the library's built-in default cache location —
# confirm against bigdata_a3_utils. Not referenced by the other visible cells.
default_cache_dir = default_cache_path()

# Checking available disk space
- Files could also be stored on an external drive
- Simply change the file path in the `save_path` variable

In [None]:
# Report total/used/free space on the volume that holds the cache directory
# and warn when there is not enough room for the dataset.
cache_path = cache_dir

# Query the cache path itself rather than only its drive letter:
# os.path.splitdrive() yields '' on POSIX systems and for relative paths, and
# shutil.disk_usage('') raises FileNotFoundError. If the cache directory has
# not been created yet, walk up to the nearest ancestor that exists (it lives
# on the same volume the cache will be created on).
probe_path = Path(cache_path)
while not probe_path.exists() and probe_path != probe_path.parent:
    probe_path = probe_path.parent

# Label used in the report: the drive letter when there is one (Windows),
# otherwise the path that was actually probed.
drive = os.path.splitdrive(cache_path)[0] or str(probe_path)

# Get disk usage statistics (values are in bytes)
total, used, free = shutil.disk_usage(probe_path)

# Convert to GB for easier reading
bytes_per_gb = 1024**3
total_gb = total / bytes_per_gb
used_gb = used / bytes_per_gb
free_gb = free / bytes_per_gb

print(f"Drive {drive}:")
print(f"Total: {total_gb:.2f} GB")
print(f"Used: {used_gb:.2f} GB")
print(f"Free: {free_gb:.2f} GB")

# Check if you have enough space (e.g., 220GB to be safe)
required_space = 220  # GB
if free_gb > required_space:
    print("✅ You have enough space for the 200GB dataset (with some buffer)")
else:
    print(f"❌ Not enough space! You need at least {required_space} GB, but only have {free_gb:.2f} GB free")
    print("Consider setting a custom cache directory on a drive with more space")

# Division of categories into subsets per person:


In [None]:
# How many categories each team member is responsible for, in order.
subset_sizes = [11, 11, 12]

# Cumulative offsets into VALID_CATEGORIES: entry i is where subset i starts
# and entry i + 1 is where it stops.
boundaries = [sum(subset_sizes[:i]) for i in range(len(subset_sizes) + 1)]

# Slice the category list into consecutive, non-overlapping chunks.
subsets = [
    VALID_CATEGORIES[lo:hi] for lo, hi in zip(boundaries, boundaries[1:])
]

# One named variable per person.
kailash_subset, tbd_subset1, tbd_subset2 = subsets

# Show the resulting split and the total number of available categories.
print(f"Subset list: {subsets}")
print(f"Length: {len(VALID_CATEGORIES)}")


# Setting up log directory and logging system
- Used for proof of file acquisition

In [None]:
# Folder (relative to the working directory) that holds the download log;
# created on first run, left untouched if it already exists.
log_dir = Path("logs")
log_dir.mkdir(exist_ok=True, parents=True)

# Send INFO-and-above records to logs/download_log.log, each line prefixed
# with a timestamp and the severity level.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
    filename=log_dir / 'download_log.log',
)

# Error handling:
- If a folder was not downloaded properly (as reflected by the output in this notebook), use the cell below, setting the `category` variable to the category that failed to download.
- If preprocessing a particular category fails (OS error, permissions error, temp file deleted before full extraction), the initial download was most likely corrupted. Simply re-download it via the cell below, again specifying the failed category.

In [None]:
def verify_compressed_file(file_path):
    """
    Verifies if a compressed file (.tar.gz, .tgz or .zip) is valid and not corrupted.

    The archive type is chosen from the file extension, compared
    case-insensitively. Tar archives are checked by listing their members;
    zip archives are checked with ``ZipFile.testzip()``. A human-readable
    status line is printed for every outcome.

    Args:
        file_path: Path to the compressed file (str or path-like)

    Returns:
        bool: True if the file is valid, False otherwise (missing path,
        path is not a regular file, unsupported extension, or any error
        raised while reading the archive)
    """
    file_path = Path(file_path)

    if not file_path.exists():
        print(f"Error: File does not exist: {file_path}")
        return False

    # A directory (e.g. the uncompressed source folder) is not an archive;
    # say so explicitly instead of reporting an unsupported format.
    if not file_path.is_file():
        print(f"Error: Not a file (expected a compressed archive): {file_path}")
        return False

    # Compare extensions case-insensitively so e.g. "DATA.TGZ" is recognised.
    lowered_name = file_path.name.lower()

    try:
        if lowered_name.endswith(('.tar.gz', '.tgz')):
            # Listing the members forces tarfile to walk every header in the
            # gzip stream, which fails on truncated or garbled archives.
            with tarfile.open(file_path, 'r:gz') as tar:
                tar.getnames()
            print(f"✓ Verified {file_path} is a valid tar.gz file")
            return True

        if lowered_name.endswith('.zip'):
            # testzip() returns the name of the first bad member, or None
            # when every member passes its check.
            with zipfile.ZipFile(file_path) as zip_file:
                result = zip_file.testzip()
            if result is None:
                print(f"✓ Verified {file_path} is a valid zip file")
                return True
            print(f"Error: {file_path} is corrupted. First bad file: {result}")
            return False

        print(f"Error: Unsupported file format for {file_path}")
        return False

    except Exception as e:
        # Deliberately broad: any failure while opening or reading the archive
        # (bad gzip/zip data, truncated stream, OS error) means "not valid".
        print(f"Error verifying {file_path}: {str(e)}")
        return False

# Downloading the folders and metadata
- Just change the file path in `save_path` to wherever you want the dataset stored
- Also change `kailash_subset` to the subset variable assigned to you above

In [None]:
# Download every category in the assigned subset, compress the downloaded
# review/metadata folders, and verify each resulting archive. Progress is
# both printed and written to the download log.
save_path = Path(r"C:\BigDataA3\A3_Dataset")
save_path.mkdir(parents=True, exist_ok=True)

for category in kailash_subset:
    # The leading newline is only for notebook readability; keep it out of
    # the log file so each log record stays on one line.
    logging.info(f"------Processing {category}------")
    print(f"\n------Processing {category}------")

    download_all_amazon_reviews(base_save_path=save_path,
                                categories=[category], compress=False)

    review_folder = save_path / f"raw_review_{category}"
    metadata_folder = save_path / f"raw_meta_{category}"

    for label, folder in (("reviews", review_folder),
                          ("metadata", metadata_folder)):
        if not folder.exists():
            # Nothing to compress: the download did not produce this folder.
            logging.warning(f"Expected {label} folder not found: {folder}")
            print(f"Expected {label} folder not found: {folder}")
            continue

        # NOTE(review): compress_folder presumably returns the path of the
        # archive it wrote (it is reported as such below) — confirm against
        # bigdata_a3_utils.
        compressed = compress_folder(folder,
                                     compression_format="gz",
                                     level=6)

        logging.info(f"Compressed {label} to: {compressed}")
        print(f"Compressed {label} to: {compressed}")

        # Verify the archive that was just written, not the source folder:
        # a folder path has no archive extension and can never pass the check.
        if verify_compressed_file(compressed):
            logging.info(f"Verified {label} archive: {compressed}")
        else:
            logging.error(f"Verification failed for {label} archive: {compressed}")

In [None]:

# Manual retry for a single category whose download or preprocessing failed.
# Set `category` to the failed category's name before running this cell.
category = ""

if not category:
    # Guard against running the cell with the placeholder still in place,
    # which would ask the downloader for a category named "".
    print("No category specified: set `category` to the failed category and re-run this cell")
else:
    logging.info(f"------Re-processing {category}------")
    print(f"\n------Processing {category}------")

    download_all_amazon_reviews(base_save_path=save_path,
                                categories=[category], compress=False)

    review_folder = save_path / f"raw_review_{category}"
    metadata_folder = save_path / f"raw_meta_{category}"

    for label, folder in (("reviews", review_folder),
                          ("metadata", metadata_folder)):
        if not folder.exists():
            # Nothing to compress: the download did not produce this folder.
            logging.warning(f"Expected {label} folder not found: {folder}")
            print(f"Expected {label} folder not found: {folder}")
            continue

        # NOTE(review): compress_folder presumably returns the path of the
        # archive it wrote — confirm against bigdata_a3_utils.
        compressed = compress_folder(folder,
                                     compression_format="gz",
                                     level=6)

        logging.info(f"Compressed {label} to: {compressed}")
        print(f"Compressed {label} to: {compressed}")

        # This cell exists to recover from corrupted downloads, so check the
        # new archive the same way the main download loop should.
        if verify_compressed_file(compressed):
            logging.info(f"Verified {label} archive: {compressed}")
        else:
            logging.error(f"Verification failed for {label} archive: {compressed}")