# Test 05: Full Verification & Cleanup Test

**Purpose:** 
1. Comprehensive verification of all cache locations
2. Verify Storage_Cleanup.ipynb can find and clean the files
3. Summary report of the entire test suite

**Run this AFTER:** Tests 01-04

**Run this on:** Both the CoCalc Home Server (base project) and a CoCalc Compute Server

In [None]:
# DS776 Environment Setup & Package Update
# Configures storage paths for proper cleanup/sync, then updates introdl if needed
# If this cell fails, see Lessons/Course_Tools/AUTO_UPDATE_SYSTEM.md for help
#
# Run this cell before the others: the final summary cell checks that TORCH_HOME,
# HF_HOME and HF_DATASETS_CACHE are set, which is presumably done by this script
# (confirm in auto_update_introdl.py).
# NOTE: the relative path is resolved against the kernel's working directory,
# so the notebook is expected to be run from its own folder.
%run ../../Lessons/Course_Tools/auto_update_introdl.py

In [None]:
from pathlib import Path
import os

# Home directory of the current user; later cells build the workspace and
# ~/.cache paths they inspect from this value.
home = Path.home()

def get_dir_size(path):
    """Return the total size, in MB, of the regular files below *path*.

    Symbolic links are not counted. Following them would count a file once
    per link in addition to the file itself whenever both live under *path*
    (e.g. the Hugging Face hub cache, whose snapshot entries are symlinks to
    blob files), and would attribute files outside the tree to it.

    Args:
        path: ``pathlib.Path`` of the directory to measure.

    Returns:
        ``0`` if *path* does not exist, otherwise the summed file sizes in
        megabytes (bytes / 1024 / 1024) as a float.
    """
    if not path.exists():
        return 0

    total_bytes = 0
    for entry in path.rglob('*'):
        try:
            if entry.is_symlink() or not entry.is_file():
                continue
            total_bytes += entry.stat().st_size
        except OSError:
            # The entry vanished or became unreadable while walking; skip it
            # rather than abort the whole measurement.
            continue
    return total_bytes / 1024 / 1024

def format_size(mb):
    """Render a size given in megabytes as a human-readable string.

    Values below 1 MB are shown in KB, values below 1024 MB in MB, and
    everything else in GB.
    """
    if mb < 1:
        kilobytes = mb * 1024
        return f"{kilobytes:.1f} KB"
    if mb < 1024:
        return f"{mb:.1f} MB"
    gigabytes = mb / 1024
    return f"{gigabytes:.2f} GB"

print("Helper functions defined.")

## Section 1: Environment Detection

In [None]:
print("=" * 70, "ENVIRONMENT DETECTION", "=" * 70, sep="\n")

# Any one of these markers is taken as evidence of CoCalc
# (not all of them may be present on a given project).
is_cocalc = any((
    (home / '.cocalc').exists(),
    (home / '.smc').exists(),
    'COCALC_PROJECT_ID' in os.environ,
))

# Pick the synced storage and the directory that holds downloads/ and data/.
if not is_cocalc:
    env_type = "Local Development"
    # DS776_ROOT_DIR overrides the home directory as the workspace root.
    root = Path(os.environ.get('DS776_ROOT_DIR', str(home)))
    synced_storage = root / 'home_workspace'
    storage_base = synced_storage
else:
    cs_workspace = home / 'cs_workspace'
    synced_storage = home / 'home_workspace'
    # Both workspaces present -> compute server, which caches in cs_workspace.
    if cs_workspace.exists() and synced_storage.exists():
        env_type = "CoCalc COMPUTE SERVER"
        storage_base = cs_workspace
    else:
        env_type = "CoCalc HOME SERVER"
        storage_base = synced_storage

expected_cache = storage_base / 'downloads'
expected_data = storage_base / 'data'

print(
    f"\nEnvironment: {env_type}",
    f"Expected cache location: {expected_cache}",
    f"Expected data location: {expected_data}",
    f"Synced storage: {synced_storage}",
    sep="\n",
)

## Section 2: Complete Storage Audit

In [None]:
print("\n" + "=" * 70)
print("COMPLETE STORAGE AUDIT")
print("=" * 70)

# Good locations (where files SHOULD be)
print("\n" + "-" * 40)
print("GOOD LOCATIONS (where files should be):")
print("-" * 40)

good_locations = {
    'Downloads/Cache': expected_cache,
    'Data': expected_data,
    'HuggingFace Hub': expected_cache / 'huggingface' / 'hub',
    'Torch Hub': expected_cache / 'hub',
}

# Entries that live inside another listed location (the two hub directories
# sit under the downloads cache) are still reported individually, but they are
# left out of the total so their bytes are not counted twice.
nested_entries = {
    name
    for name, path in good_locations.items()
    if any(other in path.parents for other in good_locations.values())
}

total_good = 0
for name, path in good_locations.items():
    if path.exists():
        size = get_dir_size(path)
        if name not in nested_entries:
            total_good += size
        status = "OK" if size > 0 else "empty"
        print(f"  {name}: {format_size(size)} [{status}]")
        print(f"    Path: {path}")
    else:
        print(f"  {name}: (not created yet)")
        print(f"    Path: {path}")

print(f"\n  TOTAL in good locations: {format_size(total_good)}")

In [None]:
# Bad locations (where files should NOT be)
print("\n" + "-" * 40)
print("BAD LOCATIONS (files here won't be cleaned):")
print("-" * 40)

cache_root = home / '.cache'
other_label = '~/.cache (other)'

# The catch-all entry must stay last: its size is what remains of ~/.cache
# after the specific sub-directories measured before it are subtracted.
bad_locations = {
    '~/.cache/huggingface': cache_root / 'huggingface',
    '~/.cache/torch': cache_root / 'torch',
    other_label: cache_root,
}

total_bad = 0
issues = []
specific_sizes = []  # sizes of the sub-directories already measured

for name, path in bad_locations.items():
    if not path.exists():
        print(f"  {name}: (does not exist) [GOOD]")
        continue

    size = get_dir_size(path)
    if name == other_label:
        # Reuse the sizes measured above instead of walking those trees again;
        # clamp at 0 in case files changed between the separate walks.
        size = max(size - sum(specific_sizes), 0)
    else:
        specific_sizes.append(size)

    total_bad += size
    if size > 0.1:  # More than 0.1 MB (about 100 KB)
        status = "WARNING"
        issues.append((name, size))
    else:
        status = "minimal"
    print(f"  {name}: {format_size(size)} [{status}]")

print(f"\n  TOTAL in bad locations: {format_size(total_bad)}")

if issues:
    print("\n  ISSUES DETECTED:")
    for name, size in issues:
        print(f"    - {name} has {format_size(size)} that won't be cleaned!")

## Section 3: Detailed Cache Contents

In [None]:
print("\n" + "=" * 70)
print("DETAILED CACHE CONTENTS")
print("=" * 70)

# HuggingFace models in the correct location: one "models--<org>--<name>"
# directory per cached model.
hf_hub = expected_cache / 'huggingface' / 'hub'
print(f"\nHuggingFace Hub ({hf_hub}):")
if hf_hub.exists():
    model_dirs = sorted(hf_hub.glob('models--*'))
    for model_dir in model_dirs:
        size = get_dir_size(model_dir)
        # Turn the directory name back into the "<org>/<name>" model id.
        model_name = model_dir.name.replace('models--', '').replace('--', '/')
        print(f"  {model_name}: {format_size(size)}")
    if not model_dirs:
        print("  (no models cached)")
else:
    print("  (not created yet)")

# Torch checkpoints in the correct location, falling back to the alternative
# directory directly under the cache when the hub one does not exist.
torch_hub = expected_cache / 'hub' / 'checkpoints'
alt_torch = expected_cache / 'checkpoints'
print(f"\nTorch Hub Checkpoints ({torch_hub}):")
if torch_hub.exists():
    checkpoint_dir = torch_hub
elif alt_torch.exists():
    print(f"  (Found at {alt_torch} instead)")
    checkpoint_dir = alt_torch
else:
    checkpoint_dir = None
    print("  (not created yet)")

if checkpoint_dir is not None:
    checkpoints = sorted(checkpoint_dir.glob('*.pth'))
    for ckpt in checkpoints:
        size = ckpt.stat().st_size / 1024 / 1024
        print(f"  {ckpt.name}: {format_size(size)}")
    if not checkpoints:
        print("  (no .pth checkpoints)")

# Datasets in the correct location: only sub-directories are listed.
print(f"\nDatasets ({expected_data}):")
if expected_data.exists():
    dataset_dirs = [item for item in sorted(expected_data.iterdir()) if item.is_dir()]
    for item in dataset_dirs:
        size = get_dir_size(item)
        print(f"  {item.name}/: {format_size(size)}")
    if not dataset_dirs:
        print("  (no dataset directories)")
else:
    print("  (not created yet)")

## Section 4: Cleanup Utility Compatibility Check

In [None]:
print("\n" + "=" * 70)
print("CLEANUP UTILITY COMPATIBILITY")
print("=" * 70)

# These are the paths that Storage_Cleanup.ipynb searches
# (Based on the cleanup utility implementation)

cleanup_paths = [
    ('home_workspace/downloads', home / 'home_workspace' / 'downloads'),
    ('home_workspace/data', home / 'home_workspace' / 'data'),
    ('cs_workspace/downloads', home / 'cs_workspace' / 'downloads'),
    ('cs_workspace/data', home / 'cs_workspace' / 'data'),
]

print("\nPaths searched by Storage_Cleanup.ipynb:")
for name, path in cleanup_paths:
    if path.exists():
        size = get_dir_size(path)
        print(f"  {name}: {format_size(size)} - WILL BE CLEANED")
    else:
        print(f"  {name}: (not found)")

# Check if bad locations would be missed
bad_hf = home / '.cache' / 'huggingface'
bad_torch = home / '.cache' / 'torch'

print("\nLocations NOT searched (would be missed):")
for label, path in (('~/.cache/huggingface', bad_hf), ('~/.cache/torch', bad_torch)):
    # get_dir_size returns 0 for a missing path, so a single measurement
    # covers both the existence check and the size threshold (0.1 MB).
    size = get_dir_size(path)
    if size > 0.1:
        print(f"  WARNING: {label} ({format_size(size)})")
    else:
        print(f"  {label}: OK (empty or doesn't exist)")

## Section 5: Final Test Summary

In [None]:
print("\n" + "=" * 70)
print("FINAL TEST SUMMARY")
print("=" * 70)

# Gather all test results as (description, passed) pairs
tests = []

# Test 1: Environment variables set (only checks that each is non-empty,
# not where it points)
env_vars_ok = all([
    os.environ.get('TORCH_HOME'),
    os.environ.get('HF_HOME'),
    os.environ.get('HF_DATASETS_CACHE'),
])
tests.append(('Environment variables configured', env_vars_ok))

# Test 2: No ~/.cache usage
bad_hf_size = get_dir_size(home / '.cache' / 'huggingface')
bad_torch_size = get_dir_size(home / '.cache' / 'torch')
no_bad_cache = (bad_hf_size < 1) and (bad_torch_size < 1)  # Less than 1MB in each
tests.append(('No significant ~/.cache usage', no_bad_cache))

# Test 3: Correct locations exist
correct_locations = expected_cache.exists()
tests.append(('Correct cache location exists', correct_locations))

# Test 4: Files in correct location
good_size = get_dir_size(expected_cache)
files_in_good = good_size > 0.1  # More than 0.1 MB (about 100 KB)
tests.append(('Files stored in correct location', files_in_good))

# Test 5: Cleanup compatibility (at least one downloads directory that the
# cleanup utility searches exists under the home directory)
cleanup_compatible = (
    (home / 'home_workspace' / 'downloads').exists() or
    (home / 'cs_workspace' / 'downloads').exists()
)
tests.append(('Cleanup utility compatible', cleanup_compatible))

print("\nTest Results:")
for name, passed in tests:
    status = "PASS" if passed else "FAIL"
    print(f"  [{status}] {name}")

all_passed = all(passed for _, passed in tests)

print("\n" + "-" * 40)
if all_passed:
    print("ALL TESTS PASSED!")
    print("\nThe environment setup is working correctly:")
    print("- Cache paths are configured before library imports")
    print("- Downloads go to the correct locations")
    print("- Storage_Cleanup.ipynb will find these files")
else:
    print("SOME TESTS FAILED")
    print("\nPlease review the failed tests above.")
    print("Common issues:")
    print("- Pre-existing files in ~/.cache (from before the fix)")
    print("- Need to restart kernel after running auto_update")

print("\n" + "=" * 70)

## Optional: Clean Up Bad Cache

If you have pre-existing files in `~/.cache` that you want to remove, uncomment and run the cell below.

**WARNING:** This will delete files from `~/.cache/huggingface` and `~/.cache/torch`. Only run if you're sure these are duplicates of files in the correct locations.

In [None]:
# # OPTIONAL: Remove bad cache locations
# # Uncomment the code below to clean up ~/.cache

# import shutil
# from pathlib import Path

# home = Path.home()
# bad_locations = [
#     home / '.cache' / 'huggingface',
#     home / '.cache' / 'torch',
# ]

# for path in bad_locations:
#     if path.exists():
#         size = sum(f.stat().st_size for f in path.rglob('*') if f.is_file()) / 1024 / 1024
#         print(f"Removing {path} ({size:.1f} MB)...")
#         shutil.rmtree(path)
#         print(f"  Removed!")
#     else:
#         print(f"{path} does not exist")

# print("\nDone! Bad cache locations cleaned.")

## Test Complete!

If all tests passed:
1. Environment variables are set correctly before library imports
2. HuggingFace and PyTorch downloads go to the correct locations
3. Storage_Cleanup.ipynb will be able to find and clean these files

**Run this test suite on both:**
- CoCalc Home Server (base project)
- CoCalc Compute Server

to ensure the fix works in both environments.