In [1]:
#!/usr/bin/env python3
"""
Simple script to understand where HuggingFace and other caches are stored
Run this BEFORE loading any models to see the defaults
"""

import os
from pathlib import Path

def check_environment_defaults():
    """Check what the default cache locations are"""
    print("=== DEFAULT CACHE LOCATIONS ===")
    
    # HuggingFace defaults
    home = Path.home()
    
    print(f"Your HOME directory: {home}")
    print(f"Home directory size: {get_folder_size(home):.2f} GB")
    print()
    
    # Check HuggingFace environment variables
    print("HuggingFace Cache Variables:")
    hf_home = os.environ.get('HF_HOME')
    hf_cache = os.environ.get('HF_CACHE_HOME') 
    transformers_cache = os.environ.get('TRANSFORMERS_CACHE')
    
    if hf_home:
        print(f"  HF_HOME: {hf_home}")
    else:
        default_hf = home / ".cache" / "huggingface"
        print(f"  HF_HOME: Not set (default: {default_hf})")
        
    if hf_cache:
        print(f"  HF_CACHE_HOME: {hf_cache}")
    else:
        print(f"  HF_CACHE_HOME: Not set (will use HF_HOME)")
        
    if transformers_cache:
        print(f"  TRANSFORMERS_CACHE: {transformers_cache}")
    else:
        print(f"  TRANSFORMERS_CACHE: Not set (will use HF_HOME)")
    
    print()

def get_folder_size(folder_path):
    """Get folder size in GB"""
    try:
        total_size = 0
        for dirpath, dirnames, filenames in os.walk(folder_path):
            for filename in filenames:
                filepath = os.path.join(dirpath, filename)
                try:
                    total_size += os.path.getsize(filepath)
                except (OSError, FileNotFoundError):
                    continue
        return total_size / (1024**3)  # Convert to GB
    except:
        return 0

def check_existing_caches():
    """Check what's already in the cache directories"""
    print("=== EXISTING CACHE CONTENTS ===")
    
    home = Path.home()
    cache_locations = [
        ("HuggingFace Hub", home / ".cache" / "huggingface" / "hub"),
        ("HuggingFace Transformers", home / ".cache" / "huggingface" / "transformers"), 
        ("PyTorch Hub", home / ".cache" / "torch" / "hub"),
        ("General Cache", home / ".cache"),
    ]
    
    for name, path in cache_locations:
        if path.exists():
            size = get_folder_size(path)
            print(f"  {name}: EXISTS - {size:.2f} GB")
            
            # List contents if it's small enough
            if size < 1.0:  # Less than 1GB, show contents
                try:
                    contents = list(path.iterdir())[:5]  # First 5 items
                    for item in contents:
                        item_size = get_folder_size(item) if item.is_dir() else item.stat().st_size / (1024**3)
                        print(f"    - {item.name}: {item_size:.3f} GB")
                    if len(list(path.iterdir())) > 5:
                        print(f"    ... and {len(list(path.iterdir())) - 5} more items")
                except:
                    print(f"    (Could not list contents)")
        else:
            print(f"  {name}: Does not exist")
    
    print()

def check_disk_space():
    """Check disk space in home and scratch"""
    print("=== DISK SPACE ===")
    
    import shutil
    
    # Home directory
    try:
        home_usage = shutil.disk_usage(Path.home())
        home_total = home_usage.total / (1024**3)
        home_used = home_usage.used / (1024**3) 
        home_free = home_usage.free / (1024**3)
        
        print(f"HOME directory:")
        print(f"  Total: {home_total:.1f} GB")
        print(f"  Used: {home_used:.1f} GB") 
        print(f"  Free: {home_free:.1f} GB")
        print(f"  Usage: {(home_used/home_total)*100:.1f}%")
        
        if home_free < 5:
            print("  ⚠️  WARNING: Less than 5GB free!")
            
    except Exception as e:
        print(f"Could not check home disk usage: {e}")
    
    # Scratch directory
    scratch_dir = f"/scratch/{os.environ.get('USER', 'unknown')}"
    if os.path.exists(scratch_dir):
        try:
            scratch_usage = shutil.disk_usage(scratch_dir)
            scratch_free = scratch_usage.free / (1024**3)
            scratch_total = scratch_usage.total / (1024**3)
            print(f"\nSCRATCH directory ({scratch_dir}):")
            print(f"  Free: {scratch_free:.1f} GB")
            print(f"  Total: {scratch_total:.1f} GB")
        except Exception as e:
            print(f"Could not check scratch usage: {e}")
    else:
        print(f"\nSCRATCH directory: {scratch_dir} does not exist")
    
    print()

def what_happens_when_loading():
    """Explain what happens when models load"""
    print("=== WHAT HAPPENS WHEN LOADING MODELS ===")
    print()
    print("1. HuggingFace AutoModel.from_pretrained():")
    print("   - Downloads model files to ~/.cache/huggingface/hub/")
    print("   - Each model gets a folder like 'models--Qwen--Qwen-7B-Chat'")
    print("   - Files include: pytorch_model.bin, config.json, tokenizer files")
    print("   - These can be 7-14GB+ PER MODEL")
    print()
    print("2. vLLM LLM() loading:")
    print("   - Loads the model files from HuggingFace cache")
    print("   - Loads model weights into GPU memory")
    print("   - May create additional cache files")
    print()
    print("3. When you create model copies (for ablation):")
    print("   - copy.deepcopy() duplicates the ENTIRE model in RAM")
    print("   - This can be 7-14GB+ in system RAM")
    print()
    print("4. When you save ablated models:")
    print("   - model.save_pretrained() saves to disk")
    print("   - Your code saves to './ablated_model_*' directories")
    print("   - Each ablated model is ANOTHER 7-14GB on disk")
    print()

def main():
    print("🔍 UNDERSTANDING CACHE AND STORAGE")
    print("=" * 50)
    
    check_environment_defaults()
    check_existing_caches() 
    check_disk_space()
    what_happens_when_loading()
    
    print("=" * 50)
    print("🤔 LIKELY ISSUES:")
    print("1. HuggingFace cache filling up your HOME directory")
    print("2. Ablated model checkpoints saving to current directory")
    print("3. Model copies using too much RAM")
    print("4. Multiple model versions accumulating")
    print()
    print("Run this script, then run your experiment, then run it again")
    print("to see what changed!")

if __name__ == "__main__":
    main()

🔍 UNDERSTANDING CACHE AND STORAGE
=== DEFAULT CACHE LOCATIONS ===
Your HOME directory: /home/an3854
Home directory size: 37.30 GB

HuggingFace Cache Variables:
  HF_HOME: /scratch/an3854/huggingface_cache
  HF_CACHE_HOME: Not set (will use HF_HOME)
  TRANSFORMERS_CACHE: Not set (will use HF_HOME)

=== EXISTING CACHE CONTENTS ===
  HuggingFace Hub: Does not exist
  HuggingFace Transformers: Does not exist
  PyTorch Hub: Does not exist
  General Cache: Does not exist

=== DISK SPACE ===
HOME directory:
  Total: 15117.5 GB
  Used: 11604.7 GB
  Free: 2857.4 GB
  Usage: 76.8%

SCRATCH directory (/scratch/an3854):
  Free: 5594529.5 GB
  Total: 7847311.7 GB

=== WHAT HAPPENS WHEN LOADING MODELS ===

1. HuggingFace AutoModel.from_pretrained():
   - Downloads model files to ~/.cache/huggingface/hub/
   - Each model gets a folder like 'models--Qwen--Qwen-7B-Chat'
   - Files include: pytorch_model.bin, config.json, tokenizer files
   - These can be 7-14GB+ PER MODEL

2. vLLM LLM() loading:
   - L

Loading model directly...


Downloading shards:  25%|██▌       | 1/4 [00:10<00:31, 10.40s/it]