# Test 03: HuggingFace Downloads

**Purpose:** Actually download HuggingFace models and datasets, then verify they went to the correct locations.

**What we'll download:**
1. The tokenizer and config of a small transformer model (distilbert-base-uncased); the model weights are skipped to keep the test fast
2. A small slice of a dataset (imdb, just the first 100 training samples)

**Expected Locations:**
- **CoCalc Home:** `~/home_workspace/downloads/huggingface/` and `~/home_workspace/data/`
- **Compute Server:** `~/cs_workspace/downloads/huggingface/` and `~/cs_workspace/data/`
- **NOT:** `~/.cache/huggingface/`

**Run this on:** Both CoCalc base and Compute Server

---

## IMPORTANT: Restart kernel before running this test!

In [None]:
# DS776 Environment Setup & Package Update
# Configures storage paths for proper cleanup/sync, then updates introdl if needed
# If this cell fails, see Lessons/Course_Tools/AUTO_UPDATE_SYSTEM.md for help
# NOTE(review): presumably this script is what sets HF_HOME / HF_DATASETS_CACHE,
# which the cells below read - confirm in auto_update_introdl.py
# The script path is relative, so it resolves against the kernel's working directory.
%run ../../Lessons/Course_Tools/auto_update_introdl.py

In [None]:
# Pre-download check: what is already in ~/.cache/huggingface?
# Prints a baseline of the default (unwanted) cache location before any
# download happens, so pre-existing content can be told apart from new content.
from pathlib import Path
import os


def cache_size_bytes(root):
    """Return the total size in bytes of the regular files under *root*.

    Symlinks are skipped.  The Hugging Face hub cache stores each file once
    under ``blobs/`` and exposes it again through symlinks under
    ``snapshots/``; following those links would count every file twice.
    """
    return sum(
        f.stat().st_size
        for f in Path(root).rglob('*')
        if f.is_file() and not f.is_symlink()
    )


home = Path.home()
bad_cache = home / '.cache' / 'huggingface'

print("=" * 60)
print("PRE-DOWNLOAD CHECK: ~/.cache/huggingface")
print("=" * 60)

if bad_cache.exists():
    # Overall size first, then a per-subdirectory breakdown
    total_size = cache_size_bytes(bad_cache)
    print(f"WARNING: ~/.cache/huggingface exists ({total_size / 1024 / 1024:.1f} MB)")
    print("Contents:")
    for item in bad_cache.iterdir():
        if item.is_dir():
            size = cache_size_bytes(item)
            print(f"  {item.name}/: {size / 1024 / 1024:.1f} MB")
    print("\nNote: This is pre-existing content, not from this test.")
else:
    print("Good: ~/.cache/huggingface does not exist")

In [None]:
# Report the configured HuggingFace cache locations and make sure they exist
import os
from pathlib import Path

divider = "=" * 60
print(f"\n{divider}")
print("EXPECTED CACHE LOCATIONS")
print(divider)

# 'NOT SET' is both the text shown to the user and the "variable missing" marker
hf_home, hf_datasets = (
    os.environ.get(var, 'NOT SET') for var in ('HF_HOME', 'HF_DATASETS_CACHE')
)

print(f"HF_HOME: {hf_home}")
print(f"HF_DATASETS_CACHE: {hf_datasets}")

# Pre-create whichever cache directories are configured
for configured in (hf_home, hf_datasets):
    if configured == 'NOT SET':
        continue
    Path(configured).mkdir(parents=True, exist_ok=True)

## Test A: Download a Transformer Model

In [None]:
# Fetch the tokenizer and config of a small model.
# The model weights are deliberately not loaded (faster), so AutoModel is
# imported but never called in this cell.
from transformers import AutoConfig, AutoModel, AutoTokenizer

model_name = "distilbert-base-uncased"
print(f"\nDownloading model: {model_name}")
print("(This may take a minute on first run...)\n")

# Step 1: tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
print("Tokenizer downloaded successfully")

# Step 2: config only
config = AutoConfig.from_pretrained(model_name)
print("Config downloaded successfully")
print(f"Model type: {config.model_type}")

In [None]:
# Verify model download location
# Looks for a distilbert entry under $HF_HOME/hub (wanted) and under
# ~/.cache/huggingface/hub (unwanted), and reports every outcome explicitly.
from pathlib import Path
import os


def resolve_cache_dir(var_name, env=None):
    """Return the directory named by environment variable *var_name*, or None.

    An unset or empty variable yields None rather than ``Path('')``: the empty
    path means the current working directory, which would make the checks
    below inspect the wrong folder.  *env* defaults to ``os.environ``.
    """
    value = (os.environ if env is None else env).get(var_name, '')
    return Path(value) if value else None


def cache_size_bytes(root):
    """Return the total size in bytes of the regular files under *root*.

    Symlinks are skipped.  The Hugging Face hub cache stores each file once
    under ``blobs/`` and exposes it again through symlinks under
    ``snapshots/``; following those links would count every file twice.
    """
    return sum(
        f.stat().st_size
        for f in Path(root).rglob('*')
        if f.is_file() and not f.is_symlink()
    )


print("\n" + "=" * 60)
print("MODEL DOWNLOAD VERIFICATION")
print("=" * 60)

home = Path.home()

# Check expected location
hf_home = resolve_cache_dir('HF_HOME')
expected_hub = hf_home / 'hub' if hf_home is not None else None

if expected_hub is None:
    print("\nWARNING: HF_HOME is not set - cannot check the expected location")
elif not expected_hub.exists():
    print(f"\nWARNING: Expected hub cache does not exist: {expected_hub}")
else:
    print(f"\nCorrect location ({expected_hub}):")
    found_models = [d for d in expected_hub.iterdir() if 'distilbert' in d.name.lower()]
    for item in found_models:
        size = cache_size_bytes(item)
        print(f"  FOUND: {item.name} ({size / 1024 / 1024:.1f} MB)")
    if not found_models:
        # Without this line a failed redirect would look like a silent pass
        print("  WARNING: No distilbert model found here")

# Check bad location
bad_hub = home / '.cache' / 'huggingface' / 'hub'
if bad_hub.exists():
    # Name-based match only: pre-existing distilbert entries are flagged too
    new_distilbert = [d for d in bad_hub.iterdir() if 'distilbert' in d.name.lower()]
    if new_distilbert:
        print("\nWARNING: Model found in ~/.cache/huggingface/hub!")
        for d in new_distilbert:
            print(f"  {d.name}")
    else:
        print("\nGood: No new distilbert in ~/.cache/huggingface/hub")
else:
    print("\nGood: ~/.cache/huggingface/hub does not exist")

## Test B: Download a Dataset

In [None]:
# Load the first 100 training examples of the IMDB dataset
from datasets import load_dataset

print("\nDownloading IMDB dataset (first 100 samples)...")
print("(This may take a minute on first run...)\n")

# NOTE(review): the split slice limits what is returned; the full train split
# is probably still downloaded and cached - confirm against the datasets docs
dataset = load_dataset("imdb", split="train[:100]")
print("Dataset downloaded successfully")

sample_count = len(dataset)
print(f"Number of samples: {sample_count}")
print(f"Features: {dataset.features}")

In [None]:
# Verify dataset download location
# Looks for an IMDB directory under $HF_DATASETS_CACHE (wanted) and under
# ~/.cache/huggingface/datasets (unwanted), and reports every outcome explicitly.
from pathlib import Path
import os


def resolve_cache_dir(var_name, env=None):
    """Return the directory named by environment variable *var_name*, or None.

    An unset or empty variable yields None rather than ``Path('')``: the empty
    path is the current working directory, which always exists and would be
    scanned and reported as the "correct location".  *env* defaults to
    ``os.environ``.
    """
    value = (os.environ if env is None else env).get(var_name, '')
    return Path(value) if value else None


def cache_size_bytes(root):
    """Return the total size in bytes of the regular files under *root*.

    Symlinks are skipped so that files reachable both directly and through a
    link (as in the Hugging Face hub cache layout) are counted only once.
    """
    return sum(
        f.stat().st_size
        for f in Path(root).rglob('*')
        if f.is_file() and not f.is_symlink()
    )


print("\n" + "=" * 60)
print("DATASET DOWNLOAD VERIFICATION")
print("=" * 60)

home = Path.home()

# Check expected location
hf_datasets = resolve_cache_dir('HF_DATASETS_CACHE')

if hf_datasets is None:
    print("\nWARNING: HF_DATASETS_CACHE is not set - cannot check the expected location")
elif not hf_datasets.exists():
    print(f"\nWARNING: Expected datasets cache does not exist: {hf_datasets}")
else:
    print(f"\nCorrect location ({hf_datasets}):")
    # First directory anywhere in the cache whose name mentions imdb
    imdb_dir = next(
        (p for p in hf_datasets.rglob('*') if 'imdb' in p.name.lower() and p.is_dir()),
        None,
    )
    if imdb_dir is not None:
        size = cache_size_bytes(imdb_dir)
        print(f"  FOUND: {imdb_dir.relative_to(hf_datasets)} ({size / 1024 / 1024:.1f} MB)")
    else:
        # Fall back to a top-level listing so the reader can see what is there
        print("  WARNING: No IMDB directory found; top-level contents:")
        for item in hf_datasets.iterdir():
            if item.is_dir():
                size = cache_size_bytes(item)
                print(f"  {item.name}/: {size / 1024 / 1024:.1f} MB")

# Check bad location
bad_datasets = home / '.cache' / 'huggingface' / 'datasets'
if bad_datasets.exists():
    # Case-insensitive, to agree with the expected-location check above
    imdb_in_bad = [p for p in bad_datasets.rglob('*') if 'imdb' in p.name.lower()]
    if imdb_in_bad:
        print("\nWARNING: IMDB found in ~/.cache/huggingface/datasets!")
        for d in imdb_in_bad[:3]:  # Show first 3
            print(f"  {d}")
    else:
        print("\nGood: No IMDB in ~/.cache/huggingface/datasets")
else:
    print("\nGood: ~/.cache/huggingface/datasets does not exist")

## Final Summary

In [None]:
# Final summary
# Prints the size of each configured cache location and of the default
# ~/.cache/huggingface location, stating explicitly when one is unset or missing.
from pathlib import Path
import os


def resolve_cache_dir(var_name, env=None):
    """Return the directory named by environment variable *var_name*, or None.

    An unset or empty variable yields None rather than ``Path('')``: the empty
    path is the current working directory, whose size would otherwise be
    reported as if it were the cache.  *env* defaults to ``os.environ``.
    """
    value = (os.environ if env is None else env).get(var_name, '')
    return Path(value) if value else None


def cache_size_bytes(root):
    """Return the total size in bytes of the regular files under *root*.

    Symlinks are skipped.  The Hugging Face hub cache stores each file once
    under ``blobs/`` and exposes it again through symlinks under
    ``snapshots/``; following those links would count every file twice.
    """
    return sum(
        f.stat().st_size
        for f in Path(root).rglob('*')
        if f.is_file() and not f.is_symlink()
    )


print("\n" + "=" * 60)
print("FINAL SUMMARY: HuggingFace Downloads")
print("=" * 60)

home = Path.home()

# Check correct locations
hf_home = resolve_cache_dir('HF_HOME')
hf_datasets = resolve_cache_dir('HF_DATASETS_CACHE')

print("\nCorrect Locations:")
for label, location in (('HF_HOME', hf_home), ('HF_DATASETS_CACHE', hf_datasets)):
    if location is None:
        print(f"  {label}: NOT SET")
    elif location.exists():
        size = cache_size_bytes(location)
        print(f"  {label}: {location} ({size / 1024 / 1024:.1f} MB)")
    else:
        print(f"  {label}: {location} (does not exist)")

# Check bad location
bad_cache = home / '.cache' / 'huggingface'
print("\nBad Location (~/.cache/huggingface):")
if bad_cache.exists():
    size = cache_size_bytes(bad_cache)
    print(f"  EXISTS with {size / 1024 / 1024:.1f} MB")
    print("  (May be pre-existing content, check timestamps)")
else:
    print("  Does not exist - PERFECT!")

print("\n" + "=" * 60)
print("If downloads went to correct locations, the fix is working!")
print("=" * 60)

## Next Steps

- **Test_04:** Test torchvision model downloads
- **Test_05:** Full verification of all cache locations