# Setup for pre-cache of HF models used in the course lessons

Requires the Hugging Face CLI tool (`huggingface-cli`) to be installed, e.g. `brew install huggingface-cli` or `pip install -U "huggingface_hub[cli]"`.

Specify the list of models to be downloaded. Models that are already in the local cache will not be downloaded again.

Note: The usual cache location is `~/.cache/huggingface`.

In [14]:
# Shell escape: list the contents of the usual Hugging Face cache directory.
!ls ~/.cache/huggingface

datasets                            modules
hub                                 stored_tokens
models--HuggingFaceTB--SmolLM2-135M token
models--Qwen--Qwen3-0.6B-Base       xet
models--banghua--Qwen3-0.6B-SFT


In [15]:
# Hugging Face repo IDs ("<owner>/<name>") of the models used in the course lessons.
lesson_models = [
    "Qwen/Qwen3-0.6B-Base",
    "banghua/Qwen3-0.6B-SFT",
    # Was fused with the DPO repo ID into one invalid string; the DPO model
    # keeps its own entry below.
    "HuggingFaceTB/SmolLM2-135M",
    "HuggingFaceTB/SmolLM2-135M-Instruct",
    "banghua/Qwen2.5-0.5B-DPO",
    "Qwen/Qwen2.5-0.5B-Instruct",
    "banghua/Qwen2.5-0.5B-GRPO",
]

In [6]:
# Hub repo IDs ("<owner>/<name>") that the download cell further down will fetch.
hf_models_to_download = [
    "smol-ai/SmolVLM-256M-Instruct",
    "HuggingFaceTB/SmolLM2-135M-Instruct",
]

In [9]:
import httpx

def hf_models_to_urls_with_status(model_ids, repo_type="model", timeout=5.0):
    """
    Build the Hugging Face Hub URL for each repo ID and check whether it resolves.

    Args:
        model_ids (iterable of str): Repo IDs like ['smol-ai/SmolVLM-256M-Instruct'].
            A single string is treated as one ID rather than iterated
            character by character.
        repo_type (str): 'model', 'dataset', or 'space'
        timeout (float): HTTP timeout in seconds

    Returns:
        list of dicts, one per ID in input order, with keys: 'id', 'url',
        'exists' (bool, True only for HTTP 200) and 'status_code'
        (int, or None when the request itself failed).

    Raises:
        ValueError: If repo_type is not one of the supported values.
    """
    # URL path prefix per repo type; models live at the site root,
    # e.g. /facebook/opt-125m
    type_paths = {"model": "", "dataset": "/datasets", "space": "/spaces"}
    if repo_type not in type_paths:
        raise ValueError(f"Invalid repo_type: {repo_type}")

    base_url = "https://huggingface.co"
    type_path = type_paths[repo_type]

    if isinstance(model_ids, str):
        model_ids = [model_ids]
    model_ids = list(model_ids)
    if not model_ids:
        # Nothing to check, so do not open an HTTP client at all.
        return []

    results = []
    with httpx.Client(timeout=timeout, follow_redirects=True) as client:
        for model_id in model_ids:
            url = f"{base_url}{type_path}/{model_id}"
            try:
                status_code = client.get(url).status_code
            except httpx.RequestError as e:
                # Network-level failure (DNS, timeout, ...): no status available.
                status_code = None
                print(f"⚠️ Request to {url} failed: {e}")
            # Track the status code itself rather than testing the response
            # object's truthiness, so the code is reported for any response.
            results.append({
                "id": model_id,
                "url": url,
                "exists": status_code == 200,
                "status_code": status_code,
            })
    return results


In [10]:
# Check every queued repo ID against the Hub; the cell displays the returned list of result dicts.
hf_models_to_urls_with_status(hf_models_to_download)

[{'id': 'smol-ai/SmolVLM-256M-Instruct',
  'url': 'https://huggingface.co/smol-ai/SmolVLM-256M-Instruct',
  'exists': False,
  'status_code': 401},
 {'id': 'HuggingFaceTB/SmolLM2-135M-Instruct',
  'url': 'https://huggingface.co/HuggingFaceTB/SmolLM2-135M-Instruct',
  'exists': True,
  'status_code': 200}]

In [11]:
def hf_repo_exists(model_id, repo_type="model", timeout=5.0):
    """Check if a Hugging Face model/dataset/space exists.

    Args:
        model_id (str): Repo ID such as 'HuggingFaceTB/SmolLM2-135M-Instruct'.
        repo_type (str): 'model', 'dataset', or 'space'.
        timeout (float): HTTP timeout in seconds.

    Returns:
        bool: True only when the repo page answers with HTTP 200. Any other
        status, or a failed request (which also prints a warning), gives False.

    Raises:
        KeyError: If repo_type is not one of the supported values.
    """
    base_url = "https://huggingface.co"
    # URL path prefix per repo type; models live at the site root.
    type_path = {
        "model": "",
        "dataset": "/datasets",
        "space": "/spaces"
    }[repo_type]

    url = f"{base_url}{type_path}/{model_id}"
    try:
        resp = httpx.get(url, follow_redirects=True, timeout=timeout)
    except httpx.RequestError as e:
        print(f"⚠️ Error checking {url}: {e}")
        return False
    return resp.status_code == 200

In [13]:
# üîß Set dry_run = True to skip downloading
dry_run = False  # toggle this to False when you're ready to download

# Download or preview
n_checked = 0
n_found = 0
n_downloaded = 0

for repo in hf_models_to_download:
    print(f"üîé Checking: {repo}...", end=" ")
    n_checked += 1
    if hf_repo_exists(repo):
        n_found += 1
        print("‚úÖ Found", end=" ")
        if dry_run:
            print("‚Äî dry run (skipped download)")
        else:
            print("‚Äî downloading")
            !huggingface-cli download {repo}
            n_downloaded += 1
    else:
        print("‚ùå Not found ‚Äî skipped")

print("\nüü© Summary:")
print(f"  üîç Checked    : {n_checked}")
print(f"  ‚úÖ Found      : {n_found}")
print(f"  üì• Downloaded : {n_downloaded} {'(dry run)' if dry_run else ''}")

🔎 Checking: smol-ai/SmolVLM-256M-Instruct... ❌ Not found — skipped
🔎 Checking: HuggingFaceTB/SmolLM2-135M-Instruct... ✅ Found — downloading
Fetching 25 files:   0%|                                 | 0/25 [00:00<?, ?it/s]Downloading 'onnx/model_fp16.onnx' to '/Users/mjboothaus/.cache/huggingface/hub/models--HuggingFaceTB--SmolLM2-135M-Instruct/blobs/e85e706ce14a3499e1f698c0d638b430df75b5f67131d1516bd0d46b844dfb2e.incomplete'
Downloading 'onnx/model_quantized.onnx' to '/Users/mjboothaus/.cache/huggingface/hub/models--HuggingFaceTB--SmolLM2-135M-Instruct/blobs/ecc1a19eece6494e2963cf74f78ace35916e8d0b803168ddd41db00979f18e39.incomplete' (resume from 137147981/137147981)
Downloading 'onnx/model_int8.onnx' to '/Users/mjboothaus/.cache/huggingface/hub/models--HuggingFaceTB--SmolLM2-135M-Instruct/blobs/a7c33f9ef85d06734cc9d1f943f7e2ba57c77e769df727f4d3e217d9f672b0cc.incomplete' (resume from 137147867/137147867)
Downloading 'onnx/model_q4.onnx' to '/Users/mjboothaus/.cache/hugging