In [10]:
import os
from pathlib import Path

# 设定项目内的 CLIP 权重缓存目录（便于可控与复现）
PROJECT_CLIP_DIR = "/Users/Mac/code/Practice/ML/CV/clip/.models/clip"
os.makedirs(PROJECT_CLIP_DIR, exist_ok=True)

# 可选：强制重新下载，先清理默认缓存与项目缓存里的旧权重
for p in [
    os.path.expanduser("~/.cache/clip/ViT-B-32.pt"),
    os.path.expanduser("~/.cache/clip/ViT-B-32.pt.gz"),
    os.path.join(PROJECT_CLIP_DIR, "ViT-B-32.pt"),
    os.path.join(PROJECT_CLIP_DIR, "ViT-B-32.pt.gz"),
]:
    try:
        Path(p).unlink()
        print("Removed:", p)
    except FileNotFoundError:
        pass
print("Cache cleanup done.")

Cache cleanup done.


In [14]:
import os
import clip
import torch
import numpy as np
from sklearn.linear_model import LogisticRegression
from torch.utils.data import DataLoader
from torchvision.datasets import CIFAR100
from tqdm import tqdm

# 设备优先级：CUDA > MPS(Apple Silicon) > CPU
if torch.cuda.is_available():
    device = "cuda"
elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print("Using device:", device)

# 对 MPS，禁用 jit 更稳定；同时把模型放到对应设备
model, preprocess = clip.load(
    "ViT-B/32",
    device=device,
    jit=False,  # MPS 更稳定
    download_root=PROJECT_CLIP_DIR
)
model.eval()

root = os.path.expanduser("/Users/Mac/code/Practice/ML/CV/clip")
train = CIFAR100(root, download=True, train=True, transform=preprocess)
test = CIFAR100(root, download=True, train=False, transform=preprocess)

# 简单的批大小/加载器参数按设备调整
BATCH_SIZE = 256 if device == "cuda" else (128 if device == "mps" else 64)
NUM_WORKERS = min(4, os.cpu_count() or 2)
PIN_MEMORY = device == "cuda"  # 仅对 CUDA 有效

Using device: mps


100%|████████████████████████████████████████| 338M/338M [07:38<00:00, 771kiB/s]


transform: 对样本的图像部分进行预处理的函数或 torchvision.transforms 组合。  
在你代码里传入的是 preprocess，它来自 model, preprocess = clip.load("ViT-B/32", ...).  
这个 preprocess 是 CLIP 官方为该模型准备的图像变换流水线，通常包含：  
resize 到模型输入分辨率（ViT-B/32 为 224）；  
中心裁剪；  
转成张量；  
用 CLIP 的 mean/std 进行归一化。  
作用：保证送入 model.encode_image() 的张量与 CLIP 预训练时的分布一致，避免精度损失。

In [15]:
def get_features(dataset, batch_size=BATCH_SIZE):
    all_features = []
    all_labels = []

    loader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=NUM_WORKERS,
        pin_memory=PIN_MEMORY,
        persistent_workers=NUM_WORKERS > 0,
    )

    with torch.no_grad():
        for images, labels in tqdm(loader):
            images = images.to(device, non_blocking=PIN_MEMORY)
            features = model.encode_image(images)
            all_features.append(features.cpu())   # 及时回收显存/MPS 内存
            all_labels.append(labels)

    return torch.cat(all_features).numpy(), torch.cat(all_labels).numpy()

In [16]:
train_features, train_labels = get_features(train)
test_features, test_labels = get_features(test)

classifier = LogisticRegression(random_state=0, C=0.316, max_iter=1000, verbose=1)
classifier.fit(train_features, train_labels)

predictions = classifier.predict(test_features)
accuracy = np.mean((test_labels == predictions).astype(float)) * 100.0
print(f"Accuracy = {accuracy:.3f}")

  0%|          | 1/391 [00:26<2:51:36, 26.40s/it]

: 