In [1]:
!pip install torch torchvision transformers ftfy
!pip install git+https://github.com/openai/CLIP.git

Collecting ftfy
  Downloading ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-

In [2]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
from PIL import Image

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms

import clip  
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [3]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

cuda


In [4]:
# Load the "ViT-L/14" CLIP checkpoint onto `device`; `preprocess` is the second value returned by clip.load.
model, preprocess = clip.load("ViT-L/14", device=device)

100%|████████████████████████████████████████| 890M/890M [00:06<00:00, 137MiB/s]


In [5]:
# Disable gradients for every CLIP parameter so the backbone is not updated during training.
for p in model.parameters():
    p.requires_grad = False # freeze the model

In [7]:
class ProjectionHead(nn.Module):
    """Small MLP head: Linear -> GELU -> LayerNorm.

    Args:
        in_dim: size of the incoming feature vector.
        out_dim: size of the projected feature vector.
    """

    def __init__(self, in_dim=768, out_dim=256):
        super().__init__()
        stages = [
            nn.Linear(in_dim, out_dim),
            nn.GELU(),
            nn.LayerNorm(out_dim),
        ]
        self.proj = nn.Sequential(*stages)

    def forward(self, x):
        """Project `x` through the head and return the result."""
        projected = self.proj(x)
        return projected

In [8]:
# One trainable projection head per modality (768 -> 256), both placed on `device`.
proj_image = ProjectionHead(768, 256).to(device)
proj_text = ProjectionHead(768, 256).to(device)

In [9]:
# Single linear layer mapping a 256-d vector to the class logits.
classifier = nn.Sequential(
    nn.Linear(256, 3)).to(device) # three classes

In [14]:
class ImageTextDataset(Dataset):
    """Dataset of (text, image) pairs backed by a DataFrame and an image folder.

    Each row must provide 'text' and 'filename'. In train mode the row must
    also provide 'mark', which is converted to a class index via `label_map`.

    Items are `(text, image, label)` in train mode and
    `(text, image, filename)` otherwise, where `filename` is the name after
    the "competition_data:" -> "competition_data_" substitution.
    """

    def __init__(self, df, image_dir, transform, train_mode=True):
        self.label_map = {'Плохо': 0, 'Удовлетворительно': 1, 'Идеально': 2}
        self.train_mode = train_mode
        self.transform = transform
        self.image_dir = image_dir
        self.df = df

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        record = self.df.iloc[idx]
        text = record['text']
        # The table uses "competition_data:" where the path lookup uses "competition_data_".
        filename = record['filename'].replace("competition_data:", "competition_data_")
        img_path = os.path.join(self.image_dir, filename)

        # A missing file is replaced by a black 224x224 placeholder.
        if os.path.exists(img_path):
            picture = Image.open(img_path).convert("RGB")
        else:
            picture = Image.new("RGB", (224, 224), (0, 0, 0))
        picture = self.transform(picture)

        if not self.train_mode:
            return text, picture, filename
        return text, picture, self.label_map[record['mark']]

In [19]:
# Per-channel mean / std passed to Normalize in both pipelines below.
clip_mean = (0.48145466, 0.4578275, 0.40821073)
clip_std = (0.26862954, 0.26130258, 0.27577711)

# Training pipeline: random crop, flip, color jitter and occasional blur,
# followed by tensor conversion and normalization.
train_transform = transforms.Compose([
    transforms.RandomResizedCrop(224, scale=(0.8, 1.0)),
    transforms.RandomHorizontalFlip(),
    transforms.ColorJitter(brightness=0.3, contrast=0.3, saturation=0.3, hue=0.05),
    transforms.RandomApply([transforms.GaussianBlur(3)], p=0.3),
    transforms.ToTensor(),
    transforms.Normalize(clip_mean, clip_std),
])

# Validation / inference pipeline: resize and center crop only, no random augmentation.
val_transform = transforms.Compose([
    transforms.Resize(224, interpolation=Image.BICUBIC),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(clip_mean, clip_std),
])


In [20]:
# Read the tab-separated train / test tables.
train_df = pd.read_csv("/kaggle/input/image-text-matching-dataset/train_df.tsv", sep='\t')
test_df = pd.read_csv("/kaggle/input/image-text-matching-dataset/test_df.tsv", sep="\t")

# Hold out 15% of the training rows for validation, stratified on the "mark" column; fixed seed.
train_split, val_split = train_test_split(train_df, test_size=0.15, random_state=42, stratify=train_df["mark"])

image_dir_train = "/kaggle/input/image-text-matching-dataset/train_final/train"
image_dir_test = "/kaggle/input/image-text-matching-dataset/test_final/test"

# Both splits read images from the train folder; only the training split is augmented.
train_dataset = ImageTextDataset(train_split, image_dir_train, train_transform, train_mode=True)
val_dataset = ImageTextDataset(val_split, image_dir_train, val_transform, train_mode=True)

# Only the training loader shuffles.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, num_workers=2)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False, num_workers=2)

In [16]:
# Cross-entropy over the class logits.
criterion = nn.CrossEntropyLoss()
# AdamW over the two projection heads and the classifier; CLIP parameters are not passed to the optimizer.
optimizer = torch.optim.AdamW(
    list(proj_image.parameters()) + list(proj_text.parameters()) + list(classifier.parameters()),
    lr=2e-4, weight_decay=1e-4
)

In [22]:
def train_epoch(model, proj_image, proj_text, classifier, dataloader, optimizer, criterion):
    """Run one optimization pass over `dataloader`.

    The CLIP `model` is kept in eval mode and queried without gradients; the
    projection heads and the classifier are put in train mode and updated.

    Returns:
        The sum of per-batch losses divided by the number of batches.
    """
    model.eval()
    for trainable in (proj_image, proj_text, classifier):
        trainable.train()

    running_loss = 0
    for texts, images, labels in tqdm(dataloader):
        images, labels = images.to(device), labels.to(device)
        tokens = clip.tokenize(texts, truncate=True).to(device)

        # Encoder outputs are computed without building a graph.
        with torch.no_grad():
            image_emb = model.encode_image(images)
            text_emb = model.encode_text(tokens)

        image_vec = proj_image(image_emb.float())
        text_vec = proj_text(text_emb.float())

        # Scale each projected vector to unit L2 norm.
        image_vec = image_vec / image_vec.norm(dim=-1, keepdim=True)
        text_vec = text_vec / text_vec.norm(dim=-1, keepdim=True)

        # Fuse the two modalities by averaging, then classify.
        logits = classifier((image_vec + text_vec) / 2)
        batch_loss = criterion(logits, labels)

        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()
        running_loss += batch_loss.item()

    return running_loss / len(dataloader)


def evaluate(model, proj_image, proj_text, classifier, dataloader):
    """Compute classification accuracy over `dataloader`.

    All four modules are switched to eval mode and the whole pass runs
    without gradients.

    Returns:
        Number of correct predictions divided by the number of samples.
    """
    for module in (model, proj_image, proj_text, classifier):
        module.eval()

    hits = 0
    seen = 0
    with torch.no_grad():
        for texts, images, labels in dataloader:
            images, labels = images.to(device), labels.to(device)
            tokens = clip.tokenize(texts, truncate=True).to(device)

            image_emb = model.encode_image(images)
            text_emb = model.encode_text(tokens)

            image_vec = proj_image(image_emb.float())
            text_vec = proj_text(text_emb.float())

            # Scale each projected vector to unit L2 norm.
            image_vec = image_vec / image_vec.norm(dim=-1, keepdim=True)
            text_vec = text_vec / text_vec.norm(dim=-1, keepdim=True)

            logits = classifier((image_vec + text_vec) / 2)
            predicted = logits.argmax(dim=1)

            hits += (predicted == labels).sum().item()
            seen += labels.size(0)

    return hits / seen

In [23]:
# Train for five epochs; after each one print the mean training loss and the validation accuracy.
for epoch in range(5):
    loss = train_epoch(model, proj_image, proj_text, classifier, train_loader, optimizer, criterion)
    acc = evaluate(model, proj_image, proj_text, classifier, val_loader)
    print(f"Epoch {epoch+1}/5 | Loss: {loss:.4f} | Val acc: {acc:.4f}")

 53%|█████▎    | 738/1380 [05:42<05:09,  2.07it/s]Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7deecab1dd00>
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1601, in _shutdown_workers
    if w.is_alive():
       ^^^^^^^^^^^^
  File "/usr/lib/python3.11/multiprocessing/process.py", line 160, in is_alive
    assert self._parent_pid == os.getpid(), 'can only test a child process'
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
AssertionError: can only test a child process
 54%|█████▎    | 741/1380 [05:44<04:55,  2.17it/s]Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7deecab1dd00>
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__
    self._shutdow

Epoch 1/5 | Loss: 0.8808 | Val acc: 0.5998


100%|██████████| 1380/1380 [09:21<00:00,  2.46it/s]


Epoch 2/5 | Loss: 0.8310 | Val acc: 0.6119


100%|██████████| 1380/1380 [08:40<00:00,  2.65it/s]


Epoch 3/5 | Loss: 0.8113 | Val acc: 0.6096


100%|██████████| 1380/1380 [08:25<00:00,  2.73it/s]


Epoch 4/5 | Loss: 0.7961 | Val acc: 0.6075


100%|██████████| 1380/1380 [08:16<00:00,  2.78it/s]


Epoch 5/5 | Loss: 0.7791 | Val acc: 0.6088


In [29]:
def predict(model, proj_image, proj_text, classifier, df, image_dir, transform):
    """Predict a mark for every row of `df`.

    Args:
        model: CLIP model providing `encode_image` / `encode_text`.
        proj_image, proj_text: projection heads for the two modalities.
        classifier: module mapping the fused vector to class logits.
        df: DataFrame with 'filename' and 'text' columns.
        image_dir: folder containing the images.
        transform: image transform applied by the dataset.

    Returns:
        DataFrame with columns 'filename', 'text', 'mark', one row per input
        row and in input order (the loader does not shuffle). 'filename' is
        the name as returned by ImageTextDataset, i.e. after its
        "competition_data:" -> "competition_data_" substitution.
    """
    # Put every module in inference mode, as evaluate() does. Without this the
    # output depends on whichever mode the previous call left the modules in.
    model.eval()
    proj_image.eval()
    proj_text.eval()
    classifier.eval()

    dataset = ImageTextDataset(df, image_dir, transform, train_mode=False)
    loader = DataLoader(dataset, batch_size=16, shuffle=False, num_workers=0)  # num_workers=0 for stability

    preds_all, filenames, texts_list = [], [], []
    with torch.no_grad():
        for texts, images, names in tqdm(loader):
            images = images.to(device)
            text_tokens = clip.tokenize(texts, truncate=True).to(device)

            # Encoder embeddings
            img_features = model.encode_image(images)
            txt_features = model.encode_text(text_tokens)

            # Projection heads; cast to float32 because the heads are float32
            img_proj = proj_image(img_features.float())
            txt_proj = proj_text(txt_features.float())

            # Scale each projected vector to unit L2 norm
            img_proj = img_proj / img_proj.norm(dim=-1, keepdim=True)
            txt_proj = txt_proj / txt_proj.norm(dim=-1, keepdim=True)

            # Fuse by averaging and classify
            combined = (img_proj + txt_proj) / 2
            outputs = classifier(combined)

            preds_all.extend(outputs.argmax(dim=1).tolist())
            filenames.extend(names)
            texts_list.extend(texts)

    # Map class indices back to the label strings
    label_map_inv = {0: "Плохо", 1: "Удовлетворительно", 2: "Идеально"}

    return pd.DataFrame({
        "filename": filenames,
        "text": texts_list,
        "mark": [label_map_inv[p] for p in preds_all]
    })


# Build the submission table from the test-set predictions
submission = predict(model, proj_image, proj_text, classifier, test_df, image_dir_test, val_transform)


100%|██████████| 750/750 [12:18<00:00,  1.02it/s]

✅ submission.tsv создан!





In [34]:
def replace_second_underscore(s):
    """Return `s` with its second underscore replaced by a colon.

    Strings containing fewer than two underscores are returned unchanged.
    """
    pieces = s.split('_', 2)  # at most three pieces: head, middle, remainder
    if len(pieces) < 3:
        return s
    head, middle, remainder = pieces
    return head + '_' + middle + ':' + remainder

# Turn the second underscore of each filename back into a colon ("competition_data_N" -> "competition_data:N").
submission['filename'] = submission['filename'].apply(replace_second_underscore)

In [35]:
submission['filename']

0         competition_data:46008.png
1        competition_data:354303.png
2         competition_data:98673.png
3        competition_data:208734.png
4        competition_data:260487.png
                    ...             
11995     competition_data:85036.png
11996    competition_data:333914.png
11997     competition_data:97622.png
11998    competition_data:237715.png
11999    competition_data:325273.png
Name: filename, Length: 12000, dtype: object

In [36]:
submission

Unnamed: 0,filename,text,mark
0,competition_data:46008.png,диаграмма при аритмии сердца,Удовлетворительно
1,competition_data:354303.png,оборона полоцка в 1941 году,Удовлетворительно
2,competition_data:98673.png,короба под инсталляцию,Удовлетворительно
3,competition_data:208734.png,раскраски энчантималс,Идеально
4,competition_data:260487.png,дани милохина,Идеально
...,...,...,...
11995,competition_data:85036.png,проект мой город 2 класс окружающий мир,Удовлетворительно
11996,competition_data:333914.png,страны и заповедникиподмосковье,Идеально
11997,competition_data:97622.png,принцип обеспечивающий единство общего специал...,Удовлетворительно
11998,competition_data:237715.png,инн огрн 440008 г пенза ул ставского 11,Удовлетворительно


In [37]:
# Write the submission as a tab-separated file without the index column.
# The path is held in one variable so the confirmation message always names
# the file that was actually written.
submission_path = "submission2.tsv"
submission.to_csv(submission_path, sep='\t', index=False)
print(f"✅ {submission_path} создан!")

✅ submission.tsv создан!


In [None]:
import pandas as pd
from tqdm import tqdm
import torch
import clip
from PIL import Image
# Тестовый DataFrame с колонками 'filename' и 'text'
# test_df = pd.read_csv("test_data.csv") или другой источник

# Zero-shot baseline: mark each test pair from the raw CLIP image-text cosine similarity.
filenames = []
texts = []
marks = []

model.eval()
with torch.no_grad():
    for _, row in tqdm(test_df.iterrows(), total=len(test_df)):
        filename = row['filename']
        text = row['text']

        # Same "competition_data:" -> "competition_data_" mapping that
        # ImageTextDataset applies before building the path. The submission
        # keeps the original name from the table.
        disk_name = filename.replace("competition_data:", "competition_data_")

        # Load the image and apply the CLIP preprocessing. `image_dir` was never
        # defined in this notebook; the test images are read from image_dir_test.
        image = Image.open(f"{image_dir_test}/{disk_name}").convert("RGB")
        image = preprocess(image).unsqueeze(0).to(device)

        # Tokenize the text
        text_tokens = clip.tokenize([text], truncate=True).to(device)

        # Encoder embeddings
        image_features = model.encode_image(image)
        text_features = model.encode_text(text_tokens)

        # Cosine similarity between the two embeddings
        similarity = torch.cosine_similarity(image_features, text_features)
        sim_value = similarity.item()

        # Simple threshold rule for the mark.
        # NOTE(review): raw CLIP image-text cosine similarities are usually well
        # below 0.5, so these cut-offs may put almost everything in the last
        # branch. Calibrate them on the validation split before relying on this.
        if sim_value > 0.75:
            mark = "Идеально"
        elif sim_value > 0.5:
            mark = "Удовлетворительно"
        else:
            mark = "Плохо"

        filenames.append(filename)
        texts.append(text)
        marks.append(mark)

# Build the DataFrame and save it as TSV
submission = pd.DataFrame({
    "filename": filenames,
    "text": texts,
    "mark": marks
})

submission.to_csv("submission.tsv", sep='\t', index=False)
print("✅ submission.tsv создан!")


In [None]:
# NOTE(review): earlier cells read "test_df.tsv" (tab-separated); confirm that
# "test.csv" exists in the dataset and has 'filename' and 'text' columns.
test_df = pd.read_csv("/kaggle/input/image-text-matching-dataset/test.csv")
test_dir = "/kaggle/input/image-text-matching-dataset/test_final/test"

# predict() returns a DataFrame (filename, text, mark) with one row per input
# row, in input order. Assigning that whole frame to a single column raises, so
# only its 'mark' values are copied, positionally.
predictions = predict(model, proj_image, proj_text, classifier, test_df, test_dir, val_transform)
test_df["mark"] = predictions["mark"].to_numpy()
test_df[["filename", "mark"]].to_csv("submission.csv", index=False)
print("✅ submission.csv создан!")


# Улучшение решения

In [None]:
import os
from typing import List, Tuple, Optional, Dict
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms
from PIL import Image
import numpy as np

In [None]:
#улучшение описания фотографий 

def generate_prompts_for_label(label: str) -> List[str]:
    """Build a varied set of text prompts for one class label.

    The templates cover different photographic styles and contexts so that
    the prompts can later be ensembled.

    Args:
        label: class name substituted into every template.

    Returns:
        One prompt string per template, in template order.
    """
    templates = (
        "a photo of a {}",
        "a close-up photo of a {}",
        "a high quality photo of a {}",
        "a detailed image of a {}",
        "an image of a {} in the wild",
        "a professional photograph of a {}",
        "a {} on a plain background",
        "a {} with strong lighting",
        "a {} in a natural scene",
        "a cropped photo of a {}",
    )
    return [template.format(label) for template in templates]




def ensemble_text_features(model, tokenizer, device: torch.device, labels: List[str]) -> Tuple[torch.Tensor, List[str]]:
    """Compute one prompt-ensembled text embedding per label.

    For each label, every prompt from generate_prompts_for_label is encoded
    and L2-normalized; the prompt embeddings are then averaged and the mean is
    L2-normalized again.

    Args:
        model: uses `get_text_features(**tokenized)` when the attribute
            exists, otherwise `encode_text(tokenized['input_ids'])`.
        tokenizer: called as `tokenizer(prompts, return_tensors='pt',
            padding=True)`; the result must support `.to(device)`.
            NOTE(review): `clip.tokenize` does not appear to accept these
            keyword arguments; confirm before passing it here.
        device: device the tokenized batch is moved to.
        labels: class names.

    Returns:
        `(features, labels)`: a CPU tensor of shape (num_labels, dim) and the
        input `labels` list, so that row i corresponds to `labels[i]`.
    """
    all_features = []
    with torch.no_grad():
        for label in labels:
            prompts = generate_prompts_for_label(label)
            tokenized = tokenizer(prompts, return_tensors='pt', padding=True).to(device)
            if hasattr(model, 'get_text_features'):
                text_feats = model.get_text_features(**tokenized)
            else:
                text_feats = model.encode_text(tokenized['input_ids'])
            # L2-normalize each prompt embedding, average, then re-normalize the mean
            text_feats = text_feats / text_feats.norm(dim=-1, keepdim=True)
            mean_feat = text_feats.mean(dim=0)
            mean_feat = mean_feat / mean_feat.norm()
            all_features.append(mean_feat.cpu())
    all_features = torch.stack(all_features, dim=0)
    return all_features, labels

In [None]:
# ---- Feature extraction: augmentation + averaging + denoising ----

def build_image_augmentations(image_size: int, n_augment: int = 5) -> List[transforms.Compose]:
    """Build `n_augment` light augmentation pipelines for one image.

    Every pipeline starts with a mild RandomResizedCrop and ends with a resize
    to (image_size, image_size) plus ToTensor. Pipelines at even positions also
    get a horizontal flip; those at positions divisible by three get a weak
    color jitter. The augmentations are kept mild on purpose so the
    embeddings stay relevant.
    """
    # Shared tail appended to every pipeline.
    tail = (
        transforms.Resize((image_size, image_size)),
        transforms.ToTensor(),
    )
    pipelines = []
    for position in range(n_augment):
        steps = [transforms.RandomResizedCrop(image_size, scale=(0.9, 1.0))]
        if position % 2 == 0:
            steps.append(transforms.RandomHorizontalFlip(p=0.5))
        if position % 3 == 0:
            steps.append(transforms.ColorJitter(brightness=0.08, contrast=0.08, saturation=0.08))
        steps.extend(tail)
        pipelines.append(transforms.Compose(steps))
    return pipelines


def image_augmented_embedding(model, preprocess, image: Image.Image, device: torch.device, augment_transforms: List[transforms.Compose]) -> torch.Tensor:
    """Average the image embedding over several augmented views of one image.

    Each transform in `augment_transforms` is applied to `image`; the result
    gets a batch dimension, is moved to `device`, and is passed through
    `preprocess` when that is not None. The model's `get_image_features` is
    used when present, otherwise `encode_image`. Every view embedding is
    L2-normalized, the views are averaged, and the mean is L2-normalized.

    Returns:
        A 1-D CPU tensor holding the averaged embedding.
    """
    per_view = []
    with torch.no_grad():
        for augment in augment_transforms:
            batch = augment(image).unsqueeze(0).to(device)
            if preprocess is not None:
                batch = preprocess(batch)
            if hasattr(model, 'get_image_features'):
                emb = model.get_image_features(batch)
            else:
                emb = model.encode_image(batch)
            unit_emb = emb / emb.norm(dim=-1, keepdim=True)
            per_view.append(unit_emb.cpu())
    averaged = torch.cat(per_view, dim=0).mean(dim=0)
    return averaged / averaged.norm()


def apply_pca_denoise(embeddings: np.ndarray, n_components: Optional[int] = None, energy: float = 0.95) -> np.ndarray:
    """Denoise embeddings by projecting onto the leading principal components and back.

    Args:
        embeddings: array of shape (N, D).
        n_components: number of components to keep. When None, the smallest
            count whose cumulative explained-variance ratio reaches `energy`
            is used.
        energy: target cumulative explained-variance ratio; only used when
            `n_components` is None.

    Returns:
        Array of shape (N, D): the PCA reconstruction of `embeddings`.
    """
    from sklearn.decomposition import PCA
    if n_components is None:
        max_components = min(embeddings.shape[0], embeddings.shape[1])
        probe = PCA(n_components=max_components)
        probe.fit(embeddings)
        cum_energy = np.cumsum(probe.explained_variance_ratio_)
        n = int(np.searchsorted(cum_energy, energy) + 1)
        # searchsorted returns len(cum_energy) when `energy` is never reached
        # (e.g. energy=1.0 with floating-point round-off), which would ask PCA
        # for more components than exist. Clamp to the valid range.
        n_components = min(max(1, n), max_components)
    pca = PCA(n_components=n_components)
    reduced = pca.fit_transform(embeddings)
    recon = pca.inverse_transform(reduced)
    return recon


In [None]:
# ---- Simple datasets and linear probe trainer ----

class SimpleImageTextDataset(Dataset):
    """Dataset over a list of `(image_or_path, label_index)` pairs.

    A string first element is opened as an image file and converted to RGB;
    any other value is used as the image itself. The optional `transform` is
    applied to the image before it is returned.
    """

    def __init__(self, items: List[Tuple[str, int]], transform=None):
        self.transform = transform
        self.items = items

    def __len__(self):
        return len(self.items)

    def __getitem__(self, idx):
        source, label = self.items[idx]
        img = Image.open(source).convert('RGB') if isinstance(source, str) else source
        if not self.transform:
            return img, label
        return self.transform(img), label


def train_linear_probe(image_features: torch.Tensor, labels: torch.Tensor, num_classes: int, device: torch.device, epochs: int = 10, lr: float = 1e-3) -> nn.Module:
    """Fit a linear classifier on fixed embeddings.

    Args:
        image_features: tensor of shape (N, D).
        labels: tensor of shape (N,) with class indices.
        num_classes: number of output classes.
        device: device used for the classifier and the data.
        epochs: number of passes over the data.
        lr: Adam learning rate.

    Returns:
        The trained `nn.Linear` module (left in train mode).
    """
    feature_dim = image_features.shape[1]
    clf = nn.Linear(feature_dim, num_classes).to(device)
    opt = torch.optim.Adam(clf.parameters(), lr=lr)
    loss_fn = nn.CrossEntropyLoss()

    pairs = torch.utils.data.TensorDataset(image_features.to(device), labels.to(device))
    loader = DataLoader(pairs, batch_size=64, shuffle=True)

    clf.train()
    for epoch in range(epochs):
        total_loss, total, correct = 0.0, 0, 0
        for batch_x, batch_y in loader:
            opt.zero_grad()
            logits = clf(batch_x)
            batch_loss = loss_fn(logits, batch_y)
            batch_loss.backward()
            opt.step()

            # Weight the batch loss by batch size so the epoch figure is a per-sample mean.
            batch_len = batch_x.size(0)
            total_loss += batch_loss.item() * batch_len
            correct += (logits.argmax(dim=1) == batch_y).sum().item()
            total += batch_len
        # progress report
        print(f"Epoch {epoch+1}/{epochs}: loss={total_loss/total:.4f}, acc={correct/total:.4f}")
    return clf


In [None]:

# ---- Adapter / lightweight fine-tuning (пример) ----

class SimpleAdapter(nn.Module):
    """Residual bottleneck adapter for encoder output features.

    Computes `x + alpha * up(relu(down(x)))`, where `alpha` is a learnable
    scalar initialised to 1.0.
    """

    def __init__(self, dim: int, bottleneck: int = 256):
        super().__init__()
        self.down = nn.Linear(dim, bottleneck)
        self.act = nn.ReLU()
        self.up = nn.Linear(bottleneck, dim)
        self.alpha = nn.Parameter(torch.tensor(1.0))

    def forward(self, x):
        """Return `x` plus the scaled adapter correction."""
        hidden = self.act(self.down(x))
        correction = self.up(hidden)
        return x + self.alpha * correction


def attach_adapter_to_model(model, dim: int, bottleneck: int = 256, attach_to: str = 'image'):
    """Attach a freshly created SimpleAdapter to `model` as `model.adapter`.

    This is only an example: the adapter is stored on the model, but nothing
    here changes the model's forward / feature methods to call it. The caller
    is expected to wire it into the right place. `attach_to` ('image' or
    'text') is accepted but not used by this function.

    Returns:
        The same `model` object, now carrying the `adapter` attribute.
    """
    model.adapter = SimpleAdapter(dim, bottleneck)
    return model

# ---- Full fine-tuning example (скелет) ----

def finetune_clip_full(model, train_loader: DataLoader, val_loader: Optional[DataLoader], device: torch.device,
                       epochs: int = 5, lr: float = 1e-5, freeze_text: bool = False, freeze_image: bool = False):
    """Skeleton for full fine-tuning of a CLIP model (PyTorch). Heavy on GPU memory.

    - `model` is expected to return either logits or embeddings.
    - Mixed precision and gradient accumulation are recommended when memory is tight.
    - `val_loader` is accepted but not used anywhere in this body.
    - The fallback branch calls `model.get_text_features()` without arguments;
      the text features have to be prepared beforehand for that to work, so
      this function is not runnable as-is on that path.

    Returns:
        The same `model` after training.
    """
    # Freeze the requested parts (only when the model exposes these attributes)
    if freeze_text and hasattr(model, 'text_encoder'):
        for p in model.text_encoder.parameters():
            p.requires_grad = False
    if freeze_image and hasattr(model, 'vision_encoder'):
        for p in model.vision_encoder.parameters():
            p.requires_grad = False

    # Optimize only the parameters that still require gradients
    params = [p for p in model.parameters() if p.requires_grad]
    opt = torch.optim.AdamW(params, lr=lr)
    loss_fn = nn.CrossEntropyLoss()

    # Gradient scaling is enabled only on CUDA devices
    scaler = torch.cuda.amp.GradScaler() if device.type == 'cuda' else None

    model.to(device)
    model.train()
    for epoch in range(epochs):
        for imgs, labels in train_loader:
            imgs = imgs.to(device)
            labels = labels.to(device)
            opt.zero_grad()
            with torch.cuda.amp.autocast(enabled=(scaler is not None)):
                # depends on the interface: some models return logits_per_image
                outputs = model(imgs)
                if isinstance(outputs, dict) and 'logits' in outputs:
                    logits = outputs['logits']
                else:
                    # fallback: compute embeddings and classify by dot product with the text features
                    image_feats = model.get_image_features(imgs)
                    text_feats = model.get_text_features()  # TODO: text_feats must be prepared in advance
                    logits = image_feats @ text_feats.t()
                loss = loss_fn(logits, labels)
            if scaler is not None:
                scaler.scale(loss).backward()
                scaler.step(opt)
                scaler.update()
            else:
                loss.backward()
                opt.step()
        print(f"Epoch {epoch+1}/{epochs} done")
    return model

In [None]:

# ---- Подсказки при смене модели CLIP / OpenCLIP / ViT variants ----

# Per-model hints used when switching CLIP variants: expected input resolution,
# patch size (where applicable) and a free-form note.
# The plain "ViT-L/14" checkpoint takes 224x224 input (this notebook runs it
# with 224 transforms); the 336 resolution belongs to "ViT-L/14@336px".
MODEL_CHANGE_NOTES = {
    'ViT-B/32': {
        'image_size': 224,
        'patch': 32,
        'notes': 'Быстрая, меньшая разрешающая способность. Обычно использовать стандартный preprocess.'
    },
    'ViT-B/16': {
        'image_size': 224,
        'patch': 16,
        'notes': 'Лучший баланс speed/accuracy. Лучше качество по мелким объектам.'
    },
    'ViT-L/14': {
        'image_size': 224,
        'patch': 14,
        'notes': 'Вход 224, но больше VRAM, чем у ViT-B. Тонкая настройка learning rates и большее batch size.'
    },
    'ViT-L/14@336px': {
        'image_size': 336,
        'patch': 14,
        'notes': 'Нужны большие картинки (336) и больше VRAM. Тонкая настройка learning rates и большее batch size.'
    },
    'OpenCLIP-Large': {
        'image_size': 336,
        'notes': 'Необходимо проверить tokenizer/входные нормализации; веса обучены на других датасетах.'
    }
}


def adapt_preprocess_for_model(model_name: str) -> Dict:
    """Look up preprocessing hints for `model_name` in MODEL_CHANGE_NOTES.

    Returns:
        The stored entry for a known model, otherwise a default dict with
        `image_size` 224 and an "unknown" note.
    """
    known = MODEL_CHANGE_NOTES.get(model_name)
    return known if known is not None else {'image_size': 224, 'notes': 'Unknown: use defaults'}

In [None]:
# ---- Быстрый пример: inference pipeline (без загрузки датасетов) ----

def example_inference_pipeline(model, tokenizer, image_paths: List[str], labels: List[str], device: torch.device,
                               model_name: str = 'ViT-B/16', n_augment: int = 5):
    """Demo: prompt ensembling + augmentation-averaged image embeddings + nearest-label prediction.

    Args:
        model, tokenizer: objects compatible with the interfaces used by
            ensemble_text_features and image_augmented_embedding
            (`get_text_features` / `get_image_features` or the `encode_*`
            fallbacks).
        image_paths: paths of the images to classify.
        labels: candidate class names.
        device: device used for the computation.
        model_name: key passed to adapt_preprocess_for_model to pick the
            input resolution (defaults to the previously hard-coded value).
        n_augment: number of augmented views averaged per image.

    Returns:
        The predicted label for each path, in input order.
    """
    # 1) text embeddings, ensembled over the prompts of each label
    text_feats, label_list = ensemble_text_features(model, tokenizer, device, labels)
    text_feats = text_feats.to(device)

    # 2) augmentation pipelines sized for the chosen model
    info = adapt_preprocess_for_model(model_name)
    aug_transforms = build_image_augmentations(info['image_size'], n_augment=n_augment)

    # build_image_augmentations ends with ToTensor(), so pixel values are still
    # in [0, 1]. Apply the same CLIP mean/std normalization the notebook's own
    # train/val transforms use, via the `preprocess` hook of
    # image_augmented_embedding.
    normalize = transforms.Normalize(
        (0.48145466, 0.4578275, 0.40821073),
        (0.26862954, 0.26130258, 0.27577711),
    )

    preds = []
    for p in image_paths:
        img = Image.open(p).convert('RGB')
        img_repr = image_augmented_embedding(model, normalize, img, device, aug_transforms)
        img_repr = img_repr.to(device)
        # One similarity per label; the largest one selects the prediction.
        sims = (img_repr @ text_feats.t()).squeeze(0)
        pred = sims.argmax().item()
        preds.append(label_list[pred])
    return preds


# ---- Configuration recommendations and a production checklist ----

# Free-form checklist text; it is printed by the __main__ guard below.
CONFIG_CHECKLIST = """
1) Подсказки (prompts):
   - создайте 8-12 вариаций на класс (photographic, context, scene, close-up)
   - усредняйте текстовые эмбеддинги
2) Извлечение признаков:
   - аугментации: легкие random resized crop, flip, color jitter; усредняйте эмбеддинги
   - L2-нормализация всегда
   - опционально PCA/TruncatedSVD для удаления высокочастотного шума
3) Fine-tuning:
   - Linear probe: быстрый и надёжный
   - Adapter/tip-adapter: меньший набор обучаемых параметров, fast to converge
   - Full finetune: требует осторожности (lr в 1e-6..1e-5), gradient accumulation, amp
4) При смене модели:
   - проверьте recommended image_size
   - проверьте tokenizer & text preprocessing
   - скорректируйте learning rate и batch size (большие модели -> меньший lr)
5) Inference:
   - храните нормализованные эмбеддинги (float32/float16)
   - используйте ANN (FAISS, HNSW) для retrieval
"""

# Print a short usage banner and the checklist when run as the main module.
if __name__ == '__main__':
    print("This module provides utilities to improve CLIP workflows. Import functions and use with your CLIP model.")
    print("See CONFIG_CHECKLIST for practical tips.")
    print(CONFIG_CHECKLIST)