In [1]:
import numpy as np
import pandas as pd


In [2]:
train = pd.read_csv("shopee-product-matching/train.csv")
train['label_group'] = train['label_group'].astype(str)
train

Unnamed: 0,posting_id,image,image_phash,title,label_group
0,train_129225211,0000a68812bc7e98c42888dfb1c07da0.jpg,94974f937d4c2433,Paper Bag Victoria Secret,249114794
1,train_3386243561,00039780dfc94d01db8676fe789ecd05.jpg,af3f9460c2838f0f,"Double Tape 3M VHB 12 mm x 4,5 m ORIGINAL / DO...",2937985045
2,train_2288590299,000a190fdd715a2a36faed16e2c65df7.jpg,b94cb00ed3e50f78,Maling TTS Canned Pork Luncheon Meat 397 gr,2395904891
3,train_2406599165,00117e4fc239b1b641ff08340b429633.jpg,8514fc58eafea283,Daster Batik Lengan pendek - Motif Acak / Camp...,4093212188
4,train_3369186413,00136d1cf4edede0203f32f05f660588.jpg,a6f319f924ad708c,Nescafe \xc3\x89clair Latte 220ml,3648931069
...,...,...,...,...,...
34245,train_4028265689,fff1c07ceefc2c970a7964cfb81981c5.jpg,e3cd72389f248f21,Masker Bahan Kain Spunbond Non Woven 75 gsm 3 ...,3776555725
34246,train_769054909,fff401691371bdcb382a0d9075dfea6a.jpg,be86851f72e2853c,MamyPoko Pants Royal Soft - S 70 - Popok Celana,2736479533
34247,train_614977732,fff421b78fa7284284724baf249f522e.jpg,ad27f0d08c0fcbf0,KHANZAACC Robot RE101S 1.2mm Subwoofer Bass Me...,4101248785
34248,train_3630949769,fff51b87916dbfb6d0f8faa01bee67b8.jpg,e3b13bd1d896c05c,"Kaldu NON MSG HALAL Mama Kamu Ayam Kampung , S...",1663538013


In [3]:
train = train.sample(frac=0.2, random_state=42)

In [4]:
test = pd.read_csv("shopee-product-matching/test.csv")
test

Unnamed: 0,posting_id,image,image_phash,title
0,test_2255846744,0006c8e5462ae52167402bac1c2e916e.jpg,ecc292392dc7687a,Edufuntoys - CHARACTER PHONE ada lampu dan mus...
1,test_3588702337,0007585c4d0f932859339129f709bfdc.jpg,e9968f60d2699e2c,(Beli 1 Free Spatula) Masker Komedo | Blackhea...
2,test_4015706929,0008377d3662e83ef44e1881af38b879.jpg,ba81c17e3581cabe,READY Lemonilo Mie instant sehat kuah dan goreng


In [5]:
sample_submission = pd.read_csv("shopee-product-matching/sample_submission.csv")
sample_submission

Unnamed: 0,posting_id,matches
0,test_2255846744,test_2255846744
1,test_3588702337,test_3588702337
2,test_4015706929,test_4015706929


In [6]:
# num_classes = len(train['label_group'].unique())
# num_classes
num_classes = 11014

In [7]:
from sklearn.preprocessing import LabelEncoder

all_labels = train['label_group'].unique()

# Создаем encoder
label_encoder = LabelEncoder()
label_encoder.fit(all_labels)

# Функции для преобразования
def text_to_label(text_labels):
    """Преобразует текстовые метки в числовые"""
    return label_encoder.transform(text_labels)

def label_to_text(numeric_labels):
    """Преобразует числовые метки обратно в текстовые"""
    return label_encoder.inverse_transform(numeric_labels)


In [8]:
X = train.drop(columns=['label_group'])
y = train['label_group']
y = text_to_label(y)

In [9]:
from sklearn.model_selection import train_test_split


X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
import torch
from PIL import Image

transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
])

class ImageDataset(Dataset):
    def __init__(self, paths, labels, transform=None):
        self.paths = paths
        self.labels = labels
        self.prefix = "shopee-product-matching/train_images/"
        self.transform = transform

    def __len__(self):
        return len(self.paths)

    def __getitem__(self, idx):
        path = self.paths[idx]
        image = Image.open(self.prefix + path).convert("RGB")
        if self.transform:
            image = self.transform(image)
        label = torch.tensor(self.labels[idx], dtype=torch.long)
        return image, label


train_dataset = ImageDataset(X_train['image'].tolist(), y_train.tolist(), transform)
val_dataset = ImageDataset(X_val['image'].tolist(), y_val.tolist(), transform)

In [11]:
batch_size = 32


train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

**IMAGE EMBEDDINGS**

In [12]:
import torch
import torch.nn.functional as F

In [13]:
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [14]:
# dinov2_embedder.py
from typing import List, Union, Optional
import torch
import torch.nn as nn
from PIL import Image
from transformers import AutoImageProcessor, AutoModel

Tensor = torch.Tensor

class DINOv2Embedder(nn.Module):
    """
    DINOv2 (facebook/dinov2-*) — возвращает CLS-эмбеддинг без головы.
    Совместим с triplet loss, L2-нормировка опциональна.
    """
    def __init__(
        self,
        model_name: str = "facebook/dinov2-base",   # small / base / large / giant
        device: Optional[str] = None,
        dtype: Optional[torch.dtype] = None,
        normalize_l2: bool = False,
        fp16_infer: bool = False,
    ):
        super().__init__()
        self.device = torch.device(device) if device else torch.device(
            "cuda" if torch.cuda.is_available() else "cpu"
        )
        self.normalize_l2 = normalize_l2
        self.fp16_infer = fp16_infer and self.device.type == "cuda"

        # Загрузка модели и препроцессора
        self.processor = AutoImageProcessor.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name).to(self.device)
        # в DINOv2 "last_hidden_state[:,0]" — CLS-токен, головы классификации нет
        self.model.eval()

        # размер эмбеддинга (768 для base)
        self.embed_dim = 768
        self.dtype = dtype if dtype is not None else (
            torch.float16 if (self.fp16_infer and self.device.type == "cuda") else torch.float32
        )
        if self.dtype == torch.float16:
            self.model = self.model.half()

    def encode_batch(
        self,
        images: Union[List[Image.Image], Tensor],
        return_numpy: bool = False,
    ) -> Union[Tensor, "np.ndarray"]:
        """
        images: список PIL.Image (RGB) или тензор (B, C, H, W) в [0,1]/[0,255].
        Возвращает (B, D) эмбеддинги CLS.
        """
        if isinstance(images, list):
            inputs = self.processor(images, return_tensors="pt")
        elif isinstance(images, torch.Tensor):
            # если подаём тензор, processor сам умеет работать с torch.Tensor
            inputs = self.processor(images=images, return_tensors="pt")
        else:
            raise TypeError("images must be a List[PIL.Image] or a torch.Tensor")

        inputs = {k: v.to(self.device) for k, v in inputs.items()}

        autocast_dtype = torch.float16 if self.fp16_infer else (
            torch.bfloat16 if self.device.type == "cuda" else None
        )
        if autocast_dtype is not None:
            ctx = torch.autocast(device_type="cuda", dtype=autocast_dtype)
        else:
            from contextlib import nullcontext
            ctx = nullcontext()

        with ctx:
            outputs = self.model(**inputs)
            feats: Tensor = outputs.last_hidden_state[:, 0]  # CLS токен

        if self.normalize_l2:
            feats = torch.nn.functional.normalize(feats, p=2, dim=1)

        if return_numpy:
            return feats.detach().cpu().numpy()
        return feats

In [15]:
model = DINOv2Embedder()

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [16]:
class MemoryBank:
    def __init__(self, dim, size=50000, device='cuda'):
        self.size = size
        self.device = device
        self.ptr = 0
        self.full = False
        self.feats = torch.zeros(size, dim, device=device)
        self.labels = torch.full((size,), -1, device=device, dtype=torch.long)

    @torch.no_grad()
    def enqueue(self, feats, labels):
        b = feats.size(0)
        idx = (torch.arange(b, device=self.device) + self.ptr) % self.size
        self.feats[idx] = feats
        self.labels[idx] = labels
        self.ptr = int((self.ptr + b) % self.size)
        if self.ptr == 0:
            self.full = True

    def valid_mask(self):
        return self.labels >= 0


In [17]:
def mine_triplets(anchor_z, anchor_y, mem: MemoryBank, margin=0.3, pos_k=2, neg_k=5):
    # anchor_z: [B, D] (L2-norm), anchor_y: [B]
    with torch.no_grad():
        mask = mem.valid_mask()
        mem_z = mem.feats[mask]          # [M, D]
        mem_y = mem.labels[mask]         # [M]

        if mem_z.numel() == 0:
            return []  # пока память пустая — пропустим шаг лосса

        sim = (anchor_z.float() @ mem_z.float().T)
        dist = (1 - sim).clamp(0, 2)     # cos distance in [0,2]

        triplets = []
        for i in range(anchor_z.size(0)):
            y = anchor_y[i]
            pos_mask = (mem_y == y)
            neg_mask = ~pos_mask

            pos_d = dist[i][pos_mask]
            neg_d = dist[i][neg_mask]
            if pos_d.numel() == 0 or neg_d.numel() == 0:
                continue

            # Позитивы: берём ближайшие pos_k
            pos_idx = torch.topk(pos_d, k=min(pos_k, pos_d.numel()), largest=False).indices
            pos_sel = torch.nonzero(pos_mask, as_tuple=False).squeeze(1)[pos_idx]

            # Негативы: distance-weighted вероятности + semi-hard (d > min_pos_d)
            min_pos = pos_d.min()
            # semi-hard: d_pos < d_neg < d_pos + margin
            neg_window = (neg_d > min_pos) & (neg_d < (min_pos + margin))
            cand = neg_d[neg_window]
            if cand.numel() == 0:
                # fallback: самый близкий негатив (hardest)
                neg_idx = torch.topk(neg_d, k=min(neg_k, neg_d.numel()), largest=False).indices
                neg_sel = torch.nonzero(neg_mask, as_tuple=False).squeeze(1)[neg_idx]
            else:
                # можно взять k ближайших в окне
                k = min(neg_k, cand.numel())
                idx_local = torch.topk(cand, k=k, largest=False).indices
                neg_candidates = torch.nonzero(neg_mask, as_tuple=False).squeeze(1)[neg_window]
                neg_sel = neg_candidates[idx_local]


            for p in pos_sel:
                for n in neg_sel:
                    triplets.append( (i, p.item(), n.item()) )
        return triplets


In [18]:
def triplet_loss_from_indices(anchor_z, anchor_y, mem, triplets, margin=0.3):
    if len(triplets) == 0:
        return anchor_z.sum()*0  # нулевой лосс, корректный градиент = 0
    a_idx = torch.tensor([a for a,_,_ in triplets], device=anchor_z.device)
    p_idx = torch.tensor([p for _,p,_ in triplets], device=anchor_z.device)
    n_idx = torch.tensor([n for _,_,n in triplets], device=anchor_z.device)

    mem_z = mem.feats
    sim_ap = (anchor_z[a_idx] * mem_z[p_idx]).sum(-1)
    sim_an = (anchor_z[a_idx] * mem_z[n_idx]).sum(-1)
    # dist = 1 - sim
    loss = F.relu( (1 - sim_ap) - (1 - sim_an) + margin )
    # опционально: усреднить top-k hardest
    return loss.mean()

In [19]:
memory = MemoryBank(dim=768, size=50000, device=device)

In [20]:
from transformers import get_cosine_schedule_with_warmup


total_steps = len(train_dataloader) * batch_size

In [21]:
@torch.no_grad()
def compute_embeddings(dataloader):
    model.eval()
    all_z, all_y = [], []
    for image, labels in dataloader:
        z = model.encode_batch(image)
        all_z.append(z)
        all_y.append(labels.to(device))
    model.train()
    return torch.cat(all_z, dim=0), torch.cat(all_y, dim=0)

@torch.no_grad()
def recall_at_k(z, y, ks=(1,5,10), chunk=2000):
    """
    z: [N, D] L2-норм, y: [N]
    Считаем топ-K по косинус-сим, исключая self-match.
    Для больших N используем блочно (chunk), чтобы не взорваться по памяти.
    """
    N = z.size(0)
    hits = {k:0 for k in ks}
    for start in range(0, N, chunk):
        end = min(start+chunk, N)
        q = z[start:end]                      # [Q, D]
        sims = q @ z.T                        # [Q, N]
        # исключаем самих себя
        idx = torch.arange(start, end, device=z.device)
        sims[torch.arange(q.size(0), device=z.device), idx] = -1e9

        # топ-K предсказанных индексов
        max_k = max(ks)
        topk_idx = sims.topk(k=max_k, dim=1, largest=True).indices  # [Q, maxK]
        target = y[start:end].unsqueeze(1)                           # [Q, 1]
        retrieved = y[topk_idx]                                      # [Q, maxK]
        eq = (retrieved == target)                                   # bool

        for k in ks:
            hits[k] += eq[:, :k].any(dim=1).sum().item()
    return {k: hits[k] / N for k in ks}

In [22]:
with torch.inference_mode():
      val_z, val_y = compute_embeddings(val_dataloader)
      metrics = recall_at_k(val_z, val_y, ks=(1, 5, 10))
      print(f"Zero metrics"
            f"Val R@1={metrics[1]:.4f} R@5={metrics[5]:.4f} R@10={metrics[10]:.4f}")

It looks like you are trying to rescale already rescaled images. If the input images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again.


Zero metricsVal R@1=0.0482 R@5=0.0591 R@10=0.0657


In [27]:
import time
from collections import defaultdict
import math


def train_triplet(
    train_loader: DataLoader,
    val_loader: DataLoader,
    epochs=3,
    lr=2e-5,
    weight_decay=1e-4,
    margin=0.3,
    mem_size=100_000,
    pos_k=2,
    neg_k=5,
    grad_clip=1.0,
    log_every=50,
    accum_steps=1
):
    hidden_dim = 768
    memory = MemoryBank(dim=hidden_dim, size=mem_size, device=device)
    optim = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)

    # пересчитываем общее число ШАГОВ оптимизатора (а не мини-батчей!)
    total_update_steps = math.ceil(len(train_loader) / accum_steps) * epochs
    scheduler = get_cosine_schedule_with_warmup(
        optim,
        num_warmup_steps=int(0.1 * total_update_steps),
        num_training_steps=total_update_steps
    )

    print(f"Start training: epochs={epochs}, lr={lr}, mem_size={mem_size}, dim={hidden_dim}")
    best_R1 = -1.0
    best_state = None

    for epoch in range(1, epochs + 1):
        t0 = time.time()
        model.train()
        running = defaultdict(float)
        triplet_cnt = 0
        optim.zero_grad()        # <── теперь обнуляем градиенты один раз в начале эпохи

        for step, (images, labels) in enumerate(train_loader, 1):
            images = images.to(device)
            labels = labels.to(device)

            z = model.encode_batch(images)


            # майнинг триплетов
            triplets = mine_triplets(z.detach(), labels, memory,
                                     margin=margin, pos_k=pos_k, neg_k=neg_k)
            if len(triplets) > 0:

                # лосс делим на accum_steps
                loss = triplet_loss_from_indices(z, labels, memory, triplets, margin=margin)
                loss = loss / accum_steps
                loss.backward()

                running['loss'] += loss.item() * accum_steps  # для логов возвращаем к исходной шкале
                triplet_cnt += len(triplets)

                # шаг оптимизатора только после accum_steps
                if step % accum_steps == 0:
                    if grad_clip is not None:
                        torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
                    optim.step()
                    scheduler.step()
                    optim.zero_grad()

            # обновляем память независимо от оптимизационного шага
            with torch.no_grad():
                memory.enqueue(z.detach(), labels)
            model.train()

            # логирование
            if step % log_every == 0:
                avg_loss = running['loss'] / log_every
                avg_trip = triplet_cnt / log_every
                print(f"[epoch {epoch} step {step}] "
                      f"loss={avg_loss:.4f}  triplets/step={avg_trip:.1f}")
                print("lr:", scheduler.get_last_lr())
                running['loss'] = 0.0
                triplet_cnt = 0

        # если длина даталоадера не кратна accum_steps — делаем «хвостовой» шаг
        if len(train_loader) % accum_steps != 0:
            if grad_clip is not None:
                torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
            optim.step()
            scheduler.step()
            optim.zero_grad()

        # === Валидация ===
        with torch.inference_mode():
            val_z, val_y = compute_embeddings(val_loader)
            metrics = recall_at_k(val_z, val_y, ks=(1, 5, 10))
            dt = time.time() - t0
            print(f"Epoch {epoch} done in {dt:.1f}s | "
                f"Val R@1={metrics[1]:.4f} R@5={metrics[5]:.4f} R@10={metrics[10]:.4f}")

            if metrics[1] > best_R1:
                best_R1 = metrics[1]
                best_state = {
                    "model": model.state_dict(),
                    "optimizer": optim.state_dict(),
                    "epoch": epoch,
                    "metrics": metrics
                }

    if best_state is not None:
        print(f"Best Val R@1={best_R1:.4f} @ epoch {best_state['epoch']}")
    return best_state


In [28]:
with torch.inference_mode():
    for (images, labels) in train_dataloader:
        images = images.to(device)
        labels = labels.to(device)
        z = model.encode_batch(images)
        memory.enqueue(z.detach(), labels)


In [29]:
for p in model.parameters():
    p.requires_grad = True

In [30]:
best_model_state = train_triplet(train_dataloader, val_dataloader, epochs=3, lr=2e-5, weight_decay=1e-4, margin=0.3, mem_size=50_000, pos_k=2, neg_k=5, grad_clip=1.0, log_every=20)

Start training: epochs=3, lr=2e-05, mem_size=50000, dim=768
[epoch 1 step 20] loss=0.7486  triplets/step=5.5
lr: [5.882352941176471e-06]
[epoch 1 step 40] loss=0.2215  triplets/step=21.0
lr: [1.3725490196078432e-05]
[epoch 1 step 60] loss=0.0254  triplets/step=36.8
lr: [1.9996348616949673e-05]
[epoch 1 step 80] loss=243.8707  triplets/step=40.5
lr: [1.9868829976729444e-05]
[epoch 1 step 100] loss=25.4597  triplets/step=53.0
lr: [1.9561399963785586e-05]
[epoch 1 step 120] loss=0.0128  triplets/step=62.8
lr: [1.9079663108318304e-05]
[epoch 1 step 140] loss=0.1606  triplets/step=74.9
lr: [1.8432401600228823e-05]
[epoch 1 step 160] loss=27.0361  triplets/step=76.2
lr: [1.7631415187481818e-05]
Epoch 1 done in 159.1s | Val R@1=0.0000 R@5=0.0007 R@10=0.0007
[epoch 2 step 20] loss=10.6446  triplets/step=218.2
lr: [1.6067494830143014e-05]
[epoch 2 step 40] loss=0.0071  triplets/step=222.2
lr: [1.4941376675970058e-05]
[epoch 2 step 60] loss=0.0063  triplets/step=228.0
lr: [1.3725175922034566e-05

KeyboardInterrupt: 