In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os    
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/clothing-1m/clothing10k_test.npz
/kaggle/input/clothing-1m/clothing1m.npz


In [2]:
# SMP for Noisy Labels - Clothing1M
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torchvision import models, transforms
from PIL import Image
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import accuracy_score
from tqdm.notebook import tqdm
import random

# For reproducibility
torch.manual_seed(42)
np.random.seed(42)
random.seed(42)


In [3]:
# Dataset Paths (mounted via Kaggle's Dataset tab)
TRAIN_NPZ_PATH = "/kaggle/input/clothing-1m/clothing1m.npz"
TEST_NPZ_PATH = "/kaggle/input/clothing-1m/clothing10k_test.npz"

NUM_CLASSES = 14
NUM_PROTOTYPES = 8
SAMPLES_PER_CLASS = 1280
ALPHA_WARMUP_EPOCH = 5
TOTAL_EPOCHS = 15


In [4]:
class Clothing1MNPZDataset(Dataset):
    def __init__(self, npz_path, transform=None, limit=None):
        data = np.load(npz_path)
        self.images = data['arr_0']
        self.labels = data['arr_1']

        if limit:
            self.images = self.images[:limit]
            self.labels = self.labels[:limit]

        self.transform = transform

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        img = Image.fromarray(self.images[idx].astype(np.uint8))
        label = int(self.labels[idx])
        if self.transform:
            img = self.transform(img)
        return img, label, idx


In [5]:
class ResNetFeatureExtractor(nn.Module):
    def __init__(self, num_classes=NUM_CLASSES):
        super().__init__()
        base = models.resnet50(pretrained=True)
        self.backbone = nn.Sequential(*list(base.children())[:-1])
        self.fc = nn.Linear(base.fc.in_features, num_classes)

    def forward(self, x):
        feats = self.backbone(x)
        feats = feats.view(feats.size(0), -1)
        return self.fc(feats)

    def extract_features(self, x):
        feats = self.backbone(x)
        return feats.view(feats.size(0), -1)


In [6]:
def extract_features(model, dataloader):
    model.eval()
    all_feats, all_labels = [], []
    with torch.no_grad():
        for imgs, labels, _ in tqdm(dataloader, desc="Extracting"):
            imgs = imgs.cuda()
            feats = model.extract_features(imgs).cpu()
            all_feats.append(feats)
            all_labels.append(labels)
    return torch.cat(all_feats), torch.cat(all_labels)

def select_prototypes(features, labels):
    prototypes = {}
    for c in range(NUM_CLASSES):
        idxs = (labels == c).nonzero(as_tuple=True)[0]
        if len(idxs) == 0:
            continue
        selected = idxs[torch.randperm(len(idxs))[:SAMPLES_PER_CLASS]]
        feats = features[selected]
        sim_matrix = cosine_similarity(feats.numpy())
        Sc = np.percentile(sim_matrix.flatten(), 60)
        densities = np.sum(sim_matrix > Sc, axis=1)
        topk = densities.argsort()[-NUM_PROTOTYPES:]
        prototypes[c] = feats[topk]
    return prototypes

def correct_labels(model, dataloader):
    features, labels = extract_features(model, dataloader)
    prototypes = select_prototypes(features, labels)
    corrected = torch.zeros(len(labels), dtype=torch.long)
    for i in range(len(features)):
        feat = features[i].unsqueeze(0)
        sims = []
        for c in range(NUM_CLASSES):
            if c not in prototypes:
                sims.append(-float('inf'))
                continue
            class_protos = prototypes[c]
            sim = F.cosine_similarity(feat, class_protos).mean().item()
            sims.append(sim)
        corrected[i] = torch.tensor(np.argmax(sims))
    return corrected


In [7]:
def train_one_epoch(model, dataloader, optimizer, criterion, corrected_labels=None, alpha=0.5):
    model.train()
    for imgs, labels, idxs in tqdm(dataloader, desc="Training"):
        imgs, labels = imgs.cuda(), labels.cuda()
        preds = model(imgs)
        if corrected_labels is not None:
            pseudo_labels = corrected_labels[idxs].cuda()
            loss = (1 - alpha) * criterion(preds, labels) + alpha * criterion(preds, pseudo_labels)
        else:
            loss = criterion(preds, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

@torch.no_grad()
def evaluate(model, dataloader):
    model.eval()
    all_preds, all_labels = [], []
    for imgs, labels, _ in dataloader:
        imgs = imgs.cuda()
        preds = model(imgs).argmax(dim=1).cpu()
        all_preds.extend(preds)
        all_labels.extend(labels)
    acc = accuracy_score(all_labels, all_preds)
    print(f"✅ Test Accuracy: {acc * 100:.2f}%")
    return acc


In [8]:
transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize([0.5]*3, [0.5]*3)
])

train_dataset = Clothing1MNPZDataset(TRAIN_NPZ_PATH, transform, limit=20000)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, num_workers=2, pin_memory=True)

test_dataset = Clothing1MNPZDataset(TEST_NPZ_PATH, transform)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False, num_workers=2, pin_memory=True)

model = ResNetFeatureExtractor().cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=0.002, momentum=0.9, weight_decay=5e-3)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.1)
criterion = nn.CrossEntropyLoss()

for epoch in range(TOTAL_EPOCHS):
    print(f"\n🌀 Epoch {epoch+1}/{TOTAL_EPOCHS}")
    if epoch < ALPHA_WARMUP_EPOCH:
        corrected_labels = None
        alpha = 0.0
    else:
        corrected_labels = correct_labels(model, train_loader)
        alpha = 0.5

    train_one_epoch(model, train_loader, optimizer, criterion, corrected_labels, alpha)
    scheduler.step()
    evaluate(model, test_loader)


Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /root/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth
100%|██████████| 97.8M/97.8M [00:00<00:00, 203MB/s]



🌀 Epoch 1/15


Training:   0%|          | 0/282 [00:00<?, ?it/s]

✅ Test Accuracy: 60.37%

🌀 Epoch 3/15


Training:   0%|          | 0/282 [00:00<?, ?it/s]

✅ Test Accuracy: 58.42%

🌀 Epoch 4/15


Training:   0%|          | 0/282 [00:00<?, ?it/s]

✅ Test Accuracy: 57.13%

🌀 Epoch 5/15


Training:   0%|          | 0/282 [00:00<?, ?it/s]

✅ Test Accuracy: 55.23%

🌀 Epoch 6/15


Extracting:   0%|          | 0/282 [00:00<?, ?it/s]

Training:   0%|          | 0/282 [00:00<?, ?it/s]

✅ Test Accuracy: 48.67%

🌀 Epoch 7/15


Extracting:   0%|          | 0/282 [00:00<?, ?it/s]

Training:   0%|          | 0/282 [00:00<?, ?it/s]

✅ Test Accuracy: 53.35%

🌀 Epoch 8/15


Extracting:   0%|          | 0/282 [00:00<?, ?it/s]

Training:   0%|          | 0/282 [00:00<?, ?it/s]

Training:   0%|          | 0/282 [00:00<?, ?it/s]

✅ Test Accuracy: 56.62%

🌀 Epoch 10/15


Extracting:   0%|          | 0/282 [00:00<?, ?it/s]

Training:   0%|          | 0/282 [00:00<?, ?it/s]

✅ Test Accuracy: 57.55%

🌀 Epoch 11/15


Extracting:   0%|          | 0/282 [00:00<?, ?it/s]

Training:   0%|          | 0/282 [00:00<?, ?it/s]

✅ Test Accuracy: 57.29%

🌀 Epoch 12/15


Extracting:   0%|          | 0/282 [00:00<?, ?it/s]

Training:   0%|          | 0/282 [00:00<?, ?it/s]

✅ Test Accuracy: 57.02%

🌀 Epoch 13/15


Extracting:   0%|          | 0/282 [00:00<?, ?it/s]

Training:   0%|          | 0/282 [00:00<?, ?it/s]

✅ Test Accuracy: 57.42%

🌀 Epoch 14/15


Extracting:   0%|          | 0/282 [00:00<?, ?it/s]

Training:   0%|          | 0/282 [00:00<?, ?it/s]

✅ Test Accuracy: 57.90%

🌀 Epoch 15/15


Extracting:   0%|          | 0/282 [00:00<?, ?it/s]

Training:   0%|          | 0/282 [00:00<?, ?it/s]

✅ Test Accuracy: 57.59%
