# V12 — Pre-trained DINOv2 + ConvNeXtV2 with HuggingFace Dataset + Meta-Model

**Goal:** improve accuracy by pre-training the models on the HuggingFace dataset, then fine-tuning them on the target dataset, and stacking their predictions with a LightGBM meta-model.

**Pipeline:**
- Download the HuggingFace dataset (`subinium/emojiimage-dataset`; the code fetches it with `kagglehub.dataset_download`)
- Map 11 vendor classes to 7 target classes
- **Pre-train DINOv2** on HuggingFace dataset (seed=42)
- **Pre-train ConvNeXtV2** on HuggingFace dataset (seed=42)
- Split target dataset train/val (stratified if possible)
- **Fine-tune DINOv2** on target dataset (seed=42)
- **Fine-tune ConvNeXtV2** on target dataset (seed=42)
- For each image: compute deterministic TTA prob-vectors for **both models** + statistical features
- Train **LightGBM** meta-model on top

**Classes:** apple, google, whatsapp, facebook, samsung, mozilla, messenger


## Install and Import


In [None]:
%pip install -r requirements.txt


In [None]:
import os
import hashlib
from pathlib import Path

import numpy as np
import pandas as pd
from PIL import Image

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from tqdm import tqdm

import torchvision.transforms.functional as F

from transformers import AutoImageProcessor, AutoModelForImageClassification
from transformers.modeling_outputs import ImageClassifierOutput

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

import kagglehub

# LightGBM is optional: record whether it imported instead of aborting the notebook.
try:
    import lightgbm as lgb
except Exception as e:
    HAS_LGB = False
    print('LightGBM import failed:', e)
else:
    HAS_LGB = True

# Use the GPU whenever PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print('Using device:', device)
if torch.cuda.is_available():
    print('GPU:', torch.cuda.get_device_name(0))
    torch.cuda.empty_cache()


## HYPERPARAMETERS


In [None]:
# Data
# Target-dataset locations, all resolved relative to SECOND_DATASET_BASE_PATH.
SECOND_DATASET_BASE_PATH = '.'
SECOND_DATASET_TRAIN_DIR = Path(SECOND_DATASET_BASE_PATH) / 'train'
SECOND_DATASET_CSV_PATH = Path(SECOND_DATASET_BASE_PATH) / 'train_labels.csv'
SECOND_DATASET_TEST_DIR = Path(SECOND_DATASET_BASE_PATH) / 'test'

# HuggingFace Dataset
# Identifier handed to kagglehub.dataset_download() in the pre-training step.
HF_DATASET_ID = 'subinium/emojiimage-dataset'

# Models
# Checkpoint identifiers loaded through transformers' from_pretrained().
DINO_MODEL_ID = 'facebook/dinov2-base'
CNN_MODEL_ID  = 'facebook/convnextv2-base-22k-224'
# Seed passed to both the pre-training and the fine-tuning functions.
SEED = 42

# Train
# Fraction of samples held out for validation (used as train_test_split's test_size).
VAL_SIZE = 0.10
# random_state for the target-dataset split and for the LightGBM parameters below.
RANDOM_STATE = 42
# Upper bound on epochs per training stage; early stopping may end a stage sooner.
NUM_EPOCHS = 20
# Number of consecutive epochs without a validation-accuracy gain before stopping.
EARLY_STOPPING_PATIENCE = 3
# Learning rate given to AdamW.
LEARNING_RATE = 1e-5
# DataLoader batch size, chosen by whether CUDA is available.
BATCH_SIZE_CUDA = 16
BATCH_SIZE_CPU = 4
# Number of DataLoader worker processes.
NUM_WORKERS = 2
# Label-smoothing factor for the cross-entropy loss.
LABEL_SMOOTHING = 0.05

# TTA
# Number of test-time augmentations per image (presumably consumed by the
# feature-extraction code later in the notebook -- TODO confirm).
NUM_TTA_AUGS = 10
# Passed to tqdm as `mininterval` to throttle progress-bar refreshes.
TQDM_MININTERVAL = 10
SHOW_PROGRESS = True  # set False to reduce output
FEATURE_BATCH_SIZE = 16  # images per batch for feature extraction

# Meta-model
# NOTE(review): USE_LIGHTGBM / LGB_PARAMS are not read in the visible part of the
# notebook; presumably the meta-model cell consumes them -- verify there.
USE_LIGHTGBM = True
LGB_PARAMS = {
    'n_estimators': 1200,
    'learning_rate': 0.03,
    'num_leaves': 63,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'random_state': RANDOM_STATE
}

# Output
# Presumably the CSV file the final predictions are written to -- TODO confirm.
PREDICTIONS_OUTPUT_FILE = 'predictions_V12.csv'

# Echo the key settings so they are captured in the notebook output.
print('DINO_MODEL_ID:', DINO_MODEL_ID)
print('CNN_MODEL_ID :', CNN_MODEL_ID)
print('SEED:', SEED)
print('NUM_EPOCHS:', NUM_EPOCHS, 'patience:', EARLY_STOPPING_PATIENCE)
print('LABEL_SMOOTHING:', LABEL_SMOOTHING)
print('NUM_TTA_AUGS:', NUM_TTA_AUGS)


In [None]:
# Target label space: the 7 vendor classes. A class's position in this list is
# its integer label everywhere else in the notebook.
VENDOR_CLASSES = ['apple', 'google', 'whatsapp', 'facebook', 'samsung', 'mozilla', 'messenger']
VENDOR_TO_IDX = {vendor: idx for idx, vendor in enumerate(VENDOR_CLASSES)}
IDX_TO_VENDOR = {idx: vendor for vendor, idx in VENDOR_TO_IDX.items()}

# Label mapping from HuggingFace dataset vendor folders to the 7 target classes.
# Vendors that have a same-named target class map to it directly; the remaining
# vendors are folded into 'apple' or 'google'.
HF_TO_V11_MAPPING = {
    'Apple': 'apple',
    'Google': 'google', 'Gmail': 'google',
    # 'mozilla' is a target class of its own, so Mozilla images must not be
    # relabelled as 'google' (that would train the models to confuse the two).
    'Mozilla': 'mozilla',  # if exists in HF dataset
    'Facebook': 'facebook',
    'Samsung': 'samsung',
    'WhatsApp': 'whatsapp',  # if exists in HF dataset
    'Messenger': 'messenger',  # if exists in HF dataset
    'DoCoMo': 'apple', 'JoyPixels': 'apple', 'KDDI': 'apple', 'SoftBank': 'apple',
    'Twitter': 'google', 'Windows': 'google'
}

print('VENDOR_CLASSES:', VENDOR_CLASSES)
print('HF_TO_V11_MAPPING:', HF_TO_V11_MAPPING)


## Deterministic Augmentation (Predictable TTA)


## Statistical features (incl. original_mode)


In [None]:
def extract_image_properties(image_path):
    """Compute simple per-image statistics used as meta-model features.

    The image is opened, its original PIL mode is recorded as a small numeric
    code, and it is then normalised to RGB (palette images go through RGBA,
    RGBA images are composited onto a white background) before the pixel
    statistics are taken.

    Args:
        image_path: Path of the image file to analyse.

    Returns:
        A dict with one float per entry of ``STAT_COLS``. If the image cannot
        be read or processed for any reason, a fixed set of neutral default
        values (a 224x224 mid-grey RGB image) is returned instead of raising,
        so a single bad file does not abort feature extraction.
    """
    mode_mapping = {'L': 0, 'LA': 1, 'P': 2, 'RGB': 3, 'RGBA': 4}
    try:
        # The context manager closes the underlying file as soon as the pixel
        # data has been copied out, so scanning many images does not leak handles.
        with Image.open(image_path) as src:
            # Unknown modes are reported with the RGB code (3).
            original_mode = float(mode_mapping.get(src.mode, 3))
            # Normalize image to RGB for pixel stats
            img = src.convert('RGBA') if src.mode == 'P' else src
            if img.mode == 'RGBA':
                # Composite onto white using the alpha channel as the paste mask.
                bg = Image.new('RGB', img.size, (255, 255, 255))
                bg.paste(img, mask=img.split()[3])
                img = bg
            elif img.mode != 'RGB':
                img = img.convert('RGB')
            w, h = img.size
            arr = np.array(img)

        ar = w / h if h else 0.0
        pix = float(w * h)
        mean_r, mean_g, mean_b = (float(arr[:, :, c].mean()) for c in range(3))
        std_r, std_g, std_b = (float(arr[:, :, c].std()) for c in range(3))
        brightness = float((mean_r + mean_g + mean_b) / 3.0)
        is_mostly_white = float(brightness > 200)
        return {
            'width': float(w), 'height': float(h), 'aspect_ratio': float(ar), 'pixel_count': pix,
            'mean_r': mean_r, 'mean_g': mean_g, 'mean_b': mean_b,
            'std_r': std_r, 'std_g': std_g, 'std_b': std_b,
            'brightness': brightness, 'is_mostly_white': is_mostly_white,
            'original_mode': original_mode
        }
    except Exception:
        # Deliberate best-effort: any failure yields neutral defaults.
        return {
            'width': 224.0, 'height': 224.0, 'aspect_ratio': 1.0, 'pixel_count': 50176.0,
            'mean_r': 128.0, 'mean_g': 128.0, 'mean_b': 128.0,
            'std_r': 50.0, 'std_g': 50.0, 'std_b': 50.0,
            'brightness': 128.0, 'is_mostly_white': 0.0, 'original_mode': 3.0
        }

# Column order of the statistical features returned by extract_image_properties().
STAT_COLS = ['width','height','aspect_ratio','pixel_count','mean_r','mean_g','mean_b','std_r','std_g','std_b','brightness','is_mostly_white','original_mode']


In [None]:
class DeterministicAugmentation:
    """Reproducible PIL-image augmentations for training and test-time augmentation.

    All "random" parameters are derived from a per-image seed, so the same
    image (or the same seed source) always produces the same augmented
    outputs. String seed sources are hashed with MD5 rather than the builtin
    ``hash()``, because the builtin is salted per interpreter process and
    would give different augmentations on every run. Random draws use a
    private ``RandomState`` so the global NumPy RNG is left untouched.
    """

    def __init__(self, image_size=224, seed=42):
        """Store the output size and the fixed augmentation parameter ranges.

        Args:
            image_size: Side length (pixels) of the square images produced.
            seed: Stored on the instance; not read by any method of this class.
        """
        self.image_size = image_size
        self.seed = seed
        self.rotation_angles = [-10, -5, 5, 10]
        self.crop_ratios = [0.75, 0.85, 0.9, 0.95]
        self.color_jitter_params = {'brightness': 0.3, 'contrast': 0.3, 'saturation': 0.3, 'hue': 0.1}
        self.translate_range = (0.1, 0.1)
        self.blur_sigma = (0.1, 0.5)

    def _get_deterministic_seed(self, image_or_hash):
        """Derive a stable 32-bit seed from a string, a PIL image or any other object.

        Strings are checked first so that path-based seeding never needs the
        image itself; images are hashed by their raw pixel bytes; anything
        else is hashed via its ``str()`` form.
        """
        if isinstance(image_or_hash, str):
            payload = image_or_hash.encode('utf-8')
        elif isinstance(image_or_hash, Image.Image):
            payload = image_or_hash.tobytes()
        else:
            payload = str(image_or_hash).encode('utf-8')
        # First 8 hex digits of the digest -> integer in [0, 2**32).
        return int(hashlib.md5(payload).hexdigest()[:8], 16)

    def _resolve_seed(self, image, seed_source):
        """Turn ``seed_source`` (None, a string, or an int-like) into an integer seed."""
        if seed_source is None:
            return self._get_deterministic_seed(image)
        if isinstance(seed_source, str):
            return self._get_deterministic_seed(seed_source)
        return int(seed_source)

    @staticmethod
    def _rng(seed_val):
        """Return a private NumPy generator seeded with ``seed_val`` (wrapped to 32 bits)."""
        return np.random.RandomState(seed_val % (2**32))

    def _fit(self, image):
        """Resize ``image`` to the configured square output size (bilinear)."""
        return image.resize((self.image_size, self.image_size), Image.BILINEAR)

    def horizontal_flip(self, image):
        """Mirror the image left-to-right."""
        return F.hflip(image)

    def rotation(self, image, angle):
        """Rotate the image by ``angle`` degrees."""
        return F.rotate(image, angle)

    def center_crop(self, image, crop_ratio=0.9):
        """Crop a centred square whose side is ``crop_ratio`` of the shorter image side."""
        w, h = image.size
        crop = int(min(w, h) * crop_ratio)
        return F.center_crop(image, [crop, crop])

    def corner_crop(self, image, crop_ratio=0.9, position='tl'):
        """Crop a square anchored at a corner ('tl', 'tr', 'bl' or 'br').

        Any other ``position`` returns the image unchanged.
        """
        w, h = image.size
        crop = int(min(w, h) * crop_ratio)
        # (top, left) of the crop window for each corner.
        origins = {
            'tl': (0, 0),
            'tr': (0, w - crop),
            'bl': (h - crop, 0),
            'br': (h - crop, w - crop),
        }
        if position not in origins:
            return image
        top, left = origins[position]
        return F.crop(image, top, left, crop, crop)

    def resized_crop(self, image, crop_ratio=0.85):
        """Centre-crop by ``crop_ratio`` and resize to the output size."""
        return self._fit(self.center_crop(image, crop_ratio))

    def color_jitter(self, image, seed_val):
        """Apply brightness, contrast, saturation and hue jitter drawn from ``seed_val``."""
        rng = self._rng(seed_val)
        params = self.color_jitter_params
        # Draw order (brightness, contrast, saturation, hue) is part of the
        # deterministic contract: changing it changes every augmented image.
        b = 1.0 + rng.uniform(-params['brightness'], params['brightness'])
        c = 1.0 + rng.uniform(-params['contrast'], params['contrast'])
        s = 1.0 + rng.uniform(-params['saturation'], params['saturation'])
        h = rng.uniform(-params['hue'], params['hue'])
        img = F.adjust_brightness(image, b)
        img = F.adjust_contrast(img, c)
        img = F.adjust_saturation(img, s)
        img = F.adjust_hue(img, h)
        return img

    def affine_transform(self, image, seed_val):
        """Translate the image by a seeded fraction of its width and height."""
        rng = self._rng(seed_val)
        tx = rng.uniform(-self.translate_range[0], self.translate_range[0])
        ty = rng.uniform(-self.translate_range[1], self.translate_range[1])
        return F.affine(image, angle=0, translate=(tx*image.width, ty*image.height), scale=1.0, shear=0.0)

    def gaussian_blur(self, image, seed_val):
        """Blur with a 3x3 Gaussian kernel whose sigma is drawn from ``seed_val``."""
        rng = self._rng(seed_val)
        sigma = rng.uniform(self.blur_sigma[0], self.blur_sigma[1])
        return F.gaussian_blur(image, kernel_size=3, sigma=[sigma, sigma])

    def get_augmentations(self, image, num_augmentations=10, seed_source=None):
        """Return up to ``num_augmentations`` deterministic views of ``image``.

        Views are produced in a fixed order: original, horizontal flip, the
        four rotations, the four corner crops, centre crop, resized crop,
        colour jitter, translation, blur. The list is truncated to
        ``num_augmentations``.

        Args:
            image: PIL image to augment.
            num_augmentations: Maximum number of views to return.
            seed_source: None (seed from the image bytes), a string (hashed),
                or an int-like value used directly as the seed.
        """
        seed_val = self._resolve_seed(image, seed_source)
        augs = [self._fit(image), self._fit(self.horizontal_flip(image))]
        for angle in self.rotation_angles[:max(0, min(4, num_augmentations - len(augs)))]:
            augs.append(self._fit(self.rotation(image, angle)))
        corners = ['tl', 'tr', 'bl', 'br']
        for cpos in corners[:max(0, min(4, num_augmentations - len(augs)))]:
            augs.append(self._fit(self.corner_crop(image, 0.9, cpos)))
        if len(augs) < num_augmentations:
            augs.append(self._fit(self.center_crop(image, 0.9)))
        if len(augs) < num_augmentations:
            augs.append(self.resized_crop(image, 0.85))
        if len(augs) < num_augmentations:
            augs.append(self._fit(self.color_jitter(image, seed_val)))
        if len(augs) < num_augmentations:
            augs.append(self._fit(self.affine_transform(image, seed_val + 1)))
        if len(augs) < num_augmentations:
            augs.append(self._fit(self.gaussian_blur(image, seed_val + 2)))
        return augs[:num_augmentations]

    def apply_training_augmentation(self, image, seed_source=None):
        """Apply one seeded chain of augmentations and resize to the output size.

        The seed decides whether to flip, which rotation angle and crop ratio
        to use, and whether to translate or blur; colour jitter is always
        applied. The same seed source therefore always yields the same
        augmented image.
        """
        seed_val = self._resolve_seed(image, seed_source)

        if seed_val % 2 == 0:
            image = self.horizontal_flip(image)

        angle = self.rotation_angles[(seed_val // 2) % len(self.rotation_angles)]
        image = self.rotation(image, angle)

        crop_ratio = self.crop_ratios[(seed_val // 10) % len(self.crop_ratios)]
        image = self.center_crop(image, crop_ratio)

        image = self.color_jitter(image, seed_val)

        if (seed_val // 3) % 2 == 0:
            image = self.affine_transform(image, seed_val + 1)

        if (seed_val // 5) % 5 == 0:
            image = self.gaussian_blur(image, seed_val + 2)

        return self._fit(image)

# Shared augmenter instance used by the dataset class.
tta_aug = DeterministicAugmentation(image_size=224, seed=42)
print('Deterministic augmentation ready.')


## HuggingFace Dataset Loading (with label mapping)


In [None]:
def prepare_hf_dataset_with_mapping(dataset_path):
    """
    Prepare HuggingFace dataset by finding all images and mapping vendor labels.
    Maps the HF vendor names to the 7 target classes using HF_TO_V11_MAPPING.

    Images are first looked up in one folder per vendor directly under
    ``dataset_path``. If that finds nothing, every image below
    ``dataset_path`` is scanned and assigned to the first vendor whose name
    occurs in the file name or in the parent directory name.

    Files are matched by lower-cased suffix and visited in sorted order, so
    each file is collected exactly once (also on case-insensitive file
    systems) and the returned order is reproducible across runs, which the
    seeded train/val split depends on.

    Args:
        dataset_path: Root directory of the downloaded dataset.

    Returns:
        ``(image_paths, labels)``: parallel lists of path strings and integer
        class indices (positions in VENDOR_CLASSES).
    """
    image_paths = []
    labels = []
    dataset_path = Path(dataset_path)
    image_suffixes = {'.png', '.jpg', '.jpeg'}

    def _is_image(candidate):
        # Case-insensitive suffix test covers .png/.PNG/.Png alike.
        return candidate.suffix.lower() in image_suffixes and candidate.is_file()

    # HF dataset has vendor folders (Apple, Google, Facebook, etc.)
    # Scan each vendor folder and map to target classes
    for hf_vendor, target_vendor in HF_TO_V11_MAPPING.items():
        if target_vendor not in VENDOR_TO_IDX:
            continue  # Skip if target vendor not in our classes

        vendor_dir = dataset_path / hf_vendor
        if not vendor_dir.is_dir():
            continue
        for img_path in sorted(vendor_dir.iterdir()):
            if _is_image(img_path):
                image_paths.append(str(img_path))
                labels.append(VENDOR_TO_IDX[target_vendor])

    # Fallback: if no images found via vendor folders, try scanning all images
    if not image_paths:
        for img_path in sorted(p for p in dataset_path.rglob('*') if _is_image(p)):
            filename = img_path.name.lower()
            parent_dir = img_path.parent.name.lower()
            # Try to match vendor from filename or parent directory
            for hf_vendor, target_vendor in HF_TO_V11_MAPPING.items():
                if target_vendor not in VENDOR_TO_IDX:
                    continue
                needle = hf_vendor.lower()
                if needle in filename or needle in parent_dir:
                    image_paths.append(str(img_path))
                    labels.append(VENDOR_TO_IDX[target_vendor])
                    break

    print(f'Loaded {len(image_paths)} images from HuggingFace dataset')
    if labels:
        label_counts = np.bincount(np.array(labels), minlength=len(VENDOR_CLASSES))
        print(f'Label distribution: {label_counts}')
        print(f'Label names: {list(VENDOR_CLASSES)}')

    return image_paths, labels


## Data loading (CSV labels)


In [None]:
def prepare_dataset_from_csv(train_dir, csv_path):
    """Build parallel lists of image paths and class indices from the labels CSV.

    Each CSV row supplies an ``Id`` (zero-padded to 5 characters to form the
    file stem) and a ``Label`` (lower-cased vendor name). Rows whose label is
    not one of VENDOR_CLASSES, or whose image file cannot be found under
    ``train_dir`` with any of the accepted extensions, are skipped and counted.

    Args:
        train_dir: Directory containing the training images.
        csv_path: CSV file with ``Id`` and ``Label`` columns.

    Returns:
        ``(img_paths, labels)``: list of path strings and list of int labels.

    Raises:
        FileNotFoundError: If ``train_dir`` or ``csv_path`` does not exist.
    """
    train_dir, csv_path = Path(train_dir), Path(csv_path)
    if not (train_dir.exists() and csv_path.exists()):
        raise FileNotFoundError(f'Missing train_dir or csv: {train_dir} / {csv_path}')

    frame = pd.read_csv(csv_path)
    class_index = {name: VENDOR_TO_IDX[name] for name in VENDOR_CLASSES}
    # Extensions are tried in this order; the first existing file wins.
    candidate_exts = ('.png', '.jpg', '.jpeg', '.PNG', '.JPG', '.JPEG')

    img_paths, labels = [], []
    n_missing = 0
    n_unmapped = 0
    for _, row in frame.iterrows():
        stem = str(row['Id']).zfill(5)
        vendor = str(row['Label']).lower()
        if vendor not in class_index:
            n_unmapped += 1
            continue
        candidates = (train_dir / f'{stem}{ext}' for ext in candidate_exts)
        hit = next((c for c in candidates if c.exists()), None)
        if hit is None:
            n_missing += 1
            continue
        img_paths.append(str(hit))
        labels.append(int(class_index[vendor]))

    print('Loaded:', len(img_paths), 'images')
    print('Unmapped labels skipped:', n_unmapped, 'Missing files skipped:', n_missing)
    if labels:
        print('Label distribution:', np.bincount(np.array(labels), minlength=len(VENDOR_CLASSES)))
    return img_paths, labels


## Split (stratified)


## Dataset class


In [None]:
def load_image_rgb(path):
    """Load an image file and return it as a fully loaded RGB PIL image.

    Palette ('P') images are first expanded to RGBA; RGBA images are
    composited onto a white background using their alpha channel; every other
    mode is converted with ``convert('RGB')``.

    The file is opened in a ``with`` block and the returned image is always a
    new in-memory image, so no file handle stays open after the call. This
    matters for callers that keep many loaded images in a list.

    Args:
        path: Path of the image file.

    Returns:
        An RGB ``PIL.Image.Image`` detached from the underlying file.
    """
    with Image.open(path) as src:
        img = src.convert('RGBA') if src.mode == 'P' else src
        if img.mode == 'RGBA':
            bg = Image.new('RGB', img.size, (255, 255, 255))
            bg.paste(img, mask=img.split()[3])
            return bg
        # convert() returns a new image (a copy when the mode is already RGB),
        # which forces the pixel data to be read before the file is closed.
        return img.convert('RGB')

class EmojiDataset(Dataset):
    """Torch dataset yielding processor-encoded emoji images with vendor labels.

    Each item is a dict with ``pixel_values`` (the processor output with its
    leading batch dimension squeezed away) and ``labels`` (a long tensor
    holding the class index, clamped into the valid class range).
    """

    def __init__(self, image_paths, labels, processor, use_augmentation=False):
        # Materialise both sequences so indexing works for any iterable input.
        self.image_paths = [*image_paths]
        self.labels = [*labels]
        self.processor = processor
        self.use_augmentation = use_augmentation

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        path = self.image_paths[idx]
        target = int(self.labels[idx])
        image = load_image_rgb(path)

        # Training augmentation is seeded by the path string, so a given file
        # always receives the same augmentation.
        if self.use_augmentation:
            image = tta_aug.apply_training_augmentation(image, seed_source=str(path))

        encoded = self.processor(image, return_tensors='pt')
        highest_class = len(VENDOR_CLASSES) - 1
        # Clamp out-of-range labels into [0, highest_class].
        target = int(max(0, min(target, highest_class)))
        return {
            'pixel_values': encoded['pixel_values'].squeeze(0),
            'labels': torch.tensor(target, dtype=torch.long),
        }


## Model wrapper (DINOv2 and ConvNeXtV2 backbones → 7 classes)


In [None]:
class ConvNeXtV2ForEmojiClassification(nn.Module):
    """ConvNeXtV2 backbone with a fresh LayerNorm -> Dropout -> Linear head.

    Args:
        base_model: Object exposing a ``convnextv2`` sub-model and, optionally,
            ``config.hidden_sizes``; the last entry sets the head's input width
            (1024 is assumed when the config or attribute is missing).
        num_labels: Number of output classes.
    """

    def __init__(self, base_model, num_labels):
        super().__init__()
        self.base_model = base_model
        self.num_labels = num_labels
        # Width of the final backbone stage, read from config.hidden_sizes.
        cfg = getattr(base_model, 'config', None)
        width = getattr(cfg, 'hidden_sizes', [1024])[-1]
        self.classifier = nn.Sequential(
            nn.LayerNorm(width),
            nn.Dropout(0.2),
            nn.Linear(width, num_labels),
        )

    def forward(self, pixel_values, labels=None):
        """Return an ImageClassifierOutput with logits and, if labels are given, the loss."""
        feature_map = self.base_model.convnextv2(pixel_values).last_hidden_state
        # A 4-D output is averaged over its last two axes; anything else is
        # passed to the head unchanged.
        pooled = feature_map.mean(dim=[2, 3]) if feature_map.dim() == 4 else feature_map
        logits = self.classifier(pooled)
        if labels is None:
            return ImageClassifierOutput(loss=None, logits=logits)
        # Keep targets inside the valid class range before computing the loss.
        targets = torch.clamp(labels, 0, self.num_labels - 1)
        loss = nn.functional.cross_entropy(
            logits.view(-1, self.num_labels),
            targets.view(-1),
            label_smoothing=LABEL_SMOOTHING,
        )
        return ImageClassifierOutput(loss=loss, logits=logits)

class DINOv2ForEmojiClassification(nn.Module):
    """DINOv2 backbone with a small LayerNorm -> Dropout -> Linear head.

    Args:
        base_model: Backbone called as
            ``base_model(pixel_values=..., output_hidden_states=True)``; its
            ``config.hidden_size`` sets the head's input width (1024 if the
            attribute is missing).
        num_labels: Number of output classes.
    """

    def __init__(self, base_model, num_labels):
        super().__init__()
        self.base_model = base_model
        self.num_labels = num_labels
        hidden = getattr(base_model.config, 'hidden_size', 1024)

        # Simpler head: less capacity, less overfitting risk
        self.classifier = nn.Sequential(
            nn.LayerNorm(hidden),
            nn.Dropout(0.1),  # Lower dropout
            nn.Linear(hidden, num_labels)  # Direct projection
        )

    def forward(self, pixel_values, labels=None):
        """Return an ImageClassifierOutput with logits and, if labels are given, the loss.

        The pooled representation is the backbone's ``pooler_output`` when it
        provides one, otherwise the first token of the last hidden state
        (presumably the CLS token).
        """
        out = self.base_model(pixel_values=pixel_values, output_hidden_states=True)
        pooled = getattr(out, 'pooler_output', None)
        if pooled is None:
            pooled = out.hidden_states[-1][:, 0, :]
        logits = self.classifier(pooled)
        loss = None
        if labels is not None:
            # Keep targets inside the valid class range before computing the loss.
            labels = torch.clamp(labels, 0, self.num_labels - 1)
            # Use the shared LABEL_SMOOTHING hyperparameter (as the ConvNeXtV2
            # wrapper does) rather than a separate hard-coded value, so the
            # configured setting actually applies to both models.
            loss = nn.CrossEntropyLoss(label_smoothing=LABEL_SMOOTHING)(
                logits.view(-1, self.num_labels), labels.view(-1)
            )
        return ImageClassifierOutput(loss=loss, logits=logits)


## Train / Validate loops


In [None]:
def seed_everything(seed):
    """Seed Python's ``random``, NumPy and PyTorch (plus all CUDA devices if present)."""
    import random as py_random
    for apply_seed in (py_random.seed, np.random.seed, torch.manual_seed):
        apply_seed(seed)
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)

def train_epoch(model, loader, optimizer, device, scaler=None):
    """Run one training epoch and return ``(mean batch loss, accuracy in percent)``.

    Args:
        model: Module returning an object with ``loss`` and ``logits`` when
            called as ``model(pixel_values=..., labels=...)``; must expose
            ``num_labels``.
        loader: Iterable of batches with ``'pixel_values'`` and ``'labels'``.
        optimizer: Optimizer stepped once per batch.
        device: Torch device; mixed precision is enabled only for CUDA.
        scaler: Optional gradient scaler for float16 training. When it is
            omitted on a CUDA device that supports bfloat16, autocast runs in
            bfloat16 instead of its float16 default, because unscaled float16
            gradients can underflow.

    Returns:
        Tuple of the loss averaged over batches and the accuracy (0-100) over
        all samples seen.
    """
    model.train()
    total_loss = 0.0
    correct = 0
    total = 0
    use_amp = (device.type == 'cuda')
    # Callers skip the GradScaler exactly when bf16 is supported; match that by
    # autocasting to bf16 in that case. Otherwise keep autocast's fp16 default.
    amp_dtype = torch.float16
    if use_amp and scaler is None and torch.cuda.is_bf16_supported():
        amp_dtype = torch.bfloat16
    for batch in tqdm(loader, desc='Training', mininterval=TQDM_MININTERVAL):
        x = batch['pixel_values'].to(device, non_blocking=True)
        y = batch['labels'].to(device, non_blocking=True)
        # Keep targets inside the valid class range.
        y = torch.clamp(y, 0, model.num_labels - 1)
        optimizer.zero_grad(set_to_none=True)
        with torch.amp.autocast('cuda', dtype=amp_dtype, enabled=use_amp):
            out = model(pixel_values=x, labels=y)
            loss = out.loss
        if scaler is not None:
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
        else:
            loss.backward()
            optimizer.step()
        total_loss += float(loss.item())
        pred = torch.argmax(out.logits, dim=1)
        correct += int((pred == y).sum().item())
        total += int(y.size(0))
    # max(1, ...) guards against division by zero for an empty loader.
    return total_loss / max(1, len(loader)), 100.0 * correct / max(1, total)

@torch.no_grad()
def validate(model, loader, device):
    """Evaluate ``model`` on ``loader`` without gradient tracking.

    Returns:
        ``(mean batch loss, accuracy in percent, predictions, labels)`` where
        the last two are flat Python lists of class indices in loader order.
    """
    model.eval()
    amp_on = device.type == 'cuda'
    loss_sum = 0.0
    n_correct = 0
    n_seen = 0
    preds, labels = [], []
    for batch in tqdm(loader, desc='Validation', mininterval=TQDM_MININTERVAL):
        top_class = model.num_labels - 1
        inputs = batch['pixel_values'].to(device, non_blocking=True)
        # Keep targets inside the valid class range.
        targets = batch['labels'].to(device, non_blocking=True).clamp(0, top_class)
        with torch.amp.autocast('cuda', enabled=amp_on):
            out = model(pixel_values=inputs, labels=targets)
        loss_sum += float(out.loss.item())
        guesses = out.logits.argmax(dim=1).clamp(0, top_class)
        n_correct += int((guesses == targets).sum().item())
        n_seen += int(targets.size(0))
        preds.extend(guesses.cpu().tolist())
        labels.extend(targets.cpu().tolist())
    # max(1, ...) guards against division by zero for an empty loader.
    mean_loss = loss_sum / max(1, len(loader))
    accuracy = 100.0 * n_correct / max(1, n_seen)
    return mean_loss, accuracy, preds, labels


## Pre-training on HuggingFace Dataset


In [None]:
def pretrain_on_hf_dataset(model_kind, model_id, seed):
    """
    Pre-train model on HuggingFace dataset with label mapping.

    The dataset named by HF_DATASET_ID is fetched with
    ``kagglehub.dataset_download``, mapped to the 7 target classes, split
    into train/validation parts, and used to train the chosen wrapper with
    early stopping on validation accuracy. The best weights are saved to
    ``pretrained_{model_kind}_hf.pt`` and reloaded before returning.

    Args:
        model_kind: ``'dino'`` or ``'cnn'``; selects the wrapper class.
        model_id: Checkpoint identifier for the processor and backbone.
        seed: Seed for all RNGs and for the train/validation split.

    Returns:
        ``(model, processor, checkpoint_path, best_acc)`` where ``best_acc``
        is the best validation accuracy in percent.

    Raises:
        ValueError: If ``model_kind`` is unknown or no images are found.
    """
    # Fail fast on a bad model kind, before any download or model loading.
    if model_kind not in ('dino', 'cnn'):
        raise ValueError('Unknown model_kind: ' + str(model_kind))

    seed_everything(seed)

    # Download and load HuggingFace dataset
    print(f'\n=== Pre-training {model_kind.upper()} on HuggingFace dataset ===')
    hf_path = kagglehub.dataset_download(HF_DATASET_ID)
    print(f'HuggingFace dataset path: {hf_path}')

    hf_paths, hf_labels = prepare_hf_dataset_with_mapping(hf_path)

    if len(hf_paths) == 0:
        raise ValueError('No images found in HuggingFace dataset')

    # Split HF dataset into train/val.
    # Stratification only needs every class that actually occurs to have at
    # least two samples. Target classes missing from this dataset have a count
    # of zero and must not disable stratification.
    class_counts = np.bincount(np.array(hf_labels), minlength=len(VENDOR_CLASSES))
    present_counts = class_counts[class_counts > 0]
    can_stratify = bool(present_counts.min() >= 2)

    hf_train_paths, hf_val_paths, hf_train_y, hf_val_y = train_test_split(
        list(hf_paths), list(hf_labels),
        test_size=VAL_SIZE, random_state=seed,
        stratify=list(hf_labels) if can_stratify else None
    )

    print(f'HF Train: {len(hf_train_paths)}, HF Val: {len(hf_val_paths)}')

    # Load model and processor
    processor = AutoImageProcessor.from_pretrained(model_id)
    backbone = AutoModelForImageClassification.from_pretrained(model_id).to(device)
    if model_kind == 'dino':
        model = DINOv2ForEmojiClassification(backbone, num_labels=len(VENDOR_CLASSES)).to(device)
    else:
        model = ConvNeXtV2ForEmojiClassification(backbone, num_labels=len(VENDOR_CLASSES)).to(device)

    # Create datasets and loaders
    hf_train_ds = EmojiDataset(hf_train_paths, hf_train_y, processor, use_augmentation=True)
    hf_val_ds = EmojiDataset(hf_val_paths, hf_val_y, processor, use_augmentation=False)
    bs = BATCH_SIZE_CUDA if torch.cuda.is_available() else BATCH_SIZE_CPU
    hf_train_loader = DataLoader(hf_train_ds, batch_size=bs, shuffle=True, num_workers=NUM_WORKERS, pin_memory=torch.cuda.is_available())
    hf_val_loader = DataLoader(hf_val_ds, batch_size=bs, shuffle=False, num_workers=NUM_WORKERS, pin_memory=torch.cuda.is_available())

    # Training setup
    optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=0.01)
    # The scheduler tracks validation accuracy, hence mode='max'.
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=2, min_lr=1e-7, cooldown=1)
    # A gradient scaler is only created for CUDA devices without bf16 support.
    scaler = None
    if torch.cuda.is_available() and (not torch.cuda.is_bf16_supported()):
        scaler = torch.cuda.amp.GradScaler()

    # Training loop with early stopping on validation accuracy
    best_acc = -1.0
    best_path = f'pretrained_{model_kind}_hf.pt'
    bad = 0  # consecutive epochs without improvement
    for epoch in range(NUM_EPOCHS):
        print(f'\n[HF Pre-train {model_kind}] epoch {epoch+1}/{NUM_EPOCHS}')
        tr_loss, tr_acc = train_epoch(model, hf_train_loader, optimizer, device, scaler)
        va_loss, va_acc, _, _ = validate(model, hf_val_loader, device)
        scheduler.step(va_acc)
        print(f'Train: loss={tr_loss:.4f} acc={tr_acc:.2f}% | Val: loss={va_loss:.4f} acc={va_acc:.2f}%')
        if va_acc > best_acc + 1e-6:
            best_acc = va_acc
            bad = 0
            torch.save(model.state_dict(), best_path)
            print('✓ saved', best_path)
        else:
            bad += 1
            if bad >= EARLY_STOPPING_PATIENCE:
                print('Early stopping: no improvement for', EARLY_STOPPING_PATIENCE, 'epochs')
                break

    # Restore the best checkpoint rather than the last epoch's weights.
    model.load_state_dict(torch.load(best_path, map_location=device))
    print(f'✓ Pre-training completed! Best HF validation accuracy: {best_acc:.2f}%')
    return model, processor, best_path, best_acc


## Fine-tuning on Target Dataset


In [None]:
def finetune_model(model_kind, model_id, seed, stage_tag, train_paths_s, train_y_s, val_paths_s, val_y_s, pretrained_checkpoint=None):
    """
    Fine-tune a model on the target dataset with early stopping.

    When ``pretrained_checkpoint`` names an existing file, its state dict is
    loaded into the freshly built model before training; otherwise training
    starts from the weights that ``from_pretrained`` provides. The best
    weights (by validation accuracy) are saved to ``best_{stage_tag}.pt`` and
    reloaded before returning.

    Args:
        model_kind: ``'dino'`` or ``'cnn'``; selects the wrapper class.
        model_id: Checkpoint identifier for the processor and backbone.
        seed: Seed applied to all RNGs before anything else happens.
        stage_tag: Label used in log lines and in the checkpoint file name.
        train_paths_s, train_y_s: Training image paths and integer labels.
        val_paths_s, val_y_s: Validation image paths and integer labels.
        pretrained_checkpoint: Optional path to a state dict from pre-training.

    Returns:
        ``(model, processor, checkpoint_path, best_acc)`` where ``best_acc``
        is the best validation accuracy in percent.

    Raises:
        ValueError: If ``model_kind`` is neither ``'dino'`` nor ``'cnn'``.
    """
    seed_everything(seed)
    processor = AutoImageProcessor.from_pretrained(model_id)
    backbone = AutoModelForImageClassification.from_pretrained(model_id).to(device)

    if model_kind == 'dino':
        wrapper_cls = DINOv2ForEmojiClassification
    elif model_kind == 'cnn':
        wrapper_cls = ConvNeXtV2ForEmojiClassification
    else:
        raise ValueError('Unknown model_kind: ' + str(model_kind))
    model = wrapper_cls(backbone, num_labels=len(VENDOR_CLASSES)).to(device)

    # Warm-start from the pre-training checkpoint when one is available.
    has_checkpoint = pretrained_checkpoint is not None and os.path.exists(pretrained_checkpoint)
    if has_checkpoint:
        model.load_state_dict(torch.load(pretrained_checkpoint, map_location=device))
        print(f'✓ Loaded pre-trained weights from {pretrained_checkpoint}')
    else:
        print('Starting from scratch (no pre-trained checkpoint)')

    cuda_ok = torch.cuda.is_available()
    batch_size = BATCH_SIZE_CUDA if cuda_ok else BATCH_SIZE_CPU
    train_loader = DataLoader(
        EmojiDataset(train_paths_s, train_y_s, processor, use_augmentation=True),
        batch_size=batch_size, shuffle=True, num_workers=NUM_WORKERS, pin_memory=torch.cuda.is_available(),
    )
    val_loader = DataLoader(
        EmojiDataset(val_paths_s, val_y_s, processor, use_augmentation=False),
        batch_size=batch_size, shuffle=False, num_workers=NUM_WORKERS, pin_memory=torch.cuda.is_available(),
    )

    optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=0.01)
    # The scheduler tracks validation accuracy, hence mode='max'.
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=2, min_lr=1e-7, cooldown=1)
    # A gradient scaler is only created for CUDA devices without bf16 support.
    needs_scaler = torch.cuda.is_available() and (not torch.cuda.is_bf16_supported())
    scaler = torch.cuda.amp.GradScaler() if needs_scaler else None

    best_path = f'best_{stage_tag}.pt'
    best_acc = -1.0
    stale_epochs = 0  # consecutive epochs without improvement
    for epoch_no in range(1, NUM_EPOCHS + 1):
        print(f'\n[{stage_tag}] epoch {epoch_no}/{NUM_EPOCHS}')
        tr_loss, tr_acc = train_epoch(model, train_loader, optimizer, device, scaler)
        va_loss, va_acc, va_pred, va_true = validate(model, val_loader, device)
        scheduler.step(va_acc)
        print(f'Train: loss={tr_loss:.4f} acc={tr_acc:.2f}% | Val: loss={va_loss:.4f} acc={va_acc:.2f}%')

        if va_acc > best_acc + 1e-6:
            best_acc = va_acc
            stale_epochs = 0
            torch.save(model.state_dict(), best_path)
            print('✓ saved', best_path)
            continue

        stale_epochs += 1
        if stale_epochs >= EARLY_STOPPING_PATIENCE:
            print('Early stopping: no improvement for', EARLY_STOPPING_PATIENCE, 'epochs')
            break

    # Restore the best checkpoint rather than the last epoch's weights.
    model.load_state_dict(torch.load(best_path, map_location=device))
    return model, processor, best_path, best_acc


## Load Target Dataset and Split


In [None]:
# Load target dataset
# Load target dataset
all_paths, all_labels = prepare_dataset_from_csv(SECOND_DATASET_TRAIN_DIR, SECOND_DATASET_CSV_PATH)
if not all_labels:
    # Fail with a clear message instead of an obscure error further down.
    raise ValueError('No labelled training images were loaded; cannot create a train/val split')

# Split target dataset.
# Stratification needs every class that actually occurs to have at least two
# samples. Classes with no samples at all (count 0) are ignored here, since
# they would otherwise always disable stratification.
class_counts = np.bincount(np.array(all_labels, dtype=np.int64), minlength=len(VENDOR_CLASSES))
min_count = class_counts[class_counts > 0].min()
can_stratify = bool(min_count >= 2)
print('Min class count:', int(min_count), 'Stratify:', can_stratify)
train_paths, val_paths, train_y, val_y = train_test_split(
    all_paths, all_labels, test_size=VAL_SIZE, random_state=RANDOM_STATE,
    stratify=all_labels if can_stratify else None
)
print('Train:', len(train_paths), 'Val:', len(val_paths))
print('Train dist:', np.bincount(np.array(train_y), minlength=len(VENDOR_CLASSES)))
print('Val   dist:', np.bincount(np.array(val_y), minlength=len(VENDOR_CLASSES)))


## Training Pipeline: Pre-train + Fine-tune


In [None]:
# Stage 1: pre-train both backbones on the external dataset. Each call saves its
# best weights to disk and returns the checkpoint path used for stage 2.
print('=== Pre-training on HuggingFace dataset ===')

# Pre-train DINOv2 on HuggingFace dataset
dino_pretrained, dino_proc, dino_hf_ckpt, dino_hf_best = pretrain_on_hf_dataset('dino', DINO_MODEL_ID, SEED)

# Pre-train ConvNeXtV2 on HuggingFace dataset
cnn_pretrained, cnn_proc, cnn_hf_ckpt, cnn_hf_best = pretrain_on_hf_dataset('cnn', CNN_MODEL_ID, SEED)

# Move pre-trained models to CPU to save memory
# (fine-tuning rebuilds its own model and loads the weights from the checkpoint file)
if device.type == 'cuda':
    dino_pretrained = dino_pretrained.to('cpu')
    cnn_pretrained = cnn_pretrained.to('cpu')
    torch.cuda.empty_cache()

# Stage 2: fine-tune on the target train/val split, warm-started from the
# stage-1 checkpoints.
print('\n=== Fine-tuning on target dataset ===')

# Fine-tune DINOv2 on target dataset
dino_final, dino_proc, dino_ckpt, dino_best = finetune_model(
    'dino', DINO_MODEL_ID, SEED, 'dino_finetuned',
    train_paths, train_y, val_paths, val_y,
    pretrained_checkpoint=dino_hf_ckpt
)

# Fine-tune ConvNeXtV2 on target dataset
cnn_final, cnn_proc, cnn_ckpt, cnn_best = finetune_model(
    'cnn', CNN_MODEL_ID, SEED, 'cnn_finetuned',
    train_paths, train_y, val_paths, val_y,
    pretrained_checkpoint=cnn_hf_ckpt
)

# Keep models on CPU when not actively used
if device.type == 'cuda':
    dino_final = dino_final.to('cpu')
    cnn_final = cnn_final.to('cpu')
    torch.cuda.empty_cache()

# (model, processor, tag) triples; the tag becomes part of the probability
# feature column names built for the meta-model.
trained_members = [
    (dino_final, dino_proc, 'dino'),
    (cnn_final,  cnn_proc,  'cnn'),
]
print('✓ Trained members:', [t for _,_,t in trained_members])


## Feature matrix: stats + (A×C) prob vectors


In [None]:
def _prob_cols_for_members(members, num_augmentations):
    """Return the probability feature column names for all ensemble members.

    Names follow ``prob_{tag}_aug{i}_cls{c}`` and are ordered by member, then
    augmentation index, then class index.

    Args:
        members: Iterable of ``(model, processor, tag)`` triples; only the tag is used.
        num_augmentations: Number of augmentations per image.
    """
    return [
        f'prob_{tag}_aug{aug_idx}_cls{cls_idx}'
        for _, _, tag in members
        for aug_idx in range(num_augmentations)
        for cls_idx in range(len(VENDOR_CLASSES))
    ]

@torch.no_grad()
def build_features_batched(image_paths, members, num_augmentations, batch_size=None):
    """
    Build the meta-model feature matrix for a list of images.

    Each row holds the statistical properties returned by
    `extract_image_properties` plus, for every member model, the softmax
    probabilities of each TTA augmentation (`prob_{tag}_aug{a}_cls{c}`).
    All batches are run through one model before the next model is moved to
    the device, so each model is transferred only once per call.

    Args:
        image_paths: Sliceable sequence of image paths (e.g. a list).
        members: Sequence of `(model, processor, tag)` triples; it is
            iterated more than once.
        num_augmentations: Number of TTA views requested per image from
            `tta_aug.get_augmentations`.
        batch_size: Images per forward pass; defaults to FEATURE_BATCH_SIZE.

    Returns:
        pandas.DataFrame with one row per image and exactly the columns
        `STAT_COLS + _prob_cols_for_members(members, num_augmentations)`.
        Columns missing from the collected data are filled with 0.0.

    Raises:
        ValueError: if the TTA helper does not return `num_augmentations`
            views per image, or a model outputs fewer classes than
            `len(VENDOR_CLASSES)`.

    Note:
        Every decoded image is kept in memory until all models have run.
    """
    if batch_size is None:
        batch_size = FEATURE_BATCH_SIZE

    num_classes = len(VENDOR_CLASSES)
    batch_starts = range(0, len(image_paths), batch_size)
    num_batches = len(batch_starts)

    # Step 1: Extract image stats and decode images once (model-independent).
    all_stats = []
    all_images = []
    for start in tqdm(batch_starts,
                      desc='Extracting image stats',
                      mininterval=TQDM_MININTERVAL,
                      disable=(not SHOW_PROGRESS),
                      total=num_batches):
        batch_paths = image_paths[start:start + batch_size]
        all_stats.extend([extract_image_properties(p) for p in batch_paths])
        all_images.extend([load_image_rgb(p) for p in batch_paths])

    # Step 2: Run every batch through one model before switching models.
    model_probs = {}  # tag -> list of (B, A, C) arrays, one per batch

    for m, proc, tag in tqdm(members, desc='Processing models', disable=(not SHOW_PROGRESS)):
        model_batch_probs = []
        try:
            # Load model to the device once for all batches.
            m.to(device)
            m.eval()

            for start in tqdm(batch_starts,
                              desc=f'  {tag} batches',
                              mininterval=TQDM_MININTERVAL,
                              disable=(not SHOW_PROGRESS),
                              total=num_batches,
                              leave=False):
                batch_imgs = all_images[start:start + batch_size]
                B = len(batch_imgs)

                # Flat list of augmented views: image-major, augmentation-minor.
                batch_augs = []
                for img in batch_imgs:
                    batch_augs.extend(tta_aug.get_augmentations(img, num_augmentations=num_augmentations))

                # A wrong view count could still reshape "successfully" and
                # attribute probabilities to the wrong image/augmentation.
                if len(batch_augs) != B * num_augmentations:
                    raise ValueError(
                        f'Expected {B * num_augmentations} augmented views for {B} images '
                        f'({num_augmentations} each), got {len(batch_augs)}'
                    )

                inputs = proc(batch_augs, return_tensors='pt')
                x = inputs['pixel_values'].to(device)
                out = m(pixel_values=x)
                probs = torch.softmax(out.logits, dim=-1)
                probs = probs.view(B, num_augmentations, -1).detach().cpu().numpy()
                model_batch_probs.append(probs)
        finally:
            # Always release the GPU, even if inference failed part-way.
            if device.type == 'cuda':
                m.to('cpu')
                torch.cuda.empty_cache()

        model_probs[tag] = model_batch_probs

    # Step 3: Combine stats and probabilities into the final feature matrix.
    frames = [pd.DataFrame([dict(stats) for stats in all_stats])]
    for tag, batch_probs_list in model_probs.items():
        if not batch_probs_list:
            continue  # no images were processed; columns are zero-filled below
        stacked = np.concatenate(batch_probs_list, axis=0)  # (N, A, C_model)
        if stacked.shape[2] < num_classes:
            raise ValueError(
                f"Model '{tag}' outputs {stacked.shape[2]} classes, "
                f'expected at least {num_classes}'
            )
        # Row-major flattening of (A, C) matches the aug-major / class-minor
        # order of the column names.
        flat = stacked[:, :, :num_classes].reshape(len(stacked), -1).astype(np.float64)
        tag_cols = [
            f'prob_{tag}_aug{a}_cls{c}'
            for a in range(num_augmentations)
            for c in range(num_classes)
        ]
        frames.append(pd.DataFrame(flat, columns=tag_cols))
    df = pd.concat(frames, axis=1)

    prob_cols = _prob_cols_for_members(members, num_augmentations)
    all_cols = STAT_COLS + prob_cols
    for col in all_cols:
        if col not in df.columns:
            df[col] = 0.0
    return df[all_cols]

# Sanity-print the width of the feature matrix (stat columns plus the TTA
# probability columns of the two members) and the extraction batch size.
_tag_only_members = [('', '', 'dino'), ('', '', 'cnn')]  # only the tag field is read
_num_prob_cols = len(_prob_cols_for_members(_tag_only_members, NUM_TTA_AUGS))
print('Total features:', len(STAT_COLS) + _num_prob_cols)
print('Using batched feature extraction with batch_size:', FEATURE_BATCH_SIZE)


In [None]:
# Use the trained_members list from the training pipeline
members = trained_members
print('Members for features:', [t for _,_,t in members])

# Fail fast: check the meta-model backend before the expensive TTA feature
# extraction instead of after it.
if not USE_LIGHTGBM:
    raise RuntimeError('USE_LIGHTGBM is False, but LightGBM is the only meta-model implemented here. Set USE_LIGHTGBM=True.')
if not HAS_LGB:
    raise RuntimeError('LightGBM not available in this environment. Please install lightgbm.')

# NOTE(review): X_train is computed on the same train_paths the base models
# were fine-tuned on, so these probabilities are in-sample and may be more
# confident than those seen on val/test. Consider out-of-fold predictions.
X_train = build_features_batched(train_paths, members, NUM_TTA_AUGS, batch_size=FEATURE_BATCH_SIZE)
X_val = build_features_batched(val_paths, members, NUM_TTA_AUGS, batch_size=FEATURE_BATCH_SIZE)
y_train = np.array(train_y)
y_val = np.array(val_y)
print('X_train:', X_train.shape, 'X_val:', X_val.shape)

meta = lgb.LGBMClassifier(**LGB_PARAMS)
meta.fit(X_train, y_train)
val_pred = meta.predict(X_val)
print('Meta(LGB) val acc:', accuracy_score(y_val, val_pred)*100.0)
# Pass labels explicitly: without them classification_report raises when a
# class is absent from both y_val and val_pred, because the inferred label
# set no longer matches target_names.
print(classification_report(
    y_val, val_pred,
    labels=list(range(len(VENDOR_CLASSES))),
    target_names=VENDOR_CLASSES,
))
meta.booster_.save_model('meta_lgb_v12.txt')
print('Saved meta_lgb_v12.txt')


## Final fit on Train+Val and predict Test


In [None]:
members = trained_members

# Fail fast: check the meta-model backend before the expensive feature
# extraction over train+val.
if not (USE_LIGHTGBM and HAS_LGB):
    raise RuntimeError('LightGBM not available for final fit')

combined_paths = train_paths + val_paths
combined_y = train_y + val_y

X_all = build_features_batched(combined_paths, members, NUM_TTA_AUGS, batch_size=FEATURE_BATCH_SIZE)
y_all = np.array(combined_y)

# Refit the meta-model on train+val features and save it.
meta_final = lgb.LGBMClassifier(**LGB_PARAMS)
meta_final.fit(X_all, y_all)
meta_final.booster_.save_model('meta_lgb_v12_final.txt')
print('Saved meta_lgb_v12_final.txt')

# Collect test images recursively; set() removes duplicates that arise when
# the filesystem matches both the lower- and upper-case patterns.
test_dir = SECOND_DATASET_TEST_DIR
if not test_dir.exists():
    raise FileNotFoundError(f'Missing test dir: {test_dir}')
test_paths = []
for ext in ('.png','.jpg','.jpeg','.PNG','.JPG','.JPEG'):
    test_paths += [str(p) for p in test_dir.rglob(f'*{ext}')]
test_paths = sorted(set(test_paths))
print('Found test images:', len(test_paths))

# build_features_batched moves every member model to the device and back once
# per call, so calling it for every single batch pays that transfer each
# time. Feed it chunks of several batches instead. The chunk size is a
# multiple of the batch size, so the batches seen by the models are the same
# as before, while the number of decoded images held in memory stays bounded.
TEST_CHUNK_BATCHES = 16
test_batch_size = FEATURE_BATCH_SIZE
test_chunk_size = test_batch_size * TEST_CHUNK_BATCHES
num_test_chunks = (len(test_paths) + test_chunk_size - 1) // test_chunk_size
last_class_idx = len(VENDOR_CLASSES) - 1

pred_ids = []
pred_labels = []
for start in tqdm(range(0, len(test_paths), test_chunk_size), desc='Predicting test (batched)', mininterval=TQDM_MININTERVAL, disable=(not SHOW_PROGRESS), total=num_test_chunks):
    chunk_paths = test_paths[start:start+test_chunk_size]
    Xp_chunk = build_features_batched(chunk_paths, members, NUM_TTA_AUGS, batch_size=test_batch_size)
    preds = meta_final.predict(Xp_chunk)
    for path, pred in zip(chunk_paths, preds):
        # Clamp to a valid class index before mapping back to the vendor name.
        pred = max(0, min(int(pred), last_class_idx))
        pred_ids.append(Path(path).stem)  # Id is the file name without extension
        pred_labels.append(IDX_TO_VENDOR[pred])

# Write the submission file: header plus one "Id,Label" row per test image.
out_path = Path(PREDICTIONS_OUTPUT_FILE)
with out_path.open('w') as f:
    f.write('Id,Label\n')
    for i,l in zip(pred_ids, pred_labels):
        f.write(f'{str(i).strip()},{l}\n')
print('Wrote:', out_path, 'rows:', len(pred_labels))
