# V14 — Mixed Ensemble: DINO + CNN + EfficientNet

**Goal:** maximize accuracy by training a diverse ensemble: 1 DINO, 1 CNN, and 2 EfficientNet models with optimized GPU batching.

**Pipeline:**
- Download HuggingFace dataset (`subinium/emojiimage-dataset`)
- Map 11 vendor classes to 7 target classes
- **Pre-train 1 CNN** (ConvNeXtV2-base) on HuggingFace dataset (seed=42)
- **Pre-train 1 DINOv2** on HuggingFace dataset (seed=42)
- **Pre-train 2 EfficientNet models** (b0 and b2) on HuggingFace dataset (seed=42)
- Split target dataset train/val (stratified if possible)
- **Fine-tune all models** on target dataset
- **Final training** on combined train+val for CNN and EfficientNet models
- For each image: compute deterministic TTA prob-vectors for **all models** + statistical features
- Train **LightGBM** meta-model on top

**Key Features:**
- Diverse ensemble: DINO (transformer), CNN (ConvNeXtV2), EfficientNet (b0, b2)
- Improved classifier heads (deeper for CNN/EfficientNet)
- Model-specific learning rates and schedulers
- Optimized feature extraction (pre-loading, single augmentation generation)
- Final training on combined data
- Better LightGBM hyperparameters

**Classes:** apple, google, whatsapp, facebook, samsung, mozilla, messenger


## Install and Import


In [47]:
%pip install -r requirements.txt --upgrade

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [48]:
import os
import hashlib
from pathlib import Path

import numpy as np
import pandas as pd
from PIL import Image

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from tqdm import tqdm

import torchvision.transforms.functional as F

from transformers import AutoImageProcessor, AutoModelForImageClassification
from transformers.modeling_outputs import ImageClassifierOutput

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

import kagglehub

try:
    import lightgbm as lgb
    HAS_LGB = True
except Exception as e:
    HAS_LGB = False
    raise Exception('LightGBM import failed:', e)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)
if torch.cuda.is_available():
    print('GPU:', torch.cuda.get_device_name(0))
    torch.cuda.empty_cache()


Using device: cuda
GPU: NVIDIA GeForce RTX 4090


## HYPERPARAMETERS


In [None]:
# Data
SECOND_DATASET_BASE_PATH = '.'
SECOND_DATASET_TRAIN_DIR = Path(SECOND_DATASET_BASE_PATH) / 'train'
SECOND_DATASET_CSV_PATH = Path(SECOND_DATASET_BASE_PATH) / 'train_labels.csv'
SECOND_DATASET_TEST_DIR = Path(SECOND_DATASET_BASE_PATH) / 'test'

# HuggingFace Dataset
HF_DATASET_ID = 'subinium/emojiimage-dataset'

# Models - Mixed ensemble: 1 DINO, 1 CNN, 2 EfficientNet
DINO_MODEL_ID = 'facebook/dinov2-base'
CNN_MODEL_ID = 'facebook/convnextv2-base-22k-224'
CNN_TAG = 'cnn_base'
EFFICIENTNET_MODELS = [
    ('google/efficientnet-b0', 'effnet_b0'),
    ('google/efficientnet-b2', 'effnet_b2'),
]

CNN_SEED = 42
DINO_SEED = 42
EFFNET_SEED = 42

# Train
VAL_SIZE = 0.10
RANDOM_STATE = 42
NUM_EPOCHS = 30
EARLY_STOPPING_PATIENCE = 7
LEARNING_RATE = 1e-5  # Default
CNN_LEARNING_RATE = 1e-5  # Lower LR for CNN (more careful fine-tuning)
DINO_LEARNING_RATE = 1e-5  # Keep DINO LR as is
EFFNET_LEARNING_RATE = 1e-5  # Learning rate for EfficientNet
FINAL_TRAIN_EPOCHS = 3  # Final training on combined data
FINAL_TRAIN_LR_MULT = 0.5  # Lower LR for final training
BATCH_SIZE_CUDA = 16
BATCH_SIZE_CPU = 4
NUM_WORKERS = 2
LABEL_SMOOTHING = 0.05
CNN_LABEL_SMOOTHING = 0.1  # More smoothing for CNN
EFFNET_LABEL_SMOOTHING = 0.1  # Label smoothing for EfficientNet

# TTA
NUM_TTA_AUGS = 10
TQDM_MININTERVAL = 10
SHOW_PROGRESS = True  # set False to reduce output
FEATURE_BATCH_SIZE = 16  # images per batch for feature extraction

# Meta-model
USE_LIGHTGBM = True
LGB_PARAMS = {
    'n_estimators': 1500,  # More trees
    'learning_rate': 0.02,  # Lower LR for better convergence
    'num_leaves': 127,  # More leaves
    'subsample': 0.8,
#    'colsample_bytree': 0.8,
    'bagging_freq' : 1,
    'random_state': RANDOM_STATE,
    'verbosity': -1,  # Suppress warnings
    'feature_fraction': 0.9,  # Use 90% of features
    'min_child_samples': 20  # Prevent overfitting
}

# Output
PREDICTIONS_OUTPUT_FILE = 'predictions_V14.csv'

print('CNN_MODEL:', f'{CNN_TAG} ({CNN_MODEL_ID})')
print('DINO_MODEL:', DINO_MODEL_ID)
print('EFFICIENTNET_MODELS:', [f'{tag} ({model_id})' for model_id, tag in EFFICIENTNET_MODELS], f'({len(EFFICIENTNET_MODELS)} models)')
print('CNN_SEED:', CNN_SEED, 'DINO_SEED:', DINO_SEED, 'EFFNET_SEED:', EFFNET_SEED)
print('CNN_LEARNING_RATE:', CNN_LEARNING_RATE)
print('DINO_LEARNING_RATE:', DINO_LEARNING_RATE)
print('EFFNET_LEARNING_RATE:', EFFNET_LEARNING_RATE)
print('NUM_EPOCHS:', NUM_EPOCHS, 'patience:', EARLY_STOPPING_PATIENCE)
print('FINAL_TRAIN_EPOCHS:', FINAL_TRAIN_EPOCHS)
print('NUM_TTA_AUGS:', NUM_TTA_AUGS)

CNN_MODELS: ['cnn_base (facebook/convnextv2-base-22k-224)', 'cnn_base_1k (facebook/convnext-base-224-22k-1k)', 'cnn_large (facebook/convnextv2-large-22k-224)'] (3 models)
CNN_SEED: 42
DINO_SEEDS: [42] (1 models)
CNN_LEARNING_RATE: 1e-05
DINO_LEARNING_RATE: 1e-05
NUM_EPOCHS: 30 patience: 7
FINAL_TRAIN_EPOCHS: 3
NUM_TTA_AUGS: 10


## Classes


In [50]:
VENDOR_CLASSES = ['apple','google','whatsapp','facebook','samsung','mozilla','messenger']
VENDOR_TO_IDX = {v:i for i,v in enumerate(VENDOR_CLASSES)}
IDX_TO_VENDOR = {i:v for v,i in VENDOR_TO_IDX.items()}

# Label mapping from HuggingFace dataset (11 classes) to target dataset (7 classes)
HF_TO_V11_MAPPING = {
    'Apple': 'apple',
    'Google': 'google', 'Gmail': 'google', 'Mozilla': 'google',
    'Facebook': 'facebook',
    'Samsung': 'samsung',
    'WhatsApp': 'whatsapp',  # if exists in HF dataset
    'Messenger': 'messenger',  # if exists in HF dataset
    'DoCoMo': 'apple', 'JoyPixels': 'apple', 'KDDI': 'apple', 'SoftBank': 'apple',
    'Twitter': 'google', 'Windows': 'google'
}

print('VENDOR_CLASSES:', VENDOR_CLASSES)
print('HF_TO_V11_MAPPING:', HF_TO_V11_MAPPING)


VENDOR_CLASSES: ['apple', 'google', 'whatsapp', 'facebook', 'samsung', 'mozilla', 'messenger']
HF_TO_V11_MAPPING: {'Apple': 'apple', 'Google': 'google', 'Gmail': 'google', 'Mozilla': 'google', 'Facebook': 'facebook', 'Samsung': 'samsung', 'WhatsApp': 'whatsapp', 'Messenger': 'messenger', 'DoCoMo': 'apple', 'JoyPixels': 'apple', 'KDDI': 'apple', 'SoftBank': 'apple', 'Twitter': 'google', 'Windows': 'google'}


## Deterministic Augmentation (Predictable TTA)


## Statistical features (incl. original_mode)


In [51]:
def extract_image_properties(image_path):
    try:
        img = Image.open(image_path)
        mode_mapping = {'L':0,'LA':1,'P':2,'RGB':3,'RGBA':4}
        original_mode = float(mode_mapping.get(img.mode, 3))
        # Normalize image to RGB for pixel stats
        if img.mode == 'P':
            img = img.convert('RGBA')
        if img.mode == 'RGBA':
            bg = Image.new('RGB', img.size, (255,255,255))
            bg.paste(img, mask=img.split()[3])
            img = bg
        elif img.mode != 'RGB':
            img = img.convert('RGB')
        w,h = img.size
        ar = w / h if h else 0.0
        pix = float(w*h)
        arr = np.array(img)
        mean_r = float(arr[:,:,0].mean()); mean_g = float(arr[:,:,1].mean()); mean_b = float(arr[:,:,2].mean())
        std_r = float(arr[:,:,0].std());  std_g  = float(arr[:,:,1].std());  std_b  = float(arr[:,:,2].std())
        brightness = float((mean_r+mean_g+mean_b)/3.0)
        is_mostly_white = float(brightness > 200)
        return {
            'width': float(w), 'height': float(h), 'aspect_ratio': float(ar), 'pixel_count': pix,
            'mean_r': mean_r, 'mean_g': mean_g, 'mean_b': mean_b,
            'std_r': std_r, 'std_g': std_g, 'std_b': std_b,
            'brightness': brightness, 'is_mostly_white': is_mostly_white,
            'original_mode': original_mode
        }
    except Exception as e:
        return {
            'width':224.0,'height':224.0,'aspect_ratio':1.0,'pixel_count':50176.0,
            'mean_r':128.0,'mean_g':128.0,'mean_b':128.0,
            'std_r':50.0,'std_g':50.0,'std_b':50.0,
            'brightness':128.0,'is_mostly_white':0.0,'original_mode':3.0
        }

STAT_COLS = ['width','height','aspect_ratio','pixel_count','mean_r','mean_g','mean_b','std_r','std_g','std_b','brightness','is_mostly_white','original_mode']


In [52]:
class DeterministicAugmentation:
    def __init__(self, image_size=224, seed=42):
        self.image_size = image_size
        self.seed = seed
        self.rotation_angles = [-10, -5, 5, 10]
        self.crop_ratios = [0.75, 0.85, 0.9, 0.95]
        self.color_jitter_params = {'brightness':0.3,'contrast':0.3,'saturation':0.3,'hue':0.1}
        self.translate_range = (0.1, 0.1)
        self.blur_sigma = (0.1, 0.5)

    def _get_deterministic_seed(self, image_or_hash):
        if isinstance(image_or_hash, Image.Image):
            img_bytes = image_or_hash.tobytes()
            return int(hashlib.md5(img_bytes).hexdigest()[:8], 16)
        return hash(str(image_or_hash)) & 0xFFFFFFFF

    def horizontal_flip(self, image):
        return F.hflip(image)

    def rotation(self, image, angle):
        return F.rotate(image, angle)

    def center_crop(self, image, crop_ratio=0.9):
        w,h = image.size
        crop = int(min(w,h)*crop_ratio)
        return F.center_crop(image, [crop,crop])

    def corner_crop(self, image, crop_ratio=0.9, position='tl'):
        w,h = image.size
        crop = int(min(w,h)*crop_ratio)
        if position=='tl': return F.crop(image, 0, 0, crop, crop)
        if position=='tr': return F.crop(image, 0, w-crop, crop, crop)
        if position=='bl': return F.crop(image, h-crop, 0, crop, crop)
        if position=='br': return F.crop(image, h-crop, w-crop, crop, crop)
        return image

    def resized_crop(self, image, crop_ratio=0.85):
        w,h = image.size
        crop = int(min(w,h)*crop_ratio)
        cropped = F.center_crop(image, [crop,crop])
        return cropped.resize((self.image_size,self.image_size), Image.BILINEAR)

    def color_jitter(self, image, seed_val):
        np.random.seed(seed_val % (2**32))
        b = 1.0 + np.random.uniform(-self.color_jitter_params['brightness'], self.color_jitter_params['brightness'])
        c = 1.0 + np.random.uniform(-self.color_jitter_params['contrast'], self.color_jitter_params['contrast'])
        s = 1.0 + np.random.uniform(-self.color_jitter_params['saturation'], self.color_jitter_params['saturation'])
        h = np.random.uniform(-self.color_jitter_params['hue'], self.color_jitter_params['hue'])
        img = F.adjust_brightness(image, b)
        img = F.adjust_contrast(img, c)
        img = F.adjust_saturation(img, s)
        img = F.adjust_hue(img, h)
        return img

    def affine_transform(self, image, seed_val):
        np.random.seed(seed_val % (2**32))
        tx = np.random.uniform(-self.translate_range[0], self.translate_range[0])
        ty = np.random.uniform(-self.translate_range[1], self.translate_range[1])
        return F.affine(image, angle=0, translate=(tx*image.width, ty*image.height), scale=1.0, shear=0.0)

    def gaussian_blur(self, image, seed_val):
        np.random.seed(seed_val % (2**32))
        sigma = np.random.uniform(self.blur_sigma[0], self.blur_sigma[1])
        return F.gaussian_blur(image, kernel_size=3, sigma=[sigma,sigma])

    def get_augmentations(self, image, num_augmentations=10, seed_source=None):
        if seed_source is None:
            seed_val = self._get_deterministic_seed(image)
        elif isinstance(seed_source, str):
            seed_val = self._get_deterministic_seed(seed_source)
        else:
            seed_val = int(seed_source)
        augs = []
        augs.append(image.resize((self.image_size,self.image_size), Image.BILINEAR))
        augs.append(self.horizontal_flip(image).resize((self.image_size,self.image_size), Image.BILINEAR))
        for angle in self.rotation_angles[:max(0, min(4, num_augmentations-len(augs)))]:
            augs.append(self.rotation(image, angle).resize((self.image_size,self.image_size), Image.BILINEAR))
        corners = ['tl','tr','bl','br']
        for cpos in corners[:max(0, min(4, num_augmentations-len(augs)))]:
            augs.append(self.corner_crop(image, 0.9, cpos).resize((self.image_size,self.image_size), Image.BILINEAR))
        if len(augs) < num_augmentations:
            augs.append(self.center_crop(image, 0.9).resize((self.image_size,self.image_size), Image.BILINEAR))
        if len(augs) < num_augmentations:
            augs.append(self.resized_crop(image, 0.85))
        if len(augs) < num_augmentations:
            augs.append(self.color_jitter(image, seed_val).resize((self.image_size,self.image_size), Image.BILINEAR))
        if len(augs) < num_augmentations:
            augs.append(self.affine_transform(image, seed_val+1).resize((self.image_size,self.image_size), Image.BILINEAR))
        if len(augs) < num_augmentations:
            augs.append(self.gaussian_blur(image, seed_val+2).resize((self.image_size,self.image_size), Image.BILINEAR))
        return augs[:num_augmentations]
    
    def apply_training_augmentation(self, image, seed_source=None):
        if seed_source is None:
            seed_val = self._get_deterministic_seed(image)
        elif isinstance(seed_source, str):
            seed_val = self._get_deterministic_seed(seed_source)
        else:
            seed_val = int(seed_source)
        
        np.random.seed(seed_val % (2**32))
        
        if (seed_val % 2 == 0):
            image = self.horizontal_flip(image)
        
        angle_idx = (seed_val // 2) % len(self.rotation_angles)
        angle = self.rotation_angles[angle_idx]
        image = self.rotation(image, angle)
        
        crop_idx = (seed_val // 10) % len(self.crop_ratios)
        crop_ratio = self.crop_ratios[crop_idx]
        w, h = image.size
        crop_size = int(min(w, h) * crop_ratio)
        image = F.center_crop(image, [crop_size, crop_size])
        
        image = self.color_jitter(image, seed_val)
        
        if (seed_val // 3) % 2 == 0:
            image = self.affine_transform(image, seed_val + 1)
        
        if (seed_val // 5) % 5 == 0:
            image = self.gaussian_blur(image, seed_val + 2)
        
        image = image.resize((self.image_size, self.image_size), Image.BILINEAR)
        
        return image

tta_aug = DeterministicAugmentation(image_size=224, seed=42)
print('Deterministic augmentation ready.')


Deterministic augmentation ready.


In [53]:
def prepare_hf_dataset_with_mapping(dataset_path):
    """
    Prepare HuggingFace dataset by finding all images and mapping vendor labels.
    Maps 11 HF classes to 7 target classes using HF_TO_V11_MAPPING.
    """
    image_paths = []
    labels = []
    dataset_path = Path(dataset_path)
    image_extensions = {'.png', '.jpg', '.jpeg', '.PNG', '.JPG', '.JPEG'}
    
    # HF dataset has vendor folders (Apple, Google, Facebook, etc.)
    # Scan each vendor folder and map to target classes
    for hf_vendor, target_vendor in HF_TO_V11_MAPPING.items():
        if target_vendor not in VENDOR_TO_IDX:
            continue  # Skip if target vendor not in our classes
        
        vendor_dir = dataset_path / hf_vendor
        if vendor_dir.exists() and vendor_dir.is_dir():
            for ext in image_extensions:
                images = list(vendor_dir.glob(f"*{ext}"))
                for img_path in images:
                    image_paths.append(str(img_path))
                    labels.append(VENDOR_TO_IDX[target_vendor])
    
    # Fallback: if no images found via vendor folders, try scanning all images
    if len(image_paths) == 0:
        for ext in image_extensions:
            all_images = list(dataset_path.rglob(f"*{ext}"))
            for img_path in all_images:
                filename = img_path.name.lower()
                parent_dir = img_path.parent.name
                # Try to match vendor from filename or parent directory
                for hf_vendor, target_vendor in HF_TO_V11_MAPPING.items():
                    if target_vendor not in VENDOR_TO_IDX:
                        continue
                    if hf_vendor.lower() in filename or hf_vendor.lower() in parent_dir.lower():
                        image_paths.append(str(img_path))
                        labels.append(VENDOR_TO_IDX[target_vendor])
                        break
    
    print(f'Loaded {len(image_paths)} images from HuggingFace dataset')
    if len(labels) > 0:
        label_counts = np.bincount(np.array(labels), minlength=len(VENDOR_CLASSES))
        print(f'Label distribution: {label_counts}')
        print(f'Label names: {[VENDOR_CLASSES[i] for i in range(len(VENDOR_CLASSES))]}')
    
    return image_paths, labels


## Data loading (CSV labels)


In [54]:
def prepare_dataset_from_csv(train_dir, csv_path):
    train_dir = Path(train_dir); csv_path = Path(csv_path)
    if not train_dir.exists() or not csv_path.exists():
        raise FileNotFoundError(f'Missing train_dir or csv: {train_dir} / {csv_path}')
    df = pd.read_csv(csv_path)
    label_map = {v: VENDOR_TO_IDX[v] for v in VENDOR_CLASSES}
    img_paths=[]; labels=[]
    missing=0; unmapped=0
    for _, r in df.iterrows():
        img_id = str(r['Id']).zfill(5)
        lab = str(r['Label']).lower()
        if lab not in label_map:
            unmapped += 1
            continue
        found = None
        for ext in ('.png','.jpg','.jpeg','.PNG','.JPG','.JPEG'):
            p = train_dir / f'{img_id}{ext}'
            if p.exists():
                found = str(p)
                break
        if found is None:
            missing += 1
            continue
        img_paths.append(found)
        labels.append(int(label_map[lab]))
    print('Loaded:', len(img_paths), 'images')
    print('Unmapped labels skipped:', unmapped, 'Missing files skipped:', missing)
    if labels:
        print('Label distribution:', np.bincount(np.array(labels), minlength=len(VENDOR_CLASSES)))
    return img_paths, labels


## Dataset class


In [55]:
def load_image_rgb(path):
    img = Image.open(path)
    if img.mode == 'P':
        img = img.convert('RGBA')
    if img.mode == 'RGBA':
        bg = Image.new('RGB', img.size, (255,255,255))
        bg.paste(img, mask=img.split()[3])
        img = bg
    elif img.mode != 'RGB':
        img = img.convert('RGB')
    return img

class EmojiDataset(Dataset):
    def __init__(self, image_paths, labels, processor, use_augmentation=False):
        self.image_paths = list(image_paths)
        self.labels = list(labels)
        self.processor = processor
        self.use_augmentation = use_augmentation
    def __len__(self):
        return len(self.image_paths)
    def __getitem__(self, idx):
        p = self.image_paths[idx]
        y = int(self.labels[idx])
        img = load_image_rgb(p)
        
        # Apply training augmentation if enabled (deterministic based on image path)
        if self.use_augmentation:
            img = tta_aug.apply_training_augmentation(img, seed_source=str(p))
        
        inputs = self.processor(img, return_tensors='pt')
        pixel_values = inputs['pixel_values'].squeeze(0)
        y = int(max(0, min(y, len(VENDOR_CLASSES)-1)))
        return {'pixel_values': pixel_values, 'labels': torch.tensor(y, dtype=torch.long)}


## Model wrapper (DINOv2, ConvNeXtV2, and EfficientNet backbones → 7 classes)


In [None]:
class ConvNeXtV2ForEmojiClassification(nn.Module):
    def __init__(self, base_model, num_labels, label_smoothing=0.1):
        super().__init__()
        self.base_model = base_model
        self.num_labels = num_labels
        self.label_smoothing = label_smoothing
        # ConvNeXtV2 hidden size is in config.hidden_sizes
        hidden = getattr(getattr(base_model, 'config', None), 'hidden_sizes', [1024])[-1]
        
        # Improved classifier with more capacity (deeper, better for CNN)
        self.classifier = nn.Sequential(
            nn.LayerNorm(hidden),
            nn.Dropout(0.3),
            nn.Linear(hidden, hidden // 2),
            nn.GELU(),
            nn.LayerNorm(hidden // 2),
            nn.Dropout(0.2),
            nn.Linear(hidden // 2, num_labels)
        )

    def forward(self, pixel_values, labels=None):
        # ConvNeXtV2 backbone feature map
        out = self.base_model.convnextv2(pixel_values)
        feats = out.last_hidden_state
        if len(feats.shape) == 4:
            pooled = feats.mean(dim=[2,3])
        else:
            pooled = feats
        logits = self.classifier(pooled)
        loss = None
        if labels is not None:
            labels = torch.clamp(labels, 0, self.num_labels-1)
            loss = nn.CrossEntropyLoss(label_smoothing=self.label_smoothing)(logits.view(-1, self.num_labels), labels.view(-1))
        return ImageClassifierOutput(loss=loss, logits=logits)

class DINOv2ForEmojiClassification(nn.Module):
    def __init__(self, base_model, num_labels):
        super().__init__()
        self.base_model = base_model
        self.num_labels = num_labels
        hidden = getattr(base_model.config, 'hidden_size', 1024)
        
        # Simpler head: less capacity, less overfitting risk
        self.classifier = nn.Sequential(
            nn.LayerNorm(hidden),
            nn.Dropout(0.1),  # Lower dropout
            nn.Linear(hidden, num_labels)  # Direct projection
        )
    
    def forward(self, pixel_values, labels=None):
        out = self.base_model(pixel_values=pixel_values, output_hidden_states=True)
        if hasattr(out, 'pooler_output') and out.pooler_output is not None:
            pooled = out.pooler_output
        else:
            pooled = out.hidden_states[-1][:,0,:]
        logits = self.classifier(pooled)
        loss = None
        if labels is not None:
            labels = torch.clamp(labels, 0, self.num_labels-1)
            loss = nn.CrossEntropyLoss(label_smoothing=0.1)(logits.view(-1, self.num_labels), labels.view(-1))
        return ImageClassifierOutput(loss=loss, logits=logits)

class EfficientNetForEmojiClassification(nn.Module):
    def __init__(self, base_model, num_labels, label_smoothing=0.1):
        super().__init__()
        self.base_model = base_model
        self.num_labels = num_labels
        self.label_smoothing = label_smoothing
        
        # EfficientNet models in transformers have different structures
        # Try to get hidden size from config or model
        hidden = None
        if hasattr(base_model, 'config'):
            if hasattr(base_model.config, 'hidden_size'):
                hidden = base_model.config.hidden_size
            elif hasattr(base_model.config, 'hidden_dim'):
                hidden = base_model.config.hidden_dim
            elif hasattr(base_model.config, 'num_channels'):
                # Some EfficientNet models use num_channels
                hidden = getattr(base_model.config, 'hidden_size', 1280)
        
        # Fallback: try to get from model directly
        if hidden is None:
            if hasattr(base_model, 'classifier'):
                # Try to infer from classifier
                if isinstance(base_model.classifier, nn.Linear):
                    hidden = base_model.classifier.in_features
                elif isinstance(base_model.classifier, nn.Sequential):
                    for layer in base_model.classifier:
                        if isinstance(layer, nn.Linear):
                            hidden = layer.in_features
                            break
        
        # Final fallback
        if hidden is None:
            hidden = 1280  # Default for EfficientNet
        
        # Improved classifier similar to CNN (deeper, more capacity)
        self.classifier = nn.Sequential(
            nn.LayerNorm(hidden),
            nn.Dropout(0.3),
            nn.Linear(hidden, hidden // 2),
            nn.GELU(),
            nn.LayerNorm(hidden // 2),
            nn.Dropout(0.2),
            nn.Linear(hidden // 2, num_labels)
        )
    
    def forward(self, pixel_values, labels=None):
        # EfficientNet models may have different forward signatures
        # Try standard forward
        out = self.base_model(pixel_values=pixel_values, output_hidden_states=True)
        
        # Extract features
        if hasattr(out, 'last_hidden_state'):
            feats = out.last_hidden_state
        elif hasattr(out, 'hidden_states') and out.hidden_states is not None:
            feats = out.hidden_states[-1]
        elif hasattr(out, 'pooler_output') and out.pooler_output is not None:
            pooled = out.pooler_output
            logits = self.classifier(pooled)
            loss = None
            if labels is not None:
                labels = torch.clamp(labels, 0, self.num_labels-1)
                loss = nn.CrossEntropyLoss(label_smoothing=self.label_smoothing)(logits.view(-1, self.num_labels), labels.view(-1))
            return ImageClassifierOutput(loss=loss, logits=logits)
        else:
            # Fallback: try to get logits directly
            if hasattr(out, 'logits'):
                pooled = out.logits
                # Use base model's pooling if available
                if hasattr(self.base_model, 'global_pool'):
                    pooled = self.base_model.global_pool(pixel_values)
                else:
                    pooled = out.logits.mean(dim=1) if len(out.logits.shape) > 2 else out.logits
            else:
                raise ValueError("Could not extract features from EfficientNet model")
        
        # Global average pooling if needed
        if len(feats.shape) == 4:
            pooled = feats.mean(dim=[2, 3])
        elif len(feats.shape) == 3:
            pooled = feats.mean(dim=1)
        else:
            pooled = feats
        
        logits = self.classifier(pooled)
        loss = None
        if labels is not None:
            labels = torch.clamp(labels, 0, self.num_labels-1)
            loss = nn.CrossEntropyLoss(label_smoothing=self.label_smoothing)(logits.view(-1, self.num_labels), labels.view(-1))
        return ImageClassifierOutput(loss=loss, logits=logits)


## Train / Validate loops


In [57]:
def seed_everything(seed):
    import random
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

def train_epoch(model, loader, optimizer, device, scaler=None):
    model.train()
    total_loss=0.0; correct=0; total=0
    use_amp = (device.type=='cuda')
    for batch in tqdm(loader, desc='Training', mininterval=TQDM_MININTERVAL):
        x = batch['pixel_values'].to(device, non_blocking=True)
        y = batch['labels'].to(device, non_blocking=True)
        y = torch.clamp(y, 0, model.num_labels-1)
        optimizer.zero_grad(set_to_none=True)
        with torch.amp.autocast('cuda', enabled=use_amp):
            out = model(pixel_values=x, labels=y)
            loss = out.loss
        if scaler is not None:
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
        else:
            loss.backward()
            optimizer.step()
        total_loss += float(loss.item())
        pred = torch.argmax(out.logits, dim=1)
        correct += int((pred==y).sum().item())
        total += int(y.size(0))
    return total_loss/max(1,len(loader)), 100.0*correct/max(1,total)

@torch.no_grad()
def validate(model, loader, device):
    model.eval()
    total_loss=0.0; correct=0; total=0
    preds=[]; labels=[]
    use_amp = (device.type=='cuda')
    for batch in tqdm(loader, desc='Validation', mininterval=TQDM_MININTERVAL):
        x = batch['pixel_values'].to(device, non_blocking=True)
        y = batch['labels'].to(device, non_blocking=True)
        y = torch.clamp(y, 0, model.num_labels-1)
        with torch.amp.autocast('cuda', enabled=use_amp):
            out = model(pixel_values=x, labels=y)
            loss = out.loss
        total_loss += float(loss.item())
        pred = torch.argmax(out.logits, dim=1)
        pred = torch.clamp(pred, 0, model.num_labels-1)
        correct += int((pred==y).sum().item())
        total += int(y.size(0))
        preds.extend(pred.cpu().numpy().tolist())
        labels.extend(y.cpu().numpy().tolist())
    return total_loss/max(1,len(loader)), 100.0*correct/max(1,total), preds, labels


## Pre-training on HuggingFace Dataset


In [None]:
def pretrain_on_hf_dataset(model_kind, model_id, seed, tag=None, learning_rate=None, label_smoothing=None):
    """
    Pre-train model on HuggingFace dataset with label mapping.
    Returns model, processor, checkpoint_path, best_acc
    
    Args:
        model_kind: 'cnn' or 'dino'
        model_id: HuggingFace model identifier
        seed: Random seed
        tag: Optional tag for checkpoint naming (e.g., 'cnn_base', 'cnn_tiny')
    """
    seed_everything(seed)
    
    # Use model-specific learning rate and label smoothing
    if learning_rate is None:
        if model_kind == 'cnn':
            learning_rate = CNN_LEARNING_RATE
        elif model_kind == 'efficientnet':
            learning_rate = EFFNET_LEARNING_RATE
        else:
            learning_rate = DINO_LEARNING_RATE
    if label_smoothing is None:
        if model_kind == 'cnn':
            label_smoothing = CNN_LABEL_SMOOTHING
        elif model_kind == 'efficientnet':
            label_smoothing = EFFNET_LABEL_SMOOTHING
        else:
            label_smoothing = LABEL_SMOOTHING
    
    # Download and load HuggingFace dataset
    print(f'\n=== Pre-training {model_kind.upper()} (seed={seed}) on HuggingFace dataset ===')
    hf_path = kagglehub.dataset_download(HF_DATASET_ID)
    print(f'HuggingFace dataset path: {hf_path}')
    
    hf_paths, hf_labels = prepare_hf_dataset_with_mapping(hf_path)
    
    if len(hf_paths) == 0:
        raise ValueError('No images found in HuggingFace dataset')
    
    # Split HF dataset into train/val
    labels_arr = np.array(hf_labels)
    min_count = np.bincount(labels_arr, minlength=len(VENDOR_CLASSES)).min() if len(labels_arr) else 0
    can_stratify = (min_count >= 2)
    
    hf_train_paths, hf_val_paths, hf_train_y, hf_val_y = train_test_split(
        list(hf_paths), list(hf_labels),
        test_size=VAL_SIZE, random_state=seed,
        stratify=list(hf_labels) if can_stratify else None
    )
    
    print(f'HF Train: {len(hf_train_paths)}, HF Val: {len(hf_val_paths)}')
    
    # Load model and processor
    processor = AutoImageProcessor.from_pretrained(model_id)
    backbone = AutoModelForImageClassification.from_pretrained(model_id).to(device)
    if model_kind == 'dino':
        model = DINOv2ForEmojiClassification(backbone, num_labels=len(VENDOR_CLASSES)).to(device)
    elif model_kind == 'cnn':
        model = ConvNeXtV2ForEmojiClassification(backbone, num_labels=len(VENDOR_CLASSES), label_smoothing=label_smoothing).to(device)
    elif model_kind == 'efficientnet':
        model = EfficientNetForEmojiClassification(backbone, num_labels=len(VENDOR_CLASSES), label_smoothing=label_smoothing).to(device)
    else:
        raise ValueError('Unknown model_kind: ' + str(model_kind))
    
    # Create datasets and loaders
    hf_train_ds = EmojiDataset(hf_train_paths, hf_train_y, processor, use_augmentation=True)
    hf_val_ds = EmojiDataset(hf_val_paths, hf_val_y, processor, use_augmentation=False)
    bs = BATCH_SIZE_CUDA if torch.cuda.is_available() else BATCH_SIZE_CPU
    hf_train_loader = DataLoader(hf_train_ds, batch_size=bs, shuffle=True, num_workers=NUM_WORKERS, pin_memory=torch.cuda.is_available())
    hf_val_loader = DataLoader(hf_val_ds, batch_size=bs, shuffle=False, num_workers=NUM_WORKERS, pin_memory=torch.cuda.is_available())
    
    # Training setup
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=0.01)
    
    # Use model-specific scheduler: CosineAnnealingWarmRestarts for CNN, ReduceLROnPlateau for DINO
    # DINO: ReduceLROnPlateau (works well for transformers)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode='max', factor=0.5, patience=3, min_lr=1e-7, cooldown=1
    )
    use_plateau_scheduler = True
    print(f'Using ReduceLROnPlateau scheduler')
    
    scaler = None
    if torch.cuda.is_available() and (not torch.cuda.is_bf16_supported()):
        scaler = torch.cuda.amp.GradScaler()
    
    # Training loop
    best_acc = -1.0
    # Use tag in filename if provided, otherwise use seed
    if tag:
        best_path = f'pretrained_{model_kind}_{tag}_hf.pt'
    else:
        best_path = f'pretrained_{model_kind}_seed{seed}_hf.pt'
    bad = 0
    for epoch in range(NUM_EPOCHS):
        print(f'\n[HF Pre-train {model_kind} seed={seed}] epoch {epoch+1}/{NUM_EPOCHS}')
        tr_loss, tr_acc = train_epoch(model, hf_train_loader, optimizer, device, scaler)
        va_loss, va_acc, va_pred, va_true = validate(model, hf_val_loader, device)
        
        # Step scheduler: ReduceLROnPlateau needs metric, others don't
        if use_plateau_scheduler:
            scheduler.step(va_acc)
        else:
            scheduler.step()
        print(f'Train: loss={tr_loss:.4f} acc={tr_acc:.2f}% | Val: loss={va_loss:.4f} acc={va_acc:.2f}%')
        if va_acc > best_acc + 1e-6:
            best_acc = va_acc
            bad = 0
            torch.save(model.state_dict(), best_path)
            print('✓ saved', best_path)
        else:
            bad += 1
            if bad >= EARLY_STOPPING_PATIENCE:
                print('Early stopping: no improvement for', EARLY_STOPPING_PATIENCE, 'epochs')
                break
    
    model.load_state_dict(torch.load(best_path, map_location=device))
    print(f'✓ Pre-training completed! Best HF validation accuracy: {best_acc:.2f}%')
    return model, processor, best_path, best_acc


## Fine-tuning on Target Dataset


In [None]:
def finetune_model(model_kind, model_id, seed, stage_tag, train_paths_s, train_y_s, val_paths_s, val_y_s, 
                   pretrained_checkpoint=None, learning_rate=None, label_smoothing=None):
    """
    Fine-tune model on target dataset.
    If pretrained_checkpoint is provided, loads weights from pre-training.
    """
    seed_everything(seed)
    
    # Use model-specific learning rate and label smoothing
    if learning_rate is None:
        if model_kind == 'cnn':
            learning_rate = CNN_LEARNING_RATE
        elif model_kind == 'efficientnet':
            learning_rate = EFFNET_LEARNING_RATE
        else:
            learning_rate = DINO_LEARNING_RATE
    if label_smoothing is None:
        if model_kind == 'cnn':
            label_smoothing = CNN_LABEL_SMOOTHING
        elif model_kind == 'efficientnet':
            label_smoothing = EFFNET_LABEL_SMOOTHING
        else:
            label_smoothing = LABEL_SMOOTHING
    
    processor = AutoImageProcessor.from_pretrained(model_id)
    backbone = AutoModelForImageClassification.from_pretrained(model_id).to(device)
    if model_kind == 'dino':
        model = DINOv2ForEmojiClassification(backbone, num_labels=len(VENDOR_CLASSES)).to(device)
    elif model_kind == 'cnn':
        model = ConvNeXtV2ForEmojiClassification(backbone, num_labels=len(VENDOR_CLASSES), label_smoothing=label_smoothing).to(device)
    elif model_kind == 'efficientnet':
        model = EfficientNetForEmojiClassification(backbone, num_labels=len(VENDOR_CLASSES), label_smoothing=label_smoothing).to(device)
    else:
        raise ValueError('Unknown model_kind: ' + str(model_kind))
    
    # Load pre-trained weights if provided
    if pretrained_checkpoint is not None and os.path.exists(pretrained_checkpoint):
        model.load_state_dict(torch.load(pretrained_checkpoint, map_location=device))
        print(f'✓ Loaded pre-trained weights from {pretrained_checkpoint}')
    else:
        print('Starting from scratch (no pre-trained checkpoint)')

    train_ds = EmojiDataset(train_paths_s, train_y_s, processor, use_augmentation=True)
    val_ds = EmojiDataset(val_paths_s, val_y_s, processor, use_augmentation=False)
    bs = BATCH_SIZE_CUDA if torch.cuda.is_available() else BATCH_SIZE_CPU
    train_loader = DataLoader(train_ds, batch_size=bs, shuffle=True, num_workers=NUM_WORKERS, pin_memory=torch.cuda.is_available())
    val_loader = DataLoader(val_ds, batch_size=bs, shuffle=False, num_workers=NUM_WORKERS, pin_memory=torch.cuda.is_available())

    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=0.01)
    
    # Use model-specific scheduler: CosineAnnealingWarmRestarts for CNN, ReduceLROnPlateau for DINO
    # DINO: ReduceLROnPlateau (works well for transformers)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode='max', factor=0.5, patience=3, min_lr=1e-7, cooldown=1
    )
    use_plateau_scheduler = True
    print(f'Using ReduceLROnPlateau scheduler')
    
    scaler = None
    if torch.cuda.is_available() and (not torch.cuda.is_bf16_supported()):
        scaler = torch.cuda.amp.GradScaler()

    best_acc = -1.0
    best_path = f'best_{stage_tag}.pt'
    bad = 0
    for epoch in range(NUM_EPOCHS):
        print(f'\n[{stage_tag}] epoch {epoch+1}/{NUM_EPOCHS}')
        tr_loss, tr_acc = train_epoch(model, train_loader, optimizer, device, scaler)
        va_loss, va_acc, va_pred, va_true = validate(model, val_loader, device)
        
        # Step scheduler: ReduceLROnPlateau needs metric, others don't
        if use_plateau_scheduler:
            scheduler.step(va_acc)
        else:
            scheduler.step()
        print(f'Train: loss={tr_loss:.4f} acc={tr_acc:.2f}% | Val: loss={va_loss:.4f} acc={va_acc:.2f}%')
        if va_acc > best_acc + 1e-6:
            best_acc = va_acc
            bad = 0
            torch.save(model.state_dict(), best_path)
            print('✓ saved', best_path)
        else:
            bad += 1
            if bad >= EARLY_STOPPING_PATIENCE:
                print('Early stopping: no improvement for', EARLY_STOPPING_PATIENCE, 'epochs')
                break

    model.load_state_dict(torch.load(best_path, map_location=device))
    return model, processor, best_path, best_acc


## Load Target Dataset and Split

In [60]:
# Load target dataset
all_paths, all_labels = prepare_dataset_from_csv(SECOND_DATASET_TRAIN_DIR, SECOND_DATASET_CSV_PATH)

# Split target dataset
min_count = np.bincount(np.array(all_labels), minlength=len(VENDOR_CLASSES)).min()
can_stratify = (min_count >= 2)
print('Min class count:', int(min_count), 'Stratify:', can_stratify)
train_paths, val_paths, train_y, val_y = train_test_split(
    all_paths, all_labels, test_size=VAL_SIZE, random_state=RANDOM_STATE,
    stratify=all_labels if can_stratify else None
)
print('Train:', len(train_paths), 'Val:', len(val_paths))
print('Train dist:', np.bincount(np.array(train_y), minlength=len(VENDOR_CLASSES)))
print('Val   dist:', np.bincount(np.array(val_y), minlength=len(VENDOR_CLASSES)))


Loaded: 9879 images
Unmapped labels skipped: 0 Missing files skipped: 0
Label distribution: [1924 1877 1644 1667 1790  397  580]
Min class count: 397 Stratify: True
Train: 8891 Val: 988
Train dist: [1732 1689 1480 1500 1611  357  522]
Val   dist: [192 188 164 167 179  40  58]


## Training Pipeline: Pre-train + Fine-tune + Final Training


In [None]:
print('=== Pre-training on HuggingFace dataset ===')

# Pre-train 1 CNN model
print('\n--- Pre-training CNN ---')
cnn_pretrained, cnn_proc, cnn_hf_ckpt, cnn_hf_best = pretrain_on_hf_dataset('cnn', CNN_MODEL_ID, CNN_SEED, tag=CNN_TAG)
if device.type == 'cuda':
    cnn_pretrained = cnn_pretrained.to('cpu')
    torch.cuda.empty_cache()

# Pre-train 1 DINO model
print('\n--- Pre-training DINO ---')
dino_tag = f'dino_seed{DINO_SEED}'
dino_pretrained, dino_proc, dino_hf_ckpt, dino_hf_best = pretrain_on_hf_dataset('dino', DINO_MODEL_ID, DINO_SEED, tag=dino_tag)
if device.type == 'cuda':
    dino_pretrained = dino_pretrained.to('cpu')
    torch.cuda.empty_cache()

# Pre-train 2 EfficientNet models
print('\n--- Pre-training EfficientNet models ---')
effnet_pretrained_list = []
for effnet_model_id, effnet_tag in EFFICIENTNET_MODELS:
    effnet_pretrained, effnet_proc, effnet_hf_ckpt, effnet_hf_best = pretrain_on_hf_dataset('efficientnet', effnet_model_id, EFFNET_SEED, tag=effnet_tag)
    effnet_pretrained_list.append((effnet_pretrained, effnet_proc, effnet_hf_ckpt, effnet_tag))
    if device.type == 'cuda':
        effnet_pretrained = effnet_pretrained.to('cpu')
        torch.cuda.empty_cache()

print('\n=== Fine-tuning on target dataset ===')

# Fine-tune CNN
print('\n--- Fine-tuning CNN ---')
cnn_final, cnn_proc, cnn_ckpt, cnn_best = finetune_model(
    'cnn', CNN_MODEL_ID, CNN_SEED, f'cnn_finetuned_{CNN_TAG}',
    train_paths, train_y, val_paths, val_y,
    pretrained_checkpoint=cnn_hf_ckpt
)
if device.type == 'cuda':
    cnn_final = cnn_final.to('cpu')
    torch.cuda.empty_cache()

# Fine-tune DINO
print('\n--- Fine-tuning DINO ---')
dino_final, dino_proc, dino_ckpt, dino_best = finetune_model(
    'dino', DINO_MODEL_ID, DINO_SEED, f'dino_finetuned_seed{DINO_SEED}',
    train_paths, train_y, val_paths, val_y,
    pretrained_checkpoint=dino_hf_ckpt
)
if device.type == 'cuda':
    dino_final = dino_final.to('cpu')
    torch.cuda.empty_cache()

# Fine-tune EfficientNet models
print('\n--- Fine-tuning EfficientNet models ---')
effnet_final_list = []
for effnet_pretrained, effnet_proc, effnet_hf_ckpt, effnet_tag in effnet_pretrained_list:
    effnet_model_id = next(model_id for model_id, tag in EFFICIENTNET_MODELS if tag == effnet_tag)
    effnet_final, effnet_proc, effnet_ckpt, effnet_best = finetune_model(
        'efficientnet', effnet_model_id, EFFNET_SEED, f'effnet_finetuned_{effnet_tag}',
        train_paths, train_y, val_paths, val_y,
        pretrained_checkpoint=effnet_hf_ckpt
    )
    effnet_final_list.append((effnet_final, effnet_proc, effnet_ckpt, effnet_tag))
    if device.type == 'cuda':
        effnet_final = effnet_final.to('cpu')
        torch.cuda.empty_cache()

print('\n=== Final training on combined train+val (CNN + EfficientNet) ===')

# Final training on combined data for CNN and EfficientNet models
combined_train_paths = train_paths + val_paths
combined_train_labels = train_y + val_y

# Final training for CNN
print(f'\n--- Final training CNN {CNN_TAG} ---')
cnn_model = cnn_final.to(device)
combined_ds = EmojiDataset(combined_train_paths, combined_train_labels, cnn_proc, use_augmentation=True)
bs = BATCH_SIZE_CUDA if torch.cuda.is_available() else BATCH_SIZE_CPU
combined_loader = DataLoader(combined_ds, batch_size=bs, shuffle=True, num_workers=NUM_WORKERS, pin_memory=torch.cuda.is_available())
final_lr = CNN_LEARNING_RATE * FINAL_TRAIN_LR_MULT
optimizer = torch.optim.AdamW(cnn_model.parameters(), lr=final_lr, weight_decay=0.01)
scaler = None
if torch.cuda.is_available() and (not torch.cuda.is_bf16_supported()):
    scaler = torch.cuda.amp.GradScaler()
for epoch in range(FINAL_TRAIN_EPOCHS):
    print(f'  Epoch {epoch+1}/{FINAL_TRAIN_EPOCHS}')
    tr_loss, tr_acc = train_epoch(cnn_model, combined_loader, optimizer, device, scaler)
    print(f'  Train: loss={tr_loss:.4f} acc={tr_acc:.2f}%')
final_cnn_ckpt = f'final_cnn_{CNN_TAG}.pt'
torch.save(cnn_model.state_dict(), final_cnn_ckpt)
if device.type == 'cuda':
    cnn_model = cnn_model.to('cpu')
    torch.cuda.empty_cache()

# Final training for EfficientNet models
effnet_final_trained_list = []
for effnet_final, effnet_proc, effnet_ckpt, effnet_tag in effnet_final_list:
    print(f'\n--- Final training EfficientNet {effnet_tag} ---')
    effnet_model = effnet_final.to(device)
    combined_ds = EmojiDataset(combined_train_paths, combined_train_labels, effnet_proc, use_augmentation=True)
    combined_loader = DataLoader(combined_ds, batch_size=bs, shuffle=True, num_workers=NUM_WORKERS, pin_memory=torch.cuda.is_available())
    final_lr = EFFNET_LEARNING_RATE * FINAL_TRAIN_LR_MULT
    optimizer = torch.optim.AdamW(effnet_model.parameters(), lr=final_lr, weight_decay=0.01)
    scaler = None
    if torch.cuda.is_available() and (not torch.cuda.is_bf16_supported()):
        scaler = torch.cuda.amp.GradScaler()
    for epoch in range(FINAL_TRAIN_EPOCHS):
        print(f'  Epoch {epoch+1}/{FINAL_TRAIN_EPOCHS}')
        tr_loss, tr_acc = train_epoch(effnet_model, combined_loader, optimizer, device, scaler)
        print(f'  Train: loss={tr_loss:.4f} acc={tr_acc:.2f}%')
    final_effnet_ckpt = f'final_effnet_{effnet_tag}.pt'
    torch.save(effnet_model.state_dict(), final_effnet_ckpt)
    effnet_final_trained_list.append((effnet_model, effnet_proc, final_effnet_ckpt, effnet_tag))
    if device.type == 'cuda':
        effnet_model = effnet_model.to('cpu')
        torch.cuda.empty_cache()

# Build trained_members list
trained_members = []
# Load final CNN model
cnn_model.load_state_dict(torch.load(final_cnn_ckpt, map_location=device))
trained_members.append((cnn_model, cnn_proc, CNN_TAG))
# Add DINO (no final training)
trained_members.append((dino_final, dino_proc, dino_tag))
# Add EfficientNet models
for effnet_model, effnet_proc, effnet_ckpt, effnet_tag in effnet_final_trained_list:
    trained_members.append((effnet_model, effnet_proc, effnet_tag))

print('\n✓ Trained members:', [t for _,_,t in trained_members])


=== Pre-training on HuggingFace dataset ===

=== Pre-training CNN (seed=42) on HuggingFace dataset ===
HuggingFace dataset path: /root/.cache/kagglehub/datasets/subinium/emojiimage-dataset/versions/2
Loaded 14253 images from HuggingFace dataset
Label distribution: [4993 5809    0 1727 1724    0    0]
Label names: ['apple', 'google', 'whatsapp', 'facebook', 'samsung', 'mozilla', 'messenger']
HF Train: 12827, HF Val: 1426
Using ReduceLROnPlateau scheduler

[HF Pre-train cnn seed=42] epoch 1/30


Training: 100%|██████████| 802/802 [01:03<00:00, 12.59it/s]
Validation: 100%|██████████| 90/90 [00:01<00:00, 50.09it/s]


Train: loss=1.0592 acc=67.36% | Val: loss=1.1991 acc=66.97%
✓ saved pretrained_cnn_cnn_base_hf.pt

[HF Pre-train cnn seed=42] epoch 2/30


Training: 100%|██████████| 802/802 [00:58<00:00, 13.62it/s]
Validation: 100%|██████████| 90/90 [00:01<00:00, 50.40it/s]


Train: loss=0.7439 acc=86.11% | Val: loss=0.9691 acc=74.68%
✓ saved pretrained_cnn_cnn_base_hf.pt

[HF Pre-train cnn seed=42] epoch 3/30


Training: 100%|██████████| 802/802 [01:02<00:00, 12.89it/s]
Validation: 100%|██████████| 90/90 [00:01<00:00, 50.56it/s]


Train: loss=0.6277 acc=92.08% | Val: loss=0.9530 acc=77.00%
✓ saved pretrained_cnn_cnn_base_hf.pt

[HF Pre-train cnn seed=42] epoch 4/30


Training: 100%|██████████| 802/802 [01:03<00:00, 12.68it/s]
Validation: 100%|██████████| 90/90 [00:01<00:00, 46.00it/s]


Train: loss=0.5570 acc=95.62% | Val: loss=1.0125 acc=76.65%

[HF Pre-train cnn seed=42] epoch 5/30


Training: 100%|██████████| 802/802 [00:57<00:00, 13.90it/s]
Validation: 100%|██████████| 90/90 [00:01<00:00, 50.85it/s]


Train: loss=0.5111 acc=97.92% | Val: loss=1.0403 acc=75.95%

[HF Pre-train cnn seed=42] epoch 6/30


Training: 100%|██████████| 802/802 [00:58<00:00, 13.73it/s]
Validation: 100%|██████████| 90/90 [00:01<00:00, 47.97it/s]


Train: loss=0.4931 acc=98.54% | Val: loss=1.0765 acc=75.81%

[HF Pre-train cnn seed=42] epoch 7/30


Training: 100%|██████████| 802/802 [01:01<00:00, 12.98it/s]
Validation: 100%|██████████| 90/90 [00:01<00:00, 49.15it/s]


Train: loss=0.4802 acc=99.13% | Val: loss=1.0430 acc=76.79%

[HF Pre-train cnn seed=42] epoch 8/30


Training: 100%|██████████| 802/802 [01:00<00:00, 13.32it/s]
Validation: 100%|██████████| 90/90 [00:01<00:00, 49.97it/s]


Train: loss=0.4628 acc=99.76% | Val: loss=1.0105 acc=78.54%
✓ saved pretrained_cnn_cnn_base_hf.pt

[HF Pre-train cnn seed=42] epoch 9/30


Training: 100%|██████████| 802/802 [00:58<00:00, 13.79it/s]
Validation: 100%|██████████| 90/90 [00:01<00:00, 48.97it/s]


Train: loss=0.4582 acc=99.88% | Val: loss=1.0268 acc=78.89%
✓ saved pretrained_cnn_cnn_base_hf.pt

[HF Pre-train cnn seed=42] epoch 10/30


Training: 100%|██████████| 802/802 [00:58<00:00, 13.62it/s]
Validation: 100%|██████████| 90/90 [00:01<00:00, 48.80it/s]


Train: loss=0.4564 acc=99.89% | Val: loss=0.9834 acc=80.08%
✓ saved pretrained_cnn_cnn_base_hf.pt

[HF Pre-train cnn seed=42] epoch 11/30


Training: 100%|██████████| 802/802 [01:01<00:00, 13.00it/s]
Validation: 100%|██████████| 90/90 [00:01<00:00, 50.46it/s]


Train: loss=0.4607 acc=99.73% | Val: loss=1.0572 acc=77.91%

[HF Pre-train cnn seed=42] epoch 12/30


Training: 100%|██████████| 802/802 [00:59<00:00, 13.49it/s]
Validation: 100%|██████████| 90/90 [00:01<00:00, 50.65it/s]


Train: loss=0.4578 acc=99.77% | Val: loss=1.0936 acc=77.00%

[HF Pre-train cnn seed=42] epoch 13/30


Training: 100%|██████████| 802/802 [00:59<00:00, 13.56it/s]
Validation: 100%|██████████| 90/90 [00:01<00:00, 49.28it/s]


Train: loss=0.4543 acc=99.92% | Val: loss=1.0320 acc=79.45%

[HF Pre-train cnn seed=42] epoch 14/30


Training: 100%|██████████| 802/802 [00:57<00:00, 14.00it/s]
Validation: 100%|██████████| 90/90 [00:01<00:00, 51.34it/s]


Train: loss=0.4593 acc=99.67% | Val: loss=1.0568 acc=78.19%

[HF Pre-train cnn seed=42] epoch 15/30


Training: 100%|██████████| 802/802 [00:58<00:00, 13.74it/s]
Validation: 100%|██████████| 90/90 [00:01<00:00, 49.79it/s]


Train: loss=0.4531 acc=99.93% | Val: loss=1.0452 acc=79.17%

[HF Pre-train cnn seed=42] epoch 16/30


Training: 100%|██████████| 802/802 [00:59<00:00, 13.39it/s]
Validation: 100%|██████████| 90/90 [00:01<00:00, 50.83it/s]


Train: loss=0.4518 acc=99.97% | Val: loss=1.0407 acc=79.17%

[HF Pre-train cnn seed=42] epoch 17/30


Training: 100%|██████████| 802/802 [01:03<00:00, 12.58it/s]
Validation: 100%|██████████| 90/90 [00:01<00:00, 52.93it/s]


Train: loss=0.4515 acc=99.97% | Val: loss=1.0460 acc=79.38%
Early stopping: no improvement for 7 epochs
✓ Pre-training completed! Best HF validation accuracy: 80.08%

=== Pre-training CNN (seed=42) on HuggingFace dataset ===
HuggingFace dataset path: /root/.cache/kagglehub/datasets/subinium/emojiimage-dataset/versions/2
Loaded 14253 images from HuggingFace dataset
Label distribution: [4993 5809    0 1727 1724    0    0]
Label names: ['apple', 'google', 'whatsapp', 'facebook', 'samsung', 'mozilla', 'messenger']
HF Train: 12827, HF Val: 1426


preprocessor_config.json:   0%|          | 0.00/266 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/354M [00:00<?, ?B/s]

Using ReduceLROnPlateau scheduler

[HF Pre-train cnn seed=42] epoch 1/30


Training:   0%|          | 0/802 [00:00<?, ?it/s]


AttributeError: 'ConvNextForImageClassification' object has no attribute 'convnextv2'

## Feature matrix: stats + (A×C) prob vectors (Optimized)


In [None]:
def _prob_cols_for_members(members, num_augmentations):
    cols = []
    for _, _, tag in members:
        for i in range(num_augmentations):
            for c in range(len(VENDOR_CLASSES)):
                cols.append(f'prob_{tag}_aug{i}_cls{c}')
    return cols

@torch.no_grad()
def build_features_batched(image_paths, members, num_augmentations, batch_size=None):
    """
    Optimized batched feature extraction for maximum GPU efficiency.
    - Pre-loads all images and generates augmentations once
    - Processes all batches for each model before switching to the next model
    - Minimizes GPU model loading/unloading overhead
    """
    if batch_size is None:
        batch_size = FEATURE_BATCH_SIZE

    num_batches = (len(image_paths) + batch_size - 1) // batch_size
    
    # Step 1: Pre-load all images and generate augmentations once (model-independent)
    print('Pre-loading images and generating augmentations...')
    all_stats = []
    all_augmentations = []  # List of lists: [img_idx][aug_idx] = PIL Image
    
    for img_idx, img_path in enumerate(tqdm(image_paths, desc='Loading images', disable=(not SHOW_PROGRESS))):
        stats = extract_image_properties(img_path)
        img = load_image_rgb(img_path)
        augs = tta_aug.get_augmentations(img, num_augmentations=num_augmentations, seed_source=str(img_path))
        all_stats.append(stats)
        all_augmentations.append(augs)
    
    # Step 2: Process all batches for each model (minimize GPU model loading/unloading)
    model_probs = {}  # tag -> list of (B, A, C) arrays, one per batch
    
    for m, proc, tag in tqdm(members, desc='Processing models', disable=(not SHOW_PROGRESS)):
        # Load model to GPU once for all batches
        m.to(device)
        m.eval()
        
        model_batch_probs = []  # Store probabilities for all batches of this model
        
        for start in tqdm(range(0, len(image_paths), batch_size),
                          desc=f'  {tag} batches',
                          mininterval=TQDM_MININTERVAL,
                          disable=(not SHOW_PROGRESS),
                          total=num_batches,
                          leave=False):
            batch_end = min(start + batch_size, len(image_paths))
            B = batch_end - start
            
            # Collect augmentations for this batch
            batch_augs = []
            for img_idx in range(start, batch_end):
                batch_augs.extend(all_augmentations[img_idx])
            
            # Process batch
            inputs = proc(batch_augs, return_tensors='pt')
            x = inputs['pixel_values'].to(device)
            out = m(pixel_values=x)
            probs = torch.softmax(out.logits, dim=-1)
            probs = probs.view(B, num_augmentations, -1).detach().cpu().numpy()
            model_batch_probs.append(probs)
        
        model_probs[tag] = model_batch_probs
        
        # Move model back to CPU after processing all batches
        if device.type == 'cuda':
            m.to('cpu')
            torch.cuda.empty_cache()
    
    # Step 3: Combine stats and probabilities into final feature matrix
    print('Combining features...')
    all_rows = []
    for img_idx, stats in enumerate(all_stats):
        row = dict(stats)
        
        # Find which batch this image belongs to
        batch_idx = img_idx // batch_size
        batch_local_idx = img_idx % batch_size
        
        # Extract probabilities for this image from all models
        for tag, batch_probs_list in model_probs.items():
            batch_probs = batch_probs_list[batch_idx]  # (B, A, C)
            for a in range(num_augmentations):
                for c in range(len(VENDOR_CLASSES)):
                    row[f'prob_{tag}_aug{a}_cls{c}'] = float(batch_probs[batch_local_idx, a, c])
        
        all_rows.append(row)

    df = pd.DataFrame(all_rows)
    prob_cols = _prob_cols_for_members(members, num_augmentations)
    all_cols = STAT_COLS + prob_cols
    for col in all_cols:
        if col not in df.columns:
            df[col] = 0.0
    return df[all_cols]

# Estimate total features based on model configuration
example_tags = [CNN_TAG, f'dino_seed{DINO_SEED}'] + [tag for _, tag in EFFICIENTNET_MODELS]
print('Total features:', len(STAT_COLS) + len(_prob_cols_for_members([('','',t) for t in example_tags], NUM_TTA_AUGS)))
print('Using optimized batched feature extraction with batch_size:', FEATURE_BATCH_SIZE)


## Meta-model Training


In [None]:
# Use the trained_members list from the training pipeline
members = trained_members
print('Members for features:', [t for _,_,t in members])

X_train = build_features_batched(train_paths, members, NUM_TTA_AUGS, batch_size=FEATURE_BATCH_SIZE)
X_val = build_features_batched(val_paths, members, NUM_TTA_AUGS, batch_size=FEATURE_BATCH_SIZE)
y_train = np.array(train_y)
y_val = np.array(val_y)
print('X_train:', X_train.shape, 'X_val:', X_val.shape)

if USE_LIGHTGBM and HAS_LGB:
    meta = lgb.LGBMClassifier(**LGB_PARAMS)
    meta.fit(X_train, y_train)
    val_pred = meta.predict(X_val)
    print('Meta(LGB) val acc:', accuracy_score(y_val, val_pred)*100.0)
    print(classification_report(y_val, val_pred, target_names=VENDOR_CLASSES))
    meta.booster_.save_model('meta_lgb_v14.txt')
    print('Saved meta_lgb_v14.txt')
else:
    raise RuntimeError('LightGBM not available in this environment. Please install lightgbm or switch USE_LIGHTGBM=False')


## Final fit on Train+Val and predict Test


In [None]:
members = trained_members
combined_paths = train_paths + val_paths
combined_y = train_y + val_y

X_all = build_features_batched(combined_paths, members, NUM_TTA_AUGS, batch_size=FEATURE_BATCH_SIZE)
y_all = np.array(combined_y)

if not (USE_LIGHTGBM and HAS_LGB):
    raise RuntimeError('LightGBM not available for final fit')
meta_final = lgb.LGBMClassifier(**LGB_PARAMS)
meta_final.fit(X_all, y_all)
meta_final.booster_.save_model('meta_lgb_v14_final.txt')
print('Saved meta_lgb_v14_final.txt')

test_dir = SECOND_DATASET_TEST_DIR
if not test_dir.exists():
    raise FileNotFoundError(f'Missing test dir: {test_dir}')
test_paths = []
for ext in ('.png','.jpg','.jpeg','.PNG','.JPG','.JPEG'):
    test_paths += [str(p) for p in test_dir.rglob(f'*{ext}')]
test_paths = sorted(set(test_paths))
print('Found test images:', len(test_paths))

# Process all test images at once (optimized)
print('Extracting features for test set...')
X_test = build_features_batched(test_paths, members, NUM_TTA_AUGS, batch_size=FEATURE_BATCH_SIZE)
preds = meta_final.predict(X_test)

pred_ids = []
pred_labels = []
for img_path, pred in zip(test_paths, preds):
    img_id = Path(img_path).stem
    pred = int(pred)
    pred = max(0, min(pred, len(VENDOR_CLASSES)-1))
    pred_ids.append(img_id)
    pred_labels.append(IDX_TO_VENDOR[pred])

out_path = Path(PREDICTIONS_OUTPUT_FILE)
with out_path.open('w') as f:
    f.write('Id,Label\n')
    for i,l in zip(pred_ids, pred_labels):
        f.write(f'{str(i).strip()},{l}\n')
print('Wrote:', out_path, 'rows:', len(pred_labels))
