# V17 — No K-Fold, Gentle Fine-tune + Regularization

**Goal:** Improve generalization with gentler fine-tuning, stronger regularization, better augmentations, and no K-fold (single split).

**Key Features (V17):**
- **5 Architectures**: EfficientNet B0/B1/B2 + ConvNeXtV2 + DINOv2
- **No K-Fold**: Single stratified train/val split; each model trained once
- **Gentle Fine-tune**: Lower LR, shorter fine-tune, smaller scheduler patience; partial unfreeze
- **Regularization**: Higher dropout, label smoothing, optional head-only weight decay
- **Augmentations**: Light MixUp, small RandAugment/ColorJitter, enhanced TTA (extra scale + blur/sharpen; these extra views are only generated when more than the 12 base TTA views are requested)
- **Pseudo-Labeling**: Confidence raised (0.98–0.99) with V17-tagged checkpoints
- **LightGBM**: Tighter regularization

**Classes:** apple, google, whatsapp, facebook, samsung, mozilla, messenger


## Section 1: Setup & Installation


In [None]:
%pip install -r requirements.txt --upgrade -q


In [None]:
import os
import hashlib
import random
from pathlib import Path
from typing import List, Tuple, Dict, Optional

import numpy as np
import pandas as pd
from PIL import Image

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from tqdm import tqdm

import torchvision.transforms.functional as F

from transformers import AutoImageProcessor, AutoModelForImageClassification, AutoModel
from transformers.modeling_outputs import ImageClassifierOutput

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report

import kagglehub

# LightGBM is treated as optional: HAS_LGB records whether the import worked,
# and a failure is reported instead of aborting the notebook.
try:
    import lightgbm as lgb
except Exception as e:
    HAS_LGB = False
    print('LightGBM import failed:', e)
else:
    HAS_LGB = True

# Run on the GPU when PyTorch can see one, otherwise on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print('Using device:', device)
if torch.cuda.is_available():
    print('GPU:', torch.cuda.get_device_name(0))
    # Release cached allocator blocks before any model is loaded.
    torch.cuda.empty_cache()


## Section 2: Configuration & Hyperparameters


In [None]:
# ============================================================================
# CONFIGURATION
# ============================================================================
# Every name defined in this cell is a module-level global that other cells
# read by name.

# Data Paths
# Local dataset layout: a train/ image folder, a label CSV and a test/ image
# folder, all resolved relative to SECOND_DATASET_BASE_PATH.
SECOND_DATASET_BASE_PATH = '.'
SECOND_DATASET_TRAIN_DIR = Path(SECOND_DATASET_BASE_PATH) / 'train'
SECOND_DATASET_CSV_PATH = Path(SECOND_DATASET_BASE_PATH) / 'train_labels.csv'
SECOND_DATASET_TEST_DIR = Path(SECOND_DATASET_BASE_PATH) / 'test'

# HuggingFace Dataset
# NOTE(review): kagglehub is imported above, so this identifier is presumably
# resolved through kagglehub rather than the HuggingFace hub - confirm against
# the loading cell.
HF_DATASET_ID = 'subinium/emojiimage-dataset'

# ============================================================================
# MODEL CONFIGURATIONS - 5 Architectures with native resolutions
# ============================================================================
# Format: (model_id, tag, image_size)
# `tag` is the key used by MODEL_TYPES below and by the per-model TTA table.
MODELS = [
    ('google/efficientnet-b0', 'effnet_b0', 224),
    ('google/efficientnet-b1', 'effnet_b1', 240),
    ('google/efficientnet-b2', 'effnet_b2', 260),
    ('facebook/convnextv2-base-22k-224', 'cnn_base', 224),
    ('facebook/dinov2-base', 'dino', 224),
]

# Model type mapping for architecture-specific handling
# (create_model selects the wrapper class from this value).
MODEL_TYPES = {
    'effnet_b0': 'efficientnet',
    'effnet_b1': 'efficientnet',
    'effnet_b2': 'efficientnet',
    'cnn_base': 'convnext',
    'dino': 'dino',
}

# ============================================================================
# TRAINING HYPERPARAMETERS
# ============================================================================
RANDOM_STATE = 42
USE_KFOLD = False  # No K-Fold in V17
NUM_FOLDS = 1      # Placeholder to keep legacy functions inert

# Two-Phase Training
WARMUP_EPOCHS = 5            # Phase 1: Frozen backbone
WARMUP_LR = 1e-3
FINETUNE_EPOCHS = 12         # Gentler fine-tune
FINETUNE_LR = 4e-6
# Despite the name, the banner below reports this value as the scheduler
# patience.
EARLY_STOPPING_PATIENCE = 2  # Lower patience so LR drops sooner

# Batch sizes
BATCH_SIZE_CUDA = 16
BATCH_SIZE_CPU = 4
NUM_WORKERS = 2

# Regularization
LABEL_SMOOTHING = 0.1  # base; heads may override
WEIGHT_DECAY = 1e-5
HEAD_WEIGHT_DECAY = 1e-4  # optional, heads only
HEAD_DROPOUT_DELTA = 0.05  # add to existing head dropouts
# Per-architecture label smoothing passed to the wrappers by create_model:
# the CNN value goes to the EfficientNet and ConvNeXt wrappers, the DINO value
# to the DINOv2 wrapper.
LABEL_SMOOTHING_CNN = 0.14
LABEL_SMOOTHING_DINO = 0.1

# MixUp / Augment
USE_MIXUP = True
MIXUP_ALPHA = 0.3  # both parameters of the Beta distribution that draws the MixUp coefficient
USE_EXTRA_COLOR = True  # small RandAugment/ColorJitter-like boost

# Pseudo-labeling
PSEUDO_LABEL_CONFIDENCE = 0.985
PSEUDO_LABEL_EPOCHS = 3
PSEUDO_LABEL_LR = 5e-6

# TTA tweaks
EXTRA_TTA_SCALE = 0.95  # center-crop ratio of the extra TTA view; a falsy value disables it
USE_TTA_BLUR_SHARPEN = True



# ============================================================================
# TTA & FEATURE EXTRACTION
# ============================================================================
# NOTE(review): MultiScaleTTA.get_augmentations produces 12 base views before
# the extra-scale / blur / sharpen views, so a budget of 12 never reaches the
# extras. Assuming this constant is what gets passed as `num_augmentations`,
# it would need to be raised (e.g. to 15) for EXTRA_TTA_SCALE and
# USE_TTA_BLUR_SHARPEN to affect TTA - confirm against the inference cell.
NUM_TTA_AUGS = 12  # Multi-scale TTA
FEATURE_BATCH_SIZE = 16
TQDM_MININTERVAL = 10  # passed to tqdm as `mininterval`
SHOW_PROGRESS = True   # progress bars are disabled when this is False

# ============================================================================
# META-MODEL (LightGBM)
# ============================================================================
USE_LIGHTGBM = True
LGB_PARAMS = {
    'n_estimators': 2000,
    'learning_rate': 0.015,
    'num_leaves': 95,
    'max_depth': -1,
    'subsample': 0.8,
    'bagging_freq': 1,
    'feature_fraction': 0.85,
    'min_child_samples': 40,
    'lambda_l1': 0.2,
    'lambda_l2': 0.2,
    'random_state': RANDOM_STATE,
    'verbosity': -1,
    'n_jobs': -1,
}

# Output
PREDICTIONS_OUTPUT_FILE = 'predictions_V17.csv'

# ============================================================================
# PRINT CONFIGURATION
# ============================================================================
# Echo the key settings so they are recorded in the notebook output.
print('='*60)
print('V17 Configuration (No K-Fold)')
print('='*60)
print(f'Models: {len(MODELS)} architectures')
for model_id, tag, img_size in MODELS:
    print(f'  - {tag}: {model_id} ({img_size}x{img_size})')
print(f'Two-Phase: {WARMUP_EPOCHS} warmup + {FINETUNE_EPOCHS} finetune epochs')
print(f'Fine-tune LR: {FINETUNE_LR}, scheduler patience: {EARLY_STOPPING_PATIENCE}')
print(f'TTA: {NUM_TTA_AUGS} augmentations (multi-scale)')
print(f'Pseudo-labeling: confidence > {PSEUDO_LABEL_CONFIDENCE}')
print('='*60)


In [None]:
# ============================================================================
# CLASS DEFINITIONS & LABEL MAPPING
# ============================================================================

# Target vendor classes; a vendor's position in this list is its integer label.
VENDOR_CLASSES = ['apple', 'google', 'whatsapp', 'facebook', 'samsung', 'mozilla', 'messenger']
VENDOR_TO_IDX = {v: i for i, v in enumerate(VENDOR_CLASSES)}
IDX_TO_VENDOR = {i: v for v, i in VENDOR_TO_IDX.items()}
NUM_CLASSES = len(VENDOR_CLASSES)

# Label mapping from the auxiliary dataset's vendor names to the 7 target
# classes. Vendors that have no target class of their own are folded into
# 'apple' or 'google'.
# NOTE(review): folding DoCoMo/JoyPixels/KDDI/SoftBank into 'apple' and
# Twitter/Windows into 'google' adds foreign styles to those classes - confirm
# this is intentional.
HF_TO_TARGET_MAPPING = {
    'Apple': 'apple',
    'Google': 'google', 'Gmail': 'google',
    # 'mozilla' is a target class in its own right, so Mozilla images must keep
    # their own label rather than being relabelled as 'google'.
    'Mozilla': 'mozilla',
    'Facebook': 'facebook',
    'Samsung': 'samsung',
    'WhatsApp': 'whatsapp',
    'Messenger': 'messenger',
    'DoCoMo': 'apple', 'JoyPixels': 'apple', 'KDDI': 'apple', 'SoftBank': 'apple',
    'Twitter': 'google', 'Windows': 'google'
}

print('VENDOR_CLASSES:', VENDOR_CLASSES)
print('NUM_CLASSES:', NUM_CLASSES)


## Section 3: Model Wrappers with Metadata Fusion

Each model wrapper includes:
- Backbone feature extraction
- Metadata branch (image size + transparency)
- Fusion layer combining visual features + metadata
- Classification head with label smoothing


In [None]:
class EfficientNetWithMetadata(nn.Module):
    """EfficientNet backbone whose pooled features are fused with image metadata.

    The fusion classifier is built lazily: it does not exist (``classifier`` is
    ``None``) until ``_ensure_classifier_initialized`` or the first ``forward``
    call has measured the backbone's pooled feature width. Its parameters are
    therefore absent from ``parameters()`` before that point.
    """

    def __init__(self, base_model, num_labels, label_smoothing=0.1):
        super().__init__()
        self.base_model = base_model
        self.num_labels = num_labels
        self.label_smoothing = label_smoothing

        # Filled in once the pooled feature width has been probed.
        self._hidden_size = None
        self.classifier = None
        self._needs_init = True

        # Metadata branch; input is [height_norm, width_norm, has_alpha].
        meta_layers = [
            nn.Linear(3, 32),
            nn.ReLU(),
            nn.Dropout(0.2 + HEAD_DROPOUT_DELTA),
            nn.Linear(32, 32),
            nn.ReLU(),
        ]
        self.metadata_mlp = nn.Sequential(*meta_layers)

    def _extract_pooled_features(self, pixel_values):
        """Run the backbone and reduce its output to one feature vector per image."""
        backbone = getattr(self.base_model, 'efficientnet', self.base_model)
        raw = backbone(pixel_values)

        # Preferred: an output object that already carries a pooled vector.
        pooler = getattr(raw, 'pooler_output', None)
        if pooler is not None:
            return pooler

        if hasattr(raw, 'last_hidden_state'):
            feats = raw.last_hidden_state
            rank = len(feats.shape)
            if rank == 4:
                # Rank-4 map: average over the last two dimensions.
                return feats.mean(dim=[2, 3])
            # Rank-3 output: keep position 0 of the second dimension.
            return feats[:, 0] if rank == 3 else feats

        # Plain tuple or tensor output: pool only when it is rank 4.
        feats = raw[0] if isinstance(raw, tuple) else raw
        return feats.mean(dim=[2, 3]) if len(feats.shape) == 4 else feats

    def _ensure_classifier_initialized(self, pixel_values):
        """Build the fusion classifier on first use; later calls are no-ops."""
        if self.classifier is not None:
            return

        # Probe with a single sample to learn the pooled feature width.
        with torch.no_grad():
            hidden_size = self._extract_pooled_features(pixel_values[:1]).shape[-1]

        self._hidden_size = hidden_size
        fused_dim = hidden_size + 32  # image features + 32 metadata features
        head = nn.Sequential(
            nn.LayerNorm(fused_dim),
            nn.Dropout(0.3 + HEAD_DROPOUT_DELTA),
            nn.Linear(fused_dim, 256),
            nn.GELU(),
            nn.LayerNorm(256),
            nn.Dropout(0.2 + HEAD_DROPOUT_DELTA),
            nn.Linear(256, self.num_labels)
        )
        self.classifier = head.to(pixel_values.device)
        self._needs_init = False
        print(f'Initialized classifier with hidden_size={hidden_size}')

    def forward(self, pixel_values, metadata=None, labels=None):
        """Return an ``ImageClassifierOutput`` with logits and, if labels are given, the loss."""
        # No-op once the classifier exists.
        self._ensure_classifier_initialized(pixel_values)

        pooled = self._extract_pooled_features(pixel_values)

        # Missing metadata is replaced by an all-zero metadata embedding.
        if metadata is None:
            meta_features = torch.zeros(pooled.shape[0], 32, device=pooled.device)
        else:
            meta_features = self.metadata_mlp(metadata)
        fused = torch.cat([pooled, meta_features], dim=-1)

        logits = self.classifier(fused)

        loss = None
        if labels is not None:
            # Out-of-range labels are clamped into [0, num_labels - 1].
            labels = torch.clamp(labels, 0, self.num_labels - 1)
            criterion = nn.CrossEntropyLoss(label_smoothing=self.label_smoothing)
            loss = criterion(logits.view(-1, self.num_labels), labels.view(-1))

        return ImageClassifierOutput(loss=loss, logits=logits)

    def freeze_backbone(self):
        """Freeze the backbone; keep the metadata branch and classifier trainable."""
        self.base_model.requires_grad_(False)
        self.metadata_mlp.requires_grad_(True)
        if self.classifier is not None:
            self.classifier.requires_grad_(True)

    def unfreeze_backbone(self):
        """Make every backbone parameter trainable again."""
        self.base_model.requires_grad_(True)

print('EfficientNetWithMetadata defined')


In [None]:
class ConvNeXtWithMetadata(nn.Module):
    """ConvNeXtV2 backbone whose pooled features are fused with image metadata."""

    def __init__(self, base_model, num_labels, label_smoothing=0.1):
        super().__init__()
        self.base_model = base_model
        self.num_labels = num_labels
        self.label_smoothing = label_smoothing

        # Width of the last stage from the model config; 1024 when no
        # `hidden_sizes` entry is available.
        config = getattr(base_model, 'config', None)
        hidden = getattr(config, 'hidden_sizes', [1024])[-1]
        self._hidden_size = hidden
        fused_dim = hidden + 32
        half = hidden // 2

        # Metadata branch (3 inputs -> 32 features).
        self.metadata_mlp = nn.Sequential(
            nn.Linear(3, 32),
            nn.ReLU(),
            nn.Dropout(0.2 + HEAD_DROPOUT_DELTA),
            nn.Linear(32, 32),
            nn.ReLU()
        )

        # Classifier over the concatenated image + metadata features.
        self.classifier = nn.Sequential(
            nn.LayerNorm(fused_dim),
            nn.Dropout(0.3 + HEAD_DROPOUT_DELTA),
            nn.Linear(fused_dim, half),
            nn.GELU(),
            nn.LayerNorm(half),
            nn.Dropout(0.2 + HEAD_DROPOUT_DELTA),
            nn.Linear(half, num_labels)
        )

    def forward(self, pixel_values, metadata=None, labels=None):
        """Return an ``ImageClassifierOutput`` with logits and, if labels are given, the loss."""
        # Use the inner `convnextv2` module if present, else `convnext`, else
        # the wrapped model itself.
        backbone = getattr(self.base_model, 'convnextv2', None)
        if backbone is None:
            backbone = getattr(self.base_model, 'convnext', self.base_model)

        feats = backbone(pixel_values).last_hidden_state

        # Rank-4 maps are averaged over their last two dimensions.
        pooled = feats.mean(dim=[2, 3]) if len(feats.shape) == 4 else feats

        # Missing metadata is replaced by an all-zero metadata embedding.
        if metadata is None:
            meta_features = torch.zeros(pooled.shape[0], 32, device=pooled.device)
        else:
            meta_features = self.metadata_mlp(metadata)
        fused = torch.cat([pooled, meta_features], dim=-1)

        logits = self.classifier(fused)

        loss = None
        if labels is not None:
            # Out-of-range labels are clamped into [0, num_labels - 1].
            labels = torch.clamp(labels, 0, self.num_labels - 1)
            criterion = nn.CrossEntropyLoss(label_smoothing=self.label_smoothing)
            loss = criterion(logits.view(-1, self.num_labels), labels.view(-1))

        return ImageClassifierOutput(loss=loss, logits=logits)

    def freeze_backbone(self):
        """Freeze the backbone; keep the metadata branch and classifier trainable."""
        self.base_model.requires_grad_(False)
        for head in (self.metadata_mlp, self.classifier):
            head.requires_grad_(True)

    def unfreeze_backbone(self):
        """Make every backbone parameter trainable again."""
        self.base_model.requires_grad_(True)

print('ConvNeXtWithMetadata defined')


In [None]:
class DINOWithMetadata(nn.Module):
    """DINOv2 backbone whose pooled features are fused with image metadata."""

    def __init__(self, base_model, num_labels, label_smoothing=0.1):
        super().__init__()
        self.base_model = base_model
        self.num_labels = num_labels
        self.label_smoothing = label_smoothing

        # Embedding width from the model config; 768 if it is not declared.
        hidden = getattr(base_model.config, 'hidden_size', 768)
        self._hidden_size = hidden
        fused_dim = hidden + 32

        # Metadata branch (3 inputs -> 32 features).
        # NOTE(review): unlike the EfficientNet/ConvNeXt wrappers in this file,
        # the dropouts here do not add HEAD_DROPOUT_DELTA - confirm intended.
        self.metadata_mlp = nn.Sequential(
            nn.Linear(3, 32),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(32, 32),
            nn.ReLU()
        )

        # Single linear layer on purpose: a simpler head for DINO
        # (less prone to overfitting).
        self.classifier = nn.Sequential(
            nn.LayerNorm(fused_dim),
            nn.Dropout(0.2),
            nn.Linear(fused_dim, num_labels)
        )

    def forward(self, pixel_values, metadata=None, labels=None):
        """Return an ``ImageClassifierOutput`` with logits and, if labels are given, the loss."""
        out = self.base_model(pixel_values=pixel_values, output_hidden_states=True)

        # Use the pooled output when the backbone provides one; otherwise take
        # position 0 of the last hidden state.
        pooled = getattr(out, 'pooler_output', None)
        if pooled is None:
            pooled = out.hidden_states[-1][:, 0, :]

        # Missing metadata is replaced by an all-zero metadata embedding.
        if metadata is None:
            meta_features = torch.zeros(pooled.shape[0], 32, device=pooled.device)
        else:
            meta_features = self.metadata_mlp(metadata)
        fused = torch.cat([pooled, meta_features], dim=-1)

        logits = self.classifier(fused)

        loss = None
        if labels is not None:
            # Out-of-range labels are clamped into [0, num_labels - 1].
            labels = torch.clamp(labels, 0, self.num_labels - 1)
            criterion = nn.CrossEntropyLoss(label_smoothing=self.label_smoothing)
            loss = criterion(logits.view(-1, self.num_labels), labels.view(-1))

        return ImageClassifierOutput(loss=loss, logits=logits)

    def freeze_backbone(self):
        """Freeze the backbone; keep the metadata branch and classifier trainable."""
        self.base_model.requires_grad_(False)
        for head in (self.metadata_mlp, self.classifier):
            head.requires_grad_(True)

    def unfreeze_backbone(self):
        """Make every backbone parameter trainable again."""
        self.base_model.requires_grad_(True)

print('DINOWithMetadata defined')


In [None]:
def create_model(model_id: str, tag: str, num_labels: int = NUM_CLASSES) -> Tuple[nn.Module, object]:
    """
    Factory function to create the appropriate model wrapper.

    Args:
        model_id: Pretrained model identifier passed to ``from_pretrained``.
        tag: Short model tag; must be a key of ``MODEL_TYPES``.
        num_labels: Number of output classes for the wrapper's classifier.

    Returns:
        (model, processor): the metadata-fusion wrapper around the pretrained
        backbone, and the image processor loaded for the same ``model_id``.

    Raises:
        KeyError: if ``tag`` is not in ``MODEL_TYPES``.
        ValueError: if the mapped model type has no wrapper.
    """
    model_type = MODEL_TYPES[tag]

    # model type -> (backbone loader class, wrapper class, label smoothing)
    recipes = {
        'efficientnet': (AutoModelForImageClassification, EfficientNetWithMetadata, LABEL_SMOOTHING_CNN),
        'convnext': (AutoModelForImageClassification, ConvNeXtWithMetadata, LABEL_SMOOTHING_CNN),
        'dino': (AutoModel, DINOWithMetadata, LABEL_SMOOTHING_DINO),
    }
    # Validate before loading anything so an unknown type fails fast.
    if model_type not in recipes:
        raise ValueError(f'Unknown model type: {model_type}')
    backbone_loader, wrapper_cls, smoothing = recipes[model_type]

    processor = AutoImageProcessor.from_pretrained(model_id)
    backbone = backbone_loader.from_pretrained(model_id)
    model = wrapper_cls(backbone, num_labels, label_smoothing=smoothing)

    return model, processor

print('Model factory function defined')


## Section 4: Utility Functions & Data Loading


In [None]:
def seed_everything(seed: int):
    """Seed Python's ``random``, NumPy and PyTorch (CPU, plus all GPUs when CUDA is available)."""
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)

def load_image_rgb(path: str) -> Image.Image:
    """Load an image as RGB, compositing any transparency onto a white background.

    The file is opened in a context manager so its handle is released before
    returning; the returned image is always a fully loaded in-memory copy.

    Args:
        path: Path of the image file to read.

    Returns:
        An RGB ``PIL.Image.Image``.
    """
    with Image.open(path) as src:
        img = src
        # Palette ('P') and grey+alpha ('LA') images go through RGBA so their
        # transparency is composited on white instead of being dropped by a
        # direct RGB conversion.
        if img.mode in ('P', 'LA'):
            img = img.convert('RGBA')
        if img.mode == 'RGBA':
            canvas = Image.new('RGB', img.size, (255, 255, 255))
            canvas.paste(img, mask=img.split()[3])
            return canvas
        # convert() loads the pixel data and returns a new image, so the result
        # stays valid after the file is closed (also for images already in RGB).
        return img.convert('RGB')

def extract_metadata(image_path: str) -> Dict[str, float]:
    """
    Extract metadata features from image for fusion.

    Only the image header is needed (size and mode); the file is opened in a
    context manager so the handle is closed immediately.

    Args:
        image_path: Path of the image file to inspect.

    Returns:
        dict with ``height_norm`` and ``width_norm`` (pixel size divided by
        256) and ``has_alpha`` (1.0 for modes RGBA, LA and P, else 0.0). If the
        file cannot be read, neutral defaults (1.0, 1.0, 0.0) are returned
        instead of raising.
    """
    try:
        with Image.open(image_path) as img:
            w, h = img.size
            mode = img.mode
    except Exception:
        # Best effort: an unreadable image yields neutral metadata.
        return {'height_norm': 1.0, 'width_norm': 1.0, 'has_alpha': 0.0}

    # NOTE(review): every palette ('P') image is flagged as having alpha,
    # whether or not its palette actually declares transparency.
    has_alpha = 1.0 if mode in ('RGBA', 'LA', 'P') else 0.0
    return {
        'height_norm': h / 256.0,
        'width_norm': w / 256.0,
        'has_alpha': has_alpha
    }

def extract_image_properties(image_path: str) -> Dict[str, float]:
    """Extract full statistical properties from image for feature matrix.

    Pixel statistics are computed on the RGB image after transparency has been
    composited onto white. The file is opened in a context manager so its
    handle is released.

    Args:
        image_path: Path of the image file to analyse.

    Returns:
        dict with one float per name in ``STAT_COLS``. If anything fails while
        reading or analysing the file, a fixed set of default statistics is
        returned instead of raising.
    """
    try:
        with Image.open(image_path) as src:
            mode_mapping = {'L': 0, 'LA': 1, 'P': 2, 'RGB': 3, 'RGBA': 4}
            # Unknown modes are encoded like RGB (3).
            original_mode = float(mode_mapping.get(src.mode, 3))

            # Convert for pixel stats. 'P' and 'LA' go through RGBA so their
            # transparency is composited on white rather than dropped.
            img = src
            if img.mode in ('P', 'LA'):
                img = img.convert('RGBA')
            if img.mode == 'RGBA':
                bg = Image.new('RGB', img.size, (255, 255, 255))
                bg.paste(img, mask=img.split()[3])
                img = bg
            elif img.mode != 'RGB':
                img = img.convert('RGB')

            w, h = img.size
            # Materialise the pixels while the file is still open.
            arr = np.array(img)

        # Per-channel statistics, computed once and reused for brightness.
        means = [float(arr[:, :, c].mean()) for c in range(3)]
        stds = [float(arr[:, :, c].std()) for c in range(3)]

        return {
            'width': float(w),
            'height': float(h),
            'aspect_ratio': float(w / h) if h else 1.0,
            'pixel_count': float(w * h),
            'mean_r': means[0],
            'mean_g': means[1],
            'mean_b': means[2],
            'std_r': stds[0],
            'std_g': stds[1],
            'std_b': stds[2],
            'brightness': float((means[0] + means[1] + means[2]) / 3.0),
            'is_mostly_white': float(arr.mean() > 200),
            'original_mode': original_mode
        }
    except Exception:
        # Best effort: fall back to fixed defaults for unreadable images.
        return {
            'width': 224.0, 'height': 224.0, 'aspect_ratio': 1.0, 'pixel_count': 50176.0,
            'mean_r': 128.0, 'mean_g': 128.0, 'mean_b': 128.0,
            'std_r': 50.0, 'std_g': 50.0, 'std_b': 50.0,
            'brightness': 128.0, 'is_mostly_white': 0.0, 'original_mode': 3.0
        }

# Names of the statistics returned by extract_image_properties, in a fixed order.
STAT_COLS = ['width', 'height', 'aspect_ratio', 'pixel_count', 'mean_r', 'mean_g', 'mean_b',
             'std_r', 'std_g', 'std_b', 'brightness', 'is_mostly_white', 'original_mode']

print('Utility functions defined')


In [None]:
def compute_class_weights(labels: List[int]) -> torch.Tensor:
    """
    Derive inverse-frequency class weights to counter class imbalance.

    Raw weight = total_samples / (num_classes * class_count), with empty
    classes counted as one sample; the weights are then rescaled so that they
    sum to NUM_CLASSES. The per-class weights and counts are printed.
    """
    n_samples = len(labels)

    # Count per class; the floor of 1 avoids dividing by zero for absent classes.
    counts = np.maximum(np.bincount(np.array(labels), minlength=NUM_CLASSES), 1)

    raw = n_samples / (NUM_CLASSES * counts)
    weights = raw / raw.sum() * NUM_CLASSES  # Normalize

    print('Class weights:')
    for cls, w, count in zip(VENDOR_CLASSES, weights, counts):
        print(f'  {cls:12s}: {w:.3f} (n={count})')

    return torch.tensor(weights, dtype=torch.float32)

print('Class weights function defined')


## Section 5: Multi-Scale TTA Augmentations (12 augmentations)


In [None]:
class MultiScaleTTA:
    """
    Deterministic test-time augmentation views plus a seeded training augmentation.

    ``get_augmentations`` emits views in a fixed order and stops as soon as the
    requested number has been produced:
    - Original + HFlip + VFlip (3)
    - Rotations: -10, -5, +5, +10 degrees (4)
    - Corner crops: TL, TR, BL, BR at 90% (4)
    - Center crop at 85% (1)
    - Extras, reached only when more than 12 views are requested: a center
      crop at EXTRA_TTA_SCALE (1), then a Gaussian blur (1) and a sharpen (1)
      when USE_TTA_BLUR_SHARPEN is set
    """

    def __init__(self, base_image_size: int = 224):
        self.base_size = base_image_size
        self.rotation_angles = [-10, -5, 5, 10]
        self.crop_ratio = 0.9
        self.center_crop_ratio = 0.85

    def _get_deterministic_seed(self, image_or_hash):
        """Return a 32-bit seed derived from image pixels or from ``str(value)``.

        Both branches use MD5 so the seed is identical across processes and
        runs; the builtin ``hash()`` of a string is randomised per interpreter
        process and therefore is not reproducible.
        """
        if isinstance(image_or_hash, Image.Image):
            payload = image_or_hash.tobytes()
        else:
            payload = str(image_or_hash).encode('utf-8')
        return int(hashlib.md5(payload).hexdigest()[:8], 16)

    def _fit(self, image: Image.Image) -> Image.Image:
        """Resize ``image`` to the square model input size with bilinear filtering."""
        return image.resize((self.base_size, self.base_size), Image.BILINEAR)

    def get_augmentations(self, image: Image.Image, num_augmentations: int = 12,
                          seed_source=None) -> List[Image.Image]:
        """Generate deterministic augmentations.

        Args:
            image: Source PIL image.
            num_augmentations: Maximum number of views to return.
            seed_source: Accepted for interface compatibility; not used, since
                every view is a fixed transform.

        Returns:
            Up to ``num_augmentations`` views, each resized to
            ``base_size x base_size``, in the order listed in the class docstring.
        """
        augs = []

        # 1. Original
        augs.append(self._fit(image))

        # 2. Horizontal flip
        if len(augs) < num_augmentations:
            augs.append(self._fit(F.hflip(image)))

        # 3. Vertical flip
        if len(augs) < num_augmentations:
            augs.append(self._fit(F.vflip(image)))

        # 4-7. Rotations
        for angle in self.rotation_angles:
            if len(augs) >= num_augmentations:
                break
            augs.append(self._fit(F.rotate(image, angle)))

        # 8-11. Corner crops
        w, h = image.size
        crop_size = int(min(w, h) * self.crop_ratio)
        corners = [
            (0, 0),  # TL
            (w - crop_size, 0),  # TR
            (0, h - crop_size),  # BL
            (w - crop_size, h - crop_size),  # BR
        ]
        for (left, top) in corners:
            if len(augs) >= num_augmentations:
                break
            augs.append(self._fit(F.crop(image, top, left, crop_size, crop_size)))

        # 12. Center crop
        if len(augs) < num_augmentations:
            center_size = int(min(w, h) * self.center_crop_ratio)
            augs.append(self._fit(F.center_crop(image, [center_size, center_size])))

        # Extra mild scale (only reached when more than 12 views are requested)
        if EXTRA_TTA_SCALE and len(augs) < num_augmentations:
            scale_size = int(min(w, h) * EXTRA_TTA_SCALE)
            augs.append(self._fit(F.center_crop(image, [scale_size, scale_size])))

        # Blur / sharpen variants
        if USE_TTA_BLUR_SHARPEN and len(augs) < num_augmentations:
            augs.append(self._fit(F.gaussian_blur(image, kernel_size=3, sigma=(0.5, 0.8))))
        if USE_TTA_BLUR_SHARPEN and len(augs) < num_augmentations:
            augs.append(self._fit(F.adjust_sharpness(image, sharpness_factor=1.5)))

        return augs[:num_augmentations]

    def apply_training_augmentation(self, image: Image.Image, seed_source=None) -> Image.Image:
        """Apply a seeded flip/rotate/crop/colour augmentation for training.

        Every random choice is derived from one seed, so the same
        ``seed_source`` always yields the same augmentation. The seed is taken
        from the image content when ``seed_source`` is None, from the string
        when it is a ``str``, and from ``int(seed_source)`` otherwise.

        A private ``RandomState`` is used rather than reseeding NumPy's global
        generator, so calling this does not disturb other consumers of
        ``np.random`` (for example the MixUp coefficient draw). It produces the
        same value stream that seeding the global generator would.

        Returns:
            The augmented image resized to ``base_size x base_size``.
        """
        if seed_source is None:
            seed_val = self._get_deterministic_seed(image)
        elif isinstance(seed_source, str):
            seed_val = self._get_deterministic_seed(seed_source)
        else:
            seed_val = int(seed_source)

        rng = np.random.RandomState(seed_val % (2**32))

        # Horizontal flip for even seeds
        if seed_val % 2 == 0:
            image = F.hflip(image)

        # Rotation angle selected by the seed
        angle_idx = (seed_val // 2) % len(self.rotation_angles)
        angle = self.rotation_angles[angle_idx]
        image = F.rotate(image, angle)

        # Center crop covering 85-100% of the shorter side
        w, h = image.size
        crop_ratio = 0.85 + rng.uniform(0, 0.15)
        crop_size = int(min(w, h) * crop_ratio)
        image = F.center_crop(image, [crop_size, crop_size])

        # Colour jitter
        brightness = 1.0 + rng.uniform(-0.3, 0.3)
        contrast = 1.0 + rng.uniform(-0.3, 0.3)
        saturation = 1.0 + rng.uniform(-0.3, 0.3)
        if USE_EXTRA_COLOR:
            hue = rng.uniform(-0.08, 0.08)
            image = F.adjust_hue(image, hue)
            saturation *= 1.05
            contrast *= 1.05

        image = F.adjust_brightness(image, brightness)
        image = F.adjust_contrast(image, contrast)
        image = F.adjust_saturation(image, saturation)

        # Optional mild blur/sharpen during training
        if USE_TTA_BLUR_SHARPEN and (seed_val % 5 == 0):
            image = F.gaussian_blur(image, kernel_size=3, sigma=(0.5, 0.8))
        if USE_TTA_BLUR_SHARPEN and (seed_val % 7 == 0):
            image = F.adjust_sharpness(image, sharpness_factor=1.5)

        # Resize to target
        return self._fit(image)

# Create TTA instances for different image sizes
tta_instances = {}
for model_id, tag, img_size in MODELS:
    tta_instances[tag] = MultiScaleTTA(base_image_size=img_size)

print(f'Created TTA instances for {len(tta_instances)} model sizes')


In [None]:
class EmojiDatasetWithMetadata(Dataset):
    """Image dataset yielding processed pixels, a 3-value metadata vector and the label.

    When ``use_augmentation`` is set, the training augmentation is seeded with
    the file path string, so a given file is augmented from the same seed on
    every access.
    """

    def __init__(self, image_paths: List[str], labels: List[int],
                 processor, model_tag: str, use_augmentation: bool = False):
        self.image_paths = list(image_paths)
        self.labels = list(labels)
        self.processor = processor
        self.model_tag = model_tag
        self.use_augmentation = use_augmentation
        # Per-model augmenter; a 224px one is used for unknown tags.
        self.tta = tta_instances.get(model_tag, MultiScaleTTA(224))

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        path = self.image_paths[idx]
        raw_label = int(self.labels[idx])

        # RGB image plus the metadata read from the same file.
        img = load_image_rgb(path)
        meta = extract_metadata(path)
        meta_values = [meta[key] for key in ('height_norm', 'width_norm', 'has_alpha')]
        metadata = torch.tensor(meta_values, dtype=torch.float32)

        # Training-time augmentation, seeded by the path.
        if self.use_augmentation:
            img = self.tta.apply_training_augmentation(img, seed_source=str(path))

        # The processor returns a batch of one; drop the batch dimension.
        pixel_values = self.processor(img, return_tensors='pt')['pixel_values'].squeeze(0)

        # Out-of-range labels are clamped into [0, NUM_CLASSES - 1].
        label = int(max(0, min(raw_label, NUM_CLASSES - 1)))

        return {
            'pixel_values': pixel_values,
            'metadata': metadata,
            'labels': torch.tensor(label, dtype=torch.long)
        }

print('EmojiDatasetWithMetadata defined')


In [None]:
def train_epoch(model, loader, optimizer, device, scaler=None, class_weights=None,
                use_mixup=False, mixup_alpha=0.3):
    """Train for one epoch with optional class weights and MixUp.

    Args:
        model: Wrapper exposing ``num_labels`` and returning an output with
            ``.logits`` (and ``.loss`` when labels are passed).
        loader: Iterable of batches with 'pixel_values', 'metadata', 'labels'.
        optimizer: Optimizer stepped once per batch.
        device: torch.device; autocast is enabled only for CUDA.
        scaler: Optional gradient scaler used for the backward pass and step.
        class_weights: Optional 1-D tensor with one weight per class. When
            given, each sample's loss is multiplied by the weight of its target
            class before averaging.
        use_mixup: Mix each batch with a shuffled copy of itself.
        mixup_alpha: Both parameters of the Beta distribution that draws the
            mixing coefficient.

    Returns:
        (mean loss per batch, accuracy in percent). With MixUp the accuracy
        compares predictions on the mixed inputs against the unmixed labels,
        so it is only a rough indicator.
    """
    model.train()
    total_loss = 0.0
    correct = 0
    total = 0
    use_amp = (device.type == 'cuda')

    # Unreduced loss, so class weights can be applied to individual samples.
    # A mean-reduced loss is a scalar and cannot be weighted per sample.
    per_sample_ce = nn.CrossEntropyLoss(
        label_smoothing=getattr(model, 'label_smoothing', LABEL_SMOOTHING),
        reduction='none',
    )
    # Move the weights to the device once instead of once per batch.
    weight_tensor = class_weights.to(device) if class_weights is not None else None

    def _batch_loss(logits, targets):
        """Mean cross-entropy over the batch, class-weighted when weights are set."""
        losses = per_sample_ce(logits, targets)
        if weight_tensor is not None:
            losses = losses * weight_tensor[targets]
        return losses.mean()

    for batch in tqdm(loader, desc='Training', mininterval=TQDM_MININTERVAL, disable=(not SHOW_PROGRESS)):
        x = batch['pixel_values'].to(device, non_blocking=True)
        meta = batch['metadata'].to(device, non_blocking=True)
        y = batch['labels'].to(device, non_blocking=True)
        y = torch.clamp(y, 0, model.num_labels - 1)

        optimizer.zero_grad(set_to_none=True)

        if use_mixup:
            lam = np.random.beta(mixup_alpha, mixup_alpha)
            perm = torch.randperm(x.size(0), device=device)
            x_mixed = lam * x + (1 - lam) * x[perm]
            meta_mixed = lam * meta + (1 - lam) * meta[perm]
            targets_a, targets_b = y, y[perm]

            with torch.amp.autocast('cuda', enabled=use_amp):
                out = model(pixel_values=x_mixed, metadata=meta_mixed, labels=None)
                logits = out.logits
                # Each target's loss carries the weight of its own class.
                loss = lam * _batch_loss(logits, targets_a) + (1 - lam) * _batch_loss(logits, targets_b)
        elif weight_tensor is not None:
            with torch.amp.autocast('cuda', enabled=use_amp):
                out = model(pixel_values=x, metadata=meta, labels=None)
                logits = out.logits
                loss = _batch_loss(logits, y)
        else:
            # Unweighted, unmixed: use the loss computed by the model itself.
            with torch.amp.autocast('cuda', enabled=use_amp):
                out = model(pixel_values=x, metadata=meta, labels=y)
                loss = out.loss
                logits = out.logits

        if scaler is not None:
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
        else:
            loss.backward()
            optimizer.step()

        total_loss += float(loss.item())
        pred = torch.argmax(logits, dim=1)
        correct += int((pred == y).sum().item())
        total += int(y.size(0))

    return total_loss / max(1, len(loader)), 100.0 * correct / max(1, total)

@torch.no_grad()
def validate(model, loader, device):
    """Run one evaluation pass over ``loader`` with gradients disabled.

    Labels and predicted class ids are both clamped into
    ``[0, model.num_labels - 1]`` before they are compared.

    Returns:
        (mean loss per batch, accuracy in percent, predicted ids, label ids)
    """
    model.eval()
    autocast_on = device.type == 'cuda'
    top_label = model.num_labels - 1

    loss_sum = 0.0
    n_correct = 0
    n_seen = 0
    all_preds = []
    all_targets = []

    progress = tqdm(loader, desc='Validation', mininterval=TQDM_MININTERVAL, disable=(not SHOW_PROGRESS))
    for batch in progress:
        pixels = batch['pixel_values'].to(device, non_blocking=True)
        metadata = batch['metadata'].to(device, non_blocking=True)
        targets = batch['labels'].to(device, non_blocking=True).clamp(0, top_label)

        # Mixed precision is only switched on for CUDA devices.
        with torch.amp.autocast('cuda', enabled=autocast_on):
            out = model(pixel_values=pixels, metadata=metadata, labels=targets)
            batch_loss = out.loss

        loss_sum += float(batch_loss.item())
        batch_preds = out.logits.argmax(dim=1).clamp(0, top_label)
        n_correct += int((batch_preds == targets).sum().item())
        n_seen += int(targets.size(0))
        all_preds.extend(batch_preds.cpu().numpy().tolist())
        all_targets.extend(targets.cpu().numpy().tolist())

    # max(1, ...) keeps both ratios defined for an empty loader.
    mean_loss = loss_sum / max(1, len(loader))
    accuracy = 100.0 * n_correct / max(1, n_seen)
    return mean_loss, accuracy, all_preds, all_targets

print('Train/validate functions defined')


In [None]:
def two_phase_train(model, processor, model_tag, train_paths, train_labels,
                    val_paths, val_labels, checkpoint_prefix, class_weights=None):
    """
    Train ``model`` in two phases and restore the best checkpoint of this run.

    Phase 1 (warmup): ``model.freeze_backbone()`` is called and only the
    parameters that still require gradients are optimised with Adam at
    ``WARMUP_LR`` for ``WARMUP_EPOCHS`` epochs (MixUp disabled).

    Phase 2 (fine-tune): the first 70% of the backbone's parameter tensors
    stay frozen, the remainder plus the heads are trained with AdamW at
    ``FINETUNE_LR`` for up to ``FINETUNE_EPOCHS`` epochs. Validation accuracy
    drives a ReduceLROnPlateau scheduler and early stopping.

    Args:
        model: network exposing ``freeze_backbone``, ``unfreeze_backbone``,
            ``base_model``, ``metadata_mlp`` and (optionally) ``classifier``.
        processor: image processor handed to the dataset.
        model_tag: tag used to look up the input size in ``MODELS``
            (224 is used when the tag is not listed).
        train_paths, train_labels: training samples.
        val_paths, val_labels: validation samples.
        checkpoint_prefix: checkpoint is written to ``{checkpoint_prefix}_V17.pt``.
        class_weights: optional weights forwarded to ``train_epoch``.

    Returns:
        ``(model, best_path, best_acc)``. ``best_acc`` is the best validation
        accuracy (percent); it stays ``-1.0`` when no epoch was run.
    """
    print(f'\n{"="*60}')
    print(f'Two-Phase Training: {model_tag}')
    print(f'{"="*60}')

    model = model.to(device)
    img_size = next((s for _, t, s in MODELS if t == model_tag), 224)

    # Initialize classifier if needed (for EfficientNet)
    if hasattr(model, '_ensure_classifier_initialized') and model.classifier is None:
        dummy_input = torch.randn(1, 3, img_size, img_size).to(device)
        model._ensure_classifier_initialized(dummy_input)

    # Create datasets
    train_ds = EmojiDatasetWithMetadata(train_paths, train_labels, processor, model_tag, use_augmentation=True)
    val_ds = EmojiDatasetWithMetadata(val_paths, val_labels, processor, model_tag, use_augmentation=False)

    bs = BATCH_SIZE_CUDA if torch.cuda.is_available() else BATCH_SIZE_CPU
    train_loader = DataLoader(train_ds, batch_size=bs, shuffle=True,
                              num_workers=NUM_WORKERS, pin_memory=torch.cuda.is_available())
    val_loader = DataLoader(val_ds, batch_size=bs, shuffle=False,
                            num_workers=NUM_WORKERS, pin_memory=torch.cuda.is_available())

    # Loss scaling is only set up on CUDA devices without bf16 support.
    scaler = None
    if torch.cuda.is_available() and not torch.cuda.is_bf16_supported():
        scaler = torch.cuda.amp.GradScaler()

    best_acc = -1.0
    best_path = f'{checkpoint_prefix}_V17.pt'
    # Tracks whether THIS call wrote best_path, so a stale file left behind by
    # an earlier run is never loaded as if it were this run's best model.
    checkpoint_saved = False

    # ========================================
    # Phase 1: Warmup (Frozen Backbone)
    # ========================================
    print(f'\n[Phase 1] Warmup - Training head only (frozen backbone)')
    model.freeze_backbone()

    # Only train unfrozen parameters
    head_params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.Adam(head_params, lr=WARMUP_LR)

    for epoch in range(WARMUP_EPOCHS):
        print(f'\n[Warmup] Epoch {epoch+1}/{WARMUP_EPOCHS}')
        tr_loss, tr_acc = train_epoch(model, train_loader, optimizer, device, scaler, class_weights,
                                      use_mixup=False)
        va_loss, va_acc, _, _ = validate(model, val_loader, device)
        print(f'Train: loss={tr_loss:.4f} acc={tr_acc:.2f}% | Val: loss={va_loss:.4f} acc={va_acc:.2f}%')

        if va_acc > best_acc + 1e-6:
            best_acc = va_acc
            torch.save(model.state_dict(), best_path)
            checkpoint_saved = True
            print(f'✓ Saved {best_path}')

    # ========================================
    # Phase 2: Fine-tuning (Partial Unfreeze)
    # ========================================
    print(f'\n[Phase 2] Fine-tuning - Partial unfreeze')
    model.unfreeze_backbone()

    # Partial unfreeze: keep early 70% frozen for stability
    backbone_params = list(model.base_model.parameters())
    freeze_upto = int(len(backbone_params) * 0.7)
    for i, p in enumerate(backbone_params):
        p.requires_grad = (i >= freeze_upto)
    # Heads always trainable
    for p in model.metadata_mlp.parameters():
        p.requires_grad = True
    if getattr(model, 'classifier', None) is not None:
        for p in model.classifier.parameters():
            p.requires_grad = True

    # Param groups: backbone vs head
    head_params = list(model.metadata_mlp.parameters())
    if getattr(model, 'classifier', None) is not None:
        head_params += list(model.classifier.parameters())
    # A tensor may appear in only one optimizer group. Should a head module
    # also be reachable through base_model, keep it in the head group only.
    head_ids = {id(p) for p in head_params}
    backbone_trainable = [
        p for p in model.base_model.parameters()
        if p.requires_grad and id(p) not in head_ids
    ]

    optimizer = torch.optim.AdamW([
        {'params': head_params, 'weight_decay': HEAD_WEIGHT_DECAY},
        {'params': backbone_trainable, 'weight_decay': WEIGHT_DECAY},
    ], lr=FINETUNE_LR)
    # ReduceLROnPlateau lowers the LR only after MORE than `patience` epochs
    # without improvement. With patience equal to the early-stopping patience
    # training would stop before the LR could ever drop, so use roughly half
    # of it: the LR decays first and at least one epoch runs at the lower rate.
    scheduler_patience = max(0, (EARLY_STOPPING_PATIENCE - 1) // 2)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode='max', factor=0.5, patience=scheduler_patience, min_lr=1e-7
    )

    bad_epochs = 0
    for epoch in range(FINETUNE_EPOCHS):
        print(f'\n[Fine-tune] Epoch {epoch+1}/{FINETUNE_EPOCHS}')
        tr_loss, tr_acc = train_epoch(model, train_loader, optimizer, device, scaler, class_weights,
                                      use_mixup=USE_MIXUP, mixup_alpha=MIXUP_ALPHA)
        va_loss, va_acc, _, _ = validate(model, val_loader, device)

        scheduler.step(va_acc)
        current_lr = optimizer.param_groups[0]['lr']
        print(f'Train: loss={tr_loss:.4f} acc={tr_acc:.2f}% | Val: loss={va_loss:.4f} acc={va_acc:.2f}% | LR: {current_lr:.2e}')

        if va_acc > best_acc + 1e-6:
            best_acc = va_acc
            bad_epochs = 0
            torch.save(model.state_dict(), best_path)
            checkpoint_saved = True
            print(f'✓ Saved {best_path}')
        else:
            bad_epochs += 1
            if bad_epochs >= EARLY_STOPPING_PATIENCE:
                print(f'Early stopping after {EARLY_STOPPING_PATIENCE} epochs without improvement')
                break

    # Load best weights (only when this run actually produced a checkpoint)
    if checkpoint_saved:
        model.load_state_dict(torch.load(best_path, map_location=device))
    else:
        print(f'No checkpoint was written to {best_path}; keeping current weights')
    print(f'\n✓ Two-phase training complete! Best val accuracy: {best_acc:.2f}%')

    return model, best_path, best_acc

print('Two-phase training function defined')


In [None]:
def prepare_hf_dataset_with_mapping(dataset_path: str) -> Tuple[List[str], List[int]]:
    """
    Collect image paths and target-class labels from the pre-training dataset.

    Every key of ``HF_TO_TARGET_MAPPING`` is treated as a vendor folder name
    directly under ``dataset_path``; its value is the target vendor, which is
    kept only when present in ``VENDOR_TO_IDX``. If no vendor folder yields an
    image, all images below ``dataset_path`` are scanned recursively and the
    first mapping key found (case-insensitively) in the file name or the
    parent folder name decides the label.

    Files are matched by suffix case-insensitively and returned in sorted
    order per folder, so the result - and any seeded split built from it -
    is reproducible across runs and filesystems.

    Args:
        dataset_path: root directory of the downloaded dataset.

    Returns:
        ``(image_paths, labels)``: two parallel lists of path strings and
        integer class indices.
    """
    root = Path(dataset_path)
    image_suffixes = {'.png', '.jpg', '.jpeg'}

    def _is_image(candidate: Path) -> bool:
        # Lower-casing the suffix accepts .PNG/.Jpg/... with one pass and
        # cannot report a file twice on case-insensitive filesystems.
        return candidate.is_file() and candidate.suffix.lower() in image_suffixes

    # (source vendor folder, class index) pairs, in mapping order.
    vendor_targets = [
        (hf_vendor, VENDOR_TO_IDX[target_vendor])
        for hf_vendor, target_vendor in HF_TO_TARGET_MAPPING.items()
        if target_vendor in VENDOR_TO_IDX
    ]

    image_paths: List[str] = []
    labels: List[int] = []

    # HF dataset has vendor folders
    for hf_vendor, class_idx in vendor_targets:
        vendor_dir = root / hf_vendor
        if not vendor_dir.is_dir():
            continue
        for img_path in sorted(p for p in vendor_dir.iterdir() if _is_image(p)):
            image_paths.append(str(img_path))
            labels.append(class_idx)

    # Fallback: scan all images if no vendor folders found
    if not image_paths and root.is_dir():
        for img_path in sorted(p for p in root.rglob('*') if _is_image(p)):
            filename = img_path.name.lower()
            parent_dir = img_path.parent.name.lower()
            for hf_vendor, class_idx in vendor_targets:
                key = hf_vendor.lower()
                if key in filename or key in parent_dir:
                    image_paths.append(str(img_path))
                    labels.append(class_idx)
                    break

    print(f'Loaded {len(image_paths)} images from HuggingFace dataset')
    if labels:
        label_counts = np.bincount(np.array(labels), minlength=NUM_CLASSES)
        print(f'Label distribution: {label_counts}')

    return image_paths, labels


def prepare_dataset_from_csv(train_dir: Path, csv_path: Path) -> Tuple[List[str], List[int]]:
    """Build (image paths, class indices) from a CSV with ``Id`` and ``Label`` columns.

    Each ``Id`` is zero-padded to five characters and looked up in
    ``train_dir`` under a fixed list of image extensions (first hit wins).
    Rows whose lower-cased label is not one of ``VENDOR_CLASSES`` and rows
    without a matching file are skipped and counted in the printed summary.

    Raises:
        FileNotFoundError: if ``train_dir`` or ``csv_path`` does not exist.
    """
    train_dir, csv_path = Path(train_dir), Path(csv_path)

    if not (train_dir.exists() and csv_path.exists()):
        raise FileNotFoundError(f'Missing train_dir or csv: {train_dir} / {csv_path}')

    frame = pd.read_csv(csv_path)
    vendor_index = {name: VENDOR_TO_IDX[name] for name in VENDOR_CLASSES}
    suffixes = ('.png', '.jpg', '.jpeg', '.PNG', '.JPG', '.JPEG')

    kept_paths = []
    kept_labels = []
    n_missing = 0
    n_unmapped = 0

    for _, record in frame.iterrows():
        stem = str(record['Id']).zfill(5)
        vendor = str(record['Label']).lower()

        if vendor not in vendor_index:
            n_unmapped += 1
            continue

        # First existing candidate in extension order, or None.
        candidates = (train_dir / f'{stem}{suffix}' for suffix in suffixes)
        hit = next((c for c in candidates if c.exists()), None)

        if hit is None:
            n_missing += 1
            continue

        kept_paths.append(str(hit))
        kept_labels.append(int(vendor_index[vendor]))

    print(f'Loaded: {len(kept_paths)} images')
    print(f'Unmapped labels: {n_unmapped}, Missing files: {n_missing}')
    if kept_labels:
        print(f'Label distribution: {np.bincount(np.array(kept_labels), minlength=NUM_CLASSES)}')

    return kept_paths, kept_labels

print('Data loading functions defined')


In [None]:
def pretrain_on_hf_dataset(model_id: str, tag: str, class_weights=None):
    """
    Pre-train one architecture on the external emoji dataset.

    The dataset identified by ``HF_DATASET_ID`` is fetched with ``kagglehub``,
    mapped to the target classes, split 90/10 into train/validation and
    passed to ``two_phase_train``.

    Args:
        model_id: identifier forwarded to ``create_model``.
        tag: short model tag; also used in the checkpoint name
            ``hf_pretrained_{tag}``.
        class_weights: optional class weights forwarded to ``two_phase_train``.

    Returns:
        ``(model, processor, checkpoint_path, best_acc)``

    Raises:
        ValueError: if no images were found in the downloaded dataset.
    """
    seed_everything(RANDOM_STATE)

    print(f'\n{"="*60}')
    print(f'Pre-training {tag} on HuggingFace dataset')
    print(f'{"="*60}')

    # Download HF dataset
    hf_path = kagglehub.dataset_download(HF_DATASET_ID)
    print(f'HuggingFace dataset path: {hf_path}')

    hf_paths, hf_labels = prepare_hf_dataset_with_mapping(hf_path)

    if len(hf_paths) == 0:
        raise ValueError('No images found in HuggingFace dataset')

    # Split HF dataset
    labels_arr = np.array(hf_labels)
    class_counts = np.bincount(labels_arr, minlength=NUM_CLASSES)
    # Stratification only needs >= 2 samples for each class that actually
    # occurs. Target classes missing from this dataset have count 0 and must
    # not switch stratification off.
    present_counts = class_counts[class_counts > 0]
    can_stratify = bool(present_counts.size > 0 and present_counts.min() >= 2)

    hf_train_paths, hf_val_paths, hf_train_y, hf_val_y = train_test_split(
        hf_paths, hf_labels,
        test_size=0.1, random_state=RANDOM_STATE,
        stratify=hf_labels if can_stratify else None
    )

    print(f'HF Train: {len(hf_train_paths)}, HF Val: {len(hf_val_paths)}')

    # Create model
    model, processor = create_model(model_id, tag)

    # Two-phase training on HF dataset
    checkpoint_prefix = f'hf_pretrained_{tag}'
    model, best_path, best_acc = two_phase_train(
        model, processor, tag,
        hf_train_paths, hf_train_y,
        hf_val_paths, hf_val_y,
        checkpoint_prefix, class_weights
    )

    return model, processor, best_path, best_acc

print('HF pre-training function defined')


In [None]:
def kfold_training_pipeline(all_paths: List[str], all_labels: List[int], 
                            pretrained_checkpoints: Dict[str, str] = None):
    """
    Stratified K-fold training of every architecture listed in ``MODELS``.

    For each of the ``NUM_FOLDS`` folds a fresh model per architecture is
    created, optionally initialised from ``pretrained_checkpoints[tag]`` when
    that file exists, and trained with ``two_phase_train``.

    Returns: list of dicts with the keys 'model', 'processor', 'tag',
    'model_id', 'fold' (1-based), 'checkpoint' and 'val_acc'.
    """
    print(f'\n{"="*60}')
    print(f'K-Fold Training Pipeline ({NUM_FOLDS} folds x {len(MODELS)} models)')
    print(f'{"="*60}')

    # Class weights are computed once from all labels and shared by every fold.
    class_weights = compute_class_weights(all_labels)
    splitter = StratifiedKFold(n_splits=NUM_FOLDS, shuffle=True, random_state=RANDOM_STATE)

    def _take(items, indices):
        return [items[i] for i in indices]

    results = []

    for fold_no, (train_idx, val_idx) in enumerate(splitter.split(all_paths, all_labels), start=1):
        print(f'\n{"="*60}')
        print(f'FOLD {fold_no}/{NUM_FOLDS}')
        print(f'{"="*60}')

        fold_train_paths = _take(all_paths, train_idx)
        fold_train_labels = _take(all_labels, train_idx)
        fold_val_paths = _take(all_paths, val_idx)
        fold_val_labels = _take(all_labels, val_idx)

        print(f'Fold {fold_no}: Train={len(fold_train_paths)}, Val={len(fold_val_paths)}')

        for model_id, tag, _img_size in MODELS:
            print(f'\n--- Training {tag} (fold {fold_no}) ---')

            model, processor = create_model(model_id, tag)

            # Warm-start from the pre-trained checkpoint when one exists on disk.
            if (pretrained_checkpoints and tag in pretrained_checkpoints
                    and os.path.exists(pretrained_checkpoints[tag])):
                source_ckpt = pretrained_checkpoints[tag]
                model.load_state_dict(torch.load(source_ckpt, map_location=device))
                print(f'✓ Loaded pre-trained weights from {source_ckpt}')

            model, best_path, best_acc = two_phase_train(
                model, processor, tag,
                fold_train_paths, fold_train_labels,
                fold_val_paths, fold_val_labels,
                f'fold{fold_no}_{tag}', class_weights
            )

            results.append(dict(
                model=model,
                processor=processor,
                tag=tag,
                model_id=model_id,
                fold=fold_no,
                checkpoint=best_path,
                val_acc=best_acc,
            ))

            # Park the finished model on the CPU to free GPU memory.
            if device.type == 'cuda':
                model.to('cpu')
                torch.cuda.empty_cache()

    print(f'\n{"="*60}')
    print(f'K-Fold Training Complete!')
    print(f'Total trained models: {len(results)}')
    print(f'{"="*60}')

    # Per-model summary
    for entry in results:
        print(f"  {entry['tag']} fold{entry['fold']}: {entry['val_acc']:.2f}%")

    return results

print('K-Fold training pipeline defined')


In [None]:
def _prob_cols_for_members(trained_models: List[Dict], num_augmentations: int) -> List[str]:
    """Generate column names for probability features (no folds in V17)."""
    cols = []
    for m in trained_models:
        tag = m['tag']
        for i in range(num_augmentations):
            for c in range(NUM_CLASSES):
                cols.append(f'prob_{tag}_aug{i}_cls{c}')
    return cols

@torch.no_grad()
def build_features_batched(image_paths: List[str], trained_models: List[Dict], 
                           num_augmentations: int, batch_size: int = None) -> pd.DataFrame:
    """
    Build the meta-model feature table: image stats plus per-model TTA probabilities.

    Steps:
      1. Load every image once and compute its stats and metadata vector.
      2. For each input size, generate the TTA augmentations once, then run
         every model of that size over all images in batches.
      3. Assemble one row per image with the columns
         ``STAT_COLS + prob_{tag}_aug{a}_cls{c}``.

    Args:
        image_paths: images to featurise, in output row order.
        trained_models: dicts with at least 'model', 'processor' and 'tag'.
        num_augmentations: number of TTA views per image.
        batch_size: images per forward batch; defaults to ``FEATURE_BATCH_SIZE``.

    Returns:
        DataFrame with one row per image. Expected columns that were never
        produced are filled with 0.0.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1.
    """
    if batch_size is None:
        batch_size = FEATURE_BATCH_SIZE
    if batch_size < 1:
        raise ValueError(f'batch_size must be >= 1, got {batch_size}')

    num_images = len(image_paths)

    # Step 1: Pre-load all images and extract stats
    print('Pre-loading images and extracting stats...')
    all_stats = []
    all_images = []
    # Metadata depends only on the image, so compute it once here instead of
    # once per image for every model.
    all_meta = []

    for img_path in tqdm(image_paths, desc='Loading images', disable=(not SHOW_PROGRESS)):
        all_stats.append(extract_image_properties(img_path))
        all_images.append((load_image_rgb(img_path), img_path))
        meta = extract_metadata(img_path)
        all_meta.append([meta['height_norm'], meta['width_norm'], meta['has_alpha']])

    # Step 2: Process each model
    model_probs = {}  # tag -> array of shape (num_images, num_augmentations, n_outputs)

    # Group models by image size for efficient augmentation caching
    models_by_size = {}
    for m in trained_models:
        model_config = next((cfg for cfg in MODELS if cfg[1] == m['tag']), None)
        img_size = model_config[2] if model_config else 224
        models_by_size.setdefault(img_size, []).append(m)

    for img_size, size_models in models_by_size.items():
        print(f'\nProcessing models with image size {img_size}x{img_size}...')

        # Generate augmentations for this size
        tta = MultiScaleTTA(base_image_size=img_size)
        all_augmentations = []
        for img, img_path in tqdm(all_images, desc=f'Generating {img_size}px augmentations', disable=(not SHOW_PROGRESS)):
            augs = tta.get_augmentations(img, num_augmentations=num_augmentations, seed_source=img_path)
            all_augmentations.append(augs)

        # Process each model of this size
        for m in tqdm(size_models, desc=f'Processing {img_size}px models', disable=(not SHOW_PROGRESS)):
            model = m['model']
            processor = m['processor']
            tag = m['tag']

            # Load model to GPU
            model.to(device)
            model.eval()

            batch_probs = []

            for start in range(0, num_images, batch_size):
                batch_end = min(start + batch_size, num_images)
                B = batch_end - start

                # Flatten to B * num_augmentations views, image-major, with the
                # image's metadata row repeated once per view.
                batch_augs = [aug for augs in all_augmentations[start:batch_end] for aug in augs]
                batch_meta = [
                    meta_row
                    for meta_row in all_meta[start:batch_end]
                    for _ in range(num_augmentations)
                ]

                # Process batch
                inputs = processor(batch_augs, return_tensors='pt')
                x = inputs['pixel_values'].to(device)
                meta_tensor = torch.tensor(batch_meta, dtype=torch.float32).to(device)

                out = model(pixel_values=x, metadata=meta_tensor)
                probs = torch.softmax(out.logits, dim=-1)
                probs = probs.view(B, num_augmentations, -1).detach().cpu().numpy()
                batch_probs.append(probs)

            # One array per model, indexed directly by image position.
            if batch_probs:
                model_probs[tag] = np.concatenate(batch_probs, axis=0)
            else:
                model_probs[tag] = np.zeros((0, num_augmentations, NUM_CLASSES), dtype=np.float32)

            # Move model back to CPU
            if device.type == 'cuda':
                model.to('cpu')
                torch.cuda.empty_cache()

    # Step 3: Combine stats and probabilities into final feature matrix
    print('Combining features...')
    all_rows = []

    for img_idx, stats in enumerate(all_stats):
        row = dict(stats)

        for m in trained_models:
            tag = m['tag']
            img_probs = model_probs[tag][img_idx]

            for a in range(num_augmentations):
                for c in range(NUM_CLASSES):
                    row[f'prob_{tag}_aug{a}_cls{c}'] = float(img_probs[a, c])

        all_rows.append(row)

    df = pd.DataFrame(all_rows)
    prob_cols = _prob_cols_for_members(trained_models, num_augmentations)
    all_cols = STAT_COLS + prob_cols

    # Guarantee a stable schema even when a column was never produced.
    for col in all_cols:
        if col not in df.columns:
            df[col] = 0.0

    return df[all_cols]

print('Feature extraction function defined')
print(f'Expected features: {len(STAT_COLS)} stats + {len(MODELS) * NUM_TTA_AUGS * NUM_CLASSES} probabilities')


In [None]:
def pseudo_label_training(trained_models: List[Dict], meta_model, 
                          train_paths: List[str], train_labels: List[int],
                          test_paths: List[str]):
    """
    Pseudo-labeling: use confident test predictions as additional training data.

    1. Extract features for the test set and predict with ``meta_model``.
    2. Keep predictions whose top probability is >= ``PSEUDO_LABEL_CONFIDENCE``.
    3. Append them to the labelled training data.
    4. Continue training every model for ``PSEUDO_LABEL_EPOCHS`` epochs at
       ``PSEUDO_LABEL_LR`` and save it as ``pseudo_{tag}_V17.pt``.

    Args:
        trained_models: dicts with 'model', 'processor', 'tag', 'model_id'
            and optionally 'val_acc'.
        meta_model: fitted classifier exposing ``predict_proba``.
        train_paths, train_labels: labelled training samples.
        test_paths: unlabelled images to pseudo-label.

    Returns:
        A new list of model dicts pointing at the pseudo-label checkpoints,
        or ``trained_models`` unchanged when the test set is empty or fewer
        than 10 predictions are confident enough.
    """
    print(f'\n{"="*60}')
    print(f'Pseudo-Labeling (confidence > {PSEUDO_LABEL_CONFIDENCE})')
    print(f'{"="*60}')

    # Nothing to label; also avoids dividing by zero in the summary below.
    if len(test_paths) == 0:
        print('No test samples available for pseudo-labeling. Skipping.')
        return trained_models

    # Step 1: Extract features for test set
    print('\nExtracting features for test set...')
    X_test = build_features_batched(test_paths, trained_models, NUM_TTA_AUGS, batch_size=FEATURE_BATCH_SIZE)

    # Step 2: Get predictions and confidence
    test_probs = meta_model.predict_proba(X_test)
    best_cols = test_probs.argmax(axis=1)
    # predict_proba columns follow meta_model.classes_. If a class was missing
    # from the meta-model's training labels, the column index is not the class
    # id, so translate through classes_ whenever it is available.
    class_ids = getattr(meta_model, 'classes_', None)
    test_preds = np.asarray(class_ids)[best_cols] if class_ids is not None else best_cols
    test_confidence = test_probs.max(axis=1)

    # Step 3: Filter high-confidence predictions
    confident_mask = test_confidence >= PSEUDO_LABEL_CONFIDENCE
    num_confident = confident_mask.sum()

    print(f'Test samples: {len(test_paths)}')
    print(f'Confident predictions (>={PSEUDO_LABEL_CONFIDENCE}): {num_confident} ({100*num_confident/len(test_paths):.1f}%)')

    if num_confident < 10:
        print('Too few confident predictions for pseudo-labeling. Skipping.')
        return trained_models

    # Get pseudo-labeled data
    confident_idx = np.flatnonzero(confident_mask)
    pseudo_paths = [test_paths[i] for i in confident_idx]
    pseudo_labels = [int(test_preds[i]) for i in confident_idx]

    # Show pseudo-label distribution
    pseudo_dist = np.bincount(pseudo_labels, minlength=NUM_CLASSES)
    print(f'Pseudo-label distribution: {pseudo_dist}')

    # Step 4: Combine with original training data
    # list(...) makes '+' concatenate even if a caller passes arrays or tuples.
    combined_paths = list(train_paths) + pseudo_paths
    combined_labels = [int(lab) for lab in train_labels] + pseudo_labels

    print(f'Combined training set: {len(combined_paths)} samples')

    # Step 5: Retrain models with pseudo-labeled data (fewer epochs)
    print('\nRetraining with pseudo-labeled data...')

    class_weights = compute_class_weights(combined_labels)

    updated_models = []
    for m in trained_models:
        model = m['model']
        processor = m['processor']
        tag = m['tag']

        print(f'\n--- Retraining {tag} with pseudo-labels ---')

        model = model.to(device)

        # Create dataset with combined data
        train_ds = EmojiDatasetWithMetadata(
            combined_paths, combined_labels, processor, tag, use_augmentation=True
        )
        bs = BATCH_SIZE_CUDA if torch.cuda.is_available() else BATCH_SIZE_CPU
        train_loader = DataLoader(train_ds, batch_size=bs, shuffle=True,
                                  num_workers=NUM_WORKERS, pin_memory=torch.cuda.is_available())

        # Train with low LR
        optimizer = torch.optim.AdamW(model.parameters(), lr=PSEUDO_LABEL_LR, weight_decay=WEIGHT_DECAY)
        scaler = None
        if torch.cuda.is_available() and not torch.cuda.is_bf16_supported():
            scaler = torch.cuda.amp.GradScaler()

        # No validation pass here: every epoch only reports training metrics.
        for epoch in range(PSEUDO_LABEL_EPOCHS):
            tr_loss, tr_acc = train_epoch(model, train_loader, optimizer, device, scaler, class_weights,
                                          use_mixup=USE_MIXUP, mixup_alpha=MIXUP_ALPHA)
            print(f'  Epoch {epoch+1}/{PSEUDO_LABEL_EPOCHS}: loss={tr_loss:.4f} acc={tr_acc:.2f}%')

        # Save updated checkpoint
        checkpoint_path = f'pseudo_{tag}_V17.pt'
        torch.save(model.state_dict(), checkpoint_path)

        updated_models.append({
            'model': model,
            'processor': processor,
            'tag': tag,
            'model_id': m['model_id'],
            'checkpoint': checkpoint_path,
            # Carried over from before pseudo-labeling; not re-measured here.
            'val_acc': m.get('val_acc')
        })

        if device.type == 'cuda':
            model = model.to('cpu')
            torch.cuda.empty_cache()

    print(f'\n✓ Pseudo-labeling complete! {len(updated_models)} models updated')

    return updated_models

print('Pseudo-labeling function defined')


In [None]:
# ============================================================================
# STEP 1: Load Target Dataset
# ============================================================================
print('='*60, 'STEP 1: Loading Target Dataset', '='*60, sep='\n')

# Labelled target data: parallel lists of image paths and class indices.
all_paths, all_labels = prepare_dataset_from_csv(
    SECOND_DATASET_TRAIN_DIR,
    SECOND_DATASET_CSV_PATH,
)

# Summary: sample count and per-class counts
print(f'\nTotal samples: {len(all_paths)}')
print(f'Class distribution: {np.bincount(np.array(all_labels), minlength=NUM_CLASSES)}')


In [None]:
# ============================================================================
# STEP 2: Pre-train Models on HuggingFace Dataset
# ============================================================================
print('='*60, 'STEP 2: Pre-training on HuggingFace Dataset', '='*60, sep='\n')

# tag -> path of the checkpoint produced by pre-training
pretrained_checkpoints = {}

for model_id, tag, img_size in MODELS:
    print(f'\n--- Pre-training {tag} ---')
    model, processor, checkpoint, best_acc = pretrain_on_hf_dataset(model_id, tag)
    pretrained_checkpoints[tag] = checkpoint

    # Only the checkpoint path is kept; release GPU memory before the next model.
    if device.type == 'cuda':
        model = model.to('cpu')
        torch.cuda.empty_cache()

print(f'\n✓ Pre-trained checkpoints: {list(pretrained_checkpoints)}')


In [None]:
# ============================================================================
# STEP 3: Train on Target Dataset (No K-Fold)
# ============================================================================
print('='*60)
print('STEP 3: Training on Target Dataset (No K-Fold)')
print('='*60)

# Stratified split
val_size = 0.10
# Stratification needs >= 2 samples for every class that actually occurs.
# A class with no samples at all (count 0) must not disable it, so the
# minimum is taken over the present classes only.
_class_counts = np.bincount(np.array(all_labels), minlength=NUM_CLASSES)
_present_counts = _class_counts[_class_counts > 0]
min_count = _present_counts.min() if _present_counts.size else 0
can_stratify = bool(min_count >= 2)
train_paths, val_paths, train_y, val_y = train_test_split(
    all_paths, all_labels, test_size=val_size, random_state=RANDOM_STATE,
    stratify=all_labels if can_stratify else None
)
print(f'Train: {len(train_paths)}, Val: {len(val_paths)}')
print('Train dist:', np.bincount(np.array(train_y), minlength=NUM_CLASSES))
print('Val   dist:', np.bincount(np.array(val_y), minlength=NUM_CLASSES))

# Class weights come from the training split only.
class_weights = compute_class_weights(train_y)

trained_models = []
for model_id, tag, img_size in MODELS:
    print(f'\n--- Training {tag} ---')
    model, processor = create_model(model_id, tag)
    
    # Load HF pretrained weights if available
    if tag in pretrained_checkpoints and os.path.exists(pretrained_checkpoints[tag]):
        model.load_state_dict(torch.load(pretrained_checkpoints[tag], map_location=device))
        print(f'✓ Loaded HF pretrained weights from {pretrained_checkpoints[tag]}')
    
    checkpoint_prefix = f'finetuned_{tag}'
    model, best_path, best_acc = two_phase_train(
        model, processor, tag,
        train_paths, train_y,
        val_paths, val_y,
        checkpoint_prefix, class_weights
    )
    
    trained_models.append({
        'model': model,
        'processor': processor,
        'tag': tag,
        'model_id': model_id,
        'checkpoint': best_path,
        'val_acc': best_acc
    })
    
    # Move to CPU to save GPU memory
    if device.type == 'cuda':
        model = model.to('cpu')
        torch.cuda.empty_cache()

print(f'\n✓ Total trained models: {len(trained_models)}')


In [None]:
# ============================================================================
# STEP 4: Build Features and Train LightGBM Meta-Model
# ============================================================================
print('='*60)
print('STEP 4: Training LightGBM Meta-Model')
print('='*60)

# Fail fast: check for LightGBM before the expensive feature extraction
# rather than after it.
if not (USE_LIGHTGBM and HAS_LGB):
    raise RuntimeError('LightGBM not available!')

# Reuse the Step 3 train/val split instead of drawing a new one. This keeps
# the meta-model's validation images identical to the ones the base models
# never trained on, even when Step 3 had to fall back to a non-stratified
# split (an unconditional stratified re-split could raise or differ there).
# NOTE: the meta-model is still fitted on features of images the base models
# were trained on (no out-of-fold predictions), so its training features are
# optimistic; only the validation score below is held out.
meta_train_paths, meta_val_paths = train_paths, val_paths
meta_train_y, meta_val_y = train_y, val_y

print(f'Meta train: {len(meta_train_paths)}, Meta val: {len(meta_val_paths)}')

# Extract features
print('\nExtracting training features...')
X_train = build_features_batched(meta_train_paths, trained_models, NUM_TTA_AUGS, batch_size=FEATURE_BATCH_SIZE)
print('\nExtracting validation features...')
X_val = build_features_batched(meta_val_paths, trained_models, NUM_TTA_AUGS, batch_size=FEATURE_BATCH_SIZE)

y_train = np.array(meta_train_y)
y_val = np.array(meta_val_y)

print(f'\nX_train: {X_train.shape}, X_val: {X_val.shape}')

# Train LightGBM meta-model
print('\nTraining LightGBM meta-model...')
meta_model = lgb.LGBMClassifier(**LGB_PARAMS)
meta_model.fit(X_train, y_train)

# Evaluate
val_pred = meta_model.predict(X_val)
val_acc = accuracy_score(y_val, val_pred) * 100.0

print(f'\nMeta-model validation accuracy: {val_acc:.2f}%')
print('\nClassification Report:')
# Passing labels explicitly keeps target_names aligned with the class ids and
# avoids a ValueError when a class is absent from the validation data.
print(classification_report(
    y_val, val_pred,
    labels=list(range(NUM_CLASSES)),
    target_names=VENDOR_CLASSES,
    zero_division=0,
))

# Save model
meta_model.booster_.save_model('meta_lgb_v17.txt')
print('✓ Saved meta_lgb_v17.txt')


In [None]:
# ============================================================================
# STEP 5: Load Test Data
# ============================================================================
print('='*60, 'STEP 5: Loading Test Data', '='*60, sep='\n')

test_dir = SECOND_DATASET_TEST_DIR
if not test_dir.exists():
    raise FileNotFoundError(f'Missing test dir: {test_dir}')

# Recursive search per extension; the set drops paths matched more than once
# and sorting fixes the order of the test images.
test_paths = sorted({
    str(found)
    for suffix in ('.png', '.jpg', '.jpeg', '.PNG', '.JPG', '.JPEG')
    for found in test_dir.rglob(f'*{suffix}')
})

print(f'Found {len(test_paths)} test images')


In [None]:
# ============================================================================
# STEP 6: Pseudo-Labeling (Optional - Set to True to enable)
# ============================================================================
ENABLE_PSEUDO_LABELING = True

if not ENABLE_PSEUDO_LABELING:
    print('Pseudo-labeling disabled. Skipping...')
else:
    print('='*60, 'STEP 6: Pseudo-Labeling', '='*60, sep='\n')

    # Continue training the base models on the full labelled set plus the
    # confidently predicted test images; the returned list replaces the old one.
    trained_models = pseudo_label_training(
        trained_models=trained_models,
        meta_model=meta_model,
        train_paths=all_paths,
        train_labels=all_labels,
        test_paths=test_paths,
    )


In [None]:
# ============================================================================
# STEP 7: Train Final Meta-Model on All Data
# ============================================================================
print('='*60, 'STEP 7: Training Final Meta-Model on All Data', '='*60, sep='\n')

# Features for every labelled image, from the current base models
print('\nExtracting features for all training data...')
X_all = build_features_batched(
    all_paths,
    trained_models,
    NUM_TTA_AUGS,
    batch_size=FEATURE_BATCH_SIZE,
)
y_all = np.array(all_labels)

print(f'\nX_all: {X_all.shape}')

# Final meta-model: same hyperparameters as Step 4, fitted on all labelled data
print('\nTraining final LightGBM meta-model...')
meta_final = lgb.LGBMClassifier(**LGB_PARAMS)
meta_final.fit(X_all, y_all)

# Persist the underlying booster
_final_model_file = 'meta_lgb_v17_final.txt'
meta_final.booster_.save_model(_final_model_file)
print('✓ Saved meta_lgb_v17_final.txt')


In [None]:
# ============================================================================
# STEP 8: Generate Test Predictions
# ============================================================================
print('='*60, 'STEP 8: Generating Test Predictions', '='*60, sep='\n')

# Features for the test images
print('\nExtracting features for test set...')
X_test = build_features_batched(test_paths, trained_models, NUM_TTA_AUGS, batch_size=FEATURE_BATCH_SIZE)

print(f'\nX_test: {X_test.shape}')

# Class predictions and class probabilities from the final meta-model
print('\nGenerating predictions...')
test_preds = meta_final.predict(X_test)
test_probs = meta_final.predict_proba(X_test)

# Confidence = highest class probability per image
test_confidence = test_probs.max(axis=1)
avg_confidence = test_confidence.mean()
print(f'Average prediction confidence: {avg_confidence:.4f}')

# Submission columns: image id is the file stem, label is the vendor name of
# the predicted index (clamped into the valid class range).
pred_ids = [Path(img_path).stem for img_path, _ in zip(test_paths, test_preds)]
pred_labels = [
    IDX_TO_VENDOR[max(0, min(int(pred), NUM_CLASSES - 1))]
    for _, pred in zip(test_paths, test_preds)
]

# Prediction distribution per class
pred_dist = np.bincount(test_preds.astype(int), minlength=NUM_CLASSES)
print(f'\nPrediction distribution:')
for cls, count in zip(VENDOR_CLASSES, pred_dist):
    print(f'  {cls:12s}: {count} ({100*count/len(test_preds):.1f}%)')

# Write the submission file: header plus one "Id,Label" line per image
out_path = Path(PREDICTIONS_OUTPUT_FILE)
with out_path.open('w') as f:
    f.write('Id,Label\n')
    f.writelines(
        f'{str(img_id).strip()},{label}\n'
        for img_id, label in zip(pred_ids, pred_labels)
    )

print(f'\n✓ Saved predictions to: {out_path}')
print(f'  Total predictions: {len(pred_labels)}')


## Section 6: Dataset Class with Metadata
