# Fine-tuning ConvNeXtV2 with Binary Classifiers + XGBoost Ensemble (V6)

This notebook trains one binary (one-vs-rest) classifier per class, then uses XGBoost to combine the classifiers' outputs into a final prediction.
**Optimized for second dataset classes: Apple, Google, Facebook, Samsung**

## Install and Import

In [None]:
# Install required packages
%pip install -q kagglehub transformers torch torchvision pillow datasets accelerate pandas scikit-learn xgboost

import kagglehub
import os
from pathlib import Path
import torch
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import json
from transformers import AutoImageProcessor, AutoModelForImageClassification
from transformers.modeling_outputs import ImageClassifierOutput
import torch.nn as nn
from datasets import Dataset as HFDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import numpy as np
from tqdm import tqdm
import torchvision.transforms as transforms
import torchvision.transforms.functional as F
import pandas as pd
import random
import hashlib
import xgboost as xgb

# Device selection: prefer CUDA when PyTorch can see a GPU, otherwise use the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

if not torch.cuda.is_available():
    print("WARNING: CUDA not available. Training will be slow on CPU.")
else:
    # Report the first visible GPU and release any cached allocator blocks.
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"CUDA Version: {torch.version.cuda}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")
    torch.cuda.empty_cache()

## Deterministic Augmentation System

In [None]:
# Unified Deterministic Augmentation System
# Same augmentation methods used for both training and TTA inference
# Deterministic based on image hash/index for reproducibility

import hashlib

class DeterministicAugmentation:
    """
    Unified deterministic augmentation system for both training and TTA.

    The same primitive operations (flip, rotation, crops, colour jitter,
    translation, blur) back both ``get_augmentations`` (test-time views) and
    ``apply_training_augmentation`` (one augmented sample per call).

    Seeds are derived with MD5 rather than the built-in ``hash()``: string
    hashing is salted per interpreter process, so ``hash(str(x))`` does not
    give the same value from one run to the next. Random parameters are drawn
    from a private ``np.random.RandomState`` so the global NumPy RNG state is
    never modified.
    """

    def __init__(self, image_size=224, seed=42):
        """
        Args:
            image_size: Side length (pixels) of the square output images.
            seed: Stored on the instance; not read by any method of this class.
        """
        self.image_size = image_size
        self.seed = seed

        # Augmentation parameters shared by training and TTA.
        self.rotation_angles = [-10, -5, 5, 10]  # Fixed rotation angles (degrees)
        self.crop_ratios = [0.75, 0.85, 0.9, 0.95]  # Fixed crop ratios
        self.color_jitter_params = {
            'brightness': 0.3,
            'contrast': 0.3,
            'saturation': 0.3,
            'hue': 0.1
        }
        self.translate_range = (0.1, 0.1)  # Max |shift| as a fraction of width / height
        self.blur_sigma = (0.1, 0.5)

    @staticmethod
    def _stable_hash(value):
        """Return a 32-bit integer from the MD5 of ``str(value)`` (stable across runs)."""
        digest = hashlib.md5(str(value).encode("utf-8")).hexdigest()
        return int(digest[:8], 16)

    def _resize(self, image):
        """Resize a PIL image to ``image_size`` x ``image_size`` with bilinear filtering."""
        return image.resize((self.image_size, self.image_size), Image.BILINEAR)

    def _get_deterministic_seed(self, image_or_hash):
        """
        Generate a deterministic 32-bit seed from an image, hash string or index.

        Strings and integers are hashed via their ``str()`` form; PIL images
        are hashed by pixel content. Any other object falls back to its
        ``str()`` form.
        """
        if isinstance(image_or_hash, (str, int)):
            return self._stable_hash(image_or_hash)
        if isinstance(image_or_hash, Image.Image):
            # Content hash: identical pixel data always yields the same seed.
            return int(hashlib.md5(image_or_hash.tobytes()).hexdigest()[:8], 16)
        return self._stable_hash(image_or_hash)

    def horizontal_flip(self, image, apply=True):
        """Return the horizontally flipped image, or the input unchanged if ``apply`` is False."""
        if apply:
            return F.hflip(image)
        return image

    def rotation(self, image, angle):
        """Rotate the image by ``angle`` degrees."""
        return F.rotate(image, angle)

    def center_crop(self, image, crop_ratio=0.9):
        """Crop a centred square whose side is ``crop_ratio`` times the shorter image side."""
        w, h = image.size
        crop_size = int(min(w, h) * crop_ratio)
        return F.center_crop(image, [crop_size, crop_size])

    def corner_crop(self, image, crop_ratio=0.9, position='tl'):
        """
        Crop a square anchored at one corner.

        Args:
            image: PIL Image.
            crop_ratio: Side of the square as a fraction of the shorter image side.
            position: 'tl', 'tr', 'bl' or 'br'; any other value returns the
                image unchanged.
        """
        w, h = image.size
        crop_size = int(min(w, h) * crop_ratio)

        # F.crop takes (top, left, height, width).
        if position == 'tl':  # Top-left
            return F.crop(image, 0, 0, crop_size, crop_size)
        elif position == 'tr':  # Top-right
            return F.crop(image, 0, w - crop_size, crop_size, crop_size)
        elif position == 'bl':  # Bottom-left
            return F.crop(image, h - crop_size, 0, crop_size, crop_size)
        elif position == 'br':  # Bottom-right
            return F.crop(image, h - crop_size, w - crop_size, crop_size, crop_size)
        return image

    def resized_crop(self, image, crop_ratio=0.85):
        """Centre-crop then resize to the output size (fixed stand-in for RandomResizedCrop)."""
        w, h = image.size
        crop_size = int(min(w, h) * crop_ratio)
        cropped = F.center_crop(image, [crop_size, crop_size])
        return self._resize(cropped)

    def color_jitter(self, image, seed_val):
        """
        Apply brightness / contrast / saturation / hue jitter chosen from ``seed_val``.

        The four factors are drawn, in that order, from a private RNG seeded
        with ``seed_val`` so the same seed always gives the same jitter.
        """
        rng = np.random.RandomState(seed_val % (2**32))
        params = self.color_jitter_params
        brightness_factor = 1.0 + rng.uniform(-params['brightness'], params['brightness'])
        contrast_factor = 1.0 + rng.uniform(-params['contrast'], params['contrast'])
        saturation_factor = 1.0 + rng.uniform(-params['saturation'], params['saturation'])
        hue_factor = rng.uniform(-params['hue'], params['hue'])

        img = F.adjust_brightness(image, brightness_factor)
        img = F.adjust_contrast(img, contrast_factor)
        img = F.adjust_saturation(img, saturation_factor)
        img = F.adjust_hue(img, hue_factor)
        return img

    def affine_transform(self, image, seed_val):
        """Translate the image by a seed-determined fraction of its width and height."""
        rng = np.random.RandomState(seed_val % (2**32))
        translate_x = rng.uniform(-self.translate_range[0], self.translate_range[0])
        translate_y = rng.uniform(-self.translate_range[1], self.translate_range[1])
        return F.affine(image, angle=0, translate=(translate_x * image.width, translate_y * image.height),
                        scale=1.0, shear=0.0)

    def gaussian_blur(self, image, seed_val):
        """Blur with a 3x3 Gaussian kernel whose sigma is chosen from ``seed_val``."""
        rng = np.random.RandomState(seed_val % (2**32))
        sigma = rng.uniform(self.blur_sigma[0], self.blur_sigma[1])
        return F.gaussian_blur(image, kernel_size=3, sigma=[sigma, sigma])

    def get_augmentations(self, image, num_augmentations=10, seed_source=None):
        """
        Generate deterministic augmented versions for TTA.

        Views are produced in a fixed order: original, horizontal flip, up to
        four rotations, up to four corner crops, centre crop, resized crop,
        colour jitter, translation, blur. Generation stops once
        ``num_augmentations`` views exist.

        Args:
            image: PIL Image
            num_augmentations: Maximum number of views to return
            seed_source: Optional seed source. An integer is used directly as
                the seed; any other value (hash string, image, ...) is first
                converted with ``_get_deterministic_seed``. When omitted, the
                seed is derived from the image content.

        Returns:
            List of at most ``num_augmentations`` PIL images of size
            ``image_size`` x ``image_size``.
        """
        if seed_source is None:
            seed_val = self._get_deterministic_seed(image)
        elif isinstance(seed_source, (int, np.integer)):
            seed_val = int(seed_source)
        else:
            # Non-integer sources would break the ``seed_val + 1`` arithmetic below.
            seed_val = self._get_deterministic_seed(seed_source)

        augmentations = [self._resize(image)]

        # Horizontal flip
        augmentations.append(self._resize(self.horizontal_flip(image, apply=True)))

        # Rotations (fixed angles); clamp at 0 so a negative slice bound cannot occur.
        remaining = max(0, num_augmentations - len(augmentations))
        for angle in self.rotation_angles[:min(4, remaining)]:
            augmentations.append(self._resize(self.rotation(image, angle)))

        # Corner crops (4 corners)
        remaining = max(0, num_augmentations - len(augmentations))
        for corner in ['tl', 'tr', 'bl', 'br'][:min(4, remaining)]:
            cropped = self.corner_crop(image, crop_ratio=0.9, position=corner)
            augmentations.append(self._resize(cropped))

        # Center crop
        if len(augmentations) < num_augmentations:
            augmentations.append(self._resize(self.center_crop(image, crop_ratio=0.9)))

        # Resized crop (already returned at the output size)
        if len(augmentations) < num_augmentations:
            augmentations.append(self.resized_crop(image, crop_ratio=0.85))

        # Color jitter
        if len(augmentations) < num_augmentations:
            augmentations.append(self._resize(self.color_jitter(image, seed_val)))

        # Affine transform (offset seed so it does not reuse the jitter draws)
        if len(augmentations) < num_augmentations:
            augmentations.append(self._resize(self.affine_transform(image, seed_val + 1)))

        # Gaussian blur
        if len(augmentations) < num_augmentations:
            augmentations.append(self._resize(self.gaussian_blur(image, seed_val + 2)))

        return augmentations[:num_augmentations]

    def apply_training_augmentation(self, image, index=None):
        """
        Apply one deterministic training augmentation to ``image``.

        Every choice (flip, rotation angle, crop ratio, jitter, optional
        translation and blur) is a function of the seed, so a given ``index``
        always produces the same augmented image, in every run.

        Args:
            image: PIL Image
            index: Optional sample index used as the seed source; when omitted
                the seed is derived from the image content.

        Returns:
            PIL image of size ``image_size`` x ``image_size``.
        """
        if index is not None:
            seed_val = self._stable_hash(index)
        else:
            seed_val = self._get_deterministic_seed(image)

        # Horizontal flip when the seed is even.
        if seed_val % 2 == 0:
            image = self.horizontal_flip(image, apply=True)

        # Rotation (angle selected from the fixed list)
        angle_idx = (seed_val // 2) % len(self.rotation_angles)
        image = self.rotation(image, self.rotation_angles[angle_idx])

        # Centre crop (ratio selected from the fixed list)
        crop_idx = (seed_val // 10) % len(self.crop_ratios)
        crop_ratio = self.crop_ratios[crop_idx]
        w, h = image.size
        crop_size = int(min(w, h) * crop_ratio)
        image = F.center_crop(image, [crop_size, crop_size])

        # Color jitter
        image = self.color_jitter(image, seed_val)

        # Translation, applied when (seed // 3) is even
        if (seed_val // 3) % 2 == 0:
            image = self.affine_transform(image, seed_val + 1)

        # Gaussian blur, applied when (seed // 5) is a multiple of 5
        if (seed_val // 5) % 5 == 0:
            image = self.gaussian_blur(image, seed_val + 2)

        # Resize to final size
        return self._resize(image)

# Single shared augmentation pipeline used by the dataset and by TTA inference.
augmentation_system = DeterministicAugmentation(seed=42, image_size=224)

# Older names kept as aliases of the unified class.
EnhancedTTAAugmentation = TTAAugmentation = DeterministicAugmentation

# No torchvision transform pipeline is used for training; EmojiDataset calls
# augmentation_system.apply_training_augmentation instead.
train_transform = None

print(
    "Unified deterministic augmentation system defined!",
    "Same augmentation methods used for both training and TTA inference.",
    sep="\n",
)


## Load ConvNeXtV2 Model

In [None]:
# Load model directly
from transformers import AutoImageProcessor, AutoModelForImageClassification

# Image preprocessor matching the pretrained ConvNeXtV2-tiny checkpoint.
processor = AutoImageProcessor.from_pretrained("facebook/convnextv2-tiny-22k-224")

# Load the pretrained classifier and place it on the selected device in one step.
base_model = AutoModelForImageClassification.from_pretrained(
    "facebook/convnextv2-tiny-22k-224"
).to(device)
print(f"ConvNeXtV2 model loaded and moved to {device}")
print(f"Model config: {base_model.config}")

## Define Vendor Classes

In [None]:
# Target label space, chosen to match the second dataset.
# Raw second-dataset labels: ["apple", "google", "whatsapp", "facebook", "samsung", "mozilla", "messenger"]
# whatsapp and messenger are folded into Facebook, mozilla into Google,
# leaving four effective classes: Apple, Google, Facebook, Samsung.

SECOND_DATASET_CLASSES = ["Apple", "Google", "Facebook", "Samsung"]
SECOND_DATASET_TO_IDX = dict(zip(SECOND_DATASET_CLASSES, range(len(SECOND_DATASET_CLASSES))))
IDX_TO_SECOND_DATASET = dict(enumerate(SECOND_DATASET_CLASSES))

# Vendor folders of the first dataset.
FIRST_DATASET_CLASSES = [
    "Apple",
    "DoCoMo",
    "Facebook",
    "Gmail",
    "Google",
    "JoyPixels",
    "KDDI",
    "Samsung",
    "SoftBank",
    "Twitter",
    "Windows",
]

# How each first-dataset vendor is relabelled into the four target classes.
FIRST_TO_SECOND_MAPPING = {
    # Vendors that exist in the target label space (Gmail counts as Google).
    "Apple": "Apple",
    "Google": "Google",
    "Gmail": "Google",
    "Facebook": "Facebook",
    "Samsung": "Samsung",
    # Vendors without a counterpart are assigned to the closest target class.
    "DoCoMo": "Samsung",      # Japanese vendor → Samsung (both Asian)
    "KDDI": "Samsung",        # Japanese vendor → Samsung
    "SoftBank": "Samsung",    # Japanese vendor → Samsung
    "JoyPixels": "Facebook",  # Emoji provider → Facebook
    "Twitter": "Facebook",    # Social media → Facebook
    "Windows": "Samsung",     # Microsoft → Samsung (closest match)
}

# Names the rest of the notebook uses for the active label space.
VENDOR_CLASSES = SECOND_DATASET_CLASSES
VENDOR_TO_IDX = SECOND_DATASET_TO_IDX
IDX_TO_VENDOR = IDX_TO_SECOND_DATASET

print(f"Optimized for second dataset: {len(VENDOR_CLASSES)} classes")
print(f"Target classes: {VENDOR_CLASSES}")
print("\nFirst dataset → Second dataset mapping:")
for first_class, second_class in FIRST_TO_SECOND_MAPPING.items():
    print(f"  {first_class} → {second_class}")

## Dataset Class

In [None]:
class EmojiDataset(Dataset):
    """
    Image dataset with optional deterministic training-time augmentation.

    Each item is a dict with ``pixel_values`` (the processor output with the
    leading batch dimension removed) and ``labels`` (a ``torch.long`` scalar).

    Args:
        image_paths: Sequence of image file paths.
        labels: Sequence of integer labels, aligned with ``image_paths``.
        processor: Callable invoked as ``processor(image, return_tensors="pt")``
            that returns a mapping containing ``'pixel_values'``.
        use_augmentation: When True, each image is passed through the
            augmentation system before preprocessing.
        augmentation_system: Object exposing
            ``apply_training_augmentation(image, index=...)``. When omitted,
            the module-level ``augmentation_system`` is used if it exists.
    """
    def __init__(self, image_paths, labels, processor, use_augmentation=False, augmentation_system=None):
        self.image_paths = image_paths
        self.labels = labels
        self.processor = processor
        self.use_augmentation = use_augmentation
        if augmentation_system is None:
            augmentation_system = globals().get('augmentation_system', None)
        self.augmentation_system = augmentation_system

    def __len__(self):
        return len(self.image_paths)

    def _resolve_augmentation_system(self):
        """Return the instance's augmentation system, falling back to the module global."""
        if self.augmentation_system is not None:
            return self.augmentation_system
        # The global may have been defined after this dataset was constructed.
        return globals().get('augmentation_system', None)

    def __getitem__(self, idx):
        image_path = self.image_paths[idx]
        label = self.labels[idx]

        try:
            # The context manager closes the file handle once the RGB copy exists.
            with Image.open(image_path) as raw_image:
                image = raw_image.convert('RGB')
        except Exception as e:
            # Best effort: an unreadable file becomes a blank white image so one
            # bad sample does not abort an entire epoch.
            print(f"Error loading image {image_path}: {e}")
            image = Image.new('RGB', (224, 224), color='white')

        if self.use_augmentation:
            # Use the system given to the constructor, not only the module global.
            aug_system = self._resolve_augmentation_system()
            if aug_system is not None:
                image = aug_system.apply_training_augmentation(image, index=idx)

        inputs = self.processor(image, return_tensors="pt")
        pixel_values = inputs['pixel_values'].squeeze(0)

        return {
            'pixel_values': pixel_values,
            'labels': torch.tensor(label, dtype=torch.long)
        }

## Binary Classification Model Definition (One per Class)

In [None]:
class ConvNeXtV2ForBinaryClassification(nn.Module):
    """One-vs-rest head on a ConvNeXtV2 backbone: emits a single logit per image."""

    def __init__(self, base_model_ref, hidden_size=None):
        """
        Args:
            base_model_ref: Model exposing a ``convnextv2`` backbone attribute
                and, when ``hidden_size`` is omitted, a ``config``.
            hidden_size: Width of the pooled backbone features. When None it
                is read from ``config.hidden_sizes[-1]``, then
                ``config.hidden_size``, then defaults to 768.
        """
        super().__init__()
        self.base_model = base_model_ref

        if hidden_size is None:
            config = base_model_ref.config
            if hasattr(config, 'hidden_sizes'):
                hidden_size = config.hidden_sizes[-1]
            else:
                # 768 is the fallback width used for the tiny model.
                hidden_size = getattr(config, 'hidden_size', 768)

        bottleneck = hidden_size // 2

        # Two-layer MLP head ending in a single logit.
        self.classifier = nn.Sequential(
            nn.LayerNorm(hidden_size),
            nn.Dropout(0.3),
            nn.Linear(hidden_size, bottleneck),
            nn.GELU(),
            nn.LayerNorm(bottleneck),
            nn.Dropout(0.2),
            nn.Linear(bottleneck, 1),
        )

    def forward(self, pixel_values, labels=None):
        """
        Run the backbone and the binary head.

        Args:
            pixel_values: Batch of preprocessed images.
            labels: Optional 0/1 targets; when given, a BCE-with-logits loss
                is computed.

        Returns:
            ImageClassifierOutput with ``logits`` of shape (B, 1) and ``loss``
            (None when ``labels`` is None).

        Raises:
            ValueError: If the backbone output exposes none of
                ``last_hidden_state``, ``pooler_output`` or ``hidden_states``.
        """
        backbone_output = self.base_model.convnextv2(pixel_values)

        # Preference order: raw feature map, then pooled output, then the last
        # entry of hidden_states.
        pooled_output = None
        if hasattr(backbone_output, 'last_hidden_state'):
            features = backbone_output.last_hidden_state
        elif getattr(backbone_output, 'pooler_output', None) is not None:
            pooled_output = backbone_output.pooler_output
        elif getattr(backbone_output, 'hidden_states', None) is not None:
            features = backbone_output.hidden_states[-1]
        else:
            raise ValueError("Could not extract features from ConvNeXtV2 backbone")

        if pooled_output is None:
            # Global average pooling over the spatial axes: (B, C, H, W) -> (B, C).
            # Anything that is not 4-D is used as-is.
            pooled_output = features.mean(dim=[2, 3]) if len(features.shape) == 4 else features

        logits = self.classifier(pooled_output)  # Shape: (B, 1)

        if labels is None:
            loss = None
        else:
            criterion = nn.BCEWithLogitsLoss()
            loss = criterion(logits.squeeze(-1), labels.float())

        return ImageClassifierOutput(loss=loss, logits=logits)

# Build an independent binary classifier for every target class. Each one gets
# its own freshly loaded pretrained backbone, so no weights are shared.
binary_models = {}
for vendor_idx, vendor_name in enumerate(VENDOR_CLASSES):
    model_base = AutoModelForImageClassification.from_pretrained(
        "facebook/convnextv2-tiny-22k-224"
    ).to(device)

    binary_model = ConvNeXtV2ForBinaryClassification(model_base).to(device)
    binary_models[vendor_idx] = binary_model
    print(f"Created binary classifier for {vendor_name} (class {vendor_idx})")

print(f"\nTotal binary models created: {len(binary_models)}")

## Training Functions

In [None]:
def train_epoch_binary(model, train_loader, optimizer, device, scaler=None):
    """
    Run one training epoch for a binary classifier.

    Args:
        model: Module called as ``model(pixel_values=..., labels=...)`` whose
            output has ``loss`` and ``logits``.
        train_loader: Iterable of batches with ``'pixel_values'`` and ``'labels'``.
        optimizer: Optimizer stepped once per batch.
        device: ``torch.device``; autocast is enabled only when its type is 'cuda'.
        scaler: Optional gradient scaler; when given, backward and step go through it.

    Returns:
        Tuple ``(mean batch loss, accuracy in percent)`` using a 0.5 threshold
        on the sigmoid of the logits.
    """
    model.train()
    use_amp = device.type == 'cuda'
    running_loss = 0
    n_correct = 0
    n_seen = 0

    for batch in tqdm(train_loader, desc="Training"):
        inputs = batch['pixel_values'].to(device, non_blocking=True)
        targets = batch['labels'].to(device, non_blocking=True)
        optimizer.zero_grad()

        with torch.amp.autocast('cuda', enabled=use_amp):
            outputs = model(pixel_values=inputs, labels=targets)
            loss = outputs.loss

        if scaler is None:
            loss.backward()
            optimizer.step()
        else:
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

        running_loss += loss.item()

        # Threshold the per-sample probabilities at 0.5 to get hard predictions.
        hard_predictions = (torch.sigmoid(outputs.logits.squeeze(-1)) > 0.5).long()
        n_seen += targets.size(0)
        n_correct += (hard_predictions == targets).sum().item()

    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    return running_loss / len(train_loader), 100 * n_correct / n_seen

def validate_binary(model, val_loader, device):
    """
    Evaluate a binary classifier without updating its weights.

    Args:
        model: Module called as ``model(pixel_values=..., labels=...)`` whose
            output has ``loss`` and ``logits``.
        val_loader: Iterable of batches with ``'pixel_values'`` and ``'labels'``.
        device: ``torch.device``; autocast is enabled only when its type is 'cuda'.

    Returns:
        Tuple ``(mean batch loss, accuracy in percent, probabilities, labels)``
        where the last two are flat lists collected over all batches.
    """
    model.eval()
    use_amp = device.type == 'cuda'
    running_loss = 0
    n_correct = 0
    n_seen = 0
    all_probs, all_labels = [], []

    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Validation"):
            inputs = batch['pixel_values'].to(device, non_blocking=True)
            targets = batch['labels'].to(device, non_blocking=True)

            with torch.amp.autocast('cuda', enabled=use_amp):
                outputs = model(pixel_values=inputs, labels=targets)
                loss = outputs.loss

            running_loss += loss.item()

            probs = torch.sigmoid(outputs.logits.squeeze(-1))
            hard_predictions = (probs > 0.5).long()
            n_seen += targets.size(0)
            n_correct += (hard_predictions == targets).sum().item()

            # Keep raw probabilities and labels for downstream analysis.
            all_probs.extend(probs.cpu().numpy())
            all_labels.extend(targets.cpu().numpy())

    accuracy = 100 * n_correct / n_seen
    return running_loss / len(val_loader), accuracy, all_probs, all_labels

## TTA Inference Functions (for final tests only)

In [None]:
def predict_binary_with_tta(model, image, processor, tta_aug, num_augmentations=10, device='cuda'):
    """
    Predict the positive-class probability for one image using test-time augmentation.

    The model is run on every view returned by ``tta_aug.get_augmentations``;
    logits are combined as a weighted mean in which the first view counts
    twice, and the sigmoid of that mean is returned.

    Args:
        model: Binary classifier called as ``model(pixel_values=...)``.
        image: Input image handed to ``tta_aug``.
        processor: Callable ``processor(view, return_tensors="pt")`` returning
            a mapping with ``'pixel_values'``.
        tta_aug: Object providing ``get_augmentations(image, num_augmentations=...)``.
        num_augmentations: Number of views requested.
        device: Device the inputs and weights are placed on.

    Returns:
        Probability as a Python float.
    """
    model.eval()
    views = tta_aug.get_augmentations(image, num_augmentations=num_augmentations)

    per_view_logits = []
    with torch.no_grad():
        for view in views:
            pixel_values = processor(view, return_tensors="pt")['pixel_values'].to(device)
            per_view_logits.append(model(pixel_values=pixel_values).logits)

    # The first (un-augmented) view gets double weight.
    view_weights = [2.0 if position == 0 else 1.0 for position in range(len(per_view_logits))]
    weights = torch.tensor(view_weights, device=device).view(-1, 1, 1)

    stacked = torch.stack(per_view_logits) * weights
    mean_logit = stacked.sum(dim=0) / weights.sum()

    return torch.sigmoid(mean_logit.squeeze(-1)).item()

# Expose the module-level augmentation_system under the name the TTA helpers
# take as their `tta_aug` argument, so inference reuses the same instance.
tta_aug = augmentation_system
print("TTA inference functions defined!")

## Dataset Preparation Functions

In [None]:
def prepare_dataset(dataset_path):
    """
    Collect first-dataset images and relabel them into the target classes.

    Images are looked up in ``dataset_path/<vendor>`` for every vendor in
    FIRST_DATASET_CLASSES. If none of those directories exist, the tree is
    searched recursively and a file is assigned to the first vendor whose
    lower-cased name occurs in the file name or in its parent path.

    Files are matched by lower-cased suffix and visited in sorted order, so
    the result is identical from run to run (which the seeded train/test
    split relies on) and a file is never listed twice on case-insensitive
    filesystems.

    Args:
        dataset_path: Root directory of the first dataset (str or Path).

    Returns:
        Tuple ``(image_paths, labels)``: file paths as strings and the
        matching indices into VENDOR_CLASSES.
    """
    image_paths = []
    labels = []
    dataset_path = Path(dataset_path)
    image_suffixes = {'.png', '.jpg', '.jpeg'}

    def is_image_file(candidate):
        return candidate.is_file() and candidate.suffix.lower() in image_suffixes

    # First, collect all images with their original vendor labels.
    vendor_to_images = {}
    for vendor in FIRST_DATASET_CLASSES:
        vendor_dir = dataset_path / vendor
        if vendor_dir.is_dir():
            # The vendor is registered even when its directory holds no images.
            vendor_to_images[vendor] = sorted(
                str(img_path) for img_path in vendor_dir.iterdir() if is_image_file(img_path)
            )

    # If no vendor directories were found, fall back to filename/path matching.
    if not vendor_to_images:
        lowered_vendors = [(vendor, vendor.lower()) for vendor in FIRST_DATASET_CLASSES]
        for img_path in sorted(dataset_path.rglob("*")):
            if not is_image_file(img_path):
                continue
            filename = img_path.name.lower()
            parent = str(img_path.parent).lower()
            for vendor, needle in lowered_vendors:
                if needle in filename or needle in parent:
                    vendor_to_images.setdefault(vendor, []).append(str(img_path))
                    break  # first matching vendor wins

    # Map to second dataset classes.
    mapping_summary = {}
    for original_vendor, img_list in vendor_to_images.items():
        mapped_vendor = FIRST_TO_SECOND_MAPPING.get(original_vendor)
        if mapped_vendor and mapped_vendor in VENDOR_CLASSES:
            mapped_idx = VENDOR_TO_IDX[mapped_vendor]
            image_paths.extend(img_list)
            labels.extend([mapped_idx] * len(img_list))
            mapping_summary[original_vendor] = (mapped_vendor, len(img_list))
        else:
            print(f"WARNING: {original_vendor} → {mapped_vendor} not in target classes, skipping {len(img_list)} images")

    print(f"\nFirst dataset label mapping summary:")
    for original_vendor, (mapped_vendor, count) in mapping_summary.items():
        print(f"  {original_vendor} ({count} images) → {mapped_vendor}")

    return image_paths, labels

def prepare_dataset_from_csv(train_dir, csv_path):
    """
    Build (image_paths, labels) from a CSV with ``Id`` and ``Label`` columns.

    Labels are lower-cased, then resolved to a target class: first through the
    explicit alias table (messenger/whatsapp → Facebook, mozilla → Google),
    then by exact match against the lower-cased VENDOR_CLASSES names. Rows
    with an unresolved label are skipped. The image for a row is
    ``train_dir/<Id zero-padded to 5>.<ext>`` using the first extension that
    exists on disk.

    Args:
        train_dir: Directory containing the images.
        csv_path: Path to the label CSV.

    Returns:
        Tuple ``(image_paths, labels)``; both empty when either path is missing.
    """
    train_dir, csv_path = Path(train_dir), Path(csv_path)
    image_paths, labels = [], []

    if not (csv_path.exists() and train_dir.exists()):
        print(f"WARNING: CSV or train directory not found")
        return image_paths, labels

    df = pd.read_csv(csv_path)

    # Aliases for second-dataset labels that are folded into a target class.
    explicit_mapping = {
        'messenger': 'Facebook',
        'whatsapp': 'Facebook',
        'mozilla': 'Google'
    }

    # Lower-cased target names; setdefault keeps the first vendor on a clash.
    vendors_by_lower_name = {}
    for vendor in VENDOR_CLASSES:
        vendors_by_lower_name.setdefault(vendor.lower(), vendor)

    label_mapping = {}
    for csv_label in df['Label'].str.lower().unique():
        vendor = explicit_mapping.get(csv_label)
        if vendor not in VENDOR_CLASSES:
            # No usable alias: try a direct match with the target classes.
            vendor = vendors_by_lower_name.get(csv_label)
        if vendor is None:
            print(f"WARNING: Label '{csv_label}' not found in target classes, skipping")
            continue
        label_mapping[csv_label] = VENDOR_TO_IDX[vendor]

    candidate_extensions = ('.png', '.jpg', '.jpeg', '.PNG', '.JPG', '.JPEG')
    skipped_count = 0
    for _, row in df.iterrows():
        image_id = str(row['Id']).zfill(5)
        label_str = str(row['Label']).lower()

        if label_str not in label_mapping:
            skipped_count += 1
            continue

        # First extension (in the listed order) for which a file exists.
        existing = next(
            (
                candidate
                for candidate in (train_dir / f"{image_id}{ext}" for ext in candidate_extensions)
                if candidate.exists()
            ),
            None,
        )
        if existing is None:
            print(f"WARNING: Image not found for ID {image_id}")
        else:
            image_paths.append(str(existing))
            labels.append(label_mapping[label_str])

    if skipped_count > 0:
        print(f"WARNING: Skipped {skipped_count} images due to unmapped labels")

    print(f"Loaded {len(image_paths)} images with {len(set(labels))} unique classes")
    if labels:
        print(f"Label distribution: {np.bincount(labels)}")
        # Show which classes are present
        print(f"Classes present: {[VENDOR_CLASSES[i] for i in sorted(set(labels))]}")

    return image_paths, labels

## Phase 1: First Dataset - Download and Prepare

In [None]:
# Fetch the first dataset through kagglehub and report where it was stored.
path = kagglehub.dataset_download("subinium/emojiimage-dataset")
print(f"Path to dataset files: {path}")

# Index the images and relabel them into the target classes.
image_paths, labels = prepare_dataset(path)

print(f"\nFound {len(image_paths)} images")
if labels:
    print(f"Labels distribution: {np.bincount(labels)}")

## Phase 1: Split First Dataset

In [None]:
# Stratified 80/20 split of the first dataset with a fixed seed.
if not image_paths:
    print("ERROR: No images found.")
else:
    train_paths, test_paths, train_labels, test_labels = train_test_split(
        image_paths,
        labels,
        test_size=0.2,
        stratify=labels,
        random_state=42,
    )

    print(f"Train samples: {len(train_paths)}")
    print(f"Test samples: {len(test_paths)}")
    print(f"Train label distribution: {np.bincount(train_labels)}")
    print(f"Test label distribution: {np.bincount(test_labels)}")

## Phase 1: Train Binary Classifiers

In [None]:
# Train one binary (one-vs-rest) classifier per class.
#
# NOTE(review): the held-out "test" split is also used for LR scheduling,
# early stopping and checkpoint selection, so accuracies reported on it are
# optimistic. Confirm whether a separate validation split is wanted.
if len(image_paths) > 0:
    print("\n" + "="*50)
    print("Training Binary Classifiers")
    print("="*50)

    batch_size = 16 if torch.cuda.is_available() else 8
    num_epochs = 10
    learning_rate = 1e-5

    # Mixed precision scaler.
    # train_epoch_binary wraps the forward pass in torch.amp.autocast('cuda')
    # without choosing a dtype, so a gradient scaler is created whenever CUDA
    # is in use. The previous condition compared model_dtype with
    # torch.float16, a value model_dtype could never take, so no scaler was
    # ever created.
    scaler = None
    if torch.cuda.is_available():
        # model_dtype stays defined as before in case later cells (not visible
        # here) read it.
        model_dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float32
        scaler = torch.cuda.amp.GradScaler()

    loader_workers = 4 if torch.cuda.is_available() else 2

    # Train each binary classifier
    for vendor_idx, vendor_name in enumerate(VENDOR_CLASSES):
        print(f"\n{'='*50}")
        print(f"Training binary classifier for {vendor_name} (Class {vendor_idx})")
        print(f"{'='*50}")

        # Create binary labels: 1 if this class, 0 otherwise
        binary_train_labels = [1 if label == vendor_idx else 0 for label in train_labels]
        binary_test_labels = [1 if label == vendor_idx else 0 for label in test_labels]

        # Create datasets (augmentation only on the training side)
        binary_train_dataset = EmojiDataset(train_paths, binary_train_labels, processor, use_augmentation=True)
        binary_test_dataset = EmojiDataset(test_paths, binary_test_labels, processor, use_augmentation=False)

        binary_train_loader = DataLoader(
            binary_train_dataset, batch_size=batch_size, shuffle=True,
            num_workers=loader_workers,
            pin_memory=torch.cuda.is_available()
        )

        binary_test_loader = DataLoader(
            binary_test_dataset, batch_size=batch_size, shuffle=False,
            num_workers=loader_workers,
            pin_memory=torch.cuda.is_available()
        )

        # Get model and optimizer
        model = binary_models[vendor_idx]
        optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=0.01)
        # `verbose` is deprecated on PyTorch LR schedulers, so learning-rate
        # reductions are logged explicitly below instead.
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer, mode='max', factor=0.5, patience=2, min_lr=1e-7
        )

        checkpoint_path = f'best_binary_model_class_{vendor_idx}.pt'
        saved_checkpoint = False  # True once this run has written a checkpoint
        best_val_acc = 0
        early_stopping_patience = 3
        early_stopping_counter = 0

        for epoch in range(num_epochs):
            print(f"\nEpoch {epoch + 1}/{num_epochs}")

            train_loss, train_acc = train_epoch_binary(model, binary_train_loader, optimizer, device, scaler)
            val_loss, val_acc, val_probs, val_labels_bin = validate_binary(model, binary_test_loader, device)

            lr_before = optimizer.param_groups[0]['lr']
            scheduler.step(val_acc)
            lr_after = optimizer.param_groups[0]['lr']
            if lr_after < lr_before:
                print(f"Reducing learning rate: {lr_before:.2e} → {lr_after:.2e}")

            print(f"Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.2f}%")
            print(f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.2f}%")

            if val_acc > best_val_acc:
                best_val_acc = val_acc
                early_stopping_counter = 0
                torch.save(model.state_dict(), checkpoint_path)
                saved_checkpoint = True
                print(f"✓ Saved best model with validation accuracy: {best_val_acc:.2f}%")
            else:
                early_stopping_counter += 1
                if early_stopping_counter >= early_stopping_patience:
                    print(f"Early stopping triggered!")
                    break

        # Load best model. Only a checkpoint written during this run is
        # restored; a file left over from an earlier run is ignored.
        if saved_checkpoint and os.path.exists(checkpoint_path):
            model.load_state_dict(torch.load(checkpoint_path, map_location=device))

        print(f"\nCompleted training for {vendor_name}")

    print("\n" + "="*50)
    print("All binary classifiers trained!")
    print("="*50)
else:
    print("ERROR: Cannot train without data.")

## Generate Features for XGBoost

In [None]:
# Generate predictions from all binary models to use as features for XGBoost
def generate_binary_features(image_paths_list, labels_list, use_augmentation=False):
    """Build the XGBoost feature matrix from the binary classifiers' outputs.

    Every image is pushed through each model in ``binary_models``; the sigmoid
    of each model's logit becomes one feature column.

    Args:
        image_paths_list: Image paths handed to ``EmojiDataset``.
        labels_list: Labels aligned with ``image_paths_list``; they are passed
            through the loader and returned unchanged as the targets.
        use_augmentation: Forwarded to ``EmojiDataset``.

    Returns:
        ``(features, labels)`` where ``features`` has one row per image and one
        column per entry of ``VENDOR_CLASSES``, and ``labels`` holds the labels
        in loader order (``shuffle=False``, so input order).
    """
    num_classes = len(VENDOR_CLASSES)

    dataset = EmojiDataset(image_paths_list, labels_list, processor, use_augmentation=use_augmentation)
    loader = DataLoader(dataset, batch_size=16, shuffle=False, num_workers=2)

    # Put every classifier in inference mode once instead of once per batch.
    for vendor_idx in range(num_classes):
        binary_models[vendor_idx].eval()

    feature_batches = []
    label_batches = []

    with torch.no_grad():
        for batch in tqdm(loader, desc="Generating features"):
            pixel_values = batch['pixel_values'].to(device)
            label_batches.append(batch['labels'].cpu().numpy())

            per_model_probs = []
            for vendor_idx in range(num_classes):
                outputs = binary_models[vendor_idx](pixel_values=pixel_values)
                probs = torch.sigmoid(outputs.logits.squeeze(-1))
                # Upcast first: NumPy has no bfloat16 dtype, so a reduced-precision
                # output could not be converted directly.
                per_model_probs.append(probs.float().cpu().numpy())

            # Stack features: (num_classes, batch_size) -> (batch_size, num_classes)
            feature_batches.append(np.stack(per_model_probs, axis=1))

    if not feature_batches:
        # Keep the column count so callers can still vstack an empty result.
        return np.empty((0, num_classes), dtype=np.float32), np.empty((0,), dtype=np.int64)

    return np.concatenate(feature_batches, axis=0), np.concatenate(label_batches, axis=0)


if len(image_paths) > 0:
    print("\n" + "="*50)
    print("Generating Features from Binary Classifiers")
    print("="*50)

    # Generate features for train and test
    print("Generating training features...")
    X_train, y_train = generate_binary_features(train_paths, train_labels, use_augmentation=False)

    print("Generating test features...")
    X_test, y_test = generate_binary_features(test_paths, test_labels, use_augmentation=False)

    print(f"\nTraining features shape: {X_train.shape}")
    print(f"Training labels shape: {y_train.shape}")
    print(f"Test features shape: {X_test.shape}")
    print(f"Test labels shape: {y_test.shape}")
    print(f"\nFeature range: [{X_train.min():.4f}, {X_train.max():.4f}]")
else:
    print("ERROR: Cannot generate features without data.")

## Train XGBoost Ensemble

In [None]:
# Fit the XGBoost meta-classifier on the binary classifiers' probability features
if len(image_paths) > 0 and 'X_train' in locals():
    print("\n" + "="*50)
    print("Training XGBoost Ensemble")
    print("="*50)

    # Hyper-parameters for the multi-class booster; one output class per vendor.
    xgb_params = dict(
        objective='multi:softprob',
        num_class=len(VENDOR_CLASSES),
        max_depth=6,
        learning_rate=0.1,
        n_estimators=200,
        subsample=0.8,
        colsample_bytree=0.8,
        min_child_weight=1,
        gamma=0.1,
        reg_alpha=0.1,
        reg_lambda=1.0,
        random_state=42,
        tree_method='hist' if torch.cuda.is_available() else 'approx',
    )

    # Build the classifier, then fit while reporting the metric on the test split.
    xgb_classifier = xgb.XGBClassifier(**xgb_params)

    print("Training XGBoost...")
    xgb_classifier.fit(
        X_train,
        y_train,
        eval_set=[(X_test, y_test)],
        verbose=True,
    )

    # Score both splits with the fitted ensemble.
    train_pred = xgb_classifier.predict(X_train)
    test_pred = xgb_classifier.predict(X_test)

    train_acc = 100 * accuracy_score(y_train, train_pred)
    test_acc = 100 * accuracy_score(y_test, test_pred)

    print(f"\nXGBoost Training Accuracy: {train_acc:.2f}%")
    print(f"XGBoost Test Accuracy: {test_acc:.2f}%")

    print(f"\nClassification Report:")
    print(classification_report(y_test, test_pred, target_names=VENDOR_CLASSES))

    # Persist the phase-1 ensemble to disk.
    xgb_classifier.save_model('xgb_ensemble_model.json')
    print("\n✓ Saved XGBoost model to xgb_ensemble_model.json")
else:
    print("ERROR: Cannot train XGBoost without features.")

## Phase 2: Second Dataset - Load and Prepare

In [None]:
# Load second dataset (single path, no alternatives)
dataset_base = Path("2-computer-vision-2025-b-sc-aidams-final-proj")
train_dir = dataset_base / "train"
csv_path = dataset_base / "train_labels.csv"

# Start from an empty dataset so every failure path below leaves both names
# defined; later cells test `len(second_dataset_paths) > 0` to decide whether
# phase 2 runs.
second_dataset_paths = []
second_dataset_labels = []

if not train_dir.exists():
    print(f"ERROR: Train directory not found at {train_dir}")
    print("Please ensure the dataset is in the correct location.")
elif not csv_path.exists():
    # Check the labels file explicitly instead of announcing it as found and
    # failing later inside the CSV loader.
    print(f"ERROR: Labels CSV not found at {csv_path}")
    print("Please ensure the dataset is in the correct location.")
else:
    print(f"Found train directory at: {train_dir}")
    print(f"Found CSV file at: {csv_path}")

    second_dataset_paths, second_dataset_labels = prepare_dataset_from_csv(train_dir, csv_path)
    print(f"\nFound {len(second_dataset_paths)} labeled images from second dataset")

    if len(second_dataset_paths) > 0:
        print(f"Label distribution: {np.bincount(second_dataset_labels)}")

In [None]:
    xgb_params = {
        'objective': 'multi:softprob',
        'num_class': len(VENDOR_CLASSES),
        'max_depth': 6,
        'learning_rate': 0.1,
        'n_estimators': 200,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'min_child_weight': 1,
        'gamma': 0.1,
        'reg_alpha': 0.1,
        'reg_lambda': 1.0,
        'random_state': 42,
        'tree_method': 'hist' if torch.cuda.is_available() else 'approx'
    }

# Global XGBoost parameters

## Phase 2: Re-fine-tune Binary Classifiers

In [None]:
# Re-fine-tune binary classifiers on second dataset
#
# This cell:
#   1. splits the second dataset into train/validation (stratified when possible),
#   2. continues training every binary classifier from its phase-1 checkpoint,
#   3. re-fits XGBoost on phase-1 features plus second-dataset train features.
if len(second_dataset_paths) > 0:
    print("\n" + "="*50)
    print("Re-fine-tuning Binary Classifiers on Second Dataset")
    print("="*50)

    # Split second dataset with STRICT stratified sampling
    if len(second_dataset_paths) > 100:
        _, label_counts = np.unique(second_dataset_labels, return_counts=True)
        min_class_count = label_counts.min()

        # Always try stratified split if possible
        if min_class_count >= 2:
            try:
                second_train_paths, second_val_paths, second_train_labels, second_val_labels = train_test_split(
                    second_dataset_paths, second_dataset_labels, test_size=0.1, random_state=42, stratify=second_dataset_labels
                )
                print(f"✓ Split with STRATIFIED sampling: {len(second_train_paths)} train, {len(second_val_paths)} validation")
            except ValueError as e:
                print(f"⚠️ Stratification failed: {e}")
                # Fallback: plain random split with the same seed and ratio
                second_train_paths, second_val_paths, second_train_labels, second_val_labels = train_test_split(
                    second_dataset_paths, second_dataset_labels, test_size=0.1, random_state=42
                )
                print(f"Split without stratification: {len(second_train_paths)} train, {len(second_val_paths)} validation")
        else:
            print(f"⚠️ Warning: Cannot stratify (min class count: {min_class_count} < 2)")
            second_train_paths, second_val_paths, second_train_labels, second_val_labels = train_test_split(
                second_dataset_paths, second_dataset_labels, test_size=0.1, random_state=42
            )

        print(f"\nTrain label distribution: {np.bincount(second_train_labels)}")
        print(f"Validation label distribution: {np.bincount(second_val_labels)}")
    else:
        second_train_paths, second_val_paths = second_dataset_paths, []
        second_train_labels, second_val_labels = second_dataset_labels, []
        print("Dataset too small for validation split, using all for training")

    batch_size = 16 if torch.cuda.is_available() else 8
    refinetune_epochs = 5
    refinetune_lr = 5e-6

    # NOTE(review): model_dtype is only ever bfloat16 or float32 here, so the
    # float16 check below never fires and `scaler` always stays None. Confirm
    # whether that is intended before changing it.
    scaler = None
    if torch.cuda.is_available():
        model_dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float32
        if model_dtype == torch.float16:
            scaler = torch.cuda.amp.GradScaler()

    # Re-fine-tune each binary classifier
    for vendor_idx, vendor_name in enumerate(VENDOR_CLASSES):
        print(f"\nRe-fine-tuning binary classifier for {vendor_name}...")

        # Load best model from phase 1
        if os.path.exists(f'best_binary_model_class_{vendor_idx}.pt'):
            binary_models[vendor_idx].load_state_dict(torch.load(f'best_binary_model_class_{vendor_idx}.pt', map_location=device))

        # Create binary labels: 1 for this vendor, 0 for every other class
        binary_train_labels = [1 if label == vendor_idx else 0 for label in second_train_labels]

        # Create dataset
        binary_train_dataset = EmojiDataset(second_train_paths, binary_train_labels, processor, use_augmentation=True)
        binary_train_loader = DataLoader(
            binary_train_dataset, batch_size=batch_size, shuffle=True,
            num_workers=4 if torch.cuda.is_available() else 2, pin_memory=torch.cuda.is_available()
        )

        model = binary_models[vendor_idx]
        optimizer = torch.optim.AdamW(model.parameters(), lr=refinetune_lr, weight_decay=0.01)

        for epoch in range(refinetune_epochs):
            train_loss, train_acc = train_epoch_binary(model, binary_train_loader, optimizer, device, scaler)
            # Progress is reported on even-numbered epochs only.
            if (epoch + 1) % 2 == 0:
                print(f"  Epoch {epoch + 1}/{refinetune_epochs}: Loss={train_loss:.4f}, Acc={train_acc:.2f}%")

        # No validation pass runs here, so this stores the weights after the last epoch.
        torch.save(model.state_dict(), f'best_binary_model_class_{vendor_idx}_phase2.pt')

    print("\n✓ All binary classifiers re-fine-tuned!")

    # Re-train XGBoost on combined features
    print("\nRe-training XGBoost on combined dataset...")

    # Generate features from second dataset for XGBoost (using train split only for now)
    second_X, second_y = generate_binary_features(second_train_paths, second_train_labels, use_augmentation=False)

    # Combine with first dataset features
    X_combined = np.vstack([X_train, second_X])
    y_combined = np.hstack([y_train, second_y])

    # Re-train XGBoost
    xgb_classifier = xgb.XGBClassifier(**xgb_params)
    xgb_classifier.fit(X_combined, y_combined, eval_set=[(X_test, y_test)], verbose=True)

    test_pred = xgb_classifier.predict(X_test)
    test_acc = accuracy_score(y_test, test_pred) * 100
    print(f"\nXGBoost Test Accuracy after re-training: {test_acc:.2f}%")

    xgb_classifier.save_model('xgb_ensemble_model_phase2.json')
    print("✓ Saved updated XGBoost model")
else:
    # Later cells read these names (e.g. `len(second_val_paths)`), so define
    # them as empty even when phase 2 is skipped.
    second_train_paths, second_val_paths = [], []
    second_train_labels, second_val_labels = [], []
    print("Skipping phase 2 (no second dataset found)")

## Phase 2: Final Training on Combined Train+Validation Data


In [None]:
# Combine train and validation for final training (no data leakage)
#
# The split variables exist only if the phase-2 cell created them, so check
# for the name before measuring it instead of risking a NameError.
has_second_split = (
    'second_val_paths' in locals()
    and len(second_val_paths) > 0
    and len(second_dataset_paths) > 0
)

if has_second_split:
    print("\n" + "="*50)
    print("Final Training on Combined Second Dataset (Train + Validation)")
    print("="*50)

    # Combine splits. Going through list() keeps this a concatenation even if
    # the splits are arrays, where `+` would mean element-wise addition.
    combined_second_train_paths = list(second_train_paths) + list(second_val_paths)
    combined_second_train_labels = list(second_train_labels) + list(second_val_labels)

    print(f"Combined second dataset: {len(combined_second_train_paths)} samples")
    print(f"  - Original train: {len(second_train_paths)} samples")
    print(f"  - Original validation: {len(second_val_paths)} samples")
    print(f"Combined label distribution: {np.bincount(combined_second_train_labels)}")

    # Re-fine-tune binary classifiers on combined data.
    # `batch_size` and `scaler` are reused from the phase-2 re-fine-tuning cell.
    for vendor_idx, vendor_name in enumerate(VENDOR_CLASSES):
        print(f"\nFinal training binary classifier for {vendor_name}...")

        # Load best model from phase 2, falling back to the phase-1 checkpoint
        if os.path.exists(f'best_binary_model_class_{vendor_idx}_phase2.pt'):
            binary_models[vendor_idx].load_state_dict(torch.load(f'best_binary_model_class_{vendor_idx}_phase2.pt', map_location=device))
        elif os.path.exists(f'best_binary_model_class_{vendor_idx}.pt'):
            binary_models[vendor_idx].load_state_dict(torch.load(f'best_binary_model_class_{vendor_idx}.pt', map_location=device))

        # Create binary labels for combined data: 1 for this vendor, 0 otherwise
        binary_combined_labels = [1 if label == vendor_idx else 0 for label in combined_second_train_labels]

        # Create dataset
        binary_combined_dataset = EmojiDataset(combined_second_train_paths, binary_combined_labels, processor, use_augmentation=True)
        binary_combined_loader = DataLoader(
            binary_combined_dataset, batch_size=batch_size, shuffle=True,
            num_workers=4 if torch.cuda.is_available() else 2, pin_memory=torch.cuda.is_available()
        )

        model = binary_models[vendor_idx]
        optimizer = torch.optim.AdamW(model.parameters(), lr=3e-6, weight_decay=0.01)

        for epoch in range(2):  # Short final training
            train_loss, train_acc = train_epoch_binary(model, binary_combined_loader, optimizer, device, scaler)

        torch.save(model.state_dict(), f'best_binary_model_class_{vendor_idx}_final.pt')

    print("\n✓ All binary classifiers final training completed!")

    # Re-train XGBoost on final combined features
    print("\nRe-training XGBoost on final combined dataset...")
    combined_X, combined_y = generate_binary_features(combined_second_train_paths, combined_second_train_labels, use_augmentation=False)

    # Combine with first dataset
    X_final = np.vstack([X_train, combined_X])
    y_final = np.hstack([y_train, combined_y])

    xgb_classifier = xgb.XGBClassifier(**xgb_params)
    xgb_classifier.fit(X_final, y_final, eval_set=[(X_test, y_test)], verbose=True)

    test_pred = xgb_classifier.predict(X_test)
    test_acc = accuracy_score(y_test, test_pred) * 100
    print(f"\nXGBoost Final Test Accuracy: {test_acc:.2f}%")

    xgb_classifier.save_model('xgb_ensemble_model_final.json')
    print("✓ Saved final XGBoost model")
else:
    print("No validation split available for final training.")


## Generate Predictions on Original Test Data

In [None]:
# Load original test data and generate predictions
#
# For every test image: collect one TTA-averaged probability per binary
# classifier, let XGBoost pick the class from those features, and write the
# result to predictions.csv as `Id,Label` rows.
import traceback
from collections import Counter

test_dataset_path = dataset_base / "test"

if test_dataset_path.exists():
    print("\n" + "="*50)
    print("Generating Predictions on Original Test Data")
    print("="*50)

    # Load best final models (trained on combined data); for each class take
    # the checkpoint from the latest training stage that exists on disk.
    for vendor_idx in range(len(VENDOR_CLASSES)):
        for stage_suffix in ('_final', '_phase2', ''):
            checkpoint_file = f'best_binary_model_class_{vendor_idx}{stage_suffix}.pt'
            if os.path.exists(checkpoint_file):
                binary_models[vendor_idx].load_state_dict(torch.load(checkpoint_file, map_location=device))
                break

    # Load XGBoost model (prefer final model)
    for xgb_model_file, xgb_model_message in (
        ('xgb_ensemble_model_final.json', "Using final XGBoost model (trained on combined data)"),
        ('xgb_ensemble_model_phase2.json', "Using phase 2 XGBoost model"),
        ('xgb_ensemble_model.json', "Using phase 1 XGBoost model"),
    ):
        if os.path.exists(xgb_model_file):
            xgb_classifier.load_model(xgb_model_file)
            print(xgb_model_message)
            break

    # Find test images in a single case-insensitive pass. Globbing once per
    # upper/lower-case extension lists each file twice on case-insensitive
    # filesystems and misses mixed-case suffixes such as ".Png".
    image_extensions = {'.png', '.jpg', '.jpeg'}
    test_image_paths = sorted({
        str(p)
        for p in test_dataset_path.rglob("*")
        if p.is_file() and p.suffix.lower() in image_extensions
    })

    print(f"Found {len(test_image_paths)} test images")

    predictions = []
    image_ids = []
    failed_images = 0

    print(f"Processing {len(test_image_paths)} test images with TTA...")

    with torch.no_grad():
        for image_path in tqdm(test_image_paths, desc="Generating predictions"):
            try:
                image_id = Path(image_path).stem
                image = Image.open(image_path).convert('RGB')

                # Get predictions from all binary models with TTA
                binary_features = [
                    predict_binary_with_tta(
                        binary_models[vendor_idx], image, processor, tta_aug,
                        num_augmentations=10, device=device
                    )
                    for vendor_idx in range(len(VENDOR_CLASSES))
                ]

                # Convert to numpy array and reshape for XGBoost: one row, one column per class
                binary_features = np.array(binary_features).reshape(1, -1)

                # Predict with XGBoost; cast to a plain int for the label lookup
                predicted_idx = int(xgb_classifier.predict(binary_features)[0])

                if predicted_idx >= len(VENDOR_CLASSES):
                    print(f"WARNING: Invalid prediction index {predicted_idx}, using first class")
                    predicted_idx = 0

                predicted_label = IDX_TO_VENDOR[predicted_idx]
                predictions.append(predicted_label)
                image_ids.append(image_id)

            except Exception as e:
                # Deliberate best-effort: every image still gets a row, so a
                # failure is logged and replaced by the first class.
                print(f"Error processing {image_path}: {e}")
                traceback.print_exc()
                failed_images += 1
                predictions.append(VENDOR_CLASSES[0])
                image_ids.append(Path(image_path).stem)

    if failed_images:
        # Make the fallback rows visible instead of leaving them buried in the log.
        print(f"WARNING: {failed_images} image(s) failed and were assigned the fallback label {VENDOR_CLASSES[0]}")

    # Create predictions.csv file
    predictions_file = "predictions.csv"
    with open(predictions_file, 'w') as f:
        f.write("Id,Label\n")
        for img_id, pred_label in zip(image_ids, predictions):
            clean_id = str(img_id).strip()
            f.write(f"{clean_id},{pred_label}\n")

    print(f"\nPredictions saved to {predictions_file}")
    print(f"Total predictions: {len(predictions)}")
    print(f"Unique image IDs: {len(set(image_ids))}")

    label_counts = Counter(predictions)
    print(f"\nPrediction distribution:")
    for label, count in sorted(label_counts.items()):
        percentage = 100 * count / len(predictions)
        print(f"  {label}: {count} ({percentage:.1f}%)")

    # Read the file back as a sanity check on what was written.
    verification_df = pd.read_csv(predictions_file)
    print(f"\nVerification - Loaded {len(verification_df)} rows from {predictions_file}")
    print(f"Columns: {verification_df.columns.tolist()}")
    print(f"\nSample predictions:")
    print(verification_df.head(10))

    print("\n✓ Predictions generation completed!")
else:
    print(f"Test directory not found at {test_dataset_path}")
    print("Cannot generate predictions.")