# DINOv2-Large + XGBoost (TTA-columns) for Emoji Vendor Classification (V8)

**Train only on the 2nd dataset** (CSV labels).

**Pipeline:**
- DINOv2-Large fine-tuning on 7 classes
- Predictable TTA → create columns `dino_pred_0..dino_pred_{N-1}`
- XGBoost learns to combine statistical features (incl. `original_mode`) + TTA columns

**Classes:** apple, google, whatsapp, facebook, samsung, mozilla, messenger


## Install and Import


In [2]:
%pip install -r requirements.txt

import os
import hashlib
from pathlib import Path

import numpy as np
import pandas as pd
from PIL import Image

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from tqdm import tqdm

import torchvision.transforms.functional as F

from transformers import AutoImageProcessor, AutoModelForImageClassification
from transformers.modeling_outputs import ImageClassifierOutput

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

import xgboost as xgb

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)
if torch.cuda.is_available():
    print('GPU:', torch.cuda.get_device_name(0))
    torch.cuda.empty_cache()



[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.
Using device: cuda
GPU: Tesla V100-PCIE-16GB


## HYPERPARAMETERS


In [3]:
from pathlib import Path

# Model
MODEL_ID = 'facebook/dinov2-small'

# Second dataset paths (relative to this repo)
SECOND_DATASET_BASE_PATH = '.'
SECOND_DATASET_TRAIN_DIR = Path(SECOND_DATASET_BASE_PATH) / 'train'
SECOND_DATASET_CSV_PATH = Path(SECOND_DATASET_BASE_PATH) / 'train_labels.csv'
SECOND_DATASET_TEST_DIR = Path(SECOND_DATASET_BASE_PATH) / 'test'

# TTA / Features
NUM_TTA_AUGS = 10

# Train
VAL_SIZE = 0.10
RANDOM_STATE = 42
BATCH_SIZE_CUDA = 8
BATCH_SIZE_CPU = 4
NUM_EPOCHS = 7
LEARNING_RATE = 1e-5

# XGBoost
XGB_PARAMS = {
    'n_estimators': 400,
    'max_depth': 6,
    'learning_rate': 0.05,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'random_state': RANDOM_STATE,
    'eval_metric': 'mlogloss',
    'use_label_encoder': False
}

# Output
PREDICTIONS_OUTPUT_FILE = 'predictions.csv'

print('=', 70, sep='')
print('HYPERPARAMETERS')
print('=', 70, sep='')
print('MODEL_ID:', MODEL_ID)
print('TRAIN_DIR:', SECOND_DATASET_TRAIN_DIR)
print('CSV_PATH :', SECOND_DATASET_CSV_PATH)
print('TEST_DIR :', SECOND_DATASET_TEST_DIR)
print('NUM_TTA_AUGS:', NUM_TTA_AUGS)
print('PREDICTIONS_OUTPUT_FILE:', PREDICTIONS_OUTPUT_FILE)
print('=', 70, sep='')



=70
HYPERPARAMETERS
=70
MODEL_ID: facebook/dinov2-small
TRAIN_DIR: train
CSV_PATH : train_labels.csv
TEST_DIR : test
NUM_TTA_AUGS: 10
PREDICTIONS_OUTPUT_FILE: predictions.csv
=70


## Define Vendor Classes


In [4]:
VENDOR_CLASSES = ['apple','google','whatsapp','facebook','samsung','mozilla','messenger']
VENDOR_TO_IDX = {v:i for i,v in enumerate(VENDOR_CLASSES)}
IDX_TO_VENDOR = {i:v for v,i in VENDOR_TO_IDX.items()}
print('Num classes:', len(VENDOR_CLASSES))
print('Classes:', VENDOR_CLASSES)



Num classes: 7
Classes: ['apple', 'google', 'whatsapp', 'facebook', 'samsung', 'mozilla', 'messenger']


## Deterministic Augmentation System (Predictable TTA)


In [5]:
class DeterministicAugmentation:
    def __init__(self, image_size=224, seed=42):
        self.image_size = image_size
        self.seed = seed
        self.rotation_angles = [-10, -5, 5, 10]
        self.crop_ratios = [0.75, 0.85, 0.9, 0.95]
        self.color_jitter_params = {'brightness':0.3,'contrast':0.3,'saturation':0.3,'hue':0.1}
        self.translate_range = (0.1, 0.1)
        self.blur_sigma = (0.1, 0.5)

    def _get_deterministic_seed(self, image_or_hash):
        if isinstance(image_or_hash, Image.Image):
            img_bytes = image_or_hash.tobytes()
            return int(hashlib.md5(img_bytes).hexdigest()[:8], 16)
        return hash(str(image_or_hash)) & 0xFFFFFFFF

    def horizontal_flip(self, image):
        return F.hflip(image)

    def rotation(self, image, angle):
        return F.rotate(image, angle)

    def center_crop(self, image, crop_ratio=0.9):
        w,h = image.size
        crop = int(min(w,h)*crop_ratio)
        return F.center_crop(image, [crop,crop])

    def corner_crop(self, image, crop_ratio=0.9, position='tl'):
        w,h = image.size
        crop = int(min(w,h)*crop_ratio)
        if position=='tl': return F.crop(image, 0, 0, crop, crop)
        if position=='tr': return F.crop(image, 0, w-crop, crop, crop)
        if position=='bl': return F.crop(image, h-crop, 0, crop, crop)
        if position=='br': return F.crop(image, h-crop, w-crop, crop, crop)
        return image

    def resized_crop(self, image, crop_ratio=0.85):
        w,h = image.size
        crop = int(min(w,h)*crop_ratio)
        cropped = F.center_crop(image, [crop,crop])
        return cropped.resize((self.image_size,self.image_size), Image.BILINEAR)

    def color_jitter(self, image, seed_val):
        np.random.seed(seed_val % (2**32))
        b = 1.0 + np.random.uniform(-self.color_jitter_params['brightness'], self.color_jitter_params['brightness'])
        c = 1.0 + np.random.uniform(-self.color_jitter_params['contrast'], self.color_jitter_params['contrast'])
        s = 1.0 + np.random.uniform(-self.color_jitter_params['saturation'], self.color_jitter_params['saturation'])
        h = np.random.uniform(-self.color_jitter_params['hue'], self.color_jitter_params['hue'])
        img = F.adjust_brightness(image, b)
        img = F.adjust_contrast(img, c)
        img = F.adjust_saturation(img, s)
        img = F.adjust_hue(img, h)
        return img

    def affine_transform(self, image, seed_val):
        np.random.seed(seed_val % (2**32))
        tx = np.random.uniform(-self.translate_range[0], self.translate_range[0])
        ty = np.random.uniform(-self.translate_range[1], self.translate_range[1])
        return F.affine(image, angle=0, translate=(tx*image.width, ty*image.height), scale=1.0, shear=0.0)

    def gaussian_blur(self, image, seed_val):
        np.random.seed(seed_val % (2**32))
        sigma = np.random.uniform(self.blur_sigma[0], self.blur_sigma[1])
        return F.gaussian_blur(image, kernel_size=3, sigma=[sigma,sigma])

    def get_augmentations(self, image, num_augmentations=10, seed_source=None):
        if seed_source is None:
            seed_source = self._get_deterministic_seed(image)
        seed_val = seed_source
        augs = []
        augs.append(image.resize((self.image_size,self.image_size), Image.BILINEAR))
        augs.append(self.horizontal_flip(image).resize((self.image_size,self.image_size), Image.BILINEAR))
        for angle in self.rotation_angles[:max(0, min(4, num_augmentations-len(augs)))]:
            augs.append(self.rotation(image, angle).resize((self.image_size,self.image_size), Image.BILINEAR))
        corners = ['tl','tr','bl','br']
        for cpos in corners[:max(0, min(4, num_augmentations-len(augs)))]:
            augs.append(self.corner_crop(image, 0.9, cpos).resize((self.image_size,self.image_size), Image.BILINEAR))
        if len(augs) < num_augmentations:
            augs.append(self.center_crop(image, 0.9).resize((self.image_size,self.image_size), Image.BILINEAR))
        if len(augs) < num_augmentations:
            augs.append(self.resized_crop(image, 0.85))
        if len(augs) < num_augmentations:
            augs.append(self.color_jitter(image, seed_val).resize((self.image_size,self.image_size), Image.BILINEAR))
        if len(augs) < num_augmentations:
            augs.append(self.affine_transform(image, seed_val+1).resize((self.image_size,self.image_size), Image.BILINEAR))
        if len(augs) < num_augmentations:
            augs.append(self.gaussian_blur(image, seed_val+2).resize((self.image_size,self.image_size), Image.BILINEAR))
        return augs[:num_augmentations]

augmentation_system = DeterministicAugmentation(image_size=224, seed=42)
tta_aug = augmentation_system
print('Deterministic augmentation ready.')



Deterministic augmentation ready.


## Statistical Feature Extraction (incl. original_mode)


In [6]:
def extract_image_properties(image_path):
    try:
        img = Image.open(image_path)
        mode_mapping = {'L':0,'LA':1,'P':2,'RGB':3,'RGBA':4}
        original_mode = float(mode_mapping.get(img.mode, 3))
        # Normalize image to RGB for pixel stats
        if img.mode == 'P':
            img = img.convert('RGBA')
        if img.mode == 'RGBA':
            bg = Image.new('RGB', img.size, (255,255,255))
            bg.paste(img, mask=img.split()[3])
            img = bg
        elif img.mode != 'RGB':
            img = img.convert('RGB')
        w,h = img.size
        ar = w / h if h else 0.0
        pix = float(w*h)
        arr = np.array(img)
        mean_r = float(arr[:,:,0].mean()); mean_g = float(arr[:,:,1].mean()); mean_b = float(arr[:,:,2].mean())
        std_r = float(arr[:,:,0].std());  std_g  = float(arr[:,:,1].std());  std_b  = float(arr[:,:,2].std())
        brightness = float((mean_r+mean_g+mean_b)/3.0)
        is_mostly_white = float(brightness > 200)
        return {
            'width': float(w), 'height': float(h), 'aspect_ratio': float(ar), 'pixel_count': pix,
            'mean_r': mean_r, 'mean_g': mean_g, 'mean_b': mean_b,
            'std_r': std_r, 'std_g': std_g, 'std_b': std_b,
            'brightness': brightness, 'is_mostly_white': is_mostly_white,
            'original_mode': original_mode
        }
    except Exception as e:
        print('Feature extraction error:', image_path, e)
        return {
            'width':224.0,'height':224.0,'aspect_ratio':1.0,'pixel_count':50176.0,
            'mean_r':128.0,'mean_g':128.0,'mean_b':128.0,
            'std_r':50.0,'std_g':50.0,'std_b':50.0,
            'brightness':128.0,'is_mostly_white':0.0,'original_mode':3.0
        }

STATISTICAL_FEATURE_COLS = [
    'width','height','aspect_ratio','pixel_count',
    'mean_r','mean_g','mean_b','std_r','std_g','std_b',
    'brightness','is_mostly_white','original_mode'
]
print('Num statistical features:', len(STATISTICAL_FEATURE_COLS))



Num statistical features: 13


## Load DINOv2-Large


In [7]:
processor = AutoImageProcessor.from_pretrained(MODEL_ID)
base_model = AutoModelForImageClassification.from_pretrained(MODEL_ID).to(device)
print('Loaded:', MODEL_ID)



Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
Some weights of Dinov2ForImageClassification were not initialized from the model checkpoint at facebook/dinov2-small and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loaded: facebook/dinov2-small


## Dataset


In [8]:
def load_image_rgb(path):
    img = Image.open(path)
    if img.mode == 'P':
        img = img.convert('RGBA')
    if img.mode == 'RGBA':
        bg = Image.new('RGB', img.size, (255,255,255))
        bg.paste(img, mask=img.split()[3])
        img = bg
    elif img.mode != 'RGB':
        img = img.convert('RGB')
    return img

class EmojiDataset(Dataset):
    def __init__(self, image_paths, labels, processor):
        self.image_paths = list(image_paths)
        self.labels = list(labels)
        self.processor = processor
    def __len__(self):
        return len(self.image_paths)
    def __getitem__(self, idx):
        p = self.image_paths[idx]
        y = int(self.labels[idx])
        img = load_image_rgb(p)
        inputs = self.processor(img, return_tensors='pt')
        pixel_values = inputs['pixel_values'].squeeze(0)
        y = int(max(0, min(y, len(VENDOR_CLASSES)-1)))
        return {'pixel_values': pixel_values, 'labels': torch.tensor(y, dtype=torch.long)}



## Model Head (DINOv2 features → 7 classes)


In [9]:
class DINOv2ForEmojiClassification(nn.Module):
    def __init__(self, num_labels):
        super().__init__()
        self.base_model = base_model
        self.num_labels = num_labels
        hidden = getattr(base_model.config, 'hidden_size', 1024)
        self.classifier = nn.Sequential(
            nn.LayerNorm(hidden),
            nn.Dropout(0.3),
            nn.Linear(hidden, hidden//2),
            nn.GELU(),
            nn.LayerNorm(hidden//2),
            nn.Dropout(0.2),
            nn.Linear(hidden//2, num_labels)
        )
    def forward(self, pixel_values, labels=None):
        out = self.base_model(pixel_values=pixel_values, output_hidden_states=True)
        if hasattr(out, 'pooler_output') and out.pooler_output is not None:
            pooled = out.pooler_output
        else:
            pooled = out.hidden_states[-1][:,0,:]
        logits = self.classifier(pooled)
        loss = None
        if labels is not None:
            labels = torch.clamp(labels, 0, self.num_labels-1)
            loss = nn.CrossEntropyLoss(label_smoothing=0.1)(logits.view(-1, self.num_labels), labels.view(-1))
        return ImageClassifierOutput(loss=loss, logits=logits)

classification_model = DINOv2ForEmojiClassification(num_labels=len(VENDOR_CLASSES)).to(device)
print('Classification model ready.')



Classification model ready.


## Train / Validate


In [10]:
def train_epoch(model, loader, optimizer, device, scaler=None):
    model.train()
    total_loss=0.0; correct=0; total=0
    use_amp = (device.type=='cuda')
    for batch in tqdm(loader, desc='Training'):
        x = batch['pixel_values'].to(device, non_blocking=True)
        y = batch['labels'].to(device, non_blocking=True)
        y = torch.clamp(y, 0, model.num_labels-1)
        optimizer.zero_grad(set_to_none=True)
        with torch.amp.autocast('cuda', enabled=use_amp):
            out = model(pixel_values=x, labels=y)
            loss = out.loss
        if scaler is not None:
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
        else:
            loss.backward()
            optimizer.step()
        total_loss += float(loss.item())
        pred = torch.argmax(out.logits, dim=1)
        correct += int((pred==y).sum().item())
        total += int(y.size(0))
    return total_loss/max(1,len(loader)), 100.0*correct/max(1,total)

@torch.no_grad()
def validate(model, loader, device):
    model.eval()
    total_loss=0.0; correct=0; total=0
    preds=[]; labels=[]
    use_amp = (device.type=='cuda')
    for batch in tqdm(loader, desc='Validation'):
        x = batch['pixel_values'].to(device, non_blocking=True)
        y = batch['labels'].to(device, non_blocking=True)
        y = torch.clamp(y, 0, model.num_labels-1)
        with torch.amp.autocast('cuda', enabled=use_amp):
            out = model(pixel_values=x, labels=y)
            loss = out.loss
        total_loss += float(loss.item())
        pred = torch.argmax(out.logits, dim=1)
        pred = torch.clamp(pred, 0, model.num_labels-1)
        correct += int((pred==y).sum().item())
        total += int(y.size(0))
        preds.extend(pred.cpu().numpy().tolist())
        labels.extend(y.cpu().numpy().tolist())
    return total_loss/max(1,len(loader)), 100.0*correct/max(1,total), preds, labels



## TTA predictions → separate columns


In [11]:
@torch.no_grad()
def predict_with_tta_separate(model, image, processor, tta_aug, num_augmentations, device):
    model.eval()
    augs = tta_aug.get_augmentations(image, num_augmentations=num_augmentations)
    preds = []
    for aug in augs:
        inp = processor(aug, return_tensors='pt')
        x = inp['pixel_values'].to(device)
        out = model(pixel_values=x)
        pred = int(torch.argmax(out.logits, dim=-1).item())
        preds.append(max(0, min(pred, len(VENDOR_CLASSES)-1)))
    return preds



## Feature matrix for XGBoost (stats + dino_pred_0..N-1)


In [12]:
def generate_features_for_xgboost(image_paths, model, processor, tta_aug, device, num_augmentations, loadin_bar = True):
    if loadin_bar:
        image_paths = tqdm(image_paths, desc='Extracting features')
    rows = []
    for p in image_paths:
        stats = extract_image_properties(p)
        img = load_image_rgb(p)
        dino_preds = predict_with_tta_separate(model, img, processor, tta_aug, num_augmentations, device)
        row = dict(stats)
        for i,dp in enumerate(dino_preds):
            row[f'dino_pred_{i}'] = float(dp)
        rows.append(row)
    df = pd.DataFrame(rows)
    dino_cols = [f'dino_pred_{i}' for i in range(num_augmentations)]
    all_cols = STATISTICAL_FEATURE_COLS + dino_cols
    for c in all_cols:
        if c not in df.columns:
            df[c] = 0.0
    return df[all_cols]

print('XGBoost feature dims:', len(STATISTICAL_FEATURE_COLS) + NUM_TTA_AUGS)


XGBoost feature dims: 23


## Load 2nd dataset (CSV)


In [13]:
def prepare_dataset_from_csv(train_dir, csv_path):
    train_dir = Path(train_dir); csv_path = Path(csv_path)
    if not train_dir.exists() or not csv_path.exists():
        raise FileNotFoundError(f'Missing train_dir or csv: {train_dir} / {csv_path}')
    df = pd.read_csv(csv_path)
    label_map = {v: VENDOR_TO_IDX[v] for v in VENDOR_CLASSES}
    img_paths=[]; labels=[]
    missing=0; unmapped=0
    for _, r in df.iterrows():
        img_id = str(r['Id']).zfill(5)
        lab = str(r['Label']).lower()
        if lab not in label_map:
            unmapped += 1
            continue
        found = None
        for ext in ('.png','.jpg','.jpeg','.PNG','.JPG','.JPEG'):
            p = train_dir / f'{img_id}{ext}'
            if p.exists():
                found = str(p)
                break
        if found is None:
            missing += 1
            continue
        img_paths.append(found)
        labels.append(int(label_map[lab]))
    print('Loaded:', len(img_paths), 'images')
    print('Unmapped labels skipped:', unmapped, 'Missing files skipped:', missing)
    if labels:
        print('Label distribution:', np.bincount(np.array(labels), minlength=len(VENDOR_CLASSES)))
    return img_paths, labels

second_paths, second_labels = prepare_dataset_from_csv(SECOND_DATASET_TRAIN_DIR, SECOND_DATASET_CSV_PATH)



Loaded: 9879 images
Unmapped labels skipped: 0 Missing files skipped: 0
Label distribution: [1924 1877 1644 1667 1790  397  580]


## Split (stratified)


In [14]:
min_count = np.bincount(np.array(second_labels), minlength=len(VENDOR_CLASSES)).min()
can_stratify = (min_count >= 2)
print('Min class count:', int(min_count), 'Stratify:', can_stratify)
train_paths, val_paths, train_y, val_y = train_test_split(
    second_paths, second_labels, test_size=VAL_SIZE, random_state=RANDOM_STATE,
    stratify=second_labels if can_stratify else None
)
print('Train:', len(train_paths), 'Val:', len(val_paths))
print('Train dist:', np.bincount(np.array(train_y), minlength=len(VENDOR_CLASSES)))
print('Val   dist:', np.bincount(np.array(val_y), minlength=len(VENDOR_CLASSES)))



Min class count: 397 Stratify: True
Train: 8891 Val: 988
Train dist: [1732 1689 1480 1500 1611  357  522]
Val   dist: [192 188 164 167 179  40  58]


## Train DINOv2 (2nd dataset only)


In [15]:
train_ds = EmojiDataset(train_paths, train_y, processor)
val_ds = EmojiDataset(val_paths, val_y, processor)
batch_size = BATCH_SIZE_CUDA if torch.cuda.is_available() else BATCH_SIZE_CPU
train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, num_workers=2, pin_memory=torch.cuda.is_available())
val_loader = DataLoader(val_ds, batch_size=batch_size, shuffle=False, num_workers=2, pin_memory=torch.cuda.is_available())

optimizer = torch.optim.AdamW(classification_model.parameters(), lr=LEARNING_RATE, weight_decay=0.01)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=2, min_lr=1e-7, cooldown=1)

scaler = None
if torch.cuda.is_available() and (not torch.cuda.is_bf16_supported()):
    scaler = torch.cuda.amp.GradScaler()

best_acc = 0.0
for epoch in range(NUM_EPOCHS):
    print(f'\nEpoch {epoch+1}/{NUM_EPOCHS}')
    tr_loss, tr_acc = train_epoch(classification_model, train_loader, optimizer, device, scaler)
    va_loss, va_acc, va_pred, va_true = validate(classification_model, val_loader, device)
    scheduler.step(va_acc)
    print(f'Train: loss={tr_loss:.4f} acc={tr_acc:.2f}%')
    print(f'Val  : loss={va_loss:.4f} acc={va_acc:.2f}%')
    if va_acc > best_acc:
        best_acc = va_acc
        torch.save(classification_model.state_dict(), 'best_dino_v8.pt')
        print('✓ saved best_dino_v8.pt')

print('Best val acc:', best_acc)
if os.path.exists('best_dino_v8.pt'):
    classification_model.load_state_dict(torch.load('best_dino_v8.pt', map_location=device))
    print('Loaded best_dino_v8.pt')

print('Validation report (DINO only, no XGB):')
print(classification_report(va_true, va_pred, target_names=VENDOR_CLASSES))




Epoch 1/7


Training: 100%|██████████| 1112/1112 [01:03<00:00, 17.43it/s]
Validation: 100%|██████████| 124/124 [00:02<00:00, 52.02it/s]


Train: loss=0.8197 acc=83.59%
Val  : loss=0.6135 acc=93.83%
✓ saved best_dino_v8.pt

Epoch 2/7


Training: 100%|██████████| 1112/1112 [01:02<00:00, 17.81it/s]
Validation: 100%|██████████| 124/124 [00:02<00:00, 52.68it/s]


Train: loss=0.6059 acc=93.92%
Val  : loss=0.5728 acc=95.75%
✓ saved best_dino_v8.pt

Epoch 3/7


Training: 100%|██████████| 1112/1112 [01:02<00:00, 17.85it/s]
Validation: 100%|██████████| 124/124 [00:02<00:00, 51.88it/s]


Train: loss=0.5746 acc=95.02%
Val  : loss=0.6139 acc=92.51%

Epoch 4/7


Training: 100%|██████████| 1112/1112 [01:03<00:00, 17.61it/s]
Validation: 100%|██████████| 124/124 [00:02<00:00, 50.98it/s]


Train: loss=0.5535 acc=95.76%
Val  : loss=0.5685 acc=95.34%

Epoch 5/7


Training: 100%|██████████| 1112/1112 [01:02<00:00, 17.77it/s]
Validation: 100%|██████████| 124/124 [00:02<00:00, 52.88it/s]


Train: loss=0.5377 acc=96.37%
Val  : loss=0.5687 acc=95.34%

Epoch 6/7


Training: 100%|██████████| 1112/1112 [01:02<00:00, 17.93it/s]
Validation: 100%|██████████| 124/124 [00:02<00:00, 51.90it/s]


Train: loss=0.4888 acc=98.61%
Val  : loss=0.5332 acc=97.17%
✓ saved best_dino_v8.pt

Epoch 7/7


Training: 100%|██████████| 1112/1112 [01:03<00:00, 17.60it/s]
Validation: 100%|██████████| 124/124 [00:02<00:00, 51.95it/s]
  classification_model.load_state_dict(torch.load('best_dino_v8.pt', map_location=device))


Train: loss=0.4766 acc=99.17%
Val  : loss=0.5596 acc=95.85%
Best val acc: 97.16599190283401
Loaded best_dino_v8.pt
Validation report (DINO only, no XGB):
              precision    recall  f1-score   support

       apple       0.91      0.98      0.95       192
      google       0.95      0.95      0.95       188
    whatsapp       0.97      0.95      0.96       164
    facebook       0.99      0.94      0.97       167
     samsung       0.97      0.96      0.97       179
     mozilla       0.95      0.93      0.94        40
   messenger       0.98      0.98      0.98        58

    accuracy                           0.96       988
   macro avg       0.96      0.96      0.96       988
weighted avg       0.96      0.96      0.96       988



## Train XGBoost on top (TTA-columns + stats)


In [16]:
X_train = generate_features_for_xgboost(train_paths, classification_model, processor, tta_aug, device, NUM_TTA_AUGS)
X_val = generate_features_for_xgboost(val_paths, classification_model, processor, tta_aug, device, NUM_TTA_AUGS)
y_train = np.array(train_y)
y_val = np.array(val_y)
print('X_train:', X_train.shape, 'X_val:', X_val.shape)



Extracting features: 100%|██████████| 8891/8891 [17:06<00:00,  8.66it/s]
Extracting features: 100%|██████████| 988/988 [01:53<00:00,  8.69it/s]

X_train: (8891, 23) X_val: (988, 23)





In [17]:
xgb_model = xgb.XGBClassifier(**XGB_PARAMS)

# Train (no per-iteration printing)
xgb_model.fit(
    X_train,
    y_train,
    eval_set=[(X_val, y_val)],
    verbose=False
)

# Save (workaround for your xgboost sklearn wrapper bug)
xgb_model.get_booster().save_model('xgb_v8.json')
print('Saved xgb_v8.json')

# Evaluate
val_pred = xgb_model.predict(X_val)
print('XGB val acc:', accuracy_score(y_val, val_pred) * 100.0)
print(classification_report(y_val, val_pred, target_names=VENDOR_CLASSES))

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Saved xgb_v8.json
XGB val acc: 97.06477732793523
              precision    recall  f1-score   support

       apple       0.93      0.97      0.95       192
      google       0.96      0.97      0.97       188
    whatsapp       0.98      0.97      0.98       164
    facebook       0.98      0.97      0.98       167
     samsung       0.99      0.97      0.98       179
     mozilla       1.00      0.93      0.96        40
   messenger       1.00      0.98      0.99        58

    accuracy                           0.97       988
   macro avg       0.98      0.97      0.97       988
weighted avg       0.97      0.97      0.97       988



## Final fit on Train+Val, then predict Test


In [18]:
# ... existing code above ...

combined_paths = train_paths + val_paths
combined_y = train_y + val_y

combined_ds = EmojiDataset(combined_paths, combined_y, processor)
combined_loader = DataLoader(
    combined_ds,
    batch_size=batch_size,
    shuffle=True,
    num_workers=2,
    pin_memory=torch.cuda.is_available()
)

final_epochs = 2
final_lr = LEARNING_RATE / 2
final_opt = torch.optim.AdamW(classification_model.parameters(), lr=final_lr, weight_decay=0.01)

for ep in range(final_epochs):
    print(f"\nFinal epoch {ep+1}/{final_epochs}")
    tr_loss, tr_acc = train_epoch(classification_model, combined_loader, final_opt, device, scaler)
    print(f"Final train: loss={tr_loss:.4f} acc={tr_acc:.2f}%")



Final epoch 1/2


Training: 100%|██████████| 1235/1235 [01:08<00:00, 18.05it/s]


Final train: loss=0.4877 acc=98.75%

Final epoch 2/2


Training: 100%|██████████| 1235/1235 [01:09<00:00, 17.68it/s]

Final train: loss=0.4847 acc=98.77%





In [19]:
torch.save(classification_model.state_dict(), "best_dino_v8_final.pt")
print("Saved best_dino_v8_final.pt")

X_all = generate_features_for_xgboost(
    combined_paths, classification_model, processor, tta_aug, device, NUM_TTA_AUGS
)
y_all = np.array(combined_y)

xgb_final = xgb.XGBClassifier(**XGB_PARAMS)
xgb_final.fit(X_all, y_all, verbose=False)

# ✅ FIX: save via Booster to avoid `_estimator_type` crash in your xgboost build
xgb_final.get_booster().save_model("xgb_v8_final.json")
print("Saved xgb_v8_final.json")

test_dir = SECOND_DATASET_TEST_DIR
if not test_dir.exists():
    raise FileNotFoundError(f"Missing test dir: {test_dir}")

test_paths = []
for ext in (".png", ".jpg", ".jpeg", ".PNG", ".JPG", ".JPEG"):
    test_paths += [str(p) for p in test_dir.rglob(f"*{ext}")]
test_paths = sorted(set(test_paths))
print("Found test images:", len(test_paths))

pred_labels = []
pred_ids = []
for p in tqdm(test_paths, desc="Predicting test"):
    img_id = Path(p).stem
    Xp = generate_features_for_xgboost([p], classification_model, processor, tta_aug, device, NUM_TTA_AUGS, False)
    pred = int(xgb_final.predict(Xp)[0])
    pred = max(0, min(pred, len(VENDOR_CLASSES) - 1))
    pred_ids.append(img_id)
    pred_labels.append(IDX_TO_VENDOR[pred])

out_path = Path(PREDICTIONS_OUTPUT_FILE)
with out_path.open("w") as f:
    f.write("Id,Label\n")
    for i, l in zip(pred_ids, pred_labels):
        f.write(f"{str(i).strip()},{l}\n")

print("Wrote:", out_path, "rows:", len(pred_labels))

Saved best_dino_v8_final.pt


Extracting features: 100%|██████████| 9879/9879 [18:56<00:00,  8.69it/s]
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Saved xgb_v8_final.json
Found test images: 9879


Predicting test: 100%|██████████| 9879/9879 [19:43<00:00,  8.35it/s]

Wrote: predictions.csv rows: 9879



