# Fine-tuning Swin Transformer for Emoji Vendor Classification

This notebook fine-tunes a Swin Transformer model to classify emoji images by vendor (Apple, DoCoMo, Facebook, Gmail, Google, JoyPixels, KDDI, Samsung, SoftBank, Twitter, Windows).


In [None]:
# Install required packages
%pip install -q pydrive2 google-api-python-client google-auth-httplib2 google-auth-oauthlib

import os
import sys

# Detect if running on Kaggle or Colab
IS_KAGGLE = 'KAGGLE_KERNEL_RUN_TYPE' in os.environ
IS_COLAB = 'COLAB_GPU' in os.environ or 'google.colab' in sys.modules

print(f"Running on: {'Kaggle' if IS_KAGGLE else 'Colab' if IS_COLAB else 'Local'}")


In [1]:
# Install required packages
!pip install -q kagglehub transformers torch torchvision pillow datasets accelerate


In [2]:
import kagglehub
import os
from pathlib import Path
import torch
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import json
from transformers import AutoModel, AutoImageProcessor, Trainer, TrainingArguments
from transformers.modeling_outputs import ImageClassifierOutput
import torch.nn as nn
from datasets import Dataset as HFDataset
from sklearn.model_selection import train_test_split
import numpy as np
from tqdm import tqdm

# GPU Setup - Ensure everything runs on GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"CUDA Version: {torch.version.cuda}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")
    # Clear GPU cache
    torch.cuda.empty_cache()
else:
    print("WARNING: CUDA not available. Training will be slow on CPU.")


Using device: cuda
GPU: Tesla T4
CUDA Version: 12.6
GPU Memory: 14.74 GB


## Download Dataset


In [3]:
# Download latest version
path = kagglehub.dataset_download("subinium/emojiimage-dataset")

print("Path to dataset files:", path)


Downloading from https://www.kaggle.com/api/v1/datasets/download/subinium/emojiimage-dataset?dataset_version_number=2...


100%|██████████| 47.6M/47.6M [00:00<00:00, 131MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/subinium/emojiimage-dataset/versions/2


## Load Model


In [4]:
# Load model directly to GPU
from transformers import AutoModel

# Load model with appropriate dtype for GPU
if torch.cuda.is_available():
    # Use float16 for faster training on GPU (or bfloat16 if supported)
    if torch.cuda.is_bf16_supported():
        model_dtype = torch.bfloat16
    else:
        model_dtype = torch.float16
    print(f"Loading model with dtype: {model_dtype}")
else:
    model_dtype = torch.float32

model = AutoModel.from_pretrained(
    "timm/swin_base_patch4_window12_384.ms_in22k",
    torch_dtype=model_dtype
)

# Move model to GPU immediately
model = model.to(device)
model.eval()  # Set to eval mode initially

# Load image processor
processor = AutoImageProcessor.from_pretrained("timm/swin_base_patch4_window12_384.ms_in22k")

print("Model loaded successfully!")
print(f"Model device: {next(model.parameters()).device}")
print(f"Model dtype: {next(model.parameters()).dtype}")

# Verify GPU memory usage
if torch.cuda.is_available():
    print(f"GPU Memory allocated: {torch.cuda.memory_allocated(0) / 1024**3:.2f} GB")
    print(f"GPU Memory reserved: {torch.cuda.memory_reserved(0) / 1024**3:.2f} GB")


Loading model with dtype: torch.bfloat16


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/644 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/451M [00:00<?, ?B/s]

Model loaded successfully!
Model device: cuda:0
Model dtype: torch.bfloat16
GPU Memory allocated: 0.17 GB
GPU Memory reserved: 0.20 GB


## Explore Dataset Structure


In [5]:
# Explore the dataset directory structure
dataset_path = Path(path)
print("Dataset contents:")
for item in sorted(dataset_path.iterdir()):
    print(f"  {item.name}")

# Check for subdirectories
if dataset_path.is_dir():
    subdirs = [d for d in dataset_path.iterdir() if d.is_dir()]
    if subdirs:
        print("\nSubdirectories found:")
        for subdir in subdirs[:5]:  # Show first 5
            print(f"  {subdir.name}")
            # Check if it contains images
            images = list(subdir.glob("*.png")) + list(subdir.glob("*.jpg")) + list(subdir.glob("*.jpeg"))
            if images:
                print(f"    - Contains {len(images)} images")


Dataset contents:
  full_emoji.csv
  image

Subdirectories found:
  image


## Define Vendor Classes and Dataset Class


In [6]:
# Define vendor classes
VENDOR_CLASSES = [
    "Apple",
    "DoCoMo",
    "Facebook",
    "Gmail",
    "Google",
    "JoyPixels",
    "KDDI",
    "Samsung",
    "SoftBank",
    "Twitter",
    "Windows"
]

VENDOR_TO_IDX = {vendor: idx for idx, vendor in enumerate(VENDOR_CLASSES)}
IDX_TO_VENDOR = {idx: vendor for vendor, idx in VENDOR_TO_IDX.items()}

print(f"Number of vendor classes: {len(VENDOR_CLASSES)}")
print("Vendor classes:", VENDOR_CLASSES)


Number of vendor classes: 11
Vendor classes: ['Apple', 'DoCoMo', 'Facebook', 'Gmail', 'Google', 'JoyPixels', 'KDDI', 'Samsung', 'SoftBank', 'Twitter', 'Windows']


In [7]:
class EmojiDataset(Dataset):
    def __init__(self, image_paths, labels, processor, transform=None):
        self.image_paths = image_paths
        self.labels = labels
        self.processor = processor
        self.transform = transform

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        image_path = self.image_paths[idx]
        label = self.labels[idx]

        # Load image
        try:
            image = Image.open(image_path).convert('RGB')
        except Exception as e:
            print(f"Error loading image {image_path}: {e}")
            # Return a blank image if loading fails
            image = Image.new('RGB', (384, 384), color='white')

        # Process image with the processor
        inputs = self.processor(image, return_tensors="pt")
        pixel_values = inputs['pixel_values'].squeeze(0)  # Remove batch dimension

        return {
            'pixel_values': pixel_values,
            'labels': torch.tensor(label, dtype=torch.long)
        }


## Prepare Dataset


In [8]:
def prepare_dataset(dataset_path):
    """Prepare dataset by finding all images and their corresponding vendor labels."""
    image_paths = []
    labels = []

    dataset_path = Path(dataset_path)

    # Common image extensions
    image_extensions = {'.png', '.jpg', '.jpeg', '.PNG', '.JPG', '.JPEG'}

    # Strategy 1: Check if vendor names are in directory names
    for vendor in VENDOR_CLASSES:
        vendor_dir = dataset_path / vendor
        if vendor_dir.exists() and vendor_dir.is_dir():
            for ext in image_extensions:
                images = list(vendor_dir.glob(f"*{ext}"))
                for img_path in images:
                    image_paths.append(str(img_path))
                    labels.append(VENDOR_TO_IDX[vendor])

    # Strategy 2: Check if vendor names are in filenames
    if len(image_paths) == 0:
        for ext in image_extensions:
            all_images = list(dataset_path.rglob(f"*{ext}"))
            for img_path in all_images:
                filename = img_path.name.lower()
                for vendor in VENDOR_CLASSES:
                    if vendor.lower() in filename or vendor.lower() in str(img_path.parent).lower():
                        image_paths.append(str(img_path))
                        labels.append(VENDOR_TO_IDX[vendor])
                        break

    # Strategy 3: If still empty, check all subdirectories
    if len(image_paths) == 0:
        for subdir in dataset_path.iterdir():
            if subdir.is_dir():
                vendor_name = subdir.name
                # Try to match vendor name
                matched = False
                for vendor in VENDOR_CLASSES:
                    if vendor.lower() in vendor_name.lower() or vendor_name.lower() in vendor.lower():
                        for ext in image_extensions:
                            images = list(subdir.glob(f"*{ext}"))
                            for img_path in images:
                                image_paths.append(str(img_path))
                                labels.append(VENDOR_TO_IDX[vendor])
                        matched = True
                        break

                # If no match, check if it's a nested structure
                if not matched:
                    for vendor in VENDOR_CLASSES:
                        vendor_subdir = subdir / vendor
                        if vendor_subdir.exists():
                            for ext in image_extensions:
                                images = list(vendor_subdir.glob(f"*{ext}"))
                                for img_path in images:
                                    image_paths.append(str(img_path))
                                    labels.append(VENDOR_TO_IDX[vendor])

    return image_paths, labels

# Prepare dataset
image_paths, labels = prepare_dataset(path)

print(f"Found {len(image_paths)} images")
print(f"Labels distribution: {np.bincount(labels)}")

# Show some examples
if len(image_paths) > 0:
    print("\nSample image paths:")
    for i in range(min(5, len(image_paths))):
        print(f"  {image_paths[i]} -> {IDX_TO_VENDOR[labels[i]]}")
else:
    print("WARNING: No images found! Please check the dataset structure.")


Found 14253 images
Labels distribution: [1813  251 1727  720 1816 1816  637 1724  476 1816 1457]

Sample image paths:
  /root/.cache/kagglehub/datasets/subinium/emojiimage-dataset/versions/2/image/DoCoMo/1513.png -> DoCoMo
  /root/.cache/kagglehub/datasets/subinium/emojiimage-dataset/versions/2/image/DoCoMo/1063.png -> DoCoMo
  /root/.cache/kagglehub/datasets/subinium/emojiimage-dataset/versions/2/image/DoCoMo/94.png -> DoCoMo
  /root/.cache/kagglehub/datasets/subinium/emojiimage-dataset/versions/2/image/DoCoMo/1473.png -> DoCoMo
  /root/.cache/kagglehub/datasets/subinium/emojiimage-dataset/versions/2/image/DoCoMo/126.png -> DoCoMo


In [12]:
class SwinForEmojiClassification(nn.Module):
    def __init__(self, num_labels=len(VENDOR_CLASSES)):
        super().__init__()
        self.swin = model
        self.num_labels = num_labels

        # Get the hidden size from the model config
        # For TimmBackbone (used by AutoModel for timm models), access num_features from the wrapped timm_model
        hidden_size = self.swin.timm_model.num_features

        # Classification head
        self.classifier = nn.Sequential(
            nn.LayerNorm(hidden_size),
            nn.Dropout(0.1),
            nn.Linear(hidden_size, num_labels)
        )

    def forward(self, pixel_values, labels=None):
        # Get embeddings from Swin
        outputs = self.swin(pixel_values=pixel_values)

        # Use pooler_output if available, otherwise use last_hidden_state mean
        if hasattr(outputs, 'pooler_output') and outputs.pooler_output is not None:
            pooled_output = outputs.pooler_output
        else:
            # Mean pooling over sequence dimension
            pooled_output = outputs.last_hidden_state.mean(dim=1)

        # Classification
        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        return ImageClassifierOutput(
            loss=loss,
            logits=logits
        )

# Create the model (base model is already on GPU)
classification_model = SwinForEmojiClassification(num_labels=len(VENDOR_CLASSES))

# Ensure entire model is on GPU
classification_model = classification_model.to(device)

# Verify all parameters are on GPU
all_on_gpu = all(p.device.type == 'cuda' for p in classification_model.parameters())
print(f"Model created and moved to {device}")
print(f"All parameters on GPU: {all_on_gpu}")
print(f"Total parameters: {sum(p.numel() for p in classification_model.parameters()):,}")

# GPU memory info
if torch.cuda.is_available():
    print(f"GPU Memory allocated: {torch.cuda.memory_allocated(0) / 1024**3:.2f} GB")
    print(f"GPU Memory reserved: {torch.cuda.memory_reserved(0) / 1024**3:.2f} GB")


Model created and moved to cuda
All parameters on GPU: True
Total parameters: 86,891,907
GPU Memory allocated: 0.17 GB
GPU Memory reserved: 0.20 GB


## Split Dataset and Create DataLoaders


In [23]:
# Split dataset into train and validation
if len(image_paths) > 0:
    train_paths, val_paths, train_labels, val_labels = train_test_split(
        image_paths, labels, test_size=0.2, random_state=42, stratify=labels
    )

    # Create datasets
    train_dataset = EmojiDataset(train_paths, train_labels, processor)
    val_dataset = EmojiDataset(val_paths, val_labels, processor)

    # Create data loaders with GPU optimizations
    # Use larger batch size on GPU, pin_memory for faster GPU transfer
    batch_size = 8 if torch.cuda.is_available() else 8 # Reduced batch size to 8

    train_loader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=4 if torch.cuda.is_available() else 2,
        pin_memory=torch.cuda.is_available(),  # Pin memory for faster GPU transfer
        persistent_workers=True if torch.cuda.is_available() else False
    )

    val_loader = DataLoader(
        val_dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=4 if torch.cuda.is_available() else 2,
        pin_memory=torch.cuda.is_available(),
        persistent_workers=True if torch.cuda.is_available() else False
    )

    print(f"Train samples: {len(train_dataset)}")
    print(f"Validation samples: {len(val_dataset)}")
    print(f"Batch size: {batch_size}")
    print(f"Pin memory: {torch.cuda.is_available()}")
else:
    print("ERROR: No images found. Cannot create data loaders.")


Train samples: 11402
Validation samples: 2851
Batch size: 8
Pin memory: True




## Training Setup


In [24]:
# Training parameters
num_epochs = 5
learning_rate = 2e-5

# Optimizer and scheduler
optimizer = torch.optim.AdamW(
    classification_model.parameters(),
    lr=learning_rate,
    weight_decay=0.01
)

# Learning rate scheduler
from torch.optim.lr_scheduler import CosineAnnealingLR
scheduler = CosineAnnealingLR(optimizer, T_max=num_epochs)

# Mixed precision scaler for GPU training
# GradScaler is typically used for float16 to prevent underflow/overflow.
# For bfloat16, which has a wider dynamic range, it's often not needed.
scaler = None
if torch.cuda.is_available():
    # model_dtype is set in cell xJsll1CjjO7A
    if model_dtype == torch.float16:
        scaler = torch.cuda.amp.GradScaler()
        print("Mixed precision training: Enabled (float16 with GradScaler)")
    elif model_dtype == torch.bfloat16:
        # For bfloat16, we use autocast but often don't need a GradScaler.
        # The `train_epoch` function will correctly use autocast but skip scaler steps.
        print("Mixed precision training: Enabled (bfloat16 without GradScaler)")
    else:
        print("Mixed precision training: Disabled (GPU, non-fp16/bf16 dtype)")
else:
    print("Mixed precision training: Disabled (CPU)")

print("Training setup complete!")


Mixed precision training: Enabled (bfloat16 without GradScaler)
Training setup complete!


## Training Loop


In [None]:
def train_epoch(model, train_loader, optimizer, device, scaler=None):
    model.train()
    total_loss = 0
    correct = 0
    total = 0

    # Check if CUDA is available to enable autocast
    is_cuda_available = (device.type == 'cuda')

    progress_bar = tqdm(train_loader, desc="Training")
    for batch in progress_bar:
        # Move data to GPU (with pin_memory, this should be fast)
        pixel_values = batch['pixel_values'].to(device, non_blocking=True)
        labels = batch['labels'].to(device, non_blocking=True)

        optimizer.zero_grad()

        # Always use autocast if on CUDA, it handles casting inputs to model's precision
        with torch.cuda.amp.autocast(enabled=is_cuda_available):
            outputs = model(pixel_values=pixel_values, labels=labels)
            loss = outputs.loss

        if scaler is not None:
            # For float16, use scaler
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
        else:
            # For bfloat16 or CPU, standard backward pass
            loss.backward()
            optimizer.step()

        total_loss += loss.item()
        _, predicted = torch.max(outputs.logits, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

        progress_bar.set_postfix({
            'loss': loss.item(),
            'acc': f'{100 * correct / total:.2f}%'
        })

    # Clear GPU cache periodically
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    return total_loss / len(train_loader), 100 * correct / total

def validate(model, val_loader, device):
    model.eval()
    total_loss = 0
    correct = 0
    total = 0

    # Check if CUDA is available to enable autocast
    is_cuda_available = (device.type == 'cuda')

    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Validation"):
            pixel_values = batch['pixel_values'].to(device, non_blocking=True)
            labels = batch['labels'].to(device, non_blocking=True)

            # Always use autocast if on CUDA
            with torch.cuda.amp.autocast(enabled=is_cuda_available):
                outputs = model(pixel_values=pixel_values, labels=labels)
                loss = outputs.loss

            total_loss += loss.item()
            _, predicted = torch.max(outputs.logits, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    return total_loss / len(val_loader), 100 * correct / total

# Training
if len(image_paths) > 0:
    print("Starting training on GPU..." if torch.cuda.is_available() else "Starting training on CPU...")
    best_val_acc = 0

    for epoch in range(num_epochs):
        print(f"\nEpoch {epoch + 1}/{num_epochs}")
        print("-" * 50)

        train_loss, train_acc = train_epoch(classification_model, train_loader, optimizer, device, scaler)
        val_loss, val_acc = validate(classification_model, val_loader, device)

        scheduler.step()

        print(f"Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.2f}%")
        print(f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.2f}%")

        # GPU memory info
        if torch.cuda.is_available():
            print(f"GPU Memory: {torch.cuda.memory_allocated(0) / 1024**3:.2f} GB / {torch.cuda.memory_reserved(0) / 1024**3:.2f} GB")

        # Save best model
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            torch.save(classification_model.state_dict(), 'best_swin_emoji_model.pt')
            print(f"Saved best model with validation accuracy: {best_val_acc:.2f}%")

    print("\nTraining completed!")
else:
    print("ERROR: Cannot train without data.")


Starting training on GPU...

Epoch 1/5
--------------------------------------------------


  with torch.cuda.amp.autocast(enabled=is_cuda_available):
Training: 100%|██████████| 1426/1426 [09:01<00:00,  2.63it/s, loss=0.0597, acc=60.28%]
  with torch.cuda.amp.autocast(enabled=is_cuda_available):
Validation: 100%|██████████| 357/357 [00:52<00:00,  6.84it/s]


Train Loss: 1.1874, Train Acc: 60.28%
Val Loss: 0.6162, Val Acc: 77.17%
GPU Memory: 6.56 GB / 7.71 GB
Saved best model with validation accuracy: 77.17%

Epoch 2/5
--------------------------------------------------


Training:  19%|█▉        | 273/1426 [01:45<07:22,  2.60it/s, loss=0.571, acc=76.79%]

## Evaluation and Inference


In [None]:
# Load best model (ensure it's loaded to GPU)
if os.path.exists('best_swin_emoji_model.pt'):
    # Load state dict and ensure model is on GPU
    state_dict = torch.load('best_swin_emoji_model.pt', map_location=device)
    classification_model.load_state_dict(state_dict)
    classification_model = classification_model.to(device)
    print("Loaded best model for evaluation")
    print(f"Model device: {next(classification_model.parameters()).device}")

# Final evaluation
if len(image_paths) > 0:
    val_loss, val_acc = validate(classification_model, val_loader, device)
    print(f"\nFinal Validation Accuracy: {val_acc:.2f}%")

    # Per-class accuracy
    classification_model.eval()
    class_correct = [0] * len(VENDOR_CLASSES)
    class_total = [0] * len(VENDOR_CLASSES)

    with torch.no_grad():
        for batch in val_loader:
            pixel_values = batch['pixel_values'].to(device)
            labels = batch['labels'].to(device)

            outputs = classification_model(pixel_values=pixel_values)
            _, predicted = torch.max(outputs.logits, 1)

            for i in range(labels.size(0)):
                label = labels[i].item()
                class_total[label] += 1
                if predicted[i] == labels[i]:
                    class_correct[label] += 1

    print("\nPer-class accuracy:")
    for i, vendor in enumerate(VENDOR_CLASSES):
        if class_total[i] > 0:
            acc = 100 * class_correct[i] / class_total[i]
            print(f"  {vendor}: {acc:.2f}% ({class_correct[i]}/{class_total[i]})")


# 2nd Dataset