In [None]:
"""
DS-UA 301 Advanced Topics in Data Science
Homework 3 Solution

This file implements the solutions for Homework 3, focusing on:
- Problem 1: Learning Rate, Batch Size, FashionMNIST
- Problem 3: Transfer Learning

"""

import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, random_split
import torchvision
import torchvision.transforms as transforms
import torchvision.models as models
from torchvision.datasets import FashionMNIST
import time
import math
import os
import pandas as pd
from sklearn.metrics import accuracy_score
import copy

# Fix the NumPy and PyTorch RNG seeds so repeated runs draw the same numbers
np.random.seed(42)
torch.manual_seed(42)

# Prefer the first CUDA GPU when one is visible; otherwise stay on the CPU
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")


#                  Problem 1: Learning Rate, Batch Size, FashionMNIST      #

In [None]:


class LeNet5(nn.Module):
    """
    LeNet-5 style CNN mapping 1-channel 28x28 images to 10 class logits.

    Two conv + ReLU + 2x2 max-pool stages (6 then 16 feature maps) feed a
    three-layer fully connected head (120 -> 84 -> 10). No softmax is applied;
    `forward` returns raw logits.
    """

    def __init__(self):
        super().__init__()
        # Convolutional stages; padding=2 keeps the spatial size through conv1
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5, padding=2)
        self.conv2 = nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5)
        # Classifier head operating on the flattened 16 x 5 x 5 feature maps
        self.fc1 = nn.Linear(in_features=16 * 5 * 5, out_features=120)
        self.fc2 = nn.Linear(in_features=120, out_features=84)
        self.fc3 = nn.Linear(in_features=84, out_features=10)

    def forward(self, x):
        """Return the 10 class logits for a batch of images `x`."""
        # Each conv stage: convolution -> ReLU -> 2x2 max pooling
        for conv in (self.conv1, self.conv2):
            x = F.max_pool2d(F.relu(conv(x)), 2)

        # Collapse channel and spatial dimensions into one feature vector
        x = x.view(-1, 16 * 5 * 5)

        # Hidden fully connected layers use ReLU; the output layer is linear
        for hidden in (self.fc1, self.fc2):
            x = F.relu(hidden(x))
        return self.fc3(x)

def load_fashion_mnist(batch_size=64, seed=42):
    """
    Load FashionMNIST and build train / validation / test data loaders.

    The official training set is split 80/20 into train and validation
    subsets. The split is drawn from a dedicated generator seeded with `seed`,
    so every call with the same seed yields the same partition regardless of
    how much of the global RNG stream has been consumed. This keeps
    experiments that reload the data (e.g. with different batch sizes)
    comparable.

    Args:
        batch_size: number of samples per batch for all three loaders.
        seed: seed for the train/validation split; pass None to draw the
            split from the global PyTorch RNG instead.

    Returns:
        (train_loader, val_loader, test_loader). Only the training loader
        shuffles its samples.
    """
    # ToTensor scales pixels to [0, 1]; Normalize(0.5, 0.5) then maps them to [-1, 1]
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.5,), (0.5,))
    ])

    # Load training data
    train_set = FashionMNIST(root='./data', train=True, download=True, transform=transform)

    # Split into train and validation sets
    train_size = int(0.8 * len(train_set))
    val_size = len(train_set) - train_size
    if seed is None:
        train_dataset, val_dataset = random_split(train_set, [train_size, val_size])
    else:
        split_rng = torch.Generator().manual_seed(seed)
        train_dataset, val_dataset = random_split(
            train_set, [train_size, val_size], generator=split_rng
        )

    # Create data loaders
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=2)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=2)

    # Load test data
    test_set = FashionMNIST(root='./data', train=False, download=True, transform=transform)
    test_loader = DataLoader(test_set, batch_size=batch_size, shuffle=False, num_workers=2)

    return train_loader, val_loader, test_loader

class CyclicLR:
    """
    Cyclical Learning Rate scheduler with exponential decay
    Based on the paper: "Cyclical Learning Rates for Training Neural Networks"

    The learning rate follows a triangular wave between `base_lr` and
    `max_lr`; one full cycle lasts `2 * step_size` calls to `step()`. The
    amplitude of the wave is multiplied by `gamma ** iteration`, so
    `gamma=1.0` gives a plain triangular schedule.

    Args:
        optimizer: object exposing `param_groups`, a list of dicts whose
            'lr' (and optionally 'momentum') entries are overwritten.
        base_lr: lower bound of the cycle.
        max_lr: upper bound of the cycle before decay is applied.
        step_size: number of `step()` calls in half a cycle; must be > 0.
        gamma: per-iteration decay factor applied to the cycle amplitude.
        cycle_momentum: when True (default), `step()` also rewrites each
            group's 'momentum' entry.
        max_momentum: momentum approached at the end of the rising half of
            each cycle when `cycle_momentum` is True.

    Raises:
        ValueError: if `step_size` is not positive.
    """
    def __init__(self, optimizer, base_lr, max_lr, step_size, gamma=1.0,
                 cycle_momentum=True, max_momentum=0.99):
        if step_size <= 0:
            # step_size is a divisor in step(); reject it up front
            raise ValueError(f"step_size must be positive, got {step_size}")

        self.optimizer = optimizer
        self.base_lr = base_lr
        self.max_lr = max_lr
        self.step_size = step_size
        self.gamma = gamma
        self.clr_iterations = 0
        self.cycle_momentum = cycle_momentum
        self.max_momentum = max_momentum

        if self.cycle_momentum:
            # Remember each group's starting momentum (0 if it has none)
            for group in self.optimizer.param_groups:
                group.setdefault('initial_momentum', group.get('momentum', 0))

    def reset(self):
        """Restart the schedule from iteration 0."""
        self.clr_iterations = 0

    def step(self):
        """
        Write the learning rate for the current iteration into every param
        group, advance the iteration counter, and return that rate as a float.
        """
        # Index (1-based) of the current cycle and distance from its peak:
        # x is 1 at the start/end of a cycle and 0 at the peak
        cycle = np.floor(1 + self.clr_iterations / (2 * self.step_size))
        x = np.abs(self.clr_iterations / self.step_size - 2 * cycle + 1)

        # Apply exponential decay
        decay_factor = self.gamma ** self.clr_iterations

        # Calculate the learning rate; float() keeps NumPy scalars out of the optimizer state
        lr = float(self.base_lr + (self.max_lr - self.base_lr) * max(0, (1 - x)) * decay_factor)

        # Position inside the current cycle decides the momentum branch below
        rising_half = self.clr_iterations % (2 * self.step_size) < self.step_size

        for param_group in self.optimizer.param_groups:
            param_group['lr'] = lr

            # Update momentum if cycle_momentum is True
            if self.cycle_momentum:
                # NOTE(review): momentum ramps up together with the learning
                # rate during the rising half and then jumps back to its
                # initial value for the falling half. Common cyclical-momentum
                # schemes move momentum inversely to the LR - confirm that
                # this shape is intended.
                initial = param_group['initial_momentum']
                if rising_half:
                    momentum = float(initial + (self.max_momentum - initial) * (1 - x))
                else:
                    momentum = initial
                param_group['momentum'] = momentum

        self.clr_iterations += 1
        return lr

def train_model(model, train_loader, val_loader, criterion, optimizer, scheduler=None, epochs=5, device=device):
    """
    Run a supervised train/validate loop and collect per-epoch metrics.

    Args:
        model: network to optimise; moved to `device` before training.
        train_loader: sized iterable of (inputs, targets) batches used for the weight updates.
        val_loader: sized iterable of (inputs, targets) batches evaluated after every epoch.
        criterion: callable mapping (outputs, targets) to a scalar loss.
        optimizer: optimizer whose first param group supplies the reported learning rate.
        scheduler: optional object with a `step()` method, invoked once per training batch.
        epochs: number of passes over `train_loader`.
        device: device that the model and every batch are moved to.

    Returns:
        dict with keys 'train_loss', 'train_acc', 'val_loss', 'val_acc' and
        'lr', each a list with one entry per epoch. Losses are the mean of the
        per-batch losses, accuracies are percentages, and 'lr' is the rate
        read right after the last optimizer step of the epoch (before the
        scheduler advances).
    """
    model.to(device)
    history = {key: [] for key in ('train_loss', 'train_acc', 'val_loss', 'val_acc', 'lr')}

    for epoch in range(epochs):
        # ---- training pass ----
        model.train()
        loss_sum, n_correct, n_seen = 0.0, 0, 0

        for step, (inputs, targets) in enumerate(train_loader):
            inputs = inputs.to(device)
            targets = targets.to(device)

            # Standard update: clear grads, forward, backward, step
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            # Read the rate that was just used, before the scheduler changes it
            current_lr = optimizer.param_groups[0]['lr']
            if scheduler is not None:
                scheduler.step()

            # Running totals for the epoch
            loss_sum += loss.item()
            predicted = outputs.max(1).indices
            n_seen += targets.size(0)
            n_correct += predicted.eq(targets).sum().item()

            # Progress line every 100 batches
            if step % 100 == 0:
                print(f'Epoch: {epoch+1}/{epochs} | Batch: {step}/{len(train_loader)} | '
                      f'Loss: {loss_sum/(step+1):.3f} | '
                      f'Acc: {100.*n_correct/n_seen:.3f}% | '
                      f'LR: {current_lr:.6f}')

        train_loss = loss_sum / len(train_loader)
        train_acc = 100. * n_correct / n_seen

        # ---- validation pass (no gradient tracking) ----
        model.eval()
        val_loss_sum, val_correct, val_seen = 0.0, 0, 0

        with torch.no_grad():
            for inputs, targets in val_loader:
                inputs = inputs.to(device)
                targets = targets.to(device)
                outputs = model(inputs)

                val_loss_sum += criterion(outputs, targets).item()
                predicted = outputs.max(1).indices
                val_seen += targets.size(0)
                val_correct += predicted.eq(targets).sum().item()

        val_loss = val_loss_sum / len(val_loader)
        val_acc = 100. * val_correct / val_seen

        # Record this epoch
        epoch_metrics = {
            'train_loss': train_loss,
            'train_acc': train_acc,
            'val_loss': val_loss,
            'val_acc': val_acc,
            'lr': current_lr,
        }
        for key, value in epoch_metrics.items():
            history[key].append(value)

        # Epoch summary line
        print(f'Epoch: {epoch+1}/{epochs} | '
              f'Train Loss: {train_loss:.3f} | '
              f'Train Acc: {train_acc:.3f}% | '
              f'Val Loss: {val_loss:.3f} | '
              f'Val Acc: {val_acc:.3f}% | '
              f'LR: {current_lr:.6f}')

    return history

def find_lr(model_class, train_loader, val_loader, epochs=5, start_lr=1e-9, end_lr=10, device=device):
    """
    Run training for different learning rates and find the optimal range

    Ten learning rates, log-spaced between `start_lr` and `end_lr`, are each
    used to train a fresh `model_class()` with SGD (momentum 0.9) for
    `epochs` epochs. The final-epoch training loss of every run is plotted
    against log10(lr) and saved to 'lr_finder.png'.

    Args:
        model_class: zero-argument callable returning a new model.
        train_loader: training batches passed to `train_model`.
        val_loader: validation batches passed to `train_model`.
        epochs: epochs to train for each candidate learning rate.
        start_lr: smallest learning rate tried.
        end_lr: largest learning rate tried.
        device: device used for the model and for training.

    Returns:
        (lr_min, lr_max, learning_rates, final_losses) where `lr_max` is the
        rate with the lowest finite final loss, `lr_min` is the candidate one
        step below it (or the same rate if it is already the smallest),
        `learning_rates` is the array of candidates and `final_losses` the
        list of final training losses in the same order.
    """
    # Initialize arrays to store results
    learning_rates = np.logspace(np.log10(start_lr), np.log10(end_lr), num=10)
    final_losses = []

    for lr in learning_rates:
        print(f"\nTraining with learning rate: {lr}")

        # Initialize model and optimizer
        model = model_class().to(device)
        optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9)
        criterion = nn.CrossEntropyLoss()

        # Train for specified epochs on the requested device
        history = train_model(model, train_loader, val_loader, criterion, optimizer,
                              epochs=epochs, device=device)

        # Store the final training loss
        final_losses.append(history['train_loss'][-1])

    # Convert to numpy arrays for plotting
    log_lrs = np.log10(learning_rates)

    # Plot Loss vs Learning Rate
    plt.figure(figsize=(10, 6))
    plt.plot(log_lrs, final_losses, 'o-', markersize=8)
    plt.xlabel('Log Learning Rate')
    plt.ylabel('Final Training Loss')
    plt.title('Training Loss vs. Learning Rate')
    plt.grid(True)
    plt.savefig('lr_finder.png')
    plt.show()

    # Determine optimal lr_min and lr_max
    # Typically, lr_min is where the loss starts to decrease rapidly
    # lr_max is just before the loss starts to diverge
    #
    # Diverged runs report NaN/inf losses. np.argmin would return the index
    # of the first NaN, i.e. pick a diverged rate as "best", so non-finite
    # losses are treated as +inf before taking the minimum.
    loss_array = np.asarray(final_losses, dtype=float)
    finite_mask = np.isfinite(loss_array)
    if not finite_mask.any():
        print("Warning: every run produced a non-finite loss; "
              "falling back to the smallest learning rate.")
    best_loss_idx = int(np.argmin(np.where(finite_mask, loss_array, np.inf)))

    # Compute indices for lr_min and lr_max
    lr_min_idx = max(0, best_loss_idx - 1)  # One step before optimal
    lr_max_idx = best_loss_idx  # The optimal point

    lr_min = learning_rates[lr_min_idx]
    lr_max = learning_rates[lr_max_idx]

    print(f"Suggested lr_min: {lr_min}")
    print(f"Suggested lr_max: {lr_max}")

    return lr_min, lr_max, learning_rates, final_losses

def train_with_clr(model_class, train_loader, val_loader, lr_min, lr_max, step_size=500, epochs=20, device=device, gamma=0.99):
    """
    Train model with cyclical learning rate policy

    A fresh `model_class()` is trained with SGD (momentum 0.9) while a
    `CyclicLR` scheduler moves the learning rate between `lr_min` and
    `lr_max`. Loss, accuracy and learning-rate curves are plotted and saved
    to 'clr_training.png'.

    Args:
        model_class: zero-argument callable returning a new model.
        train_loader: training batches.
        val_loader: validation batches.
        lr_min: lower bound of the learning-rate cycle (also the initial rate).
        lr_max: upper bound of the learning-rate cycle.
        step_size: number of batches in half a cycle.
        epochs: number of training epochs.
        device: device used for the model and for training.
        gamma: per-batch decay factor for the cycle amplitude.

    Returns:
        (model, history): the trained model and the history dict returned by
        `train_model`.
    """
    # Initialize model and optimizer
    model = model_class().to(device)
    optimizer = optim.SGD(model.parameters(), lr=lr_min, momentum=0.9)
    criterion = nn.CrossEntropyLoss()

    # Initialize the CLR scheduler
    # NOTE(review): the decay is applied per batch, so with the defaults
    # (gamma=0.99, step_size=500) the amplitude has shrunk by 0.99**500, i.e.
    # to under 1%, by the time the first peak is reached. Consider a gamma
    # much closer to 1 if real cycling is wanted.
    clr = CyclicLR(optimizer, base_lr=lr_min, max_lr=lr_max, step_size=step_size, gamma=gamma)

    # Train with CLR on the requested device
    history = train_model(model, train_loader, val_loader, criterion, optimizer,
                          scheduler=clr, epochs=epochs, device=device)

    # Plot training and validation metrics
    epochs_range = range(1, epochs + 1)

    plt.figure(figsize=(15, 10))

    # Plot loss
    plt.subplot(2, 2, 1)
    plt.plot(epochs_range, history['train_loss'], 'o-', label='Training Loss')
    plt.plot(epochs_range, history['val_loss'], 'o-', label='Validation Loss')
    plt.title('Training and Validation Loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    plt.grid(True)

    # Plot accuracy
    plt.subplot(2, 2, 2)
    plt.plot(epochs_range, history['train_acc'], 'o-', label='Training Accuracy')
    plt.plot(epochs_range, history['val_acc'], 'o-', label='Validation Accuracy')
    plt.title('Training and Validation Accuracy')
    plt.xlabel('Epochs')
    plt.ylabel('Accuracy (%)')
    plt.legend()
    plt.grid(True)

    # Plot learning rate. history['lr'] holds one value per epoch (the rate
    # of the epoch's last batch), so the axis is epochs, not training steps.
    plt.subplot(2, 2, 3)
    plt.plot(epochs_range, history['lr'], 'o-')
    plt.title('Learning Rate (last batch of each epoch)')
    plt.xlabel('Epochs')
    plt.ylabel('Learning Rate')
    plt.grid(True)

    plt.tight_layout()
    plt.savefig('clr_training.png')
    plt.show()

    return model, history

def batch_size_experiment(model_class, train_loader, val_loader, fixed_lr, device=device, batch_sizes=None, epochs=5):
    """
    Test the effect of increasing batch size with fixed learning rate

    For every batch size the FashionMNIST loaders are rebuilt, a fresh
    `model_class()` is trained with SGD (momentum 0.9, learning rate
    `fixed_lr`), and the final training loss and validation accuracy are
    recorded. Both curves are plotted against log2(batch size) and saved to
    'batch_size_experiment.png'.

    Args:
        model_class: zero-argument callable returning a new model.
        train_loader: unused; the data is reloaded for each batch size.
            Kept so existing callers keep working.
        val_loader: unused, for the same reason as `train_loader`.
        fixed_lr: learning rate shared by all runs.
        device: device used for the model and for training.
        batch_sizes: iterable of batch sizes to try; defaults to the powers
            of two from 32 to 4096.
        epochs: epochs to train for each batch size.

    Returns:
        (batch_sizes, losses, accuracies): three lists of equal length.
    """
    if batch_sizes is None:
        batch_sizes = [32, 64, 128, 256, 512, 1024, 2048, 4096]
    else:
        batch_sizes = list(batch_sizes)
    losses = []
    accuracies = []

    for batch_size in batch_sizes:
        print(f"\nTraining with batch size: {batch_size}")

        # Reload data with the new batch size
        train_loader_bs, val_loader_bs, _ = load_fashion_mnist(batch_size=batch_size)

        # Initialize model and optimizer
        model = model_class().to(device)
        optimizer = optim.SGD(model.parameters(), lr=fixed_lr, momentum=0.9)
        criterion = nn.CrossEntropyLoss()

        # Train for a few epochs on the requested device
        history = train_model(model, train_loader_bs, val_loader_bs, criterion, optimizer,
                              epochs=epochs, device=device)

        # Store the final training loss and validation accuracy
        losses.append(history['train_loss'][-1])
        accuracies.append(history['val_acc'][-1])

    # Plot Loss vs Batch Size (log scale)
    plt.figure(figsize=(15, 6))

    plt.subplot(1, 2, 1)
    plt.plot(np.log2(batch_sizes), losses, 'o-', markersize=8)
    plt.xlabel('log2(Batch Size)')
    plt.ylabel('Final Training Loss')
    plt.title('Training Loss vs. Batch Size')
    plt.grid(True)

    plt.subplot(1, 2, 2)
    plt.plot(np.log2(batch_sizes), accuracies, 'o-', markersize=8)
    plt.xlabel('log2(Batch Size)')
    plt.ylabel('Validation Accuracy (%)')
    plt.title('Validation Accuracy vs. Batch Size')
    plt.grid(True)

    plt.tight_layout()
    plt.savefig('batch_size_experiment.png')
    plt.show()

    return batch_sizes, losses, accuracies

def run_problem1():
    """
    Drive the three Problem 1 experiments in order: learning-rate range
    search, cyclical-learning-rate training, and the batch-size sweep.
    """
    rule = "=" * 80
    print("\n\n" + rule)
    print("Problem 1: Learning Rate, Batch Size, FashionMNIST")
    print(rule)

    # Shared loaders for parts 1 and 2 (the test loader is not used here)
    train_loader, val_loader, _test_loader = load_fashion_mnist(batch_size=64)

    # Part 1: search for a usable learning-rate range
    print("\nPart 1: Finding optimal learning rate range...")
    lr_min, lr_max, _candidates, _losses = find_lr(LeNet5, train_loader, val_loader, epochs=5)

    # Part 2: cycle the learning rate between the two bounds found above
    print("\nPart 2: Training with cyclical learning rate...")
    train_with_clr(LeNet5, train_loader, val_loader, lr_min, lr_max, epochs=15)

    # Part 3: sweep the batch size at the upper learning-rate bound
    print("\nPart 3: Batch size experiment...")
    batch_size_experiment(LeNet5, train_loader, val_loader, fixed_lr=lr_max)

    print("\nProblem 1 experiments completed.")

#                   Problem 2: CNN Architectures Analysis                    #

In [None]:


"""
Problem 2 requires theoretical calculations and analysis. See the separate
document for the detailed solutions for:

1. Number of parameters in AlexNet
2. VGG19 memory and parameters calculation
3. Receptive field calculation
4. Inception module analysis
5. Faster-RCNN architecture analysis
"""

## Problem 3 - Transfer learning: Shallow learning vs Finetuning, Pytorch

In [None]:
def download_visual_decathlon_dataset(batch_size=64):
    """
    Download and prepare a dataset from the Visual Decathlon Challenge
    For this implementation, we'll use CIFAR-100 as our target dataset

    Training images are augmented with a random horizontal flip and a padded
    random 32x32 crop; both splits are normalized with the same per-channel
    mean and standard deviation. Summary statistics of the training split
    are printed.

    Args:
        batch_size: number of samples per batch for both loaders.

    Returns:
        (train_loader, test_loader, num_classes) where `num_classes` is the
        number of distinct labels found in the training split.
    """
    # Local import keeps this block self-contained
    from collections import Counter

    # Per-channel normalization statistics shared by both splits
    mean = (0.5071, 0.4867, 0.4408)
    std = (0.2675, 0.2565, 0.2761)

    # Define transformations
    train_transform = transforms.Compose([
        transforms.RandomHorizontalFlip(),
        transforms.RandomCrop(32, padding=4),
        transforms.ToTensor(),
        transforms.Normalize(mean, std)
    ])

    test_transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean, std)
    ])

    # Download CIFAR-100 dataset
    train_dataset = torchvision.datasets.CIFAR100(
        root='./data', train=True, download=True, transform=train_transform
    )

    test_dataset = torchvision.datasets.CIFAR100(
        root='./data', train=False, download=True, transform=test_transform
    )

    # Create data loaders
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=2)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=2)

    # Analyze dataset. Count labels straight from the stored target list:
    # iterating the dataset would decode and randomly augment every image
    # just to read its label.
    class_counts = Counter(train_dataset.targets)

    print("Dataset: CIFAR-100")
    print(f"Number of classes: {len(class_counts)}")
    print(f"Total training samples: {len(train_dataset)}")
    print(f"Total test samples: {len(test_dataset)}")
    print(f"Average samples per class: {len(train_dataset) / len(class_counts):.2f}")
    print(f"Min samples per class: {min(class_counts.values())}")
    print(f"Max samples per class: {max(class_counts.values())}")

    return train_loader, test_loader, len(class_counts)

def show_sample_images(data_loader, num_classes=2, images_per_class=2):
    """
    Plot a grid of example images: one row per class, `images_per_class`
    images per row, taken from the first classes encountered in `data_loader`.

    Batches are consumed until `num_classes` classes each have
    `images_per_class` images or the loader is exhausted. Images are
    de-normalized with hard-coded per-channel mean/std before display, and
    the figure is saved to 'sample_images.png'.
    """
    batches = iter(data_loader)
    images, labels = next(batches)

    # label -> list of image tensors collected for that label
    picked = {}

    def collected_enough():
        return len(picked) >= num_classes and all(
            len(group) >= images_per_class for group in picked.values()
        )

    while not collected_enough():
        for img, lbl in zip(images, labels):
            key = lbl.item()
            group = picked.get(key)
            if group is None:
                # Only the first `num_classes` distinct labels get a slot
                if len(picked) >= num_classes:
                    continue
                group = picked[key] = []

            if len(group) < images_per_class:
                group.append(img)

        if collected_enough():
            break

        # Move on to the next batch; stop quietly once the loader runs dry
        upcoming = next(batches, None)
        if upcoming is None:
            break
        images, labels = upcoming

    # Statistics used to undo the input normalization
    mean = np.array([0.5071, 0.4867, 0.4408])
    std = np.array([0.2675, 0.2565, 0.2761])

    plt.figure(figsize=(10, 5))
    shown_classes = list(picked.keys())[:num_classes]

    for row, class_idx in enumerate(shown_classes):
        for col, img in enumerate(picked[class_idx][:images_per_class]):
            plt.subplot(num_classes, images_per_class, row * images_per_class + col + 1)
            # Channel-first tensor -> channel-last array, then de-normalize
            pixels = img.numpy().transpose((1, 2, 0))
            pixels = np.clip(std * pixels + mean, 0, 1)
            plt.imshow(pixels)
            plt.title(f"Class {class_idx}")
            plt.axis('off')

    plt.tight_layout()
    plt.savefig('sample_images.png')
    plt.show()

def finetune_model(model, train_loader, test_loader, learning_rate, num_epochs=60, momentum=0.9, weight_decay=5e-4):
    """
    Finetune a pretrained model on the target dataset

    All parameters of `model` are optimised with SGD. The learning rate is
    multiplied by 0.1 at 25%, 50% and 75% of `num_epochs`. After every epoch
    the model is evaluated on `test_loader`; the weights with the highest
    test accuracy are restored before returning. Loss, accuracy and
    learning-rate curves are saved to 'finetuning_lr_<learning_rate>.png'.

    NOTE(review): the checkpoint is selected on `test_loader`, so the
    reported best accuracy is an optimistic estimate; a separate validation
    split would avoid that.

    Args:
        model: network to finetune; moved to the module-level `device`.
        train_loader: training batches of (inputs, targets).
        test_loader: evaluation batches of (inputs, targets).
        learning_rate: initial SGD learning rate.
        num_epochs: number of training epochs.
        momentum: SGD momentum.
        weight_decay: SGD weight decay.

    Returns:
        (model, history, best_acc): the model loaded with its best weights,
        a dict of per-epoch 'train_loss', 'train_acc', 'test_loss',
        'test_acc' and 'lr' lists, and the best test accuracy in percent.
    """
    # Batches are moved to `device` below, so the model has to live there too
    model.to(device)

    # Define criterion and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=learning_rate, momentum=momentum, weight_decay=weight_decay)

    # Define learning rate scheduler with multi-step decay
    milestones = [int(num_epochs * 0.25), int(num_epochs * 0.5), int(num_epochs * 0.75)]
    scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=milestones, gamma=0.1)

    # Initialize best accuracy and model
    best_acc = 0.0
    best_model = None

    # Training and validation history
    history = {
        'train_loss': [],
        'train_acc': [],
        'test_loss': [],
        'test_acc': [],
        'lr': []
    }

    # Train the model
    for epoch in range(num_epochs):
        # Training phase
        model.train()
        running_loss = 0.0
        correct = 0
        total = 0

        for batch_idx, (inputs, targets) in enumerate(train_loader):
            inputs, targets = inputs.to(device), targets.to(device)

            # Zero the gradients
            optimizer.zero_grad()

            # Forward pass
            outputs = model(inputs)
            loss = criterion(outputs, targets)

            # Backward pass and optimize
            loss.backward()
            optimizer.step()

            # Update statistics
            running_loss += loss.item()
            _, predicted = outputs.max(1)
            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()

            # Print progress
            if batch_idx % 50 == 0:
                print(f'Epoch: {epoch+1}/{num_epochs} | Batch: {batch_idx}/{len(train_loader)} | '
                      f'Loss: {running_loss/(batch_idx+1):.3f} | '
                      f'Acc: {100.*correct/total:.3f}% | '
                      f'LR: {optimizer.param_groups[0]["lr"]:.6f}')

        # Compute epoch-level statistics for training
        train_loss = running_loss / len(train_loader)
        train_acc = 100. * correct / total

        # Testing phase
        model.eval()
        test_loss = 0.0
        correct = 0
        total = 0

        with torch.no_grad():
            for inputs, targets in test_loader:
                inputs, targets = inputs.to(device), targets.to(device)
                outputs = model(inputs)
                loss = criterion(outputs, targets)

                # Update statistics
                test_loss += loss.item()
                _, predicted = outputs.max(1)
                total += targets.size(0)
                correct += predicted.eq(targets).sum().item()

        # Compute epoch-level statistics for testing
        test_loss = test_loss / len(test_loader)
        test_acc = 100. * correct / total

        # Update history
        history['train_loss'].append(train_loss)
        history['train_acc'].append(train_acc)
        history['test_loss'].append(test_loss)
        history['test_acc'].append(test_acc)
        history['lr'].append(optimizer.param_groups[0]['lr'])

        # Print epoch summary
        print(f'Epoch: {epoch+1}/{num_epochs} | '
              f'Train Loss: {train_loss:.3f} | '
              f'Train Acc: {train_acc:.3f}% | '
              f'Test Loss: {test_loss:.3f} | '
              f'Test Acc: {test_acc:.3f}% | '
              f'LR: {optimizer.param_groups[0]["lr"]:.6f}')

        # Save best model. The first epoch is always recorded so a snapshot
        # exists even if the accuracy never rises above 0%.
        if best_model is None or test_acc > best_acc:
            best_acc = test_acc
            best_model = copy.deepcopy(model.state_dict())

        # Step the scheduler
        scheduler.step()

    # Load best model (no snapshot exists when num_epochs is 0)
    if best_model is not None:
        model.load_state_dict(best_model)

    # Plot training and validation metrics
    epochs_range = range(1, num_epochs + 1)

    plt.figure(figsize=(15, 10))

    # Plot loss
    plt.subplot(2, 2, 1)
    plt.plot(epochs_range, history['train_loss'], 'o-', label='Training Loss')
    plt.plot(epochs_range, history['test_loss'], 'o-', label='Test Loss')
    plt.title(f'Training and Test Loss (LR={learning_rate})')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    plt.grid(True)

    # Plot accuracy
    plt.subplot(2, 2, 2)
    plt.plot(epochs_range, history['train_acc'], 'o-', label='Training Accuracy')
    plt.plot(epochs_range, history['test_acc'], 'o-', label='Test Accuracy')
    plt.title(f'Training and Test Accuracy (LR={learning_rate})')
    plt.xlabel('Epochs')
    plt.ylabel('Accuracy (%)')
    plt.legend()
    plt.grid(True)

    # Plot learning rate
    plt.subplot(2, 2, 3)
    plt.plot(epochs_range, history['lr'], 'o-')
    plt.title('Learning Rate Schedule')
    plt.xlabel('Epochs')
    plt.ylabel('Learning Rate')
    plt.grid(True)

    plt.tight_layout()
    plt.savefig(f'finetuning_lr_{learning_rate}.png')
    plt.show()

    print(f"Final Test Accuracy: {best_acc:.2f}%")

    return model, history, best_acc

def train_last_layer(model, train_loader, test_loader, learning_rate, num_epochs=60, momentum=0.9, weight_decay=5e-4):
    """
    Train only the last layer of the model (feature extraction)

    Every parameter of `model` is frozen except those of `model.fc`, which
    are optimised with SGD.

    NOTE(review): this function appears to be truncated in this file. The
    visible body ends inside the training loop: `test_loader`, `scheduler`,
    `best_acc`, `best_model` and `history` are set up but never used, the
    scheduler is never stepped, and nothing is returned. Presumably the rest
    mirrors `finetune_model` - restore it before relying on this function.

    Args:
        model: network exposing its final layer as `model.fc`
            (assumes a ResNet-style model - TODO confirm against the caller).
        train_loader: training batches of (inputs, targets).
        test_loader: evaluation batches; not used by the visible code.
        learning_rate: initial SGD learning rate.
        num_epochs: number of training epochs.
        momentum: SGD momentum.
        weight_decay: SGD weight decay.
    """
    # Freeze all layers except the last one
    for param in model.parameters():
        param.requires_grad = False

    # Unfreeze the last layer
    for param in model.fc.parameters():
        param.requires_grad = True

    # Define criterion and optimizer (only train the last layer)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.fc.parameters(), lr=learning_rate, momentum=momentum, weight_decay=weight_decay)

    # Define learning rate scheduler with multi-step decay:
    # the rate is multiplied by 0.1 at 25%, 50% and 75% of num_epochs
    milestones = [int(num_epochs * 0.25), int(num_epochs * 0.5), int(num_epochs * 0.75)]
    scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=milestones, gamma=0.1)

    # Initialize best accuracy and model
    best_acc = 0.0
    best_model = None

    # Training and validation history
    history = {
        'train_loss': [],
        'train_acc': [],
        'test_loss': [],
        'test_acc': [],
        'lr': []
    }

    # Train the model
    for epoch in range(num_epochs):
        # Training phase
        # NOTE(review): model.train() switches the whole network, including
        # the frozen layers, to training mode. If the backbone contains
        # BatchNorm or Dropout they will still behave as in training -
        # confirm this is intended for feature extraction.
        model.train()
        running_loss = 0.0
        correct = 0
        total = 0

        for batch_idx, (inputs, targets) in enumerate(train_loader):
            # NOTE(review): batches are moved to the module-level `device`,
            # but the model is not moved here - the caller presumably does it.
            inputs, targets = inputs.to(device), targets.to(device)

            # Zero the gradients
            optimizer.zero_grad()

            # Forward pass
            outputs = model(inputs)
            loss = criterion(outputs, targets)

            # Backward pass and optimize
            loss.backward()
            optimizer.step()

            # Update statistics (sum of batch losses, correct predictions, samples seen)
            running_loss += loss.item()
            _, predicted = outputs.max(1)
            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()

## Problem 4 - Weakly and Semi-Supervised Learning for Image Classification

(b) Parameters K and P in Stage 2 (Yalniz et al.)

K is the number of top‐ranked images that the teacher model selects per class from the unlabeled pool U. After running the teacher on every image, images are ranked by the teacher’s softmax score for each class, and the top K images for class l form the pseudo‐labeled set D̂ₗ for that class.

P is the number of highest‐scoring classes (per image) that the teacher retains when assigning candidates. For each unlabeled image, we take its P largest softmax outputs (rather than just the single top class), on the intuition that an image may legitimately contain multiple target classes—especially under‐represented or co‐occurring concepts that might be occluded by more prominent ones (Yalniz et al., 2019).

Why P > 1? By allowing each image to contribute to up to P classes, the procedure avoids starving rare classes of examples when the overall collection U isn’t large enough. It also yields a more balanced D̂ by letting an image appear in multiple class‐specific rankings (Yalniz et al., 2019).

(c) Creating a New Labeled Dataset (Yalniz et al.)

Teacher predictions: Run the teacher model (trained on the small labeled set D) on each image in the unlabeled set U to get a softmax score vector.

Class shortlist per image: For each image, keep only the P classes with highest scores.

Per-class ranking: For each class l, rank all images by their score for l, and select the top K as positive examples for that class—this yields D̂ₗ.

Aggregate: Form the new dataset D̂ = ⋃ₗ D̂ₗ.

Replication for multi-class: If an image is in multiple D̂ₗ sets, it’s simply replicated (once per class) in D̂; the student is then trained as a standard multi‐class classifier on D̂ (Yalniz et al., 2019).

Can an image belong to more than one class?
Yes. Because we allow P > 1 and then rank per class, an image can appear in multiple class‐specific top‐K lists. We handle this by replicating the image under each of its assigned classes, but still train the student with a softmax over the union D̂.

(d) Why Student Accuracy First Improves then Drops as K Increases (Figure 5)

Initial gains: As K grows from small values, the student sees more diverse and harder examples per class, which boosts its ability to generalize.

Plateau: Across a broad regime (e.g. 4 k–32 k), accuracy is stable—small adjustments to K matter less.

Decline: Beyond the sweet spot, including lower‐ranked images admits many false positives (label noise), which degrades student performance.

This trade‐off peaks around K = 8 k for ResNet-50 and a bit higher for larger students (Yalniz et al., 2019, Figure 5).

1. Weakly Supervised vs Semi-Supervised Pretraining

Weakly supervised pretraining (Mahajan et al.) uses the hashtags themselves as noisy labels. A model is trained directly on billions of hashtagged images W to predict those hashtags, then fine-tuned on a smaller, clean set V. This leverages large weakly annotated data end-to-end (Mahajan et al., 2018).

Semi-supervised pretraining (Yalniz et al.) starts from a small set of truly labeled images D to train a teacher. That teacher then pseudo-labels a large unlabeled pool U (ignoring hashtags), producing a new noisy but expansive D̂ on which a student is trained and finally fine-tuned back on D (Yalniz et al., 2019).

Same dataset, two uses: Both papers mine the same 1 B–image pool (IG-1B‐Targeted). Mahajan et al. treat each image’s hashtag as its “ground truth” (weak labels), whereas Yalniz et al. discard those hashtags and treat the images as unlabeled, relying instead on a teacher’s predictions for pseudo‐labeling (Mahajan et al., 2018; Yalniz et al., 2019).

2. From Mahajan et al.
(a) Robustness to Label Noise

They injected p % artificial label noise by randomly replacing p % of hashtags with random tags sampled from the overall hashtag distribution.

Findings: At p = 10 %, ImageNet top-1 accuracy drops by < 1 %; at p = 25 %, the drop is ≈ 2 %. This shows surprising resilience to noisy hashtag supervision when training on billions of examples (Mahajan et al., 2018).

(b) Importance of Resampling the Hashtag Distribution

Instagram hashtags follow a Zipfian (long‐tailed) distribution. If you sample “naturally,” head hashtags dominate training.

By using square-root or uniform resampling (i.e., under-sampling head tags, over-sampling tail tags), they observe 5–6 % absolute gains in transfer accuracy across target tasks, versus natural sampling. This balanced sampling is crucial to expose the model to rare concepts and improve feature quality (Mahajan et al., 2018).

3. Teacher–Student Distillation (Yalniz et al.)

Two Models:

Teacher: trained on the small, clean labeled set D.

Student: trained on the large pseudo-labeled set D̂ produced by the teacher.

Leverage: The student “distills” knowledge by mimicking the teacher’s predictions on unlabeled data. This extends classic model compression: instead of distilling soft logits on the same data, here we distill across a vast unlabeled corpus, effectively amplifying the teacher’s knowledge.

Why Distillation: In both cases, the student is optimized to reproduce the teacher’s outputs—either soft targets (logits) or hard pseudo-labels—on extra data. This is precisely the core of knowledge distillation, where a large (or strongly supervised) teacher guides a smaller (or target) student to reach higher accuracy than direct supervised training alone (Yalniz et al., 2019).


