# Self-Supervised Learning & Fine-Tuning Strategies

Self-supervised pretraining unlocks representations that transfer across tasks. This notebook explores contrastive and masked modeling objectives, then demonstrates adapter-style fine-tuning with differential learning rates.

## Learning Objectives

- Implement contrastive learning components (augmentations, projection head, InfoNCE loss).
- Outline masked prediction objectives for language modeling.
- Apply differential learning rates and adapters during fine-tuning.
- Build a fine-tuning helper that supports freezing, adapters, and evaluation.

## Contrastive Learning Primer

SimCLR-style contrastive learning pulls together positive views of the same sample while pushing apart negatives. Components:

1. Data augmentation pipeline to create positive pairs.
2. Encoder that maps inputs to latent vectors.
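3. Projection head that maps encoder outputs into the space where the contrastive loss is applied; downstream tasks typically use the encoder output rather than the projection.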
4. InfoNCE loss over cosine similarities.

In [None]:
import copy
import math

import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F

# Seed PyTorch's global RNG so the random weights and demo tensors below are reproducible.
torch.manual_seed(1)

class ProjectionHead(nn.Module):
    """Two-layer MLP that projects features and L2-normalises the result.

    Args:
        input_dim: Size of the last dimension of the incoming features.
        proj_dim: Width of both the hidden layer and the output.
    """

    def __init__(self, input_dim, proj_dim=64):
        super().__init__()
        hidden = nn.Linear(input_dim, proj_dim)
        output = nn.Linear(proj_dim, proj_dim)
        self.net = nn.Sequential(hidden, nn.ReLU(), output)

    def forward(self, x):
        """Return the projection of ``x`` scaled to unit L2 norm along the last dim."""
        projected = self.net(x)
        return F.normalize(projected, dim=-1)

# Toy encoder (128 -> 256 -> 128) plus the projection head that feeds the contrastive loss.
encoder = nn.Sequential(
    nn.Linear(128, 256),
    nn.ReLU(),
    nn.Linear(256, 128),
)
projector = ProjectionHead(input_dim=128, proj_dim=64)

def info_nce_loss(z_i, z_j, temperature=0.1):
    """Compute the symmetric InfoNCE (NT-Xent) loss for a batch of paired views.

    Row ``k`` of ``z_i`` and row ``k`` of ``z_j`` are the two views of sample
    ``k``. Each of the ``2 * batch`` concatenated embeddings acts as an anchor:
    its positive is the other view of the same sample and every remaining row
    is a negative.

    Args:
        z_i: Embeddings of the first view, shape ``(batch, dim)``.
        z_j: Embeddings of the second view, same shape as ``z_i``.
        temperature: Positive divisor applied to the cosine similarities.

    Returns:
        Scalar tensor holding the mean cross-entropy over all ``2 * batch`` anchors.
    """
    batch = z_i.size(0)
    representations = torch.cat([z_i, z_j], dim=0)
    similarity = F.cosine_similarity(
        representations.unsqueeze(1), representations.unsqueeze(0), dim=-1
    )
    # An anchor must never be scored against itself, so drop the diagonal.
    self_mask = torch.eye(2 * batch, device=z_i.device, dtype=torch.bool)
    logits = similarity.masked_fill(self_mask, float("-inf")) / temperature
    # After concatenation the positive of row k is row k + batch, and the
    # positive of row k + batch is row k. Pointing the label at the row's own
    # index would select the masked (-inf) entry and make the loss infinite.
    index = torch.arange(batch, device=z_i.device)
    labels = torch.cat([index + batch, index], dim=0)
    return F.cross_entropy(logits, labels)

# Two views of the same 32 samples: the second is a lightly perturbed copy of the
# first, so row k of each tensor forms a genuine positive pair. Independent random
# tensors would contain no positives at all.
views_1 = torch.randn(32, 128)
views_2 = views_1 + 0.1 * torch.randn_like(views_1)
z1 = projector(encoder(views_1))
z2 = projector(encoder(views_2))
print(f"InfoNCE loss: {info_nce_loss(z1, z2):.4f}")


### Visualizing Cross-View Similarities

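Entry (i, j) is the cosine similarity between view 1 of sample i and view 2 of sample j. Positive pairs lie on the diagonal and should trend toward high similarity (a bright diagonal) as the encoder learns to align the two views.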

In [None]:
# z1 and z2 leave the projection head L2-normalised, so their dot products
# are cosine similarities.
with torch.no_grad():
    sim_matrix = z1 @ z2.t()
fig, ax = plt.subplots()
heatmap = ax.imshow(sim_matrix, cmap="viridis")
fig.colorbar(heatmap, ax=ax, label="cosine similarity")
ax.set_title("Cross-view similarity matrix")
ax.set_xlabel("View 2 index")
ax.set_ylabel("View 1 index")
plt.show()


## Masked Prediction Quickstart

Masked language modeling predicts missing tokens. Replace the selected tokens with a sentinel (e.g., a reserved vocabulary index for `[MASK]`) and compute the loss only at the masked positions, so unmasked tokens do not contribute.

In [None]:
vocab_size = 100
mask_token_id = vocab_size - 1  # the last index is reserved for [MASK]
embed = nn.Embedding(vocab_size, 32)
classifier = nn.Linear(32, vocab_size)
# Draw real tokens from [0, mask_token_id) so none of them collides with the sentinel.
tokens = torch.randint(0, mask_token_id, (4, 10))
mask = torch.rand(tokens.shape) < 0.3
if not mask.any():
    # Cross-entropy over zero positions is NaN, so always keep at least one target.
    mask[0, 0] = True
masked_tokens = tokens.clone()
masked_tokens[mask] = mask_token_id
embeddings = embed(masked_tokens)
logits = classifier(embeddings)
# Boolean indexing keeps only the masked positions, so unmasked tokens never reach the loss.
loss = F.cross_entropy(logits[mask], tokens[mask])
print(f"Masked LM loss: {loss:.4f}")


## Mini Task – Adapter Layer

Adapters insert a bottleneck module (down-project → nonlinearity → up-project) into existing layers. Implement an adapter and a helper that injects adapters into every `nn.Linear` module.

In [None]:
class Adapter(nn.Module):
    """Exercise stub for the bottleneck adapter described in the task text.

    Placeholder only: ``__init__`` registers no layers yet and ``forward``
    raises ``NotImplementedError`` until the TODO is completed.
    """
    def __init__(self, dim, bottleneck=32):
        # dim and bottleneck are unused so far; presumably the feature width and
        # the bottleneck width of the projections to define -- confirm when implementing.
        super().__init__()
        # TODO: define down, activation, up projections

    def forward(self, x):
        # Not implemented yet; depends on the layers from the TODO in __init__.
        raise NotImplementedError

def add_adapters(module, bottleneck=32):
    """Exercise stub: attach adapters to the linear layers inside ``module``.

    Currently always raises ``NotImplementedError``.
    """
    # TODO: recursively wrap linear layers with adapters
    raise NotImplementedError


In [None]:
class Adapter(nn.Module):
    """Residual bottleneck adapter: ``x + up(activation(down(x)))``.

    The skip connection lets the incoming features pass through unchanged
    while the bottleneck branch learns a correction on top of them. The
    up-projection starts at zero, so a freshly created adapter is exactly the
    identity and does not disturb the layer it is attached to.

    Args:
        dim: Size of the last dimension of the input (and of the output).
        bottleneck: Width of the hidden bottleneck.
    """

    def __init__(self, dim, bottleneck=32):
        super().__init__()
        self.down = nn.Linear(dim, bottleneck)
        self.activation = nn.GELU()
        self.up = nn.Linear(bottleneck, dim)
        # Zero the up-projection so the bottleneck branch contributes nothing at
        # initialisation; training then moves it away from the identity.
        nn.init.zeros_(self.up.weight)
        nn.init.zeros_(self.up.bias)

    def forward(self, x):
        """Return ``x`` plus the bottleneck correction; same shape as ``x``."""
        return x + self.up(self.activation(self.down(x)))

def _is_adapted_linear(module):
    """Return True if ``module`` is a ``Sequential(nn.Linear, Adapter)`` wrapper."""
    return (
        isinstance(module, nn.Sequential)
        and len(module) == 2
        and isinstance(module[0], nn.Linear)
        and isinstance(module[1], Adapter)
    )


def add_adapters(module, bottleneck=32):
    """Wrap every ``nn.Linear`` below ``module`` as ``Sequential(linear, Adapter)``, in place.

    Adapters themselves and layers that are already wrapped are left alone, so
    calling the function twice on the same model does not stack adapters or
    adapt an adapter's own projections. Only child modules are replaced: an
    ``nn.Linear`` passed directly as ``module`` has no children and is
    returned unchanged.

    Args:
        module: Root ``nn.Module`` to modify in place.
        bottleneck: Bottleneck width handed to each new ``Adapter``.

    Returns:
        The same ``module`` object that was passed in.
    """
    if isinstance(module, Adapter) or _is_adapted_linear(module):
        return module
    # Iterate over a snapshot because the loop replaces entries of the child registry.
    for name, child in list(module.named_children()):
        if isinstance(child, nn.Linear):
            adapted = nn.Sequential(child, Adapter(child.out_features, bottleneck))
            setattr(module, name, adapted)
        else:
            add_adapters(child, bottleneck)
    return module

# add_adapters modifies its argument in place and returns that same object, so
# adapt a deep copy: `encoder` stays the plain backbone and re-running this cell
# always starts from an adapter-free model.
encoder_with_adapters = add_adapters(copy.deepcopy(encoder), bottleneck=16)
print(encoder_with_adapters)


## Differential Learning Rates

When fine-tuning, use smaller learning rates for pretrained layers and larger ones for new heads.

In [None]:
def create_optimizer(model, head, base_lr=1e-4, head_lr=1e-3):
    """Build an Adam optimizer with separate learning rates for backbone and head.

    Args:
        model: Module whose parameters with ``requires_grad=True`` train at ``base_lr``.
        head: Module whose parameters (all of them) train at ``head_lr``.
        base_lr: Learning rate of the first parameter group.
        head_lr: Learning rate of the second parameter group.

    Returns:
        ``torch.optim.Adam`` with two parameter groups: backbone first, head second.
    """
    trainable_backbone = [param for param in model.parameters() if param.requires_grad]
    backbone_group = {"params": trainable_backbone, "lr": base_lr}
    head_group = {"params": head.parameters(), "lr": head_lr}
    return torch.optim.Adam([backbone_group, head_group])

# New task head on top of the 128-dimensional encoder output (5 output logits).
head = nn.Linear(in_features=128, out_features=5)
optimizer = create_optimizer(encoder_with_adapters, head)
print([param_group["lr"] for param_group in optimizer.param_groups])


## Comprehensive Exercise – Fine-Tuning Helper

Implement a `FineTuner` class that can freeze layers, insert adapters, apply differential learning rates, and evaluate accuracy.

In [None]:
class FineTuner:
    """Exercise stub for the fine-tuning helper; every method raises ``NotImplementedError``."""
    def __init__(self, encoder, head, freeze_until=None, adapter_bottleneck=None, base_lr=1e-4, head_lr=1e-3):
        # TODO: clone encoder, freeze layers, add adapters, create optimizer
        raise NotImplementedError

    def train_step(self, batch):
        # Placeholder: to be implemented as part of the exercise.
        raise NotImplementedError

    def evaluate(self, loader):
        # Placeholder: to be implemented as part of the exercise.
        raise NotImplementedError


In [None]:
class FineTuner:
    """Fine-tune a private copy of a pretrained encoder together with a task head.

    Args:
        encoder: Pretrained ``nn.Module``. It is deep-copied, so the caller's
            module is never frozen, adapted or trained by this helper; the
            working copy is available as ``self.encoder``.
        head: Task head applied to the encoder output. Used (and trained) in place.
        freeze_until: Optional substring of a parameter name. Encoder parameters
            are frozen in registration order up to, but not including, the first
            parameter whose name contains it; that parameter and all later ones
            stay trainable. ``None`` leaves ``requires_grad`` untouched.
        adapter_bottleneck: Optional bottleneck width. When given, adapters are
            inserted after freezing, so the new adapter weights remain trainable.
        base_lr: Learning rate for the trainable encoder parameters.
        head_lr: Learning rate for the head parameters.

    Raises:
        ValueError: If ``freeze_until`` matches no encoder parameter name.
    """

    def __init__(self, encoder, head, freeze_until=None, adapter_bottleneck=None, base_lr=1e-4, head_lr=1e-3):
        # Work on a copy: freezing and adapter insertion both mutate the module.
        self.encoder = copy.deepcopy(encoder)
        if freeze_until is not None:
            self._freeze_before(freeze_until)
        if adapter_bottleneck is not None:
            add_adapters(self.encoder, adapter_bottleneck)
        self.head = head
        params = [
            {"params": [p for p in self.encoder.parameters() if p.requires_grad], "lr": base_lr},
            {"params": self.head.parameters(), "lr": head_lr},
        ]
        self.optimizer = torch.optim.AdamW(params)

    def _freeze_before(self, boundary):
        """Freeze encoder parameters that precede the first name containing ``boundary``."""
        named = list(self.encoder.named_parameters())
        if not any(boundary in name for name, _ in named):
            raise ValueError(
                f"freeze_until={boundary!r} does not match any encoder parameter name"
            )
        reached_boundary = False
        for name, param in named:
            # Once the boundary layer is seen, it and everything after it train.
            reached_boundary = reached_boundary or boundary in name
            param.requires_grad = reached_boundary

    def train_step(self, batch):
        """Run one optimisation step on ``batch = (inputs, targets)`` and return the loss as a float."""
        self.encoder.train()
        self.head.train()
        xb, yb = batch
        feats = self.encoder(xb)
        logits = self.head(feats)
        loss = F.cross_entropy(logits, yb)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        return loss.item()

    def evaluate(self, loader):
        """Return top-1 accuracy over ``loader`` (an iterable of ``(inputs, targets)``).

        Both modules are switched to eval mode and gradients are disabled.
        An empty loader yields ``0.0`` instead of dividing by zero.
        """
        self.encoder.eval()
        self.head.eval()
        correct = total = 0
        with torch.no_grad():
            for xb, yb in loader:
                preds = self.head(self.encoder(xb)).argmax(dim=-1)
                correct += (preds == yb).sum().item()
                total += yb.numel()
        return correct / max(total, 1)

# encoder_with_adapters was already adapted when it was built, so adapter_bottleneck
# is left at None rather than asking FineTuner to insert adapters a second time.
tuner = FineTuner(encoder_with_adapters, head)
dummy_loader = [(torch.randn(16, 128), torch.randint(0, 5, (16,))) for _ in range(2)]
for batch in dummy_loader:
    loss = tuner.train_step(batch)
acc = tuner.evaluate(dummy_loader)
print(f"Accuracy: {acc:.3f}")


## Further Reading

- Chen et al. (2020) “A Simple Framework for Contrastive Learning of Visual Representations” (SimCLR)
- He et al. (2021) “Masked Autoencoders Are Scalable Vision Learners” (MAE)
- Hu et al. (2022) “LoRA: Low-Rank Adaptation of Large Language Models”
- OpenAI Fine-Tuning cookbooks and Hugging Face adapter libraries