<a href="https://colab.research.google.com/github/AvtnshM/SSL/blob/main/Self_Supervised_Learning-V3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Using device:", device)

Using device: cuda


In [2]:

# Plain tensor conversion; this is the transform handed to the dataset itself.
base_transform = transforms.ToTensor()

# Random augmentation pipeline used to produce each SSL view:
# random crop resized to 32x32, random horizontal flip, colour jitter,
# then conversion back to a tensor.
ssl_transform = transforms.Compose([
    transforms.RandomResizedCrop(size=32),
    transforms.RandomHorizontalFlip(),
    transforms.ColorJitter(brightness=0.4, contrast=0.4, saturation=0.4, hue=0.1),
    transforms.ToTensor(),
])

# Tensor -> PIL converter, so already-loaded tensors can be fed to ssl_transform.
to_pil = transforms.ToPILImage()

In [3]:

# CIFAR-10 training split, downloaded into ./data if needed.
# Only base_transform is applied here; the SSL augmentations are applied
# later, when the two views are built.
dataset = datasets.CIFAR10(
    "./data",
    train=True,
    transform=base_transform,
    download=True,
)

100%|██████████| 170M/170M [00:05<00:00, 31.1MB/s]


In [4]:

# Shuffled mini-batches of 256 images loaded by two worker processes.
# drop_last=True discards the final partial batch so every step sees a
# full batch.
loader = DataLoader(
    dataset,
    shuffle=True,
    batch_size=256,
    drop_last=True,
    num_workers=2,
)

print("DataLoader created. Number of batches:", len(loader))

DataLoader created. Number of batches: 195


In [5]:

class Encoder(nn.Module):
    """Small convolutional encoder producing one 128-dim vector per image.

    Two unpadded 3x3, stride-2 convolutions (3 -> 64 -> 128 channels), each
    followed by ReLU, then global average pooling down to 1x1.
    """

    def __init__(self):
        super().__init__()
        stages = [
            nn.Conv2d(3, 64, kernel_size=3, stride=2),
            nn.ReLU(),
            nn.Conv2d(64, 128, kernel_size=3, stride=2),
            nn.ReLU(),
            nn.AdaptiveAvgPool2d(1),
        ]
        self.net = nn.Sequential(*stages)

    def forward(self, x):
        """Encode a batch of 3-channel images into a (batch, 128) tensor."""
        pooled = self.net(x)
        batch_size = pooled.size(0)
        # Pooled maps are (batch, 128, 1, 1); collapse the trailing dims.
        return pooled.view(batch_size, -1)

In [6]:

class Predictor(nn.Module):
    """Two-layer MLP head that maps a `dim`-sized embedding to `dim` outputs.

    Args:
        dim: Width of the input, hidden and output layers (default 128).
    """

    def __init__(self, dim=128):
        super().__init__()
        layers = [
            nn.Linear(dim, dim),
            nn.ReLU(),
            nn.Linear(dim, dim),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the MLP to `x` and return the result."""
        prediction = self.net(x)
        return prediction

In [7]:

def ssl_loss(p, z):
    """Mean squared error between the L2-normalised rows of `p` and `z`.

    Both inputs are normalised along dim 1 first, so the result does not
    depend on the magnitude of either set of vectors. The mean is taken over
    every element (batch and feature dimensions).
    """
    unit_p, unit_z = (F.normalize(t, dim=1) for t in (p, z))
    return F.mse_loss(unit_p, unit_z)

In [8]:

# --- Online branch: trained by backpropagation ---
encoder = Encoder().to(device)
predictor = Predictor(dim=128).to(device)

# --- Target branch: a frozen copy of the online encoder, updated only via EMA ---
target_encoder = Encoder().to(device)
target_encoder.load_state_dict(encoder.state_dict())  # start from identical weights
target_encoder.requires_grad_(False)                  # the optimizer never touches it

# Adam over the online encoder and predictor parameters only.
optimizer = torch.optim.Adam(
    [*encoder.parameters(), *predictor.parameters()],
    lr=1e-3,
)

# EMA decay: fraction of the old target weights kept at each update.
ema_tau = 0.996

In [9]:
# =========================
# Collapse Diagnostics
# =========================

@torch.no_grad()
def feature_variance(z):
    """
    Variance of each embedding dimension across the batch, averaged over
    dimensions and returned as a Python float.
    A value near 0 signals collapsed representations.
    """
    per_dim_variance = torch.var(z, dim=0)
    return per_dim_variance.mean().item()


@torch.no_grad()
def cosine_similarity_mean(z1, z2):
    """
    Batch-averaged cosine similarity between matching rows of `z1` and `z2`,
    returned as a Python float.
    A value near 1.0 signals collapsed representations.
    """
    unit_a = F.normalize(z1, dim=1)
    unit_b = F.normalize(z2, dim=1)
    # Dot product of unit vectors is the per-sample cosine similarity.
    per_sample = torch.sum(unit_a * unit_b, dim=1)
    return per_sample.mean().item()


In [10]:

@torch.no_grad()
def update_target_encoder(online_encoder, target_encoder, tau):
    """Exponential-moving-average update of the target encoder's parameters.

    Each target parameter is updated in place to
    ``tau * target + (1 - tau) * online``. Parameters are paired in the order
    returned by ``.parameters()``; the online encoder is left unchanged.

    Args:
        online_encoder: Module whose parameters are blended in.
        target_encoder: Module whose parameters are overwritten.
        tau: Fraction of the existing target weights kept at each update.
    """
    online_params = online_encoder.parameters()
    target_params = target_encoder.parameters()
    for online_param, target_param in zip(online_params, target_params):
        # In place, so no new tensor is allocated per step and the parameter
        # keeps its storage (no rebinding of `.data`).
        target_param.mul_(tau).add_(online_param, alpha=1.0 - tau)

In [11]:
# =========================
# Training Loop (with diagnostics)
# =========================

epochs = 10

for epoch in range(epochs):
    # Running sums over the epoch; divided by the batch count below.
    total_loss = 0.0
    total_var = 0.0
    total_cos = 0.0

    for images, _ in loader:
        # The PIL-based augmentations run on the CPU, so convert each image to
        # PIL once, directly from the batch the loader yields. Moving the batch
        # to the GPU first would only force a device-to-host copy per image
        # per view.
        pil_images = [to_pil(img) for img in images]

        # Two independently augmented views of the same images
        view1 = torch.stack(
            [ssl_transform(pil) for pil in pil_images]
        ).to(device)

        view2 = torch.stack(
            [ssl_transform(pil) for pil in pil_images]
        ).to(device)

        # Online encoder
        z1 = encoder(view1)
        z2 = encoder(view2)

        # Predictor
        p1 = predictor(z1)
        p2 = predictor(z2)

        # Target encoder (EMA); computed under no_grad, so the outputs carry
        # no autograd history and need no explicit detach.
        with torch.no_grad():
            t1 = target_encoder(view1)
            t2 = target_encoder(view2)

        # Symmetrised BYOL-style loss: each view's prediction is matched to
        # the target embedding of the other view.
        loss = ssl_loss(p1, t2) + ssl_loss(p2, t1)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # EMA update of the target encoder from the freshly updated online one
        update_target_encoder(encoder, target_encoder, ema_tau)

        # Collapse diagnostics on the online embeddings computed before the
        # step (both helpers already run under no_grad).
        batch_var = feature_variance(z1)
        batch_cos = cosine_similarity_mean(z1, z2)

        total_loss += loss.item()
        total_var += batch_var
        total_cos += batch_cos

    avg_loss = total_loss / len(loader)
    avg_var = total_var / len(loader)
    avg_cos = total_cos / len(loader)

    print(
        f"Epoch [{epoch+1}/{epochs}] | "
        f"Loss: {avg_loss:.4f} | "
        f"Var: {avg_var:.4f} | "
        f"CosSim: {avg_cos:.4f}"
    )


Epoch [1/10] | Loss: 0.0011 | Var: 0.0386 | CosSim: 0.9979
Epoch [2/10] | Loss: 0.0001 | Var: 0.0517 | CosSim: 0.9950
Epoch [3/10] | Loss: 0.0001 | Var: 0.0499 | CosSim: 0.9942
Epoch [4/10] | Loss: 0.0001 | Var: 0.0500 | CosSim: 0.9942
Epoch [5/10] | Loss: 0.0001 | Var: 0.0475 | CosSim: 0.9940
Epoch [6/10] | Loss: 0.0002 | Var: 0.0439 | CosSim: 0.9931
Epoch [7/10] | Loss: 0.0002 | Var: 0.0398 | CosSim: 0.9921
Epoch [8/10] | Loss: 0.0002 | Var: 0.0371 | CosSim: 0.9907
Epoch [9/10] | Loss: 0.0002 | Var: 0.0358 | CosSim: 0.9886
Epoch [10/10] | Loss: 0.0003 | Var: 0.0331 | CosSim: 0.9862
