<a href="https://colab.research.google.com/github/DL4CV-NPTEL/Deep-Learning-For-Computer-Vision/blob/main/notebooks/Week_12/Week_12_Lecture_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Lecture 3: Self-Supervised Learning

In [1]:
#@title 
from ipywidgets import widgets
out1 = widgets.Output()
with out1:
  from IPython.display import YouTubeVideo
  video = YouTubeVideo(id=f"hTroTWINb5w", width=854, height=480, fs=1, rel=0)
  print("Video available at https://youtube.com/watch?v=" + video.id)
  display(video)
display(out1)

Output()

In [2]:
#@title 
from IPython import display as IPyDisplay
IPyDisplay.HTML(
    f"""
  <div>
    <a href= "https://github.com/DL4CV-NPTEL/Deep-Learning-For-Computer-Vision/blob/main/Slides/Week_12/DL4CV_Week12_Part02.pdf" target="_blank">
    <img src="https://github.com/DL4CV-NPTEL/Deep-Learning-For-Computer-Vision/blob/main/Data/Slides_Logo.png?raw=1"
  alt="button link to Airtable" style="width:200px"></a>
    </div>""" )


## Image Inpainting

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from torch.autograd import Variable
from torch.utils.data import Subset, DataLoader

In [4]:
# Define the Context Encoder model
class ContextEncoder(nn.Module):
    def __init__(self):
        super(ContextEncoder, self).__init__()

        # Encoder layers
        self.encoder = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=4, stride=2, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(64, 128, kernel_size=4, stride=2, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(128, 256, kernel_size=4, stride=2, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(256, 512, kernel_size=4, stride=2, padding=1),
            nn.ReLU(inplace=True)
        )

        # Decoder layers
        self.decoder = nn.Sequential(
            nn.ConvTranspose2d(512, 256, kernel_size=4, stride=2, padding=1),
            nn.ReLU(inplace=True),
            nn.ConvTranspose2d(256, 128, kernel_size=4, stride=2, padding=1),
            nn.ReLU(inplace=True),
            nn.ConvTranspose2d(128, 64, kernel_size=4, stride=2, padding=1),
            nn.ReLU(inplace=True),
            nn.ConvTranspose2d(64, 3, kernel_size=4, stride=2, padding=1),
            nn.Sigmoid()
        )

    def forward(self, x):
        x = self.encoder(x)
        x = self.decoder(x)
        return x

In [5]:
# Define the Discriminator model
class Discriminator(nn.Module):
    def __init__(self):
        super(Discriminator, self).__init__()

        self.features = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=4, stride=2, padding=1),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Conv2d(64, 128, kernel_size=4, stride=2, padding=1),
            nn.BatchNorm2d(128),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Conv2d(128, 256, kernel_size=4, stride=2, padding=1),
            nn.BatchNorm2d(256),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Conv2d(256, 512, kernel_size=4, stride=2, padding=1),
            nn.BatchNorm2d(512),
            nn.LeakyReLU(0.2, inplace=True)
        )

        self.classifier = nn.Sequential(
            nn.Linear(512 * 2 * 2, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        x = self.features(x)
        x = x.view(x.size(0), -1)
        x = self.classifier(x)
        return x

In [6]:
# Define the training loop
def train(model, discriminator, optimizer_g, optimizer_d, dataloader, num_epochs):
    criterion = nn.MSELoss()
    adversarial_loss = nn.BCELoss()
    for epoch in range(num_epochs):
        for data in dataloader:
            img, _ = data
            img = Variable(img)
            
            # Generate a mask for inpainting
            mask = torch.zeros_like(img)
            mask[:, :, 16:48, 16:48] = 1

            # Generate masked image
            masked_img = img * (1 - mask)

            # Forward pass through generator
            generated_img = model(masked_img)

            # Train Discriminator
            real_labels = Variable(torch.ones(img.size(0), 1))
            fake_labels = Variable(torch.zeros(img.size(0), 1))

            # Discriminator on real images
            optimizer_d.zero_grad()
            real_outputs = discriminator(img)
            real_loss = adversarial_loss(real_outputs, real_labels)

            # Discriminator on generated images
            fake_outputs = discriminator(generated_img.detach())
            fake_loss = adversarial_loss(fake_outputs, fake_labels)

            d_loss = real_loss + fake_loss
            d_loss.backward()
            optimizer_d.step()

            # Train Generator
            optimizer_g.zero_grad()

            # Forward pass through generator
            generated_img = model(masked_img)

            # Discriminator on generated images
            outputs = discriminator(generated_img)
            g_loss = adversarial_loss(outputs, real_labels)

            # Reconstruction loss
            recon_loss = criterion(generated_img * mask, img * mask)

            total_loss = g_loss + 0.1 * recon_loss
            total_loss.backward()
            optimizer_g.step()
            


        # Print epoch loss
        print('Epoch [{}/{}], Generator Loss: {:.4f}, Discriminator Loss: {:.4f}'
              .format(epoch + 1, num_epochs, g_loss.item(), d_loss.item()))


In [7]:
# Hyperparameters
learning_rate = 0.001
batch_size = 100
num_epochs = 5
subset_size = 50

In [8]:
# Load the dataset
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])
train_dataset = datasets.CIFAR10(root='./data', train=True, download=True, transform=transform)

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz


100%|██████████| 170498071/170498071 [00:01<00:00, 103549893.38it/s]


Extracting ./data/cifar-10-python.tar.gz to ./data


In [9]:
# Create a balanced subset of the dataset
class_counts = [0] * 10  # Count the number of samples per class
subset_indices = []
for idx, (data, target) in enumerate(train_dataset):
    if class_counts[target] < subset_size:
        subset_indices.append(idx)
        class_counts[target] += 1
    if all(count == subset_size for count in class_counts):
        break

subset_dataset = Subset(train_dataset, subset_indices)
train_dataloader = DataLoader(subset_dataset, batch_size=batch_size, shuffle=True)

In [10]:
# Create an instance of the ContextEncoder model and the Discriminator model
model = ContextEncoder()
discriminator = Discriminator()


In [11]:
# Define optimizers for generator and discriminator
optimizer_g = optim.Adam(model.parameters(), lr=learning_rate)
optimizer_d = optim.Adam(discriminator.parameters(), lr=learning_rate)

In [12]:
# Train the model
train(model, discriminator, optimizer_g, optimizer_d, train_dataloader, num_epochs)

Epoch [1/5], Generator Loss: 7.4896, Discriminator Loss: 0.0178
Epoch [2/5], Generator Loss: 7.3318, Discriminator Loss: 0.0062
Epoch [3/5], Generator Loss: 8.6788, Discriminator Loss: 0.0016
Epoch [4/5], Generator Loss: 9.1010, Discriminator Loss: 0.0011
Epoch [5/5], Generator Loss: 8.3456, Discriminator Loss: 0.0008
