In [32]:
# Install third-party packages into the running kernel's environment.
# NOTE(review): neither package is imported in the cells visible here; presumably
# pycocotools is for reading the COCO annotations and safetensors for saving
# model weights later on - confirm against the rest of the notebook.
# NOTE(review): versions are not pinned, so a fresh install may pull different releases.
%pip install pycocotools safetensors

Note: you may need to restart the kernel to use updated packages.


In [33]:
import os 

# Root directory of the COCO 2017 dataset.
# Defaults to the local kagglehub cache location; set the COCO2017_DIR
# environment variable to point at a copy stored elsewhere so the notebook
# does not depend on one particular user's home directory.
base_path = os.environ.get(
    "COCO2017_DIR",
    "/home/loc-dang/.cache/kagglehub/datasets/awsaf49/coco-2017-dataset/versions/2/coco2017",
)

# Sanity check: list the annotation files and count the files in each image split.
print("Annotations:", os.listdir(os.path.join(base_path, "annotations")))
print("Training images:", len(os.listdir(os.path.join(base_path, "train2017"))))
print("Validation images:", len(os.listdir(os.path.join(base_path, "val2017"))))
print("Test images:", len(os.listdir(os.path.join(base_path, "test2017"))))

Annotations: ['captions_val2017.json', 'instances_val2017.json', 'captions_train2017.json', 'person_keypoints_val2017.json', 'instances_train2017.json', 'person_keypoints_train2017.json']


Training images: 118287
Validation images: 5000
Test images: 40670


In [34]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt

# Report the PyTorch build and, when CUDA is usable, the visible GPU hardware.
_cuda_ok = torch.cuda.is_available()

print("PyTorch version:", torch.__version__)
print("CUDA available:", _cuda_ok)
if _cuda_ok:
    print("CUDA version:", torch.version.cuda)
    print("Number of GPUs:", torch.cuda.device_count())
    _gpu_index = torch.cuda.current_device()
    print("Current device:", _gpu_index)
    print("Device name:", torch.cuda.get_device_name(_gpu_index))

# Device used by the rest of the notebook: the GPU when available, else the CPU.
device = torch.device("cuda") if _cuda_ok else torch.device("cpu")
print("Using device:", device)

PyTorch version: 2.7.1+cu126
CUDA available: True
CUDA version: 12.6
Number of GPUs: 1
Current device: 0
Device name: NVIDIA GeForce RTX 3060 Laptop GPU
Using device: cuda


In [35]:
class DoubleConv(nn.Module):
    """Two stacked 3x3 convolutions, each followed by an in-place ReLU.

    Both convolutions use ``padding=0``, so each one trims one pixel from every
    border: an ``H x W`` input comes out as ``(H - 4) x (W - 4)``.

    Args:
        in_channels: Number of channels of the input feature map.
        out_channels: Number of channels produced by both convolutions.
    """

    def __init__(self, in_channels, out_channels):
        super().__init__()
        stages = []
        # The first conv maps in_channels -> out_channels, the second keeps out_channels.
        for stage_in in (in_channels, out_channels):
            stages.append(nn.Conv2d(stage_in, out_channels, kernel_size=3, padding=0))
            stages.append(nn.ReLU(inplace=True))
        self.double_conv = nn.Sequential(*stages)

    def forward(self, x):
        """Run ``x`` through conv -> ReLU -> conv -> ReLU and return the result."""
        return self.double_conv(x)

In [36]:
# Smoke test: push a random image through one DoubleConv and report the shrinkage.
print("🧪 Testing DoubleConv:")
test_input = torch.randn(1, 3, 100, 100)  # (batch, channels, height, width)
double_conv = DoubleConv(3, 64)
output = double_conv(test_input)

print(f"Input shape: {test_input.shape}")
print(f"Output shape: {output.shape}")
# Only the last (width) dimension is compared; the input here is square.
print(f"Size change: {test_input.size(-1)} -> {output.size(-1)}")
print(f"Pixels lost: {test_input.size(-1) - output.size(-1)} (due to unpadded conv)")

🧪 Testing DoubleConv:
Input shape: torch.Size([1, 3, 100, 100])
Output shape: torch.Size([1, 64, 96, 96])
Size change: 100 -> 96
Pixels lost: 4 (due to unpadded conv)


In [37]:
class Encoder(nn.Module):
    """Down-sampling stage: a 2x2 max-pool (stride 2) followed by a DoubleConv.

    Args:
        in_channels: Number of channels of the incoming feature map.
        out_channels: Number of channels produced by the DoubleConv.
    """

    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.max_pool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.double_conv = DoubleConv(in_channels, out_channels)

    def forward(self, x):
        """Pool ``x`` by a factor of two, then apply the double convolution."""
        return self.double_conv(self.max_pool(x))

In [53]:
class Decoder(nn.Module):
    """Up-sampling stage: transposed conv, centre-cropped skip concat, DoubleConv.

    The input is up-sampled by a factor of two with a 2x2 transposed
    convolution, the skip-connection feature map is centre-cropped to the
    up-sampled spatial size, both are concatenated along the channel axis and
    the result is passed through a ``DoubleConv``.

    Args:
        in_channels: Channels of the tensor being up-sampled. The concatenated
            tensor fed to the ``DoubleConv`` must also have this many channels,
            so the skip connection has to carry ``in_channels - out_channels``
            channels.
        out_channels: Channels produced by the transposed convolution and by
            the ``DoubleConv``.
    """

    def __init__(self, in_channels, out_channels):
        super(Decoder, self).__init__()
        self.up_conv = nn.ConvTranspose2d(in_channels, out_channels, kernel_size=2, stride=2)
        self.double_conv = DoubleConv(in_channels, out_channels)

    def forward(self, x1, x2):
        """Up-sample ``x1``, merge it with the cropped skip map ``x2`` and convolve.

        Args:
            x1: 4-D tensor ``(batch, in_channels, H, W)`` from the previous stage.
            x2: 4-D skip-connection tensor whose spatial size is at least the
                up-sampled size ``(2H, 2W)``.

        Returns:
            The output of the ``DoubleConv`` applied to the concatenation.

        Raises:
            ValueError: If ``x2`` is spatially smaller than the up-sampled ``x1``.
        """
        up_sample = self.up_conv(x1)
        up_height, up_width = up_sample.shape[2], up_sample.shape[3]
        x2_height, x2_width = x2.shape[2], x2.shape[3]

        if x2_height < up_height or x2_width < up_width:
            raise ValueError(
                f"skip connection {x2_height}x{x2_width} is smaller than the "
                f"up-sampled feature map {up_height}x{up_width}; cannot crop"
            )

        # Centre crop with an independent offset per axis. Taking exactly
        # up_height/up_width elements (instead of trimming the same margin from
        # both ends) keeps the sizes equal even when the difference is odd or
        # the feature maps are not square.
        top = (x2_height - up_height) // 2
        left = (x2_width - up_width) // 2
        x2_crop = x2[:, :, top:top + up_height, left:left + up_width]

        concatenate = torch.cat([up_sample, x2_crop], dim=1)

        return self.double_conv(concatenate)

In [39]:
class UNet(nn.Module):
    """U-Net built from unpadded DoubleConv, Encoder and Decoder stages.

    The contracting path is one ``DoubleConv`` followed by four ``Encoder``
    stages (64 -> 128 -> 256 -> 512 -> 1024 channels). The expansive path is
    four ``Decoder`` stages that each consume the matching encoder output as a
    skip connection, followed by a 1x1 convolution producing ``num_classes``
    channels.

    Args:
        in_channels: Number of channels of the input image.
        num_classes: Number of output channels of the final 1x1 convolution.
    """

    def __init__(self, in_channels, num_classes):
        super().__init__()
        c1, c2, c3, c4, c5 = 64, 128, 256, 512, 1024

        # Contracting path.
        self.encoder_0 = DoubleConv(in_channels, c1)
        self.encoder_1 = Encoder(c1, c2)
        self.encoder_2 = Encoder(c2, c3)
        self.encoder_3 = Encoder(c3, c4)
        self.encoder_4 = Encoder(c4, c5)

        # Expansive path.
        self.decoder_0 = Decoder(c5, c4)
        self.decoder_1 = Decoder(c4, c3)
        self.decoder_2 = Decoder(c3, c2)
        self.decoder_3 = Decoder(c2, c1)

        # Per-pixel projection to class scores.
        self.out = nn.Conv2d(c1, num_classes, kernel_size=1, padding=0)

    def forward(self, x):
        """Run the full encoder/decoder and return the ``num_classes``-channel map."""
        skips = []
        features = self.encoder_0(x)
        for down in (self.encoder_1, self.encoder_2, self.encoder_3, self.encoder_4):
            # Remember each stage's input; it is the skip for the mirrored decoder.
            skips.append(features)
            features = down(features)

        decoders = (self.decoder_0, self.decoder_1, self.decoder_2, self.decoder_3)
        # Deepest skip first: decoder_0 pairs with encoder_3's output, and so on.
        for up, skip in zip(decoders, reversed(skips)):
            features = up(features, skip)

        return self.out(features)



In [54]:
# Smoke test for the assembled U-Net: one forward pass plus a parameter count.
print("🚀 Testing Complete U-Net:")

# RGB input, two output classes (binary segmentation).
model = UNet(in_channels=3, num_classes=2)
print(f"Model created: {type(model).__name__}")

# Dummy batch at the input resolution used in the original paper.
test_input = torch.randn(1, 3, 572, 572)
print(f"Input shape: {test_input.shape}")

# Inference only, so gradient tracking is switched off.
with torch.no_grad():
    output = model(test_input)

print(f"Output shape: {output.shape}")
print(f"Input size: {test_input.size(-1)} -> Output size: {output.size(-1)}")
print(f"Size reduction: {test_input.size(-1) - output.size(-1)} pixels")

# Parameter statistics: all parameters, and the subset that receives gradients.
total_params = sum(map(torch.numel, model.parameters()))
trainable_params = sum(
    map(torch.numel, (p for p in model.parameters() if p.requires_grad))
)
print("\nModel Parameters:")
print(f"Total: {total_params:,}")
print(f"Trainable: {trainable_params:,}")

🚀 Testing Complete U-Net:
Model created: UNet
Input shape: torch.Size([1, 3, 572, 572])
Output shape: torch.Size([1, 2, 388, 388])
Input size: 572 -> Output size: 388
Size reduction: 184 pixels

Model Parameters:
Total: 31,031,810
Trainable: 31,031,810
