## Task 2

### install required libraries

In [6]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
from PIL import Image
from PIL import ImageDraw
import os
import json
from tqdm import tqdm
import time  # For tracking training time
import numpy as  np

### create a custom dataset class to load images and masks 

In [2]:
class SegmentationDataset(Dataset):
    def __init__(self, image_dir, annotation_file, transform=None, resize_size=(224, 224)):
        self.image_dir = image_dir
        self.annotation_file = annotation_file
        self.transform = transform
        self.resize_size = resize_size  # Store resize size
        self.images = self._load_images()
        self.annotations = self._load_annotations()

    def _load_images(self):
        with open(self.annotation_file, 'r') as f:
            data = json.load(f)
        return {img['id']: img['file_name'] for img in data['images']}

    def _load_annotations(self):
        with open(self.annotation_file, 'r') as f:
            data = json.load(f)
        return {ann['image_id']: ann for ann in data['annotations']}

    def __len__(self):
        return len(self.images)

    def __getitem__(self, idx):
        image_id = list(self.images.keys())[idx]
        img_path = os.path.join(self.image_dir, self.images[image_id])
        image = Image.open(img_path).convert('RGB')

        mask = self._create_mask(image_id, image.size)

        # Resize both image and mask
        image = image.resize(self.resize_size, Image.BILINEAR)  # Use PIL's resize
        mask = Image.fromarray(mask).resize(self.resize_size, Image.NEAREST) # Resize mask
        mask = np.array(mask)

        if self.transform:
            image = self.transform(image)
            mask = torch.from_numpy(mask).long() # Ensure mask is LongTensor

        return image, mask

    def _create_mask(self, image_id, image_size):
        width, height = image_size
        mask = np.zeros((height, width), dtype=np.uint8)
        anns = [ann for ann in self.annotations.values() if ann['image_id'] == image_id]
        for ann in anns:
            if isinstance(ann['segmentation'], list):
                for polygon in ann['segmentation']:
                    polygon_points = [(polygon[i], polygon[i + 1]) for i in range(0, len(polygon), 2)]
                    ImageDraw.Draw(Image.fromarray(mask)).polygon(polygon_points, fill=ann['category_id'])
        return mask

### UNet model for training

In [3]:
class UNet(nn.Module):
    def __init__(self, in_channels=3, out_channels=None):
        super(UNet, self).__init__()
        if out_channels is None:
            out_channels = 81  # COCO has 80 classes + background
        # ... (UNet layers - encoder and decoder blocks) ...
        # Example (replace with actual UNet layers):
        self.conv1 = nn.Conv2d(in_channels, 64, kernel_size=3, padding=1)
        self.conv_out = nn.Conv2d(64, out_channels, kernel_size=1)
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        # ... (UNet forward pass) ...
        x = self.conv1(x)
        x = self.relu(x)
        x = self.conv_out(x)
        return x

### Training loop

In [4]:
def train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs, device):
    start_time = time.time()
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        for images, masks in tqdm(train_loader, desc=f'Epoch {epoch + 1}/{num_epochs}'):
            images = images.to(device)
            masks = masks.to(device)

            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, masks)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        avg_train_loss = total_loss / len(train_loader)
        print(f'Epoch {epoch + 1}/{num_epochs}, Average Train Loss: {avg_train_loss:.4f}')

        # Validation (optional, but highly recommended)
        model.eval()
        total_val_loss = 0
        with torch.no_grad():
            for images, masks in val_loader:
                images = images.to(device)
                masks = masks.to(device)
                outputs = model(images)
                loss = criterion(outputs, masks)
                total_val_loss += loss.item()

        avg_val_loss = total_val_loss / len(val_loader)
        print(f'Epoch {epoch + 1}/{num_epochs}, Average Val Loss: {avg_val_loss:.4f}')

        elapsed_time = time.time() - start_time
        print(f"Epoch time: {elapsed_time:.2f} seconds")
        if elapsed_time > 6 * 3600:  # Check for 6-hour limit
            print("Training stopped: Time limit exceeded.")
            break

    return model

### Main execution

In [None]:
import os

# Define base directory relative to the notebook location (which is vjti_solution)
# This assumes your 'src' folder is directly inside 'vjti_solution'
base_dir = 'src'

# Construct paths using os.path.join for better portability
image_dir = os.path.join(base_dir, 'train_subset')
annotation_file = os.path.join(base_dir, 'subset_train_annotations.json')
output_dir = os.path.join(base_dir, 'masks_2')

print(f"Current Working Directory: {os.getcwd()}") 
print(f"Image Directory: {image_dir}")
print(f"Annotation File: {annotation_file}")
print(f"Output Directory: {output_dir}")

Image Directory: src/train_subset
Annotation File: src/subset_train_annotations.json
Output Directory: src/masks_2


In [None]:
# Hyperparameters and data paths (adjust as needed)


batch_size = 8
num_epochs = 5
learning_rate = 0.001
resize_size = (224, 224)

# Transformations
transform = transforms.Compose([
    transforms.ToTensor(),
    #transforms.Resize((224, 224), antialias=True), # Resize for efficiency
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Datasets and DataLoaders
full_dataset = SegmentationDataset(image_dir, annotation_file, transform=transform, resize_size=resize_size)
train_size = int(0.8 * len(full_dataset))
val_size = len(full_dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(full_dataset, [train_size, val_size])

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=64)
val_loader = DataLoader(val_dataset, batch_size=batch_size, num_workers=64)

# Model, Criterion, Optimizer
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = UNet().to(device)  # Replace with your UNet implementation
criterion = nn.CrossEntropyLoss().to(device)  # Suitable for multi-class segmentation
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Train the model
trained_model = train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs, device)

# Save the model (optional)
torch.save(trained_model.state_dict(), 'segmentation_model.pth')

Epoch 1/5: 100%|██████████| 800/800 [04:00<00:00,  3.33it/s]

Epoch 1/5, Average Train Loss: 0.2785





Epoch 1/5, Average Val Loss: 0.0039
Epoch time: 266.28 seconds


Epoch 2/5: 100%|██████████| 800/800 [04:15<00:00,  3.13it/s]

Epoch 2/5, Average Train Loss: 0.0018





Epoch 2/5, Average Val Loss: 0.0007
Epoch time: 547.30 seconds


Epoch 3/5: 100%|██████████| 800/800 [04:36<00:00,  2.89it/s]

Epoch 3/5, Average Train Loss: 0.0005





Epoch 3/5, Average Val Loss: 0.0003
Epoch time: 849.57 seconds


Epoch 4/5: 100%|██████████| 800/800 [04:49<00:00,  2.76it/s]

Epoch 4/5, Average Train Loss: 0.0002





Epoch 4/5, Average Val Loss: 0.0001
Epoch time: 1164.87 seconds


Epoch 5/5: 100%|██████████| 800/800 [05:01<00:00,  2.65it/s]

Epoch 5/5, Average Train Loss: 0.0001





Epoch 5/5, Average Val Loss: 0.0001
Epoch time: 1493.20 seconds
