In [1]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install wandb



In [3]:
import zipfile

zip_file_path = '/content/drive/MyDrive/OSCD.zip'
dataset_folder = '/content/drive/MyDrive/OSCD/'

if len(os.listdir(dataset_folder)) == 0:
  with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
      zip_ref.extractall(dataset_folder)

  print(f"Unzipped to: {dataset_folder}")

In [4]:
import torch
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
import pathlib

dataset_path = pathlib.Path(os.path.join(dataset_folder, 'labelme'))
images_files = list(dataset_path.glob('**/*.jpg'))
annotations_files = list(dataset_path.glob('**/*.json'))


In [5]:
import torch
from torch.utils.data import Dataset, random_split
from torchvision.io import read_image
from torchvision.ops.boxes import masks_to_boxes
from torchvision import tv_tensors
from torchvision.tv_tensors import Mask, BoundingBoxes
from torchvision.transforms.v2 import functional as F
import json
from torchvision import transforms
import torchvision

from PIL import Image, ImageDraw

class OSCDDataset(Dataset):
  def __init__(self, images_files, annotations_files, transforms=None):
    super().__init__()
    self._file_keys = [i.stem for i in images_files]
    self._images_files = {i.stem: i for i in images_files}
    self._annotations_files = {a.stem: a for a in annotations_files}
    self._transforms = transforms

  def __len__(self):
    return len(self._file_keys)

  def __getitem__(self, idx):
    file_key = self._file_keys[idx]
    image, annotations_dict = self._read_data(file_key)
    shapes = annotations_dict['shapes']
    if len(shapes) == 0:
      target = self._get_background_only_target(image)
      pass
    else:
      target = self._get_target(image, shapes)

    # image = tv_tensors.Image(image)
    if self._transforms:
      image = self._transforms(image)
    return image, target

  def _read_data(self, file_key):
    image_file = self._images_files[file_key]
    image = Image.open(image_file).convert('RGB')

    annotations_file = self._annotations_files[file_key]
    with open(annotations_file, 'r') as f:
      annotations_dict = json.load(f)

    return image, annotations_dict

  def _get_background_only_target(self, image):
    mask_img = Image.new('L', image.size, 0)
    # negative_mask = Mask(mask_img, dtype=torch.bool)
    # bboxes = tv_tensors.BoundingBoxes([0, 0, image.size[0], image.size[1]], format="XYXY", canvas_size=F.get_size(image))
    # area = torch.tensor((image.size[0] * image.size[1]), dtype=torch.float32)
    target = {
        'boxes': torch.zeros((0, 4), dtype=torch.float32),
        'labels': torch.zeros(0, dtype=torch.int64),
        'masks': torch.zeros((0, image.size[1], image.size[0]), dtype=torch.bool),
        'area': torch.zeros(0, dtype=torch.float32),
        'iscrowd': torch.zeros(0, dtype=torch.int64),
    }
    return target

  def _get_target(self, image, shapes):
    labels = torch.ones((len(shapes), ), dtype=torch.int64)
    shape_points = [shape['points'] for shape in shapes]
    xy_coords = [[tuple(p) for p in points] for points in shape_points]
    ### masks
    masks = Mask(torch.concat([self._create_polygon_mask(image.size, xy) for xy in xy_coords]))
    boxes = masks_to_boxes(masks)
    area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])
    iscrowd = torch.zeros((len(shapes),), dtype=torch.int64)
    bboxes = tv_tensors.BoundingBoxes(boxes, format="XYXY", canvas_size=F.get_size(image))
    target = {
        'boxes': bboxes,
        'labels': labels,
        'masks': Mask(masks),
        'area': area,
        'iscrowd': iscrowd
    }
    return target

  def _create_polygon_mask(self, image_size, vertices):
    mask_img = Image.new('L', image_size, 0)
    ImageDraw.Draw(mask_img, 'L').polygon(vertices, fill=(255))
    mask = Mask(mask_img, dtype=torch.bool)
    return mask

In [6]:

dataset = OSCDDataset(images_files, annotations_files, transforms=torchvision.transforms.ToTensor())
size = len(dataset)
train_size = int(0.8 * size)
val_size = int(0.1 * size)
test_size = size - train_size - val_size
train_data, val_data, test_data = random_split(dataset, [train_size, val_size, test_size])

In [7]:
import torchvision
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.models.detection.mask_rcnn import MaskRCNNPredictor
from torchvision.models.detection.rpn import RegionProposalNetwork
from torchvision.models.detection.rpn import AnchorGenerator


def create_model(num_classes=2):

  model = torchvision.models.detection.maskrcnn_resnet50_fpn(weights="DEFAULT")
  in_features = model.roi_heads.box_predictor.cls_score.in_features
  model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)

  in_features_mask = model.roi_heads.mask_predictor.conv5_mask.in_channels
  model.roi_heads.mask_predictor = MaskRCNNPredictor(in_features_mask, 256, num_classes)
  return model

In [8]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)

cuda


In [14]:
import math
from tqdm.auto import tqdm

from torch.amp import autocast, GradScaler

def run_epoch(model, dataloader, optimizer, device, scaler, is_training):
    model.train()
    progress_bar = tqdm(total=len(dataloader), desc="Train" if is_training else "Valid")  # Initialize a progress bar
    epoch_total_loss = 0.
    epoch_losses = {
      'loss_classifier': 0,
      'loss_box_reg': 0.,
      'loss_mask': 0.,
      'loss_objectness': 0.,
      'loss_rpn_box_reg': 0.}
    for batch_id, (images, targets) in enumerate(dataloader):

        images = [image.to(device) for image in images]
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        with autocast(device_type=device.type, dtype=torch.bfloat16):
            if is_training:
                losses = model(images, targets)
            else:
                with torch.no_grad():
                    losses = model(images, targets)

            total_loss = sum([loss for loss in losses.values()])

        if is_training:
            optimizer.zero_grad()
            if scaler:
                scaler.scale(total_loss).backward()
                scaler.step(optimizer)
                scaler.update()
            else:
                total_loss.backward()
                optimizer.step()

        epoch_losses = {k: v.item() + epoch_losses[k] for k, v in losses.items()}
        epoch_total_loss += total_loss.item()
        progress_bar_dict = dict(avg_loss=epoch_total_loss/(batch_id+1))
        progress_bar.set_postfix(progress_bar_dict)
        progress_bar.update()
        if is_training:
          assert not math.isnan(total_loss.item()) and math.isfinite(total_loss.item()), "Loss is NaN or infinite. Stopping training."
    progress_bar.close()
    epoch_losses = {k: v/(batch_id + 1) for k, v in epoch_losses.items()}
    return epoch_losses

In [15]:
sweep_config = {
    'method': 'random'
    }

metric = {
    'name': 'valid/loss',
    'goal': 'minimize'
    }

sweep_config['metric'] = metric
parameters_dict = {
    'epochs': {
        'values': [10]
        },
    'lr': {
        'values': [1e-4]
        },
    'weight_decay': {
          'values': [1e-2]
        },
    'bs': {
          'values': [2]
        },
    'save_model_every': {
          'values': [10]
        },
    'scheduler': {
          'values': ['step']
        },
    'step_size': {
          'values': [40]
        },
    'gamma': {
          'values': [0.1]
        },
    'optimizer_type': {
          'values': ['adamw']
        },
    }
sweep_config['parameters'] = parameters_dict

In [16]:
import wandb
sweep_id = wandb.sweep(sweep_config, project="box_segmentation")

Create sweep with ID: lt594in8
Sweep URL: https://wandb.ai/abdelrahman-farhan/box_segmentation/sweeps/lt594in8


In [17]:
from torch.optim import AdamW
import datetime
import torchvision
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.models.detection.mask_rcnn import MaskRCNNPredictor


num_workers = 12

def train(config=None):
  with wandb.init(config=config):
    config = wandb.config

    model = create_model()
    model.to(device)
    params = [p for p in model.parameters() if p.requires_grad]
    optimizer_type = config.optimizer_type

    if optimizer_type == 'sgd':
      optimizer = torch.optim.SGD(params, lr=config.lr, momentum=0.9, weight_decay=config.weight_decay)
    elif optimizer_type == 'adamw':
      optimizer = torch.optim.AdamW(params, lr=config.lr, weight_decay=config.weight_decay)

    if config.scheduler == 'step':
      lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=config.step_size, gamma=config.gamma)
    elif config.scheduler == 'linear':
      lr_scheduler = torch.optim.lr_scheduler.LinearLR(optimizer, start_factor=1.0, end_factor=0.1, total_iters=config.epochs)
    elif config.scheduler == 'cyclic':
      lr_scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=config.lr, total_steps=config.epochs)


    train_loader = DataLoader(train_data, batch_size=config.bs, shuffle=True, collate_fn= lambda batch: tuple(zip(*batch)), num_workers=num_workers)
    val_loader = DataLoader(val_data, batch_size=config.bs, shuffle=True, collate_fn= lambda batch: tuple(zip(*batch)), num_workers=num_workers)

    scaler = GradScaler()
    for epoch in tqdm(range(config.epochs), desc="Epochs"):

        train_losses = run_epoch(model, train_loader, optimizer, device, scaler, is_training=True)

        with torch.no_grad():
            valid_losses = run_epoch(model, val_loader, None, device, scaler, is_training=False)
        lr_scheduler.step()

        train_losses = {f'train/{k}': v for k, v in train_losses.items()}
        wandb.log(train_losses)
        train_loss = sum(train_losses.values())
        wandb.log({'train/loss': train_loss})

        valid_losses = {f'valid/{k}': v for k, v in valid_losses.items()}
        wandb.log(valid_losses)
        valid_loss = sum(valid_losses.values())
        wandb.log({'valid/loss': valid_loss})

        wandb.log({'lr': lr_scheduler.get_last_lr()[0]})
        model_name = f'model_{wandb.run.name}_{wandb.run.sweep_id}_{epoch}.pth'
        if (epoch+1) % config.save_model_every == 0:
          model_path = os.path.join(dataset_folder, 'model', model_name)
          torch.save(model.state_dict(), model_path)

In [19]:
wandb.agent(sweep_id, train, count=1)

[34m[1mwandb[0m: Agent Starting Run: ds41jnot with config:
[34m[1mwandb[0m: 	bs: 2
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	gamma: 0.1
[34m[1mwandb[0m: 	lr: 0.0001
[34m[1mwandb[0m: 	optimizer_type: adamw
[34m[1mwandb[0m: 	save_model_every: 10
[34m[1mwandb[0m: 	scheduler: step
[34m[1mwandb[0m: 	step_size: 40
[34m[1mwandb[0m: 	weight_decay: 0.01


Epochs:   0%|          | 0/10 [00:00<?, ?it/s]

Train:   0%|          | 0/3360 [00:00<?, ?it/s]

Traceback (most recent call last):
  File "<ipython-input-17-3864b8ec0b96>", line 38, in train
    train_losses = run_epoch(model, train_loader, optimizer, device, scaler, is_training=True)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "<ipython-input-14-963ed2c56681>", line 16, in run_epoch
    for batch_id, (images, targets) in enumerate(dataloader):
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 708, in __next__
    data = self._next_data()
           ^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1480, in _next_data
    return self._process_data(data)
           ^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1505, in _process_data
    data.reraise()
  File "/usr/local/lib/python3.11/dist-packages/torch/_utils.py", line 733, in reraise
    raise exception
RuntimeError: C

[34m[1mwandb[0m: [32m[41mERROR[0m Run ds41jnot errored:
[34m[1mwandb[0m: [32m[41mERROR[0m Traceback (most recent call last):
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/usr/local/lib/python3.11/dist-packages/wandb/agents/pyagent.py", line 306, in _run_job
[34m[1mwandb[0m: [32m[41mERROR[0m     self._function()
[34m[1mwandb[0m: [32m[41mERROR[0m   File "<ipython-input-17-3864b8ec0b96>", line 38, in train
[34m[1mwandb[0m: [32m[41mERROR[0m     train_losses = run_epoch(model, train_loader, optimizer, device, scaler, is_training=True)
[34m[1mwandb[0m: [32m[41mERROR[0m                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[34m[1mwandb[0m: [32m[41mERROR[0m   File "<ipython-input-14-963ed2c56681>", line 16, in run_epoch
[34m[1mwandb[0m: [32m[41mERROR[0m     for batch_id, (images, targets) in enumerate(dataloader):
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/usr/local/lib/python3.11/dist-packages/to