<a href="https://colab.research.google.com/github/AbdElRahmanFarhan/box_semantic_segmentation/blob/main/hyperparameter_tunning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pycocotools
!pip install coco-eval
!pip install wandb

Collecting coco-eval
  Downloading coco_eval-0.0.4.tar.gz (4.5 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch->coco-eval)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch->coco-eval)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch->coco-eval)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch->coco-eval)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch->coco-eval)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch->coco-eval)
 

In [2]:
# Mount Google Drive into the Colab runtime; the dataset archive read below
# and the checkpoints written during training both live under this mount.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import os
import wandb
# Force synchronous CUDA kernel launches so device-side errors are reported at
# the call that caused them.
# NOTE(review): this is a debugging aid and typically slows GPU training —
# consider removing it before long sweep runs.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
# Authenticate with Weights & Biases (prompts for an API key if none is stored).
wandb.login()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mabdofarhan75[0m ([33mabdelrahman-farhan[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [4]:
import zipfile

zip_file_path = '/content/drive/MyDrive/OSCD.zip'
dataset_folder = '/content/drive/MyDrive/OSCD/'

# Extract the archive only when the target folder is missing or empty.
# os.listdir() raises FileNotFoundError on a missing folder, so the existence
# check has to come first (this is the situation on the very first run).
if not os.path.isdir(dataset_folder) or not os.listdir(dataset_folder):
    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        zip_ref.extractall(dataset_folder)

    print(f"Unzipped to: {dataset_folder}")

In [5]:
import torch
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

# Image folders and COCO annotation files of the single-class carton split,
# all resolved relative to the extracted dataset root.
train_folder, val_folder, train_annotation, val_annotation = (
    os.path.join(dataset_folder, relative_path)
    for relative_path in (
        'coco_carton/oneclass_carton/images/train2017',
        'coco_carton/oneclass_carton/images/val2017',
        'coco_carton/oneclass_carton/annotations/instances_train2017.json',
        'coco_carton/oneclass_carton/annotations/instances_val2017.json',
    )
)

In [6]:
import torch
from torchvision.datasets import CocoDetection
import torchvision.transforms as T
from PIL import Image
from torchvision.tv_tensors import Mask
from torchvision.transforms.functional import to_tensor
from skimage.draw import polygon as sk_polygon


class OSCDDataset(CocoDetection):
    """COCO-format carton dataset yielding ``(image, target)`` pairs for Mask R-CNN.

    Each item is the image converted with ``to_tensor`` together with a target
    dict holding ``boxes`` (``[x1, y1, x2, y2]``), ``labels``, ``image_id``,
    ``ids``, ``area``, ``iscrowd`` and ``masks``. Images without annotations
    yield an empty dict as target so that the caller can filter them out.

    Args:
        img_folder: Directory containing the images.
        ann_file: Path to the COCO annotation JSON file.
    """

    def __init__(self, img_folder, ann_file):
        super().__init__(img_folder, ann_file, transforms=None)

    def __len__(self) -> int:
        return super().__len__()

    def __getitem__(self, idx):
        """Return the image tensor and its target dict for sample ``idx``.

        Returns:
            tuple: ``(img, target)`` where ``img`` is the output of
            ``to_tensor`` and ``target`` is either the dict described in the
            class docstring or ``{}`` when the image has no annotations.
        """
        img, anns = super().__getitem__(idx)
        width, height = img.size  # PIL reports (width, height)

        if anns:
            boxes, labels, areas, iscrowd, masks, ids = [], [], [], [], [], []
            for ann in anns:
                # COCO stores boxes as [x, y, w, h]; convert to corner format.
                # Coordinates are kept as floats: truncating to int could
                # collapse very thin boxes to zero width/height.
                x, y, w, h = ann['bbox']
                boxes.append([x, y, x + w, y + h])
                labels.append(ann['category_id'])
                areas.append(ann['area'])
                iscrowd.append(ann['iscrowd'])
                masks.append(self.get_mask(ann['segmentation'], height, width))
                ids.append(ann['id'])

            target = {
                # torchvision detection models document boxes as float tensors.
                "boxes": torch.tensor(boxes, dtype=torch.float32),
                "labels": torch.tensor(labels, dtype=torch.int64),
                # Every annotation of one image carries the same image_id, so
                # read it from the first one instead of a leaked loop variable.
                "image_id": torch.tensor(anns[0]['image_id'], dtype=torch.int64),
                "ids": torch.tensor(ids, dtype=torch.int64),
                # Singular "area" mirrors the COCO annotation field name.
                "area": torch.tensor(areas, dtype=torch.float32),
                "iscrowd": torch.tensor(iscrowd, dtype=torch.uint8),
                "masks": Mask(torch.stack(masks, dim=0)),
            }
        else:
            target = {}
        img = to_tensor(img)
        return img, target

    def get_mask(self, segmentation, height, width):
        """Rasterise a COCO polygon segmentation into a boolean ``(height, width)`` mask.

        Args:
            segmentation: List of polygons, each a flat ``[x0, y0, x1, y1, ...]``
                coordinate list. All polygons are merged into one mask.
            height: Mask height in pixels.
            width: Mask width in pixels.

        Raises:
            NotImplementedError: If ``segmentation`` is a dict (RLE encoding),
                which this polygon rasteriser cannot decode.
        """
        if isinstance(segmentation, dict):
            raise NotImplementedError("RLE segmentations are not supported by get_mask")

        mask = torch.zeros((height, width), dtype=torch.bool)
        # An instance may be described by several polygons (e.g. when it is
        # partially occluded); every one of them contributes to the mask.
        for poly in segmentation:
            poly_x = poly[::2]
            poly_y = poly[1::2]
            rr, cc = sk_polygon(poly_y, poly_x, shape=(height, width))
            mask[rr, cc] = 1
        return mask


In [7]:
# Full training / validation datasets backed by the COCO annotation files.
train_dataset = OSCDDataset(img_folder=train_folder, ann_file=train_annotation)
val_dataset = OSCDDataset(img_folder=val_folder, ann_file=val_annotation)

# Tiny fixed subsets (first 100 / first 10 samples) for quick smoke runs.
train_dataset_small = torch.utils.data.Subset(train_dataset, [*range(100)])
val_dataset_small = torch.utils.data.Subset(val_dataset, [*range(10)])

loading annotations into memory...
Done (t=2.99s)
creating index...
index created!
loading annotations into memory...
Done (t=0.43s)
creating index...
index created!


In [8]:
def collate_fn(batch):
    """Collate ``(image, target)`` samples into two parallel lists.

    Samples whose target is empty (falsy) are dropped, so the returned lists
    may be shorter than ``batch`` and can even be empty.

    Args:
        batch: Iterable of ``(image, target)`` pairs.

    Returns:
        tuple: ``(images, targets)`` — two lists of equal length, in the
        original sample order.
    """
    kept = [(image, target) for image, target in batch if target]
    images = [image for image, _ in kept]
    targets = [target for _, target in kept]
    return images, targets

In [9]:
# import torchvision
# from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
# from torchvision.models.detection.mask_rcnn import MaskRCNNPredictor

# model = torchvision.models.detection.maskrcnn_resnet50_fpn(weights="DEFAULT")
# in_features = model.roi_heads.box_predictor.cls_score.in_features
# model.roi_heads.box_predictor = FastRCNNPredictor(in_features, 2)

# in_features_mask = model.roi_heads.mask_predictor.conv5_mask.in_channels
# model.roi_heads.mask_predictor = MaskRCNNPredictor(in_features_mask, 256, 2)

Downloading: "https://download.pytorch.org/models/maskrcnn_resnet50_fpn_coco-bf2d0c1e.pth" to /root/.cache/torch/hub/checkpoints/maskrcnn_resnet50_fpn_coco-bf2d0c1e.pth
100%|██████████| 170M/170M [00:01<00:00, 174MB/s]


In [18]:
import torchvision
from torchvision.models.detection import MaskRCNN
from torchvision.models.detection.backbone_utils import BackboneWithFPN
from torchvision.models.resnet import resnet18
from torchvision.models.detection.backbone_utils import BackboneWithFPN

# ImageNet-pretrained ResNet-18. The explicit `weights=` argument replaces the
# deprecated `pretrained=True` flag and selects the same IMAGENET1K_V1 weights.
resnet = resnet18(weights="IMAGENET1K_V1")

# Keep only the convolutional trunk (no avgpool / fc). Inside the Sequential
# the children are named '0'..'7', which is what `return_layers` refers to.
backbone = torch.nn.Sequential(
    resnet.conv1,
    resnet.bn1,
    resnet.relu,
    resnet.maxpool,
    resnet.layer1,
    resnet.layer2,
    resnet.layer3,
    resnet.layer4
)

# Map each residual stage to the feature-map name handed to the FPN.
return_layers = {
    '4': '0',  # layer1
    '5': '1',  # layer2
    '6': '2',  # layer3
    '7': '3',  # layer4
}

# Output channels of layer1..layer4 in ResNet-18, in the same order as above.
in_channels_list = [64, 128, 256, 512]
out_channels = 256

fpn_backbone = BackboneWithFPN(
    backbone,
    return_layers=return_layers,
    in_channels_list=in_channels_list,
    out_channels=out_channels
)

# Two classes: background + carton.
model = MaskRCNN(backbone=fpn_backbone, num_classes=2)

In [19]:
model

MaskRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
      Resize(min_size=(800,), max_size=1333, mode='bilinear')
  )
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (0): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU(inplace=True)
      (3): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (4): Sequential(
        (0): BasicBlock(
          (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (relu): ReLU(inplace=True)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, tr

In [20]:
# Prefer the GPU when one is visible to PyTorch, otherwise fall back to CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [21]:
import math
from tqdm.auto import tqdm
from torch.amp import autocast, GradScaler

def run_epoch(model, dataloader, optimizer, lr_scheduler, device, scaler, is_training):
    """Run one pass over ``dataloader`` and return the mean per-batch loss.

    The model is put in train mode for both training and validation passes,
    because the loss dict is obtained from ``model(images, targets)``. For
    validation (``is_training=False``) the forward pass runs under
    ``torch.no_grad()`` and no optimizer / scheduler step is taken.

    NOTE(review): since validation also runs in train mode, layers that track
    running statistics (e.g. BatchNorm) may still be updated by validation
    batches — confirm this is acceptable.

    Args:
        model: Detection model returning a dict of losses for ``(images, targets)``.
        dataloader: Iterable of ``(images, targets)`` batches; must support ``len()``.
        optimizer: Optimizer to step; only used when ``is_training`` is True.
        lr_scheduler: Per-batch LR scheduler; only used when ``is_training`` is True.
        device: ``torch.device`` the batch is moved to and autocast runs on.
        scaler: Optional gradient scaler for mixed precision; when falsy a
            plain ``backward()`` / ``step()`` is used.
        is_training: Whether to back-propagate and update the parameters.

    Returns:
        float: Sum of batch losses divided by the number of batches that were
        actually processed, or ``0.0`` if every batch was empty.
    """
    model.train()
    epoch_loss = 0.0
    batch_counter = 0  # number of batches that contributed a loss
    progress_bar = tqdm(total=len(dataloader), desc="Train" if is_training else "Valid")
    for images, targets in dataloader:
        if len(targets) == 0:
            # Every sample of this batch was filtered out; still advance the
            # bar so it reaches 100% at the end of the epoch.
            progress_bar.update()
            continue

        images = [image.to(device) for image in images]
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
        batch_counter += 1

        with autocast(device.type):
            if is_training:
                losses = model(images, targets)
            else:
                with torch.no_grad():
                    losses = model(images, targets)

            loss = sum(losses.values())

        if is_training:
            if scaler:
                scaler.scale(loss).backward()
                scaler.step(optimizer)
                old_scaler = scaler.get_scale()
                scaler.update()
                new_scaler = scaler.get_scale()
                # A reduced scale means inf/NaN gradients were found and the
                # optimizer step was skipped, so the schedule must not advance.
                if new_scaler >= old_scaler:
                    lr_scheduler.step()
            else:
                loss.backward()
                optimizer.step()
                lr_scheduler.step()

            optimizer.zero_grad()

        loss_item = loss.item()
        epoch_loss += loss_item

        # batch_counter already includes the current batch, so it is the
        # correct divisor for the running average.
        progress_bar_dict = dict(loss=loss_item, avg_loss=epoch_loss / batch_counter)
        if is_training:
            progress_bar_dict.update(lr=lr_scheduler.get_last_lr()[0])
        progress_bar.set_postfix(progress_bar_dict)
        progress_bar.update()

    progress_bar.close()

    return epoch_loss / batch_counter if batch_counter else 0.0

In [22]:
# Quantity the sweep optimises; it must match a key passed to wandb.log().
metric = {'name': 'val_loss', 'goal': 'minimize'}

# Search space: each hyperparameter currently has a single candidate value.
parameters_dict = {
    'epochs': {'values': [30]},
    'lr': {'values': [5e-5]},
    'bs': {'values': [4]},
}

# Complete sweep definition handed to wandb.sweep().
sweep_config = {
    'method': 'random',
    'metric': metric,
    'parameters': parameters_dict,
}

In [23]:
# Register the sweep with W&B under the "box_segmentation" project; the
# returned id is what the agent below uses to fetch trial configurations.
sweep_id = wandb.sweep(sweep_config, project="box_segmentation")

Create sweep with ID: 0pywdcyt
Sweep URL: https://wandb.ai/abdelrahman-farhan/box_segmentation/sweeps/0pywdcyt


In [24]:
from torch.optim import AdamW
import datetime

def train(config=None):
    """Run one W&B sweep trial: train the global ``model`` and log per-epoch losses.

    Reads ``lr``, ``epochs`` and ``bs`` from ``wandb.config``, trains with
    AdamW and a per-batch ``OneCycleLR`` schedule, logs ``train_loss``,
    ``val_loss`` and ``lr`` after every epoch, and saves the model's
    ``state_dict`` under ``<dataset_folder>/model`` every 5th epoch.

    NOTE(review): ``model`` is a module-level object and is not re-initialised
    here, so running several trials in one kernel would continue from the
    weights left by the previous trial — confirm before raising ``count``.

    Args:
        config: Optional configuration forwarded to ``wandb.init``.
    """
    with wandb.init(config=config):
        config = wandb.config
        lr = config.lr
        epochs = config.epochs
        bs = config.bs

        model.to(device)
        params = [p for p in model.parameters() if p.requires_grad]
        optimizer = torch.optim.AdamW(params, lr=lr)
        train_loader = DataLoader(train_dataset, batch_size=bs, shuffle=True, collate_fn=collate_fn, pin_memory=True, num_workers=2)
        # Validation only averages the loss, so a fixed sample order is enough.
        val_loader = DataLoader(val_dataset, batch_size=bs, shuffle=False, collate_fn=collate_fn, pin_memory=True, num_workers=2)

        # Quick smoke-test alternative using the small subsets:
        # train_loader = DataLoader(train_dataset_small, batch_size=bs, shuffle=True, collate_fn=collate_fn, pin_memory=True, num_workers=2)
        # val_loader = DataLoader(val_dataset_small, batch_size=bs, shuffle=True, collate_fn=collate_fn, pin_memory=True)

        # The scheduler is stepped once per batch inside run_epoch, hence the
        # total step count of epochs * batches-per-epoch.
        lr_scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer,
                                                           max_lr=lr,
                                                           total_steps=epochs * len(train_loader))
        # lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=lr_step, gamma=gamma)

        # torch.save() does not create missing directories, so make sure the
        # checkpoint folder exists before the first save.
        checkpoint_dir = os.path.join(dataset_folder, 'model')
        os.makedirs(checkpoint_dir, exist_ok=True)

        scaler = GradScaler()
        for epoch in tqdm(range(epochs), desc="Epochs"):

            train_loss = run_epoch(model, train_loader, optimizer, lr_scheduler, device, scaler, is_training=True)

            with torch.no_grad():
                valid_loss = run_epoch(model, val_loader, None, None, device, scaler, is_training=False)
            # lr_scheduler.step()
            wandb.log({"train_loss": train_loss, 'val_loss': valid_loss, 'lr': lr_scheduler.get_last_lr()[0]})
            if (epoch + 1) % 5 == 0:
                timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
                model_path = os.path.join(checkpoint_dir, f'model_{timestamp}_{wandb.run.sweep_id}_{(epoch+1)}.pth')
                torch.save(model.state_dict(), model_path)

In [25]:
# Launch a sweep agent that calls train() for a single trial (count=1).
wandb.agent(sweep_id, train, count=1)

[34m[1mwandb[0m: Agent Starting Run: v9pcb8uc with config:
[34m[1mwandb[0m: 	bs: 4
[34m[1mwandb[0m: 	epochs: 30
[34m[1mwandb[0m: 	lr: 5e-05


Epochs:   0%|          | 0/30 [00:00<?, ?it/s]

Train:   0%|          | 0/1851 [00:00<?, ?it/s]

Valid:   0%|          | 0/250 [00:00<?, ?it/s]

Train:   0%|          | 0/1851 [00:00<?, ?it/s]

Valid:   0%|          | 0/250 [00:00<?, ?it/s]

Train:   0%|          | 0/1851 [00:00<?, ?it/s]

Valid:   0%|          | 0/250 [00:00<?, ?it/s]

Train:   0%|          | 0/1851 [00:00<?, ?it/s]

Valid:   0%|          | 0/250 [00:00<?, ?it/s]

Train:   0%|          | 0/1851 [00:00<?, ?it/s]

Valid:   0%|          | 0/250 [00:00<?, ?it/s]

Train:   0%|          | 0/1851 [00:00<?, ?it/s]

Valid:   0%|          | 0/250 [00:00<?, ?it/s]

Train:   0%|          | 0/1851 [00:00<?, ?it/s]

Valid:   0%|          | 0/250 [00:00<?, ?it/s]

Train:   0%|          | 0/1851 [00:00<?, ?it/s]

Valid:   0%|          | 0/250 [00:00<?, ?it/s]

Train:   0%|          | 0/1851 [00:00<?, ?it/s]

Valid:   0%|          | 0/250 [00:00<?, ?it/s]

Train:   0%|          | 0/1851 [00:00<?, ?it/s]

Valid:   0%|          | 0/250 [00:00<?, ?it/s]

Train:   0%|          | 0/1851 [00:00<?, ?it/s]

[34m[1mwandb[0m: Ctrl + C detected. Stopping sweep.


In [16]:
print("test")

test
