In [1]:
#!pip install -r /kaggle/input/pylit-wandb-smp-requirements/requirements.txt -q
!pip install segmentation_models_pytorch -q
!pip install icecream -q


In [2]:
import albumentations as A
from albumentations.pytorch import ToTensorV2
from torchvision import transforms
import matplotlib.pyplot as plt
from datetime import time 
import torch
from torch import nn
import segmentation_models_pytorch as smp
from tqdm import tqdm
from torch.optim.lr_scheduler import StepLR
from torch.utils.data import Dataset, DataLoader
import random
import metrics
import numpy as np
import cv2
import glob
import matplotlib.pyplot as plt
from icecream import ic

from idd_lite_helpers.idd_lite_helpers import IDD_Main_Dataset
from idd_lite_helpers.idd_lite_helpers import IDDRoadSegmentationDatamodule as dmidd

import wandb
import pytorch_lightning as pl
from pytorch_lightning.loggers import WandbLogger
import typing 
import os
import math
from datetime import datetime

# Seed random generator for repeatibility
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)


In [3]:
torch.set_float32_matmul_precision('medium' )

In [4]:
wandb.login()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [5]:
def decode_segmap(image, threshold=0.5):#changing single channel to 3 channel
    
    #print(image)#RGB
    image = image>threshold
    #print(image.shape)
    Background_scene = [255,255,255]
    Road = [51, 153, 255]

    label_colours = np.array([Road,Background_scene]).astype(np.uint8)
    
    #print(label_colours.shape)
    r = np.zeros_like(image).astype(np.uint8)
    g = np.zeros_like(image).astype(np.uint8)
    b = np.zeros_like(image).astype(np.uint8)
    
    for l in range(2):
        r[image == l] = label_colours[l, 0]
        g[image == l] = label_colours[l, 1]
        b[image == l] = label_colours[l, 2]

    rgb = np.zeros((image.shape[0], image.shape[1], 3)).astype(np.uint8)
    rgb[:, :, 0] = r
    rgb[:, :, 1] = g
    rgb[:, :, 2] = b
    return rgb


In [6]:
class BinarySegmentationForIdd(pl.LightningModule):
    def __init__(self,
                 model_name :str = 'unet',
                 encoder_name : str = 'efficientnet-b2',
                 encoder_weights :str = 'imagenet',
                 lr_e : float = 1e-1,
                 lr_d : float = 1e-3,
                    ):
        super().__init__()
        self.save_hyperparameters()
        self.tp, self.fp, self.fn, self.tn = 0,0,0,0
        self.loss_function = smp.losses.DiceLoss(smp.losses.BINARY_MODE, from_logits = True)
        self.name = model_name
        self.encoder_name = encoder_name
        self.encoder_weights = encoder_weights
        self.maxmiou = 1e-4
        self.start_time = 0
        if self.name == 'unet':
            self.model = smp.Unet(
                        encoder_name = self.encoder_name,
                        encoder_weights = self.encoder_weights,
                        in_channels = 3,
                        classes = 1
                        )
        elif self.name =='deeplabv3p':
            self.model = smp.DeepLabV3Plus(
                        encoder_name = self.encoder_name, 
                        encoder_depth = self.encoder
            )
    def forward(self,x):
        return self.model(x)

    def training_step(self, batch, batch_idx):
        imgs, masks = batch
        outputs = self(imgs)
        train_loss = self.loss_function(outputs, masks)
        self.log('train/train_loss', train_loss, on_step = True, on_epoch = True)
        return train_loss

    def on_validation_epoch_start(self): 
        self.val_step_outputs = []

    def validation_step(self,batch, batch_idx):
        imgs, masks = batch
        outputs = self(imgs)#batch,channel,height, width
        self.val_step_outputs.append(torch.sigmoid(outputs))
        val_loss = self.loss_function(outputs, masks)
        self.log('val/val_loss', val_loss, on_step=False, on_epoch=True)

        this_tp, this_fp, this_fn, this_tn = metrics.get_stats(outputs.squeeze(), masks, mode = 'binary', threshold=0.5)

        self.tp += this_tp
        self.fp += this_fp
        self.fn += this_fn
        self.tn += this_tn

        return val_loss

    def configure_optimizers(self):
        optimizer = torch.optim.Adam([
            {'params': self.model.encoder.parameters(), 'lr': self.hparams.lr_e},
            {'params': self.model.decoder.parameters(), 'lr': self.hparams.lr_d},
            ])
        scheduler = StepLR(optimizer, step_size=30, gamma=0.1, verbose=True)
        return {'optimizer':optimizer,'lr_scheduler':{'scheduler': scheduler, 'monitor': 'val_loss'}}

    def on_validation_epoch_end(self):
        val_miou = metrics.iou_score(sum(self.tp), sum(self.fp), sum(self.fn), sum(self.tn), reduction = 'micro')
        self.log('val/val_accuracy', val_miou)

        if val_miou> self.maxmiou:
            self.maxmiou = val_miou
            checkpoint = {
                'epochs': self.current_epoch,
                'state_dict': self.state_dict(),
                'miou': self.maxmiou,
                #to do add optimizer state dict if using lr scheduler 
            }
            torch.save(checkpoint, f'./{self.name}_{self.encoder_name}_accuracy{self.maxmiou:.4f}.pth')
            ckpt_artifact = wandb.Artifact(
                                f'{self.name}_artifact_ckpt', type = 'model'    
                                )
            ckpt_artifact.add_file(f'./{self.name}_{self.encoder_name}_accuracy{self.maxmiou:.4f}.pth')
            self.logger.experiment.log_artifact(ckpt_artifact)
            self.log('New best model saved with miou',self.maxmiou)

        self.tp, self.fp, self.fn, self.tn = 0,0,0,0

        flattened_prob = torch.flatten(torch.cat(self.val_step_outputs)).cpu().detach()
        try:
            self.logger.experiment.log({
            'valid/sigmoid': wandb.Histogram(flattened_prob),
            'epoch': self.current_epoch
            })
        except Exception as e:
            print(f"Error logging to WandB: {e}")
            
    def test_step(self, batch,batch_idx):
        imgs, masks = batch
        outputs = self(imgs)#batch,channel,height, width
        test_loss = self.loss_function(outputs, masks)
        self.log('test/test_loss', test_loss, on_step=False, on_epoch=True)

        this_tp, this_fp, this_fn, this_tn = metrics.get_stats(outputs.squeeze(), masks, mode = 'binary', threshold=0.5)

        self.tp += this_tp
        self.fp += this_fp
        self.fn += this_fn
        self.tn += this_tn

        return outputs 
                
    def on_test_epoch_start(self):
        self.start_time = datetime.now()
    def on_test_epoch_end(self):
        
        test_miou = metrics.iou_score(sum(self.tp), sum(self.fp), sum(self.fn), sum(self.tn), reduction = 'micro')
        self.log('test/test_accuracy', test_miou)
        final_time = datetime.now()-self.start_time 
        print('time taken for 1 epoch inference is {}'.format(final_time))
        self.start_time = 0
        self.tp, self.fp, self.fn, self.tn = 0,0,0,0
        
        #ic(self.trainer.max_epochs-1)
        #ic(self.current_epoch)
        if self.current_epoch == (self.trainer.max_epochs):
            dummy_input = torch.zeros((1,3,448,448), device=self.device)
            model_filename = f"model_{self.name}_{self.current_epoch}.onnx"
            torch.onnx.export(self, dummy_input, model_filename, opset_version=11)
            onnx_artifact = wandb.Artifact(name=f"model_{self.name}_onnx_maxiou{self.max_iou:.4f})", type="model")
            onnx_artifact.add_file(model_filename)
            self.logger.experiment.log_artifact(onnx_artifact)
            ic('exported')



In [7]:
class ImagePredictionLogger(pl.Callback):
    def __init__(self, val_samples, num_samples=3):
        super().__init__()
        self.X_img_samples, self.mask_samples = val_samples
        self.X_img_samples= self.X_img_samples[:num_samples]
        self.mask_samples= self.mask_samples[:num_samples] 

    def on_validation_epoch_end(self, trainer, pl_module):#remember model is now pl_module

        
        self.X_img_samples = self.X_img_samples.to(pl_module.device)
        output_samples = pl_module(self.X_img_samples)


        #output_samples = output_samples*torch.Tensor([0.2588, 0.2734, 0.2997]) + torch.Tensor([0.3606, 0.3771, 0.3724])

        table = wandb.Table(columns = ["images", "predictions", "targets"] 
            )
        for X_img, output, mask in zip(self.X_img_samples.to("cpu"), output_samples.to("cpu"), self.mask_samples.to("cpu")):
            segmap_pred = decode_segmap(output.squeeze().numpy())
            segmap_gt = decode_segmap(mask.numpy())

            table.add_data(wandb.Image(X_img.numpy().transpose(1,2,0)*255), 
                    wandb.Image(segmap_pred), 
                    wandb.Image(segmap_gt)
                    )    

        trainer.logger.experiment.log(
            {'val_images_table': table}
        )

In [8]:
sweep_config = {
    'method': 'random'
    }

metric = {
    'name': 'New best model saved with miou',
    'goal': 'maximize'
    }

sweep_config['metric'] = metric

parameters_dict = {
    'batch_size':{
        'values':[4,8,16,32]
    },
    'lr_e': {
    'distribution': 'log_uniform_values',
    'min': 5e-3,
    'max': 5e-1
    },
    'lr_d':{
    'distribution': 'log_uniform_values',
    'min': 5e-5,
    'max': 5e-3   
    },
    'image_ip_size':{
        'values': [224,384,512]
    }
    }

parameters_dict.update({
    'epochs':{'value': 20 },
    'model_name': {'value':'unet'},
    'encoder_name' :  {'value':'mobilenet_v2'},
    'encoder_weights' :{'value':'imagenet'},
})

sweep_config['parameters'] = parameters_dict

In [9]:
sweep_id = wandb.sweep(sweep_config, project='multiprocessed dataloader,idd_lite_unet_binary_road_segmenation',
                )

Create sweep with ID: 2ajufm9l
Sweep URL: https://wandb.ai/dayaalex/multiprocessed%20dataloader%2Cidd_lite_unet_binary_road_segmenation/sweeps/2ajufm9l


In [10]:
def train_using_wandb():
    run = wandb.init(project = 'multiprocessed dataloader,idd_lite_unet_binary_road_segmenation',
                config = wandb.config
                )
    config = run.config
    run_name = f' 1 gpu lr {config.lr_d:.4f}, epochs {config.epochs}, batch_size: {config.batch_size}'
    wandb.run.name = run_name

    datamod = dmidd(batch_size=config.batch_size, size = config.image_ip_size)
    datamod.setup()

    model = BinarySegmentationForIdd(model_name= config.model_name,
                                     encoder_name = config.encoder_name,
                                     encoder_weights = config.encoder_weights,
                                     lr_e = config.lr_e,
                                     lr_d = config.lr_d)       

    logger = WandbLogger()
    wandb.watch(model, model.loss_function, log= 'all', log_freq = 160 )
    val_samples = next(iter(datamod.val_dataloader()))


    trainer = pl.Trainer(
        accelerator="gpu", devices=1,
        logger = logger,
        log_every_n_steps = 1,
        max_epochs = config.epochs,
        callbacks = [ImagePredictionLogger(val_samples)]#, gpu_stats]
    )
    
    trainer.fit(model,datamod)
    trainer.test(datamodule = datamod, ckpt_path='best')


In [None]:
wandb.agent(sweep_id, train_using_wandb, count=15)

[34m[1mwandb[0m: Agent Starting Run: ov4f4yzm with config:
[34m[1mwandb[0m: 	batch_size: 8
[34m[1mwandb[0m: 	encoder_name: mobilenet_v2
[34m[1mwandb[0m: 	encoder_weights: imagenet
[34m[1mwandb[0m: 	epochs: 20
[34m[1mwandb[0m: 	image_ip_size: 224
[34m[1mwandb[0m: 	lr_d: 0.0006664921399265583
[34m[1mwandb[0m: 	lr_e: 0.17509910307480864
[34m[1mwandb[0m: 	model_name: unet
[34m[1mwandb[0m: Currently logged in as: [33mdayaalex[0m. Use [1m`wandb login --relogin`[0m to force relogin


Downloading: "https://download.pytorch.org/models/mobilenet_v2-b0353104.pth" to /root/.cache/torch/hub/checkpoints/mobilenet_v2-b0353104.pth
100%|██████████| 13.6M/13.6M [00:00<00:00, 195MB/s]
/opt/conda/lib/python3.10/site-packages/pytorch_lightning/loggers/wandb.py:391: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.


Adjusting learning rate of group 0 to 1.7510e-01.
Adjusting learning rate of group 1 to 6.6649e-04.


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Adjusting learning rate of group 0 to 1.7510e-01.
Adjusting learning rate of group 1 to 6.6649e-04.


Validation: |          | 0/? [00:00<?, ?it/s]

Adjusting learning rate of group 0 to 1.7510e-01.
Adjusting learning rate of group 1 to 6.6649e-04.


Validation: |          | 0/? [00:00<?, ?it/s]

Adjusting learning rate of group 0 to 1.7510e-01.
Adjusting learning rate of group 1 to 6.6649e-04.


Validation: |          | 0/? [00:00<?, ?it/s]

Adjusting learning rate of group 0 to 1.7510e-01.
Adjusting learning rate of group 1 to 6.6649e-04.


Validation: |          | 0/? [00:00<?, ?it/s]

Adjusting learning rate of group 0 to 1.7510e-01.
Adjusting learning rate of group 1 to 6.6649e-04.


Validation: |          | 0/? [00:00<?, ?it/s]

Adjusting learning rate of group 0 to 1.7510e-01.
Adjusting learning rate of group 1 to 6.6649e-04.


Validation: |          | 0/? [00:00<?, ?it/s]

Adjusting learning rate of group 0 to 1.7510e-01.
Adjusting learning rate of group 1 to 6.6649e-04.


Validation: |          | 0/? [00:00<?, ?it/s]

Adjusting learning rate of group 0 to 1.7510e-01.
Adjusting learning rate of group 1 to 6.6649e-04.


Validation: |          | 0/? [00:00<?, ?it/s]

Adjusting learning rate of group 0 to 1.7510e-01.
Adjusting learning rate of group 1 to 6.6649e-04.


Validation: |          | 0/? [00:00<?, ?it/s]

Adjusting learning rate of group 0 to 1.7510e-01.
Adjusting learning rate of group 1 to 6.6649e-04.


Validation: |          | 0/? [00:00<?, ?it/s]

Adjusting learning rate of group 0 to 1.7510e-01.
Adjusting learning rate of group 1 to 6.6649e-04.


Validation: |          | 0/? [00:00<?, ?it/s]

Adjusting learning rate of group 0 to 1.7510e-01.
Adjusting learning rate of group 1 to 6.6649e-04.


Validation: |          | 0/? [00:00<?, ?it/s]

Adjusting learning rate of group 0 to 1.7510e-01.
Adjusting learning rate of group 1 to 6.6649e-04.


Validation: |          | 0/? [00:00<?, ?it/s]

Adjusting learning rate of group 0 to 1.7510e-01.
Adjusting learning rate of group 1 to 6.6649e-04.


Validation: |          | 0/? [00:00<?, ?it/s]

Adjusting learning rate of group 0 to 1.7510e-01.
Adjusting learning rate of group 1 to 6.6649e-04.


Validation: |          | 0/? [00:00<?, ?it/s]

Adjusting learning rate of group 0 to 1.7510e-01.
Adjusting learning rate of group 1 to 6.6649e-04.


Validation: |          | 0/? [00:00<?, ?it/s]

Adjusting learning rate of group 0 to 1.7510e-01.
Adjusting learning rate of group 1 to 6.6649e-04.


Validation: |          | 0/? [00:00<?, ?it/s]

Adjusting learning rate of group 0 to 1.7510e-01.
Adjusting learning rate of group 1 to 6.6649e-04.


Validation: |          | 0/? [00:00<?, ?it/s]

Adjusting learning rate of group 0 to 1.7510e-01.
Adjusting learning rate of group 1 to 6.6649e-04.


Validation: |          | 0/? [00:00<?, ?it/s]

Adjusting learning rate of group 0 to 1.7510e-01.
Adjusting learning rate of group 1 to 6.6649e-04.


Validation: |          | 0/? [00:00<?, ?it/s]



Testing: |          | 0/? [00:00<?, ?it/s]

time taken for 1 epoch inference is 0:00:00.379008


  if h % output_stride != 0 or w % output_stride != 0:


VBox(children=(Label(value='131.600 MB of 131.600 MB uploaded (3.743 MB deduped)\r'), FloatProgress(value=1.0,…

0,1
New best model saved with miou,▁▄▇█
epoch,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇████
train/train_loss_epoch,█▆▄▃▃▂▂▂▁▁▁▁▁▁▂▁▁▁▁▁
train/train_loss_step,█▆▆▇▃▃▄▄▃▃▃▃▁▃▁▂▂▄▂▁▁▁▁▄▂▂▁▂▃▃▁▂▂▄▃▁▂▂▂▂
trainer/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
val/val_accuracy,▇▇▇▇████▅████▇▁█▇███
val/val_loss,▃▂▂▂▁▁▁▁▃▁▁▁▁▁█▁▂▁▁▁

0,1
New best model saved with miou,0.88401
epoch,19.0
train/train_loss_epoch,0.07981
train/train_loss_step,0.09317
trainer/global_step,3499.0
val/val_accuracy,0.85191
val/val_loss,0.08689


Run ov4f4yzm errored:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/wandb/agents/pyagent.py", line 308, in _run_job
    self._function()
  File "/tmp/ipykernel_35/3789722556.py", line 32, in train_using_wandb
    trainer.test(datamodule = datamod, ckpt_path='best')
  File "/opt/conda/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 754, in test
    return call._call_and_handle_interrupt(
  File "/opt/conda/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 44, in _call_and_handle_interrupt
    return trainer_fn(*args, **kwargs)
  File "/opt/conda/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 794, in _test_impl
    results = self._run(model, ckpt_path=ckpt_path)
  File "/opt/conda/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 987, in _run
    results = self._run_stage()
  File "/opt/conda/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py",

/opt/conda/lib/python3.10/site-packages/pytorch_lightning/loggers/wandb.py:391: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.


Adjusting learning rate of group 0 to 1.0137e-02.
Adjusting learning rate of group 1 to 1.8916e-03.


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Adjusting learning rate of group 0 to 1.0137e-02.
Adjusting learning rate of group 1 to 1.8916e-03.


Validation: |          | 0/? [00:00<?, ?it/s]

Adjusting learning rate of group 0 to 1.0137e-02.
Adjusting learning rate of group 1 to 1.8916e-03.


Validation: |          | 0/? [00:00<?, ?it/s]

Adjusting learning rate of group 0 to 1.0137e-02.
Adjusting learning rate of group 1 to 1.8916e-03.


Validation: |          | 0/? [00:00<?, ?it/s]

Adjusting learning rate of group 0 to 1.0137e-02.
Adjusting learning rate of group 1 to 1.8916e-03.


Validation: |          | 0/? [00:00<?, ?it/s]

Adjusting learning rate of group 0 to 1.0137e-02.
Adjusting learning rate of group 1 to 1.8916e-03.


Validation: |          | 0/? [00:00<?, ?it/s]

Adjusting learning rate of group 0 to 1.0137e-02.
Adjusting learning rate of group 1 to 1.8916e-03.


Validation: |          | 0/? [00:00<?, ?it/s]

Adjusting learning rate of group 0 to 1.0137e-02.
Adjusting learning rate of group 1 to 1.8916e-03.


Validation: |          | 0/? [00:00<?, ?it/s]

Adjusting learning rate of group 0 to 1.0137e-02.
Adjusting learning rate of group 1 to 1.8916e-03.


Validation: |          | 0/? [00:00<?, ?it/s]

Adjusting learning rate of group 0 to 1.0137e-02.
Adjusting learning rate of group 1 to 1.8916e-03.


Validation: |          | 0/? [00:00<?, ?it/s]

Adjusting learning rate of group 0 to 1.0137e-02.
Adjusting learning rate of group 1 to 1.8916e-03.


Validation: |          | 0/? [00:00<?, ?it/s]

Adjusting learning rate of group 0 to 1.0137e-02.
Adjusting learning rate of group 1 to 1.8916e-03.


Validation: |          | 0/? [00:00<?, ?it/s]

Adjusting learning rate of group 0 to 1.0137e-02.
Adjusting learning rate of group 1 to 1.8916e-03.


Validation: |          | 0/? [00:00<?, ?it/s]

Adjusting learning rate of group 0 to 1.0137e-02.
Adjusting learning rate of group 1 to 1.8916e-03.


Validation: |          | 0/? [00:00<?, ?it/s]

Adjusting learning rate of group 0 to 1.0137e-02.
Adjusting learning rate of group 1 to 1.8916e-03.


Validation: |          | 0/? [00:00<?, ?it/s]

Adjusting learning rate of group 0 to 1.0137e-02.
Adjusting learning rate of group 1 to 1.8916e-03.


Validation: |          | 0/? [00:00<?, ?it/s]

Adjusting learning rate of group 0 to 1.0137e-02.
Adjusting learning rate of group 1 to 1.8916e-03.


Validation: |          | 0/? [00:00<?, ?it/s]

Adjusting learning rate of group 0 to 1.0137e-02.
Adjusting learning rate of group 1 to 1.8916e-03.


Validation: |          | 0/? [00:00<?, ?it/s]

Adjusting learning rate of group 0 to 1.0137e-02.
Adjusting learning rate of group 1 to 1.8916e-03.


Validation: |          | 0/? [00:00<?, ?it/s]

Adjusting learning rate of group 0 to 1.0137e-02.
Adjusting learning rate of group 1 to 1.8916e-03.


Validation: |          | 0/? [00:00<?, ?it/s]

Adjusting learning rate of group 0 to 1.0137e-02.
Adjusting learning rate of group 1 to 1.8916e-03.


Validation: |          | 0/? [00:00<?, ?it/s]



Testing: |          | 0/? [00:00<?, ?it/s]

time taken for 1 epoch inference is 0:00:00.308434


VBox(children=(Label(value='286.908 MB of 286.908 MB uploaded (5.515 MB deduped)\r'), FloatProgress(value=1.0,…

0,1
New best model saved with miou,▁▃▄▅▅▆▇▇██
epoch,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇████
train/train_loss_epoch,█▄▃▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁
train/train_loss_step,█▆▅▄▃▃▂▃▂▂▂▂▂▂▂▂▂▂▁▂▃▂▂▁▁▂▂▃▁▂▂▂▁▃▁▁▁▂▁▂
trainer/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
val/val_accuracy,▂▄▃▄▄▆▆▁▅▆▆▇▇▇█▇▆█▇▆
val/val_loss,█▅▄▃▃▂▂▄▂▂▂▁▂▁▁▁▂▁▁▂

0,1
New best model saved with miou,0.90731
epoch,19.0
train/train_loss_epoch,0.0519
train/train_loss_step,0.06042
trainer/global_step,3499.0
val/val_accuracy,0.88831
val/val_loss,0.06104


Run gvcfx4ii errored:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/wandb/agents/pyagent.py", line 308, in _run_job
    self._function()
  File "/tmp/ipykernel_35/3789722556.py", line 32, in train_using_wandb
    trainer.test(datamodule = datamod, ckpt_path='best')
  File "/opt/conda/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 754, in test
    return call._call_and_handle_interrupt(
  File "/opt/conda/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 44, in _call_and_handle_interrupt
    return trainer_fn(*args, **kwargs)
  File "/opt/conda/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 794, in _test_impl
    results = self._run(model, ckpt_path=ckpt_path)
  File "/opt/conda/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 987, in _run
    results = self._run_stage()
  File "/opt/conda/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py",



Adjusting learning rate of group 0 to 8.2960e-02.
Adjusting learning rate of group 1 to 2.7387e-04.


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Adjusting learning rate of group 0 to 8.2960e-02.
Adjusting learning rate of group 1 to 2.7387e-04.


Validation: |          | 0/? [00:00<?, ?it/s]

Adjusting learning rate of group 0 to 8.2960e-02.
Adjusting learning rate of group 1 to 2.7387e-04.


Validation: |          | 0/? [00:00<?, ?it/s]

Adjusting learning rate of group 0 to 8.2960e-02.
Adjusting learning rate of group 1 to 2.7387e-04.


Validation: |          | 0/? [00:00<?, ?it/s]

Adjusting learning rate of group 0 to 8.2960e-02.
Adjusting learning rate of group 1 to 2.7387e-04.


Validation: |          | 0/? [00:00<?, ?it/s]

Adjusting learning rate of group 0 to 8.2960e-02.
Adjusting learning rate of group 1 to 2.7387e-04.


Validation: |          | 0/? [00:00<?, ?it/s]

Adjusting learning rate of group 0 to 8.2960e-02.
Adjusting learning rate of group 1 to 2.7387e-04.


Validation: |          | 0/? [00:00<?, ?it/s]

Adjusting learning rate of group 0 to 8.2960e-02.
Adjusting learning rate of group 1 to 2.7387e-04.


Validation: |          | 0/? [00:00<?, ?it/s]

Adjusting learning rate of group 0 to 8.2960e-02.
Adjusting learning rate of group 1 to 2.7387e-04.


Validation: |          | 0/? [00:00<?, ?it/s]

Adjusting learning rate of group 0 to 8.2960e-02.
Adjusting learning rate of group 1 to 2.7387e-04.


Validation: |          | 0/? [00:00<?, ?it/s]

Adjusting learning rate of group 0 to 8.2960e-02.
Adjusting learning rate of group 1 to 2.7387e-04.


Validation: |          | 0/? [00:00<?, ?it/s]

Adjusting learning rate of group 0 to 8.2960e-02.
Adjusting learning rate of group 1 to 2.7387e-04.


Validation: |          | 0/? [00:00<?, ?it/s]

Adjusting learning rate of group 0 to 8.2960e-02.
Adjusting learning rate of group 1 to 2.7387e-04.


Validation: |          | 0/? [00:00<?, ?it/s]

Adjusting learning rate of group 0 to 8.2960e-02.
Adjusting learning rate of group 1 to 2.7387e-04.


Validation: |          | 0/? [00:00<?, ?it/s]

Adjusting learning rate of group 0 to 8.2960e-02.
Adjusting learning rate of group 1 to 2.7387e-04.


Validation: |          | 0/? [00:00<?, ?it/s]

Adjusting learning rate of group 0 to 8.2960e-02.
Adjusting learning rate of group 1 to 2.7387e-04.


Validation: |          | 0/? [00:00<?, ?it/s]

Adjusting learning rate of group 0 to 8.2960e-02.
Adjusting learning rate of group 1 to 2.7387e-04.


Validation: |          | 0/? [00:00<?, ?it/s]

Adjusting learning rate of group 0 to 8.2960e-02.
Adjusting learning rate of group 1 to 2.7387e-04.


Validation: |          | 0/? [00:00<?, ?it/s]

Adjusting learning rate of group 0 to 8.2960e-02.
Adjusting learning rate of group 1 to 2.7387e-04.


Validation: |          | 0/? [00:00<?, ?it/s]

Adjusting learning rate of group 0 to 8.2960e-02.
Adjusting learning rate of group 1 to 2.7387e-04.


Validation: |          | 0/? [00:00<?, ?it/s]

Adjusting learning rate of group 0 to 8.2960e-02.
Adjusting learning rate of group 1 to 2.7387e-04.


Validation: |          | 0/? [00:00<?, ?it/s]



Testing: |          | 0/? [00:00<?, ?it/s]

time taken for 1 epoch inference is 0:00:00.487405


VBox(children=(Label(value='186.845 MB of 186.845 MB uploaded (7.598 MB deduped)\r'), FloatProgress(value=1.0,…

0,1
New best model saved with miou,▁▆▇▇██
epoch,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇████
train/train_loss_epoch,█▅▄▄▄▄▃▃▃▃▂▂▂▂▂▂▁▁▁▁
train/train_loss_step,█▅▄▄▄▄▃▃▃▃▃▃▃▃▃▃▃▃▂▃▂▂▂▂▂▂▂▂▂▂▂▁▂▂▁▂▂▁▁▂
trainer/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
val/val_accuracy,▁▆▇▃▇▇▁▄▇██▆███▇██▇█
val/val_loss,█▄▃▆▃▃▇▅▃▂▂▃▂▂▁▂▁▁▁▁

0,1
New best model saved with miou,0.88413
epoch,19.0
train/train_loss_epoch,0.20491
train/train_loss_step,0.19005
trainer/global_step,859.0
val/val_accuracy,0.87643
val/val_loss,0.21024


Run 6p2o2w37 errored:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/wandb/agents/pyagent.py", line 308, in _run_job
    self._function()
  File "/tmp/ipykernel_35/3789722556.py", line 32, in train_using_wandb
    trainer.test(datamodule = datamod, ckpt_path='best')
  File "/opt/conda/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 754, in test
    return call._call_and_handle_interrupt(
  File "/opt/conda/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 44, in _call_and_handle_interrupt
    return trainer_fn(*args, **kwargs)
  File "/opt/conda/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 794, in _test_impl
    results = self._run(model, ckpt_path=ckpt_path)
  File "/opt/conda/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 987, in _run
    results = self._run_stage()
  File "/opt/conda/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py",



Adjusting learning rate of group 0 to 1.1903e-01.
Adjusting learning rate of group 1 to 2.7560e-04.


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Adjusting learning rate of group 0 to 1.1903e-01.
Adjusting learning rate of group 1 to 2.7560e-04.


Validation: |          | 0/? [00:00<?, ?it/s]

Adjusting learning rate of group 0 to 1.1903e-01.
Adjusting learning rate of group 1 to 2.7560e-04.


Validation: |          | 0/? [00:00<?, ?it/s]

Adjusting learning rate of group 0 to 1.1903e-01.
Adjusting learning rate of group 1 to 2.7560e-04.


Validation: |          | 0/? [00:00<?, ?it/s]

Adjusting learning rate of group 0 to 1.1903e-01.
Adjusting learning rate of group 1 to 2.7560e-04.


Validation: |          | 0/? [00:00<?, ?it/s]

Adjusting learning rate of group 0 to 1.1903e-01.
Adjusting learning rate of group 1 to 2.7560e-04.


Validation: |          | 0/? [00:00<?, ?it/s]

Adjusting learning rate of group 0 to 1.1903e-01.
Adjusting learning rate of group 1 to 2.7560e-04.


Validation: |          | 0/? [00:00<?, ?it/s]

Adjusting learning rate of group 0 to 1.1903e-01.
Adjusting learning rate of group 1 to 2.7560e-04.


Validation: |          | 0/? [00:00<?, ?it/s]

Adjusting learning rate of group 0 to 1.1903e-01.
Adjusting learning rate of group 1 to 2.7560e-04.


Validation: |          | 0/? [00:00<?, ?it/s]

Adjusting learning rate of group 0 to 1.1903e-01.
Adjusting learning rate of group 1 to 2.7560e-04.


Validation: |          | 0/? [00:00<?, ?it/s]

Adjusting learning rate of group 0 to 1.1903e-01.
Adjusting learning rate of group 1 to 2.7560e-04.


Validation: |          | 0/? [00:00<?, ?it/s]

Adjusting learning rate of group 0 to 1.1903e-01.
Adjusting learning rate of group 1 to 2.7560e-04.


Validation: |          | 0/? [00:00<?, ?it/s]

Adjusting learning rate of group 0 to 1.1903e-01.
Adjusting learning rate of group 1 to 2.7560e-04.


Validation: |          | 0/? [00:00<?, ?it/s]

Adjusting learning rate of group 0 to 1.1903e-01.
Adjusting learning rate of group 1 to 2.7560e-04.


Validation: |          | 0/? [00:00<?, ?it/s]

Adjusting learning rate of group 0 to 1.1903e-01.
Adjusting learning rate of group 1 to 2.7560e-04.


Validation: |          | 0/? [00:00<?, ?it/s]

Adjusting learning rate of group 0 to 1.1903e-01.
Adjusting learning rate of group 1 to 2.7560e-04.


Validation: |          | 0/? [00:00<?, ?it/s]

Adjusting learning rate of group 0 to 1.1903e-01.
Adjusting learning rate of group 1 to 2.7560e-04.


Validation: |          | 0/? [00:00<?, ?it/s]

Adjusting learning rate of group 0 to 1.1903e-01.
Adjusting learning rate of group 1 to 2.7560e-04.


Validation: |          | 0/? [00:00<?, ?it/s]

Adjusting learning rate of group 0 to 1.1903e-01.
Adjusting learning rate of group 1 to 2.7560e-04.


Validation: |          | 0/? [00:00<?, ?it/s]

Adjusting learning rate of group 0 to 1.1903e-01.
Adjusting learning rate of group 1 to 2.7560e-04.


Validation: |          | 0/? [00:00<?, ?it/s]

Adjusting learning rate of group 0 to 1.1903e-01.
Adjusting learning rate of group 1 to 2.7560e-04.


Validation: |          | 0/? [00:00<?, ?it/s]



Testing: |          | 0/? [00:00<?, ?it/s]

time taken for 1 epoch inference is 0:00:00.202735


VBox(children=(Label(value='234.547 MB of 234.547 MB uploaded (4.382 MB deduped)\r'), FloatProgress(value=1.0,…

0,1
New best model saved with miou,▁▄▅▇▇▇█
epoch,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇████
train/train_loss_epoch,█▆▅▅▄▄▃▃▃▃▂▂▂▂▂▁▁▁▁▁
train/train_loss_step,█▆▆▆▅▄▅▅▆▄▅▅▃▅▄▃▂▄▃▃▂▃▂▃▃▂▂▂▂▂▂▂▂▂▃▁▁▁▁▁
trainer/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
val/val_accuracy,▃▁▃▅▅▆▆▄▇▆▇█▆██▇█▇▇▇
val/val_loss,██▇▅▅▅▅▄▃▃▃▂▃▂▂▂▁▁▁▁

0,1
New best model saved with miou,0.91113
epoch,19.0
train/train_loss_epoch,0.09561
train/train_loss_step,0.10187
trainer/global_step,1739.0
val/val_accuracy,0.90225
val/val_loss,0.09408


Run sgpkdkz7 errored:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/wandb/agents/pyagent.py", line 308, in _run_job
    self._function()
  File "/tmp/ipykernel_35/3789722556.py", line 32, in train_using_wandb
    trainer.test(datamodule = datamod, ckpt_path='best')
  File "/opt/conda/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 754, in test
    return call._call_and_handle_interrupt(
  File "/opt/conda/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 44, in _call_and_handle_interrupt
    return trainer_fn(*args, **kwargs)
  File "/opt/conda/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 794, in _test_impl
    results = self._run(model, ckpt_path=ckpt_path)
  File "/opt/conda/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 987, in _run
    results = self._run_stage()
  File "/opt/conda/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py",



Adjusting learning rate of group 0 to 1.0560e-02.
Adjusting learning rate of group 1 to 5.3170e-05.


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Adjusting learning rate of group 0 to 1.0560e-02.
Adjusting learning rate of group 1 to 5.3170e-05.


Validation: |          | 0/? [00:00<?, ?it/s]

Adjusting learning rate of group 0 to 1.0560e-02.
Adjusting learning rate of group 1 to 5.3170e-05.


Validation: |          | 0/? [00:00<?, ?it/s]

Adjusting learning rate of group 0 to 1.0560e-02.
Adjusting learning rate of group 1 to 5.3170e-05.


Validation: |          | 0/? [00:00<?, ?it/s]

Adjusting learning rate of group 0 to 1.0560e-02.
Adjusting learning rate of group 1 to 5.3170e-05.


Validation: |          | 0/? [00:00<?, ?it/s]

Adjusting learning rate of group 0 to 1.0560e-02.
Adjusting learning rate of group 1 to 5.3170e-05.


Validation: |          | 0/? [00:00<?, ?it/s]

Adjusting learning rate of group 0 to 1.0560e-02.
Adjusting learning rate of group 1 to 5.3170e-05.


Validation: |          | 0/? [00:00<?, ?it/s]

Adjusting learning rate of group 0 to 1.0560e-02.
Adjusting learning rate of group 1 to 5.3170e-05.


Validation: |          | 0/? [00:00<?, ?it/s]

Adjusting learning rate of group 0 to 1.0560e-02.
Adjusting learning rate of group 1 to 5.3170e-05.


Validation: |          | 0/? [00:00<?, ?it/s]

Adjusting learning rate of group 0 to 1.0560e-02.
Adjusting learning rate of group 1 to 5.3170e-05.


Validation: |          | 0/? [00:00<?, ?it/s]

Adjusting learning rate of group 0 to 1.0560e-02.
Adjusting learning rate of group 1 to 5.3170e-05.


Validation: |          | 0/? [00:00<?, ?it/s]

Adjusting learning rate of group 0 to 1.0560e-02.
Adjusting learning rate of group 1 to 5.3170e-05.


Validation: |          | 0/? [00:00<?, ?it/s]

Adjusting learning rate of group 0 to 1.0560e-02.
Adjusting learning rate of group 1 to 5.3170e-05.


Validation: |          | 0/? [00:00<?, ?it/s]

Adjusting learning rate of group 0 to 1.0560e-02.
Adjusting learning rate of group 1 to 5.3170e-05.


Validation: |          | 0/? [00:00<?, ?it/s]

Adjusting learning rate of group 0 to 1.0560e-02.
Adjusting learning rate of group 1 to 5.3170e-05.


Validation: |          | 0/? [00:00<?, ?it/s]

Adjusting learning rate of group 0 to 1.0560e-02.
Adjusting learning rate of group 1 to 5.3170e-05.


Validation: |          | 0/? [00:00<?, ?it/s]

Adjusting learning rate of group 0 to 1.0560e-02.
Adjusting learning rate of group 1 to 5.3170e-05.


Validation: |          | 0/? [00:00<?, ?it/s]

Adjusting learning rate of group 0 to 1.0560e-02.
Adjusting learning rate of group 1 to 5.3170e-05.


Validation: |          | 0/? [00:00<?, ?it/s]

Adjusting learning rate of group 0 to 1.0560e-02.
Adjusting learning rate of group 1 to 5.3170e-05.


Validation: |          | 0/? [00:00<?, ?it/s]

In [None]:

# lr_list = []
# for lr in checkpoint['optimizer.state_dict']:
#     lr_list.append(lr)
# print(lr)

In [None]:
run = wandb.init(project = 'IDD_lite_hyperparametersweep_for_unet_binary_road_segmenation',
                config = {'model_name':'unet',
                          'encoder_name':'mobilenet_v2',
                          'encoder_weights':'imagenet',
                          'lr_e': 1.5720e-02,
                          'lr_d': 1.0047e-03,
                          'epochs':1,
                          'batch_size':32,
                          'image_ip_size':224
                         }
                )
config = run.config
run_name = f'inference 1 gpu lr {config.lr_d:.4f}, epochs {config.epochs}, batch_size: {config.batch_size}'
wandb.run.name = run_name

datamod = IDDRoadSegmentationDatamodule(batch_size=config.batch_size, size = config.image_ip_size)
datamod.setup()

model = BinarySegmentationForIdd(model_name= config.model_name,
                                 encoder_name = config.encoder_name,
                                 encoder_weights = config.encoder_weights,
                                 lr_e = 1.5720e-02,
                                 lr_d = config.lr_d)       

#wandb.watch(model, model.loss_function, log= 'all', log_freq = 1800 )#log every 360th batch, the grad, weights



    
    

In [None]:
checkpoint = torch.load('/kaggle/input/unet-mobilenetv2/pytorch/91miou/1/unet_mobilenet_v2_accuracy0.9138.pth')
model.load_state_dict(checkpoint['state_dict'])
logger = WandbLogger()


In [None]:
trainer.test(datamodule = datamod, ckpt_path = '/kaggle/input/unet-mobilenetv2/pytorch/91miou/1/unet_mobilenet_v2_accuracy0.9138.pth')

In [None]:
def process_images(model, image, h, w):
  
    image = cv2.resize(image,(224,224))
    
    #print(image.shape)
    image_tensor = torch.tensor(image, dtype=torch.float32)
    image_tensor = image_tensor / 255.0  # Normalize to [0, 1]
    image_tensor = image_tensor.permute(2, 0, 1).unsqueeze(0) 
    model.eval()
    with torch.inference_mode():
        pred_mask = model(image_tensor)
        
    mask = pred_mask.squeeze()>0.5
    zero_image = np.zeros_like(mask)
    mask = np.stack((mask, mask, mask), axis=-1)*255
    mask = np.asarray(mask, np.uint8)
#     print("Image shape:", image.shape)
#     print("Mask shape:", mask.shape)
    
    
    final_image = cv2.addWeighted(image, 1.0,mask,0.5,0.0)
    final_image = cv2.resize(final_image, (w, h))
    return final_image

    
    

In [None]:
path = '/kaggle/input/inf-vid-2/A_ one_ minute_tour_of_RIT_2k17_(www.KeepVid.to)_BIG.mp4'
vid_object = cv2.VideoCapture(path)
frame_width = int(vid_object.get(3))
frame_height = int(vid_object.get(4))

fourcc = cv2.VideoWriter_fourcc('m','p','4','v')
fps =vid_object.get(cv2.CAP_PROP_FPS)
print(fps)
output = cv2.VideoWriter(
          '/kaggle/working/A_one_ minute_tour_of_RIT_2k17_(www.KeepVid.to)_BIG.mp4',
          fourcc,
          fps,
          (frame_width,frame_height)
        )

while(vid_object.isOpened()):
    
    ret, frame = vid_object.read()
    if ret == True:
#         print('working')
        tqdm(output.write(process_images(model,frame,frame_height, frame_width)))
    else:
        break
vid_object.release()
output.release()


In [None]:
!pip install torchsummary
