Video Link: https://www.youtube.com/watch?v=IHq1t7NxS8k&t=1450s

## **Unzipping data**

In [1]:
import os
from PIL import Image
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms.functional as TF
from torch.utils.data import Dataset
import numpy as np
from torch.utils.data import DataLoader
import albumentations as A
from albumentations.pytorch import ToTensorV2
from tqdm import tqdm # progress bar
import torch.optim as optim

In [2]:
os.makedirs('/kaggle/working/TrainPhotos')
os.makedirs('/kaggle/working/ValidPhotos')

### Training dataset

In [3]:
#unzipping photos
path_to_zip_file = "/kaggle/input/carvana-image-masking-challenge/train.zip"
directory_to_extract_to = "/kaggle/working/TrainPhotos"

import zipfile
with zipfile.ZipFile(path_to_zip_file, 'r') as zip_ref:
  zip_ref.extractall(directory_to_extract_to)

In [4]:
# unzipping masks
path_to_zip_file = "/kaggle/input/carvana-image-masking-challenge/train_masks.zip"
directory_to_extract_to = "/kaggle/working/TrainPhotos"

import zipfile
with zipfile.ZipFile(path_to_zip_file, 'r') as zip_ref:
  zip_ref.extractall(directory_to_extract_to)

For validation dataset

In [5]:
os.makedirs('/kaggle/working/ValidPhotos/valid')
os.makedirs('/kaggle/working/ValidPhotos/valid_masks')

In [6]:
train_photos_path = "/kaggle/working/TrainPhotos/train"
train_masks_path = "/kaggle/working/TrainPhotos/train_masks"


valid_photos_path = "/kaggle/working/ValidPhotos/valid"
valid_masks_path = "/kaggle/working/ValidPhotos/valid_masks"



photos = os.listdir(train_photos_path)


valid_count = int(len(photos) * 0.2)
valid_photos = photos[:valid_count]


for photo in valid_photos:
    
    im = Image.open(os.path.join(train_photos_path, photo))
    im = im.convert("RGB")
    jpg_path = os.path.join(valid_photos_path, os.path.splitext(photo)[0] + ".jpg")
    im.save(jpg_path)
    
    
    mask_name = os.path.splitext(photo)[0] + '_mask.gif'
    mask_path = os.path.join(train_masks_path, mask_name)
    
    
    mask_im = Image.open(mask_path)
    mask_im = mask_im.convert("RGB")
    jpg_mask_path = os.path.join(valid_masks_path, os.path.splitext(mask_name)[0] + ".gif")
    mask_im.save(jpg_mask_path)

## Dataset

In [7]:
# this class acts as out dataset creator in pytorch, 
# we basically have to do it in pytorch its like mandatory thing.
# To create our dataset class and to get this in our dataloader

class CarvanaDataset(Dataset):
    def __init__(self,image_dir,mask_dir,transform = None):
        self.image_dir = image_dir
        self.mask_dir = mask_dir
        self.transform = transform
        self.images = os.listdir(image_dir)
    def __len__(self):
        return len(self.images)
    
    def __getitem__(self,index):
        img_path = os.path.join(self.image_dir,self.images[index])
        mask_path = os.path.join(self.mask_dir,self.images[index].replace(".jpg","_mask.gif"))
        
        image = np.array(Image.open(img_path).convert("RGB")) 
        # we are using np array because we are going to use albumentations library for
        # data augmentation. so PIL has to be converted to np array
        
        mask = np.array(Image.open(mask_path).convert("L"),dtype = np.float32)
        # mask is grayscale so we are converting to L
        # 0.0 or 255 black and white respectively
        
        mask[mask == 255.0] = 1.0
        # we are using sigmoid as our last activation to label it nicely we are 
        # normalizing it to 1 and 0
        
        if self.transform is not None:
            augmentations = self.transform(image=image, mask=mask)
            image = augmentations["image"]
            mask = augmentations["mask"]
        return image,mask
            
        
        
        
        
        

## Creating Model

In [8]:
# Image segmentation tutorial with U network from scratch !!!
# creating U net based on the 2015 paper with slight variations i have noted them in the code
class DoubleConv(nn.Module):
    def __init__(self,in_channels,out_channels):
        super(DoubleConv,self).__init__()
        self.conv = nn.Sequential(
            # Same convolution means when you pad, the output size is the same as the input size.
            # Basically you pad,let's say a 6 by 6 image in such a way
            # that the output should also be a 6 by 6 image.
            # bias is false cuz we are using BatchNormalization based on the 2016 paper
            nn.Conv2d(in_channels, out_channels, 3, 1, 1, bias=False),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True),
            nn.Conv2d(out_channels, out_channels, 3, 1, 1, bias=False),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True),
        )
    def forward(self,x):
        return self.conv(x)

In [9]:
class UNET(nn.Module):
    def __init__(self,in_channels = 3,out_channels = 1, features = [64,128,256,512] ):
        super(UNET,self).__init__()
        self.downs = nn.ModuleList()
        self.ups = nn.ModuleList()
        self.pool = nn.MaxPool2d(kernel_size=2,stride=2)
        # the pooling layer will floor the size of height and width when divided like
        # 161x161 when going down -> 80x80 -> becomes 160x160,
        # so yes we can't concatenate this way that's why we must add padding
        # i'll add a notes file for this one point [1]

        # Downward Part of UNET
        for feature in features:
            self.downs.append(DoubleConv(in_channels,feature))
            # we defined doubleconv layer  above as 2 conv layers
            # together in each step of u-net.
            in_channels = feature

        # Bottle Neck layer in UNET in = 512 and out = 1024
        self.bottleneck = DoubleConv(features[-1],features[-1]*2) # it is saved as a method
        self.final_conv = nn.Conv2d(features[0],out_channels,kernel_size=1)

        # Upward Part of UNET
        for feature in reversed(features):
            self.ups.append(
                nn.ConvTranspose2d(
                    feature*2,feature,kernel_size=2,stride=2,
                )
            )
            self.ups.append(DoubleConv(feature*2,feature))
    def forward(self,x):
        skip_connections = [] # we will store all the skipped connections
        # first we are going down.
        # go down and then collect skipped connections and then pool
        for down in self.downs:
            x = down(x)
            skip_connections.append(x)
            x = self.pool(x)
        # Then apply the bottleneck layer
        x = self.bottleneck(x)
        # reverse the skipped connections collected when going down
        skip_connections = skip_connections[::-1]
        # Then from the bottleneck,
        # we go upwards collect skipped connections and concatenate
        # then go up again.
        for idx in range(0,len(self.ups),2):
            x = self.ups[idx](x)
            skip_connection = skip_connections[idx//2] # // refers to integer division

            if x.shape != skip_connection.shape:
                x = TF.resize(x, size=skip_connection.shape[2:])
                # we are only taking height and width and skipping batch_size and num of channels

            concat_skip = torch.cat((skip_connection,x),dim = 1)
            x = self.ups[idx+1](concat_skip)
        # then we deliver the resulted features from the conv to the final convolution layer
        # and return it.
        return self.final_conv(x)


## Training part

In [19]:
# Hyperparameters
LEARNING_RATE = 1e-4
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
BATCH_SIZE = 32
NUM_EPOCHS = 3
NUM_WORKERS = 2
IMAGE_HEIGHT = 160 # 1280 originally
IMAGE_WIDTH = 240  # 1918 originally
PIN_MEMORY = True
LOAD_MODEL = True
TRAIN_IMG_DIR = "/kaggle/working/TrainPhotos/train"
TRAIN_MASK_DIR = "/kaggle/working/TrainPhotos/train_masks"
VAL_IMG_DIR = "/kaggle/working/ValidPhotos/valid"
VAL_MASK_DIR = "/kaggle/working/ValidPhotos/valid_masks"

In [11]:
# train function will do 1 epoch of training
# we will use tqdm here for the progress bar
def train_fn(loader,model,optimizer,loss_fn,scaler):
    loop = tqdm(loader)
    
    for batch_idx,(data,targets) in enumerate(loop):
        data = data.to(device = DEVICE)
        targets = targets.float().unsqueeze(1).to(device=DEVICE)
        
        # forward pass
        #we will use float 16 training check the video for that
        with torch.cuda.amp.autocast():
            predictions = model(data)
            loss = loss_fn(predictions,targets)
        
        # backward pass
        optimizer.zero_grad() # zero all the gradients from previous
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        
        # uodate tqdm loop
        loop.set_postfix(loss = loss.item())
        

## Utils

In [12]:
os.mkdir("resultPhotos")

In [16]:
def save_checkpoint(state, filename="my_checkpoint.pth.tar"):
    print("=> Saving checkpoint")
    torch.save(state, filename)
    
def load_checkpoint(chekcpoint, model):
    print("=> Loading checkpoint")
    model.load_state_dict(checkpoint["state_dict"])

def get_loaders(train_dir,train_maskdir,val_dir,val_maskdir,batch_size,
                train_transform,val_transform,num_workers=4,pin_memory=True,):
    train_ds = CarvanaDataset(
        image_dir = train_dir,
        mask_dir = train_maskdir,
        transform = train_transform,
    )
    
    train_loader = DataLoader(
        train_ds,
        batch_size = batch_size,
        num_workers=num_workers,
        pin_memory=pin_memory,
        shuffle=True
    )
    
    
    val_ds = CarvanaDataset(
        image_dir = val_dir,
        mask_dir = val_maskdir,
        transform = val_transform
    )
    
    val_loader = DataLoader(
        val_ds,
        batch_size = batch_size,
        num_workers=num_workers,
        pin_memory=pin_memory,
        shuffle=False,
    )
    
    return train_loader, val_loader

def check_accuracy(loader,model,device="cuda"):
    num_correct = 0
    num_pixels = 0 
    dice_score = 0
    # for segmentation we are outputing a prediction 
    # for each individual pixel for each class
    model.eval()
    
    with torch.no_grad():
        for x,y in loader:
            x = x.to(device)
            y = y.to(device).unsqueeze(1)
            preds = torch.sigmoid(model(x))
            preds = (preds>0.5).float() # since its a binary segmentation
            # for more classes we need to adapt the check accuracy method accordingly
            num_correct += (preds == y).sum()
            num_pixels += torch.numel(preds) # number of elements in preds
            dice_score +=(2*(preds*y).sum())/((preds+y).sum() + 1e-8)
    print(
        f"Got {num_correct}/{num_pixels} with acc {num_correct/num_pixels*100:.2f}"
    )
    print(f"Dice score: {dice_score/len(loader)}")
    # there are better metrics compared to accuracy
    # acc is flawed if we just output black pixels it will have output of greater than 80%
    
    
def save_predictions_as_imgs(loader, model, folder="resultPhotos/", device="cuda"):
    model.eval()
    for idx, (x, y) in enumerate(loader):
        x = x.to(device=device)
        with torch.no_grad():
            preds = torch.sigmoid(model(x))
            preds = (preds > 0.5).float()
        torchvision.utils.save_image(
            preds, f"{folder}/pred_{idx}.png"
        )
        torchvision.utils.save_image(y.unsqueeze(1), f"{folder}")    
    

Accuracy vs. Dice Score in Binary Segmentation:

Accuracy:

**What it measures:** Accuracy is a metric that calculates the overall correctness of predictions, considering both true positives and true negatives.

**Why it may not be ideal:** In binary segmentation tasks, where you're separating an image into two parts (e.g., object vs. background), accuracy can be misleading if one class is much smaller than the other. If you have a lot of background pixels and only a few object pixels, a model predicting all pixels as background can still have a high accuracy.

**Dice Score:**

**What it measures:** The Dice coefficient specifically looks at the overlap between what your model predicts as the object and what is actually the object in the real data.

**Why it's useful:** It's better suited for imbalanced situations. If your model correctly identifies the small object region, even if there are many more background pixels, the Dice score will still reflect the good performance.

**Example:**
Imagine you're looking at medical images to find tumors. If a tumor is tiny compared to the healthy tissue, accuracy might be high even if your model is missing the tumors. Dice score, on the other hand, cares about how well your model finds the actual tumor region, making it more sensitive to the performance on the small but crucial areas.

In summary, in binary segmentation tasks, where one class is much smaller than the other, the Dice score gives you a better picture of how well your model is doing on the important regions you care about, especially when accuracy might be misleading due to class imbalance.

## Main class (need to be in last)

In [20]:
def main():
    train_transform = A.Compose([
        A.Resize(height=IMAGE_HEIGHT, width=IMAGE_WIDTH),
        A.Rotate(limit=35,p=1.0),
        A.HorizontalFlip(p=0.5),
        A.VerticalFlip(p=0.1),
        A.Normalize(
        mean=[0.0,0.0,0.0],
        std=[1.0,1.0,1.0],
        max_pixel_value=255.0,
        ),
        ToTensorV2(),
    ],)
    val_transforms = A.Compose([
        A.Resize(height=IMAGE_HEIGHT, width=IMAGE_WIDTH),
        A.Normalize(
        mean=[0.0,0.0,0.0],
        std=[1.0,1.0,1.0],
        max_pixel_value=255.0,
        ),
        ToTensorV2(),
    ],)
    model = UNET(in_channels = 3,out_channels=1).to(DEVICE)
    loss_fn = nn.BCEWithLogitsLoss() 
    """
    
    * LogitsLoss is used since we did not keep sigmoid function in the output of model
    * If you have multi class segmentation we change out_channels to different number
    loss function to cross entropy loss
    * Ours is binary segmentation black or white mask rn
    
    """
    optimizer = optim.Adam(model.parameters(),lr=LEARNING_RATE)
    train_loader,val_loader = get_loaders(
        TRAIN_IMG_DIR,
        TRAIN_MASK_DIR,
        VAL_IMG_DIR,
        VAL_MASK_DIR,
        BATCH_SIZE,
        train_transform,
        val_transforms,
        NUM_WORKERS,
        PIN_MEMORY,
    )
    scaler = torch.cuda.amp.GradScaler()
    for epoch in range(NUM_EPOCHS):
        train_fn(train_loader,model,optimizer,loss_fn,scaler)
        
        #save model
        checkpoint = {
            "state_dict": model.state_dict(),
            "optimizer": optimizer.state_dict()
        }
        save_checkpoint(checkpoint)
        
        # check accuracy
        check_accuracy(val_loader, model, device=DEVICE)
        
        # print some examples
        """save_predictions_as_imgs(
            val_loader, model, folder="resultPhotos/", device=DEVICE
        )"""

In [21]:
main()

100%|██████████| 159/159 [01:30<00:00,  1.75it/s, loss=0.222]


=> Saving checkpoint
Got 38628870/39052800 with acc 98.91
Dice score: 0.9746503233909607


100%|██████████| 159/159 [01:30<00:00,  1.76it/s, loss=0.0876]


=> Saving checkpoint
Got 37789080/39052800 with acc 96.76
Dice score: 0.9279659986495972


100%|██████████| 159/159 [01:31<00:00,  1.74it/s, loss=0.046] 


=> Saving checkpoint
Got 38512778/39052800 with acc 98.62
Dice score: 0.9678897261619568
