In [1]:
import os
import numpy as np
from PIL import Image

import torch
import torch.nn as nn
import torchvision.transforms.functional as TF
import albumentations as A
from albumentations.pytorch import ToTensorV2
import torch.optim as optim
from torch.utils.data import TensorDataset, Dataset, DataLoader, random_split

import torchvision

from tqdm import tqdm



In [2]:
class DoubleConv(nn.Module):
    def __init__(self, in_channels, out_channels):
        super(DoubleConv, self).__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(in_channels, out_channels, 3, 1, 1, bias=False),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True),
            nn.Conv2d(out_channels, out_channels, 3, 1, 1, bias=False),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True),
        )
        
    def forward(self, x):
        return self.conv(x)

In [3]:
# class BilinearInterpolationConv(nn.Module):
#     def __init__(self, in_channels, out_channels, kernel_size):
#         super(BilinearInterpolationConv, self).__init__()
        
#         # Bilinear interpolation layer
#         self.bilinear = nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True)
        
#         # Convolutional layer
#         self.conv = nn.Conv2d(in_channels, out_channels, kernel_size, padding=kernel_size // 2)

#     def forward(self, x):
#         x = self.bilinear(x)
#         x = self.conv(x)
#         return x

In [4]:
class UNet(nn.Module):
    def __init__(self, in_channels=3, out_channels=1, features=[64, 128, 256, 512]):
        super(UNet, self).__init__()
        self.ups = nn.ModuleList()
        self.downs = nn.ModuleList()
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        
        # down part of the UNet
        for feature in features:
            self.downs.append(DoubleConv(in_channels, feature))
            in_channels = feature
            
        # up part of the UNet:
        for feature in reversed(features):
        # originally, in the UNet paper, they used bilinear then a conv layer but we will use ConvTranspose2d as it creates artifcats and would be a better approach
            self.ups.append(
                # we are doing features*2 as we will be appending the skip connections
                nn.ConvTranspose2d(
                    feature*2, feature, kernel_size=2, stride=2
                ),
            )
            self.ups.append(DoubleConv(feature*2, feature))
            
        self.bottleneck = DoubleConv(features[-1], features[-1] * 2)
        
        self.final_conv = nn.Conv2d(features[0], out_channels, kernel_size=1)
        
    def forward(self, x):
        skip_connections = []
        
        for down in self.downs:
            x = down(x)
            skip_connections.append(x)
            x = self.pool(x)
            
        x = self.bottleneck(x)
        
        skip_connections = skip_connections[::-1]
        
        # doing two steps for each skip connection
        for idx in range(0, len(self.ups), 2):
            x = self.ups[idx](x)
            skip_connection = skip_connections[idx//2]
            
            # if the image was not divisible by 16 (max-pool will floor), we cannot just concatenate; in the paper they had used cropping to address this issue, but we can use padding also
            if x.shape != skip_connection.shape:
                x = TF.resize(x, size=skip_connection.shape[2:], antialias=True)
            
            concat_skip = torch.cat((skip_connection, x), dim=1)
            
            x = self.ups[idx + 1](concat_skip)
            
        return self.final_conv(x)

In [5]:
# Hyperparameters etc.
LEARNING_RATE = 1e-4
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
BATCH_SIZE = 4
NUM_EPOCHS = 8
NUM_WORKERS = os.cpu_count()
# IMAGE_HEIGHT = 160  # 1280 originally
# IMAGE_WIDTH = 240  # 1918 originally
IMAGE_HEIGHT = 320  # 1280 originally
IMAGE_WIDTH = 480  # 1918 originally
PIN_MEMORY = True
TRAIN_IMG_DIR = "/kaggle/working/train"
TEST_IMG_DIR = "/kaggle/working/test"
MASK_DIR = "/kaggle/working/train_masks"

# clear the GPUs
torch.cuda.empty_cache()

In [6]:
# !unzip -q /kaggle/input/carvana-image-masking-challenge/train.zip
# !unzip -q /kaggle/input/carvana-image-masking-challenge/test.zip
# !unzip -q /kaggle/input/carvana-image-masking-challenge/train_masks.zip

In [7]:
class CarvanaDatasetLoader(Dataset):
    def __init__(self, image_dir, mask_dir):
        self.image_dir = image_dir
        self.mask_dir = mask_dir
        self.images = os.listdir(image_dir)

    def __len__(self):
        return len(self.images)

    def __getitem__(self, index):
        img_path = os.path.join(self.image_dir, self.images[index])
        mask_path = os.path.join(self.mask_dir, self.images[index].replace(".jpg", "_mask.gif"))
        image = np.array(Image.open(img_path).convert("RGB"))
        mask = np.array(Image.open(mask_path).convert("L"), dtype=np.float32)
        mask[mask == 255.0] = 1.0 # we are using a sigmoid in the last activation, built-in with BCEWithLogitsLoss

        return image, mask
    
class CarvanaDataset(Dataset):
    def __init__(self, subset, transform=None):
        self.subset = subset
        self.transform = transform
        
    def __getitem__(self, index):
        image, mask = self.subset[index]
        
        # data to augmentations as named arguments
        if self.transform:
            augmentations = self.transform(image=image, mask=mask)
            image = augmentations["image"]
            mask = augmentations["mask"]

        return image, mask
        
    def __len__(self):
        return len(self.subset)

In [8]:
train_transform = A.Compose(
    [
        A.Resize(height=IMAGE_HEIGHT, width=IMAGE_WIDTH),
        A.Rotate(limit=35, p=1.0),
        A.HorizontalFlip(p=0.5),
        A.VerticalFlip(p=0.1),
        A.Normalize(
            mean=[0.0, 0.0, 0.0],
            std=[1.0, 1.0, 1.0],
            max_pixel_value=255.0,
        ),
        ToTensorV2(),
    ],
)

val_transforms = A.Compose(
    [
        A.Resize(height=IMAGE_HEIGHT, width=IMAGE_WIDTH),
        A.Normalize(
            mean=[0.0, 0.0, 0.0],
            std=[1.0, 1.0, 1.0],
            max_pixel_value=255.0,
        ),
        ToTensorV2(),
    ],
)

In [9]:
dataset = CarvanaDatasetLoader(TRAIN_IMG_DIR, MASK_DIR)

train_set_size = int(len(dataset) * 0.8)
val_set_size = len(dataset) - train_set_size

train_split, val_split = random_split(dataset, [train_set_size, val_set_size])
train_dataset, val_dataset = CarvanaDataset(train_split, transform=train_transform), CarvanaDataset(val_split, transform=val_transforms)

In [10]:
# class CarvanaDataset(Dataset):
#     def __init__(self, image_dir, mask_dir, transform=None):
#         self.image_dir = image_dir
#         self.mask_dir = mask_dir
#         self.images = os.listdir(image_dir)
#         self.transform = transform

#     def __len__(self):
#         return len(self.images)

#     def __getitem__(self, index):
#         img_path = os.path.join(self.image_dir, self.images[index])
#         mask_path = os.path.join(self.mask_dir, self.images[index].replace(".jpg", "_mask.gif"))
#         image = np.array(Image.open(img_path).convert("RGB"))
#         mask = np.array(Image.open(mask_path).convert("L"), dtype=np.float32)
#         mask[mask == 255.0] = 1.0 # we are using a sigmoid in the last activation, built-in with BCEWithLogitsLoss

#         if self.transform:
#             augmentations = self.transform(image=image, mask=mask)
#             image = augmentations["image"]
#             mask = augmentations["mask"]
            
#         return image, mask

In [11]:
# train_dataset, val_dataset = CarvanaDataset(TRAIN_IMG_DIR, MASK_DIR, transform=train_transform), CarvanaDataset(TEST_IMG_DIR, MASK_DIR, transform=val_transforms)

In [12]:
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=NUM_WORKERS, pin_memory=PIN_MEMORY, drop_last=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS, pin_memory=PIN_MEMORY)

In [13]:
def train_fn(loader, model, optimizer, loss_fn, scaler):
    loop = tqdm(loader)

    for batch_idx, (data, targets) in enumerate(loop):
        data = data.to(device=DEVICE)
        targets = targets.float().unsqueeze(1).to(device=DEVICE) # adding as extra-dimension to compensate for the batch-size

        # forward
        with torch.cuda.amp.autocast():
            predictions = model(data)
            loss = loss_fn(predictions, targets)

        # backward
        optimizer.zero_grad()
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        # update tqdm loop
        loop.set_postfix(loss=loss.item())

In [14]:
def check_accuracy(loader, model, device="cuda"):
    num_correct = 0
    num_pixels = 0
    dice_score = 0
    
    model.eval()
    with torch.no_grad():
        for x, y in loader:
            x = x.to(device)
            y = y.to(device).unsqueeze(1)
            preds = torch.sigmoid(model(x))
            preds = (preds > 0.5).float()
            num_correct += (preds == y).sum()
            num_pixels += torch.numel(preds)
            # IoU = Intersection / Union = TP/(TP+FP+FN) = sigma(y*y_hat)/sigma(y_hat+y-y_hat*y) -> soft probabilities version
            # dice = 2 * Intersection / (Union + Intersection) = (2 * TP) / (2*TP+FN+FP) = 2*sigma(y_hat*y)/sigma(y_hat+y) -> soft probabilities version
            # dice score or dice coefficient is equal to F1 score; F1 score = harmonic mean of precision and recall;
            # precision = TP/(TP+FP); recall = TP/(TP+FN)
            # harmonic mean gives the mean but is a little biased towards the lower side, i.e., it penalizes the worst score between precision and recall, so it tries to balance the both
            dice_score += (2 * (preds * y).sum()) / (
                (preds + y).sum() + 1e-8
            ) # this is a better metric, as giving black pixel will always result in accuracy over 70%, similar to object dectetion, where intersection over union is better

    print(
        f"Got {num_correct}/{num_pixels} with acc {num_correct/num_pixels*100:.2f}"
    )
    print(f"Dice score: {dice_score/len(loader)}")
    model.train()

In [15]:
!mkdir "saved_images"
# !rm -rf "saved_images"

mkdir: cannot create directory ‘saved_images’: File exists


In [16]:
def save_predictions_as_imgs(
    loader, model, folder="saved_images/", device="cuda"
):
    model.eval()
    for idx, (x, y) in enumerate(loader):
        x = x.to(device=device)
        with torch.no_grad():
            preds = torch.sigmoid(model(x))
            preds = (preds > 0.5).float()
        torchvision.utils.save_image(
            preds, f"{folder}/{idx}_pred.png"
        )
        torchvision.utils.save_image(y.unsqueeze(1), f"{folder}{idx}.png")

    model.train()

In [17]:
model = UNet(in_channels=3, out_channels=1).to(DEVICE)
loss_fn = nn.BCEWithLogitsLoss() # change to cross-entropy loss if using for mulit-class classification; this may not be the best loss function as about 70% of the image is background and the model just collapse and it would still be 70%
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

# check_accuracy(val_loader, model, device=DEVICE)
scaler = torch.cuda.amp.GradScaler()

for epoch in range(NUM_EPOCHS):
    train_fn(train_loader, model, optimizer, loss_fn, scaler)

    # check accuracy
    check_accuracy(val_loader, model, device=DEVICE)

    # print some examples to a folder
    if epoch == NUM_EPOCHS - 1:
        save_predictions_as_imgs(
            val_loader, model, folder="saved_images/", device=DEVICE
        )

100%|██████████| 1017/1017 [04:14<00:00,  3.99it/s, loss=0.0657]


Got 155222289/156364800 with acc 99.27
Dice score: 0.9830320477485657


100%|██████████| 1017/1017 [04:13<00:00,  4.01it/s, loss=0.0275]


Got 155416469/156364800 with acc 99.39
Dice score: 0.9859538674354553


100%|██████████| 1017/1017 [04:14<00:00,  4.00it/s, loss=0.0177]


Got 155435943/156364800 with acc 99.41
Dice score: 0.9861006140708923


100%|██████████| 1017/1017 [04:13<00:00,  4.01it/s, loss=0.0147]


Got 154356814/156364800 with acc 98.72
Dice score: 0.9703584313392639


100%|██████████| 1017/1017 [04:14<00:00,  4.00it/s, loss=0.0156] 


Got 155768090/156364800 with acc 99.62
Dice score: 0.9910872578620911


100%|██████████| 1017/1017 [04:12<00:00,  4.02it/s, loss=0.014]  


Got 155835419/156364800 with acc 99.66
Dice score: 0.9920828938484192


100%|██████████| 1017/1017 [04:13<00:00,  4.02it/s, loss=0.0107] 


Got 155820364/156364800 with acc 99.65
Dice score: 0.9918399453163147


100%|██████████| 1017/1017 [04:13<00:00,  4.02it/s, loss=0.00676]


Got 155759194/156364800 with acc 99.61
Dice score: 0.9909412264823914


In [18]:
!zip -qr saved.zip /kaggle/working/saved_images