In [1]:
import torch
import torch.nn as nn
import numpy as np
import os
# from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader
from torchvision.utils import save_image
from PIL import Image
import matplotlib.pyplot as plt
import torchvision.transforms as transforms
import torch.optim as optim
from tqdm import tqdm

In [2]:
# ignoring the warning messages
import warnings
from IPython.display import display
warnings.filterwarnings('ignore')

In [3]:
class TrainValDataset(Dataset):
    """Dataset of (RGB image, depth map) pairs stored in two parallel folders.

    A depth map is looked up in ``depth_image_path`` under the *same file name*
    as its RGB image in ``image_data_path``.

    Args:
        image_data_path: Directory containing the RGB images.
        depth_image_path: Directory containing the depth maps.
        image_transform: Optional callable applied to the RGB ``PIL.Image``.
        depth_transform: Optional callable applied to the depth ``PIL.Image``.
    """

    def __init__(self, image_data_path, depth_image_path, image_transform=None, depth_transform=None):
        self.image_data_path = image_data_path
        self.depth_image_path = depth_image_path
        self.image_transform = image_transform
        self.depth_transform = depth_transform
        # os.listdir() returns entries in arbitrary order; sorting makes the
        # index -> sample mapping reproducible across runs and machines.
        self.images_list = sorted(os.listdir(image_data_path))

    def __len__(self):
        """Return the number of files found in the RGB image directory."""
        return len(self.images_list)

    def __getitem__(self, idx):
        """Load the ``idx``-th pair.

        Returns:
            ``(normal_img, depth_image)``: the RGB image (3 channels) and the
            depth map (single channel), each passed through its transform
            when one was supplied.

        Raises:
            FileNotFoundError: if the depth folder has no file with the same
                name as the RGB image (raised by ``Image.open``).
        """
        file_name = self.images_list[idx]
        image_path = os.path.join(self.image_data_path, file_name)
        depth_path = os.path.join(self.depth_image_path, file_name)

        # convert() returns a fully loaded copy, so the underlying file can be
        # closed as soon as the conversion is done.
        with Image.open(image_path) as raw_image:
            normal_img = raw_image.convert('RGB')  # 3-channel colour image
        with Image.open(depth_path) as raw_depth:
            depth_image = raw_depth.convert('L')  # single-channel grayscale

        if self.image_transform:
            normal_img = self.image_transform(normal_img)

        if self.depth_transform:
            depth_image = self.depth_transform(depth_image)

        return normal_img, depth_image
        


In [4]:
class TestDataset(Dataset):
    """Dataset of unlabeled RGB test images.

    Args:
        test_path: Directory containing the test images.
        test_transform: Optional callable applied to each ``PIL.Image``.
    """

    def __init__(self, test_path, test_transform=None):
        self.test_path = test_path
        self.test_transform = test_transform
        # os.listdir() order is arbitrary; sort so predictions are produced in
        # a stable, reproducible order.
        self.images_list = sorted(os.listdir(test_path))

    def __len__(self):
        """Return the number of files found in the test directory."""
        return len(self.images_list)

    def __getitem__(self, idx):
        """Return ``(test_image, name_image)`` for the ``idx``-th file.

        ``name_image`` is the bare file name, so callers can associate a
        prediction with the file it came from.
        """
        name_image = self.images_list[idx]
        img_name = os.path.join(self.test_path, name_image)

        # convert() yields a loaded copy, so the file handle can be released.
        with Image.open(img_name) as raw_image:
            test_image = raw_image.convert('RGB')  # 3-channel colour image

        if self.test_transform:
            test_image = self.test_transform(test_image)

        return test_image, name_image

In [5]:
# Preprocessing pipelines for the RGB images and the depth maps.
IMAGE_SIZE = (256, 256)
# These values match the commonly used ImageNet per-channel statistics.
IMAGENET_MEAN = [0.485, 0.456, 0.406]
IMAGENET_STD = [0.229, 0.224, 0.225]

# RGB input: resize -> tensor in [0, 1] -> per-channel standardisation.
noraml_image_transform = transforms.Compose(
    [
        transforms.Resize(IMAGE_SIZE),
        # transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ]
)

# Depth target: resize -> tensor in [0, 1]; no further normalisation.
# NOTE: a random flip here would be sampled independently of the RGB pipeline
# and would break the pixel alignment of image/depth pairs.
depth_image_transform = transforms.Compose(
    [
        transforms.Resize(IMAGE_SIZE),
        # transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        # transforms.Normalize(mean=[0.5], std=[0.5])
    ]
)

In [6]:
# Root of the competition data; every split lives in a sub-folder of it.
DATA_ROOT = '/kaggle/input/dlp-week-10/competition-data/competition-data'

# Training split
normal_train_img_pth = f'{DATA_ROOT}/training/images'
depth_train_img_pth = f'{DATA_ROOT}/training/depths'

# Validation split
normal_val_img_pth = f'{DATA_ROOT}/validation/images'
depth_val_img_pth = f'{DATA_ROOT}/validation/depths'

# Test split (images only, no depth maps)
normal_test_img_pth = f'{DATA_ROOT}/testing/images'

In [7]:
# Training pairs (RGB image + depth map), each side with its own preprocessing.
dataset = TrainValDataset(
    image_data_path=normal_train_img_pth,
    depth_image_path=depth_train_img_pth,
    image_transform=noraml_image_transform,
    depth_transform=depth_image_transform,
)

# Validation pairs, preprocessed exactly like the training pairs.
val_dataset = TrainValDataset(
    image_data_path=normal_val_img_pth,
    depth_image_path=depth_val_img_pth,
    image_transform=noraml_image_transform,
    depth_transform=depth_image_transform,
)

# Test images only; they reuse the RGB preprocessing pipeline.
test_dataset = TestDataset(
    test_path=normal_test_img_pth,
    test_transform=noraml_image_transform,
)


In [8]:
# Number of samples per mini-batch; shared by the train, validation and test loaders.
batch_size = 32

In [9]:
# Training batches are reshuffled on every pass over the data.
train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Validation and test batches keep the dataset order.
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [10]:
# for idx, (imag, names) in enumerate(test_loader):
#     count = 0
#     if count < 50:
#         print(names)
#         count +=1

In [11]:
# class CustomDepthEstimationModel(nn.Module):
#     def __init__(self):
#         super(CustomDepthEstimationModel, self).__init__()
#         #channels: 3->64,
#         self.conv1 = nn.Conv2d(in_channels = 3, out_channels = 64, kernel_size = 3, stride = 1, padding = 1)
#         self.conv2 = nn.Conv2d(in_channels = 64, out_channels = 64, kernel_size = 3, stride = 1, padding = 1)
#         self.maxpool1 = nn.MaxPool2d(kernel_size = 2, stride = 2, padding = 0)
#         #channels: 64->128
#         self.conv3 = nn.Conv2d(in_channels = 64, out_channels = 128, kernel_size = 3, stride = 1, padding = 1)
#         self.conv4 = nn.Conv2d(in_channels = 128, out_channels = 128, kernel_size = 3, stride = 1, padding = 1)
#         self.maxpool2 = nn.MaxPool2d(kernel_size = 2, stride = 2, padding = 0)
#         #channels: 128->256
#         self.conv5 = nn.Conv2d(in_channels = 128, out_channels = 256, kernel_size = 3, stride = 1, padding = 1)
#         self.conv6 = nn.Conv2d(in_channels = 256, out_channels = 256, kernel_size = 3, stride = 1, padding = 1)
#         self.maxpool3 = nn.MaxPool2d(kernel_size = 2, stride = 2, padding = 0)
#         #channels: 256->512
#         self.conv7 = nn.Conv2d(in_channels = 256, out_channels = 512, kernel_size = 3, stride = 1, padding = 1)
#         self.conv8 = nn.Conv2d(in_channels = 512, out_channels = 512, kernel_size = 3, stride = 1, padding = 1)
#         self.maxpool4 = nn.MaxPool2d(kernel_size = 2, stride = 2, padding = 0)
#         #channels:512->1024
#         self.conv9 = nn.Conv2d(in_channels = 512, out_channels = 1024, kernel_size = 3, stride = 1, padding = 1)
#         self.conv10 = nn.Conv2d(in_channels = 1024, out_channels = 1024, kernel_size = 3, stride = 1, padding = 1)
#         self.upconv1 = nn.ConvTranspose2d(in_channels = 1024, out_channels = 512, kernel_size = 2, stride = 2)
#         #channels:1024->512
#         self.conv11 = nn.Conv2d(in_channels = 1024, out_channels = 512, kernel_size = 3, stride = 1, padding = 1)
#         self.conv12 = nn.Conv2d(in_channels = 512, out_channels = 512, kernel_size = 3, stride = 1, padding = 1)
#         self.upconv2 = nn.ConvTranspose2d(in_channels = 512, out_channels = 256, kernel_size = 2, stride = 2)
#         #channels: 512->256
#         self.conv13 = nn.Conv2d(in_channels = 512, out_channels = 256, kernel_size = 3, stride = 1, padding = 1)
#         self.conv14 = nn.Conv2d(in_channels = 256, out_channels = 256, kernel_size = 3, stride = 1, padding = 1)
#         self.upconv3 = nn.ConvTranspose2d(in_channels = 256, out_channels = 128, kernel_size = 2, stride = 2)
#         #channels:256->128
#         self.conv15 = nn.Conv2d(in_channels = 256, out_channels = 128, kernel_size = 3, stride = 1, padding = 1)
#         self.conv16 = nn.Conv2d(in_channels = 128, out_channels = 128, kernel_size = 3, stride = 1, padding = 1)
#         self.upconv4 = nn.ConvTranspose2d(in_channels = 128, out_channels = 64, kernel_size = 2, stride = 2)
#         #channels:128->64
#         self.conv17 = nn.Conv2d(in_channels = 128, out_channels = 64, kernel_size = 3, stride = 1, padding = 1)
#         self.conv18 = nn.Conv2d(in_channels = 64, out_channels = 64, kernel_size = 3, stride = 1, padding = 1)
#         #channels:64->1
#         self.conv19 = nn.Conv2d(in_channels = 64, out_channels = 1, kernel_size = 1, stride = 1, padding = 0)

#         #relu: for non-linearity relu(x)=max(0,x)
#         self.relu = nn.ReLU()
#         #group_normalisation
#         self.gn1 = nn.GroupNorm(16, 64)
#         self.gn2 = nn.GroupNorm(16, 128)
#         self.gn3 = nn.GroupNorm(16, 256)
#         self.gn4 = nn.GroupNorm(16, 512)
#         self.gn5 = nn.GroupNorm(16, 1024)
#         #drop out layers; set the fraction of neurons to 0
#         self.dropout = nn.Dropout(0.5)
#         self.dropout1 = nn.Dropout(0.25)

#     def forward(self, x):
#         x = self.conv1(x)
#         x = self.gn1(x)
#         x = self.relu(x)
#         x = self.conv2(x)
#         x = self.gn1(x)
#         x = self.relu(x)
#         out1 = x
#         x = self.maxpool1(x)
#         x = self.dropout1(x)
#         x = self.conv3(x)
#         x = self.gn2(x)
#         x = self.relu(x)
#         x = self.conv4(x)
#         x = self.gn2(x)
#         x = self.relu(x)
#         out2 = x
#         x = self.maxpool2(x)
#         x = self.dropout(x)
#         x = self.conv5(x)
#         x = self.gn3(x)
#         x = self.relu(x)
#         x = self.conv6(x)
#         x = self.gn3(x)
#         x = self.relu(x)
#         out3 = x
#         x = self.maxpool3(x)
#         x = self.dropout(x)
#         x = self.conv7(x)
#         x = self.gn4(x)
#         x = self.relu(x)
#         x = self.conv8(x)
#         x = self.gn4(x)
#         x = self.relu(x)
#         out4 = x
#         x = self.maxpool4(x)
#         x = self.dropout(x)
#         x = self.conv9(x)
#         x = self.gn5(x)
#         x = self.relu(x)
#         x = self.conv10(x)
#         x = self.gn5(x)
#         x = self.relu(x)
#         x = self.upconv1(x)
#         x = torch.cat((x, out4), 1)
#         x = self.dropout(x)
#         x = self.conv11(x)
#         x = self.gn4(x)
#         x = self.relu(x)
#         x = self.conv12(x)
#         x = self.gn4(x)
#         x = self.relu(x)
#         x = self.upconv2(x)
#         x = torch.cat((x, out3), 1)
#         x = self.dropout(x)
#         x = self.conv13(x)
#         x = self.gn3(x)
#         x = self.relu(x)
#         x = self.conv14(x)
#         x = self.gn3(x)
#         x = self.relu(x)
#         x = self.upconv3(x)
#         x = torch.cat((x, out2), 1)
#         x = self.dropout(x)
#         x = self.conv15(x)
#         x = self.gn2(x)
#         x = self.relu(x)
#         x = self.conv16(x)
#         x = self.gn2(x)
#         x = self.relu(x)
#         x = self.upconv4(x)
#         x = torch.cat((x, out1), 1)
#         x = self.dropout(x)
#         x = self.conv17(x)
#         x = self.gn1(x)
#         x = self.relu(x)
#         x = self.conv18(x)
#         x = self.gn1(x)
#         x = self.relu(x)
#         x = self.conv19(x)
#         x = torch.nn.functional.sigmoid(x) #since depth is between 0-1
#         return x


In [12]:
# class CustomDepthEstimationModel(nn.Module):
#     def __init__(self, input_channels=3, output_channels=1):
#         super(CustomDepthEstimationModel, self).__init__()

#         # Encoder: Custom DenseNet-like feature extractor (Dense blocks)
#         self.encoder = nn.Sequential(
#             self._dense_block(input_channels, 64, num_layers=6),
#             self._dense_block(64, 128, num_layers=12),
#             self._dense_block(128, 256, num_layers=24),
#             self._dense_block(256, 512, num_layers=16)
#         )

#         # Decoder: Upsample and refine features progressively
#         self.upconv1 = self._conv_block(512, 256)
#         self.upconv2 = self._conv_block(256, 128)
#         self.upconv3 = self._conv_block(128, 64)
#         self.upconv4 = self._conv_block(64, 32)

#         # Final Depth Map Prediction
#         self.final_conv = nn.Conv2d(32, output_channels, kernel_size=(1, 1))  # Output single channel depth map

#     def forward(self, x):
#         # Encoder: Extract features
#         x = self.encoder(x)

#         # Decoder: Upsample and refine feature maps
#         x = self.upconv1(x)
#         x = self.upconv2(x)
#         x = self.upconv3(x)
#         x = self.upconv4(x)

#         # Output: Depth map prediction
#         depth_map = self.final_conv(x)
#         return depth_map

#     def _dense_block(self, in_channels, out_channels, num_layers):
#         """Create a dense block with multiple convolutional layers."""
#         layers = []
#         for i in range(num_layers):
#             layers.append(self._conv_block(in_channels, out_channels))
#             in_channels = out_channels  # Each new layer takes all previous outputs

#         return nn.Sequential(*layers)

#     def _conv_block(self, in_channels, out_channels):
#         """Convolution block with ReLU activation, BatchNorm, and Dropout."""
#         block = nn.Sequential(
#             nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1),
#             nn.ReLU(inplace=True),
#             nn.BatchNorm2d(out_channels),
#             nn.Dropout(0.2)
#         )
#         return block

# # Instantiate the model
# # model = DepthEstimationModel(input_channels=3, output_channels=1)

In [13]:
class CustomDepthEstimationModel(nn.Module):
    """U-Net-style encoder/decoder that maps an RGB image to a depth map.

    Layout:
        * Encoder: four stages of two 3x3 convolutions (64, 128, 256, 512
          channels), each followed by 2x2 max-pooling and dropout.
        * Bottleneck: two 3x3 convolutions with 1024 channels.
        * Decoder: four stages of 2x transposed-convolution upsampling,
          concatenation with the matching encoder output (skip connection),
          dropout, and two 3x3 convolutions.
        * Head: 1x1 convolution to one channel followed by a sigmoid.

    Notes:
        * Every convolution of a given width shares one ``GroupNorm`` instance
          (``gn1`` .. ``gn5``), so the normalisation affine parameters are
          shared between those layers.
        * Attribute names and registration order are kept stable so that
          previously saved ``state_dict`` checkpoints remain loadable.
    """

    def __init__(self):
        super().__init__()

        def conv3x3(in_channels, out_channels):
            # Size-preserving 3x3 convolution.
            return nn.Conv2d(in_channels=in_channels, out_channels=out_channels,
                             kernel_size=3, stride=1, padding=1)

        def downsample():
            # Halves the spatial resolution.
            return nn.MaxPool2d(kernel_size=2, stride=2, padding=0)

        def upsample(in_channels, out_channels):
            # Doubles the spatial resolution.
            return nn.ConvTranspose2d(in_channels=in_channels, out_channels=out_channels,
                                      kernel_size=2, stride=2)

        # Encoder stage 1: 3 -> 64
        self.conv1 = conv3x3(3, 64)
        self.conv2 = conv3x3(64, 64)
        self.maxpool1 = downsample()
        # Encoder stage 2: 64 -> 128
        self.conv3 = conv3x3(64, 128)
        self.conv4 = conv3x3(128, 128)
        self.maxpool2 = downsample()
        # Encoder stage 3: 128 -> 256
        self.conv5 = conv3x3(128, 256)
        self.conv6 = conv3x3(256, 256)
        self.maxpool3 = downsample()
        # Encoder stage 4: 256 -> 512
        self.conv7 = conv3x3(256, 512)
        self.conv8 = conv3x3(512, 512)
        self.maxpool4 = downsample()
        # Bottleneck: 512 -> 1024
        self.conv9 = conv3x3(512, 1024)
        self.conv10 = conv3x3(1024, 1024)
        self.upconv1 = upsample(1024, 512)
        # Decoder stage 1: (512 upsampled + 512 skip) -> 512
        self.conv11 = conv3x3(1024, 512)
        self.conv12 = conv3x3(512, 512)
        self.upconv2 = upsample(512, 256)
        # Decoder stage 2: (256 + 256) -> 256
        self.conv13 = conv3x3(512, 256)
        self.conv14 = conv3x3(256, 256)
        self.upconv3 = upsample(256, 128)
        # Decoder stage 3: (128 + 128) -> 128
        self.conv15 = conv3x3(256, 128)
        self.conv16 = conv3x3(128, 128)
        self.upconv4 = upsample(128, 64)
        # Decoder stage 4: (64 + 64) -> 64
        self.conv17 = conv3x3(128, 64)
        self.conv18 = conv3x3(64, 64)
        # Head: 64 -> 1 (per-pixel depth)
        self.conv19 = nn.Conv2d(in_channels=64, out_channels=1, kernel_size=1, stride=1, padding=0)

        # Non-linearity: relu(x) = max(0, x)
        self.relu = nn.ReLU()
        # Group normalisation (16 groups), one instance per channel width.
        self.gn1 = nn.GroupNorm(16, 64)
        self.gn2 = nn.GroupNorm(16, 128)
        self.gn3 = nn.GroupNorm(16, 256)
        self.gn4 = nn.GroupNorm(16, 512)
        self.gn5 = nn.GroupNorm(16, 1024)
        # Dropout: p=0.25 after the first pooling step, p=0.5 everywhere else.
        self.dropout = nn.Dropout(0.5)
        self.dropout1 = nn.Dropout(0.25)

    def _double_conv(self, x, first_conv, second_conv, norm):
        """Apply ``conv -> norm -> ReLU`` twice, reusing ``norm`` for both convolutions."""
        x = self.relu(norm(first_conv(x)))
        return self.relu(norm(second_conv(x)))

    def forward(self, x):
        """Predict a depth map.

        Args:
            x: Tensor of shape ``(N, 3, H, W)``. ``H`` and ``W`` should be
               multiples of 16: the input is pooled by 2 four times and then
               upsampled by 2 four times, and each skip concatenation needs
               matching spatial sizes.

        Returns:
            Tensor of shape ``(N, 1, H, W)`` with values in ``[0, 1]``.
        """
        # ---- Encoder (keep each stage's output for the skip connections) ----
        skip1 = self._double_conv(x, self.conv1, self.conv2, self.gn1)
        x = self.dropout1(self.maxpool1(skip1))

        skip2 = self._double_conv(x, self.conv3, self.conv4, self.gn2)
        x = self.dropout(self.maxpool2(skip2))

        skip3 = self._double_conv(x, self.conv5, self.conv6, self.gn3)
        x = self.dropout(self.maxpool3(skip3))

        skip4 = self._double_conv(x, self.conv7, self.conv8, self.gn4)
        x = self.dropout(self.maxpool4(skip4))

        # ---- Bottleneck ----
        x = self._double_conv(x, self.conv9, self.conv10, self.gn5)

        # ---- Decoder: upsample, concatenate skip on the channel axis, refine ----
        x = self.dropout(torch.cat((self.upconv1(x), skip4), 1))
        x = self._double_conv(x, self.conv11, self.conv12, self.gn4)

        x = self.dropout(torch.cat((self.upconv2(x), skip3), 1))
        x = self._double_conv(x, self.conv13, self.conv14, self.gn3)

        x = self.dropout(torch.cat((self.upconv3(x), skip2), 1))
        x = self._double_conv(x, self.conv15, self.conv16, self.gn2)

        x = self.dropout(torch.cat((self.upconv4(x), skip1), 1))
        x = self._double_conv(x, self.conv17, self.conv18, self.gn1)

        # ---- Head ----
        x = self.conv19(x)
        # torch.sigmoid replaces the deprecated torch.nn.functional.sigmoid;
        # the output is squashed to [0, 1] since depth is between 0 and 1.
        return torch.sigmoid(x)


In [14]:
# class CustomDepthEstimationModel(nn.Module):
#     def __init__(self):
#         super(CustomDepthEstimationModel, self).__init__()
#         self.encoder = nn.Sequential(
#             nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=1),
#             nn.ReLU(),
#             nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1),
#             nn.ReLU(),
#             nn.MaxPool2d(kernel_size=2, stride=2)
#         )
#         self.decoder = nn.Sequential(
#             nn.ConvTranspose2d(64, 32, kernel_size=2, stride=2),
#             nn.ReLU(),
#             nn.Conv2d(32, 1, kernel_size=3, stride=1, padding=1),
#         )

#     def forward(self, x):
#         x = self.encoder(x)
#         x = self.decoder(x)
#         return x


In [15]:
# import torch
# import torch.nn as nn

# class CustomDepthEstimationModel(nn.Module):
#     def __init__(self, dropout_rate=0.3):
#         super(CustomDepthEstimationModel, self).__init__()

#         # Encoder
#         self.encoder_block1 = self._conv_block(3, 64, dropout_rate)  # Input channels = 3 (RGB)
#         self.encoder_block2 = self._conv_block(64, 128, dropout_rate)
#         self.encoder_block3 = self._conv_block(128, 256, dropout_rate)
#         self.encoder_block4 = self._conv_block(256, 512, dropout_rate)

#         # Bottleneck
#         self.bottleneck = nn.Sequential(
#             nn.Conv2d(512, 512, kernel_size=3, stride=1, padding=1),
#             nn.BatchNorm2d(512),
#             nn.GELU(),
#             nn.Dropout(p=dropout_rate)  # Regularization in bottleneck
#         )

#         # Decoder
#         self.decoder_block1 = self._up_conv_block(512, 256, dropout_rate)
#         self.decoder_block2 = self._up_conv_block(256, 128, dropout_rate)
#         self.decoder_block3 = self._up_conv_block(128, 64, dropout_rate)
#         self.decoder_block4 = nn.Conv2d(64, 1, kernel_size=3, stride=1, padding=1)  # Output channel = 1 (depth map)

#         # Skip Connections
#         self.skip_conv1 = nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1)
#         self.skip_conv2 = nn.Conv2d(128, 128, kernel_size=3, stride=1, padding=1)
#         self.skip_conv3 = nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1)

#     def _conv_block(self, in_channels, out_channels, dropout_rate):
#         return nn.Sequential(
#             nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1),
#             nn.BatchNorm2d(out_channels),
#             nn.GELU(),
#             nn.Dropout(p=dropout_rate),
#             nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1),
#             nn.BatchNorm2d(out_channels),
#             nn.GELU(),
#             nn.Dropout(p=dropout_rate),
#             nn.MaxPool2d(kernel_size=2, stride=2)
#         )

#     def _up_conv_block(self, in_channels, out_channels, dropout_rate):
#         return nn.Sequential(
#             nn.ConvTranspose2d(in_channels, out_channels, kernel_size=2, stride=2, padding=0),
#             nn.GELU(),
#             nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1),
#             nn.BatchNorm2d(out_channels),
#             nn.GELU(),
#             nn.Dropout(p=dropout_rate)  # Add dropout for decoder too
#         )

#     def forward(self, x):
#         # Encoder
#         enc1 = self.encoder_block1(x)  # 64 channels
#         enc2 = self.encoder_block2(enc1)  # 128 channels
#         enc3 = self.encoder_block3(enc2)  # 256 channels
#         enc4 = self.encoder_block4(enc3)  # 512 channels

#         # Bottleneck
#         bottleneck = self.bottleneck(enc4)

#         # Decoder with skip connections
#         dec1 = self.decoder_block1(bottleneck) + self.skip_conv3(enc3)
#         dec2 = self.decoder_block2(dec1) + self.skip_conv2(enc2)
#         dec3 = self.decoder_block3(dec2) + self.skip_conv1(enc1)
#         dec4 = self.decoder_block4(dec3)  # Final depth map output

#         return dec4


In [16]:
# import torch
# import torch.nn as nn

# class CustomDepthEstimationModel(nn.Module):
#     def __init__(self):
#         super(CustomDepthEstimationModel, self).__init__()

#         # Encoder
#         self.encoder_block1 = self._conv_block(3, 64)  # Input channels = 3 (RGB)
#         self.encoder_block2 = self._conv_block(64, 128)
#         self.encoder_block3 = self._conv_block(128, 256)
#         self.encoder_block4 = self._conv_block(256, 512)

#         # Decoder
#         self.decoder_block1 = self._up_conv_block(512, 256)
#         self.decoder_block2 = self._up_conv_block(256, 128)
#         self.decoder_block3 = self._up_conv_block(128, 64)
#         self.decoder_block4 = nn.Conv2d(64, 1, kernel_size=3, stride=1, padding=1)  # Output channel = 1 (depth map)

#         # Skip Connections
#         self.skip_conv1 = nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1)
#         self.skip_conv2 = nn.Conv2d(128, 128, kernel_size=3, stride=1, padding=1)
#         self.skip_conv3 = nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1)

#     def _conv_block(self, in_channels, out_channels):
#         return nn.Sequential(
#             nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1),
#             nn.BatchNorm2d(out_channels),
#             nn.ReLU(),
#             nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1),
#             nn.BatchNorm2d(out_channels),
#             nn.ReLU(),
#             nn.MaxPool2d(kernel_size=2, stride=2)
#         )

#     def _up_conv_block(self, in_channels, out_channels):
#         return nn.Sequential(
#             nn.ConvTranspose2d(in_channels, out_channels, kernel_size=2, stride=2),
#             nn.ReLU(),
#             nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1),
#             nn.ReLU()
#         )

#     def forward(self, x):
#         # Encoder
#         enc1 = self.encoder_block1(x)  # 64 channels
#         enc2 = self.encoder_block2(enc1)  # 128 channels
#         enc3 = self.encoder_block3(enc2)  # 256 channels
#         enc4 = self.encoder_block4(enc3)  # 512 channels

#         # Decoder with skip connections
#         dec1 = self.decoder_block1(enc4) + self.skip_conv3(enc3)  # Add skip connection
#         dec2 = self.decoder_block2(dec1) + self.skip_conv2(enc2)
#         dec3 = self.decoder_block3(dec2) + self.skip_conv1(enc1)
#         dec4 = self.decoder_block4(dec3)  # Final depth map output

#         return dec4


In [17]:
def minmaxscaler(data):
    """Linearly rescale ``data`` so its minimum maps to 0 and its maximum to 1.

    Args:
        data: Array-like of numbers (converted with ``np.asarray``).

    Returns:
        ``np.ndarray`` with the same shape as ``data``. When every element is
        equal (zero range) an all-zero array is returned instead of dividing
        by zero and producing NaNs.

    Raises:
        ValueError: if ``data`` is empty (raised by ``np.min``).
    """
    data = np.asarray(data)
    lowest = np.min(data)
    value_range = np.max(data) - lowest
    if value_range == 0:
        # Constant input: there is no spread to normalise.
        return np.zeros(data.shape, dtype=float)
    return (data - lowest) / value_range

In [45]:
def training_loop(model, criterion, optimizer, train_loader, epoch, epochs, device, scheduler=None):
    """Run one optimisation pass over ``train_loader``.

    Args:
        model: Network to train; switched to training mode here.
        criterion: Callable invoked as ``criterion(prediction, target)``.
        optimizer: Optimizer holding the model parameters.
        train_loader: Iterable of ``(image, depth)`` batches that supports ``len()``.
        epoch: Zero-based index of the current epoch (progress label only).
        epochs: Total number of epochs (progress label only).
        device: Device the batches are moved to.
        scheduler: Optional LR scheduler, stepped once after all batches.

    Returns:
        Sum (not mean) of the per-batch loss values for this epoch.
    """
    model.train()
    epoch_loss_sum = 0.0

    progress_label = f'Epoch{epoch+1}/{epochs}'
    with tqdm(total=len(train_loader), desc=progress_label, unit='batch') as progress:
        for rgb_batch, depth_batch in train_loader:
            rgb_batch = rgb_batch.to(device)
            depth_batch = depth_batch.to(device)

            # Standard step: clear gradients, forward, backward, update.
            optimizer.zero_grad()
            prediction = model(rgb_batch)
            batch_loss = criterion(prediction, depth_batch)
            batch_loss.backward()
            optimizer.step()

            epoch_loss_sum += batch_loss.item()

            # The bar shows the cumulative loss so far.
            progress.set_postfix({'custom loss error': epoch_loss_sum})
            progress.update(1)

        # Per-epoch schedule: one scheduler step after the last batch.
        if scheduler:
            scheduler.step()

    return epoch_loss_sum


In [46]:
# Validation loop
def validation_loop(model, criterion, val_loader, device, epoch, epochs):
    """Evaluate ``model`` on ``val_loader`` without updating any weights.

    Args:
        model: Network to evaluate; switched to evaluation mode here.
        criterion: Callable invoked as ``criterion(prediction, target)`` --
            the same two-argument convention the training loop uses.
        val_loader: Iterable of ``(image, depth)`` batches that supports ``len()``.
        device: Device the batches are moved to.
        epoch: Zero-based index of the current epoch (progress label only).
        epochs: Total number of epochs (progress label only).

    Returns:
        Sum (not mean) of the per-batch loss values over the validation set.
    """
    running_val_loss = 0.0
    model.eval()

    # No gradients are needed for evaluation.
    with torch.no_grad():
        with tqdm(total=len(val_loader), desc=f'Validation {epoch+1}/{epochs}', unit='batch') as tepoch:
            for val_normal_img, val_depth_img in val_loader:
                # Move images to device
                val_normal_img = val_normal_img.to(device)
                val_depth_img = val_depth_img.to(device)

                # Forward pass
                output = model(val_normal_img)

                # The criterion takes (prediction, target). Passing the input
                # image as a third argument raised TypeError with nn.MSELoss
                # and with any other two-argument criterion.
                loss = criterion(output, val_depth_img)

                running_val_loss += loss.item()

                # The bar shows the cumulative validation loss so far.
                tepoch.set_postfix({'custom validation loss': running_val_loss})
                tepoch.update(1)

                # Optional: a side-by-side image (input | ground truth | prediction)
                # can be built here with `minmaxscaler` and saved for inspection.

    return running_val_loss

In [38]:
# Training driver
def train_model(model, criterion, optimizer, train_loader, val_loader, epochs, device,
                scheduler=None, checkpoint_offset=41):
    """Train for ``epochs`` epochs, checkpointing whenever validation improves.

    After each epoch the model is validated; if the validation loss is the
    lowest seen so far, ``model.state_dict()`` is saved to
    ``best_model_{epoch + checkpoint_offset}.pth`` in the working directory.

    Args:
        model: Network to train.
        criterion: Loss callable forwarded to the training and validation loops.
        optimizer: Optimizer forwarded to the training loop.
        train_loader: Training batches.
        val_loader: Validation batches.
        epochs: Number of epochs to run.
        device: Device the batches are moved to.
        scheduler: Optional LR scheduler forwarded to the training loop.
        checkpoint_offset: Number added to the zero-based epoch index in the
            checkpoint file name. Defaults to 41, the value that used to be
            hard-coded, so existing file naming is unchanged.

    Returns:
        The trained ``model`` (the same object that was passed in).
    """
    best_val_loss = np.inf

    for epoch in range(epochs):
        # One optimisation pass over the training data.
        training_loop(model, criterion, optimizer, train_loader, epoch, epochs, device, scheduler)

        # Measure generalisation on the held-out data.
        val_loss = validation_loop(model, criterion, val_loader, device, epoch, epochs)

        # if scheduler:
        #     scheduler.step(val_loss)  # Pass validation loss as the metric

        # Keep a checkpoint for every new best validation loss.
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), f'best_model_{epoch + checkpoint_offset}.pth')

        print(f'Validation Loss: {val_loss}')
        print(f'Best validation loss: {best_val_loss}')

    return model




In [21]:
# Build the depth-estimation network.
model = CustomDepthEstimationModel()

In [22]:
# Use the first CUDA device when one is available, otherwise fall back to the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
# Wrap the network in DataParallel. Saved state_dict keys then carry a
# 'module.' prefix. NOTE: re-running this cell wraps the already-wrapped model
# again, so run it once per fresh `model`.
model = nn.DataParallel(model) 
# As the last expression of the cell this also displays the module structure.
model.to(device)

DataParallel(
  (module): CustomDepthEstimationModel(
    (conv1): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (maxpool1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (conv3): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (conv4): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (maxpool2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (conv5): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (conv6): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (maxpool3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (conv7): Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (conv8): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (maxpool4): MaxPool2d(kerne

In [39]:
def scale_invariant_loss(y_pred, y_true):
    """Scale-invariant error between a prediction and its target.

    Computes ``mean(d**2) - 0.5 * mean(d)**2`` with ``d = y_true - y_pred``:
    the mean squared error minus half the squared mean error, which reduces
    the penalty for a uniform offset between prediction and target.

    The earlier version used ``*2`` (multiplication) where ``**2`` (squaring)
    was intended, which collapsed the expression to ``mean(d)`` -- a signed
    quantity that can be driven arbitrarily negative.

    Args:
        y_pred: Predicted tensor.
        y_true: Target tensor, broadcastable against ``y_pred``.

    Returns:
        Scalar tensor, always >= 0.
    """
    diff = y_true - y_pred
    mean_squared_diff = torch.mean(diff ** 2)
    squared_mean_diff = torch.mean(diff) ** 2
    return mean_squared_diff - 0.5 * squared_mean_diff

In [40]:
def edge_aware_smoothness_loss(pred, image):
    """Penalise prediction gradients, down-weighted where the image has edges.

    The absolute differences between neighbouring prediction pixels (along the
    last axis and along the second-to-last axis) are weighted by
    ``exp(-|image gradient|)``, where the image gradient is averaged over the
    channel axis (dim 1). Strong image gradients therefore give small weights.

    Args:
        pred: 4-D prediction tensor.
        image: 4-D image tensor with the same batch and spatial size as ``pred``.

    Returns:
        Scalar tensor: mean weighted gradient along the last axis plus mean
        weighted gradient along the second-to-last axis.
    """
    # Neighbour differences of the prediction.
    pred_grad_x = (pred[:, :, :, :-1] - pred[:, :, :, 1:]).abs()
    pred_grad_y = (pred[:, :, :-1, :] - pred[:, :, 1:, :]).abs()

    # Neighbour differences of the image, averaged over channels.
    image_grad_x = (image[:, :, :, :-1] - image[:, :, :, 1:]).abs().mean(dim=1, keepdim=True)
    image_grad_y = (image[:, :, :-1, :] - image[:, :, 1:, :]).abs().mean(dim=1, keepdim=True)

    # Edge-aware weights: close to 1 in flat regions, small across edges.
    weighted_x = pred_grad_x * torch.exp(-image_grad_x)
    weighted_y = pred_grad_y * torch.exp(-image_grad_y)

    return weighted_x.mean() + weighted_y.mean()

In [47]:
def combined_loss(y_pred, y_true, image=None):
    """Weighted sum of MSE, scale-invariant and (optionally) edge-aware terms.

    ``loss = MSE + 0.1 * scale_invariant [+ 0.1 * edge_aware_smoothness]``

    The function previously read an undefined global ``image`` and therefore
    raised ``NameError`` on every call. ``image`` is now an optional argument,
    so both ``combined_loss(pred, target)`` and
    ``combined_loss(pred, target, image)`` are valid.

    Args:
        y_pred: Predicted depth tensor.
        y_true: Ground-truth depth tensor.
        image: Optional input image batch. When given, the edge-aware
            smoothness term is added; when ``None`` it is skipped.

    Returns:
        Scalar loss tensor.
    """
    mse_loss = nn.MSELoss()(y_pred, y_true)
    scale_inv = scale_invariant_loss(y_pred, y_true)
    total = mse_loss + 0.1 * scale_inv

    if image is not None:
        # The smoothness term needs the input image to locate edges.
        total = total + 0.1 * edge_aware_smoothness_loss(y_pred, image)

    return total

In [32]:
num_epochs = 3

# Loss handed to train_model. The name was previously misspelled `crieterion`,
# leaving `criterion` undefined on a fresh kernel (or stale from an earlier cell).
# criterion = nn.MSELoss()
criterion = combined_loss

optimizer = optim.AdamW(model.parameters(), lr=0.0001)

# NOTE(review): eta_min equals the initial lr (0.0001), so this cosine schedule
# keeps the learning rate constant. Lower eta_min if annealing is intended.
# scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=4, gamma=0.1)
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=num_epochs, eta_min=0.0001)
# scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.1, patience=1, verbose=True)

model = train_model(model, criterion, optimizer, train_loader, val_loader, num_epochs, device, scheduler)

Epoch1/10: 100%|██████████| 209/209 [04:26<00:00,  1.27s/batch, Mean Squared Error=1.22] 
Validation 1/10: 100%|██████████| 27/27 [00:14<00:00,  1.86batch/s, Mean Squared Error=0.163] 


Validation Loss: 0.16323752515017986
Mean squared erro: 0.16323752515017986


Epoch2/10: 100%|██████████| 209/209 [03:50<00:00,  1.10s/batch, Mean Squared Error=1.17] 
Validation 2/10: 100%|██████████| 27/27 [00:12<00:00,  2.12batch/s, Mean Squared Error=0.188] 


Validation Loss: 0.18791671842336655
Mean squared erro: 0.16323752515017986


Epoch3/10: 100%|██████████| 209/209 [03:59<00:00,  1.15s/batch, Mean Squared Error=1.17] 
Validation 3/10: 100%|██████████| 27/27 [00:13<00:00,  2.06batch/s, Mean Squared Error=0.168] 


Validation Loss: 0.1683440306223929
Mean squared erro: 0.16323752515017986


Epoch4/10: 100%|██████████| 209/209 [03:56<00:00,  1.13s/batch, Mean Squared Error=1.14] 
Validation 4/10: 100%|██████████| 27/27 [00:12<00:00,  2.12batch/s, Mean Squared Error=0.161] 


Validation Loss: 0.16068708570674062
Mean squared erro: 0.16068708570674062


Epoch5/10: 100%|██████████| 209/209 [03:51<00:00,  1.11s/batch, Mean Squared Error=1.11] 
Validation 5/10: 100%|██████████| 27/27 [00:12<00:00,  2.08batch/s, Mean Squared Error=0.17]  


Validation Loss: 0.16976424679160118
Mean squared erro: 0.16068708570674062


Epoch6/10: 100%|██████████| 209/209 [03:50<00:00,  1.10s/batch, Mean Squared Error=1.12] 
Validation 6/10: 100%|██████████| 27/27 [00:12<00:00,  2.09batch/s, Mean Squared Error=0.154] 


Validation Loss: 0.15384415700100362
Mean squared erro: 0.15384415700100362


Epoch7/10: 100%|██████████| 209/209 [03:52<00:00,  1.11s/batch, Mean Squared Error=1.09] 
Validation 7/10: 100%|██████████| 27/27 [00:12<00:00,  2.08batch/s, Mean Squared Error=0.149] 


Validation Loss: 0.14911336870864034
Mean squared erro: 0.14911336870864034


Epoch8/10: 100%|██████████| 209/209 [03:50<00:00,  1.10s/batch, Mean Squared Error=1.05] 
Validation 8/10: 100%|██████████| 27/27 [00:12<00:00,  2.11batch/s, Mean Squared Error=0.15]  


Validation Loss: 0.15009353985078633
Mean squared erro: 0.14911336870864034


Epoch9/10: 100%|██████████| 209/209 [03:49<00:00,  1.10s/batch, Mean Squared Error=1.05] 
Validation 9/10: 100%|██████████| 27/27 [00:12<00:00,  2.12batch/s, Mean Squared Error=0.157] 


Validation Loss: 0.15718174213543534
Mean squared erro: 0.14911336870864034


Epoch10/10: 100%|██████████| 209/209 [03:51<00:00,  1.11s/batch, Mean Squared Error=1.03] 
Validation 10/10: 100%|██████████| 27/27 [00:12<00:00,  2.11batch/s, Mean Squared Error=0.152] 

Validation Loss: 0.1516664857044816
Mean squared erro: 0.14911336870864034





In [24]:
# Create the output directory; exist_ok makes this cell safe to re-run.
os.makedirs('/kaggle/working/outputs', exist_ok = True)
# Same directory (with a trailing slash), passed to test_prediction as the save location.
path = '/kaggle/working/outputs/'

In [25]:
def minmaxscaler_prediction(tensor):
    """Scale a PyTorch tensor to the range [0, 1].

    Args:
        tensor: Non-empty tensor to rescale using its global min and max.

    Returns:
        ``(tensor - min) / (max - min)``. When every element is equal
        (``max == min``) the result is all zeros instead of NaN.
    """
    lowest = tensor.min()
    span = tensor.max() - lowest
    # A constant tensor has zero span; dividing by 1 instead yields zeros
    # (the numerator is already zero) rather than 0/0 = NaN.
    safe_span = torch.where(span == 0, torch.ones_like(span), span)
    return (tensor - lowest) / safe_span

In [26]:
# # # test prediction code
def test_prediction(model, test_loader, device, path):
    """Run the model over ``test_loader`` and save every prediction as an image.

    Args:
        model: Network to evaluate; it is switched to eval mode.
        test_loader: Iterable yielding ``(images, names)`` batches.
        device: Device the image batch is moved to before the forward pass.
        path: Directory the images are written into, one file per sample,
            named after the matching entry of ``names``.

    Returns:
        None. The only effect is the files written under ``path``.
    """
    model.eval()
    with torch.no_grad():
        for batch, names in test_loader:
            predictions = model(batch.to(device))

            for idx, prediction in enumerate(predictions):
                # Rescale this sample to [0, 1] on the CPU, dropping size-1 axes.
                unit_range = minmaxscaler_prediction(prediction.detach().cpu().squeeze())

                # Stretch to 0-255 and truncate to 8-bit integers.
                pixels = (unit_range.numpy() * 255.0).astype(np.uint8)

                # Write as an 8-bit grayscale ('L') image under the sample's name.
                target_file = os.path.join(path, f"{names[idx]}")
                Image.fromarray(pixels, mode='L').save(target_file)
            
            # save_image(output, f'output_{idx}.png')
            # return output
            

In [33]:
#  loading the best model
# model.load_state_dict(torch.load('/kaggle/working/best_model_6.pth'))
# model.to(device)

DataParallel(
  (module): CustomDepthEstimationModel(
    (conv1): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (maxpool1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (conv3): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (conv4): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (maxpool2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (conv5): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (conv6): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (maxpool3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (conv7): Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (conv8): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (maxpool4): MaxPool2d(kerne

In [34]:
# Write one predicted depth image per test sample into `path`.
test_prediction(model, test_loader, device, path)

In [35]:
import os
import cv2
import pandas as pd
import numpy as np

def images_to_csv_with_metadata(image_folder, output_csv):
    """Flatten every ``.png`` in ``image_folder`` into one CSV row.

    Files are visited in sorted name order. Each PNG is read unchanged,
    resized to 128x128, min-max normalised, converted to uint8 in 0-255 and
    flattened. A row is ``[id, ImageID, pixel_0, pixel_1, ...]`` where ``id``
    is the file's position in the sorted directory listing (non-PNG entries
    still consume a position) and ``ImageID`` is the file name.

    Args:
        image_folder: Directory to scan for ``.png`` files.
        output_csv: Destination path of the CSV, written without an index column.
    """
    rows = []

    for position, name in enumerate(sorted(os.listdir(image_folder))):
        if not name.endswith(".png"):
            continue

        pixels = cv2.imread(os.path.join(image_folder, name), cv2.IMREAD_UNCHANGED)
        pixels = cv2.resize(pixels, (128, 128)) / 255.

        # Min-max normalise; the small epsilon avoids dividing by zero on flat images.
        lowest, highest = np.min(pixels), np.max(pixels)
        pixels = np.uint8((pixels - lowest) / (highest - lowest + 1e-6) * 255.)

        rows.append([position, name, *pixels.flatten().tolist()])

    # Pixel columns are named 0..N-1, with N taken from the first row (0 when empty).
    pixel_count = len(rows[0]) - 2 if rows else 0
    header = ["id", "ImageID", *range(pixel_count)]

    pd.DataFrame(rows, columns=header).to_csv(output_csv, index=False)

# Folder holding the predicted images (the directory test_prediction saved into above).
predictions_folder = "/kaggle/working/outputs"

# Output CSV path; relative, so the file lands in the current working directory.
predictions_csv = "predictions.csv"

# Flatten every prediction PNG in the folder into one CSV row.
images_to_csv_with_metadata(predictions_folder, predictions_csv)