<a href="https://colab.research.google.com/github/EmaMule/Computer-Vision/blob/main/CVUSA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Import and installing dependencies

In [1]:
%%capture

# @title Installing dependencies

!pip install tqdm
!pip install pytorch_lightning
!pip install patool

In [2]:
# @title Importing libraries

import pandas as pd
import numpy as np
import random
import warnings
import matplotlib.pyplot as plt
import os
from PIL import Image
from tqdm import tqdm
from google.colab import drive
import shutil
import csv
import cv2
import gdown
import patoolib

# pytorch
import torch
from torch.utils.data import Dataset, DataLoader, Sampler, random_split
from torch.utils.data.sampler import SubsetRandomSampler, SequentialSampler, RandomSampler, BatchSampler
from torch import nn
import torch.nn.functional as F
import torchvision.models as models
from torchvision import transforms
from torchtext.data.metrics import bleu_score

# pytorch lighting
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping, TQDMProgressBar, RichProgressBar, ModelPruning
from pytorch_lightning import loggers

In [3]:
# @title Folders Setup

shutil.rmtree('/content/input', ignore_errors = True)
os.mkdir('/content/input')

shutil.rmtree('/content/output', ignore_errors = True)
os.mkdir('/content/output')

shutil.rmtree('/content/output/log', ignore_errors = True)
os.mkdir('/content/output/log')

shutil.rmtree('/content/lightning_logs', ignore_errors = True)
os.mkdir('/content/lightning_logs')

In [4]:
# @title Downloading Dataset

url = 'https://drive.google.com/uc?id=17W9VEPMneRlb6igtSxa--Xh4fSZs3RS_'
output_file = '/content/input/CVUSA_subset.rar'
output_dir = '/content/input/data'

gdown.download(url, output_file)
patoolib.extract_archive(output_file, outdir = output_dir)

url = 'https://drive.google.com/uc?id=19fD1WMGTmusYk8E7ygT6nAJTluf3a_oH'
output_file = '/content/input/train.csv'
gdown.download(url, output_file)

url = 'https://drive.google.com/uc?id=1Rt6waJ6f-kM12Q2A9mgRxAcKfZPdg9IY'
output_file = '/content/input/val.csv'
gdown.download(url, output_file)

Downloading...
From (original): https://drive.google.com/uc?id=17W9VEPMneRlb6igtSxa--Xh4fSZs3RS_
From (redirected): https://drive.google.com/uc?id=17W9VEPMneRlb6igtSxa--Xh4fSZs3RS_&confirm=t&uuid=5e00e325-be87-49a5-a458-2b155f2356d2
To: /content/input/CVUSA_subset.rar
100%|██████████| 4.38G/4.38G [01:02<00:00, 70.2MB/s]
INFO patool: Extracting /content/input/CVUSA_subset.rar ...
INFO:patool:Extracting /content/input/CVUSA_subset.rar ...
INFO patool: ... creating output directory `/content/input/data'.
INFO:patool:... creating output directory `/content/input/data'.
INFO patool: running /usr/bin/unrar x -- /content/input/CVUSA_subset.rar
INFO:patool:running /usr/bin/unrar x -- /content/input/CVUSA_subset.rar
INFO patool:     with cwd='/content/input/data', input=''
INFO:patool:    with cwd='/content/input/data', input=''
INFO patool: ... /content/input/CVUSA_subset.rar extracted to `/content/input/data'.
INFO:patool:... /content/input/CVUSA_subset.rar extracted to `/content/input/data'.

'/content/input/val.csv'

In [5]:
# @title Settings

pl.seed_everything(42)
device = "cuda" if torch.cuda.is_available() else "cpu"

INFO:lightning_fabric.utilities.seed:Seed set to 42


#Dataset and DataModule

We need to also possibly add polar and segmap! not done right now because there is a problem with the csv files. Also no test set, should we use validation or split the training and use the current validation as test?

In [53]:
# @title Dataset definition: without using polar transforms (neither segmentation)

# Expected dataset structure: the input_dir contains the split cvs files and a
# subdirectory named 'data' with the CVUSA dataset

class CVUSADataset(Dataset):

    def __init__(self, input_dir, split = 'train'):
        self.split = split
        self.data = self.load_data(input_dir + f'/{split}.csv')


    def load_data(self, csv_path):
        data = []
        with open(csv_path, 'r') as file:
            csv_reader = csv.reader(file)
            next(csv_reader) #skip header
            for row in csv_reader:
                streetview_path = row[1]
                satmap_path = row[0] #we are using normal images
                #segmap = row[2]
                data.append({"streetview_path": streetview_path, "satmap_path": satmap_path})

        return data


    def __len__(self):
        return len(self.data)


    def __getitem__(self, index):
        dictionary = self.data[index]
        streetview_path = dictionary['streetview_path']
        satmap_path = dictionary['satmap_path']
        return streetview_path, satmap_path


    def __str__(self):
        return f"CVUSA-Dataset-{self.split}: {len(self.data)} samples"

In [54]:
# @title Data module definition: without using polar transforms (neither segmentation)

class CVUSADataModule(pl.LightningDataModule):

    def __init__(self, input_dir, batch_size=8, resize_grd = None, resize_sat = None):
        # Initialize the CustomDataModule
        super(CVUSADataModule, self).__init__()
        self.batch_size = batch_size
        self.input_dir = input_dir
        self.data_dir = input_dir + '/data'
        self.resize_grd = resize_grd
        self.resize_sat = resize_sat
        self.grd_size = None
        self.sat_size = None
        self.transform = None


    def setup(self, stage=None):

        # Load the entire dataset
        self.train_dataset = CVUSADataset(input_dir=self.input_dir, split='train')
        print(self.train_dataset)

        #self.test_dataset = CVUSADataset(input_dir=self.input_dir, split='test')
        #print(self.test_dataset)

        self.val_dataset = CVUSADataset(input_dir=self.input_dir, split='val')
        print(self.val_dataset)

        grd_sample, sat_sample = self.train_dataset[0]
        grd_image = Image.open(os.path.join(self.data_dir, grd_sample))
        sat_image = Image.open(os.path.join(self.data_dir, sat_sample))

        if self.resize_grd:
          self.grd_size = transforms.Resize((self.resize_grd))(grd_image).size
        else:
          self.grd_size = grd_image.size

        if self.resize_sat:
          self.sat_size = transforms.Resize((self.resize_sat))(sat_image).size
        else:
          self.sat_size = sat_image.size


    #collate function is useful so we don't overuse RAM, training is a little bit slower tho...
    def collate_fn(self,batch):

        streetview_path, satmap_path = zip(*batch)

        # Load and transform each image in the batch
        streetview_ids, streetview_images_tensor = self.__images_to_tensor(streetview_path, 'grd')
        satmap_ids, satmap_images_tensor = self.__images_to_tensor(satmap_path, 'sat')

        streetviews = {'imgs': streetview_images_tensor, 'imgs_id': streetview_ids}
        satmaps = {'imgs': satmap_images_tensor, 'imgs_id': satmap_ids}

        return streetviews, satmaps


    # we could add transformations (first of all normalization of the input!)
    def __images_to_tensor(self, paths, img_type):
        images = []
        ids = []
        resize = self.resize_grd if img_type == 'grd' else self.resize_sat
        for img_path in paths:
            img = Image.open(os.path.join(self.data_dir, img_path))
            if resize:
              img = transforms.Resize((resize))(img)
            img_tensor = transforms.ToTensor()(img)
            images.append(img_tensor)
            ids.append(int(img_path[-11:-4]))

        # Stack the image tensors along the batch dimension
        images_tensor = torch.stack(images)
        ids_tensor = torch.tensor(ids, dtype=int)
        return ids_tensor, images_tensor


    def train_dataloader(self):
        return DataLoader(self.train_dataset,batch_size=self.batch_size,collate_fn=self.collate_fn,shuffle=True,num_workers=2)


    def val_dataloader(self):
        return DataLoader(self.val_dataset,batch_size=self.batch_size,collate_fn=self.collate_fn,shuffle=False,num_workers=2)


    #def test_dataloader(self):
    #    return DataLoader(self.test_dataset,batch_size=self.batch_size, collate_fn=self.collate_fn,shuffle=True,num_workers=2)

In [73]:
# @title Creating dataloaders: without using polar transforms (neither segmentation)
input_dir = '/content/input'

data_module = CVUSADataModule(input_dir = input_dir, batch_size = 64, resize_grd = 64, resize_sat = 128)
data_module.setup()

train_loader = data_module.train_dataloader()
val_loader = data_module.val_dataloader()
#test_loader = data_module.test_dataloader()

CVUSA-Dataset-train: 6647 samples
CVUSA-Dataset-val: 2215 samples


#Losses and other utilities

In [24]:
# # @title TripletLoss implementation
# class TripletLoss(pl.LightningModule):
#     def __init__(self, margin=1.0):
#         super(TripletLoss, self).__init__()
#         self.margin = margin

#     def calc_euclidean(self, x1, x2):
#         return (x1 - x2).pow(2).sum(1)

#     def forward(self, anchor: torch.Tensor, positive: torch.Tensor, negative: torch.Tensor) -> torch.Tensor:
#         distance_positive = self.calc_euclidean(anchor, positive)
#         distance_negative = self.calc_euclidean(anchor, negative)
#         losses = torch.relu(distance_positive - distance_negative + self.margin)

#         return losses.mean()

In [25]:
# # @title TripletLoss implementation
# class TripletLoss(pl.LightningModule):
#     def __init__(self, margin=1.0):
#         super(TripletLoss, self).__init__()
#         self.margin = margin

#     def forward(self, image_features1, image_features2, k = None):
#         N = len(image_features1)
#         if k is None:
#           k = N-1
#         # Calcolare le distanze Euclidee tra le features
#         distances_per_image1 = torch.cdist(image_features1, image_features2, p=2)  # p=2 per distanza Euclidea
#         distances_per_image2 = distances_per_image1.T  # Per simmetria
#         #in common between the losses:
#         mask = torch.eye(distances_per_image1.size(0), dtype=torch.bool, device = device)
#         distances_diag = torch.masked_select(distances_per_image1, mask) @ torch.eye(n=N, device = device)

#         distances_positive_matrix = distances_diag @ torch.ones(size=(N,k), device = device) #da controllare
#         margin_matrix = torch.ones(size = (N,k), device = device) * self.margin


#         distances_no_diag = torch.masked_select(distances_per_image1, mask.logical_not()).view(distances_per_image1.size(0), -1) #need to see if is NxN-1
#         distances_no_diag, _ = torch.topk(distances_no_diag, k = k, dim = 1, largest = False)
#         loss_matrix = torch.relu(distances_positive_matrix - distances_no_diag + margin_matrix)

#         loss1 = torch.mean(loss_matrix, dim=(0,1))

#         distances_no_diag = torch.masked_select(distances_per_image2, mask.logical_not()).view(distances_per_image2.size(0), -1) #need to see if is NxN-1
#         distances_no_diag, _ = torch.topk(distances_no_diag, k = k, dim = 1, largest = False)
#         loss_matrix = torch.relu(distances_positive_matrix - distances_no_diag + margin_matrix)

#         loss2 = torch.mean(loss_matrix, dim=(0,1))

#         return (loss1 + loss2)/2

In [26]:
#@title Implementation TripletLoss more stable

# RIVEDI (confronto con triplet torchreid)

class TripletLoss(pl.LightningModule):
    def __init__(self, loss_weight = 1e-2):
        super().__init__()
        self.loss_weight = loss_weight

    def forward(self, image_features1, image_features2):
        #image_features1 = F.normalize(image_features1, dim=-1)
        #image_features2 = F.normalize(image_features2, dim=-1)
        dist_array = 2.0 - 2.0 * torch.matmul(image_features2, image_features1.T)
        n = len(image_features1)
        pos_dist = torch.diag(dist_array)
        pair_n = n * (n - 1.0)
        triplet_dist_g2s = pos_dist - dist_array
        loss_g2s = torch.sum(torch.log(1.0 + torch.exp(triplet_dist_g2s * self.loss_weight)))/pair_n
        triplet_dist_s2g = torch.unsqueeze(pos_dist, 1) - dist_array
        loss_s2g = torch.sum(torch.log(1.0 + torch.exp(triplet_dist_s2g * self.loss_weight)))/pair_n
        loss = (loss_g2s + loss_s2g) / 2.0

        return loss

In [27]:
#@title InfoNCE implementation
class InfoNCE(pl.LightningModule):

    def __init__(self, loss_function):
        super().__init__()

        self.loss_function = loss_function #we can use a generic loss function!

    def forward(self, image_features1, image_features2, logit_scale):

        image_features1 = F.normalize(image_features1, dim=-1)
        image_features2 = F.normalize(image_features2, dim=-1)

        logits_per_image1 = logit_scale * image_features1 @ image_features2.T #similarity matrix
        logits_per_image2 = logits_per_image1.T

        labels = torch.arange(len(logits_per_image1), dtype=torch.long, device = device)
        loss = (self.loss_function(logits_per_image1, labels) + self.loss_function(logits_per_image2, labels))/2

        return loss

In [28]:
# @title Top-K Rank Accuracy: takes embeddings in input

def top_k_rank_accuracy(emb1, emb2, k=1):
    if k > len(emb1) :
      return 0.0 #might happen at the end of the dataset (batch less then the chosen one)
    # Calculate cosine similarity
    correct_in_top_k = 0
    for index, elem in enumerate(emb1):
        cosine_sim = F.cosine_similarity(elem, emb2, dim=1)

        # Find the rank K similarity
        top_k_similarities, top_k_indices = torch.topk(cosine_sim, k, largest=True)

        correct_in_top_k += index in top_k_indices.tolist()

    accuracy = correct_in_top_k / len(emb1)
    return accuracy

In [29]:
# @title Generation of negatives: modularized so that we can experiment with a lot of versions!
def generate_negatives(positives):
  shuffled_indices = torch.randperm(positives.size(0))
  negatives = positives[shuffled_indices]

  return negatives

In [30]:
class Attention(pl.LightningModule):
    def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0.):
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads
        # NOTE scale factor was wrong in my original version, can set manually to be compat with prev weights
        self.scale = qk_scale or head_dim ** -0.5

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(self, x):
        B, N, C = x.shape
        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
        q, k, v = qkv[0], qkv[1], qkv[2]

        attn = (q @ k.transpose(-2, -1)) * self.scale
        attn = attn.softmax(dim=-1)
        attn = self.attn_drop(attn)
        x = (attn @ v).transpose(1, 2).reshape(B, N, C)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x

# Training Architectures

In [56]:
# @title Resnet

class ResNet50Branch(pl.LightningModule):

    def __init__(self):
        super(ResNet50Branch, self).__init__()
        self.resnet50 = models.resnet50(weights = models.ResNet50_Weights.DEFAULT)
        # Modify the last layer for your specific task
        self.resnet50.fc = torch.nn.Linear(self.resnet50.fc.in_features, 512)


    def forward(self, x, featuremaps = False):

        # to print the featuremap we need to return the last conv layer output
        if featuremaps:
            x = self.resnet50.conv1(x)
            x = self.resnet50.bn1(x)
            x = self.resnet50.relu(x)
            x = self.resnet50.maxpool(x)
            x = self.resnet50.layer1(x)
            x = self.resnet50.layer2(x)
            x = self.resnet50.layer3(x)
            x = self.resnet50.layer4(x)
            return x

        else:
            return self.resnet50(x)



class DualResNet50Model(pl.LightningModule):

    def __init__(self):
        super(DualResNet50Model, self).__init__()
        self.branch1 = ResNet50Branch()
        self.branch2 = ResNet50Branch()

        self.loss1 = TripletLoss()
        self.loss2 = InfoNCE(loss_function=nn.CrossEntropyLoss())


    def forward(self, x1, x2):
        out1 = self.branch1(x1['imgs'])
        out2 = self.branch2(x2['imgs'])
        return out1, out2


    def training_step(self, batch, batch_idx):
        streetview, bingmap = batch
        out1, out2 = self(streetview, bingmap)

        loss = self.loss2(out1, out2, logit_scale = 3.0)
        #loss = self.loss1(out1, out2) + self.loss2(out1, out2, logit_scale = 3.0)

        accuracy_top_1 = top_k_rank_accuracy(out1, out2, k = 1)

        self.log('train_top1', accuracy_top_1, on_step=True, on_epoch=True, prog_bar=True)
        self.log("train_loss", loss, on_step=True, on_epoch=True, prog_bar=True)
        return loss


    def validation_step(self, batch, batch_idx):
        streetview, bingmap = batch
        out1, out2 = self(streetview, bingmap)

        accuracy_top_1 = top_k_rank_accuracy(out1, out2, k = 1)
        accuracy_top_3 = top_k_rank_accuracy(out1, out2, k = 3)
        accuracy_top_10 = top_k_rank_accuracy(out1, out2, k = 10)

        self.log('val_top1', accuracy_top_1, on_step=True, on_epoch=True, prog_bar=True)
        self.log('val_top3', accuracy_top_3, on_step=True, on_epoch=True, prog_bar=True)
        self.log('val_top10', accuracy_top_10, on_step=True, on_epoch=True, prog_bar=True)

        return accuracy_top_1, accuracy_top_3, accuracy_top_10


    def test_step(self, batch, batch_idx):
        streetview, bingmap = batch
        out1, out2 = self(streetview, bingmap)

        accuracy_top_1 = top_k_rank_accuracy(out1, out2, k = 1)
        accuracy_top_3 = top_k_rank_accuracy(out1, out2, k = 3)
        accuracy_top_10 = top_k_rank_accuracy(out1, out2, k = 10)

        self.log('test_top1', accuracy_top_1, on_step=True, on_epoch=True, prog_bar=True)
        self.log('test_top3', accuracy_top_3, on_step=True, on_epoch=True, prog_bar=True)
        self.log('test_top10', accuracy_top_10, on_step=True, on_epoch=True, prog_bar=True)

        return accuracy_top_1, accuracy_top_3, accuracy_top_10


    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=1e-3)

In [95]:
# P nel disegno indica la dimensione di ogni patch
# devi usare lo stride per diminuire la W and H dell'input
# però aumentando il numero di channel (dimensione embedding)

# i positional embedding sono un vettore di vettori da imparare direttamente
# self.pos_embed = nn.Parameter(torch.zeros(1, num_patches, embed_dim))
# self.pos_drop = nn.Dropout(p=drop_rate)

class ConvBnReluBlock(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size=3, stride=1, padding=1):
        super(ConvBnReluBlock, self).__init__()
        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding)
        self.bn = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        x = self.conv(x)
        x = self.bn(x)
        x = self.relu(x)
        return x



class Block(pl.LightningModule):

    def __init__(
        self,
        dim,
        num_heads,
        qkv_bias=False,
        qk_scale=None,
        drop=0.,
        attn_drop=0.,
        dropout=0.,
        norm_layer=nn.LayerNorm
    ):

        super().__init__()
        self.norm1 = norm_layer(dim)
        self.attn = Attention(
            dim,
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            qk_scale=qk_scale,
            attn_drop=attn_drop,
            proj_drop=drop
        )
        # check what is droppath
        self.dropout = nn.Dropout(dropout) if dropout > 0. else nn.Identity()


    def forward(self, x):
        B, N, C = x.shape
        x = x + self.dropout(self.attn(self.norm1(x)))
        return x



class SAIGBranch(pl.LightningModule):

    def __init__(self, img_size, patch_size=16, in_channels=3, embed_dim=768, norm_layer=None, flatten=True):
        super(SAIGBranch, self).__init__()
        embed_dim = 384
        num_heads = 8
        depth = 2
        qkv_bias = True
        qk_scale = None
        drop_rate = 0
        attn_drop_rate = 0

        self.img_size = img_size
        self.patch_size = patch_size
        self.grid_size = (img_size[0] // patch_size, img_size[1] // patch_size)

        if img_size[0] % patch_size != 0 or img_size[1] % patch_size != 0:
          print("Warning: image size is not divisible for patch size")

        self.num_patches = self.grid_size[0] * self.grid_size[1]

        self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()

        self.conv_bn_relu_blocks = nn.Sequential(
            ConvBnReluBlock(in_channels = 3, out_channels = 64, stride = 2),
            ConvBnReluBlock(in_channels = 64, out_channels = 128, stride = 2),
            ConvBnReluBlock(in_channels = 128, out_channels = 128, stride = 1),
            ConvBnReluBlock(in_channels = 128, out_channels = 256, stride = 2),
            ConvBnReluBlock(in_channels = 256, out_channels = 256, stride = 1),
            ConvBnReluBlock(in_channels = 256, out_channels = 512, stride = 2),
        )
        self.patch_block = nn.Conv2d(in_channels = 512, out_channels = embed_dim, kernel_size=1, stride=1 ,padding=0)
        self.attn_blocks = nn.ModuleList([
            Block(
                dim=embed_dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale,
                drop=drop_rate, attn_drop=attn_drop_rate)
            for i in range(depth)])

        self.pos_embed = nn.Parameter(torch.zeros(1, self.num_patches, embed_dim))
        #self.pos_embed = nn.Parameter(torch.zeros(1, embed_dim, self.num_patches))
        self.pos_drop = nn.Dropout(p=drop_rate)

        #self.GAP = nn.AdaptiveAvgPool1d(1)

        #self.logits = nn.Linear(in_features = embed_dim, out_features = 512)

        self.smd = nn.Sequential(
            nn.Linear(self.num_patches, self.num_patches*4),
            nn.GELU(),
            nn.Linear(self.num_patches*4, self.num_patches),
            nn.Linear(self.num_patches, 8)
        )


    def forward(self, x, featuremaps = False):

      # extract patch embeddings
      x = self.conv_bn_relu_blocks(x)
      x = self.patch_block(x)
      x = x.flatten(2).transpose(1,2)
      #x = self.norm(x) CHECK

      # add position embeddings
      x = x + self.pos_embed
      x = self.pos_drop(x)

      # pass through sequence of attention blocks
      for blk in self.attn_blocks:
          x = blk(x)

      x = self.norm(x)
      # x = self.GAP(x.transpose(-1, -2)).squeeze(2)
      # x = self.logits(x)

      # x: b x 88 x 384
      x = x.transpose(-1, -2)
      x = self.smd(x)
      x = x.transpose(-1, -2)
      x = x.flatten(-2, -1)

      return x



class DualSAIGModel(pl.LightningModule):

    def __init__(self, grd_size, sat_size):
        super(DualSAIGModel, self).__init__()
        self.branch1 = SAIGBranch(grd_size)
        self.branch2 = SAIGBranch(sat_size)

        self.loss1 = TripletLoss()
        self.loss2 = InfoNCE(loss_function=nn.CrossEntropyLoss())


    def forward(self, x1, x2):
        out1 = self.branch1(x1['imgs'])
        out2 = self.branch2(x2['imgs'])
        return out1, out2


    def training_step(self, batch, batch_idx):
        streetview, bingmap = batch
        out1, out2 = self(streetview, bingmap)

        loss = self.loss2(out1, out2, logit_scale = 3.0)
        #loss = self.loss1(out1, out2) + self.loss2(out1, out2, logit_scale = 3.0)

        accuracy_top_1 = top_k_rank_accuracy(out1, out2, k = 1)

        self.log('train_top1', accuracy_top_1, on_step=True, on_epoch=True, prog_bar=True)
        self.log("train_loss", loss, on_step=True, on_epoch=True, prog_bar=True)
        return loss


    def validation_step(self, batch, batch_idx):
        streetview, bingmap = batch
        out1, out2 = self(streetview, bingmap)

        accuracy_top_1 = top_k_rank_accuracy(out1, out2, k = 1)
        accuracy_top_3 = top_k_rank_accuracy(out1, out2, k = 3)
        accuracy_top_10 = top_k_rank_accuracy(out1, out2, k = 10)

        self.log('val_top1', accuracy_top_1, on_step=True, on_epoch=True, prog_bar=True)
        self.log('val_top3', accuracy_top_3, on_step=True, on_epoch=True, prog_bar=True)
        self.log('val_top10', accuracy_top_10, on_step=True, on_epoch=True, prog_bar=True)

        return accuracy_top_1, accuracy_top_3, accuracy_top_10


    def test_step(self, batch, batch_idx):
        streetview, bingmap = batch
        out1, out2 = self(streetview, bingmap)

        accuracy_top_1 = top_k_rank_accuracy(out1, out2, k = 1)
        accuracy_top_3 = top_k_rank_accuracy(out1, out2, k = 3)
        accuracy_top_10 = top_k_rank_accuracy(out1, out2, k = 10)

        self.log('test_top1', accuracy_top_1, on_step=True, on_epoch=True, prog_bar=True)
        self.log('test_top3', accuracy_top_3, on_step=True, on_epoch=True, prog_bar=True)
        self.log('test_top10', accuracy_top_10, on_step=True, on_epoch=True, prog_bar=True)

        return accuracy_top_1, accuracy_top_3, accuracy_top_10


    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=1e-3)

In [96]:
model = DualSAIGModel(data_module.grd_size, data_module.sat_size)

In [None]:
model = DualResNet50Model()

In [97]:
trainer = pl.Trainer(
    max_epochs = 30,
    devices = 1,
    callbacks = [RichProgressBar()],
    log_every_n_steps = 3
)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


In [None]:
trainer.fit(
    model = model,
    train_dataloaders = train_loader,
    val_dataloaders = val_loader
)

INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Output()

In [None]:
trainer.test(
    dataloaders = val_loader
)



ValueError: `.test(ckpt_path="best")` is set but `ModelCheckpoint` is not configured to save the best model.

#Depth estimation

In [None]:
from transformers import pipeline

checkpoint = "vinvino02/glpn-nyu"
depth_estimator = pipeline("depth-estimation", model=checkpoint)

In [None]:
from PIL import Image
import requests

url = "https://unsplash.com/photos/HwBAsSbPBDU/download?ixid=MnwxMjA3fDB8MXxzZWFyY2h8MzR8fGNhciUyMGluJTIwdGhlJTIwc3RyZWV0fGVufDB8MHx8fDE2Nzg5MDEwODg&force=true&w=640"
image = Image.open(requests.get(url, stream=True).raw)
image

In [None]:
predictions = depth_estimator(image)

In [None]:
predictions["depth"]

In [None]:
from transformers import AutoImageProcessor, AutoModelForDepthEstimation

checkpoint = "vinvino02/glpn-nyu"

image_processor = AutoImageProcessor.from_pretrained(checkpoint)
model = AutoModelForDepthEstimation.from_pretrained(checkpoint)

In [None]:
image = Image.open("/content/drive/MyDrive/CV Project/CVUSA/streetview/0000052.jpg")
with torch.no_grad():
      pixel_values = image_processor(image, return_tensors="pt").pixel_values
      outputs = model(pixel_values)
      predicted_depth = outputs.predicted_depth

In [None]:
import numpy as np

# interpolate to original size
prediction = torch.nn.functional.interpolate(
    predicted_depth.unsqueeze(1),
    size=image.size[::-1],
    mode="bicubic",
    align_corners=False,
).squeeze()
output = prediction.numpy()

formatted = (output * 255 / np.max(output)).astype("uint8")
depth = Image.fromarray(formatted)
depth

In [None]:
#road
#building
#low vegetation
#trees
#car
#clutter

In [None]:
#negative sampling basato sulla similarità ovvero computo la similarità di un sample con tutti gli altri e uso quelli più vicini come negativi

In [None]:
#soft margin triplet loss and weighted soft margin triplet loss

# Visualization Functions

In [None]:
# @title Visualize Heatmap

# to visualize the images correctly, they must be de-normalized, so we must give
# in input their mean and std

IMAGENET_MEAN = [0.485, 0.456, 0.406]
IMAGENET_STD = [0.229, 0.224, 0.225]
GRID_SPACING = 10

@torch.no_grad()
def visactmap(
    model,
    data_loader,
    save_dir,
    use_gpu,
    img_mean=None,
    img_std=None
):

    if img_mean is None or img_std is None:
        img_mean = IMAGENET_MEAN
        img_std = IMAGENET_STD

    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    model.eval()

    print('Visualizing activation maps')

    for batch_idx, data in enumerate(data_loader):

        # for now visualize only streetviews
        streetview_imgs, streetview_ids = data[0]['imgs'], data[0]['imgs_id']
        satmap_imgs, satmap_ids = data[1]['imgs'], data[1]['imgs_id']

        if use_gpu:
            streetview_imgs = streetview_imgs.cuda()
            satmap_imgs = satmap_imgs.cuda()

        try:
            streetview_outputs = model.branch1(streetview_imgs, featuremaps=True)
            satmap_outputs = model.branch2(satmap_imgs, featuremaps=True)
        except TypeError:
            raise TypeError('model.forward() doesn\'t have featuremaps field')
        if streetview_outputs.dim() != 4 or satmap_outputs.dim() != 4:
            raise ValueError('model output is supposed to have 4 dimensions')

        # compute activation maps for streetview (try adding square root?)
        streetview_outputs = (streetview_outputs**2).sum(1)
        b, h, w = streetview_outputs.size()
        streetview_outputs = streetview_outputs.view(b, h * w)
        streetview_outputs = nn.functional.normalize(streetview_outputs, p=2, dim=1)
        streetview_outputs = streetview_outputs.view(b, h, w)

        # compute activation maps for satmap
        satmap_outputs = (satmap_outputs**2).sum(1)
        b, h, w = satmap_outputs.size()
        satmap_outputs = satmap_outputs.view(b, h * w)
        satmap_outputs = nn.functional.normalize(satmap_outputs, p=2, dim=1)
        satmap_outputs = satmap_outputs.view(b, h, w)

        if use_gpu:
            streetview_imgs, streetview_outputs = streetview_imgs.cpu(), streetview_outputs.cpu()
            satmap_imgs, satmap_outputs = satmap_imgs.cpu(), satmap_outputs.cpu()

        for j in range(streetview_outputs.size(0)):

            # get image name
            imname = str(int(streetview_ids[j])).zfill(7)

            # RGB image (from the normalized input image)
            img = streetview_imgs[j, ...]
            for t, m, s in zip(img, img_mean, img_std):
                t.mul_(s).add_(m).clamp_(0, 1)
            img = np.uint8(np.floor(img.numpy() * 255))
            img = img.transpose((1, 2, 0))

            height, width, _ = img.shape

            # activation map (from the output image)
            am = streetview_outputs[j, ...].numpy()
            am = cv2.resize(am, (width, height))
            am = 255 * (am - np.min(am)) / (np.max(am) - np.min(am) + 1e-12)
            am = np.uint8(np.floor(am))
            am = cv2.applyColorMap(am, cv2.COLORMAP_JET)

            # overlapping between the two images
            overlapped = img*0.5 + am*0.5
            overlapped[overlapped > 255] = 255
            overlapped = overlapped.astype(np.uint8)

            # save images in a single figure (add white spacing between images)
            grid = 255 * np.ones((3*height + 2*GRID_SPACING, width, 3), dtype=np.uint8)
            grid[:height, :, :] = img[:, :, ::-1]
            grid[height + GRID_SPACING:2*height + GRID_SPACING, :, :] = am
            grid[2*height + 2*GRID_SPACING:, :, :] = overlapped
            cv2.imwrite(os.path.join(save_dir, imname + '_streetview.jpg'), grid)

        for j in range(satmap_outputs.size(0)):

            # get image name
            imname = str(int(satmap_ids[j])).zfill(7)

            # RGB image (input image)
            img = satmap_imgs[j, ...]
            for t, m, s in zip(img, img_mean, img_std):
                t.mul_(s).add_(m).clamp_(0, 1)
            img = np.uint8(np.floor(img.numpy() * 255))
            img = img.transpose((1, 2, 0))

            height, width, _ = img.shape

            # activation map
            am = satmap_outputs[j, ...].numpy()
            am = cv2.resize(am, (width, height))
            am = 255 * (am - np.min(am)) / (np.max(am) - np.min(am) + 1e-12)
            am = np.uint8(np.floor(am))
            am = cv2.applyColorMap(am, cv2.COLORMAP_JET)

            # overlapped
            overlapped = img*0.5 + am*0.5
            overlapped[overlapped > 255] = 255
            overlapped = overlapped.astype(np.uint8)

            # save images in a single figure (add white spacing between images)
            grid = 255 * np.ones((3*height + 2*GRID_SPACING, width, 3), dtype=np.uint8)
            grid[:height, :, :] = img[:, :, ::-1]
            grid[height + GRID_SPACING:2*height + GRID_SPACING, :, :] = am
            grid[2*height + 2*GRID_SPACING:, :, :] = overlapped
            cv2.imwrite(os.path.join(save_dir, imname + '_satmap.jpg'), grid)

        if (batch_idx+1) % 10 == 0:
            print('- done batch {}/{}'.format(batch_idx + 1, len(data_loader)))

In [None]:
!rm -f -r '/content/output/log/visactmap'
!mkdir '/content/output/log/visactmap'

visactmap(
    model = model,
    data_loader = val_loader,
    save_dir = '/content/output/log/visactmap',
    use_gpu = True,
    img_mean = [0, 0, 0],
    img_std = [1, 1, 1]
)