# Installations (timm + FAISS)

In [1]:
# Install timm (used below to build the backbone) and the GPU build of FAISS
# (used below for nearest-neighbour search over embeddings).
!pip install -q timm faiss-gpu
# Pin Pillow to version 9.0.0.
!pip install -q Pillow==9.0.0
# Remove torchtext from the environment.
# NOTE(review): presumably this avoids a version clash with the installed torch — confirm.
!pip uninstall -y torchtext

# Imports

In [2]:
import math
from typing import Callable, Dict, Optional, Tuple
from pathlib import Path
import cv2
import time
import random
import math
from scipy import spatial
from tqdm import tqdm
import warnings

from albumentations.pytorch import transforms
import albumentations as A

import faiss
import numpy as np
import pandas as pd
import pytorch_lightning as pl
import timm
import torch
import torch.nn as nn
import torch.nn.functional as F
from PIL import Image
from pytorch_lightning.callbacks.model_checkpoint import ModelCheckpoint
from timm.data.transforms_factory import create_transform
from timm.optim import create_optimizer_v2
from torch.utils.data import DataLoader, Dataset
from tqdm.notebook import tqdm
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import normalize, LabelEncoder

# Paths & Settings

In [3]:
#@title Download images
import os
# Remote tarball with the images and the local directory it is unpacked into.
SOURCE_URL = 'https://storage.googleapis.com/dm-turtle-recall/images.tar'
IMAGE_DIR = './turtle_recall/images'
# The tarball is saved inside IMAGE_DIR and deleted again once it has been extracted.
TAR_PATH = os.path.join(IMAGE_DIR, os.path.basename(SOURCE_URL))
# Number of directory entries IMAGE_DIR is expected to hold after extraction.
EXPECTED_IMAGE_COUNT = 13891

%sx mkdir --parents "{IMAGE_DIR}"
# Download and extract only when the directory does not already hold the expected number of entries.
if len(os.listdir(IMAGE_DIR)) != EXPECTED_IMAGE_COUNT:
  %sx wget --no-check-certificate -O "{TAR_PATH}" "{SOURCE_URL}"
  %sx tar --extract --file="{TAR_PATH}" --directory="{IMAGE_DIR}"
  %sx rm "{TAR_PATH}"

print(f'The total number of images is: {len(os.listdir(IMAGE_DIR))}')

In [4]:
import pandas as pd
import requests
import io
import urllib.parse

BASE_URL = 'https://storage.googleapis.com/dm-turtle-recall/'

N_SPLITS = 5

def read_csv_from_web(file_name):
    """Download a CSV file located under ``BASE_URL`` and parse it.

    Args:
        file_name: Name of the file, resolved relative to ``BASE_URL``.

    Returns:
        A ``pandas.DataFrame`` built from the UTF-8 decoded response body.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server stops responding for 60 seconds.
    """
    url = urllib.parse.urljoin(BASE_URL, file_name)
    response = requests.get(url, timeout=60)
    # Without this check an error page would be handed to pandas and surface
    # as a confusing parse error (or a nonsense frame) much later.
    response.raise_for_status()
    return pd.read_csv(io.StringIO(response.content.decode('utf-8')))


# Read in csv files.
train_df = read_csv_from_web('train.csv')
extra_train = read_csv_from_web('extra_images.csv')
test_df = read_csv_from_web('test.csv')
sample_submission = read_csv_from_web('sample_submission.csv')

# Convert image_location strings to lowercase.
for df in [train_df, test_df]:
    df.image_location = df.image_location.apply(lambda x: x.lower())
    # Sanity check: after lower-casing, exactly these three views must be present.
    assert set(df.image_location.unique()) == set(['left', 'right', 'top'])

In [5]:
# Turn every image id into a file name ('.JPG' suffix) and derive the path of
# that file inside the extracted image directory, for all three frames.
for frame in (train_df, test_df, extra_train):
    frame['image_id'] = frame['image_id'] + '.JPG'
    frame['image_path'] = frame['image_id'].apply(lambda x : f'./turtle_recall/images/{str(x)}')

In [6]:
extra_train['image_location'] = "unknown"

In [7]:
train_df = pd.concat([train_df, extra_train], axis=0, ignore_index=True)
train_df

In [8]:
train_df.turtle_id.nunique()

In [9]:
# Integer code for each view; 'unknown' is the value given to the extra images above.
location_codes = {'top':0, 'left':1, 'right':2, 'unknown' : 3}
train_df['image_location'] = train_df['image_location'].map(location_codes)
test_df['image_location'] = test_df['image_location'].map(location_codes)

In [10]:
label_counts = dict(train_df.turtle_id.value_counts())

In [11]:
train_df.head()

In [12]:
train_df['label_count'] = [label_counts[x] for x in train_df.turtle_id]
train_df

In [13]:
train_df.label_count.min(), train_df.label_count.max()

In [14]:
extra_train.turtle_id.value_counts()

In [15]:
extra_train.turtle_id.nunique()

In [16]:
# Keep only turtles with more than 10 images, i.e. drop classes with 10 or fewer.
train_df = train_df[train_df['label_count'] > 10].reset_index(drop=True)

In [17]:
train_df.head()

In [18]:
train_df.shape

## Train DataFrame

In [19]:
# Replace the turtle ids with integer class indices and save the class list,
# which load_encoder() reads back later to recover the original ids.
encoder = LabelEncoder()
train_df["turtle_id"] = encoder.fit_transform(train_df["turtle_id"])
np.save('turtle_ids.npy', encoder.classes_)

# Tag each row with the index of the stratified fold in which it is a validation row.
skf = StratifiedKFold(n_splits=N_SPLITS)
for fold, (_, val_) in enumerate(skf.split(X=train_df, y=train_df.turtle_id)):
    train_df.loc[val_, "kfold"] = fold

# Store the fold index as a small integer before writing the frame to disk.
train_df = train_df.astype({'kfold': 'int8'})
train_df.to_csv('./train_folds.csv', index=False)
display(train_df.head())

## Test DataFrame

In [20]:
test_df["turtle_id"] = 0
test_df.to_csv('./test.csv', index=False)
test_df.head()

In [21]:
train_df['image_name'] = train_df['image_id'].apply(lambda x : x.replace('.JPG', ''))
test_df['image_name']  = test_df['image_id'].apply(lambda x : x.replace('.JPG', ''))

In [22]:
train_df.to_csv('./train_folds.csv', index=False)
test_df.to_csv('./test.csv', index=False)

In [23]:
train_df

# Lightning DataModule

In [24]:
class TurtleDataset(Dataset):
    """Dataset serving images (and labels) described by a dataframe.

    Every row of ``csv`` must provide ``image_path``, ``image_location`` and
    ``image_name``; ``turtle_id`` is additionally read when ``trainFlag`` is
    true.
    """

    def __init__(self, csv, trainFlag):
        '''Store the dataframe and pick the image pipeline.

        csv: full dataframe (train or test)
        trainFlag: True if csv is a training/validation dataset, False otherwise
        '''
        self.csv = csv
        self.trainFlag = trainFlag
        if self.trainFlag:
            # Random augmentations; RandomResizedCrop fixes the output at 512x512.
            self.transform = A.Compose([
                A.Affine(rotate=(-15, 15), translate_percent=(0.0, 0.25),
                         shear=(-3, 3), p=0.5),
                A.RandomResizedCrop(512, 512, scale=(0.9, 1.0),
                                    ratio=(0.75, 1.3333333333)),
                A.ToGray(p=0.1),
                A.GaussianBlur(blur_limit=(3, 7), p=0.05),
                A.GaussNoise(p=0.05),
                A.RandomGridShuffle(grid=(2, 2), p=0.3),
                A.Posterize(p=0.2),
                A.RandomBrightnessContrast(p=0.5),
                A.Cutout(p=0.05),
                A.RandomSnow(p=0.1),
                A.RandomRain(p=0.05),
                A.Normalize(),
            ])
        else:
            # Evaluation must be deterministic: a random crop here would give
            # the same test image a different embedding on every pass, so the
            # image is simply resized to the training resolution.
            self.transform = A.Compose([
                A.Resize(512, 512),
                A.Normalize(),
            ])

    def __len__(self):
        """Number of rows in the underlying dataframe."""
        return self.csv.shape[0]

    def __getitem__(self, index):
        """Load, transform and package the sample at position ``index``.

        Returns a dict with ``image`` (float32 tensor, channels first),
        ``image_name`` and ``loc_target``; ``target`` is included as well
        when ``trainFlag`` is true.

        Raises:
            FileNotFoundError: If the image file cannot be read.
        """
        row = self.csv.iloc[index]

        # cv2.imread signals failure by returning None rather than raising.
        image = cv2.imread(row.image_path)
        if image is None:
            raise FileNotFoundError(f"Could not read image: {row.image_path}")
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        transformed_img = self.transform(image=image)['image'].astype(np.float32)
        # Height-width-channel -> channel-height-width.
        image = torch.tensor(transformed_img.transpose(2, 0, 1))
        loc_target = torch.tensor(row.image_location)

        image_name = row.image_name

        if self.trainFlag:
            target = torch.tensor(row.turtle_id)
            return {'image': image, 'image_name': image_name,'target': target, 'loc_target' : loc_target }

        return {'image' :image,'image_name': image_name, 'loc_target':loc_target}

In [25]:
class LitDataModule(pl.LightningDataModule):
    """Lightning data module that builds the train/val/test datasets and loaders.

    Args:
        train_csv_encoded_folded: Path of the training CSV (label-encoded,
            with a ``kfold`` column).
        test_csv: Path of the test CSV.
        image_size: Side length passed to timm's ``create_transform``.
        batch_size: Batch size of every dataloader.
        num_workers: Worker processes of every dataloader.
        val_split: Fraction of trailing rows used for validation when
            ``val_fold`` is ``None``.
        val_fold: Fold index held out for validation; ``None`` selects the
            ``val_split`` tail split instead.
    """

    def __init__(
        self,
        train_csv_encoded_folded: str,
        test_csv: str,
        image_size: int,
        batch_size: int,
        num_workers: int,
        val_split: float = 0.1,
        val_fold: Optional[int] = None,
    ):
        super().__init__()

        self.save_hyperparameters()

        self.train_df = pd.read_csv(train_csv_encoded_folded)
        self.test_df = pd.read_csv(test_csv)

        # Not referenced elsewhere in this class; TurtleDataset builds its own pipeline.
        self.transform = create_transform(
            input_size=(self.hparams.image_size, self.hparams.image_size),
            crop_pct=1.0,
        )
        # Distinct label values across both frames.
        self.num_classes = len(set(
            list(self.train_df["turtle_id"].values) + list(self.test_df["turtle_id"].values)
        ))

    def setup(self, stage: Optional[str] = None):
        """Create the datasets needed for ``stage`` (all of them when ``None``)."""
        if stage == "fit" or stage is None:
            if self.hparams.val_fold is not None:
                # Hold out the requested fold, train on the rest.
                is_val = self.train_df.kfold == self.hparams.val_fold
                train_df = self.train_df[~is_val].reset_index(drop=True)
                val_df = self.train_df[is_val].reset_index(drop=True)
            else:
                # Tail split. Slicing by an explicit position (not by
                # ``-nb_val``) keeps the training set intact when the
                # validation size rounds down to zero: ``df[:-0]`` is empty.
                nb_val = int(self.hparams.val_split * len(self.train_df))
                split_at = len(self.train_df) - nb_val
                train_df = self.train_df[:split_at].reset_index(drop=True)
                val_df = self.train_df[split_at:].reset_index(drop=True)

            self.train_dataset = TurtleDataset(train_df, trainFlag = True)
            # trainFlag=True so that validation batches also carry 'target'.
            self.val_dataset = TurtleDataset(val_df, trainFlag = True)

        if stage == "test" or stage is None:
            self.test_dataset = TurtleDataset(self.test_df,trainFlag=False)

    def train_dataloader(self) -> DataLoader:
        """Shuffled training loader that drops the last incomplete batch."""
        return self._dataloader(self.train_dataset, train=True)

    def val_dataloader(self) -> DataLoader:
        """Sequential loader over the validation dataset."""
        return self._dataloader(self.val_dataset)

    def test_dataloader(self) -> DataLoader:
        """Sequential loader over the test dataset."""
        return self._dataloader(self.test_dataset)

    def _dataloader(self, dataset: TurtleDataset, train: bool = False) -> DataLoader:
        """Wrap ``dataset``; ``train`` switches on shuffling and ``drop_last``."""
        return DataLoader(
            dataset,
            batch_size=self.hparams.batch_size,
            shuffle=train,
            num_workers=self.hparams.num_workers,
            pin_memory=True,
            drop_last=train,
        )

# Lightning Module

In [26]:
class ArcMarginProduct(nn.Module):
    r"""Implement of large margin arc distance: :
    Args:
        in_features: size of each input sample
        out_features: size of each output sample
        s: norm of input feature
        m: margin
        easy_margin: apply the margin only where cos(theta) > 0
        ls_eps: label-smoothing strength applied to the one-hot targets
        cos(theta + m)
    """

    def __init__(
        self,
        in_features: int,
        out_features: int,
        s: float,
        m: float,
        easy_margin: bool,
        ls_eps: float,
    ):
        super(ArcMarginProduct, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.s = s
        self.m = m
        self.ls_eps = ls_eps  # label smoothing
        self.weight = nn.Parameter(torch.FloatTensor(out_features, in_features))
        nn.init.xavier_uniform_(self.weight)

        self.easy_margin = easy_margin
        # Constants of cos(theta + m) = cos(theta)cos(m) - sin(theta)sin(m).
        self.cos_m = math.cos(m)
        self.sin_m = math.sin(m)
        # Beyond this cosine, theta + m would exceed pi; a linear penalty is used instead.
        self.th = math.cos(math.pi - m)
        self.mm = math.sin(math.pi - m) * m

    def forward(self, input: torch.Tensor, label: torch.Tensor, device: Optional[str] = None) -> torch.Tensor:
        """Return scaled logits with the angular margin applied to the label class.

        Args:
            input: Batch of feature vectors, one row per sample.
            label: Class index of every sample.
            device: Device for the one-hot tensor. Defaults to the device of
                the computed cosines, so the layer also works off-GPU.

        Returns:
            Tensor with one row per sample and ``out_features`` columns.
        """
        # --------------------------- cos(theta) & phi(theta) ---------------------
        cosine = F.linear(F.normalize(input), F.normalize(self.weight))
        # Enable 16 bit precision
        cosine = cosine.to(torch.float32)

        # Rounding can push cos^2 marginally above 1; clamp so sqrt never sees
        # a negative number and produces NaN.
        sine = torch.sqrt((1.0 - torch.pow(cosine, 2)).clamp(min=0.0))
        phi = cosine * self.cos_m - sine * self.sin_m
        if self.easy_margin:
            phi = torch.where(cosine > 0, phi, cosine)
        else:
            phi = torch.where(cosine > self.th, phi, cosine - self.mm)
        # --------------------------- convert label to one-hot ---------------------
        one_hot = torch.zeros(cosine.size(), device=cosine.device if device is None else device)
        one_hot.scatter_(1, label.view(-1, 1).long(), 1)
        if self.ls_eps > 0:
            one_hot = (1 - self.ls_eps) * one_hot + self.ls_eps / self.out_features
        # Margin logit for the label class, plain cosine for every other class.
        output = (one_hot * phi) + ((1.0 - one_hot) * cosine)
        output *= self.s

        return output

In [27]:
# https://github.com/haqishen/Google-Landmark-Recognition-2020-3rd-Place-Solution/blob/main/landmark-recognition-2020-third-place-submission.ipynb
class ArcMarginProduct_subcenter(nn.Module):
    """Cosine classifier that keeps ``k`` weight vectors per class.

    The output for a class is the largest cosine similarity between the
    feature vector and any of that class's ``k`` weight vectors.
    """

    def __init__(self, in_features, out_features, k=3):
        super().__init__()
        self.k = k
        self.out_features = out_features
        # k consecutive rows belong to the same class.
        self.weight = nn.Parameter(torch.FloatTensor(out_features*k, in_features))
        self.reset_parameters()

    def reset_parameters(self):
        """Draw the weights uniformly from [-1/sqrt(in_features), 1/sqrt(in_features)]."""
        bound = 1. / math.sqrt(self.weight.size(1))
        self.weight.data.uniform_(-bound, bound)

    def forward(self, features):
        """Return, per sample and class, the best cosine over the class's sub-centers."""
        unit_features = F.normalize(features)
        unit_centers = F.normalize(self.weight)
        per_center = F.linear(unit_features, unit_centers).view(-1, self.out_features, self.k)
        return per_center.max(dim=2).values

In [28]:
class FocalCosineLoss(nn.Module):
    """Cosine-embedding loss plus a weighted focal cross-entropy term.

    Args:
        alpha: Multiplier of the focal term.
        gamma: Exponent of ``(1 - pt)`` in the focal term.
        xent: Weight of the focal term in the final sum.
    """

    def __init__(self, alpha=1, gamma=2, xent=.1):
        super(FocalCosineLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma

        self.xent = xent

        # Target sign (+1) for the cosine-embedding loss. Registered as a
        # non-persistent buffer: it follows the module across devices without
        # requiring CUDA at construction time and without adding a key to the
        # state dict (so existing checkpoints still load).
        self.register_buffer("y", torch.ones(1), persistent=False)

    def forward(self, input, target, reduction="mean"):
        """Compute the combined loss.

        Args:
            input: Logits, one row per sample and one column per class.
            target: Class index of every sample.
            reduction: ``"mean"``, ``"sum"`` or ``"none"``; applied to both terms.
        """
        one_hot = F.one_hot(target, num_classes=input.size(-1)).to(input.dtype)
        cosine_loss = F.cosine_embedding_loss(
            input, one_hot, self.y.to(input.device), reduction=reduction
        )

        # Per-sample cross entropy on the normalised logits.
        cent_loss = F.cross_entropy(F.normalize(input), target, reduction="none")
        pt = torch.exp(-cent_loss)
        focal_loss = self.alpha * (1-pt)**self.gamma * cent_loss

        # Reduce the focal term the same way as the cosine term so the two
        # always have matching shapes.
        if reduction == "mean":
            focal_loss = torch.mean(focal_loss)
        elif reduction == "sum":
            focal_loss = torch.sum(focal_loss)

        return cosine_loss + self.xent * focal_loss

In [29]:
class LitModule(pl.LightningModule):
    """Embedding model with a sub-center identity head and an auxiliary view head.

    Args:
        model_name: timm model name of the backbone.
        pretrained: Whether timm loads pretrained weights.
        drop_rate: Dropout rate passed to timm.
        embedding_size: Size of the embedding produced by ``forward``.
        num_classes: Number of identity classes.
        arc_s, arc_m, arc_easy_margin, arc_ls_eps: Settings of the auxiliary
            ``ArcMarginProduct`` head.
        optimizer: Optimizer name understood by timm's ``create_optimizer_v2``.
        learning_rate: Base learning rate, also used as OneCycleLR ``max_lr``.
        weight_decay: Weight decay of the optimizer.
        len_train_dl: Scheduler steps per epoch.
        batch_size: Stored as a hyperparameter only.
        epochs: Number of epochs covered by the OneCycleLR schedule.
    """

    def __init__(
        self,
        model_name: str,
        pretrained: bool,
        drop_rate: float,
        embedding_size: int,
        num_classes: int,
        arc_s: float,
        arc_m: float,
        arc_easy_margin: bool,
        arc_ls_eps: float,
        optimizer: str,
        learning_rate: float,
        weight_decay: float,
        len_train_dl: int,
        batch_size: int,
        epochs: int
    ):
        super().__init__()

        self.save_hyperparameters()

        self.model = timm.create_model(model_name, pretrained=pretrained, drop_rate=drop_rate)
        # Read the feature width before the classifier is removed below.
        self.embedding = nn.Linear(self.model.classifier.in_features, embedding_size)
        self.model.reset_classifier(num_classes=0, global_pool="avg")

        # Identity head.
        self.arc = ArcMarginProduct_subcenter(embedding_size, num_classes)

        # Auxiliary heads over the four image-location codes.
        self.arc_aux = ArcMarginProduct(
            in_features=embedding_size,
            out_features=4,
            s=arc_s,
            m=arc_m,
            easy_margin=arc_easy_margin,
            ls_eps=arc_ls_eps,
        )

        self.fc = nn.Linear(embedding_size, 4)

        self.loss_fn2 = nn.CrossEntropyLoss(ignore_index=3) # do not compute loss for unknown image location 3 ("unknown")
        self.loss_fn1 = FocalCosineLoss()

    def forward(self, images: torch.Tensor) -> torch.Tensor:
        """Map a batch of images to embeddings."""
        features = self.model(images)
        embeddings = self.embedding(features)
        return embeddings

    def configure_optimizers(self):
        """Build the timm optimizer and a per-step OneCycleLR schedule."""
        optimizer = create_optimizer_v2(
            self.parameters(),
            opt=self.hparams.optimizer,
            lr=self.hparams.learning_rate,
            weight_decay=self.hparams.weight_decay,
        )

        scheduler = torch.optim.lr_scheduler.OneCycleLR(
            optimizer,
            self.hparams.learning_rate,
            steps_per_epoch=self.hparams.len_train_dl,
            epochs=self.hparams.epochs,
        )
        scheduler = {"scheduler": scheduler, "interval": "step"}

        return [optimizer], [scheduler]

    def training_step(self, batch: Dict[str, torch.Tensor], batch_idx: int) -> torch.Tensor:
        """Compute and log the training loss for one batch."""
        return self._step(batch, "train")

    def validation_step(self, batch: Dict[str, torch.Tensor], batch_idx: int) -> torch.Tensor:
        """Compute and log the validation loss for one batch."""
        return self._step(batch, "val")

    def _step(self, batch: Dict[str, torch.Tensor], step: str) -> torch.Tensor:
        """Shared train/val step; logs the loss as ``"{step}_loss"`` and returns it."""
        images, targets , loc_targets = batch["image"], batch["target"], batch['loc_target']

        embeddings = self(images)
        outputs = self.arc(embeddings)

        # Pass the device explicitly so the head does not fall back to its
        # hard-coded "cuda" default when the batch lives elsewhere.
        outputs2 = self.arc_aux(embeddings, loc_targets, device=embeddings.device)
        outputs3 = self.fc(embeddings)

        # Blend of the margin head and the plain linear head.
        aux_outputs= 0.9*outputs2 + 0.1*outputs3

        loss1 = self.loss_fn1(outputs, targets)
        loss2 = self.loss_fn2(aux_outputs, loc_targets)

        # Identity loss dominates; the location loss acts as a small auxiliary signal.
        loss = 0.95*loss1 + 0.05*loss2

        self.log(f"{step}_loss", loss)

        return loss

# Training

In [30]:
from pytorch_lightning.loggers import CSVLogger
from pytorch_lightning.callbacks import StochasticWeightAveraging


def train(
    train_csv_encoded_folded: str = './train_folds.csv',
    test_csv: str = './test.csv',
    val_fold: int = 0,
    image_size: int = 256,
    batch_size: int = 32,
    num_workers: int = 4,
    model_name: str = "tf_efficientnet_b4",
    pretrained: bool = True,
    drop_rate: float = 0.0,
    embedding_size: int = 512,
    arc_s: float = 30.0,
    arc_m: float = 0.5,
    arc_easy_margin: bool = False,
    arc_ls_eps: float = 0.0,
    optimizer: str = "adamw",
    learning_rate: float = 3e-4,
    weight_decay: float = 1e-6,
    checkpoints_dir: str = '.',
    accumulate_grad_batches: int = 1,
    auto_scale_batch_size: bool = True,
    gpus: int = 1,
    max_epochs: int = 15,
    precision: int = 16,
):
    """Train one model on all folds except ``val_fold``.

    The checkpoint with the best ``val_loss`` is written to
    ``checkpoints_dir`` as ``{model_name}_{image_size}_{val_fold}``, and
    metrics are logged as CSV under ``logs/``.

    ``auto_scale_batch_size`` is accepted for interface compatibility but is
    not forwarded to the Trainer. No StochasticWeightAveraging callback is
    attached.

    Returns:
        ``(model, trainer)`` after fitting.
    """
    datamodule = LitDataModule(
        train_csv_encoded_folded=train_csv_encoded_folded,
        test_csv=test_csv,
        val_fold = val_fold,
        image_size=image_size,
        batch_size=batch_size,
        num_workers=num_workers,
    )

    datamodule.setup()
    print(datamodule.num_classes)
    # The scheduler steps once per optimizer step, and with gradient
    # accumulation there is one optimizer step per `accumulate_grad_batches`
    # batches (plus one for a trailing partial group).
    len_train_dl = math.ceil(len(datamodule.train_dataloader()) / accumulate_grad_batches)

    model = LitModule(
        model_name=model_name,
        pretrained=pretrained,
        drop_rate=drop_rate,
        embedding_size=embedding_size,
        num_classes=datamodule.num_classes,
        arc_s=arc_s,
        arc_m=arc_m,
        arc_easy_margin=arc_easy_margin,
        arc_ls_eps=arc_ls_eps,
        optimizer=optimizer,
        learning_rate=learning_rate,
        weight_decay=weight_decay,
        len_train_dl=len_train_dl,
        batch_size=batch_size,
        epochs=max_epochs
    )

    model_checkpoint = ModelCheckpoint(
        checkpoints_dir,
        filename=f"{model_name}_{image_size}_{val_fold}",
        monitor="val_loss",
    )

    logger = CSVLogger(save_dir='logs/')

    trainer = pl.Trainer(
        accumulate_grad_batches=accumulate_grad_batches,
        benchmark=True,
        logger=logger,
        callbacks=[model_checkpoint],
        gpus=gpus,
        max_epochs=max_epochs,
        precision=precision,
    )

    trainer.fit(model, datamodule=datamodule)

    return model, trainer

In [None]:
# Backbone, input resolution and batch size; the inference loop further down reuses them.
MODEL_NAME = "tf_efficientnet_b5"
IMAGE_SIZE = 512
BATCH_SIZE = 4

# Train one model per validation fold (0-4). `model` and `trainer` are rebound
# on every iteration, so only the last fold's pair remains after the loop.
for fold in range(5):
    model, trainer = train(model_name=MODEL_NAME,val_fold=fold, image_size=IMAGE_SIZE, batch_size=BATCH_SIZE)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sn

# Metrics written by the CSV logger of `trainer` (the last fold trained above).
metrics = pd.read_csv(f'{trainer.logger.log_dir}/metrics.csv')
# Plot against the epoch instead of the global step.
del metrics["step"]
metrics.set_index("epoch", inplace=True)
display(metrics.dropna(axis=1, how="all").head())
# One line per logged metric.
g = sn.relplot(data=metrics, kind="line")
plt.gcf().set_size_inches(12, 4)
plt.grid()

# Inference

In [None]:
def load_eval_module(checkpoint_path: str, device: torch.device) -> LitModule:
    """Restore a ``LitModule`` from ``checkpoint_path``, move it to ``device`` and switch it to eval mode."""
    eval_module = LitModule.load_from_checkpoint(checkpoint_path)
    eval_module.to(device)
    eval_module.eval()
    return eval_module


def load_encoder() -> LabelEncoder:
    """Rebuild the label encoder from the class list saved in ``./turtle_ids.npy``."""
    # NOTE(review): allow_pickle=True unpickles the file's contents; only load files this notebook wrote itself.
    saved_classes = np.load('./turtle_ids.npy', allow_pickle=True)

    label_encoder = LabelEncoder()
    label_encoder.classes_ = saved_classes
    return label_encoder

def create_and_search_index(embedding_size: int, train_embeddings: np.ndarray, val_embeddings: np.ndarray, k: int):
    """Index ``train_embeddings`` by inner product and look up the ``k`` nearest entries of each query.

    Returns:
        ``(D, I)`` as returned by the FAISS search: the scores and the
        positions in ``train_embeddings`` of the matches for every row of
        ``val_embeddings``.
    """
    gallery = faiss.IndexFlatIP(embedding_size)
    gallery.add(train_embeddings)
    scores, positions = gallery.search(val_embeddings, k=k)
    return scores, positions


def create_val_targets_df(
    train_targets: np.ndarray, val_image_names: np.ndarray, val_targets: np.ndarray
) -> pd.DataFrame:
    """Pair every validation image with its target label.

    Labels that never occur in ``train_targets`` are replaced by
    ``"new_turtle"``.

    Returns:
        DataFrame with the columns ``image`` and ``target``.
    """
    known_targets = np.unique(train_targets)

    table = pd.DataFrame(
        np.column_stack((val_image_names, val_targets)),
        columns=["image", "target"],
    )
    unseen = ~table["target"].isin(known_targets)
    table.loc[unseen, "target"] = "new_turtle"

    return table


def create_distances_df(
    image_names: np.ndarray, targets: np.ndarray, D: np.ndarray, I: np.ndarray, stage: str 
) -> pd.DataFrame:
    """Turn raw nearest-neighbour results into one row per (image, target) pair.

    Args:
        image_names: Name of every query image, aligned with the rows of ``D``/``I``.
        targets: Label of every indexed item; ``I`` holds positions into it.
        D: Similarity scores, one row per query and one column per neighbour.
        I: Positions of the neighbours, same layout as ``D``.
        stage: Unused; kept so existing callers keep working.

    Returns:
        DataFrame with the columns ``image``, ``target`` and ``distances``
        (the best score of that target for that image), sorted by
        ``distances`` in descending order.
    """
    image_names = np.asarray(image_names)
    n_queries = len(image_names)
    D = np.asarray(D)[:n_queries]
    I = np.asarray(I)[:n_queries]
    neighbours_per_query = I.shape[1]

    # Build the columns separately. Stacking labels and scores into a single
    # array would force one common dtype and turn the scores into strings
    # whenever the labels are a NumPy string array.
    distances_df = pd.DataFrame({
        "image": np.repeat(image_names, neighbours_per_query),
        "target": np.asarray(targets)[I].reshape(-1),
        "distances": D.reshape(-1),
    })

    # A target can appear several times among an image's neighbours; keep its best score.
    distances_df = distances_df.groupby(["image", "target"]).distances.max().reset_index()
    distances_df = distances_df.sort_values("distances", ascending=False).reset_index(drop=True)

    return distances_df


def get_best_threshold(val_targets_df: pd.DataFrame, valid_df: pd.DataFrame) -> Tuple[float, float]:
    """Search the thresholds 0.0, 0.1, ..., 1.0 for the best mean score.

    For every threshold the per-image score is stored in ``val_targets_df``
    in a new column named after the threshold, so the frame is modified in
    place.

    Args:
        val_targets_df: Frame with the columns ``image`` and ``target``.
        valid_df: Distances frame handed to ``get_predictions``.

    Returns:
        ``(best_th, best_cv)``: the winning threshold and its mean score.
    """
    best_th = 0
    best_cv = 0
    for th in [0.1 * x for x in range(11)]:
        all_preds = get_predictions(valid_df, threshold=th)

        # Assign the whole column at once instead of writing cell by cell.
        val_targets_df[th] = [
            map_per_image(target, all_preds[image])
            for image, target in zip(val_targets_df["image"], val_targets_df["target"])
        ]

        cv = val_targets_df[th].mean()

        print(f"th={th} cv={cv}")

        if cv > best_cv:
            best_th = th
            best_cv = cv

    print(f"best_th={best_th}")
    print(f"best_cv={best_cv}")

    return best_th, best_cv


def get_predictions(df: pd.DataFrame, threshold: float = 0.2):
    """Build a list of five ranked predictions for every image in ``df``.

    Rows are consumed in the order given. The first row seen for an image
    decides whether ``"new_turtle"`` is ranked second (distance above
    ``threshold``) or first; later rows append their target until the list
    holds five entries. Shorter lists are padded with ``"new_turtle"``.

    Returns:
        Dict mapping image name to its list of five predictions.
    """
    predictions = {}
    rows = tqdm(df.iterrows(), total=len(df), desc=f"Creating predictions for threshold={threshold}")
    for _, row in rows:
        ranked = predictions.get(row.image)
        if ranked is None:
            if row.distances > threshold:
                predictions[row.image] = [row.target, "new_turtle"]
            else:
                predictions[row.image] = ["new_turtle", row.target]
        elif len(ranked) != 5:
            ranked.append(row.target)

    for image in tqdm(predictions):
        current = predictions[image]
        if len(current) < 5:
            predictions[image] = (current + ['new_turtle']*5)[:5]

    return predictions


def map_per_image(label, predictions):
    """Score one image by the rank of its true label.

    Parameters
    ----------
    label : string
            The true label of the image
    predictions : list
            Ranked predictions; only the first five are considered

    Returns
    -------
    score : double
            1 / rank of ``label`` among the first five predictions, or 0.0
            when it is not among them
    """
    top_five = predictions[:5]
    if label in top_five:
        return 1 / (top_five.index(label) + 1)
    return 0.0


def create_predictions_df(test_df: pd.DataFrame, best_th: float) -> pd.DataFrame:
    """Produce the submission table for a distances frame.

    Returns:
        DataFrame with the columns ``image`` and ``predictions``, where
        ``predictions`` is the space-separated ranked list for that image.
    """
    ranked_by_image = get_predictions(test_df, best_th)

    table = pd.Series(ranked_by_image).reset_index()
    table.columns = ["image", "predictions"]
    table["predictions"] = table["predictions"].apply(" ".join)

    return table


In [None]:
def load_dataloaders(
    train_csv_encoded_folded: str,
    test_csv: str,
    val_fold: int,
    image_size: int,
    batch_size: int,
    num_workers: int,
) -> Tuple[DataLoader, DataLoader, DataLoader]:
    """Build the train, validation and test loaders used for embedding extraction.

    Returns:
        ``(train_dl, val_dl, test_dl)``. All three iterate sequentially over
        their full dataset.
    """
    datamodule = LitDataModule(
        train_csv_encoded_folded=train_csv_encoded_folded,
        test_csv=test_csv,
        val_fold=val_fold,
        image_size=image_size,
        batch_size=batch_size,
        num_workers=num_workers,
    )

    datamodule.setup()

    # `train_dataloader()` shuffles and drops the last incomplete batch, which
    # would silently leave some training images out of the search index.
    # Build a plain sequential loader over the same dataset instead.
    train_dl = datamodule._dataloader(datamodule.train_dataset, train=False)
    val_dl = datamodule.val_dataloader()
    test_dl = datamodule.test_dataloader()

    return train_dl, val_dl, test_dl


@torch.inference_mode()
def get_embeddings(
    module: pl.LightningModule, dataloader: DataLoader, encoder: LabelEncoder, stage: str
) -> Tuple[np.ndarray, ...]:
    """Run ``module`` over ``dataloader`` and collect L2-normalised embeddings.

    Args:
        module: Model called on each image batch; its ``device`` decides
            where the images are moved.
        dataloader: Yields dicts with ``image`` and ``image_name`` (and
            ``target`` for the stages ``'train'`` and ``'val'``).
        encoder: Used to map integer targets back to their original labels.
        stage: ``'train'`` or ``'val'`` to also return targets; any other
            value returns names and embeddings only.

    Returns:
        ``(image_names, embeddings, targets)`` for ``'train'``/``'val'``,
        otherwise ``(image_names, embeddings)``.
    """
    with_targets = stage in ['train', 'val']

    all_image_names = []
    all_embeddings = []
    all_targets = []

    for batch in tqdm(dataloader, desc=f"Creating {stage} embeddings"):
        images = batch["image"].to(module.device)
        embeddings = module(images)

        all_image_names.append(batch["image_name"])
        all_embeddings.append(embeddings.cpu().numpy())
        if with_targets:
            # Targets are only collected, never fed to the model.
            all_targets.append(batch["target"].cpu().numpy())

    all_image_names = np.concatenate(all_image_names)
    all_embeddings = np.vstack(all_embeddings)
    # Unit-length rows make the inner-product search a cosine-similarity search.
    all_embeddings = normalize(all_embeddings, axis=1, norm="l2")

    if not with_targets:
        return all_image_names, all_embeddings

    all_targets = encoder.inverse_transform(np.concatenate(all_targets))
    return all_image_names, all_embeddings, all_targets

In [None]:
def infer(
    checkpoint_path: str,
    train_csv_encoded_folded: str = './train_folds.csv',
    test_csv: str = './test.csv',
    val_fold: int = 0,
    image_size: int = 256,
    batch_size: int = 64,
    num_workers: int = 2,
    k: int = 50,
):
    """Run retrieval-based inference for one fold and write its submission file.

    The "new_turtle" threshold is tuned on the validation fold against an
    index of the training embeddings. The validation embeddings are then
    added to the index before the test images are searched. The result is
    written to ``./submission_{val_fold}.csv``.

    Args:
        checkpoint_path: Checkpoint of the model trained with this ``val_fold``.
        train_csv_encoded_folded: Path of the folded training CSV.
        test_csv: Path of the test CSV.
        val_fold: Fold used for validation.
        image_size: Forwarded to the data module.
        batch_size: Batch size for embedding extraction.
        num_workers: Dataloader workers.
        k: Number of neighbours retrieved per query.

    Returns:
        ``(best_th, best_cv)``: the selected threshold and its validation
        score, so callers can weight this fold when ensembling.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    module = load_eval_module(checkpoint_path, device)

    train_dl, val_dl, test_dl = load_dataloaders(
        train_csv_encoded_folded=train_csv_encoded_folded,
        test_csv=test_csv,
        val_fold=val_fold,
        image_size=image_size,
        batch_size=batch_size,
        num_workers=num_workers,
    )

    encoder = load_encoder()

    train_image_names, train_embeddings, train_targets = get_embeddings(module, train_dl, encoder, stage="train")
    val_image_names, val_embeddings, val_targets = get_embeddings(module, val_dl, encoder, stage="val")
    test_image_names, test_embeddings = get_embeddings(module, test_dl, encoder, stage="test")

    # Step 1: search the validation images against the training index.
    D, I = create_and_search_index(module.hparams.embedding_size, train_embeddings, val_embeddings, k)
    print("Created index with train_embeddings")

    val_targets_df = create_val_targets_df(train_targets, val_image_names, val_targets)
    print(f"val_targets_df=\n{val_targets_df.head()}")

    val_df = create_distances_df(val_image_names, train_targets, D, I, "val")
    print(f"val_df=\n{val_df.head()}")

    # Step 2: pick the threshold with the best validation score.
    best_th, best_cv = get_best_threshold(val_targets_df, val_df)
    print(f"val_targets_df=\n{val_targets_df.describe()}")

    # Step 3: enlarge the index with the validation data, then search the test images.
    train_embeddings = np.concatenate([train_embeddings, val_embeddings])
    train_targets = np.concatenate([train_targets, val_targets])
    print("Updated train_embeddings and train_targets with val data")

    D, I = create_and_search_index(module.hparams.embedding_size, train_embeddings, test_embeddings, k)  
    print("Created index with train_embeddings")

    test_df = create_distances_df(test_image_names, train_targets, D, I, "test")
    print(f"test_df=\n{test_df.head()}")

    predictions = create_predictions_df(test_df, best_th)
    print(f"predictions.head()={predictions.head()}")

    predictions.to_csv(f'./submission_{val_fold}.csv', index=False)

    return best_th, best_cv

In [None]:
# Run inference once per fold, loading the checkpoint written for that fold
# ("{MODEL_NAME}_{IMAGE_SIZE}_{fold}.ckpt"); each call writes ./submission_{fold}.csv.
for fld in range(5):
    
    infer(checkpoint_path=f"./{MODEL_NAME}_{IMAGE_SIZE}_{fld}.ckpt",
          val_fold = fld, image_size=IMAGE_SIZE, batch_size=BATCH_SIZE)

In [None]:
# Per-fold submission files written by infer(), in fold order 0-4.
sub_files = [
                 './submission_0.csv',
                 './submission_1.csv',
                 './submission_2.csv',
                 './submission_3.csv',
                 './submission_4.csv',

]

# Weight of each submission: the square of that fold's best CV score.
# sub_weight[i] is applied to sub_files[i], so best_fold_1_cv_score belongs to
# submission_0.csv, best_fold_2_cv_score to submission_1.csv, and so on.
# NOTE: the names below are placeholders that are not defined anywhere in this
# notebook; this cell raises NameError until each is replaced by the
# "best_cv" value printed by get_best_threshold for the corresponding fold.

sub_weight = [
                best_fold_1_cv_score**2,
                best_fold_2_cv_score**2,
                best_fold_3_cv_score**2,
                best_fold_4_cv_score**2,
                best_fold_5_cv_score**2
            ]

In [None]:
import csv

In [None]:
# Column names of the per-fold submission files.
Hlabel = 'image' 
Htarget = 'predictions'
# Number of ranked predictions per image.
npt = 5
# Weight of a prediction by its rank: 1, 1/2, 1/3, ...
place_weights = {}
for i in range(npt):
    place_weights[i] = ( 1 / (i + 1) )
    
print(place_weights)

lg = len(sub_files)
sub = [None]*lg
for i, file in enumerate( sub_files ):
   
    print("Reading {}: w={} - {}". format(i, sub_weight[i], file))
    # Context manager so the file is closed as soon as its rows are read.
    with open(file, "r", newline='') as handle:
        # Sort by image so that row p refers to the same image in every submission.
        sub[i] = sorted(csv.DictReader(handle), key=lambda d: str(d[Hlabel]))

with open("sub_ens.csv", "w", newline='') as out:
    writer = csv.writer(out)
    writer.writerow([Hlabel,Htarget])

    for p, row in enumerate(sub[0]):
        # Accumulate, per candidate label, rank weight x submission weight over all submissions.
        target_weight = {}
        for s in range(lg):
            row1 = sub[s][p]
            for ind, trgt in enumerate(row1[Htarget].split(' ')):
                target_weight[trgt] = target_weight.get(trgt,0) + (place_weights[ind]*sub_weight[s])
        # Keep the npt labels with the highest accumulated weight.
        tops_trgt = sorted(target_weight, key=target_weight.get, reverse=True)[:npt]
        writer.writerow([row1[Hlabel], " ".join(tops_trgt)])

In [None]:
result = pd.read_csv('./sub_ens.csv')

In [None]:
# Split each space-separated prediction string into its five ranked entries, one column per rank.
pred_list = result.predictions.apply(lambda x : x.split(" ")).values.tolist()
result_df = pd.DataFrame(pred_list, columns=['prediction1', 'prediction2', 'prediction3', 'prediction4', 'prediction5'])
result_df['image_id'] = result['image'].copy()

# Strip a '.png' suffix from the ids.
# NOTE(review): file names elsewhere in this notebook use a '.JPG' suffix, so this looks like a no-op — confirm.
result_df['image_id'] = result_df['image_id'].apply(lambda x : x.replace('.png', ''))
# Put image_id first, followed by the five prediction columns.
result_df = result_df[['image_id','prediction1', 'prediction2','prediction3','prediction4','prediction5']].copy()
result_df

In [None]:
# Write the final submission table.
# NOTE(review): the file name ends in '.csv.csv' — confirm the doubled extension is intended.
result_df.to_csv('./submission.csv.csv', index=False)