In [None]:
!git clone https://github.com/adnan119/TRACER.git

In [None]:
!wget https://github.com/Karel911/TRACER/releases/download/v1.0/TRACER-Efficient-7.pth

In [None]:
!mv ./TRACER-Efficient-7.pth ./best_model.pth

In [None]:
!pip install timm faiss-gpu

In [None]:
!pip install fastai --upgrade

In [None]:
!pip uninstall torch -y
# CUDA 10.1
!pip install torch==1.6.0+cu101 torchvision==0.7.0+cu101 -f https://download.pytorch.org/whl/torch_stable.html
!pip install fastai==2.0.19
!pip install fastcore==1.3.1 
!pip install wwf -q --upgrade

In [None]:
%cd ./TRACER
!mkdir ./results/
!mkdir ./results/DUTS/
!mkdir ./results/DUTS/TE7_0/

In [None]:
!mv ../best_model.pth ./results/DUTS/TE7_0/

In [None]:
!pip install -r requirements.txt

In [None]:
# NB: restart the notebook runtime here so that the package versions installed above are the ones that get imported below.

In [None]:
import numpy as np
import os
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
import gc
from tqdm import tqdm
import fastai
from fastai.vision.all import *
from fastai.vision.learner import _update_first_layer
import faiss
from timm import create_model
from timm.data.transforms_factory import create_transform
from sklearn.preprocessing import normalize

import cv2
import math
import copy
import time
import random
import warnings
import shutil

# For data manipulation
import numpy as np
import pandas as pd
from PIL import *

import torch
import torch.nn as nn
import torch.nn.functional as F

In [None]:
# segment test images 
!python main.py test --exp_num 0 --arch 7 --img_size 640 --batch_size 16 --dataset "DUTS" --save_map True --data_path /content/turtle_recall/images/

In [None]:
!ls seg_img | wc -c

In [None]:
shutil.make_archive('./seg_img', 'zip', './seg_img')

In [None]:
!mv /content/TRACER/seg_img.zip /content/drive/MyDrive/TurtleChallenge/

In [None]:
!rm -r seg_img
!rm seg_img.zip

In [None]:
!pwd

In [None]:
%cd ..

In [None]:
SEED = 42

def seed_everything(seed):
    """Seed every random number generator used in this notebook.

    Seeds Python's `random`, NumPy, and PyTorch (CPU and CUDA), exports
    `PYTHONHASHSEED`, and switches cuDNN to deterministic, non-benchmark mode.

    Args:
        seed: integer seed applied to all generators.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Same seed for each library-level generator.
    for seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seeder(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic, cudnn.benchmark = True, False

seed_everything(SEED)

In [None]:
!cp /content/drive/MyDrive/TurtleChallenge/seg_img.zip .

In [None]:
!rm -r TRACER

In [None]:
!unzip seg_img.zip -d clean_turtle_back

In [None]:
!rm seg_img.zip

In [None]:
import os

SOURCE_URL = 'https://storage.googleapis.com/dm-turtle-recall/images.tar'
IMAGE_DIR = './turtle_recall/images'
TAR_PATH = os.path.join(IMAGE_DIR, os.path.basename(SOURCE_URL))
EXPECTED_IMAGE_COUNT = 13891

%sx mkdir --parents "{IMAGE_DIR}"
if len(os.listdir(IMAGE_DIR)) != EXPECTED_IMAGE_COUNT:
  %sx wget --no-check-certificate -O "{TAR_PATH}" "{SOURCE_URL}"
  %sx tar --extract --file="{TAR_PATH}" --directory="{IMAGE_DIR}"
  %sx rm "{TAR_PATH}"

print(f'The total number of images is: {len(os.listdir(IMAGE_DIR))}')

In [None]:
OUTPUT_DIR = Path(".")
DATA_ROOT_DIR = Path("/content/turtle_recall/images")
# TRAIN_DIR = DATA_ROOT_DIR / "train_clean_back"
# TEST_DIR = DATA_ROOT_DIR / "test_clean_back"
# EXTRA_TRAIN_DIR= DATA_ROOT_DIR / "extra_clean_back"
SUBMISSION_CSV_PATH = OUTPUT_DIR / "submission.csv"

# Loading dataset

In [None]:
import pandas as pd
import requests
import io
import urllib.parse

BASE_URL = 'https://storage.googleapis.com/dm-turtle-recall/'


def read_csv_from_web(file_name):
  """Download `file_name` relative to `BASE_URL` and parse it as a CSV.

  Args:
    file_name: name of the CSV file, joined onto `BASE_URL`.

  Returns:
    A `pandas.DataFrame` with the parsed contents.

  Raises:
    requests.HTTPError: if the server answers with a 4xx/5xx status.
    requests.Timeout: if the server does not respond in time.
  """
  url = urllib.parse.urljoin(BASE_URL, file_name)
  # A timeout keeps a stalled connection from hanging the notebook forever.
  response = requests.get(url, timeout=60)
  # Fail loudly instead of feeding an HTML error page to the CSV parser.
  response.raise_for_status()
  return pd.read_csv(io.StringIO(response.content.decode('utf-8')))


# Read in csv files.
train = read_csv_from_web('train.csv')
extra_train = read_csv_from_web('extra_images.csv')
test_df = read_csv_from_web('test.csv')
sample_submission = read_csv_from_web('sample_submission.csv')

# Convert image_location strings to lowercase.
for df in [train, test_df]:
  df.image_location = df.image_location.apply(lambda x: x.lower())
  assert set(df.image_location.unique()) == set(['left', 'right', 'top'])

In [None]:
# Custom metric for mapk:
def mapk(preds, targets, k=5):
  """Mean average precision at `k`.

  Args:
    preds: per-sample score rows; `preds[i]` is handed to `torch.topk`.
    targets: iterable of true class indices, one per sample.
    k: number of top-scoring classes considered per sample.

  Returns:
    The mean over samples of the per-sample average precision at `k`.
  """
  import torch
  per_sample = []
  for idx, truth in enumerate(targets):
    top_k = torch.topk(preds[idx], k).indices

    # Average precision at k for this sample.
    hits = 0.0
    ap = 0.0
    for rank in range(len(top_k)):
      candidate = top_k[rank]
      # Only the first occurrence of the true class may score.
      is_first_match = candidate == truth and candidate not in top_k[:rank]
      if not is_first_match:
        continue
      hits += 1.0
      ap += hits / (rank + 1.0)

    per_sample.append(ap)

  return np.mean(per_sample) #MAPK

# Create the model
# learn = cnn_learner(dls, resnet50, metrics=[error_rate, mapk])

# Run lr_find to pick a good learning rate
# learn.lr_find()

In [None]:
classes = list(train.turtle_id.unique())

In [None]:
classes[:5]

In [None]:
new_data = train.copy()

In [None]:
# Append the extension only where it is missing, so re-running this cell
# does not produce names such as "<id>.png.png".
needs_png = ~new_data['image_id'].str.endswith('.png', na=False)
new_data.loc[needs_png, 'image_id'] = new_data.loc[needs_png, 'image_id'] + '.png'
new_data['image_path'] = new_data['image_id'].apply(lambda x : f'/content/clean_turtle_back/{str(x)}')

In [None]:
# Append ".JPG" only where it is missing so this cell can be re-run safely,
# then derive the on-disk path of every image from its id.
for frame in (train, test_df):
    needs_jpg = ~frame['image_id'].str.endswith('.JPG', na=False)
    frame.loc[needs_jpg, 'image_id'] = frame.loc[needs_jpg, 'image_id'] + '.JPG'
    frame['image_path'] = frame['image_id'].apply(lambda x : DATA_ROOT_DIR / str(x))

# Splitting data

In [None]:
new_data

In [None]:
folds = pd.concat([train, new_data], axis=0, ignore_index=True)

In [None]:
N_SPLITS = 5
encoder = LabelEncoder()

encoder.fit(folds['turtle_id'])

folds["label"] = encoder.transform(folds['turtle_id'])

skf = StratifiedKFold(n_splits=N_SPLITS, random_state=42, shuffle=True)


for fold, (_, val_) in enumerate(skf.split(X=folds, y=folds.turtle_id)):
    folds.loc[val_, "kfold"] = int(fold)
    
folds.drop('label',axis=1,inplace=True)
folds.head()

In [None]:
folds['kfold'] = folds['kfold'].astype(int)

In [None]:
folds

In [None]:
seed=42
set_seed(seed, reproducible=True)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True
# `torch.use_deterministic_algorithms` is a function: assigning `True` to it only
# clobbers the attribute and enables nothing. Call it where the installed torch
# provides it (the torch 1.6 pinned above does not).
if callable(getattr(torch, "use_deterministic_algorithms", None)):
    try:
        # warn_only keeps ops without a deterministic kernel usable.
        torch.use_deterministic_algorithms(True, warn_only=True)
    except TypeError:
        # Older signature has no `warn_only` and would hard-fail on such ops
        # during training; rely on the cuDNN flag above instead.
        pass

In [None]:
!ls /content/turtle_recall/images | wc -c

In [None]:
folds

In [None]:
# folds['image_id'] = folds['image_id'] + '.png'
# folds
test_df

In [None]:
extra_train

In [None]:
folds

# Creating dataloaders

In [None]:
Val_Fold = 0

def get_x(r):
    "Return the `image_path` entry of row `r` (the model input)."
    image_path = r['image_path']
    return image_path
def get_y(r):
    "Return the `turtle_id` of row `r` wrapped in a one-element list."
    labels = [r['turtle_id']]
    return labels

def splitter(df):
  """Split `df` into train/validation index lists by its `kfold` column.

  Rows whose `kfold` equals the module-level `Val_Fold` go to validation,
  all other rows go to training.

  Returns:
    `[train_indices, valid_indices]`, each a list of `df.index` labels.
  """
  is_train = df.kfold != Val_Fold
  is_valid = df.kfold == Val_Fold
  return [df.index[is_train].tolist(), df.index[is_valid].tolist()]

def create_dls(df=folds, bs=8,Val_Fold=0,Image_size=384): # b5_ns 512
    """Build fastai `DataLoaders` for one cross-validation fold.

    Args:
        df: frame with `image_path`, `turtle_id` and `kfold` columns.
        bs: batch size.
        Val_Fold: value of `kfold` held out as the validation set.
        Image_size: final size produced by the batch augmentations.

    Returns:
        The `DataLoaders` built from `df`.
    """
    def _fold_splitter(frame):
        # Use the `Val_Fold` argument; the module-level `splitter` reads the
        # global constant and would always hold out the same fold.
        train_idx = frame.index[frame.kfold != Val_Fold].tolist()
        valid_idx = frame.index[frame.kfold == Val_Fold].tolist()
        return [train_idx, valid_idx]

    dblock = DataBlock(blocks = (ImageBlock,MultiCategoryBlock(vocab=classes)),
                       get_x = get_x,
                       get_y = get_y ,
                       splitter = _fold_splitter,
                       #item_tfms = [Resize(Image_size,method=ResizeMethod.Squish)],
                       item_tfms = [Resize(768)], # b5_ns 1024
                       batch_tfms =[*aug_transforms(size=Image_size,
                                                    # do_flip=True, flip_vert=True, max_rotate=5.0,
                                                    # p_affine=0.5, p_lighting=0.5, max_zoom=1.08,
                                                    # max_lighting=0.2, max_warp=0.2
                                                    ),
                                     Normalize.from_stats(*imagenet_stats)]
                      )

    # Build from the frame that was passed in, not the global `folds`.
    dls = dblock.dataloaders(df,bs=bs)

    return dls

In [None]:
dls = create_dls(df=folds)

In [None]:
dls.show_batch()

In [None]:
# Map each class index to its label, in `dls.vocab` order.
target_map = dict(enumerate(dls.vocab))

# Arc Face loss

In [None]:
# From https://github.com/lyakaap/Landmark2019-1st-and-3rd-Place-Solution/blob/master/src/modeling/metric_learning.py
# Added type annotations, device, and 16bit support
class ArcMarginLoss(Module):
    """ArcFace-style additive angular margin loss that owns the class weights.

    Embeddings and the rows of `weight` are L2-normalised, so their product is
    a cosine similarity. The margin `m` is added to the angle of the target
    class, the result is scaled by `s` and fed to cross-entropy.

    Args:
        in_features: embedding dimension.
        out_features: number of classes.
        s: scale applied to the margin-adjusted cosines.
        m: angular margin in radians.
        easy_margin: if True, apply the margin only where `cosine > 0`.
        ls_eps: label-smoothing factor; `0` disables smoothing.

    NOTE(review): `weight` lives on the loss module, not on the model; confirm
    that it is actually handed to the optimizer by the training setup.
    """
    def __init__(
        self,
        in_features: int,
        out_features: int,
        s: float,
        m: float,
        easy_margin: bool,
        ls_eps: float,
    ):
        self.in_features = in_features
        self.out_features = out_features
        self.s = s
        self.m = m
        self.ls_eps = ls_eps  # label smoothing
        self.weight = nn.Parameter(torch.FloatTensor(out_features, in_features))
        nn.init.xavier_uniform_(self.weight)
        self.easy_margin = easy_margin
        self.cos_m = math.cos(m)
        self.sin_m = math.sin(m)
        # Threshold / fallback used when theta + m would exceed pi.
        self.th = math.cos(math.pi - m)
        self.mm = math.sin(math.pi - m) * m

    def forward(self, logits, targ):
        """Return the scalar loss.

        Args:
            logits: embeddings, one row per sample.
            targ: per-class target rows (one-hot style); the class index is
                recovered with `argmax(dim=1)`.
        """
        cosine = F.linear(F.normalize(logits), F.normalize(self.weight))
        # Enable 16 bit precision
        cosine = cosine.to(torch.float32)
        # Rounding can push |cosine| slightly above 1, which would make the
        # radicand negative and the loss NaN; the small floor also keeps the
        # gradient of sqrt finite.
        sine = torch.sqrt((1.0 - torch.pow(cosine, 2)).clamp(min=1e-6))
        phi = cosine * self.cos_m - sine * self.sin_m
        if self.easy_margin:
            phi = torch.where(cosine > 0, phi, cosine)
        else:
            phi = torch.where(cosine > self.th, phi, cosine - self.mm)

        if self.ls_eps > 0:
            targ = (1 - self.ls_eps) * targ + self.ls_eps / self.out_features
        # Margin-adjusted cosine for the target class, plain cosine elsewhere.
        output = (targ * phi) + ((1.0 - targ) * cosine)
        output *= self.s
        loss =  F.cross_entropy(output, torch.argmax(targ, dim=1))
        return loss

In [None]:
class ArcMarginProduct(nn.Module):
    """ArcFace head: cosine logits with an additive angular margin on the target class."""

    def __init__(self, in_features, out_features, s=30.0,
                 m=0.50, easy_margin=False, ls_eps=0.0):
        '''
        in_features: dimension of the input
        out_features: dimension of the last layer (in our case the classification)
        s: scale applied to the margin-adjusted cosines
        m: margin
        easy_margin: if True, apply the margin only where cosine > 0
        ls_eps: label smoothing'''

        super(ArcMarginProduct, self).__init__()
        self.in_features, self.out_features = in_features, out_features
        self.s = s
        self.m = m
        self.ls_eps = ls_eps
        self.weight = nn.Parameter(torch.FloatTensor(out_features, in_features))
        # Fills the input `Tensor` with values according to the method described in
        # `Understanding the difficulty of training deep feedforward neural networks`
        # Glorot, X. & Bengio, Y. (2010)
        # using a uniform distribution.
        nn.init.xavier_uniform_(self.weight)

        self.easy_margin = easy_margin
        self.cos_m, self.sin_m = math.cos(m), math.sin(m)
        self.th = math.cos(math.pi - m)
        self.mm = math.sin(math.pi - m) * m

    def forward(self, input, label):
        """Return scaled logits of shape `(batch, out_features)`.

        Args:
            input: embeddings, one row per sample.
            label: class index of every sample.
        """
        # --------------------------- cos(theta) & phi(theta) ---------------------
        cosine = F.linear(F.normalize(input), F.normalize(self.weight))
        # Rounding can push |cosine| slightly above 1; floor the radicand so the
        # square root (and its gradient) stays finite instead of turning into NaN.
        sine = torch.sqrt((1.0 - torch.pow(cosine, 2)).clamp(min=1e-6))
        phi = cosine * self.cos_m - sine * self.sin_m
        if self.easy_margin:
            phi = torch.where(cosine > 0, phi, cosine)
        else:
            phi = torch.where(cosine > self.th, phi, cosine - self.mm)
        # --------------------------- convert label to one-hot ---------------------
        # Allocate on the device of the logits rather than a hard-coded 'cuda'.
        one_hot = torch.zeros(cosine.size(), device=cosine.device)
        one_hot.scatter_(1, label.view(-1, 1).long(), 1)
        if self.ls_eps > 0:
            one_hot = (1 - self.ls_eps) * one_hot + self.ls_eps / self.out_features
        # -------------torch.where(out_i = {x_i if condition_i else y_i) ------------
        output = (one_hot * phi) + ((1.0 - one_hot) * cosine)
        output *= self.s

        return output

In [None]:
# https://github.com/haqishen/Google-Landmark-Recognition-2020-3rd-Place-Solution/blob/main/landmark-recognition-2020-third-place-submission.ipynb
import math

class ArcMarginProduct_subcenter(nn.Module):
    """Cosine head with `k` sub-centres per class.

    Every class owns `k` consecutive weight rows; the forward pass returns, for
    each class, the largest cosine similarity among its sub-centres.
    """

    def __init__(self, in_features, out_features, k=3):
        super().__init__()
        self.k = k
        self.out_features = out_features
        self.weight = nn.Parameter(torch.FloatTensor(out_features*k, in_features))
        self.reset_parameters()

    def reset_parameters(self):
        """Fill `weight` uniformly in `[-1/sqrt(in_features), 1/sqrt(in_features)]`."""
        bound = 1. / math.sqrt(self.weight.size(1))
        self.weight.data.uniform_(-bound, bound)

    def forward(self, features):
        """Return per-class cosines of shape `(batch, out_features)`."""
        unit_features = F.normalize(features)
        unit_centres = F.normalize(self.weight)
        # (batch, out_features * k) -> (batch, out_features, k)
        per_centre = F.linear(unit_features, unit_centres).view(-1, self.out_features, self.k)
        return per_centre.max(dim=2).values

In [None]:
# src: https://amaarora.github.io/2020/08/30/gempool.html

class GeM(nn.Module):
    """Generalized-mean pooling over the full spatial extent with a learnable exponent `p`."""

    def __init__(self, p=3, eps=1e-6):
        super().__init__()
        self.eps = eps
        self.p = nn.Parameter(torch.ones(1)*p)

    def forward(self, x):
        return self.gem(x, p=self.p, eps=self.eps)

    def gem(self, x, p=3, eps=1e-6):
        """Clamp `x` at `eps`, raise to `p`, average over H x W, take the `1/p` root."""
        height, width = x.size(-2), x.size(-1)
        powered = x.clamp(min=eps).pow(p)
        # A single pooling window covering the whole feature map.
        pooled = F.avg_pool2d(powered, (height, width))
        return pooled.pow(1./p)

    def __repr__(self):
        exponent = self.p.data.tolist()[0]
        return f"{self.__class__.__name__}(p={exponent:.4f}, eps={self.eps})"

# Creating the model

In [None]:
'''
class building_model(Module):
    def __init__(self,embedding_size:int,arch:str ='efficientnet_b0', pretrained:bool=True):
        self.model = create_model(arch, pretrained=pretrained)
        self.embedding = nn.Linear(self.model.get_classifier().in_features, embedding_size)
        self.model.reset_classifier(num_classes=0, global_pool="avg")
    def forward(self,x):
        features = self.model(x)
        embeddings = self.embedding(features)
        return embeddings
'''

In [None]:
#emb_size = 512
#model = building_model(emb_size,'efficientnet_b0',True)
#model = nn.Sequential(*list(model.children()))

In [None]:
def create_timm_body(arch:str, pretrained=True, drop_rate=0.0, cut=None, n_in=3):
    """Creates a body from any model in the `timm` library.

    Args:
        arch: timm architecture name passed to `create_model`.
        pretrained: whether to load pretrained weights.
        drop_rate: dropout rate passed to `create_model`.
        cut: index at which to cut the model's children, or a callable that
            receives the model and returns the body. When `None`, the body is
            cut at the last child for which `has_pool_type` is true.
        n_in: number of input channels, passed to `_update_first_layer`.

    Raises:
        TypeError: if `cut` is neither an integer nor a callable.
    """
    model = create_model(arch, pretrained=pretrained, drop_rate=drop_rate,
                         num_classes=0, global_pool='')
    _update_first_layer(model, n_in, pretrained)
    if cut is None:
        ll = list(enumerate(model.children()))
        cut = next(i for i,o in reversed(ll) if has_pool_type(o))
    if isinstance(cut, int): return nn.Sequential(*list(model.children())[:cut])
    elif callable(cut): return cut(model)
    # `NamedError` does not exist; raise a real exception type with the same message.
    else: raise TypeError("cut must be either integer or function")
    

In [None]:
emb_size = 512 # 2048 used b5_ns
body = create_timm_body('tf_efficientnet_b7', pretrained=True)
nf = num_features_model(nn.Sequential(*body.children()))
head = nn.Sequential(GeM(),nn.Flatten(),
                     nn.Linear(nf,emb_size,bias=False))

model = nn.Sequential(body, head)

In [None]:
# loss_func =  ArcFaceLoss()

# loss_func = ArcMarginProduct_subcenter(in_features = emb_size, out_features=dls.c)

loss_func = ArcMarginLoss(in_features=emb_size, out_features=dls.c,s=30.0,
                          m = 0.3, easy_margin=False, ls_eps=0.0)

In [None]:
dls = create_dls(df=folds, bs=4,Image_size=512)
learn = Learner(dls, model, 
                loss_func=loss_func, 
                # metrics = [accuracy_multi],
                splitter=default_split
                ).to_fp16()
learn.freeze()

learn.summary()

In [None]:
'''
def accuracy_multi(inp, targ, thresh=0.5, sigmoid=True):
     "Compute accuracy when `inp` and `targ` are the same size."
     if sigmoid: inp = inp.sigmoid()
     return ((inp>thresh)==targ.bool()).float().mean()
'''

In [None]:
learn.lr_find()

In [None]:
learn.fit_one_cycle(1,lr_max=5e-4, wd = 1e-6)

In [None]:
learn.unfreeze()
learn.fit_one_cycle(9,lr_max=5e-4)

In [None]:
learn.export('effnet_b7.pkl')

In [None]:
!pwd

In [None]:
!cp effnet_b7.pkl /content/drive/MyDrive/TurtleChallenge/models/

In [None]:
del learn
torch.cuda.empty_cache()
gc.collect()

In [None]:
folds

In [None]:
torch.cuda.empty_cache()
gc.collect()

In [None]:
def load_eval_learner(learner_path,output_group,dls,device):
    """Load an exported learner for inference and hook one of its layer groups.

    Args:
        learner_path: path of the exported learner file.
        output_group: index into `learn.model` of the module to hook.
        dls: dataloaders to attach to the loaded learner.
        device: device the model is moved to.

    Returns:
        `(learn, hook)`; the hook's function returns the module output unchanged.
    """
    # NOTE: exported learners are pickles; only load files from a trusted source.
    eval_learner = load_learner(learner_path)
    eval_learner.dls = dls
    eval_learner.model.to(device)

    def _keep_output(m, i, o):
        return o

    output_hook = Hook(eval_learner.model[output_group], _keep_output)
    return eval_learner, output_hook

def load_dataloaders(train_df,test_df,val_fold):
  """Build the train, validation and test dataloaders for one fold.

  Rows of `train_df` with `kfold == val_fold` form the validation set and the
  remaining rows the training set; both loaders are created with labels.

  Returns:
    `(train_dataloader, valid_dataloader, test_dataloader, dls)`.
  """
  # b5_ns bs=8, image_size = 512
  dls = create_dls(df=train_df, Image_size=384, bs=4, Val_Fold=val_fold)

  train_rows = train_df[train_df.kfold != val_fold]
  valid_rows = train_df[train_df.kfold == val_fold]

  return (
      dls.test_dl(train_rows, with_labels=True),
      dls.test_dl(valid_rows, with_labels=True),
      dls.test_dl(test_df),
      dls,
  )

@torch.no_grad()
def get_embeddings(module, dataloader, device, train=True):
    """Run the model over `dataloader` and return L2-normalised embeddings.

    Args:
        module: learner whose `.model` produces the embeddings.
        dataloader: loader yielding `(x, y)` batches when `train` is True and
            `(x,)` batches otherwise; `dataloader.items` must have an
            `image_id` column.
        device: device the batches are moved to.
        train: whether the batches carry targets.

    Returns:
        `(image_names, embeddings, targets)` when `train` is True, where
        `targets` are class labels looked up in `target_map`; otherwise
        `(image_names, embeddings)`.
    """
    # Inference must use running batch-norm statistics and no dropout.
    module.model.eval()

    all_embeddings = []
    all_targets = []
    for batch in tqdm(dataloader):
        if train:
            x, y = batch
            all_targets.append(y.cpu().numpy())
        else:
            (x,) = batch
        embeddings = module.model(x.to(device))
        all_embeddings.append(embeddings.cpu().numpy())

    all_image_names = dataloader.items['image_id'].values
    all_embeddings = normalize(np.vstack(all_embeddings), axis=1, norm="l2")
    if not train:
        return all_image_names, all_embeddings

    # Targets are per-class rows: recover the class index, then its label.
    class_indices = np.argmax(np.concatenate(all_targets), axis=1)
    all_targets = np.array([target_map[idx] for idx in class_indices])
    return all_image_names, all_embeddings, all_targets


def create_and_search_index(embedding_size: int, train_embeddings: np.ndarray, val_embeddings: np.ndarray, k: int):
    """Index `train_embeddings` by inner product and query it with `val_embeddings`.

    Returns:
        `(D, I)`: for every query row, the scores and the indices of its `k`
        best matches in `train_embeddings`.
    """
    inner_product_index = faiss.IndexFlatIP(embedding_size)
    inner_product_index.add(train_embeddings)
    scores, neighbour_ids = inner_product_index.search(val_embeddings, k=k)  # noqa: E741
    return scores, neighbour_ids

def create_val_targets_df(
    train_targets: np.ndarray, val_image_names: np.ndarray, val_targets: np.ndarray
) -> pd.DataFrame:
    """Pair every validation image with its target label.

    Targets that are not in the module-level `classes` list are replaced by
    `"new_turtle"`. `train_targets` is accepted but not used.

    Returns:
        A frame with the columns `image` and `target`.
    """
    pairs = np.stack([val_image_names, val_targets], axis=1)
    val_targets_df = pd.DataFrame(pairs, columns=["image", "target"])

    known = val_targets_df.target.isin(classes)
    val_targets_df["target"] = val_targets_df["target"].where(known, "new_turtle")

    return val_targets_df

def create_distances_df(
    image_names: np.ndarray, targets: np.ndarray, D: np.ndarray, I: np.ndarray, stage: str  # noqa: E741
) -> pd.DataFrame:
    """Turn nearest-neighbour search results into a ranked long-format frame.

    Args:
        image_names: name of every query image.
        targets: labels of the indexed embeddings, looked up through `I`.
        D: scores returned by the index search, one row per query image.
        I: neighbour indices returned by the index search.
        stage: label used in the progress-bar description.

    Returns:
        A frame with the columns `image`, `target` and `distances`, holding the
        best score per (image, target) pair, sorted by score descending.
    """
    per_image = []
    for i, image_name in tqdm(enumerate(image_names), desc=f"Creating {stage}_df"):
        # Build from columns so the scores stay numeric: stacking them with the
        # string targets would convert them to text and make the max / sort
        # below lexicographic (e.g. "5e-05" > "0.9").
        subset_preds = pd.DataFrame({"target": targets[I[i]], "distances": D[i]})
        subset_preds["image"] = image_name
        per_image.append(subset_preds)
    distances_df = pd.concat(per_image).reset_index(drop=True)
    distances_df = distances_df.groupby(["image", "target"]).distances.max().reset_index()
    distances_df = distances_df.sort_values("distances", ascending=False).reset_index(drop=True)
    return distances_df

def get_best_threshold(val_targets_df: pd.DataFrame, valid_df: pd.DataFrame):
    """Pick the threshold in 0.0, 0.1, ..., 1.0 with the best validation score.

    For every threshold the per-image score from `map_per_image` is written
    into `val_targets_df` under a column named after the threshold, so the
    frame is modified in place.

    Returns:
        `(best_th, best_cv)`: the winning threshold and its mean score.
    """
    best_th, best_cv = 0, 0
    for step in range(11):
        th = 0.1 * step
        all_preds = get_predictions(valid_df, threshold=th)

        for i, row in val_targets_df.iterrows():
            val_targets_df.loc[i, th] = map_per_image(row.target, all_preds[row.image])

        cv = val_targets_df[th].mean()

        print(f"th={th} cv={cv}")

        # Strict comparison: on ties the lower threshold is kept.
        if cv > best_cv:
            best_th, best_cv = th, cv

    print(f"best_th={best_th}")
    print(f"best_cv={best_cv}")

    # Adjustment: Since Public lb has nearly 10% 'new_individual' (Be Careful for private LB)
    # val_targets_df["is_new_turtle"] = val_targets_df.target == "new_turtle"
    # val_scores = val_targets_df.groupby("is_new_turtle").mean().T
    # val_scores["adjusted_cv"] = val_scores[True] * 0.1 + val_scores[False] * 0.9
    # best_th = val_scores["adjusted_cv"].idxmax()
    # print(f"best_th_adjusted={best_th}")

    return best_th, best_cv

def get_predictions(df: pd.DataFrame, threshold: float = 0.2):
    """Build up to five ranked guesses per image from a ranked frame.

    Rows are consumed in the order given. The first row seen for an image
    decides whether its target is placed before `"new_turtle"` (score above
    `threshold`) or after it. Later rows append their target until five
    guesses exist. Shorter lists are padded with `"new_turtle"`.

    Returns:
        A dict mapping each image to its list of five guesses.
    """
    predictions = {}
    for i, row in tqdm(df.iterrows(), total=len(df), desc=f"Creating predictions for threshold={threshold}"):
        ranked = predictions.get(row.image)
        if ranked is None:
            # First (best) hit for this image decides where "new_turtle" goes.
            if float(row.distances) > threshold:
                predictions[row.image] = [row.target, "new_turtle"]
            else:
                predictions[row.image] = ["new_turtle", row.target]
        elif len(ranked) != 5:
            ranked.append(row.target)

    for x in tqdm(predictions):
        missing = 5 - len(predictions[x])
        if missing > 0:
            predictions[x].extend(['new_turtle'] * missing)

    return predictions

def map_per_image(label, predictions):
    """Score one image by the reciprocal rank of its true label.

    Parameters
    ----------
    label : string
            The true label of the image
    predictions : list
            Ranked guesses; only the first five are considered

    Returns
    -------
    score : double
            `1 / rank` of `label` within the first five guesses, `0.0` if absent
    """
    top_five = predictions[:5]
    try:
        rank = top_five.index(label) + 1
    except ValueError:
        return 0.0
    return 1 / rank


def create_predictions_df(test_df: pd.DataFrame, best_th: float) -> pd.DataFrame:
    """Format the guesses from `get_predictions` as a two-column frame.

    Returns:
        A frame with the columns `image` and `predictions`, where the guesses
        of each image are joined into one space-separated string.
    """
    guesses = pd.Series(get_predictions(test_df, best_th))
    joined = guesses.map(" ".join)
    predictions = joined.rename_axis("image").reset_index(name="predictions")

    return predictions

In [None]:
def infer(
    model_path: str,
    train_df: pd.DataFrame = folds,
    test_df: pd.DataFrame = test_df,
    val_fold: int = 0,
    k: int = 50,
    emb_size:int = emb_size
):
    """Embed all images with an exported learner and write a submission for one fold.

    Steps: embed train / validation / test images, tune the "new_turtle"
    threshold on the validation fold, then search the test embeddings against
    train + validation and write `submission_{val_fold}.csv`.

    Args:
        model_path: path of the exported learner.
        train_df: labelled frame with a `kfold` column.
        test_df: frame of test images.
        val_fold: fold used as the validation set.
        k: number of neighbours retrieved per query.
        emb_size: dimension of the embeddings, used to size the search index.
    """
    device = torch.device("cuda")
    # Use the frame that was passed in rather than the global `folds`.
    train_dl, val_dl, test_dl,dls = load_dataloaders(train_df,test_df,val_fold)
    (learn, hook) = load_eval_learner(model_path,1,dls,device)

    train_image_names, train_embeddings, train_targets = get_embeddings(learn, train_dl,device)
    val_image_names, val_embeddings, val_targets = get_embeddings(learn, val_dl,device)
    test_image_names, test_embeddings = get_embeddings(learn, test_dl,device,train=False)

    D, I = create_and_search_index(emb_size, train_embeddings, val_embeddings, k)  # noqa: E741
    print("Created index with train_embeddings")

    val_targets_df = create_val_targets_df(train_targets, val_image_names, val_targets)
    print(f"val_targets_df=\n{val_targets_df.head()}")
    print(f"val_targets_df shape=\n{val_targets_df.shape}")
    val_df = create_distances_df(val_image_names, train_targets, D, I, "val")
    print(f"val_df=\n{val_df.head()}")
    print(f"val_df shape=\n{val_df.shape}")
    best_th, best_cv = get_best_threshold(val_targets_df, val_df)
    print(f"val_targets_df=\n{val_targets_df.describe()}")
    print(f"val_targets_df shape=\n{val_targets_df.shape}")
    # The validation fold joins the gallery for the final test search.
    train_embeddings = np.concatenate([train_embeddings, val_embeddings])
    train_targets = np.concatenate([train_targets, val_targets])
    print("Updated train_embeddings and train_targets with val data")

    D, I = create_and_search_index(emb_size, train_embeddings, test_embeddings, k)  # noqa: E741
    print("Created index with train_embeddings")

    test_distances_df = create_distances_df(test_image_names, train_targets, D, I, "test")
    print(f"test_df=\n{test_distances_df.head()}")
    print(f"test_df shape=\n{test_distances_df.shape}")

    predictions = create_predictions_df(test_distances_df, best_th)
    print(f"predictions.head()={predictions.head()}")
    print(f"predictions shape={predictions.shape}")

    # Fix missing predictions
    # From https://www.kaggle.com/code/jpbremer/backfins-arcface-tpu-effnet/notebook
    # public_predictions = pd.read_csv(PUBLIC_SUBMISSION_CSV_PATH)
    # ids_without_backfin = np.load(IDS_WITHOUT_BACKFIN_PATH, allow_pickle=True)

    # ids2 = public_predictions["image"][~public_predictions["image"].isin(predictions["image"])]

    # predictions = pd.concat(
    #     [
    #         predictions[~(predictions["image"].isin(ids_without_backfin))],
    #         public_predictions[public_predictions["image"].isin(ids_without_backfin)],
    #         public_predictions[public_predictions["image"].isin(ids2)],
    #     ]
    # )
    # predictions = predictions.drop_duplicates()

    predictions.to_csv(f'submission_{val_fold}.csv', index=False)

    print(f'Fold {val_fold} done!')

In [None]:
!cp /content/drive/MyDrive/TurtleChallenge/models/effnet_b5_ns.pkl .

In [None]:
folds

In [None]:
# test_data = test_df[test_df['image_location'] == 'right'].reset_index(drop=True)

In [None]:
# data = folds[folds['image_location'] == 'right'].reset_index(drop=True)

In [None]:
for fold_num in range(5):
  
  infer(train_df = folds, test_df = test_df, model_path="effnet_b5_ns.pkl", val_fold=fold_num)

In [None]:
sub_files = [
                #  '/content/b5_ns_mixed_data__1_ (1).csv',
                #  '/content/sub_ens_b6.csv',
                #  '/content/sub_ens_3_ensemble_b6_b5_ns_b4.csv',
                #  '/content/submission_5_folds_b4.csv',
                #  '/content/submission_4.csv',
                  '/content/final_blend.csv',
                  '/content/sub_ens.csv',
                  # '/content/sub_vit.csv',
                  # '/content/orient_3_b4_ns_back.csv',
                  # '/content/sub_ens_densenet2.csv',
             

]

# Weights of the individual subs
sub_weight = [
                # 0.728**2,
                # 0.689**2,
                # 0.749**2,
                # 0.712**2,
                # 0.629**2,
                # 0.770**2,
                0.898**2,
                0.897**2,
                # 0.139**2
            ]

In [None]:
sub = pd.read_csv('/content/orient_3_b3_ns.csv')
sub
# sub['image_id'] = sub['image'].apply(lambda x: x.replace('.JPG', ''))
# sub[['image_id', 'predictions']].to_csv('submission_0.csv', index=False)

In [None]:
sub1 = pd.read_csv('/content/final_blend.csv')
sub1['predictions'] = sub1['prediction1'] + " " + sub1['prediction2']  + " " + sub1['prediction3'] + " " + sub1['prediction4'] + " " + sub1['prediction5']
sub1['image'] = sub1['image_id']
sub1[['image', 'predictions']].to_csv('/content/final_blend.csv', index=False)

In [None]:
sub1

In [None]:
sub2 = pd.read_csv('/content/submit_1fold_dense.csv')
sub2['predictions'] = sub2['prediction1'] + " " + sub2['prediction2']  + " " + sub2['prediction3'] + " " + sub2['prediction4'] + " " + sub2['prediction5']
sub2['image'] = sub2['image_id']
sub2[['image', 'predictions']].to_csv('submit_1fold_dense.csv', index=False)
sub2

In [None]:
# Imported here so the cell does not depend on `csv` leaking in through a star import.
import csv

Hlabel = 'image'
Htarget = 'predictions'
npt = 5
# Rank weights: a 1st-place guess counts 1, 2nd place 1/2, 3rd place 1/3, ...
place_weights = {i: 1 / (i + 1) for i in range(npt)}

print(place_weights)

lg = len(sub_files)
sub = [None]*lg
for i, file in enumerate( sub_files ):

    print("Reading {}: w={} - {}". format(i, sub_weight[i], file))
    # Read the whole file up front so the handle is closed before anything is
    # written (the output file may also be one of the inputs).
    with open(file, "r", newline='') as handle:
        sub[i] = sorted(csv.DictReader(handle), key=lambda d: str(d[Hlabel]))

with open("sub_ens.csv", "w", newline='') as out:
    writer = csv.writer(out)
    writer.writerow([Hlabel,Htarget])

    # Rows are matched by position after sorting each submission by image label.
    for p, row in enumerate(sub[0]):
        target_weight = {}
        for s in range(lg):
            row1 = sub[s][p]
            for ind, trgt in enumerate(row1[Htarget].split(' ')):
                target_weight[trgt] = target_weight.get(trgt,0) + (place_weights[ind]*sub_weight[s])
        tops_trgt = sorted(target_weight, key=target_weight.get, reverse=True)[:npt]
        writer.writerow([row1[Hlabel], " ".join(tops_trgt)])

In [None]:
sample_submission

In [None]:
result = pd.read_csv('/content/sub_ens.csv')
result

In [None]:
pred_list = result.predictions.apply(lambda x : x.split(" ")).values.tolist()
pred_list[:5]

In [None]:
result_df = pd.DataFrame(pred_list, columns=['prediction1', 'prediction2', 'prediction3', 'prediction4', 'prediction5'])
result_df

In [None]:
result_df['image_id'] = result['image'].copy()

In [None]:
result_df['image_id'] = result_df['image_id'].apply(lambda x : x.replace('.JPG', ''))
result_df

In [None]:
result_df = result_df[['image_id','prediction1',	'prediction2','prediction3','prediction4','prediction5']].copy()
result_df

In [None]:
result_df.isna().sum()

In [None]:
# result_df.fillna('new_turtle', inplace=True)

In [None]:
result_df.to_csv('final_blend_7.csv', index=False)

In [None]:
result_df.shape

In [None]:
final_sub = pd.concat([
                      pd.read_csv('/content/sub_left.csv'),
                      pd.read_csv('/content/sub_right.csv'),
                      pd.read_csv('/content/sub_top.csv'), 
                      ], axis=0)
final_sub

In [None]:
final_sub['image_id'] = final_sub['image_id'].apply(lambda x : x.replace('.JPG',''))
final_sub

In [None]:
final_sub.to_csv('orient_3_b4_ns_back.csv', index=False)

In [None]:
'''
bs = 32
for Fold in range(N_SPLITS):
    print(f'Epoch-{Fold}')
    dls = create_dls(Val_Fold=Fold,bs=bs)
    learn = cnn_learner(dls, resnet18, metrics=partial(accuracy_multi,thresh=0.2))
    learn = learn.to_fp16()
    learn.fit_one_cycle(1,lr_max=slice(2.2e-6, 2e-4))
    learn.export(f'learn{Fold}.pkl')
    del learn
    torch.cuda.empty_cache()
    gc.collect()
'''