# Private 0.51+ and Public 0.54+ kernel with some straightforward methodologies


**How we approach:**

When this competition first launched, we just fiddled and played around with the parameters of the original baseline. 

After several weeks, thanks to [Ragnar's](http://www.kaggle.com/ragnar123) awesome kernel for training Effnet, we found some practical ways to train our own models. After reading Keetar's fantastic writeup of his GLD retrieval solution, we first trained our Effnet B6 and B7 with 384-sized images. CV is 0.84 and 0.85 respectively. Then we used larger 512-sized images to further fine-tune our B6 and B7. The training environment was Colab Pro. Then we simply fed the ensembled predictions of the two models into the global feature extraction. 

In [1]:
# Install Keras-Applications and efficientnet from the wheel files stored
# under ../input/glrec2020.
!pip install ../input/glrec2020/Keras_Applications-1.0.8-py3-none-any.whl
!pip install ../input/glrec2020/efficientnet-1.1.0-py3-none-any.whl

Processing /kaggle/input/glrec2020/Keras_Applications-1.0.8-py3-none-any.whl
Installing collected packages: Keras-Applications
Successfully installed Keras-Applications-1.0.8
Processing /kaggle/input/glrec2020/efficientnet-1.1.0-py3-none-any.whl
Installing collected packages: efficientnet
Successfully installed efficientnet-1.1.0


In [2]:
import operator
import gc
import pathlib
import shutil
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import backend as K
from scipy import spatial
import cv2
import efficientnet.tfkeras as efn
import math
import copy
import csv
import os
import numpy as np
import PIL
import pydegensac

In [3]:
# Number of classes given to the ArcMarginProduct head of each model.
NUMBER_OF_CLASSES = 81313
# NOTE(review): not referenced by the visible code; the model builders
# hard-code a (384, 384, 3) input shape instead.
IMAGE_SIZE = [384, 384]
# Learning rate handed to the Adam optimizer when the models are compiled.
LR = 0.0001

In [4]:
class ArcMarginProduct(tf.keras.layers.Layer):
    '''
    Implements large margin arc distance.

    The layer takes a pair ``[X, y]`` (embeddings and integer class ids),
    computes the cosine between each L2-normalised embedding and each
    L2-normalised column of the weight matrix ``W``, applies the additive
    angular margin ``m`` to the column selected by ``y`` and multiplies the
    result by the scale ``s``.

    Args:
        n_classes: number of columns of ``W`` (one per class).
        s: scale applied to the final output.
        m: angular margin.
        easy_margin: if true, the margin is only applied where the cosine
            is positive; otherwise it is applied where the cosine exceeds
            ``cos(pi - m)`` and ``cosine - sin(pi - m) * m`` is used
            elsewhere.
        ls_eps: label-smoothing factor applied to the one-hot labels when
            greater than zero.

    Reference:
        https://arxiv.org/pdf/1801.07698.pdf
        https://github.com/lyakaap/Landmark2019-1st-and-3rd-Place-Solution/
            blob/master/src/modeling/metric_learning.py
    '''
    def __init__(self, n_classes, s=30, m=0.50, easy_margin=False,
                 ls_eps=0.0, **kwargs):

        super(ArcMarginProduct, self).__init__(**kwargs)

        self.n_classes = n_classes
        self.s = s
        self.m = m
        self.ls_eps = ls_eps
        self.easy_margin = easy_margin
        # Constants derived from the margin, computed once.
        self.cos_m = tf.math.cos(m)
        self.sin_m = tf.math.sin(m)
        self.th = tf.math.cos(math.pi - m)
        self.mm = tf.math.sin(math.pi - m) * m

    def get_config(self):
        '''Returns the base layer config extended with the constructor args.'''

        config = super().get_config().copy()
        config.update({
            'n_classes': self.n_classes,
            's': self.s,
            'm': self.m,
            'ls_eps': self.ls_eps,
            'easy_margin': self.easy_margin,
        })
        return config

    def build(self, input_shape):
        '''Creates ``W`` with shape (embedding_dim, n_classes).

        ``input_shape`` is indexed as a pair; only the first entry (the
        embedding shape) is used.
        '''
        super(ArcMarginProduct, self).build(input_shape[0])

        self.W = self.add_weight(
            name='W',
            shape=(int(input_shape[0][-1]), self.n_classes),
            initializer='glorot_uniform',
            dtype='float32',
            trainable=True,
            regularizer=None)

    def call(self, inputs):
        '''Returns the scaled, margin-adjusted cosine for every class.'''
        X, y = inputs
        y = tf.cast(y, dtype=tf.int32)
        cosine = tf.matmul(
            tf.math.l2_normalize(X, axis=1),
            tf.math.l2_normalize(self.W, axis=0)
        )
        # Floating-point rounding can push cosine**2 slightly above 1, which
        # would make the square root NaN; clamp the radicand at zero.
        sine = tf.math.sqrt(
            tf.math.maximum(1.0 - tf.math.pow(cosine, 2), 0.0))
        # cos(theta + m) = cos(theta) * cos(m) - sin(theta) * sin(m)
        phi = cosine * self.cos_m - sine * self.sin_m
        if self.easy_margin:
            phi = tf.where(cosine > 0, phi, cosine)
        else:
            phi = tf.where(cosine > self.th, phi, cosine - self.mm)
        one_hot = tf.cast(
            tf.one_hot(y, depth=self.n_classes),
            dtype=cosine.dtype
        )
        if self.ls_eps > 0:
            one_hot = (1 - self.ls_eps) * one_hot + self.ls_eps / self.n_classes

        # Margin-adjusted value for the labelled class, plain cosine elsewhere.
        output = (one_hot * phi) + ((1.0 - one_hot) * cosine)
        output *= self.s
        return output


# Functions to build our models for fine-tuning (EfficientNet backbones).
def _build_arcface_model(backbone_fn):
    """Builds and compiles a backbone + ArcMarginProduct classifier.

    Args:
        backbone_fn: constructor of the convolutional backbone, called as
            ``backbone_fn(weights=None, include_top=False)``.

    Returns:
        A compiled two-input Keras model (image ``inp1``, label ``inp2``)
        whose output is a softmax over ``NUMBER_OF_CLASSES`` classes.
    """
    # Layers are created in the same order as before. Code elsewhere in this
    # file selects an intermediate layer by position (``layers[-4]``), so the
    # layer sequence must not change.
    margin = ArcMarginProduct(
        n_classes=NUMBER_OF_CLASSES,
        s=64,
        m=0.15,
        name='head/arc_margin',
        dtype='float32'
    )

    inp = tf.keras.layers.Input(shape=(384, 384, 3), name='inp1')
    label = tf.keras.layers.Input(shape=(), name='inp2')
    backbone_features = backbone_fn(weights=None, include_top=False)(inp)
    x = tf.keras.layers.GlobalAveragePooling2D()(backbone_features)
    x = tf.keras.layers.Dropout(0.3)(x)
    x = tf.keras.layers.Dense(512)(x)
    x = margin([x, label])

    output = tf.keras.layers.Softmax(dtype='float32')(x)

    model = tf.keras.models.Model(inputs=[inp, label], outputs=[output])

    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=LR),
        loss=[tf.keras.losses.SparseCategoricalCrossentropy()],
        metrics=[tf.keras.metrics.SparseCategoricalAccuracy()]
    )

    return model


def get_model_B6():
    """Returns the compiled ArcFace classifier with an EfficientNetB6 backbone."""
    return _build_arcface_model(efn.EfficientNetB6)


def get_model_B7():
    """Returns the compiled ArcFace classifier with an EfficientNetB7 backbone."""
    return _build_arcface_model(efn.EfficientNetB7)

In [5]:
# Only the B7 model is active; the B6 lines are kept commented out.
# MODEL1 = get_model_B6()
MODEL2 = get_model_B7()

# MODEL1.load_weights('../input/effb6-512-ep18/effb6model512-18.h5')
# MODEL1 = tf.keras.models.Model(inputs = MODEL1.input[0], outputs = MODEL1.layers[-4].output)

# Load the trained weights, then rebuild the model so that it takes only the
# first (image) input and outputs the activations of the fourth-from-last
# layer, dropping the label input and the layers after it.
MODEL2.load_weights('../input/effb7-512-ep12/effb7model512-12.h5')
MODEL2 = tf.keras.models.Model(inputs = MODEL2.input[0], outputs = MODEL2.layers[-4].output)

In [6]:
# Print the layer table of the truncated embedding model.
MODEL2.summary()

Model: "functional_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
inp1 (InputLayer)            [(None, 384, 384, 3)]     0         
_________________________________________________________________
efficientnet-b7 (Functional) (None, None, None, 2560)  64097680  
_________________________________________________________________
global_average_pooling2d (Gl (None, 2560)              0         
_________________________________________________________________
dropout (Dropout)            (None, 2560)              0         
_________________________________________________________________
dense (Dense)                (None, 512)               1311232   
Total params: 65,408,912
Trainable params: 65,098,192
Non-trainable params: 310,720
_________________________________________________________________


In [7]:
# Number of nearest training images kept per test image for re-ranking.
# NOTE(review): assigned again, with the same value, in a later cell.
NUM_TO_RERANK = 3 #originally 5
# NOTE(review): not referenced by the visible code.
NUM_PUBLIC_TEST_IMAGES = 10345 # Used to detect if in session or re-run.

In [8]:
def read_image(image_path, size = (384, 384)):
    """Loads an image file and returns a (1, 512, 512, 3) float32 tensor.

    The image is read with OpenCV, resized to `size`, round-tripped through
    an in-memory JPEG (quality 100), resized to 512x512 and scaled to [0, 1].

    Args:
        image_path: path of the image file, as a string.
        size: (width, height) passed to `cv2.resize` before JPEG encoding.

    Returns:
        A float32 tensor of shape [1, 512, 512, 3].

    Raises:
        ValueError: if OpenCV cannot read `image_path`.
    """
    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread signals failure by returning None; fail with a clear
        # message instead of an opaque error from cvtColor.
        raise ValueError(f'Could not read image: {image_path}')
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    img = cv2.resize(img, size)
    # NOTE(review): the array is in RGB order here but cv2.imencode treats
    # its input as BGR, so the decoded tensor likely has R and B swapped.
    # Left unchanged because it presumably matches the training pipeline --
    # confirm before touching.
    # `tobytes()` replaces the deprecated `tostring()` alias.
    img = cv2.imencode('.jpg', img, (cv2.IMWRITE_JPEG_QUALITY, 100))[1].tobytes()
    img = tf.image.decode_jpeg(img, channels = 3)
    img = tf.image.resize(img, (512, 512))
    img = tf.cast(img, tf.float32) / 255.0
    img = tf.reshape(img, [1, 512, 512, 3])
    return img

In [9]:
# Root directory holding the attached input datasets.
INPUT_DIR = os.path.join('..', 'input')

# Competition dataset layout: test/train image directories and the CSV that
# maps training image ids to landmark ids.
DATASET_DIR = os.path.join(INPUT_DIR, 'landmark-recognition-2021')
TEST_IMAGE_DIR = os.path.join(DATASET_DIR, 'test')
TRAIN_IMAGE_DIR = os.path.join(DATASET_DIR, 'train')
TRAIN_LABELMAP_PATH = os.path.join(DATASET_DIR, 'train.csv')

In [10]:
# main() compares the number of rows in train.csv against this value and
# writes the sample submission instead of predicting when they are equal.
NUM_PUBLIC_TRAIN_IMAGES = 1580470 # Used to detect if in session or re-run.
# NOTE(review): not referenced by the visible code.
MAX_NUM_EMBEDDINGS = -1  # Set to > 1 to subsample dataset while debugging.

# Retrieval & re-ranking parameters:
# NUM_TO_RERANK: nearest training images kept per test image.
NUM_TO_RERANK = 3
TOP_K = 3 #Number of retrieved images used to make prediction for a test image.

# RANSAC parameters:
# MAX_INLIER_SCORE caps the inlier count before it is normalised to [0, 1];
# the remaining values are passed to pydegensac.findHomography().
MAX_INLIER_SCORE = 25
MAX_REPROJECTION_ERROR = 7.0
# MAX_RANSAC_ITERATIONS = 1000000
MAX_RANSAC_ITERATIONS = 10000
HOMOGRAPHY_CONFIDENCE = 0.99

debug_mode = False

# Debug mode shrinks the limits above to tiny values.
if debug_mode:
    MAX_NUM_EMBEDDINGS = 10
    NUM_PUBLIC_TRAIN_IMAGES = 10

In [11]:
# Load the DELG SavedModel used for local (and optionally global) features.
SAVED_MODEL_DIR = '../input/delg-saved-models/local_and_global'
DELG_MODEL = tf.saved_model.load(SAVED_MODEL_DIR)
# Values fed to the model's 'input_scales:0' and 'input_abs_thres:0' inputs.
DELG_IMAGE_SCALES_TENSOR = tf.convert_to_tensor([0.70710677, 1.0, 2])
DELG_SCORE_THRESHOLD_TENSOR = tf.constant(375.)
DELG_INPUT_TENSOR_NAMES = [
    'input_image:0', 'input_scales:0', 'input_abs_thres:0'
]
# Width of one embedding row in extract_global_features(); 512 matches the
# Dense(512) output of the truncated EfficientNet model.
# NUM_EMBEDDING_DIMENSIONS = 1024
# NUM_EMBEDDING_DIMENSIONS = 2048
# NUM_EMBEDDING_DIMENSIONS = 3072
NUM_EMBEDDING_DIMENSIONS = 512


# Callable from the three named inputs to the DELG global descriptors.
# NOTE(review): only referenced from commented-out code in the visible file.
GLOBAL_FEATURE_EXTRACTION_FN = DELG_MODEL.prune(DELG_INPUT_TENSOR_NAMES,
                                                ['global_descriptors:0'])

# Callable returning local-feature boxes and descriptors; it takes the
# maximum number of features as a fourth input.
LOCAL_FEATURE_NUM_TENSOR = tf.constant(1000)
LOCAL_FEATURE_EXTRACTION_FN = DELG_MODEL.prune(
    DELG_INPUT_TENSOR_NAMES + ['input_max_feature_num:0'],
    ['boxes:0', 'features:0'])

In [12]:
def to_hex(image_id) -> str:
  """Formats `image_id` as lowercase hex, zero-padded to at least 16 digits."""
  return format(image_id, '016x')


def get_image_path(subset, image_id):
  """Returns the path of an image inside the dataset directory.

  Images live under DATASET_DIR/<subset>/<c0>/<c1>/<c2>/<hex id>.jpg, where
  c0..c2 are the first three characters of the 16-digit hex id.
  """
  hex_name = to_hex(image_id)
  shard_dirs = hex_name[:3]
  return os.path.join(DATASET_DIR, subset, *shard_dirs, hex_name + '.jpg')


def load_image_tensor(image_path):
  """Loads the image at `image_path` as an RGB tensor.

  The file is opened in a `with` block so the underlying handle is released
  as soon as the pixel data has been copied into a NumPy array.
  """
  with PIL.Image.open(image_path) as image:
    pixels = np.array(image.convert('RGB'))
  return tf.convert_to_tensor(pixels)

In [13]:
# def extract_global_features(filepaths):
#     image_paths = [x for x in pathlib.Path(filepaths).rglob('*.jpg')]
#     num_images = len(image_paths)
#     ids = num_images * [None]
#     # Generate an empty matrix where we can store the embeddings of each image
#     embeddings = np.empty((num_images, NUM_EMBEDDING_DIMENSIONS))
#     for i, image_path in enumerate(image_paths):
#         ids[i] = int(image_path.name.split('.')[0], 16)
#         image_tensor = read_image(str(image_path), (384, 384)) #384
#         prediction1 = MODEL1.predict(image_tensor)
#         prediction2 = MODEL2.predict(image_tensor)
#         prediction = tf.concat([prediction1, prediction2], 1)
# #         prediction = tf.concat([prediction1, prediction2 , prediction1, prediction2], 1)
        
#         embeddings[i, :] = prediction
#     return ids, embeddings

def extract_global_features(filepaths):
    """Computes one global embedding per '*.jpg' file found under `filepaths`.

    Args:
        filepaths: directory that is searched recursively for '*.jpg' files.

    Returns:
        A pair `(ids, embeddings)`: `ids` holds the integer image ids parsed
        from the hexadecimal file names, and `embeddings` is an array of
        shape (num_images, NUM_EMBEDDING_DIMENSIONS) whose i-th row is the
        MODEL2 prediction for the i-th image.
    """
    image_paths = list(pathlib.Path(filepaths).rglob('*.jpg'))
    ids = []
    # Pre-allocated matrix that receives the embedding of each image.
    embeddings = np.empty((len(image_paths), NUM_EMBEDDING_DIMENSIONS))
    for i, image_path in enumerate(image_paths):
        ids.append(int(image_path.name.split('.')[0], 16))
        # The previous version also decoded every image a second time for the
        # DELG global branch without using the result; that redundant load
        # has been removed.
        image_tensor = read_image(str(image_path), (384, 384))
        embeddings[i, :] = MODEL2.predict(image_tensor)
    return ids, embeddings


def extract_local_features(image_path):
  """Returns DELG keypoints and L2-normalised descriptors for an image.

  The image is enlarged by a factor of 2**0.6 along its first two axes
  before being passed to the local feature extractor.
  """
  source_image = load_image_tensor(image_path)
  upscale = pow(2, 0.6)

  # Sizes of the first two axes of the image tensor, as floats.
  dim0 = tf.cast(tf.shape(source_image)[0], dtype=tf.float32)
  dim1 = tf.cast(tf.shape(source_image)[1], dtype=tf.float32)
  target_size = [
      tf.cast(upscale * dim0, dtype=tf.int32),
      tf.cast(upscale * dim1, dtype=tf.int32),
  ]
  resized = tf.image.resize(source_image, target_size)
  # Cast back to the dtype of the loaded image.
  image_tensor = tf.cast(resized, dtype=source_image.dtype)

  features = LOCAL_FEATURE_EXTRACTION_FN(image_tensor, DELG_IMAGE_SCALES_TENSOR,
                                         DELG_SCORE_THRESHOLD_TENSOR,
                                         LOCAL_FEATURE_NUM_TENSOR)
  boxes = features[0]
  raw_descriptors = features[1]

  # Shape: (N, 2) -- mean of box columns [0, 1] and [2, 3].
  first_corners = tf.gather(boxes, [0, 1], axis=1)
  second_corners = tf.gather(boxes, [2, 3], axis=1)
  keypoints = tf.divide(tf.add(first_corners, second_corners), 2.0).numpy()

  # Shape: (N, 128)
  descriptors = tf.nn.l2_normalize(
      raw_descriptors, axis=1, name='l2_normalization').numpy()

  return keypoints, descriptors

# def extract_local_features(image_path):
#   """Extracts local features for the given `image_path`."""

# #   image_tensor = load_image_tensor(image_path)
#   input_image = load_image_tensor(image_path)
#   width = tf.cast(tf.shape(input_image)[0], dtype=tf.float32)
#   height = tf.cast(tf.shape(input_image)[1], dtype=tf.float32)
    
#   image_tensor = tf.cast(tf.image.resize(input_image, [tf.cast(pow(2,0.6) * width, dtype=tf.int32), tf.cast(pow(2,0.6) * height, dtype = tf.int32)]), dtype = input_image.dtype)

#   features = LOCAL_FEATURE_EXTRACTION_FN(image_tensor, DELG_IMAGE_SCALES_TENSOR,
#                                          DELG_SCORE_THRESHOLD_TENSOR,
#                                          LOCAL_FEATURE_NUM_TENSOR)

#   # Shape: (N, 2)
#   keypoints = tf.divide(
#       tf.add(
#           tf.gather(features[0], [0, 1], axis=1),
#           tf.gather(features[0], [2, 3], axis=1)), 2.0).numpy()

#   # Shape: (N, 128)
#   descriptors = tf.nn.l2_normalize(
#       features[1], axis=1, name='l2_normalization').numpy()

#   return keypoints, descriptors


def get_putative_matching_keypoints(test_keypoints,
                                    test_descriptors,
                                    train_keypoints,
                                    train_descriptors,
                                    max_distance=0.75):
  """Finds matches from `test_descriptors` to KD-tree of `train_descriptors`.

  Args:
    test_keypoints: array with one keypoint row per test descriptor.
    test_descriptors: descriptors of the test image.
    train_keypoints: array with one keypoint row per train descriptor.
    train_descriptors: descriptors of the train image.
    max_distance: maximum descriptor distance for a nearest neighbour to
      count as a match.

  Returns:
    A pair `(test_matching_keypoints, train_matching_keypoints)` of arrays
    with one row per matched test keypoint, aligned row by row. With no
    matches both arrays keep the keypoint row width and have zero rows.
  """

  train_descriptor_tree = spatial.cKDTree(train_descriptors)
  _, matches = train_descriptor_tree.query(
      test_descriptors, distance_upper_bound=max_distance)

  # cKDTree.query marks "no neighbour within the bound" with an index equal
  # to the number of indexed points.
  train_kp_count = train_keypoints.shape[0]
  has_match = matches != train_kp_count

  # One boolean mask replaces two Python-level loops over the keypoints.
  test_matching_keypoints = test_keypoints[has_match]
  train_matching_keypoints = train_keypoints[matches[has_match]]

  return test_matching_keypoints, train_matching_keypoints


def get_num_inliers(test_keypoints, test_descriptors, train_keypoints,
                    train_descriptors):
  """Counts the RANSAC inliers among the putative keypoint matches.

  Returns 0 when there are too few putative matches or when the homography
  estimation raises `np.linalg.LinAlgError`.
  """

  matched_test_kp, matched_train_kp = get_putative_matching_keypoints(
      test_keypoints, test_descriptors, train_keypoints, train_descriptors)

  # `pydegensac.findHomography()` needs more than four correspondences.
  if matched_test_kp.shape[0] < 5:
    return 0

  try:
    _, inlier_mask = pydegensac.findHomography(matched_test_kp,
                                               matched_train_kp,
                                               MAX_REPROJECTION_ERROR,
                                               HOMOGRAPHY_CONFIDENCE,
                                               MAX_RANSAC_ITERATIONS)
  except np.linalg.LinAlgError:  # When det(H)=0, can't invert matrix.
    return 0

  # `astype` already returns a new array, so no explicit copy is needed.
  inlier_total = inlier_mask.astype(np.float32).sum()
  return int(inlier_total)


def get_total_score(num_inliers, global_score):
  """Adds the normalised local score to `global_score`.

  The inlier count is capped at MAX_INLIER_SCORE and divided by it, so the
  local part never exceeds 1.
  """
  if num_inliers > MAX_INLIER_SCORE:
    capped_inliers = MAX_INLIER_SCORE
  else:
    capped_inliers = num_inliers
  return capped_inliers / MAX_INLIER_SCORE + global_score


def rescore_and_rerank_by_num_inliers(test_image_id,
                                      train_ids_labels_and_scores):
  """Rescores the candidate training images with local features and re-sorts.

  Each `(train_id, label, global_score)` entry is replaced in place by
  `(train_id, label, total_score)`; the list is then sorted in place by
  descending score and returned.
  """

  query_keypoints, query_descriptors = extract_local_features(
      get_image_path('test', test_image_id))

  for position, candidate in enumerate(train_ids_labels_and_scores):
    train_image_id, label, global_score = candidate

    candidate_keypoints, candidate_descriptors = extract_local_features(
        get_image_path('train', train_image_id))

    inlier_count = get_num_inliers(query_keypoints, query_descriptors,
                                   candidate_keypoints, candidate_descriptors)
    train_ids_labels_and_scores[position] = (
        train_image_id, label, get_total_score(inlier_count, global_score))

  train_ids_labels_and_scores.sort(key=lambda entry: entry[2], reverse=True)

  return train_ids_labels_and_scores


def load_labelmap():
  """Reads TRAIN_LABELMAP_PATH into a dict from 'id' to 'landmark_id'."""
  labelmap = {}
  with open(TRAIN_LABELMAP_PATH, mode='r') as csv_file:
    for row in csv.DictReader(csv_file):
      labelmap[row['id']] = row['landmark_id']
  return labelmap


def get_prediction_map(test_ids, train_ids_labels_and_scores):
  """Builds a dict from hex test id to its best label and summed score.

  For each test image the scores of its first TOP_K candidates are summed
  per label, and the label with the largest sum is kept.
  """

  prediction_map = {}

  for test_index, test_id in enumerate(test_ids):
    score_by_label = {}
    for _, label, score in train_ids_labels_and_scores[test_index][:TOP_K]:
      score_by_label[label] = score_by_label.get(label, 0) + score

    best_label, best_score = max(score_by_label.items(),
                                 key=lambda item: item[1])

    prediction_map[to_hex(test_id)] = {
        'score': best_score,
        'class': best_label,
    }

  return prediction_map


def get_predictions(labelmap):
  """Gets predictions using embedding similarity and local feature reranking.

  Args:
    labelmap: dict from hex training image id to landmark id.

  Returns:
    A pair `(pre_verification_predictions, post_verification_predictions)`
    of prediction maps: the first is based on global-embedding similarity
    only, the second is computed after local-feature rescoring.
  """

  test_ids, test_embeddings = extract_global_features(TEST_IMAGE_DIR)

  train_ids, train_embeddings = extract_global_features(TRAIN_IMAGE_DIR)

  train_ids_labels_and_scores = [None] * test_embeddings.shape[0]

  # Using (slow) for-loop, as distance matrix doesn't fit in memory.
  for test_index in range(test_embeddings.shape[0]):
    distances = spatial.distance.cdist(
        test_embeddings[np.newaxis, test_index, :], train_embeddings,
        'cosine')[0]

    # np.argpartition requires kth < len(distances); when there are no more
    # training images than NUM_TO_RERANK (e.g. a tiny debug subset), simply
    # keep every candidate instead of raising.
    num_candidates = min(NUM_TO_RERANK, distances.shape[0])
    if num_candidates < distances.shape[0]:
      partition = np.argpartition(distances, num_candidates)[:num_candidates]
    else:
      partition = np.arange(distances.shape[0])

    nearest = sorted([(train_ids[p], distances[p]) for p in partition],
                     key=lambda x: x[1])

    # Convert cosine distance into a similarity score.
    train_ids_labels_and_scores[test_index] = [
        (train_id, labelmap[to_hex(train_id)], 1. - cosine_distance)
        for train_id, cosine_distance in nearest
    ]

  # Drop the large arrays before the local-feature stage.
  del test_embeddings
  del train_embeddings
  del labelmap
  gc.collect()

  pre_verification_predictions = get_prediction_map(
      test_ids, train_ids_labels_and_scores)

  for test_index, test_id in enumerate(test_ids):
    train_ids_labels_and_scores[test_index] = rescore_and_rerank_by_num_inliers(
        test_id, train_ids_labels_and_scores[test_index])

  post_verification_predictions = get_prediction_map(
      test_ids, train_ids_labels_and_scores)

  return pre_verification_predictions, post_verification_predictions


def save_submission_csv(predictions=None):
  """Writes 'submission.csv' in the current working directory.

  Args:
    predictions: dict from hex image id to a dict with keys 'class' and
      'score'. When None, the dataset's sample submission is copied instead.
  """

  if predictions is None:
    # Dummy submission!
    shutil.copyfile(
        os.path.join(DATASET_DIR, 'sample_submission.csv'), 'submission.csv')
    return

  # The csv module requires newline='' so that it controls line endings
  # itself; otherwise platforms that translate newlines emit blank rows.
  with open('submission.csv', 'w', newline='') as submission_csv:
    csv_writer = csv.DictWriter(submission_csv, fieldnames=['id', 'landmarks'])
    csv_writer.writeheader()
    for image_id, prediction in predictions.items():
      label = prediction['class']
      score = prediction['score']
      csv_writer.writerow({'id': image_id, 'landmarks': f'{label} {score}'})


def main():
  """Writes the submission file.

  When train.csv has exactly NUM_PUBLIC_TRAIN_IMAGES rows the sample
  submission is copied; otherwise the full prediction pipeline runs and its
  post-verification predictions are saved.
  """
  labelmap = load_labelmap()
  num_training_images = len(labelmap)
  print(f'Found {num_training_images} training images.')

  if num_training_images != NUM_PUBLIC_TRAIN_IMAGES:
    _, reranked_predictions = get_predictions(labelmap)
    save_submission_csv(reranked_predictions)
    return

  print('Copying sample submission.')
  save_submission_csv()

In [14]:
# Run the pipeline only when executed as a script / notebook cell.
if __name__ == '__main__':
  main()

Found 1580470 training images.
Copying sample submission.
