In [0]:
%tensorflow_version 1.14
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import keras
from keras.applications import ResNet50
from keras.layers import Layer
from keras import regularizers
from keras.engine.topology import Input
from keras.layers import Activation, Add, BatchNormalization, Concatenate, Conv2D, Dense, Flatten, GlobalMaxPooling2D, \
    GlobalAveragePooling2D, Lambda, MaxPooling2D, Reshape
from keras.models import Model
from keras.optimizers import Adam

from collections import Counter

import os
from shutil import copyfile

import os
import random
import shutil
import tarfile
import cv2
import numpy as np
from keras.utils import Sequence

import numpy as np
import cv2
import urllib
import requests
from tqdm import tqdm

import json
import pickle
import sys

import numpy as np
import pandas as pd

from tqdm import tqdm

from keras.models import load_model
from google.colab import files
# src = list(files.upload().values())[0]
# open('keras-vgg16-places365.py','wb').write(src)
# os.chdir("/kaggle/input/keras-vgg16-places365/")
from vgg16_places_365 import VGG16_Places365
# os.chdir("/kaggle/working/")



def check_size(url):
    """
    Helper method to check the size of the file from the url
    """
    r = requests.get(url, stream=True)
    return int(r.headers['Content-Length'])


def download_file(url, filename, bar=True):
    """
    Helper method handling downloading large files from `url` to `filename`. Returns a pointer to `filename`.
    """
    try:
        chunkSize = 1024
        r = requests.get(url, stream=True)
        with open(filename, 'wb') as f:
            if bar:
                pbar = tqdm(unit="B", total=int(r.headers['Content-Length']))
            for chunk in r.iter_content(chunk_size=chunkSize):
                if chunk:  # filter out keep-alive new chunks
                    if bar:
                        pbar.update(len(chunk))
                    f.write(chunk)
        return filename
    except Exception as e:
        print(e)
        return


def download_image_cv2_urllib(url):
    """
    Modifying the url to download the 360p or 720p version actually slows it down.
    """
    try:
        resp = urllib.request.urlopen(url)
        foo = np.asarray(bytearray(resp.read()), dtype="uint8")
        foo = cv2.imdecode(foo, cv2.IMREAD_COLOR)
        foo = cv2.resize(foo, (192, 192), interpolation=cv2.INTER_AREA)
        foo = cv2.cvtColor(foo, cv2.COLOR_BGR2RGB)
        return foo
    except:
        return np.array([])
    
import numpy as np
import tensorflow as tf
import keras.backend as K




def accuracy_class(y_true, y_pred):
    true = K.argmax(y_true, axis=1)
    pred = K.argmax(y_pred, axis=1)
    matches = K.equal(true, pred)
    return K.mean(matches)


def accuracy_class_numpy(y_true, y_pred):
    true = np.argmax(y_true, axis=1)
    pred = np.argmax(y_pred, axis=1)
    matches = true == pred
    return np.mean(matches)


def getConfidence(y_pred):
    y_pred_max = np.reshape(np.amax(y_pred, axis=1), (y_pred.shape[0], 1))

    top5 = np.zeros((y_pred.shape[0], 5))
    max_indices = np.argsort(y_pred, axis=1)[:, ::-1][:, :5]
    for i in range(y_pred.shape[0]):
        top5[i, :] = y_pred[i, max_indices[i, :]]
    diff = y_pred_max - top5
    weights = np.array([[0., 0.35, 0.28, 0.22, 0.15]])
    weighted_diffs = diff * weights
    return np.sum(weighted_diffs, axis=1)


def getOrder(y_pred):
    summ = getConfidence(y_pred)
    summ_indices = np.argsort(summ)[::-1]
    return summ_indices


def MAP_numpy(y_true, y_pred):
    true = np.argmax(y_true, axis=1)
    pred = np.argmax(y_pred, axis=1)
    matches = true == pred

    order = getOrder(y_pred)
    orderedMatches = matches[order]

    correct = 0.
    summ = 0.
    for i in range(y_true.shape[0]):
        correct += int(orderedMatches[i])
        summ += (correct / (i + 1)) * int(orderedMatches[i])
    return summ / y_true.shape[0]


def validateMAP(model, valid_x, valid_y):
    """
    :param model: the model to use
    :param valid_x: numpy array of validation images
    :param valid_y: list of landmarks of the validation images
    :return:
    """
    N = valid_x.shape[0]
    batchsize = 1000
    conf_list = []
    y_pred_list = []
    validM = N // batchsize + int(N % batchsize > 0)
    for i in range(validM):
        preds = model.predict(valid_x[i * batchsize:min(N, (i + 1) * batchsize), :, :, :])
        conf = list(np.amax(preds, axis=1))
        conf_list.extend(conf)
        y_pred = list(np.argmax(preds, axis=1))
        y_pred_list.extend(y_pred)

    matches = list(np.array(y_pred_list) == np.array(valid_y))

    order = list(np.argsort(conf_list)[::-1])
    orderedMatches = [matches[o] for o in order]

    correct = 0.
    summ = 0.
    for i in range(len(orderedMatches)):
        correct += int(orderedMatches[i])
        summ += (correct / (i + 1)) * int(orderedMatches[i])

    print(np.sum(matches))
    print(correct)
    print(summ / len(orderedMatches))


class DataGen(Sequence):
    """
    This generator downloads one tar file at each epoch. Extracts and selects the valid images from it to
    form batches. And after the epoch is complete, deletes the files to free up space.
    """
    def __init__(self, valid_ids_dict, num_classes, start=10, batch_size=128, steps=10, verbose=1):

        self.valid_ids_dict = valid_ids_dict # dict of image ids to landmarks {image_id: landmark_id}
        self.NUM_CLASSES = num_classes # number of valid classes to consider

        self.batch_size = batch_size
        self.steps = steps # should be equal to the number of epochs
        self.images = []
        self.landmarks = []
        self.tar_idx = start
        self.epoch_init()

    def epoch_init(self):
        self.all_images = []
        self.all_landmarks = []

        if self.tar_idx < 10:
            tarfilestr = "00" + str(self.tar_idx)
        elif self.tar_idx < 100:
            tarfilestr = "0" + str(self.tar_idx)
        else:
            tarfilestr = str(self.tar_idx)

        download_file("https://s3.amazonaws.com/google-landmark/train/images_{}.tar".format(tarfilestr), "images.tar",
                      bar=False)
        #print(os.listdir())
        tar = tarfile.open('images.tar')
        tar.extractall("imagesfolder")
        tar.close()

        self.total = self.pickfiles("imagesfolder")
        self.tar_idx += 1
        print("tar", self.tar_idx - 1, "total:", self.total)

    def pickfiles(self, dirr):
        count = 0
        for f in os.listdir(dirr):
            if os.path.isfile(dirr + "/" + f):
                if f[:-4] in self.valid_ids_dict:
                    self.all_images.append(dirr + "/" + f)
                    self.all_landmarks.append(self.valid_ids_dict[f[:-4]])
                    count += 1
            else:
                count += self.pickfiles(dirr + "/" + f)
        return count

    def normalize(self, data):
        return data / 255 - 0.5

    def __getitem__(self, index):
        image_path_list = self.all_images[index * self.batch_size:min(self.total, (index + 1)) * self.batch_size]
        class_list = self.all_landmarks[index * self.batch_size:min(self.total, (index + 1)) * self.batch_size]

        if len(image_path_list) == 0:
            image_path_list = self.all_images[:self.batch_size]
            class_list = self.all_landmarks[:self.batch_size]

        images = []
        y_list = []
        for ix in range(len(image_path_list)):
            try:
                image_path = image_path_list[ix]
                im = cv2.imread(image_path)
                im = cv2.resize(im, (192, 192), interpolation=cv2.INTER_AREA)
                im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)
                if im.shape == (192, 192, 3):
                    images.append(im)
                    y_list.append(class_list[ix])
            except:
                continue

        x = np.array(images)
        y = np.zeros((len(y_list), self.NUM_CLASSES))

        for i in range(len(y_list)):
            y[i, y_list[i]] = 1.

        return x, y

    def on_epoch_end(self):
        self.steps -= 1
        os.unlink("images.tar")
        shutil.rmtree("imagesfolder")
        if self.steps > 0:
            self.epoch_init()

    def __len__(self):
        return self.total // self.batch_size + int(self.total % self.batch_size > 0)


class DataGenURLVersion(Sequence):
    """';'
    This generator uses the image urls from the train dataset to form batches
    and downloads each image individually. It will be approx 10 times slower than above version.
    """
    def __init__(self, valid_urls_dict, num_classes, data, batch_size=24, verbose=1):
        self.batch_size = batch_size
        self.data_urls = data
        self.NUM_CLASSES = num_classes # number of classes
        self.valid_urls_dict = valid_urls_dict # dict of url and corresponding landmark {image_url: landmark}

    def normalize(self, data):
        return data

    def __getitem__(self, index):
        batch_urls = random.sample(self.data_urls, self.batch_size)

        output = []
        y_classes = []
        for url in batch_urls:
            im = download_image_cv2_urllib(url)
            if im.size != 0:
                output.append(im)
                y_classes.append(self.valid_urls_dict[url.split("/")[-1]])

        x = np.array(output)
        y = np.zeros((len(output), self.NUM_CLASSES))

        for i in range(len(y_classes)):
            y[i, y_classes[i]] = 1.

        return x, y

    def on_epoch_end(self):
        return

    def __len__(self):
        # return len(valid_urls_list) // self.batch_size
        return 10

    

# ------------------------------ form the dataset ------------------------------ #

download_file("https://s3.amazonaws.com/google-landmark/metadata/train.csv", "train.csv")
train = pd.read_csv("train.csv")

print(train.head())
print(train.shape)
print("Number of classes {}".format(len(train.landmark_id.unique())))



NUM_THRESHOLD = 20

counts = dict(Counter(train['landmark_id']))
landmarks_dict = {x:[] for x in train.landmark_id.unique() if counts[x] >= NUM_THRESHOLD and x != 138982}
NUM_CLASSES = len(landmarks_dict)
print("Total number of valid classes: {}".format(NUM_CLASSES))

i = 0
landmark_to_idx = {}
idx_to_landmark = []
for k in landmarks_dict:
    landmark_to_idx[k] = i
    idx_to_landmark.append(k)
    i += 1

all_ids = train['id'].tolist()
all_landmarks = train['landmark_id'].tolist()
valid_ids_dict = {x[0].split("/")[-1]:landmark_to_idx[x[1]] for x in zip(all_ids, all_landmarks) if x[1] in landmarks_dict}
valid_ids_list = [x[0] for x in zip(all_ids, all_landmarks) if x[1] in landmarks_dict]

NUM_EXAMPLES = len(valid_ids_list)
print("Total number of valid examples: {}".format(NUM_EXAMPLES))


# ------------------------------------- validation ------------------------------------------------- #

download_file("https://s3.amazonaws.com/google-landmark/train/images_001.tar", "validation.tar", bar=False)
tar = tarfile.open('validation.tar')
tar.extractall("validation")
tar.close()

os.unlink("validation.tar")

print(os.listdir())

validation_images_paths = []
validation_landmarks = []


def pickfiles(dirr):
    count = 0
    for f in os.listdir(dirr):
        if os.path.isfile(dirr + "/" + f):
            if f[:-4] in valid_ids_dict:
                validation_images_paths.append(dirr + "/" + f)
                validation_landmarks.append(valid_ids_dict[f[:-4]])
                count += 1
        else:
            count += pickfiles(dirr + "/" + f)
    return count


total = pickfiles("validation")
print("total:", total)

validation_images = []

for image_path in validation_images_paths:
    im = cv2.imread(image_path)
    im = cv2.resize(im, (192, 192), interpolation=cv2.INTER_AREA)
    im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)
    validation_images.append(im)

valid_x = np.array(validation_images)
valid_y = np.zeros((len(validation_landmarks), NUM_CLASSES))

for i in range(len(validation_landmarks)):
    valid_y[i, validation_landmarks[i]] = 1.

shutil.rmtree("validation")
del validation_images

# ------------------------------------ model ----------------------------------------- #

res = ResNet50(include_top=False, weights='imagenet', input_shape=(192, 192, 3))

# making all the layers trainable
for layer in res.layers:
    layer.trainable = True

out = GlobalMaxPooling2D()(res.output)
out = Dense(NUM_CLASSES, activation='softmax')(out)
model = Model(res.input, out)
model.summary()
# ---------------------------------- clear block ------------------------------------- #

# folder = "./"
# for file in os.listdir(folder):
#     file_path = os.path.join(folder, file)
#     if os.path.isfile(file_path):
#         os.unlink(file_path)
#     else:
#         import shutil
#         shutil.rmtree(file_path)

#gc.collect()

# ----------------------------------- training ---------------------------------------- #

EPOCHS = 1
opt = Adam(0.0002)
model.compile(loss="binary_crossentropy", optimizer=opt, metrics=[accuracy_class])
model.fit_generator(generator=DataGen(valid_ids_dict, NUM_CLASSES, start=10, batch_size=64,steps=EPOCHS),
                   epochs=EPOCHS,
                   validation_data = [valid_x, valid_y],
                   use_multiprocessing=True,
                   workers=8,
                   verbose=2)

EPOCHS = 1
opt = Adam(0.0001)
model.compile(loss="binary_crossentropy", optimizer=opt, metrics=[accuracy_class])
model.fit_generator(generator=DataGen(valid_ids_dict, NUM_CLASSES, start=180, batch_size=48,steps=EPOCHS),
                    epochs=EPOCHS,
                    validation_data = [valid_x, valid_y],
                    use_multiprocessing=True,
                    workers=4,
                    verbose=2)

EPOCHS = 1
opt = Adam(0.00004)
model.compile(loss="binary_crossentropy", optimizer=opt, metrics=[accuracy_class])
model.fit_generator(generator=DataGen(valid_ids_dict, NUM_CLASSES, start=340, batch_size=48,steps=EPOCHS),
                    epochs=EPOCHS,
                    validation_data = [valid_x, valid_y],
                    use_multiprocessing=True,
                    workers=4,
                    verbose=2)

EPOCHS = 1
opt = Adam(0.00002)
model.compile(loss="binary_crossentropy", optimizer=opt, metrics=[accuracy_class])
model.fit_generator(generator=DataGen(valid_ids_dict, NUM_CLASSES, start=390, batch_size=48,steps=EPOCHS),
                    epochs=EPOCHS,
                    validation_data = [valid_x, valid_y],
                    use_multiprocessing=True,
                    workers=4,
                    verbose=2)

# ------------------------------------------- GAP metric validation -------------------------------------- #

# model.save('my_model.h5')  # creates a HDF5 file 'my_model.h5'
print("model saved")
del model  # deletes the existing model

# returns a compiled model
# identical to the previous one
model = load_model('my_model.h5', custom_objects={
        "accuracy_class": accuracy_class
    })
print("loaded model")
#gap = validateMAP()
#print(gap)

# ------------------------------------------- testset ------------------------------------------------- #

download_file("https://s3.amazonaws.com/google-landmark/metadata/test.csv", "test.csv")
testdf = pd.read_csv("test.csv")
print(testdf.head())

testids = testdf['id'].tolist()
print(len(testids))

# -------------------------------------------- prediction ------------------------------------------------ #

import warnings

warnings.filterwarnings("ignore", category=DeprecationWarning)

import time

tm = time.time()

final_ids = []
final_conf = []
final_preds = []

tar_images = []
tar_ids = []
    
    
def pickfiles(dirr):
    count = 0
    for f in os.listdir(dirr):
        if os.path.isfile(dirr + "/" + f):
            tar_images.append(dirr + "/" + f)
            tar_ids.append(f[:-4])
            count += 1
        else:
            count += pickfiles(dirr + "/" + f)
    return count


for tar in range(20):
    if tar < 10:
        tar_id = "00" + str(tar)
    else:
        tar_id = "0" + str(tar)

    tar_images = []
    tar_ids = []

    download_file("https://s3.amazonaws.com/google-landmark/test/images_{}.tar".format(tar_id), "images.tar", bar=False)
    tar = tarfile.open('images.tar')
    tar.extractall("imagesfolder")
    tar.close()

    os.unlink("images.tar")

    total = pickfiles("imagesfolder")
    print(tar, total, len(tar_ids))
        
    
    N = total
    batchsize = 1000
    conf_list = []
    y_pred_list = []
    validM = N // batchsize + int(N % batchsize > 0)
    
    # Placeholders for predictions


    # Places365 Model
    discrim_model = VGG16_Places365(weights='places')
    class_information = pd.read_csv('categories_places365_extended.csv')
    topn = 5
    
    for i in range(validM):
        temp = tar_images[i * batchsize:min(N, (i + 1) * batchsize)]
        batch_images = []
        batch_images_2 = []
        for t in temp:
            im = cv2.imread(t)
            im = cv2.resize(im, (192, 192), interpolation=cv2.INTER_AREA)
            im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)
            batch_images.append(im)
        for t in temp:
            im = cv2.imread(t)
            im = cv2.resize(im, (224, 224), interpolation=cv2.INTER_AREA)
            im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)
            batch_images_2.append(im)
        p0, p1, p2 = [], [], []
        isLandmark = []
        batch_images = np.array(batch_images)
        
        
        # Loop through all images
        for (i, image) in enumerate(batch_images_2):
    
            # Predict Top N Image Classes
            image = np.expand_dims(image, 0)
            topn_preds = np.argsort(discrim_model.predict(image)[0])[::-1][0:topn]
            
            p0 = topn_preds[0]
            p1 = topn_preds[1]
            p2 = topn_preds[2]
            p0_landmark = class_information.loc[p0, ['io']][0] # p0.map(.set_index('class')['io'].replace({1:False, 2:True}))
            p1_landmark = class_information.loc[p0, ['io']][0]#p1.map(class_information.set_index('class')['io'].replace({1:False, 2:True}))
            p2_landmark = class_information.loc[p0, ['io']][0]#p2.map(class_information.set_index('class')['io'].replace({1:False, 2:True}))
            num_landmark = 0
            if (p0_landmark == 2): 
                num_landmark += 1
            if (p1_landmark == 2):
                num_landmark += 1
            if (p2_landmark == 2): 
                num_landmark += 1
            isLandmark.append(.01 + .33 * num_landmark)
            
        
        preds = model.predict(batch_images)
        
        conf = list(np.amax(preds, axis=1))
        conf = [c * isLandmark[i] for (i, c) in enumerate(conf)]
        conf_list.extend(conf)
        y_pred = list(np.argmax(preds, axis=1))
                  
        
        y_pred_list.extend(y_pred)

    final_preds.extend(y_pred_list)
    final_conf.extend(conf_list)
    final_ids.extend(tar_ids)
    shutil.rmtree("imagesfolder")

print("time", time.time() - tm)
print(len(final_preds))


# --------------------------------------- submission -------------------------------------- #

out = []
for i in range(len(final_preds)):
    idx = final_preds[i]
    out.append(str(idx_to_landmark[idx]) + " " + str(round(final_conf[i], 10)))

print(out[:5])

outdf = pd.DataFrame({"id": final_ids, "landmarks": out})
print(outdf.head())

outdf.to_csv("submissions.csv", index=False)

# ---------------------------------------- the end ----------------------------------------- #

`%tensorflow_version` only switches the major version: 1.x or 2.x.
You set: `1.14`. This will be interpreted as: `1.x`.


TensorFlow is already loaded. Please restart the runtime to change versions.




  0%|          | 0/525832518 [00:00<?, ?B/s][A[A

  0%|          | 104448/525832518 [00:00<13:14, 661863.31B/s][A[A

  0%|          | 522240/525832518 [00:00<10:03, 870726.50B/s][A[A

  0%|          | 1856512/525832518 [00:00<07:16, 1201437.77B/s][A[A

  1%|          | 5228544/525832518 [00:00<05:07, 1690523.68B/s][A[A

  1%|▏         | 7813120/525832518 [00:00<03:40, 2349177.81B/s][A[A
 40%|████      | 210882560/525832518 [00:24<00:09, 32458039.93B/s][A

  2%|▏         | 11523072/525832518 [00:00<02:38, 3253760.52B/s][A[A

  3%|▎         | 15537152/525832518 [00:00<01:54, 4452699.90B/s][A[A

  4%|▎         | 19649536/525832518 [00:00<01:24, 6009093.03B/s][A[A

  5%|▍         | 23696384/525832518 [00:01<01:03, 7945758.40B/s][A[A

  5%|▌         | 26542080/525832518 [00:01<00:50, 9838736.93B/s][A[A

  6%|▌         | 30856192/525832518 [00:01<00:38, 12731153.38B/s][A[A

  7%|▋         | 35869696/525832518 [00:01<00:30, 15991860.04B/s][A[A

  8%|▊         | 40

                 id  ... landmark_id
0  6e158a47eb2ca3f6  ...      142820
1  202cd79556f30760  ...      104169
2  3ad87684c99c06e1  ...       37914
3  e7f70e9c61e66af3  ...      102140
4  4072182eddd0100e  ...        2474

[5 rows x 3 columns]
(4132914, 3)
Number of classes 203094
Total number of valid classes: 52583
Total number of valid examples: 3116604




100%|██████████| 525832518/525832518 [00:30<00:00, 46352136.02B/s][A[A

['.config', '__pycache__', 'test.csv', 'categories_places365.txt', 'places365_class_index.json', 'vgg16-hybrid1365.png', 'my_model.h5', 'categories_places365_extended.csv', 'validation', 'places_utils.py', 'train.csv', 'vgg16-places365.png', 'vgg16_places_365.py', 'categories_hybrid1365.txt', 'vgg16_hybrid_places_1365.py', 'sample_data']
total: 6212




Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 192, 192, 3)  0                                            
__________________________________________________________________________________________________
conv1_pad (ZeroPadding2D)       (None, 198, 198, 3)  0           input_2[0][0]                    
__________________________________________________________________________________________________
conv1 (Conv2D)                  (None, 96, 96, 64)   9472        conv1_pad[0][0]                  
__________________________________________________________________________________________________
bn_conv1 (BatchNormalization)   (None, 96, 96, 64)   256         conv1[0][0]                      
____________________________________________________________________________________________




  0%|          | 0/1998812 [00:00<?, ?B/s][A[A[A


  2%|▏         | 34816/1998812 [00:00<00:06, 282967.72B/s][A[A[A


 11%|█▏        | 226304/1998812 [00:00<00:04, 373697.38B/s][A[A[A


100%|██████████| 1998812/1998812 [00:00<00:00, 4321307.03B/s]


                 id
0  00016575233bc956
1  0001aadbcd8cb923
2  0002c06b2440a5f9
3  0002eb1ee5a5a6b2
4  000594dad986513e
117577
<tarfile.TarFile object at 0x7f684f249c50> 5879 5879
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
<tarfile.TarFile object at 0x7f683ba1d898> 5879 5879
