In [49]:
from google.colab import drive
drive.mount('/gdrive')

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


In [2]:
%cd /gdrive/MyDrive/University/GoogleTrainingCamp/data

/gdrive/MyDrive/University/GoogleTrainingCamp/data


In [3]:
try:
  # %tensorflow_version only exists in Colab.
  import tensorflow.compat.v2 as tf
except Exception:
  pass
tf.enable_v2_behavior()

# You'll generate plots of attention in order to see which parts of an image
# our model focuses on during captioning
import matplotlib.pyplot as plt

# Scikit-learn includes many helpful utilities
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

#from __future__ import absolute_import, division, print_function, unicode_literals
import re
import numpy as np
import os
import time
import json
from glob import glob
from PIL import Image
import pickle
from tqdm.auto import tqdm
import csv
import pandas as pd

# Reload Data and Preprocess

In [50]:
# Download caption annotation files
annotation_folder = '/annotations/'
# if not os.path.exists(os.path.abspath('.') + annotation_folder):
#   annotation_zip = tf.keras.utils.get_file('captions.zip',
#                                             cache_subdir=os.path.abspath('.'),
#                                             origin = 'http://images.cocodataset.org/annotations/annotations_trainval2014.zip',
#                                             extract = True)
#   annotation_file = os.path.dirname(annotation_zip)+'/annotations/captions_train2014.json'
#   os.remove(annotation_zip)

# Download image files
image_folder = '/train2014/'
# if not os.path.exists(os.path.abspath('.') + image_folder):
#   image_zip = tf.keras.utils.get_file('train2014.zip',
#                                       cache_subdir=os.path.abspath('.'),
#                                       origin = 'http://images.cocodataset.org/zips/train2014.zip',
#                                       extract = True)
#   PATH = os.path.dirname(image_zip) + image_folder
#   os.remove(image_zip)
# else:
PATH = os.path.abspath('.') + image_folder

In [51]:
annotation_file = './annotations/captions_train2014.json'

PATH = './train2014'

In [52]:
# Read the json file
with open(annotation_file, 'r') as f:
    annotations = json.load(f)

# Store captions and image names in vectors
all_captions = []
all_img_name_vector = []

for annot in annotations['annotations']:
    caption = '<start> ' + annot['caption'] + ' <end>'
    image_id = annot['image_id']
    full_coco_image_path = os.path.join(PATH, 'COCO_train2014_' + '%012d.jpg' % (image_id))

    all_img_name_vector.append(full_coco_image_path)
    all_captions.append(caption)

# Shuffle captions and image_names together
# Set a random state
train_captions, img_name_vector = shuffle(all_captions,
                                          all_img_name_vector,
                                          random_state=1)

# Select the first 30000 captions from the shuffled set
num_examples = 30000
train_captions = train_captions[:num_examples]
img_name_vector = img_name_vector[:num_examples]

In [53]:
def load_image(image_path):
    img = tf.io.read_file(image_path)
    img = tf.image.decode_jpeg(img, channels=3)
    img = tf.image.resize(img, (299, 299))
    img = tf.keras.applications.inception_v3.preprocess_input(img)
    return img, image_path

In [54]:
image_model = tf.keras.applications.InceptionV3(include_top=False,
                                                weights='imagenet')
new_input = image_model.input
hidden_layer = image_model.layers[-1].output

image_features_extract_model = tf.keras.Model(new_input, hidden_layer)

In [None]:
# # Get unique images
# encode_train = sorted(set(img_name_vector))

# # Feel free to change batch_size according to your system configuration
# image_dataset = tf.data.Dataset.from_tensor_slices(encode_train)
# image_dataset = image_dataset.map(
#   load_image, num_parallel_calls=tf.data.experimental.AUTOTUNE).batch(16)

# for img, path in tqdm(image_dataset):
#   batch_features = image_features_extract_model(img)
#   batch_features = tf.reshape(batch_features,
#                               (batch_features.shape[0], -1, batch_features.shape[3]))

#   for bf, p in zip(batch_features, path):
#     path_of_feature = p.numpy().decode("utf-8")
#     np.save(path_of_feature, bf.numpy())

In [55]:
# Find the maximum length of any caption in our dataset
def calc_max_length(tensor):
    return max(len(t) for t in tensor)


# Choose the top 5000 words from the vocabulary
top_k = 5000
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=top_k,
                                                  oov_token="<unk>",
                                                  filters='!"#$%&()*+.,-/:;=?@[\]^_`{|}~ ')
tokenizer.fit_on_texts(train_captions)
#train_seqs = tokenizer.texts_to_sequences(train_captions)

tokenizer.word_index['<pad>'] = 0
tokenizer.index_word[0] = '<pad>'

# Create the tokenized vectors
train_seqs = tokenizer.texts_to_sequences(train_captions)

# Pad each vector to the max_length of the captions
# If you do not provide a max_length value, pad_sequences calculates it automatically
cap_vector = tf.keras.preprocessing.sequence.pad_sequences(train_seqs, padding='post')

# Calculates the max_length, which is used to store the attention weights
max_length = calc_max_length(train_seqs)

In [56]:
# Create training and validation sets using an 80-20 split
img_name_train, img_name_val, cap_train, cap_val = train_test_split(img_name_vector,
                                                                    cap_vector,
                                                                    test_size=0.0333,
                                                                    random_state=0)

# NEW: Work with Caption to Got Most Similar Images with Your Query

## Reload previous Results
Here you can start to test your model. We provide you a baseline model that use Jaccard similarity in order to compare 2 captions.
This is a first approach and you have to improve it! 

Loading the Data and "all_captions.csv" file you don't have to train again all image captioning model. 

In [122]:
all_captions = pd.read_csv("all_captions_30_epochs.csv", sep=',') 

real_captions = all_captions['true_caption'].tolist()
pred_captions = all_captions['pred_caption'].tolist()

In [110]:
len(pred_captions)

1000

In [123]:
real_captions[:5]

['a white canoe in a body of water in front of a red bench',
 'a man kiteboarding over waves in the ocean',
 'a team playing baseball on a baseball diamond',
 'a group of players <unk> as they reach for a frisbee',
 'a dog following two people on horses on a beach']

In [124]:
pred_captions[:5]

['a blue boat is sitting in front of a building',
 'a man on a wet suit and riding on a beach',
 'the baseball play at the ball during a game',
 'everyone and boy in uniform with it are playing frisbee',
 'horses on a patch in the ocean']

## Generate captions embeddings

In [None]:
!pip install -U sentence-transformers

In [62]:
from sentence_transformers import SentenceTransformer

In [113]:
# load and set up the SentenceTransformer model 
model_transformer = SentenceTransformer('all-MiniLM-L12-v2', device='cpu')

In [125]:
real_captions_emb = []
for cap in real_captions:
  real_captions_emb.append(model_transformer.encode(cap, batch_size=2048))

In [126]:
pred_captions_emb = []
for cap in pred_captions:
  pred_captions_emb.append(model_transformer.encode(cap, batch_size=2048))

## Use Jaccard as Similarity Criterion

In [None]:
import warnings
import cv2

In [None]:
def jaccard_similarity(query, document):
    intersection = set(query).intersection(set(document))
    union = set(query).union(set(document))
    return len(intersection)/len(union)

In [None]:
# Jaccard Example
a = ['a', 'dog','over','a','bike']
b = ['a', 'cat','over','a','bike']
jaccard_similarity(a,b)

0.6

In [None]:
def get_similar_result_jaccard(idx, real_captions, pred_captions):

    real_captions_j = [x.split() for x in real_captions]
    pred_captions_j = [x.split() for x in pred_captions]

    s_score_list = []

    for idx_2 in range(len(pred_captions_j)):
      
      s_score = jaccard_similarity(real_captions_j[idx], pred_captions_j[idx_2])
      s_score_list.append((idx_2, s_score))

    s_score_list.sort(key=lambda x: x[1], reverse=True)

    return s_score_list

## Use cosine similarity between embeddings

In [66]:
from sentence_transformers import util

In [67]:
def get_similar_result_cosine(idx, real_captions_emb, pred_captions_emb):

    s_score_list = []

    for idx_2 in range(len(pred_captions_emb)):
      
      s_score = util.cos_sim(real_captions_emb[idx], pred_captions_emb[idx_2])
      s_score_list.append((idx_2, s_score))

    s_score_list.sort(key=lambda x: x[1], reverse=True)

    return s_score_list

## Create File with Your Submission Results

In [68]:
def create_submission_file(top_k, img_name_val, real_captions_dict, pred_captions_dict, get_sim_function, key='emb', name='emb_cosine'):

    with open(f'./submissions/submission_{name}.csv', 'w') as file:
        writer = csv.writer(file)
        writer.writerow(["caption", "image_list"])

        for idx in tqdm(range(len(img_name_val))):

            s_score_res = get_sim_function(idx, real_captions_dict[key], pred_captions_dict[key])

            writer.writerow([' '.join(real_captions_dict['str'][idx].split()), ' '.join(list(map(lambda x: str(x[0]), s_score_res[:top_k])))])

In [154]:
real_captions_dict = {'str':real_captions, 'emb':real_captions_emb}
pred_captions_dict = {'str':pred_captions, 'emb':pred_captions_emb}

name =  'emb_all-MiniLM-L12-v2_30_epochs' # 'emb_cosine'

In [128]:
create_submission_file(len(img_name_val), img_name_val, real_captions_dict, pred_captions_dict, get_similar_result_cosine, name=name)

  0%|          | 0/1000 [00:00<?, ?it/s]

In [155]:
sub_results = pd.read_csv(f'./submissions/submission_{name}.csv')

In [156]:
sub_results.head(20)

Unnamed: 0,caption,image_list
0,a white canoe in a body of water in front of a...,530 735 375 636 734 0 417 593 811 252 714 99 4...
1,a man kiteboarding over waves in the ocean,548 344 383 502 803 950 900 853 876 227 951 39...
2,a team playing baseball on a baseball diamond,2 46 805 743 343 781 34 321 851 83 534 488 717...
3,a group of players <unk> as they reach for a f...,3 587 212 807 885 52 535 992 288 508 196 743 7...
4,a dog following two people on horses on a beach,860 388 456 928 980 4 100 926 121 536 822 727 ...
5,a cycle <unk> is seen going on the road,724 797 642 517 89 419 369 675 381 737 817 998...
6,a very tiny and very dirty little bathroom,847 356 319 986 761 302 592 923 764 635 597 80...
7,a few people standing on the beach flying a kite,157 844 344 310 933 766 853 383 502 229 727 56...
8,a veggie wrap sitting on a plate covered in gravy,8 818 796 941 479 113 994 659 328 409 216 280 ...
9,a person holding a pair of scissors with orang...,138 88 855 972 586 49 224 80 560 686 577 435 8...


## Baseline

In [74]:
if not os.path.exists('./submissions/submission_baseline.csv'):
  create_submission_file(len(img_name_val), img_name_val, real_captions_dict, pred_captions_dict, get_similar_result_jaccard, key='str', name='baseline')

baseline = pd.read_csv('./submissions/submission_baseline.csv')

## Custom metric definition

In [136]:
def ndcg_metric(image_list):

  ndcg = lambda x: 1 / np.log2(x + 1)

  return np.mean([ndcg(query_res.index(i)+1) for i, query_res in enumerate(image_list)])

## Evaluation

In [75]:
image_list_baseline = [list(map(lambda y: int(y), x.split())) for x in baseline['image_list'].to_list()]

In [157]:
image_list = [list(map(lambda y: int(y), x.split())) for x in sub_results['image_list'].to_list()]

In [158]:
with open('./submissions/_model_ranking.txt', 'a') as ranking_file:
  ranking_file.write(f'submission_{name}, {ndcg_metric(image_list)}\n')

## View Some Results

### Show Qualitative Results for a Choosen Index

In [None]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import wordpunct_tokenize

stop_words = set(stopwords.words('english'))
stop_words.update(['.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']', '{', '}'])

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
def show_image(image_fname, new_figure=True):
  if new_figure:
    plt.figure()
  np_img = cv2.imread(image_fname)
  np_img = cv2.cvtColor(np_img, cv2.COLOR_BGR2RGB)
  plt.imshow(np_img) 

def show_qualitative_results(idx1, top_k=20):

    b_score_res = get_similar_result_cosine(idx1, real_captions_dict['emb'], pred_captions_dict['emb'])

    print("Real capt:", ' '.join(real_captions[idx1]))
    print("Pred capt:", ' '.join(pred_captions[idx1]))
    sentence1 = [w for w in real_captions[idx1] if not w in stop_words]
    sentence2 = [w for w in pred_captions[idx1] if not w in stop_words]
    ss = jaccard_similarity(sentence1, sentence2)
    print("Score with True Predicted caption:", ss)
    print()

    show_image(img_name_val[idx1], new_figure=False)
    plt.grid(False)
    plt.ioff()
    plt.axis('off')


    fig = plt.figure(figsize=(10, 7))

    for idx2, (idx, sim_val) in enumerate(b_score_res[:20]):
        print(idx, sim_val, ' '.join(pred_captions[idx]))
        plt.subplot(4, 5, idx2+1)
        show_image(img_name_val[idx], new_figure=False)
        plt.grid(False)
        plt.ioff()
        plt.axis('off')
        plt.title('{}'.format(idx2+1))

In [None]:
show_qualitative_results(idx1 = 0)

### Show Distribution of Right Prediction

In [None]:
all_idx = []
top_k = 1000

for ref_idx in tqdm(range(len(img_name_val))):
    s_score_res = get_similar_result_jaccard(ref_idx, real_captions, pred_captions)
    list_res = list(map(lambda x: x[0], s_score_res[:top_k]))
    index = list_res.index(ref_idx)
    all_idx.append(index)

n, bins, patches = plt.hist(all_idx, bins=1000)
plt.xlabel('top K')
plt.ylabel('Frequency')

plt.show()

## ImageToText -- Trained on COCO -- NOT VALID

In [27]:
from sentence_transformers import SentenceTransformer, util
from PIL import Image

In [26]:
#Load CLIP model
model = SentenceTransformer('clip-ViT-B-32')

Downloading:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.07k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/605M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.33M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/86.0 [00:00<?, ?B/s]

  "Argument interpolation should be of type InterpolationMode instead of int. "


In [33]:
img_emb = []
for img in img_name_val:
  img_emb.append(model.encode(Image.open(img), batch_size=2048))

In [36]:
cap_emb = []
for cap in real_captions:
  cap_emb.append(model.encode(cap, batch_size=2048))

In [159]:
captions_dict = {'str':real_captions, 'emb':cap_emb}
img_dict = {'emb':cap_emb}

name =  'emb_clip-ViT-B-32' # 'emb_cosine'

In [39]:
create_submission_file(len(img_name_val), img_name_val, captions_dict, img_dict, get_similar_result_cosine, name=name)

  0%|          | 0/1000 [00:00<?, ?it/s]

In [160]:
sub_results = pd.read_csv(f'./submissions/submission_{name}.csv')

In [161]:
sub_results.head()

Unnamed: 0,caption,image_list
0,a white canoe in a body of water in front of a...,0 714 598 698 728 329 87 735 93 995 408 133 39...
1,a man kiteboarding over waves in the ocean,1 548 227 638 396 876 434 803 179 900 219 554 ...
2,a team playing baseball on a baseball diamond,2 534 488 781 237 453 321 34 837 292 873 891 1...
3,a group of players <unk> as they reach for a f...,3 265 990 992 939 529 584 52 237 708 962 2 183...
4,a dog following two people on horses on a beach,4 860 766 481 833 809 703 606 528 509 831 789 ...


In [162]:
image_list = [list(map(lambda y: int(y), x.split())) for x in sub_results['image_list'].to_list()]

In [163]:
with open('./submissions/_model_ranking.txt', 'a') as ranking_file:
  ranking_file.write(f'submission_{name}, {ndcg_metric(image_list)}\n')