In [21]:
from icecream import ic

# ResNet18 finetune to create embeddings similar to GPT2 embedded captions

The idea of this notebook is to explore and attempt to use contrastive learning to embedd the extracted features of ResNet18 and finetune it for our task.

We might want to do two different approaches:
- Embedd/process captions and perform k-means to make sure they make sense, then use the distance of two captions of two images as the distance that two images should have(contrastive learning)

- Also, once we've proven we can embedd the captions in a way that are distinct by their semantic meaning try to match the image embedding to the caption embedding.

Let's load gpt2 and start playing. Also load the dataset captions.

In [3]:
import pandas as pd
captions = pd.read_csv("dataset/captions.txt")
captions.head()

Unnamed: 0,image,caption
0,1000268201_693b08cb0e.jpg,A child in a pink dress is climbing up a set o...
1,1000268201_693b08cb0e.jpg,A girl going into a wooden building .
2,1000268201_693b08cb0e.jpg,A little girl climbing into a wooden playhouse .
3,1000268201_693b08cb0e.jpg,A little girl climbing the stairs to her playh...
4,1000268201_693b08cb0e.jpg,A little girl in a pink dress going into a woo...


In [9]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel

  from .autonotebook import tqdm as notebook_tqdm


In [31]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
gpt2 = GPT2LMHeadModel.from_pretrained('gpt2')

In [32]:
#return hidden states
gpt2.config.output_hidden_states=True

First question, how are we going to embedd the captions?

We might try to just pass them through gpt2 and the representation before passing it to the LM head (FFC layer) will be our representation of the caption.

So in the end, the task we want to gpt to learn (or the scheme) will be: [image representation as last token (or many tokens)] "< s  > or some token to start and then the generated caption.

Lets try processing a caption with GPT2 to get the last hidden representation before passing it to the FFC for LM.

In [7]:
sample_caption = captions["caption"][0]
sample_caption

'A child in a pink dress is climbing up a set of stairs in an entry way .'

In [11]:
START_TOKEN = "<s>"
END_TOKEN = "<|endoftext|>"

In [52]:
with torch.no_grad():
    inputs = tokenizer(sample_caption, return_tensors="pt")
    ic(inputs["input_ids"].shape)
    outputs = gpt2(**inputs, labels=inputs["input_ids"])
    ic(outputs["logits"].shape) #not last hidden state!
    ic(len(outputs["hidden_states"])) #13 layers 12+embedding
    ic(outputs["hidden_states"][-1].shape) #last layer
    last_hidden = outputs["hidden_states"][-1]
    last_tok_hidden = last_hidden[:,-1,:]

ic| inputs["input_ids"].shape: torch.Size([1, 18])
ic| outputs["logits"].shape: torch.Size([1, 18, 50257])
ic| len(outputs["hidden_states"]): 13
ic| outputs["hidden_states"][-1].shape: torch.Size([1, 18, 768])


Let's encode all the captions and then perform k-means to see if we can differentiate one between the other.

In [57]:
from tqdm import tqdm

In [61]:
gpt2 = gpt2.to("mps")

In [193]:
with torch.no_grad():
    vectors = []
    for i,row in tqdm(captions.iterrows(),total=len(captions)):
        cpt = row["caption"]
        #not sure if everytime we pass something it doesn't do self attention on previously 
        #passed tokens(in other instances) to make sure it doesn't pass "<|endoftext|>"
        inputs = tokenizer(END_TOKEN + cpt, return_tensors="pt").to("mps")
        outputs = gpt2(**inputs, labels=inputs["input_ids"])
        vectors.append(outputs["hidden_states"][-1][:,-1,:].to("cpu").squeeze(0).numpy())

  6%|▌         | 2232/40455 [01:29<25:31, 24.97it/s]


KeyboardInterrupt: 

In [65]:
import pickle as pkl

In [None]:
#save to pkl file the "embedded captions"
with open('captions_gpt2_embedded.pkl', 'wb') as file:
    pkl.dump(vectors, file, protocol=pkl.HIGHEST_PROTOCOL)


In [None]:
with open('captions_gpt2_embedded.pkl', 'rb') as handle:
    vectors = pkl.load(handle)

Perform k-means to see how much they represent this difference in meaning

In [79]:
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=5, random_state=0, n_init="auto").fit(vectors)

  kmeans = KMeans(n_clusters=5, random_state=0, n_init="auto").fit(vectors)


In [75]:
tags = pd.DataFrame(columns=["tags"])
tags["tags"] = kmeans.labels_
tags.value_counts()

tags
0       40455
dtype: int64

Something is wrong.

In [88]:
import numpy as np

In [92]:
for v1 in tqdm(vectors):
    for v2 in vectors:
        if(np.sum(v1 == v2) < len(vectors[0])):
            print("found two distinc vectors")
            break

  1%|          | 478/40455 [00:54<1:16:22,  8.72it/s]


KeyboardInterrupt: 

In [None]:
#results 

Do the captions in some clusters same some words?

In [None]:
#look for common words...

On the other hand, we'll look into getting some representation a class of the image in a much more hard coded way. We'll lemmatize and apply POS tagging to the captions so we can identifiy the main elements of this very simple captions: Noun - Verb

In [93]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [112]:
#example of what we'll do.
caption = "A child in a pink dress is climbing up a set of stairs in an entry way."
doc = nlp(caption) #process sequence
#for each sequence look for the one that is subject and noun and one that is object verb.
#we might also look for a noun in the object.

sntc = list(doc.sents)
root_token = sntc[0].root
for child in root_token.children:
    if child.dep_ == 'nsubj':
        subj = child
    if child.dep_ == 'dobj':
        obj = child

print(caption)
print(root_token)
print(subj)
print(obj)

A child in a pink dress is climbing up a set of stairs in an entry way.
climbing
child
set


In [133]:
caption = captions["caption"][99]
doc = nlp(caption) #process sequence
#for each sequence look for the one that is subject and noun and one that is object verb.
#we might also look for a noun in the object.

sntc = list(doc.sents)
root_token = sntc[0].root
for child in root_token.children:
    #in case the root is a verb then get the subject, in case the root is a noun get a verb
    if child.pos_  == "VERB" or child.dep_ == "nsubj" : 
        baby = child

print(caption)
print(root_token)
print(baby)

Two men are ice fishing .
are
men


In [131]:
caption = captions["caption"][100]
doc = nlp(caption) #process sequence
#for each sequence look for the one that is subject and noun and one that is object verb.
#we might also look for a noun in the object.

sntc = list(doc.sents)
root_token = sntc[0].root
for child in root_token.children:
    #in case the root is a verb then get the subject, in case the root is a noun get a verb
    if child.pos_  == "VERB" or child.dep_ == "nsubj" : 
        baby = child

print(caption)
print(root_token)
print(baby)

Two different breeds of brown and white dogs play on the beach .
play
breeds


Simpler approach, get all verbs and nouns

In [135]:
caption = captions["caption"][100]
doc = nlp(caption) #process sequence
#for each sequence look for the one that is subject and noun and one that is object verb.
#we might also look for a noun in the object.

nouns = []
verbs = []
root_token = sntc[0].root
for token in doc:
    if token.pos_ == "NOUN":
        nouns.append(token.lemma_)
    if token.pos_ == "VERB":
        verbs.append(token.lemma_)


print(caption)
print(nouns)
print(verbs)

Two different breeds of brown and white dogs play on the beach .
['breed', 'dog', 'beach']
['play']


In [136]:
caption = captions["caption"][10]
doc = nlp(caption) #process sequence
#for each sequence look for the one that is subject and noun and one that is object verb.
#we might also look for a noun in the object.

nouns = []
verbs = []
root_token = sntc[0].root
for token in doc:
    if token.pos_ == "NOUN":
        nouns.append(token.lemma_)
    if token.pos_ == "VERB":
        verbs.append(token.lemma_)


print(caption)
print(nouns)
print(verbs)

A little girl covered in paint sits in front of a painted rainbow with her hands in a bowl .
['girl', 'paint', 'front', 'rainbow', 'hand', 'bowl']
['cover', 'sit', 'paint']


In [140]:
caption = captions["caption"][800]
doc = nlp(caption) #process sequence
#for each sequence look for the one that is subject and noun and one that is object verb.
#we might also look for a noun in the object.

nouns = []
verbs = []
root_token = sntc[0].root
for token in doc:
    if token.pos_ == "NOUN":
        nouns.append(token.lemma_)
    if token.pos_ == "VERB":
        verbs.append(token.lemma_)


print(caption)
print(nouns)
print(verbs)

a brown dog jumping into a pool after a bloe ball .
['dog', 'pool', 'bloe', 'ball']
['jump']


To create a class we can embedd the different words using glove and then summ them up to get a representation.

In [144]:
import gensim.downloader
glove_vectors = gensim.downloader.load('glove-twitter-25')

In [147]:
# Embed the word "hello"
word = 'hello'
glove_vectors[word]

array([-0.77069  ,  0.12827  ,  0.33137  ,  0.0050893, -0.47605  ,
       -0.50116  ,  1.858    ,  1.0624   , -0.56511  ,  0.13328  ,
       -0.41918  , -0.14195  , -2.8555   , -0.57131  , -0.13418  ,
       -0.44922  ,  0.48591  , -0.6479   , -0.84238  ,  0.61669  ,
       -0.19824  , -0.57967  , -0.65885  ,  0.43928  , -0.50473  ],
      dtype=float32)

In [174]:
glove_vectors.index_to_key

['<user>',
 '.',
 ':',
 'rt',
 ',',
 '<repeat>',
 '<hashtag>',
 '<number>',
 '<url>',
 '!',
 'i',
 'a',
 '"',
 'the',
 '?',
 'you',
 'to',
 '(',
 '<allcaps>',
 '<elong>',
 ')',
 'me',
 'de',
 '<smile>',
 '！',
 'que',
 'and',
 '。',
 '-',
 'my',
 'no',
 '、',
 'is',
 'it',
 '…',
 'in',
 'n',
 'for',
 '/',
 'of',
 'la',
 "'s",
 '*',
 'do',
 "n't",
 'that',
 'on',
 'y',
 "'",
 'e',
 'o',
 'u',
 'en',
 'this',
 'el',
 'so',
 'be',
 "'m",
 'with',
 'just',
 '>',
 'your',
 '^',
 'like',
 'have',
 'te',
 'at',
 '？',
 'love',
 'se',
 'are',
 '<',
 'm',
 'r',
 'if',
 'all',
 'b',
 '・',
 'not',
 'but',
 'we',
 'es',
 'ya',
 '&',
 'follow',
 'up',
 'what',
 'get',
 'lol',
 'un',
 '♥',
 'lo',
 'when',
 'was',
 '“',
 '”',
 'one',
 'por',
 'si',
 'out',
 '_',
 'mi',
 'can',
 '<sadface>',
 'من',
 '♡',
 '´',
 'he',
 'con',
 'they',
 'now',
 'go',
 '،',
 'para',
 'los',
 'know',
 'haha',
 'good',
 'tu',
 'back',
 '~',
 'about',
 'new',
 ';',
 'as',
 'day',
 'how',
 'who',
 'will',
 'want',
 'people',
 'y

In [221]:
def glove_embedd_caption(caption):
    doc = nlp(caption) #process sequence
    #for each sequence look for the one that is subject and noun and one that is object verb.
    #we might also look for a noun in the object.

    nouns = []
    verbs = []
    root_token = sntc[0].root
    for token in doc:
        if token.pos_ == "NOUN":
            nouns.append(token.lemma_)
        if token.pos_ == "VERB":
            verbs.append(token.lemma_)
    
    nouns_emb = np.array([glove_vectors[word] for word in nouns if word in glove_vectors.index_to_key]) #check if in vocab
    verbs_emb = np.array([glove_vectors[word] for word in verbs if word in glove_vectors.index_to_key])


    #sum the vectors up but normalising the number of nouns and verbs
    nouns_emb /= len(nouns_emb)
    verbs_emb /= len(verbs_emb)
    
    try:
        emb  =  np.append(nouns_emb,verbs_emb,axis=0)
    except:
        if nouns_emb.shape[0] > 0:
            emb = nouns_emb
        else:
            emb = verbs_emb

    emb = np.sum(emb,axis=0)

    return emb


In [222]:
caption = captions["caption"][45]
print(caption)
print(glove_embedd_caption(caption))

A black dog leaps over a log .
[-0.29985    -0.08996     0.1367      0.18855667 -0.04983    -0.30964068
  0.55050665 -0.67535007  0.67251664  0.17497666 -0.01235534  0.28218
 -2.8983665   0.06203665 -0.46037334  0.06152333  0.66373664 -0.08760332
  0.08296648 -0.29561666 -0.3776      0.39835998  0.78195333 -0.08756666
 -0.30947334]


Now let's process the whole dataset.

In [223]:
vectors = []
for i,row in tqdm(captions.iterrows(),total=len(captions)):
    caption = row["caption"]
    try:
        emb = glove_embedd_caption(caption)
    except:
        print(i)
    vectors.append(glove_embedd_caption(caption))

100%|██████████| 40455/40455 [06:42<00:00, 100.63it/s]


In [227]:
np.array(vectors)

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (40455,) + inhomogeneous part.

In [224]:
#save to pkl file the "embedded captions"
with open('captions_glove_embedded.pkl', 'wb') as file:
    pkl.dump(vectors, file, protocol=pkl.HIGHEST_PROTOCOL)

In [192]:
for i,v in enumerate(vectors):
    if len(v) != 4:
        print(i)
        print(v)
        break

0
[-0.2482962 -1.6422853 -0.3836474  0.2934045 -0.9514509 -0.6690476
 -2.812535 ]


In [225]:
kmeans = KMeans(n_clusters=30, random_state=0, n_init="auto").fit(vectors)

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (40455,) + inhomogeneous part.

In [None]:
tags = pd.DataFrame(columns=["tags"])
tags["tags"] = kmeans.labels_
tags.value_counts()