In [0]:
#!pip install gensim
import numpy as np
import tensorflow as tf
import gensim
import re
from gensim.models import Word2Vec
import requests, zipfile, io
from PIL import Image
from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras.layers import Dense, Flatten

In [0]:
# downloading data
# Both archives are fetched fully into memory and unpacked into the current
# working directory.
image_data_url = "http://nlp.cs.illinois.edu/HockenmaierGroup/Framing_Image_Description/Flickr8k_Dataset.zip"
text_data_url = "http://nlp.cs.illinois.edu/HockenmaierGroup/Framing_Image_Description/Flickr8k_text.zip"

# downloading images
images_from_url = requests.get(image_data_url)
# stop here with a clear HTTP error instead of a confusing BadZipFile below
images_from_url.raise_for_status()
# extraction (the archive object is closed once everything is unpacked)
with zipfile.ZipFile(io.BytesIO(images_from_url.content)) as images_zip:
    images_zip.extractall()

# downloading captions
text_from_url = requests.get(text_data_url)
text_from_url.raise_for_status()
# extraction
with zipfile.ZipFile(io.BytesIO(text_from_url.content)) as text_zip:
    text_zip.extractall()

In [86]:
### Preparing metadata

# respective captions
# Every usable line of the token file is split on tabs into an image tag and a
# caption. Reading through a context manager closes the file handle right away.
with open("Flickr8k.token.txt") as token_file:
    raw_lines = token_file.read().split("\n")

# [image name, cleaned caption] pairs, in file order
captions = []
for line in raw_lines:
    fields = line.split("\t")
    # skip blank / malformed lines (e.g. the empty string after the final newline)
    if len(fields) < 2:
        continue

    # cleaning up the image name: drop the trailing "#<n>" caption index
    # (assumes every tag ends in "#<n>" -- TODO confirm against the token file)
    image_name = fields[0].rsplit("#", 1)[0]

    # removing (.) and other punctuation.
    # NOTE(review): the pattern needs at least two characters and a trailing
    # letter, so one-character words such as "a" are dropped as well.
    caption = ' '.join(re.findall(r'\w+[a-zA-Z]', fields[1]))

    captions.append([image_name, caption])

# this dictionary holds the image names and respective captions.
# Grouping by name (instead of stepping through the list five rows at a time)
# keeps working if an image has more or fewer than five captions.
meta_data = {}
for image_name, caption in captions:
    meta_data.setdefault(image_name, []).append(caption)

# sample: show the first three entries
for image_name in list(meta_data)[:3]:
    print(image_name, ":")
    print(meta_data[image_name], "\n")

# removing a bad key (its name does not end in ".jpg"); pop() keeps this cell
# from raising KeyError when the key is not present
meta_data.pop("2258277193_586949ec62.jpg.1", None)

1000268201_693b08cb0e.jpg :
['child in pink dress is climbing up set of stairs in an entry way', 'girl going into wooden building', 'little girl climbing into wooden playhouse', 'little girl climbing the stairs to her playhouse', 'little girl in pink dress going into wooden cabin'] 

1001773457_577c3a7d70.jpg :
['black dog and spotted dog are fighting', 'black dog and tri colored dog playing with each other on the road', 'black dog and white dog with brown spots are staring at each other in the street', 'Two dogs of different breeds looking at each other on the road', 'Two dogs on pavement moving toward each other'] 

1002674143_1b742ab4b8.jpg :
['little girl covered in paint sits in front of painted rainbow with her hands in bowl', 'little girl is sitting in front of large painted rainbow', 'small girl in the grass plays with fingerpaints in front of white canvas with rainbow on it', 'There is girl with pigtails sitting in front of rainbow painting', 'Young girl with pigtails painting

In [0]:
## hyperparameters
# length of each word vector: passed to Word2Vec as `size` and used as the
# width of the x / y placeholders and of the layer-1 input weights
em_vector_len = 200
# first dimension of every GRU weight matrix (both layers) and the hidden-state
# size handed to Gru_cell
gru_cell_len = 512
# second dimension of the layer-2 input weights (w_rx / w_ux / w_hx of w_2)
gru_cell_len_2 = 512

In [5]:
### making word embedding
# Every caption of every image is tokenised on whitespace; gensim then learns
# one vector per word from the resulting list of token lists.

# all sentences of the dataset, one list of tokens per caption
sentences = [
    caption.split()
    for image_captions in meta_data.values()
    for caption in image_captions
]

# training the word2vec model (skip-gram, context window of 4) ...
embedding = Word2Vec(sentences, size=em_vector_len, window=4, sg=1, workers=10)
# ... followed by 10 further epochs over the same sentences
embedding.train(sentences, total_examples=len(sentences), epochs=10)

# demo: the three nearest neighbours of "black"
embedding.wv.most_similar("black", topn=3)

  if np.issubdtype(vec.dtype, np.int):


[('long-haired', 0.6281743049621582),
 ('fluffy', 0.6118836402893066),
 ('beige', 0.60596764087677)]

In [0]:
tf.reset_default_graph()


def _gru_weights(tag, n_hidden, n_input):
    """Create the six Xavier-initialised weight matrices of one GRU layer.

    tag      : suffix appended to every variable name ("w_rx_<tag>", ...)
    n_hidden : first dimension of every matrix
    n_input  : second dimension of the input matrices (w_rx, w_ux, w_hx);
               the remaining matrices (w_ra, w_ua, w_ar) are n_hidden x n_hidden
    """
    def _var(gate, shape):
        return tf.get_variable("w_%s_%s" % (gate, tag), shape,
                               initializer=tf.contrib.layers.xavier_initializer())

    return {
        'w_rx': _var("rx", [n_hidden, n_input]),
        'w_ra': _var("ra", [n_hidden, n_hidden]),
        'w_ux': _var("ux", [n_hidden, n_input]),
        'w_ua': _var("ua", [n_hidden, n_hidden]),
        'w_hx': _var("hx", [n_hidden, n_input]),
        'w_ar': _var("ar", [n_hidden, n_hidden]),
    }


### Weights
# layer 1: input matrices are gru_cell_len x em_vector_len
w_1 = _gru_weights("1", gru_cell_len, em_vector_len)
# layer 2: input matrices are gru_cell_len x gru_cell_len_2
w_2 = _gru_weights("2", gru_cell_len, gru_cell_len_2)

### Placeholders
# word vectors (any number of rows, em_vector_len columns)
x = tf.placeholder(tf.float32, shape=[None, em_vector_len])
y = tf.placeholder(tf.float32, shape=[None, em_vector_len])
# batch of 224x224 images with 3 channels
x_img = tf.placeholder(tf.float32, shape=[None, 224, 224, 3])

In [0]:
# using VGG16 (ImageNet weights, without its classification head) as the image encoder

vgg = VGG16(weights='imagenet', include_top=False, input_shape=(224, 224, 3), input_tensor=x_img)

# output of the convolutional base, fed by the x_img placeholder
vgg_features = vgg.output
# ReLU dense layer applied before flattening
projected = Dense(1024, activation="relu")(vgg_features)
flattened = Flatten()(projected)
# final 512-wide tanh layer: this tensor is the encoder output
encoder = Dense(512, activation="tanh")(flattened)

In [0]:
# GRU Cell
def Gru_cell(n_h, a, x_in, weights, return_seq = False, y_hat = None, t=0):
    """
    Unroll a GRU layer over the time steps of ``x_in`` and collect its outputs.

        n_h        : number of neurons in hidden layer
        a          : Initial hidden state (anything reshapeable to (n_h, 1))
        x_in       : input sequence indexed by time step -- either a list/tuple
                     of tensors or a tensor whose first dimension is the number
                     of time steps; each step is reshaped into a column vector
        weights    : dict with the matrices 'w_rx', 'w_ra' (reset gate),
                     'w_ux', 'w_ua' (update gate) and 'w_hx', 'w_ar' (candidate)
        return_seq : whether to return all the values of prediction vector yhat or just the last output
        y_hat      : optional list the outputs are appended to; a fresh list is
                     created when omitted, so separate calls never share state
        t          : Time Step to start from

    Returns the list of per-step outputs when ``return_seq`` is True, otherwise
    only the output of the last step.

    Raises ValueError when the number of time steps is not known up front or
    when there is no step left to process from ``t``.
    """
    # a mutable default list would be shared between calls, so create it here
    if y_hat is None:
        y_hat = []

    # The loop below is unrolled in Python, so the step count must be known now.
    if isinstance(x_in, (list, tuple)):
        n_steps = len(x_in)
    else:
        n_steps = x_in.shape[0]
        # TF1 wraps static dimensions in an object exposing `.value`
        n_steps = getattr(n_steps, "value", n_steps)
    if n_steps is None:
        raise ValueError("Gru_cell needs a statically known number of time steps "
                         "in x_in (its first dimension is undefined)")
    n_steps = int(n_steps)
    if t >= n_steps:
        raise ValueError("no time step to process: t=%d, but x_in has %d step(s)"
                         % (t, n_steps))

    a = tf.reshape(a, (n_h, 1))

    # iterate over all remaining time steps, carrying the hidden state forward
    for step in range(t, n_steps):
        # current input as a column vector
        x_calc = tf.reshape(x_in[step], (-1, 1))

        reset_gate = tf.nn.sigmoid(tf.matmul(weights['w_rx'], x_calc) + tf.matmul(weights['w_ra'], a))
        update_gate = tf.nn.sigmoid(tf.matmul(weights['w_ux'], x_calc) + tf.matmul(weights['w_ua'], a))

        cell = tf.nn.tanh(tf.matmul(weights['w_hx'], x_calc) + tf.matmul(weights['w_ar'], tf.multiply(a, reset_gate)))
        a = tf.multiply((1 - update_gate), cell) + tf.multiply(update_gate, a)

        # `a` is an (n_h, 1) column: normalise across the hidden units (axis 0).
        # The default last axis has length 1 and would turn every entry into 1.0.
        y_hat.append(tf.nn.softmax(a, axis=0))

    if return_seq:
        return y_hat
    return y_hat[-1]

In [59]:
# preparing image data
# empty list to hold all the images
images = []

# loop over all keys in meta_data
for img in meta_data.keys():
    # Open inside a context manager so each file handle is released at once
    # instead of piling up across the whole dataset.
    with Image.open("Flicker8k_Dataset/" + str(img)) as image_file:
        # force 3 channels so a grayscale / palette / RGBA file still yields (224, 224, 3)
        resized = image_file.convert("RGB").resize((224, 224))
    # image object -> numpy array -> append to list
    images.append(np.asarray(resized))

# stack into a single (n_images, 224, 224, 3) array
images = np.array(images)

print("image data: ", images.shape)

image data:  (8091, 224, 224, 3)


In [0]:
# preparing text data
# dictionary to hold word vectors for images:
#   image name -> list with one (n_words, em_vector_len) float32 array per caption
# NOTE(review): this completes a cell that was left unfinished; confirm that
# one array of word vectors per caption is the layout the decoder should get.
text = {}

for key in meta_data.keys():
    caption_vectors = []
    for caption in meta_data[key]:
        # Words without a trained vector are skipped (presumably the rare ones
        # Word2Vec filters out through its minimum-count setting -- TODO confirm).
        word_vectors = [embedding.wv[word] for word in caption.split() if word in embedding.wv]
        # reshape keeps the array 2-D even when no word of the caption has a vector
        caption_vectors.append(np.array(word_vectors, dtype=np.float32).reshape(-1, em_vector_len))
    text[key] = caption_vectors

In [0]:
# The session stays open on purpose so it can be reused after this cell.
s = tf.Session()
s.run(tf.global_variables_initializer())

# one sample image as a batch of size 1; the file handle is closed right after reading
with Image.open("Flicker8k_Dataset/1000268201_693b08cb0e.jpg") as sample_file:
    img = np.asarray(sample_file.resize((224,224))).reshape(1,224,224,3)

# encoder output for the sample image
encoder_vector = s.run(encoder, {x_img: img})

# Two stacked GRU layers, both starting from the encoded image.
# The initial state was the undefined name `e`; the vector computed just above is used instead.
# NOTE(review): pass the `encoder` tensor instead if the decoder should stay
# connected to the encoder inside the graph -- confirm the intent.
# NOTE(review): `x` is declared with an unknown number of rows; confirm that
# Gru_cell unrolls the intended number of time steps for it.
# Each call gets its own output list so the two layers never share one.
decoder_1 = Gru_cell(gru_cell_len, encoder_vector, x, weights = w_1, return_seq=True, y_hat=[])
decoder_2 = Gru_cell(gru_cell_len, encoder_vector, decoder_1, weights = w_2, return_seq=False, y_hat=[])

