In [0]:
#!pip install gensim
import numpy as np
import tensorflow as tf
import gensim
import re
from gensim.models import Word2Vec
import requests, zipfile, io
from PIL import Image
from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras.layers import Dense, Flatten

In [0]:
# downloading data
image_data_url = "http://nlp.cs.illinois.edu/HockenmaierGroup/Framing_Image_Description/Flickr8k_Dataset.zip"
text_data_url = "http://nlp.cs.illinois.edu/HockenmaierGroup/Framing_Image_Description/Flickr8k_text.zip"

# downloading images
images_from_url = requests.get(image_data_url)
# extraction
images_zip = zipfile.ZipFile(io.BytesIO(images_from_url.content))
images_zip.extractall()

# downloading captions
text_from_url = requests.get(text_data_url)
# extraction
text_zip = zipfile.ZipFile(io.BytesIO(text_from_url.content))
text_zip.extractall()

!rm -r __MACOSX

In [0]:
### Preparing metadata
# Builds `meta_data`: a dict mapping an image name to its list of five captions.

# respective captions
# read through a context manager so the file handle is closed deterministically
with open("Flickr8k.token.txt") as token_file:
    captions = token_file.read()

# some processing magic
# one entry per line, each split on the tab into [image reference, caption].
# If the file ends with a newline, the split leaves a final [''] entry; the
# loops below that read index 1 stop at len(captions)-1, presumably to skip it.
captions = captions.split("\n")
captions = [line.split("\t") for line in captions]

# removing (.)
# The pattern keeps only runs of word characters that are at least two
# characters long and end in an ASCII letter, so punctuation is dropped.
# NOTE(review): this also drops one-character tokens such as "a" -- confirm
# that is intended.
for c in range(len(captions)-1):
    captions[c][1] = ' '.join(re.findall(r'\w+[a-zA-Z]', captions[c][1]))


# cleaning up the image names
# drop the last two characters of every image reference
# (presumably the "#<n>" caption-index suffix -- TODO confirm)
for entry in captions:
    entry[0] = entry[0][:-2]


# this dictionary holds the image names and respective captions
# assumes every image owns exactly five consecutive entries in the file
meta_data = {}
for i in range(0, len(captions)-1, 5):
    meta_data[captions[i][0]] = [captions[i+j][1] for j in range(5)]

# sample
# show the first three images together with their captions
for shown, name in enumerate(meta_data):
    if shown == 3:
        break
    print(name, ":")
    print(meta_data[name], "\n")

# removing a bad key
# pop with a default so the cell still runs when the entry is not present
meta_data.pop("2258277193_586949ec62.jpg.1", None)

In [0]:
embedding_len = 200          # length of word vector

### making word embedding
# every caption in the dataset is tokenised and handed to gensim, which learns
# one vector of length embedding_len per word

# one list of whitespace-separated words per caption, in meta_data order
sentences = [
    caption.split()
    for image_captions in meta_data.values()
    for caption in image_captions
]

# training the word2vec model
embedding = Word2Vec(
    sentences,
    size=embedding_len,
    min_count=0,
    window=4,
    sg=1,
    workers=20,
)
# NOTE(review): the constructor above already received `sentences`; check
# whether this additional train() pass is intended
embedding.train(sentences, total_examples=len(sentences), epochs = 10)

# demo
# the three nearest neighbours of "entry" in the learned vector space
embedding.wv.most_similar("entry", topn=3)

In [0]:
### Placeholders
# graph inputs; the leading None leaves the first dimension unspecified
x_train = tf.placeholder(dtype=tf.float32, shape=[None, embedding_len])
y_train = tf.placeholder(dtype=tf.float32, shape=[None, embedding_len])
x_img = tf.placeholder(dtype=tf.float32, shape=[None, 224, 224, 3])

In [0]:
def _gru_layer_params(layer, n_h, n_em):
    """
        Create (or fetch, depending on the enclosing variable scope) the
        weights of one GRU layer.

        layer  : layer number used in the variable names, e.g. 1 -> 'w1_rx'
        n_h    : no. of neurons in GRU hidden layer
        n_em   : length of embedding vector

        Returns a dict of variables keyed by a short role name.
    """
    def var(name, shape):
        return tf.get_variable(name.format(layer), shape,
                               initializer=tf.contrib.layers.xavier_initializer())

    return {
        # reset gate
        'rx': var('w{}_rx', [n_h, n_em]),
        'ra': var('w{}_ra', [n_h, n_h]),
        'br': var('b{}_r', [n_h, 1]),
        # update gate
        'ux': var('w{}_ux', [n_h, n_em]),
        'ua': var('w{}_ua', [n_h, n_h]),
        'bu': var('b{}_u', [n_h, 1]),
        # candidate cell
        'cx': var('w{}_cx', [n_h, n_em]),
        'ca': var('w{}_ca', [n_h, n_h]),
        'bc': var('b{}_c', [n_h, 1]),
        # output projection
        'y': var('w{}_y', [n_em, n_h]),
        'by': var('b{}_y', [n_em, 1]),
    }


def _gru_layer(p, x_t, a_t):
    """
        Apply one GRU layer to a single column-vector input.

        p      : weight dict from _gru_layer_params
        x_t    : input column vector, [n_em, 1]
        a_t    : hidden state column vector, [n_h, 1]

        Returns (h, yhat): the new hidden state [n_h, 1] and the output [n_em, 1].
    """
    reset = tf.nn.sigmoid(tf.matmul(p['rx'], x_t) + tf.matmul(p['ra'], a_t) + p['br'])
    update = tf.nn.sigmoid(tf.matmul(p['ux'], x_t) + tf.matmul(p['ua'], a_t) + p['bu'])
    cell = tf.nn.tanh(tf.matmul(p['cx'], x_t) + tf.matmul(p['ca'], tf.multiply(reset, a_t)) + p['bc'])
    h = tf.multiply(update, a_t) + tf.multiply((1 - update), cell)
    # normalise over the n_em rows: the default (last) axis has length 1 here,
    # which would turn every entry into 1.0
    yhat = tf.nn.softmax(tf.matmul(p['y'], h) + p['by'], axis=0)
    return h, yhat


def GRU(n_h, n_em, a_t, x, w = None, init=False, t=0):
    """
        Build one time step of a two-layer GRU and return layer 2's output.

        n_h    : no. of neurons in GRU hidden layer
        n_em   : length of embedding vector
        a_t    : hidden state vector; reshaped to a column of a_t.shape[1] rows,
                 so a_t.shape[1] has to equal n_h (and, presumably, the leading
                 dimension has to be 1 at run time -- TODO confirm)
        x      : word vector input; row t is used
        w      : weights (currently unused)
        init   : whether to initialize weights. True creates the variables, or
                 reuses them if they already exist; False requires that they
                 already exist in the current graph
        t      : time step

        Returns the [n_em, 1] output tensor of the second layer.

        The weights are created in the current default graph. The graph is NOT
        reset here: `x` and `a_t` already live in it, and tensors from
        different graphs cannot be combined.
    """

    # keep the original (unprefixed) variable names; only the reuse mode differs
    reuse = tf.AUTO_REUSE if init else True
    with tf.variable_scope(tf.get_variable_scope(), reuse=reuse):
        layer1 = _gru_layer_params(1, n_h, n_em)
        layer2 = _gru_layer_params(2, n_h, n_em)

    # word vector at step t
    x_t = tf.reshape(x[t], [x.shape[1],1])
    a_t = tf.reshape(a_t, [a_t.shape[1],1])

    # gru layer 1
    h1, yhat1 = _gru_layer(layer1, x_t, a_t)

    # gru layer 2
    # NOTE(review): as in the original wiring, layer 2 reads x_t and a_t
    # directly, so h1 / yhat1 do not feed into it -- confirm the intended
    # stacking before training.
    h2, yhat2 = _gru_layer(layer2, x_t, a_t)

    # TODO: unroll over further time steps (feed h2 and yhat2 into step t+1)

    return yhat2

In [9]:
# using VGG16 for the image classifier(encoder)
# VGG16 without its top layers reads x_img; its output goes through
# Dense(1024, relu) -> Flatten -> Dense(512, tanh) to give `encoder`

with tf.device('/gpu:0'):
    vgg = VGG16(weights='imagenet', include_top=False, input_shape=(224,224,3), input_tensor=x_img)
    vgg_features = vgg.output
    projected = Dense(1024, activation="relu")(vgg_features)
    flattened = Flatten()(projected)
    encoder = Dense(512, activation="tanh")(flattened)

Downloading data from https://github.com/fchollet/deep-learning-models/releases/download/v0.1/vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5


In [0]:
# The encoder output is passed as the hidden state, so the hidden size has to
# equal the encoder width (a fixed 100 cannot match it); the embedding size is
# taken from embedding_len instead of repeating the literal.
GRU(int(encoder.shape[1]), n_em = embedding_len, a_t=encoder, x = x_train, init=True, t=0)