In [2]:
import pandas as pd

import numpy as np

import random
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.image import load_img
from tensorflow.keras.utils import to_categorical
import string

In [3]:
# Upper bound on how many of the most frequent tokens are kept as vocabulary candidates.
# NOTE(review): this name is rebound further down to the post-filtering vocabulary
# size, so re-running cells out of order changes what it means.
vocab_size = 5000
# Number of columns every caption prefix is padded/truncated to before it reaches the model.
max_len = 15

In [4]:
# Load the caption table; later cells read its `image` and `caption` columns.
df = pd.read_csv("flickr8k/captions")

In [5]:
df.head(5)

Unnamed: 0,image,caption
0,1000268201_693b08cb0e.jpg,A child in a pink dress is climbing up a set o...
1,1000268201_693b08cb0e.jpg,A girl going into a wooden building .
2,1000268201_693b08cb0e.jpg,A little girl climbing into a wooden playhouse .
3,1000268201_693b08cb0e.jpg,A little girl climbing the stairs to her playh...
4,1000268201_693b08cb0e.jpg,A little girl in a pink dress going into a woo...


In [6]:
def _tokenize_caption(caption):
    """Return the lower-cased NLTK token list for a raw caption string.

    Values that are already token lists are returned unchanged, so re-running
    this cell neither fails on nor re-tokenizes rows it has already processed.
    """
    if isinstance(caption, list):
        return caption
    return word_tokenize(caption.lower())


# The column is overwritten in place (downstream cells read `df.caption` as token lists).
df["caption"] = df.caption.apply(_tokenize_caption)

In [7]:
df.head(5)

Unnamed: 0,image,caption
0,1000268201_693b08cb0e.jpg,"[a, child, in, a, pink, dress, is, climbing, u..."
1,1000268201_693b08cb0e.jpg,"[a, girl, going, into, a, wooden, building, .]"
2,1000268201_693b08cb0e.jpg,"[a, little, girl, climbing, into, a, wooden, p..."
3,1000268201_693b08cb0e.jpg,"[a, little, girl, climbing, the, stairs, to, h..."
4,1000268201_693b08cb0e.jpg,"[a, little, girl, in, a, pink, dress, going, i..."


In [8]:
# Flatten every tokenized caption into one long token list (duplicates kept,
# caption order preserved).
words = [token for caption_tokens in df.caption for token in caption_tokens]

In [9]:
len(words)

476679

In [10]:
# Reduce the token list to its sorted unique entries and count how often each occurs.
# NOTE: `words` is rebound here from the flat token list to the array of unique tokens,
# so this cell is not idempotent.
words, counts = np.unique(words, return_counts=True)

In [11]:
len(words)

8916

In [12]:
# Keep the `vocab_size` most frequent tokens. argsort is ascending, so the
# selection runs from least to most frequent (the most common token comes last).
words = words[counts.argsort()[-vocab_size:]]

In [13]:
# Tokens to exclude from the vocabulary: NLTK's English stopwords plus ASCII punctuation characters.
bad = stopwords.words("english") + list(string.punctuation)

In [14]:
# Drop stopwords and punctuation from the vocabulary candidates.
# Membership is checked against a set built once, so each lookup is O(1)
# instead of a linear scan over the exclusion list; the result is unchanged.
excluded = set(bad)
words = [word for word in words if word not in excluded]

In [15]:
# Rebind vocab_size to the number of tokens that survived filtering; the
# encoding dictionaries and the model's embedding/output sizes are derived from it.
vocab_size = len(words)

In [16]:
vocab_size

4884

In [17]:
# Token -> integer id. Ids start at 1, so id 0 is never assigned to a token.
vocab = {token: token_id for token, token_id in zip(words, range(1, vocab_size + 1))}

In [18]:
# Integer id -> token, the inverse of the encoding table (ids start at 1).
rev_vocab = {token_id: token for token_id, token in zip(range(1, vocab_size + 1), words)}

In [19]:
sent = df.caption[2000]

In [20]:
" ".join(sent[1:-1])

'climber in an orange helmet is ascending attached to a rope whilst climbing a rock face'

In [21]:
# Same caption as above with every out-of-vocabulary token removed.
" ".join(filter(lambda token: token in vocab, sent[1:-1]))

'climber orange helmet ascending attached rope whilst climbing rock face'

In [22]:
# line = random.choice(df.caption)
# line = [vocab[word] for word in line if word in vocab]

# line

In [27]:
def data_gen(images, lines, batch_size=32):
    """Endlessly yield random (image, caption prefix) -> next-word training batches.

    Each sample is produced by picking a random caption, encoding it with the
    notebook-level ``vocab`` (out-of-vocabulary tokens are dropped), choosing a
    random split position, and using the ids before the split as the input
    sequence and the id at the split as the target.

    Parameters
    ----------
    images : sequence of str
        Image file names, resolved relative to ``./flickr8k/images/``.
    lines : sequence of list of str
        Tokenized captions; ``lines[i]`` is the caption for ``images[i]``.
    batch_size : int, default 32
        Number of samples in each yielded batch.

    Yields
    ------
    list
        ``[[photos, prefixes], targets]``: ``photos`` is the stacked image
        array, ``prefixes`` the id matrix padded to ``max_len`` columns, and
        ``targets`` the one-hot matrix with ``vocab_size + 1`` columns.

    Raises
    ------
    ValueError
        If ``images`` is empty.

    Notes
    -----
    The generator never terminates. If no caption has more than four
    in-vocabulary tokens it spins forever without yielding.
    """
    # Materialize both inputs so the random picks below are positional even
    # when a pandas Series with a non-default index is passed in.
    images = list(images)
    lines = list(lines)
    if not images:
        raise ValueError("data_gen needs at least one image/caption pair")

    n = 0
    X1, X2, Y = [], [], []

    while True:
        sample_idx = random.randint(0, len(images) - 1)

        # Encode the caption, silently dropping tokens that are not in the vocabulary.
        line = [vocab[word] for word in lines[sample_idx] if word in vocab]
        if len(line) <= 4:
            # Too short to be useful; skip before paying for the image load.
            continue

        # Decode the image only for samples that are actually used.
        # target_size is (height, width); channels are not part of it.
        photo = np.array(
            load_img("./flickr8k/images/" + images[sample_idx], target_size=(256, 256))
        )

        # Everything before `split` is the model input, the id at `split` is the target.
        split = random.randint(0, len(line) - 1)
        X1.append(photo)
        X2.append(line[:split])
        Y.append(to_categorical(line[split], num_classes=vocab_size + 1))
        n += 1

        if n % batch_size == 0:
            X1_mod = np.array(X1)
            X2_mod = pad_sequences(X2, maxlen=max_len)
            y_mod = np.array(Y)
            # Start a fresh batch before handing this one to the caller.
            X1, X2, Y = [], [], []
            yield [[X1_mod, X2_mod], y_mod]

    

In [37]:
gen = data_gen(df.image, df.caption, batch_size=100)

In [35]:
data = next(gen)

In [36]:
data[1].shape

(100, 4885)

In [44]:
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding, Flatten, concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.applications.resnet import ResNet50

In [39]:
# ImageNet-pretrained ResNet50 without its classification head; its output feeds the image branch.
resnet = ResNet50(include_top=False, weights='imagenet', input_shape=(256, 256, 3))

In [52]:
# Mark every ResNet layer as non-trainable so its pretrained weights are not updated during fitting.
for layer in resnet.layers:
    layer.trainable = False

In [53]:
# Caption branch: padded id sequence -> embedding -> LSTM -> 256-d encoding.
in_layer = Input(shape=(max_len,))
# Id 0 only ever comes from pad_sequences (vocabulary ids start at 1), so it is
# masked to stop the LSTM from consuming padding as if it were real tokens.
embedding = Embedding(input_dim=vocab_size+1, output_dim=10, mask_zero=True)(in_layer)
# NOTE(review): activation="relu" replaces the LSTM's default tanh; kept as written.
rnn = LSTM(units=500, activation="relu")(embedding)
encoded_caption = Dense(units=256, activation="relu")(rnn)

In [54]:
# Image branch: flatten the ResNet output and project it to a 256-d encoding.
flat = Flatten()(resnet.output)
encoded_img = Dense(256, activation="relu")(flat)

In [55]:
# Join the image encoding and the caption encoding into one feature vector.
concat = concatenate([encoded_img, encoded_caption])

In [56]:
# Softmax over vocab_size + 1 classes: one per vocabulary id plus the unused id 0.
out_layer = Dense(units=vocab_size+1, activation="softmax")(concat)

In [57]:
# Two-input model: [image batch, caption-prefix batch] -> next-word distribution.
# The input order matches the [X1, X2] order the batch generator yields.
model = Model(inputs=[resnet.input, in_layer], outputs=out_layer)

In [58]:
# Targets are one-hot rows (built with to_categorical in the generator), hence categorical cross-entropy.
model.compile(optimizer="rmsprop", loss="categorical_crossentropy")

In [59]:
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 256, 256, 3) 0                                            
__________________________________________________________________________________________________
conv1_pad (ZeroPadding2D)       (None, 262, 262, 3)  0           input_1[0][0]                    
__________________________________________________________________________________________________
conv1_conv (Conv2D)             (None, 128, 128, 64) 9472        conv1_pad[0][0]                  
__________________________________________________________________________________________________
conv1_bn (BatchNormalization)   (None, 128, 128, 64) 256         conv1_conv[0][0]                 
____________________________________________________________________________________________

In [278]:
# data_gen requires both the image file names and the captions (the model has
# an image input and a caption input); omitting either raises a TypeError.
gen = data_gen(df.image, df.caption, batch_size=2000)

# Fit on five independently sampled batches, one fit call per batch.
for i in range(5):
    data = next(gen)
    model.fit(data[0], data[1])


Train on 2000 samples
Train on 2000 samples
Train on 2000 samples
Train on 2000 samples
Train on 2000 samples


In [303]:
line = df.caption[200][:2]


In [304]:
line

['two', 'constructions']

In [305]:
# Encode the tokens as vocabulary ids, dropping out-of-vocabulary tokens.
# NOTE: `line` is rebound from tokens to ids, so re-running this cell yields an empty list.
line = [vocab[word] for word in line if word in vocab]

In [306]:
X_test = pad_sequences([line], maxlen=max_len)

In [307]:
# The model takes [image batch, caption-prefix batch]. Load the photo that
# belongs to caption 200 (the caption the prefix was taken from) and add a
# leading batch axis so it lines up with the single-row X_test.
photo = np.array(load_img("./flickr8k/images/" + df.image[200], target_size=(256, 256)))
model.predict([photo[np.newaxis], X_test]).argmax(axis=1)

array([4884])

In [308]:
rev_vocab[4884]

'dog'