In [8]:
import os
import numpy as np
import tensorflow as tf

In [9]:
# Print every device TensorFlow can see (useful e.g. to check that a GPU is listed).
# NOTE(review): `device_lib` is imported from the `tensorflow.python` namespace, which is
# not part of the documented public API — confirm whether a public alternative should be used.
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 6427595556842017175
xla_global_id: -1
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 14048821248
locality {
  bus_id: 1
  links {
  }
}
incarnation: 9441591463301344886
physical_device_desc: "device: 0, name: NVIDIA GeForce RTX 4080, pci bus id: 0000:01:00.0, compute capability: 8.9"
xla_global_id: 416903419
]


In [10]:
# Open a TF1-compat session with device-placement logging switched on.
# NOTE(review): `ess` is not referenced by any later visible cell and looks like a typo
# for `sess` — confirm. The session is also never closed in the visible cells.
ess = tf.compat.v1.Session(config=tf.compat.v1.ConfigProto(log_device_placement=True))

Device mapping:
/job:localhost/replica:0/task:0/device:GPU:0 -> device: 0, name: NVIDIA GeForce RTX 4080, pci bus id: 0000:01:00.0, compute capability: 8.9



In [11]:
def load_doc(filename):
    """Return the entire contents of a text file as one string.

    Args:
        filename: Path of the file to read (opened in text mode, read-only).

    Returns:
        The file's full text.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # The context manager closes the handle even when read() raises,
    # which the manual open/close pair did not guarantee.
    with open(filename, 'r') as file:
        return file.read()

In [12]:
def load_set(filename):
    """Read a list of image file names and return the set of their identifiers.

    Each non-empty line of the file contributes the part of the line that
    precedes its first '.' (i.e. the file name without its extension).

    Args:
        filename: Path of the text file, one image file name per line.

    Returns:
        A set of identifier strings.
    """
    return {
        entry.split('.')[0]
        for entry in load_doc(filename).split('\n')
        if entry  # ignore empty lines
    }

# load training dataset (6K)
# `train` is the set of image identifiers; later cells use it to filter descriptions.
filename = 'Flickr_8k.trainImages.txt'
train = load_set(filename)
print('Dataset: %d' % len(train))

Dataset: 6000


In [13]:
def load_clean_descriptions(filename, dataset):
    """Load the captions of the images in `dataset`, wrapped in sequence markers.

    Each non-blank line of the file is split on whitespace: the first token is
    the image identifier and the remaining tokens are the caption words.

    Args:
        filename: Path of the descriptions file.
        dataset: Container of image identifiers to keep (membership is tested
            with `in`).

    Returns:
        A dict mapping image identifier to a list of caption strings, each of
        the form 'startseq <words> endseq'.
    """
    descriptions = dict()
    for line in load_doc(filename).split('\n'):
        tokens = line.split()
        if not tokens:
            # Blank line (e.g. produced by a trailing newline): indexing
            # tokens[0] would raise IndexError, so skip it.
            continue
        image_id, image_desc = tokens[0], tokens[1:]
        # skip images not in the set
        if image_id not in dataset:
            continue
        # wrap description in start/end tokens
        desc = 'startseq ' + ' '.join(image_desc) + ' endseq'
        descriptions.setdefault(image_id, list()).append(desc)
    return descriptions

# descriptions
# Maps each training image identifier to its list of 'startseq ... endseq' captions.
train_descriptions = load_clean_descriptions('descriptions.txt', train)
print('Descriptions: train=%d' % len(train_descriptions))

Descriptions: train=6000


In [14]:
# Flatten the per-image caption lists into one list of caption strings.
all_train_captions = [
    caption
    for captions in train_descriptions.values()
    for caption in captions
]
len(all_train_captions)

30000

In [15]:
# Keep only words that occur at least `word_count_threshold` times across
# all training captions.
word_count_threshold = 10

# Tally every space-separated token.
word_counts = {}
for sent in all_train_captions:
    for w in sent.split(' '):
        if w in word_counts:
            word_counts[w] += 1
        else:
            word_counts[w] = 1
nsents = len(all_train_captions)  # number of captions processed

# Dict iteration order is insertion order, so the vocabulary keeps
# first-occurrence order.
vocab = [w for w, count in word_counts.items() if count >= word_count_threshold]
print('preprocessed words %d -> %d' % (len(word_counts), len(vocab)))

preprocessed words 7578 -> 1651


In [16]:
# Build the two lookup tables between words and integer indices.
# Indices start at 1; 0 is never assigned to a word.
ixtoword = dict(enumerate(vocab, start=1))
wordtoix = {w: i for i, w in ixtoword.items()}

# Next free index, i.e. the value the counter has after the last assignment.
ix = len(vocab) + 1

In [17]:
# Show the index -> word table.
# NOTE(review): this prints the whole dictionary; consider previewing only a few entries.
print(ixtoword)

{1: 'startseq', 2: 'child', 3: 'in', 4: 'pink', 5: 'dress', 6: 'is', 7: 'climbing', 8: 'up', 9: 'set', 10: 'of', 11: 'stairs', 12: 'an', 13: 'way', 14: 'endseq', 15: 'girl', 16: 'going', 17: 'into', 18: 'wooden', 19: 'building', 20: 'little', 21: 'the', 22: 'to', 23: 'her', 24: 'black', 25: 'dog', 26: 'and', 27: 'spotted', 28: 'are', 29: 'fighting', 30: 'tricolored', 31: 'playing', 32: 'with', 33: 'each', 34: 'other', 35: 'on', 36: 'road', 37: 'white', 38: 'brown', 39: 'spots', 40: 'staring', 41: 'at', 42: 'street', 43: 'two', 44: 'dogs', 45: 'different', 46: 'looking', 47: 'pavement', 48: 'moving', 49: 'toward', 50: 'covered', 51: 'paint', 52: 'sits', 53: 'front', 54: 'painted', 55: 'rainbow', 56: 'hands', 57: 'bowl', 58: 'sitting', 59: 'large', 60: 'small', 61: 'grass', 62: 'plays', 63: 'it', 64: 'there', 65: 'pigtails', 66: 'painting', 67: 'young', 68: 'outside', 69: 'man', 70: 'lays', 71: 'bench', 72: 'while', 73: 'his', 74: 'by', 75: 'him', 76: 'which', 77: 'also', 78: 'tied', 79:

In [18]:
# Word indices start at 1, so add one slot for index 0, which the padded sequences use.
vocab_size = len(ixtoword) + 1 # one for appended 0's
vocab_size

1652

In [19]:
def to_lines(descriptions):
    """Flatten a dict of caption lists into a single list of captions.

    Args:
        descriptions: Dict mapping a key to an iterable of caption strings.

    Returns:
        A list with every caption, in dict order and then per-key order.
    """
    all_desc = list()
    for captions in descriptions.values():
        # extend() replaces the former list comprehension that was used only
        # for its side effect and built a throwaway list of None values.
        all_desc.extend(captions)
    return all_desc

# calculate the length of the description with the most words
def max_length(descriptions):
    """Return the word count of the longest caption in `descriptions`.

    Words are obtained by splitting each caption on whitespace.

    Raises:
        ValueError: If there are no captions at all (max of an empty sequence).
    """
    words_per_caption = [len(caption.split()) for caption in to_lines(descriptions)]
    return max(words_per_caption)

# determine the maximum sequence length
# NOTE: this rebinds the name `max_length` from the function defined above to its
# integer result. Later cells read `max_length` as that integer; the function is no
# longer reachable under this name until its `def` is executed again.
max_length = max_length(train_descriptions)
print('Description Length: %d' % max_length)

Description Length: 34


In [20]:
# Load GloVe vectors: each line of the file is a word followed by its coefficients.
glove_dir = 'archive'
embeddings_index = {}  # word -> float32 coefficient vector

# The context manager closes the file even if parsing a line raises.
with open(os.path.join(glove_dir, 'glove.6B.200d.txt'), encoding="utf-8") as f:
    for line in f:
        values = line.split()
        if not values:
            # Blank line: there is no word to index.
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [21]:
embedding_dim = 200

# One row per vocabulary index (including index 0), one column per embedding
# dimension. Rows start as zeros and are overwritten with the pre-trained
# vector when the word has one.
embedding_matrix = np.zeros((vocab_size, embedding_dim))

for word, i in wordtoix.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        # Word has no pre-trained vector: its row stays all zeros.
        continue
    embedding_matrix[i] = embedding_vector

In [22]:
# Sanity check: expected to be (vocab_size, embedding_dim).
embedding_matrix.shape

(1652, 200)

In [23]:
# Spot-check one row: the vector stored for word index 6.
print(embedding_matrix[6])

[ 0.32927999  0.25525999  0.26752999 -0.084809    0.29764     0.062339
 -0.15475     0.17783999  0.32328001 -0.92751998  0.15194     0.16324
 -0.10428    -0.026464    0.65970999  0.14782     0.38622999  0.25169
  0.1261     -0.43138     0.28092     3.16039991 -0.17565    -0.0032247
  0.64389002 -0.39697     0.18975     0.37999001 -0.079175   -0.14781
 -0.072965    0.057247   -0.42313999  0.4508     -0.097386   -0.47587001
 -0.96599001 -0.75594997 -0.033932   -0.070886   -0.44828001 -0.52094001
 -0.1823      0.18582    -0.074273   -0.017871    0.16742     0.015459
  0.30289999 -0.1258      0.32418001 -0.31263    -0.076832    0.051959
  0.27241999 -0.18285    -0.36478999 -0.63562    -0.21685     0.035812
  0.12485     0.37268001 -0.16976    -0.094146   -0.16412    -0.10728
  0.037866    0.1175     -0.15533     0.34062001  0.58848     0.38992
 -0.54838997  0.85013002 -0.83727998  0.15482    -0.37191001 -0.65408999
 -0.27631    -0.025224    0.075732   -0.23904    -0.18311    -0.084571
  0.

In [25]:
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import LSTM, Embedding, TimeDistributed, Dense, RepeatVector,\
                         Activation, Flatten, Reshape, concatenate, Dropout, BatchNormalization
#from keras.optimizers import Adam, RMSprop
#from keras.layers.wrappers import Bidirectional
from keras.layers import add
#from keras.applications.inception_v3 import InceptionV3
from keras.preprocessing import image
from keras.models import Model
from keras import Input, layers
from keras import optimizers
#from keras.applications.inception_v3 import preprocess_input
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical


In [26]:
def data_generator(descriptions, photos, wordtoix, max_length, num_classes=None):
    """Build the complete set of (photo, partial caption) -> next-word samples.

    Despite its name this function does not yield batches: it materialises
    every sample in memory and returns them all at once.

    For each caption, the words found in `wordtoix` are converted to indices;
    every proper prefix of that index sequence becomes one input, and the
    index following the prefix becomes its target.

    Args:
        descriptions: Dict mapping image identifier to a list of captions.
        photos: Mapping from '<identifier>.jpg' to that image's feature vector.
        wordtoix: Dict mapping word to integer index.
        max_length: Length every input sequence is padded to.
        num_classes: Width of the one-hot targets. Defaults to the
            module-level `vocab_size` when omitted.

    Returns:
        `[[X1, X2], y]` where X1 holds one photo feature vector per sample,
        X2 the padded input sequences and y the one-hot encoded next words.

    Raises:
        KeyError: If an image has no entry in `photos`.
    """
    if num_classes is None:
        num_classes = vocab_size

    X1, prefixes, next_words = list(), list(), list()
    for key, desc_list in descriptions.items():
        photo = photos[key+'.jpg']
        for desc in desc_list:
            # encode the caption, dropping out-of-vocabulary words
            seq = [wordtoix[word] for word in desc.split(' ') if word in wordtoix]
            # split one sequence into multiple (prefix, next word) pairs
            for i in range(1, len(seq)):
                X1.append(photo)
                prefixes.append(seq[:i])
                next_words.append(seq[i])

    # Pad and one-hot encode once for the whole data set instead of once per
    # sample: same arrays, but without a Python-level call per sample and
    # without holding a list of per-sample one-hot rows plus its array copy.
    X2 = pad_sequences(prefixes, maxlen=max_length)
    y = to_categorical(next_words, num_classes=num_classes)
    return [[np.array(X1), X2], y]
              

In [27]:
# Walk through the training-pair construction for the captions of one image.
item = train_descriptions['1000268201_693b08cb0e']
x, y = [], []  # padded input sequences / one-hot targets
m, n = [], []  # raw input prefixes / raw target indices
for desc in item:
    print(desc)
    seq = [wordtoix[word] for word in desc.split(' ') if word in wordtoix]
    for i in range(1, len(seq)):
        prefix = seq[:i]
        target = seq[i]
        m.append(prefix)
        n.append(target)
        # pad the input sequence and one-hot encode the target
        x.append(pad_sequences([prefix], maxlen=max_length)[0])
        y.append(to_categorical([target], num_classes=vocab_size)[0])
    

startseq child in pink dress is climbing up set of stairs in an entry way endseq
startseq girl going into wooden building endseq
startseq little girl climbing into wooden playhouse endseq
startseq little girl climbing the stairs to her playhouse endseq
startseq little girl in pink dress going into wooden cabin endseq


In [28]:
# First padded input sequence collected for the sample image.
print(x[0])

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1]


In [29]:
# Number of raw target indices collected for the sample image.
print(len(n))

43


In [30]:
# Number of raw input prefixes collected (one per target).
print(len(m))

43


In [31]:
# One-hot targets collected for the sample image.
# NOTE(review): this prints every array in the list; consider previewing a slice.
print(y)

[array([0., 0., 1., ..., 0., 0., 0.], dtype=float32), array([0., 0., 0., ..., 0., 0., 0.], dtype=float32), array([0., 0., 0., ..., 0., 0., 0.], dtype=float32), array([0., 0., 0., ..., 0., 0., 0.], dtype=float32), array([0., 0., 0., ..., 0., 0., 0.], dtype=float32), array([0., 0., 0., ..., 0., 0., 0.], dtype=float32), array([0., 0., 0., ..., 0., 0., 0.], dtype=float32), array([0., 0., 0., ..., 0., 0., 0.], dtype=float32), array([0., 0., 0., ..., 0., 0., 0.], dtype=float32), array([0., 0., 0., ..., 0., 0., 0.], dtype=float32), array([0., 0., 0., ..., 0., 0., 0.], dtype=float32), array([0., 0., 0., ..., 0., 0., 0.], dtype=float32), array([0., 0., 0., ..., 0., 0., 0.], dtype=float32), array([0., 0., 0., ..., 0., 0., 0.], dtype=float32), array([0., 0., 0., ..., 0., 0., 0.], dtype=float32), array([0., 0., 0., ..., 0., 0., 0.], dtype=float32), array([0., 0., 0., ..., 0., 0., 0.], dtype=float32), array([0., 0., 0., ..., 0., 0., 0.], dtype=float32), array([0., 0., 0., ..., 0., 0., 0.], dtype=fl

In [32]:
import pickle

# Load the pre-computed image encodings for the training set.
# NOTE: unpickling can execute arbitrary code — only load files from a trusted source.
# The context manager closes the file handle, which a bare
# pickle.load(open(...)) leaves open.
with open("encoded_train_images.pkl", "rb") as encoded_pickle:
    train_features = pickle.load(encoded_pickle)
print('Photos: train=%d' % len(train_features))

Photos: train=6000


In [33]:
#epochs = 10
# NOTE(review): neither `number_pics_per_bath` (presumably "batch") nor `steps` is read
# by any later visible cell — confirm whether they are still needed.
number_pics_per_bath = 6000
steps = len(train_descriptions)//number_pics_per_bath

In [34]:
# Materialise every training sample in memory: X is [photo features, padded sequences]
# and y the one-hot next words. This rebinds `y`, replacing the list built in the
# inspection cell.
X,y = data_generator(train_descriptions, train_features, wordtoix, max_length)

In [35]:
# Total number of training samples.
print(len(y))

292328


In [36]:
# Two-input caption model. NOTE: the order in which layers are created here matters —
# a later cell addresses the embedding layer as model.layers[2].

# Image branch: 4096-dim feature vector -> dropout -> 256-unit dense projection.
inputs1 = Input(shape=(4096,))
fe1 = Dropout(0.5)(inputs1)
fe2 = Dense(256, activation='relu')(fe1)
# Text branch: padded index sequence -> embedding (mask_zero=True, so index 0 is
# masked) -> dropout -> 256-unit LSTM.
inputs2 = Input(shape=(max_length,))
se1 = Embedding(vocab_size, embedding_dim, mask_zero=True)(inputs2)
se2 = Dropout(0.5)(se1)
se3 = LSTM(256)(se2)
# Decoder: element-wise sum of both 256-dim branches -> dense -> softmax over the vocabulary.
decoder1 = add([fe2, se3])
decoder2 = Dense(256, activation='relu')(decoder1)
outputs = Dense(vocab_size, activation='softmax')(decoder2)
model = Model(inputs=[inputs1, inputs2], outputs=outputs)

In [37]:
# Print the layer-by-layer summary (layer order, output shapes, parameter counts).
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_2 (InputLayer)           [(None, 34)]         0           []                               
                                                                                                  
 input_1 (InputLayer)           [(None, 4096)]       0           []                               
                                                                                                  
 embedding (Embedding)          (None, 34, 200)      330400      ['input_2[0][0]']                
                                                                                                  
 dropout (Dropout)              (None, 4096)         0           ['input_1[0][0]']                
                                                                                              

In [38]:
# Load the pre-trained vectors into the embedding layer and freeze it.
# The layer is located by type instead of by the positional index 2, which
# silently targets the wrong layer if the model definition is ever reordered.
embedding_layer = next(layer for layer in model.layers if isinstance(layer, Embedding))
embedding_layer.set_weights([embedding_matrix])
embedding_layer.trainable = False

In [39]:
# Categorical cross-entropy matches the one-hot targets and softmax output.
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [None]:
# Train on the full in-memory arrays for 10 epochs.
#model.optimizer.lr = 0.0001
model.fit(X,y, epochs=10, verbose=1)
#model.save('./model_weights/model_' + str(i) + '.h5')

Epoch 1/10
Epoch 2/10
Epoch 3/10

In [None]:
# Persist the trained model to a file in the working directory.
model.save('First_try.h5')

In [None]:
# Record the TensorFlow version in use.
# NOTE(review): tensorflow is already imported as `tf` in the first cell, so this
# re-import is redundant.
import tensorflow as tf
print(tf.__version__)

In [None]:
# Load the pre-computed encodings of the test images; a later cell uses the keys as
# image file names and reshapes each value to (1, 4096).
# NOTE: unpickling can execute arbitrary code — only load files from a trusted source.
with open("encoded_test_images.pkl", "rb") as encoded_pickle:
    encoding_test = pickle.load(encoded_pickle)

In [None]:
def greedySearch(photo):
    """Generate a caption for `photo` by always taking the most probable next word.

    Starting from 'startseq', the current words are encoded with `wordtoix`,
    padded to `max_length` and passed to `model` together with `photo`; the
    argmax of the prediction is looked up in `ixtoword` and appended.
    Generation stops at 'endseq', after `max_length` steps, or when the
    predicted index has no word.

    Uses the module-level `model`, `wordtoix`, `ixtoword` and `max_length`.

    Args:
        photo: Image features in the form the model's first input accepts.

    Returns:
        The generated words joined by single spaces, without the
        'startseq' / 'endseq' markers.
    """
    words = ['startseq']
    for _ in range(max_length):
        encoded = [wordtoix[w] for w in words if w in wordtoix]
        encoded = pad_sequences([encoded], maxlen=max_length)
        yhat = model.predict([photo, encoded], verbose=0)
        # Index 0 is never assigned to a word, so a plain ixtoword[...] lookup
        # would raise KeyError if the model predicts it; treat that as a stop.
        word = ixtoword.get(int(np.argmax(yhat)))
        if word is None or word == 'endseq':
            break
        words.append(word)
    # Only 'startseq' is stripped here: 'endseq' is never appended, so a
    # caption that hits the length limit keeps its final real word instead of
    # losing it to an unconditional [1:-1] slice.
    return ' '.join(words[1:])

In [None]:
# Directory of the image files. The trailing slash is required because the display
# cell builds paths by plain string concatenation.
images = 'Flicker8k_Dataset/'

In [None]:
# Position in the list of test images; the display cell increments it on every run.
z=0

In [None]:
# Show the next test image together with its greedily generated caption.
import matplotlib.pyplot as plt
#z=0
# Incremented before use, so each run of this cell advances to the next image and
# index 0 is skipped on the first run after z is reset to 0.
z+=1
pic = list(encoding_test.keys())[z]
# NOTE(review): `image` rebinds the name imported earlier via
# `from keras.preprocessing import image`, and `x` rebinds the list from the inspection
# cell — confirm nothing later relies on the previous values.
image = encoding_test[pic].reshape((1,4096))
x=plt.imread(images+pic)
plt.imshow(x)
plt.show()
print("Greedy:",greedySearch(image))