In [2]:
import numpy as np
import string
from PIL import Image
from tqdm import tqdm_notebook as tqdm
import os
from pickle import dump, load

# Tensorflow Imports
from tensorflow import keras
from keras.preprocessing.text import Tokenizer
from keras.applications.xception import Xception, preprocess_input
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Input, Dense, LSTM, Embedding, Dropout
from keras.layers.merging import add
from keras.models import Model, load_model




### Data Cleaning

In [22]:
# text file loading
def load_doc(file_name):
    file= open(file_name, 'r')# reading mode
    text= file.read()
    file.close()
    return text

# Data cleaning (lower casing, puntuataions removal, word containing number)
def clean_text(captions):
    table= str.maketrans('','',string.punctuation)
    for img,cap in captions.items():
        for i,img_cap in enumerate(cap):
            
            img_cap.replace("-"," ")
            descr= img_cap.split()
            
            # to lowercase
            descr= [wd.lower() for wd in descr]
            # removing 's and 'a'
            descr= [wd.translate(table) for wd in descr if(len(wd)>1)]
            # punctuation removal
            descr= [wd.translate(table) for wd in descr]
            # token removal
            descr= [wd for wd in descr if(wd.isalpha())]
            # convert to string
            img_cap= ' '.join(descr)
            captions[img][i]= img_cap
    return captions

# get imgs with respective captions
def img_captions(file_name):
    file= load_doc(file_name)
    captions= file.split('\n')
    descr= {}
    for caption in captions[:-1]:
        img, caption= caption.split('\t')
        if img[:-2] not in descr:
            descr[img[:-2]]= [caption]
        else:
            descr[img[:-2]].append(caption)
    return descr

# vocabulary building
def text_vocab(descr):
    vocab= set()
    
    for key in descr.keys():
        [vocab.update(d.split()) for d in descr[key]]
    
    return vocab

# saving descriptions in one file
def save_descr(descr, file_name):
    lines= list()
    for key,descr_list in descr.items():
        for descr in descr_list:
            lines.append(key+'\t'+descr)
    data= "\n".join(lines)
    file= open(file_name, "w")
    file.write(data)
    file.close()
    
# define paths
dataset_txt= './Flickr8k_text/'
dataset_img= './Flickr8k_Dataset/Flicker8k_Dataset/'

file_name= dataset_txt + "/" + "Flickr8k.token.txt"

description= img_captions(file_name)
print("Length of Descriptions: ",len(description))

cleaned_descr= clean_text(description)

vocab= text_vocab(cleaned_descr)
print("Length of Vocabulary: ",len(vocab))

save_descr(cleaned_descr, "descriptions.txt")

Length of Descriptions:  8092
Length of Vocabulary:  8767


### Feature Exctraction

In [25]:
def feature_extractor(data_path):
    model= Xception(include_top=False, pooling='avg')
    features= {}
    for img_iter in tqdm(os.listdir(data_path)):
        file_name= data_path + "/" + img_iter
        img= Image.open(file_name)
        img= img.resize((299,299)) # deafault dim for xception-net input
        img= np.expand_dims(img, axis=0)
        img= img /127.5
        img= img - 1.0
        with tf.device('/GPU:0'):
            feature= model.predict(img)
        features[img_iter]= feature
    
    return features

# length of feature vactor will be 2048
features= feature_extractor(dataset_img)
dump(features, open('feature.pkl', 'wb'))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for img_iter in tqdm(os.listdir(data_path)):


  0%|          | 0/8091 [00:00<?, ?it/s]





















































In [38]:
features= load(open("feature.pkl", "rb"))
features

{'1000268201_693b08cb0e.jpg': array([[0.4733967 , 0.01732627, 0.07333964, ..., 0.08559036, 0.02102286,
         0.23766518]], dtype=float32),
 '1001773457_577c3a7d70.jpg': array([[0.0015821 , 0.11113477, 0.00037395, ..., 0.2650358 , 0.35279816,
         0.05871661]], dtype=float32),
 '1002674143_1b742ab4b8.jpg': array([[0.        , 0.02488971, 0.0155406 , ..., 0.        , 0.        ,
         0.10192642]], dtype=float32),
 '1003163366_44323f5815.jpg': array([[0.1456884 , 0.00272412, 0.2777652 , ..., 0.17018229, 0.11957315,
         0.09414066]], dtype=float32),
 '1007129816_e794419615.jpg': array([[0.        , 0.12443963, 0.73916227, ..., 0.00390438, 0.00997145,
         0.50172365]], dtype=float32),
 '1007320043_627395c3d8.jpg': array([[0.04136696, 0.        , 0.01274469, ..., 0.00944691, 0.6420165 ,
         0.04792123]], dtype=float32),
 '1009434119_febe49276a.jpg': array([[0.        , 0.        , 0.02624737, ..., 0.30528584, 0.23091778,
         0.14191064]], dtype=float32),
 '1012

### Helper Functions for Training

In [44]:
# load data (list of images)
def load_imgs(file_name):
    file= load_doc(file_name)
    imgs= file.split("\n")[:-1]
    return imgs

def load_cleaned_descrs(file_name, imgs):
    file= load_doc(file_name)
    descrs= {}
    for line in tqdm(file.split("\n")):
        word= line.split()
        if len(word)<1:
            continue
         
        img, img_cap= word[0], word[1:]
        if img in imgs:
            if img not in descrs:
                descrs[img]= []
            des= '<start> '+ " ".join(img_cap)+' <end>'
            descrs[img].append(des)
    
    return descrs

def load_feature(imgs):
    feature= load(open("feature.pkl", "rb"))
    features= {k:feature[k] for k in imgs}
    
    return features

file_name= dataset_txt + "/" + "Flickr_8k.trainImages.txt"
train_img= load_imgs(file_name)
train_descrs= load_cleaned_descrs("descriptions.txt", train_img)
train_feature= load_feature(train_img)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for line in tqdm(file.split("\n")):


  0%|          | 0/40460 [00:00<?, ?it/s]

### Vocabulary Tokenization

In [50]:
# converting dict to list of discriptions
def dict_to_list(descr):
    all_descr= []
    for key in descr.keys():
        [all_descr.append(d) for d in descr[key]]
    return all_descr

# creating tokenizer function (it'll vectorize text corpus)
def create_tokens(descr):
    descr_list= dict_to_list(descr)
    tokenizer= Tokenizer()
    tokenizer.fit_on_texts(descr_list)
    return tokenizer

tokens= create_tokens(train_descrs)
dump(tokens, open("tokens.pkl", "wb"))
vocab_size= len(tokens.word_index) + 1
print('total words:', vocab_size)

total words: 7581


In [51]:
# calculating max length for description of image 
def max_descr_len(descr):
    descr_list= dict_to_list(descr)
    return max(len(d.split()) for d in descr_list)

max_len= max_descr_len(description)
max_len

32

#### Defining Input and Output of model
- Input: it will be in form of [X1,X2] and output will be Y  <br>        (where X1 is 2048 length of feature vector and X2 is input text sequence)
- Output: it will be text sequence that model will predict

In [57]:
# creating Input-Output pairs from image description

def data_generator(descrs, feature, tokens, max_len):
    while 1:
        for key, descr_list in description.items():
            # retrve img feature
            feature= feature[key][0]
            input_img, input_seq, output_word= create_sequences(tokens, max_len, descr_list, feature)
            yield [[input_img, input_seq], output_word]

def create_sequences(tokens, max_len, descr_list, feature):
    X1, X2, Y= list(), list(), list()
    for desc in descr_list:
        # sequence encoding
        seq= tokens.texts_to_sequences([desc])[0]
        #splitting one sequence into multiple X,Y pairs
        for i in range(1, len(seq)):
            in_seq, out_seq = seq[:i], seq[i]
            # pad sequence
            in_seq= pad_sequences([in_seq], maxlen=max_len)[0]
            # encode output sequence
            out_seq= to_categorical([out_seq], num_classes=vocab_size)[0]
            # store
            X1.append(feature)
            X2.append(in_seq)
            Y.append(out_seq)
            
    return np.array(X1), np.array(X2), np.array(Y)

# checking shape of I/P and O/P data
[a,b], c= next(data_generator(train_descrs, features, tokens, max_len))
a.shape, b.shape, c.shape

((37, 2048), (37, 32), (37, 7581))

#### Creating CNN-RNN Model

In [77]:
def final_model(vocab_size, max_length):
    # features from the CNN model squeezed from 2048 to 256 nodes
    input1 = Input(shape=(2048,))
    fl1= Dropout(0.5)(input1)
    fl2= Dense(256, activation='relu')(fl1)
    
    # LSTM Cell
    input2 = Input(shape=(max_length,))
    sl1= Embedding(vocab_size, 256, mask_zero=True)(input2)
    sl2= Dropout(0.5)(sl1)
    sl3= LSTM(256)(sl2)
    
    # Merge both models
    decoder1= add([fl2,sl3])
    decoder2 = Dense(256, activation='relu')(decoder1)
    outputs = Dense(vocab_size, activation='softmax')(decoder2)
    
    # tie it together [img, seq] ---> [word]
    model = Model(inputs=[input1, input2], outputs=outputs)
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    
    print(model.summary())
    
    return model

#### Model Training 

In [82]:
print('Dataset: ',len(train_img))
print('Training Description: ',len(train_descrs))
print('Description Length: ',max_len)
print('Train Images: ',len(train_feature))
print('Vocabulary: ',(vocab_size))

model= final_model(vocab_size, max_len)
epochs= 10
steps= len(train_descrs)
with tf.device('/GPU:0'):
    for i in tqdm(range(epochs)):
        generator= data_generator(train_descrs, train_feature, tokens, max_len)
        model.fit_generator(generator, epochs=1, steps_per_epoch=steps, verbose=1)

Dataset:  6000
Training Description:  6000
Description Length:  32
Train Images:  6000
Vocabulary:  7581
Model: "model_7"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_29 (InputLayer)          [(None, 32)]         0           []                               
                                                                                                  
 input_28 (InputLayer)          [(None, 2048)]       0           []                               
                                                                                                  
 embedding_11 (Embedding)       (None, 32, 256)      1940736     ['input_29[0][0]']               
                                                                                                  
 dropout_22 (Dropout)           (None, 2048)         0           ['input_28[0][0]']   

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for i in tqdm(range(epochs)):


  0%|          | 0/10 [00:00<?, ?it/s]

  model.fit_generator(generator, epochs=1, steps_per_epoch=steps, verbose=1)


   1/6000 [..............................] - ETA: 7:03:25 - loss: 8.9269

UnknownError: Graph execution error:

2 root error(s) found.
  (0) UNKNOWN:  IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices
Traceback (most recent call last):

  File "C:\Users\DEV\anaconda3\lib\site-packages\tensorflow\python\ops\script_ops.py", line 270, in __call__
    ret = func(*args)

  File "C:\Users\DEV\anaconda3\lib\site-packages\tensorflow\python\autograph\impl\api.py", line 642, in wrapper
    return func(*args, **kwargs)

  File "C:\Users\DEV\anaconda3\lib\site-packages\tensorflow\python\data\ops\dataset_ops.py", line 1030, in generator_py_func
    values = next(generator_state.get_iterator(iterator_id))

  File "C:\Users\DEV\anaconda3\lib\site-packages\keras\engine\data_adapter.py", line 831, in wrapped_generator
    for data in generator_fn():

  File "C:\Users\DEV\AppData\Local\Temp/ipykernel_13776/2864613916.py", line 7, in data_generator
    feature= feature[key][0]

IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices


	 [[{{node PyFunc}}]]
	 [[IteratorGetNext]]
	 [[IteratorGetNext/_2]]
  (1) UNKNOWN:  IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices
Traceback (most recent call last):

  File "C:\Users\DEV\anaconda3\lib\site-packages\tensorflow\python\ops\script_ops.py", line 270, in __call__
    ret = func(*args)

  File "C:\Users\DEV\anaconda3\lib\site-packages\tensorflow\python\autograph\impl\api.py", line 642, in wrapper
    return func(*args, **kwargs)

  File "C:\Users\DEV\anaconda3\lib\site-packages\tensorflow\python\data\ops\dataset_ops.py", line 1030, in generator_py_func
    values = next(generator_state.get_iterator(iterator_id))

  File "C:\Users\DEV\anaconda3\lib\site-packages\keras\engine\data_adapter.py", line 831, in wrapped_generator
    for data in generator_fn():

  File "C:\Users\DEV\AppData\Local\Temp/ipykernel_13776/2864613916.py", line 7, in data_generator
    feature= feature[key][0]

IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices


	 [[{{node PyFunc}}]]
	 [[IteratorGetNext]]
0 successful operations.
0 derived errors ignored. [Op:__inference_train_function_591415]

In [None]:
mode.save('model.h5')

#### Testing Code

In [2]:
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
import argparse
import numpy as np
import string
from PIL import Image
from tqdm import tqdm_notebook as tqdm
import os
from pickle import dump, load

# Tensorflow Imports
from tensorflow import keras
from keras.preprocessing.text import Tokenizer
from keras.applications.xception import Xception, preprocess_input
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Input, Dense, LSTM, Embedding, Dropout
from keras.layers.merging import add
from keras.models import Model, load_model



ap = argparse.ArgumentParser()
ap.add_argument('-i', '--image', required=True, help="Image Path")
args = vars(ap.parse_args())
img_path = args['image']

def extract_features(model, filename):
        try:
            image = Image.open(filename)

        except:
            print("ERROR: Couldn't open image! Make sure the image path and extension is correct")
        image = Image.open(filename)
        image= image
        image = image.resize((299,299))
        image = np.array(image)
        # for images that has 4 channels, we convert them into 3 channels
        if image.shape[2] == 4: 
            image = image[..., :3]
        image = np.expand_dims(image, axis=0)
        image = image/127.5
        image = image - 1.0
        feature = model.predict(image)
        return feature

def word_for_id(integer, tokenizer):
    for word, index in tokenizer.word_index.items():
         if index == integer:
                return word
    return None


def generate_desc(model, tokenizer, photo, max_length):
    in_text = 'start'
    for i in range(max_length):
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        sequence = pad_sequences([sequence], maxlen=max_length)
        pred = model.predict([photo,sequence], verbose=0)
        pred = np.argmax(pred)
        word = word_for_id(pred, tokenizer)
        if word is None:
            break
        in_text += ' ' + word
        if word == 'end':
            break
    return in_text


# img_path = 'Flickr8k_Dataset/Flicker8k_Dataset/1001773457_577c3a7d70.jpg'
max_length = 32
tokenizer = load(open("tokens.pkl","rb"))
model = load_model('model_9.h5')
xception_model = Xception(include_top=False, pooling="avg")

photo = extract_features(xception_model, img_path)
img = Image.open(img_path)

description = generate_desc(model, tokenizer, photo, max_length)[5:-3]
print("\n\n")
print(description)
plt.imshow(img)

 two dogs are playing in the grass 
