In [None]:
import tensorflow as tf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
from google.colab import drive
from tensorflow.keras.utils import plot_model
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import load_model
from tensorflow.keras import Sequential, Model
from tensorflow.keras.layers import Dense, Activation, RepeatVector, Embedding, LSTM, TimeDistributed, Concatenate
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.applications.vgg16 import preprocess_input as preprocess_input_vgg
from tensorflow.keras.applications.resnet50 import preprocess_input as preprocess_input_resnet
from copy import deepcopy
import pickle
import warnings
import os

#os.environ['KMP_DUPLICATE_LIB_OK']='True'
warnings.filterwarnings('ignore')

In [None]:
# Mount Google Drive at /gdrive (Colab only); the dataset paths in this notebook live under it.
drive.mount('/gdrive')

### 1. Leemos las imágenes y captions de Flickr8k

Leemos el archivo de captions para obtener el 'id' de la imagen y sus descripciones. Por cada imagen se cuenta con 5 descripciones

In [None]:
def read_file(filename):
    """Return the contents of a text file as a list of lines.

    The whole file is read at once and split on the newline character only,
    so a file that ends with a newline yields a trailing empty string.

    Parameters
    ----------
    filename : str
        Path of the file to read.

    Returns
    -------
    list of str
        The lines of the file, without their newline terminators.
    """
    with open(filename) as handle:
        raw_text = handle.read()
    return raw_text.split("\n")

In [None]:
# Image size used in this notebook, both as the ResNet50 `input_shape` and as the
# `target_size` given to `load_img`.
target_size = (224,224,3)
# Dataset locations on the mounted Drive; the trailing comments keep the local-path alternatives.
images_folder = '/gdrive/My Drive/dataset_imagecaption/Images/' #'flickr8k-dataset/images/'
file_captions = '/gdrive/My Drive/dataset_imagecaption/captions.txt' #'flickr8k-dataset/captions.txt'

# Raw lines of the captions file (header line included).
#content = read_file('flickr8k-dataset/captions.txt')
content = read_file(file_captions)

Mostramos cuántas imágenes tiene nuestro dataset

In [None]:
# Number of entries found in the images folder.
print("Total de imágenes: ", len(os.listdir(images_folder)))

Como el archivo captions.txt tiene una estructura de la siguiente manera

```
image,caption
"1000268201_693b08cb0e.jpg","A child in a pink dress is climbing up a set..."
"1000268201_693b08cb0e.jpg","A girl going into a wooden building..."
```

Necesitamos leer el archivo linea por linea para obtener cada imagen y sus 5 descripciones.

In [None]:
def _unquote_field(field):
    """Trim whitespace and one pair of surrounding double quotes from a CSV field."""
    field = field.strip()
    if len(field) >= 2 and field[0] == '"' and field[-1] == '"':
        field = field[1:-1]
    return field


def create_dataset_content(content):
    """Parse the lines of the captions file into headers, rows and a per-image map.

    Parameters
    ----------
    content : list of str
        Lines of the captions file. The first line is the comma-separated
        header; every following non-blank line is ``<image>,<caption>``,
        with or without double quotes around each field.

    Returns
    -------
    headers : list of str
        Column names taken from the first line.
    images_with_captions : list of [str, str]
        One ``[image_id, caption]`` pair per data line, in file order.
    images_map : dict of str -> list of str
        Captions grouped by image id, in file order.
    """
    headers = content[0].split(',')
    images_map = {}
    images_with_captions = []
    # Blank lines are skipped explicitly instead of assuming that exactly one
    # empty string trails the list (which silently dropped the last caption
    # whenever the file did not end with a newline).
    for line in content[1:]:
        if not line.strip():
            continue
        # Only the FIRST comma separates the two columns, so captions that
        # themselves contain commas (or the text ".jpg") are kept whole.
        # assumes image file names contain no comma — TODO confirm for other datasets
        raw_image, _, raw_caption = line.partition(',')
        image_id = _unquote_field(raw_image)
        caption = _unquote_field(raw_caption)
        images_with_captions.append([image_id, caption])
        # Group the captions by image
        images_map.setdefault(image_id, []).append(caption)
    return headers, images_with_captions, images_map

Almacenamos los headers y data para el DataFrame. Además obtenemos un mapa cuya `key` es el id de la imagen y el valor es una lista de sus captions

In [None]:
# Parse the raw caption lines into column names, [image, caption] rows and a per-image caption map.
headers, images_with_captions, images_map = create_dataset_content(content)

Almacenamos todas las imágenes y los captions respectivos.

In [None]:
# One row per (image, caption) pair; the column names come from the header line
# of the captions file. (The former identity `apply` on the 'image' column was a
# no-op and has been dropped.)
dataset = pd.DataFrame(data=images_with_captions, columns=headers)

In [None]:
# The rows are now held by `dataset`; drop the intermediate list.
del images_with_captions

In [None]:
# Number of caption rows in the DataFrame.
print("Total de captions:", dataset.shape[0])

### 2. Visualizamos algunas imágenes y sus captions

In [None]:
def plot_images_with_captions(images_ids, dataset, num_images=3, target_size=(224,224,3)):
    """Draw the first ``num_images`` images, each next to the text of its captions.

    The figure has one row per image and two columns: the picture on the
    left and its captions, one per line, on the right. Image files are
    looked up under the module-level ``images_folder``.

    Parameters
    ----------
    images_ids : list of str
        Image file names; only the first ``num_images`` are drawn.
    dataset : pandas.DataFrame
        Frame with an 'image' and a 'caption' column.
    num_images : int
        Number of rows in the figure.
    target_size : tuple
        Size passed to ``load_img``.
    """
    fig = plt.figure(figsize=(8,8))

    for row, image_id in enumerate(images_ids[:num_images]):
        # Subplot slots are 1-based: odd slots hold pictures, even slots hold text.
        picture_slot = 2 * row + 1
        text_slot = picture_slot + 1

        # Captions belonging to this image
        captions = dataset.loc[dataset['image'] == image_id, 'caption'].values

        # Load the picture as a float array
        pixels = img_to_array(load_img(f'{images_folder}{image_id}', target_size=target_size))

        picture_ax = fig.add_subplot(num_images, 2, picture_slot, xticks=[], yticks=[])
        picture_ax.imshow(pixels/255.)

        # Frameless axes used purely as a text canvas, one caption per unit of y
        text_ax = fig.add_subplot(num_images, 2, text_slot)
        text_ax.axis('off')
        text_ax.plot()
        text_ax.set_xlim(0,1)
        text_ax.set_ylim(0,len(captions))
        for line_no, caption in enumerate(captions):
            text_ax.text(0, line_no, caption)

    plt.show()

In [None]:
# Preview the first two images of the dataset together with their captions.
plot_images_with_captions(list(images_map.keys()), dataset, num_images=2)

In [None]:
def show_image(path, image_id, target_size=(224,224,3)):
    """Load the image ``path + image_id``, resize it and display it.

    Parameters
    ----------
    path : str
        Folder prefix; it is concatenated directly with ``image_id``.
    image_id : str
        Image file name.
    target_size : tuple
        Size passed to ``load_img``.
    """
    pixels = img_to_array(load_img(f'{path}{image_id}', target_size=target_size))
    # Scale to [0, 1] so imshow treats the float array as colour values
    plt.imshow(pixels / 255.)
    plt.show()

### 3. Creamos el modelo de CNN (Resnet50)

In [None]:
# Earlier VGG16-based extractor, kept for reference only (VGG16 is not imported in this notebook):
#model_vgg = VGG16(include_top=True)
#model_vgg16 = Model(inputs=model_vgg.inputs, outputs=model_vgg.layers[-2].output)
# Feature extractor: ResNet50 with ImageNet weights, without the top classifier and with
# average pooling applied to its output.
model_resnet = ResNet50(include_top=False, weights='imagenet', input_shape=target_size, pooling='avg')

In [None]:
# Print the layer-by-layer summary of the feature extractor.
model_resnet.summary()

### 4. Extraemos los features vectors de cada imagen

In [None]:
def _load_image_batch(path, image_id, target_size):
    """Load ``path + image_id``, resize it and return it as a batch of one image."""
    img = load_img(f'{path}{image_id}', target_size=target_size)
    return np.expand_dims(img_to_array(img), axis=0)


def preprocess_images_vgg(path, image_id, target_size=(224,224,3)):
    """Load one image and apply the VGG16 ``preprocess_input`` to it.

    Parameters
    ----------
    path : str
        Folder prefix; it is concatenated directly with ``image_id``.
    image_id : str
        Image file name.
    target_size : tuple
        Size passed to ``load_img``.

    Returns
    -------
    The preprocessed batch (a single image with a leading batch axis).
    """
    return preprocess_input_vgg(_load_image_batch(path, image_id, target_size))


def preprocess_images_resnet(path, image_id, target_size=(224,224,3)):
    """Load one image and apply the ResNet50 ``preprocess_input`` to it.

    Parameters
    ----------
    path : str
        Folder prefix; it is concatenated directly with ``image_id``.
    image_id : str
        Image file name.
    target_size : tuple
        Size passed to ``load_img``.

    Returns
    -------
    The preprocessed batch (a single image with a leading batch axis).
    """
    return preprocess_input_resnet(_load_image_batch(path, image_id, target_size))

def extract_features(path, images, model, target_size, preprocessing):
    """Run every image through ``model`` and collect one flat feature vector each.

    Parameters
    ----------
    path : str
        Folder prefix handed to ``preprocessing``.
    images : iterable of str
        Image file names.
    model : object
        Model exposing a Keras-style ``predict(batch, verbose=...)``.
    target_size : tuple
        Size handed to ``preprocessing``.
    preprocessing : callable
        ``preprocessing(path, image_id, target_size)`` returning a batch of
        one image ready for ``model``.

    Returns
    -------
    dict
        Maps each image id to its 1-D feature vector.
    """
    images_features_map = {}
    for image_id in tqdm(images):
        batch = preprocessing(path, image_id, target_size)
        # verbose=0: without it every call prints its own progress output
        # and floods the notebook underneath the tqdm bar.
        features = model.predict(batch, verbose=0)
        # Flatten instead of hard-coding 2048 so that other backbones work too.
        images_features_map[image_id] = features.reshape(-1)
    return images_features_map

In [None]:
# The extraction below only runs when `do` is set to True; it computes the ResNet50
# features of every image and pickles the resulting dict.
do = False
if do:
    features_map = extract_features(images_folder, list(images_map.keys()), model_resnet, target_size, preprocess_images_resnet)
    # NOTE(review): this writes to a relative "precomputed/" folder, while the features are
    # later loaded from a Drive path — confirm that the two locations are kept in sync.
    with open( "precomputed/encoded_images.p", "wb" ) as pickle_f:
        pickle.dump(features_map, pickle_f) 

### 5. Pre procesamiento de captions

In [None]:
import string

In [None]:
def remove_punctuation(text):
    """Delete every punctuation character from ``text``.

    The removed set is ``string.punctuation`` extended with the Spanish
    opening marks '¿' and '¡'.
    """
    # A dict whose values are None makes str.translate delete the keys.
    deletion_table = str.maketrans(dict.fromkeys(string.punctuation + '¿¡'))
    return text.translate(deletion_table)

def remove_short_words(text):
    """Drop every space-separated token shorter than two characters.

    For example 'a', 'c' and 'd' are removed; the empty tokens produced by
    consecutive spaces are removed as well.
    """
    long_enough = filter(lambda token: len(token) > 1, text.split(' '))
    return ' '.join(long_enough)

def remove_alpha_numeric(text):
    """Keep only the space-separated tokens made entirely of letters.

    Tokens containing digits or special characters (and empty tokens) are
    discarded.
    """
    letters_only = []
    for token in text.split(' '):
        if token.isalpha():
            letters_only.append(token)
    return ' '.join(letters_only)

def preprocess_captions(text):
    """Clean one caption.

    Applies, in order: punctuation removal, removal of one-character words,
    removal of words that are not purely alphabetic, and lower-casing.
    """
    for step in (remove_punctuation, remove_short_words, remove_alpha_numeric):
        text = step(text)
    return text.lower()

def add_start_end_token(text):
    """Wrap ``text`` between the <start> and <end> sequence tokens, separated by single spaces."""
    return ' '.join(('<start>', text, '<end>'))

def _indexed_vocabulary(vocabulary):
    """Return the token list whose positions are the token indices.

    Index 0 is reserved for '<unk>'. The remaining words are SORTED: the
    vocabulary is built as a set, and the iteration order of a set of
    strings can change from one interpreter session to the next, which
    would silently re-number every word and make previously saved model
    weights unusable with a rebuilt vocabulary.
    """
    return ['<unk>'] + sorted(vocabulary)


def create_word_to_index(vocabulary):
    """Map every word of ``vocabulary`` to a stable integer index ('<unk>' is 0)."""
    return {word: idx for idx, word in enumerate(_indexed_vocabulary(vocabulary))}


def create_index_to_word(vocabulary):
    """Inverse of ``create_word_to_index``: map every index back to its word."""
    return dict(enumerate(_indexed_vocabulary(vocabulary)))

def get_vocabulary(captions):
    """Collect the set of distinct whitespace-separated words found in ``captions``."""
    vocabulary = set()
    for sentence in captions:
        vocabulary.update(sentence.split())
    return vocabulary

def get_vocabulary_size(vocabulary):
    """Return the vocabulary size, counting one extra slot for the <unk> token."""
    reserved_slots = 1  # the <unk> token
    return reserved_slots + len(vocabulary)

def get_max_caption_size(captions):
    """Return the number of words of the longest caption.

    Parameters
    ----------
    captions : iterable of str
        Captions whose whitespace-separated words are counted.

    Returns
    -------
    int
        The largest word count, or 0 when ``captions`` is empty (the
        previous implementation raised ValueError in that case).
    """
    return max((len(sentence.split()) for sentence in captions), default=0)

def _pack_per_caption(arrays):
    """Pack one 2-D array per caption into a single NumPy array.

    Captions usually differ in length, so the arrays are ragged. Recent
    NumPy versions refuse to build an array from ragged input implicitly,
    hence a 1-D object array is filled by hand in that case. When every
    array has the same shape the regular dense array is returned, exactly
    as ``np.array`` did before.
    """
    if len({arr.shape for arr in arrays}) <= 1:
        return np.array(arrays)
    packed = np.empty(len(arrays), dtype=object)
    for position, arr in enumerate(arrays):
        packed[position] = arr
    return packed


def create_sequences(dataset, word_to_index, vocab_size, max_len=None):
    """Turn every caption into (padded prefix -> one-hot next word) training pairs.

    For a caption of n tokens this produces n-1 pairs: the first i tokens,
    right-padded to ``max_len``, and a one-hot vector for token i+1.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame with a 'caption_pre_start_end' column.
    word_to_index : dict
        Maps words to integer indices.
    vocab_size : int
        Width of the one-hot vectors.
    max_len : int, optional
        Padded length of the prefixes. When omitted, the module-level
        ``max_caption_size`` is used, as before.

    Returns
    -------
    tuple of numpy.ndarray
        ``(padded_sequences, next_words)``; element k of each holds the
        arrays of the k-th row of ``dataset``, of shapes
        ``(n_k - 1, max_len)`` and ``(n_k - 1, vocab_size)``.
    """
    if max_len is None:
        max_len = max_caption_size
    # Words missing from the vocabulary fall back to <unk> instead of raising KeyError.
    unk_index = word_to_index.get('<unk>', 0)

    all_padded_sequences, all_subsequence_words = [], []
    # Iterate over the column values rather than `.loc[0..n-1]`: a filtered
    # frame keeps its original index labels, which need not start at 0.
    for caption in dataset['caption_pre_start_end']:
        token_ids = [word_to_index.get(word, unk_index) for word in caption.split()]
        prefixes = [token_ids[:end] for end in range(1, len(token_ids))]
        next_words = np.asarray(token_ids[1:], dtype=np.intp)

        padded_partial_seq = sequence.pad_sequences(prefixes, maxlen=max_len, padding='post')

        # One-hot encoding of the word that follows each prefix
        y = np.zeros((len(next_words), vocab_size), dtype=np.int32)
        y[np.arange(len(next_words)), next_words] = 1

        all_padded_sequences.append(padded_partial_seq)
        all_subsequence_words.append(y)

    return (_pack_per_caption(all_padded_sequences), _pack_per_caption(all_subsequence_words))

### 6. Dividimos el dataset en Train, Test

In [None]:
# Add two derived columns: the cleaned caption, and the cleaned caption wrapped
# in the <start>/<end> tokens.
dataset['caption_pre'] = dataset['caption'].apply(preprocess_captions)
dataset['caption_pre_start_end'] = dataset['caption_pre'].apply(add_start_end_token)

In [None]:
train_size       = 6000
#test_size        = 1591

all_images = list(images_map.keys())
# Training data: the captions of the first `train_size` images.
# (A slice is already a new list and the ids are strings, so no deepcopy is needed.)
train_images_ids = all_images[:train_size]
train_dataset = dataset[dataset['image'].isin(train_images_ids)].copy()

# Test data: the captions of all remaining images.
# Fixed: the filter previously received `train_dataset` (a DataFrame) instead of
# the list of test image ids, so the test split never held the test captions.
test_images_ids  = all_images[train_size:]
test_dataset = dataset[dataset['image'].isin(test_images_ids)].copy()

In [None]:
# Almacenamos todas las oraciones en una lista
captions = train_dataset['caption_pre_start_end'].tolist()

# Obtenemos el vocabulario
vocabulary = get_vocabulary(captions)

# Vectorizamos a Word2Index and Index2Word
word_to_index = create_word_to_index(vocabulary)
index_to_word = create_index_to_word(vocabulary)

# Obtenemos el tamaño del vocabulario
vocab_size = get_vocabulary_size(vocabulary)

# Obtenemos la oración más larga para hacer un padding
max_caption_size = get_max_caption_size(captions)

# Creamos la sencuencia de los captions
padded_sequences, subsequence_words = create_sequences(train_dataset[:train_size], word_to_index, vocab_size)

In [None]:
# Verificamos tamaño de vocabulario y cantidad de imagenes
print("Maxima dimension de caption:", max_caption_size)
print("Total de imágenes:", len(train_images_ids))
print("Tamaño de vocabulario original:", vocab_size)
print("Total de secuencias:", len(padded_sequences))
print("Total de subsecuencias:", len(subsequence_words))

In [None]:
# Stack the per-caption arrays of the first `total_images` entries into two flat
# training matrices: padded prefixes (`captions`) and one-hot targets (`next_words`).
# NOTE(review): `total_images` indexes the entries of `padded_sequences`, which holds one
# entry per caption row — confirm the name matches the intent.
# NOTE: `captions` was a list of strings in an earlier cell and is rebound here.
do = True
total_images = 3000
if do:
    captions = np.vstack(padded_sequences[:total_images])
    next_words = np.vstack(subsequence_words[:total_images])
    #np.save("precomputed/captions_resnet.npy", captions)
    #np.save("precomputed/next_words_resnet.npy", next_words)
    
    print(captions.shape)
    print(next_words.shape)

In [None]:
# Load the precomputed image features (the dict written by the extraction cell).
# SECURITY: pickle.load can execute arbitrary code; only load files you created yourself.
#with open('precomputed/encoded_images.p', 'rb') as f:
with open('/gdrive/My Drive/dataset_imagecaption/Jorge_Rodriguez/resnet50/precomputed/encoded_images.p', 'rb') as f:
    all_features_map = pickle.load(f)

In [None]:
if do:
    # One feature vector per ROW of train_dataset. Every image has several
    # captions, so its vector is repeated once per caption (6,000 training
    # images with 5 captions each give 30,000 rows).
    train_features_map = np.asarray(
        [all_features_map[image_id] for image_id in train_dataset['image']])
    print(train_features_map.shape)

    # Every caption expands into one training sample per predicted word, so the
    # image data must be repeated that many times to stay aligned, row by row,
    # with the stacked `captions` / `next_words` matrices.
    samples_per_caption = np.asarray(
        [len(padded_sequences[ix]) for ix in range(total_images)], dtype=np.intp)

    # Image features, one row per training sample
    train_images = np.repeat(train_features_map[:total_images], samples_per_caption, axis=0)

    #np.save("precomputed/images_train_resnet.npy", train_images)

    print(train_images.shape)

    # Image file names, one per training sample. The rows are taken by POSITION:
    # train_dataset is a filtered frame, so label-based `.loc[ix]` only matched
    # as long as its index happened to start at 0 and be contiguous.
    caption_image_ids = np.asarray(train_dataset['image'].to_numpy()[:total_images], dtype=str)
    train_images_names = np.repeat(caption_image_ids, samples_per_caption)

    #np.save('precomputed/image_names_train_resnet.npy', train_images_names)

    print(train_images_names.shape)

In [None]:
# Release the per-row feature matrix; `train_images` holds what training needs.
# NOTE: `train_features_map` only exists when the previous cell ran with `do` set to True.
del train_features_map

### 7. Cargamos todos los archivos generados

### 8. Creamos el modelo (Encoder - Decoder)

In [None]:
# Width shared by the image projection (encoder) and the word embedding (decoder).
embedding_size = 128

In [None]:
# Image branch: project the 2048-d feature vector to `embedding_size` with a ReLU,
# then repeat it `max_caption_size` times so it lines up with the caption time steps.
encoder = Sequential([
    Dense(embedding_size, input_shape=(2048,)),
    Activation('relu'),
    RepeatVector(max_caption_size)])

encoder.summary()

In [None]:
# Text branch: embed the padded word indices, run an LSTM that returns every time
# step, and project each step to `embedding_size`.
# NOTE(review): `input_length` is reported as deprecated in newer Keras releases — confirm
# against the installed version.
decoder = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_size, input_length=max_caption_size),
    LSTM(256, return_sequences=True),
    TimeDistributed(Dense(embedding_size))
])

decoder.summary()

In [None]:
# Save a diagram of the image branch, including layer shapes, to encoder.png.
plot_model(encoder, to_file='encoder.png', show_shapes=True)

In [None]:
# Save a diagram of the text branch, including layer shapes, to decoder.png.
plot_model(decoder, to_file='decoder.png', show_shapes=True)

In [None]:
# Unimos el encoder y el decoder
encoder_decoder = Concatenate()([encoder.output, decoder.output])
x = LSTM(128, return_sequences=True)(encoder_decoder)
x = LSTM(512, return_sequences=False)(x)
x = Dense(vocab_size)(x)
out = Activation('softmax')(x)

model = Model(inputs=[encoder.input, decoder.input], outputs=out)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

In [None]:
# Save a diagram of the combined model, including layer shapes.
plot_model(model, to_file='decoder_decoder_model.png', show_shapes=True)

In [None]:
# Entrenamiento del model
batch_size = 32
epochs = 20
history = model.fit([train_images, captions], next_words, batch_size=batch_size, epochs=epochs)

In [None]:
# Persist the trained weights only (not the architecture) in the working directory.
model.save_weights('model_weights_resnet50.h5')

In [None]:
def get_feature_vector(resnet, path, image, target_size=(224,224,3)):
    """Compute the 2048-element feature vector of a single image.

    Parameters
    ----------
    resnet : model
        Feature extractor exposing ``predict``.
    path : str
        Folder prefix; it is concatenated directly with ``image``.
    image : str
        Image file name.
    target_size : tuple
        Size used when loading the image.

    Returns
    -------
    The prediction for the image reshaped to a flat vector of length 2048.
    """
    batch = preprocess_images_resnet(path, image, target_size=target_size)
    features = resnet.predict(batch)
    return features.reshape(2048)

def predict_image_caption(model, feature_vector, max_caption_size):
    """Greedily generate a caption for one image.

    Starting from '<start>', the model is asked repeatedly for the most
    probable next word given the image features and the words produced so
    far. Uses the module-level ``word_to_index`` and ``index_to_word``.

    Parameters
    ----------
    model : model
        Captioning model taking ``[image_features, padded_word_indices]``.
    feature_vector : array-like
        Feature vector of the image.
    max_caption_size : int
        Padded input length of the model, and upper bound on the number of
        tokens in the generated caption.

    Returns
    -------
    str
        The generated tokens joined by spaces, beginning with '<start>'.
    """
    words = ['<start>']
    word_pred = ''
    image_batch = np.array([feature_vector])

    # Stop as soon as '<end>' is produced OR the caption is full. The previous
    # condition (`!= '<end>' or len(words) > max_caption_size`) never
    # terminated when the model failed to emit '<end>'.
    while word_pred != '<end>' and len(words) < max_caption_size:
        w2i = [word_to_index[word] for word in words]
        w2i = sequence.pad_sequences([w2i], maxlen=max_caption_size, padding='post')
        preds = model.predict([image_batch, np.array(w2i)])
        word_pred = index_to_word[np.argmax(preds[0])]
        words.append(word_pred)
    return ' '.join(words)

In [None]:
# Extract the features of a new image located at /content/giraffe.jpeg.
feature_vector = get_feature_vector(model_resnet,'/content/', 'giraffe.jpeg')

In [None]:
# Generate a caption for the new image.
predict_image_caption(model, feature_vector, max_caption_size)

In [None]:
# Entrenamiento del modelo
# Crear el Decoder (LSTM) (Listo)
# Crear generar texto (nueva imagen)
# Evaluación (BLEU)
# * Modelo de Attention