In [None]:
# Mount Google Drive at /content/drive; later cells read archives and the
# project code from paths under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Extract only the listed entries of data_path.zip (data/jsons, data/pic,
# data/ques_embeddings/bioelmo and the two answer_word_frequency files)
# straight from Drive into /content, without copying the archive first.
# !cp /content/drive/MyDrive/files/data_path.zip /content
!unzip /content/drive/MyDrive/files/data_path.zip "data/jsons/*" "data/pic/*" "data/ques_embeddings/bioelmo/*" "data/answer_word_frequency.csv" "data/answer_word_frequency.xlsx" -d /content

In [None]:
# %%bash
# cd /content/drive/MyDrive/data
# zip -r knowledge_embs.zip knowledge_embeddings/

In [None]:
# Extract kg_embs.zip from Drive into /content/data.
# NOTE(review): the prediction cell reads /content/data/knowledge_embeddings/<id>.npy;
# presumably this archive provides that folder -- confirm its internal layout.
# !cp /content/drive/MyDrive/data/knowledge_embs.zip /content/data
# !unzip /content/data/knowledge_embs.zip -d /content/data
!unzip /content/drive/MyDrive/data/kg_embs.zip -d /content/data

In [None]:
# !ls /content/data/knowledge_embeddings | wc -l

32761


In [None]:
# %cd /content
# !unzip data_path.zip "data/ques_embeddings/bioelmo/*" -d /content

/content


In [None]:
# Work from the project directory: the `utils` / `models` imports and the
# relative ./check_point and ./QA_results paths below are resolved from here.
%cd /content/drive/MyDrive/VQACode

/content/drive/MyDrive/VQACode


In [None]:
# %cd /content/drive/MyDrive/VQACode-/VQACode

import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import tensorflow as tf
import pandas as pd
import time
import pathlib
from utils.load_data_overall import DataLoader
from utils.evaluation import AnswerEvaluator
from utils.training_toolkit import CustomSchedule, loss_function
from models.Transformer.transformers import VQATransformer
from models.Transformer.masks import create_masks

### Set up Arguments

In [None]:
# --- Transformer architecture (forwarded to VQATransformer / CustomSchedule) ---
num_layers = 2
d_model = 512
num_heads = 8
dff = 2048
maximum_position_encoding = 10000

# --- Training run ---
EPOCHS = 50
batch_size = 64

# --- Input encoders ---
# cnn_type: the configuration cell handles 'inception', 'resnet', 'resnet_v2',
# 'dense_net' and 'vgg19'.
cnn_type = 'resnet'
# embedding: choose from ['w2v', 'bioelmo', 'biobert', 'bluebert', 'large_biobert', 'elmo']
# (the configuration cell additionally accepts 'bert').
embedding = 'bioelmo'

# When True, random image augmentation is applied inside train_step.
data_augmentation = True


In [None]:
####### DO NOT CHANGE VALUES OF THIS BLOCK IF YOU ARE NOT THE DEVELOPER ##########

# Output locations are derived from the embedding / CNN / depth chosen above so
# that different configurations do not overwrite each other's artefacts.
check_point_path = './check_point/transformer/QA/' + embedding +'/' + cnn_type + '_' + str(num_layers)
saving_folder = './QA_results/transformer/' + embedding + '/'
save_result_path = saving_folder + cnn_type + '_' + str(num_layers) + '.csv'

# Default question-embedding width; some embeddings override it below.
emb_size = 1024
# Answer sequences are padded to pe_output - 1 tokens in the dataset cell;
# MAX_LENGTH bounds the number of greedy decoding steps in evaluate().
pe_output = 36 + 1
MAX_LENGTH = pe_output

# Image geometry per CNN backbone. An unsupported cnn_type used to fall through
# both `if` statements and leave img_shape / img_padding undefined, which only
# surfaced later as a NameError in the dataset cell; fail here instead.
if cnn_type == 'inception':
    img_shape = [299, 299]
    img_padding = tf.TensorShape([299, 299, 3])
elif cnn_type in ['resnet', 'resnet_v2', 'dense_net', 'vgg19']:
    img_shape = None
    img_padding = tf.TensorShape([224, 224, 3])
else:
    raise ValueError("Wrong cnn type: " + repr(cnn_type))

# Padded question length (pe_input) and, where it differs from the default,
# the embedding width for each supported question embedding.
if embedding == 'bioelmo':
    pe_input = 38
elif embedding == 'elmo':
    pe_input = 42
elif embedding == 'biobert':
    pe_input = 72
    emb_size = 768
elif embedding == 'bluebert':
    pe_input = 69
elif embedding == 'large_biobert':
    pe_input = 60
elif embedding == 'w2v':
    pe_input = 48
    emb_size = 200
elif embedding == 'bert':
    pe_input = 72
    emb_size = 1024
else:
    # Exception type kept as-is for backward compatibility.
    raise TypeError("Wrong embedding type")

# Image augmentation pipeline used by train_step; only built when enabled, so
# `aug` must not be referenced when data_augmentation is False.
if data_augmentation:
    aug = tf.keras.Sequential([tf.keras.layers.experimental.preprocessing.RandomFlip(),
                               tf.keras.layers.experimental.preprocessing.RandomRotation(0.05)])

#### 

### Create Datasets

In [None]:
# Build the full QA dataset once, split it into train / validation / test by
# position, then pad and batch each split.
kn_input = 22  # padded length of the knowledge-embedding sequence

data_loader = DataLoader('/content/data', emb_folder=embedding)
full_dataset, tokenizer = data_loader.create_dataset('QA')
vocab_size = len(tokenizer.index_word) + 1

# 52% / 30% / 18% split; int() truncates, so a few trailing examples may be unused.
Data_SET_SIZE = len(full_dataset)
train_size, val_size, test_size = (
    int(fraction * Data_SET_SIZE) for fraction in (0.52, 0.30, 0.18)
)

# NOTE(review): take/skip splitting assumes full_dataset iterates in a stable
# order on every pass -- confirm DataLoader does not reshuffle per iteration,
# otherwise examples could move between splits.
train_set = full_dataset.take(train_size)
val_test_ds = full_dataset.skip(train_size)
val_set = val_test_ds.take(val_size)
test_ds = val_test_ds.skip(val_size)
test_set = test_ds.take(test_size)

# Element layout: ((image, question embedding, knowledge embedding), target ids, question id).
padded_shapes = (
    (
        img_padding,
        tf.TensorShape([pe_input, emb_size]),
        tf.TensorShape([kn_input, 1024]),
    ),
    tf.TensorShape([pe_output - 1]),
    [],
)

# Training uses full mini-batches; validation and test are decoded one example
# at a time because evaluate() handles a single example per call.
batch_train_set = train_set.padded_batch(batch_size, padded_shapes=padded_shapes, drop_remainder=True)
batch_val_set = val_set.padded_batch(1, padded_shapes=padded_shapes, drop_remainder=True)
batch_test_set = test_set.padded_batch(1, padded_shapes=padded_shapes, drop_remainder=True)

QA: 32761
yes_no: 16332
open_ended 16429
Load: QA


In [None]:
# for i in enumerate(batch_train_set):
#   (batch, (img_question, tar, q_id)) = i
#   print(batch,img_question[0].shape,img_question[1].shape,img_question[2].shape,tar.shape,q_id.shape)
#   break

0 (64, 224, 224, 3) (64, 38, 1024) (64, 155, 1024) (64, 36) (64,)


In [None]:
# maxl = 0
# for (img_question, tar, q_id) in full_dataset.as_numpy_iterator():
#   # print(img_question[0].shape,img_question[1].shape,tar.shape,q_id)
#   # print(img_question[2].shape)
#   l = img_question[2].shape[0]
#   if l > maxl:
#     maxl = l
# print(maxl)

In [None]:
# import numpy as np
# # validate saved features and calculate max length of all questions
# ques_id = 48
# emb = np.load('/content/data/ques_embeddings/bioelmo/'+str(ques_id)+'.npy')
# # length = emb.shape[0]
# print(ques_id,'shape is', emb.shape)        


### 

### Define Models and Related Functions 

In [None]:
# Model: project-defined VQA transformer with the CNN backbone chosen by cnn_type.
transformer = VQATransformer(
    num_layers, d_model, num_heads, dff, vocab_size, pe_input, pe_output,
    pretrained_cnn_type=cnn_type,
)

# Optimizer: Adam driven by the project's CustomSchedule learning-rate schedule.
learning_rate = CustomSchedule(d_model)
optimizer = tf.keras.optimizers.Adam(
    learning_rate,
    beta_1=0.9,
    beta_2=0.98,
    epsilon=1e-9,
)

# Running metrics, updated by train_step and reset at the start of each epoch.
train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='train_accuracy')

# Checkpoint both model and optimizer state; keep only the five newest checkpoints.
ckpt = tf.train.Checkpoint(transformer=transformer, optimizer=optimizer)
ckpt_manager = tf.train.CheckpointManager(ckpt, check_point_path, max_to_keep=5)

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5


In [None]:
@tf.function()
def train_step(img, question, kn, tar):
    """Run one optimisation step on a batch and update the running metrics.

    Args:
        img: batch of images (optionally augmented here).
        question: batch of question embeddings; also used to build the masks.
        kn: batch of knowledge embeddings, forwarded to the transformer.
        tar: batch of target token ids, including the leading start token.

    Side effects:
        Applies gradients to ``transformer`` through ``optimizer`` and updates
        the module-level ``train_loss`` / ``train_accuracy`` metrics.
    """
    if data_augmentation:
        # Keras random preprocessing layers are documented to act as identity
        # outside training mode. In a custom training loop nothing sets that
        # mode implicitly, so request it explicitly; otherwise the augmentation
        # can be silently skipped.
        img = aug(img, training=True)

    # Teacher forcing: the decoder sees the target without its last token and
    # is trained to predict the same sequence shifted left by one.
    tar_inp = tar[:, :-1]
    tar_real = tar[:, 1:]

    enc_padding_mask, combined_mask, dec_padding_mask = create_masks(question, tar_inp)
    with tf.GradientTape() as tape:
        predictions, _ = transformer(question, img, kn, tar_inp,
                                     True,
                                     enc_padding_mask,
                                     combined_mask,
                                     dec_padding_mask)
        loss = loss_function(tar_real, predictions)

    gradients = tape.gradient(loss, transformer.trainable_variables)
    optimizer.apply_gradients(zip(gradients, transformer.trainable_variables))

    train_loss(loss)
    train_accuracy(tar_real, predictions)

In [None]:
def evaluate(question, img, kn):
    """Greedily decode an answer for a single example.

    The inputs must carry a leading batch dimension of size 1: the end-of-answer
    test compares a single predicted id, and the result is squeezed on axis 0.

    Args:
        question: question embedding tensor with batch dimension 1.
        img: image tensor with batch dimension 1.
        kn: knowledge embedding tensor with batch dimension 1.

    Returns:
        A tuple ``(token_ids, attention_weights)``. ``token_ids`` is the 1-D
        tensor of generated ids starting with the ``<start>`` id; the ``<end>``
        id is never appended. ``attention_weights`` is whatever the last
        transformer call returned.
    """
    end_token = tf.constant(tokenizer.texts_to_sequences(['<end>']), tf.int32)
    # Decoding starts from the <start> token, shaped (1, 1).
    output = tf.expand_dims([tokenizer.word_index['<start>']], 0)

    for _ in range(MAX_LENGTH):
        enc_padding_mask, combined_mask, dec_padding_mask = create_masks(
            question, output)
        predictions, attention_weights = transformer(question,
                                    img,
                                    kn,
                                    output,
                                    False,
                                    enc_padding_mask,
                                    combined_mask,
                                    dec_padding_mask)

        # Only the distribution for the most recent position matters.
        predictions = predictions[:, -1:, :]
        predicted_id = tf.cast(tf.argmax(predictions, axis=-1), tf.int32)
        # Stop as soon as the model emits <end>, without appending it.
        if predicted_id == end_token:
            return tf.squeeze(output, axis=0), attention_weights
        output = tf.concat([output, predicted_id], axis=-1)

    # Step budget exhausted without seeing <end>.
    return tf.squeeze(output, axis=0), attention_weights

In [None]:
################ADD#################################################################################
def get_score(batch_data_set, csv_saving_path):
    """Decode every example of a dataset, save the answers to CSV and score them.

    Args:
        batch_data_set: dataset batched with batch size 1, yielding
            ``((image, question, knowledge), target, ques_id)``.
        csv_saving_path: destination CSV path; its parent directory is created
            when missing.

    Returns:
        Whatever ``AnswerEvaluator(csv_saving_path).evaluate()`` returns.
    """
    true_answers_list = []
    predicted_answers_list = []
    ques_id_list = []
    for img_question, target, ques_id in batch_data_set:
        # Batch size is 1, so row 0 is the only target sequence.
        target_ids = target.numpy()[0]
        true_tokens = []
        for token_id in target_ids:
            if token_id == 0:  # id 0 is padding: the answer has ended
                break
            true_tokens.append(tokenizer.index_word[token_id])
        # Drop the first and last tokens (start / end markers) of the reference.
        true_answers_list.append(" ".join(true_tokens[1: -1]))

        prediction, _ = evaluate(img_question[1], img_question[0], img_question[2])
        predicted_tokens = [tokenizer.index_word[i] for i in prediction.numpy()]
        # evaluate() never appends <end>, so only the leading <start> is dropped.
        predicted_answers_list.append(" ".join(predicted_tokens[1:]))

        # NOTE(review): ques_id is stored as yielded by the dataset (a tensor);
        # confirm the CSV representation is what AnswerEvaluator expects.
        ques_id_list.append(ques_id)

    data = {"true answer": true_answers_list, "predicted answer": predicted_answers_list, "ques_id": ques_id_list}
    df = pd.DataFrame(data)
    # Create the directory the CSV is actually written to. Previously the global
    # saving_folder was created instead, so any csv_saving_path outside that
    # folder failed on write.
    pathlib.Path(csv_saving_path).parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(csv_saving_path)
    return AnswerEvaluator(csv_saving_path).evaluate()
##################ADD#################################################################################

### 

### Train the Model 

In [None]:
# Resume from the newest checkpoint under check_point_path if one exists;
# otherwise training starts from the freshly initialised model. The cell
# output shows the restored checkpoint path (nothing when none was found).
ckpt_manager.restore_or_initialize()

'./check_point/transformer/QA/bioelmo/resnet_8/ckpt-1'

In [None]:
# Main training loop: one full pass over batch_train_set per epoch, followed by
# a one-line summary of the epoch's mean loss and accuracy.
for epoch in range(EPOCHS):
    start = time.time()

    # The metrics accumulate across calls, so clear them for each epoch.
    train_loss.reset_states()
    train_accuracy.reset_states()

    for batch, ((img, question, kn), tar, _) in enumerate(batch_train_set):
        train_step(img, question, kn, tar)
        # Optional per-batch progress:
        # if batch % 50 == 0:
        #     print ('Epoch {} Batch {} Loss {:.4f} Accuracy {:.4f}'.format(
        #         epoch + 1, batch, train_loss.result(), train_accuracy.result()))

    epoch_summary = 'Epoch {} Loss {:.4f} Accuracy {:.4f}'.format(
        epoch + 1, train_loss.result(), train_accuracy.result())
    print(epoch_summary)

    # Optional periodic validation (disabled):
    # if (epoch+1) % 10 == 0:
    #   csv_saving_path = saving_folder + 'val' + str(epoch) + '.csv'
    #   score = get_score(batch_val_set, csv_saving_path)
    #   model_accuracy = score['Accuracy']
    #   # if model_accuracy > accuracy:
    #   print('Validation Accuracy',model_accuracy)
    #   # ckpt_save_path = ckpt_manager.save()
    #   # accuracy = model_accuracy

    # Optional periodic checkpointing (disabled; a checkpoint is saved manually
    # in the cell after this loop):
    # if (epoch + 1) % 2 == 0:
    #     ckpt_save_path = ckpt_manager.save()
    #     print ('Saving checkpoint for epoch {} at {}'.format(epoch+1,
    #                                                         ckpt_save_path))

    # print ('Time taken for 1 epoch: {} secs\n'.format(time.time() - start))

Epoch 1 Loss 0.0018 Accuracy 0.0799
Epoch 2 Loss 0.0018 Accuracy 0.0799
Epoch 3 Loss 0.0017 Accuracy 0.0799
Epoch 4 Loss 0.0017 Accuracy 0.0799
Epoch 5 Loss 0.0015 Accuracy 0.0799
Epoch 6 Loss 0.0018 Accuracy 0.0799
Epoch 7 Loss 0.0016 Accuracy 0.0799
Epoch 8 Loss 0.0017 Accuracy 0.0799
Epoch 9 Loss 0.0018 Accuracy 0.0799
Epoch 10 Loss 0.0015 Accuracy 0.0799
Epoch 11 Loss 0.0016 Accuracy 0.0799
Epoch 12 Loss 0.0016 Accuracy 0.0799
Epoch 13 Loss 0.0015 Accuracy 0.0800
Epoch 14 Loss 0.0015 Accuracy 0.0799
Epoch 15 Loss 0.0014 Accuracy 0.0800
Epoch 16 Loss 0.0015 Accuracy 0.0799
Epoch 17 Loss 0.0015 Accuracy 0.0800
Epoch 18 Loss 0.0015 Accuracy 0.0799
Epoch 19 Loss 0.0015 Accuracy 0.0800
Epoch 20 Loss 0.0015 Accuracy 0.0800
Epoch 21 Loss 0.0012 Accuracy 0.0800
Epoch 22 Loss 0.0016 Accuracy 0.0799
Epoch 23 Loss 0.0013 Accuracy 0.0800
Epoch 24 Loss 0.0014 Accuracy 0.0800
Epoch 25 Loss 0.0013 Accuracy 0.0800
Epoch 26 Loss 0.0014 Accuracy 0.0800
Epoch 27 Loss 0.0014 Accuracy 0.0800
Epoch 28 L

In [None]:
# Persist the current model and optimizer state; the training loop above does
# not save checkpoints itself (its periodic save is commented out).
ckpt_save_path = ckpt_manager.save()

In [None]:
# Display where the checkpoint was written.
ckpt_save_path

'./check_point/transformer/QA/bioelmo/resnet_8/ckpt-1'

### 

### Predicting and Evaluating 

In [None]:
# Decode every test example and collect reference / predicted answers together
# with their question ids; the next cell writes these lists to CSV.
true_answers_list, predicted_answers_list, ques_id_list = [], [], []
print('Start predicting...')
for batch, (img_question, target, ques_id) in enumerate(batch_test_set):
    # The test set is batched with size 1, so row 0 is the only target sequence.
    target = target.numpy()[0]

    # Rebuild the reference answer: read tokens up to the first padding id (0),
    # then drop the first and last tokens (start / end markers).
    true_answer = []
    for token_id in target:
        if token_id == 0:
            break
        true_answer.append(tokenizer.index_word[token_id])
    true_answer = " ".join(true_answer[1: -1])

    # Greedy decoding; the prediction starts with <start> and has no <end>.
    prediction, attention = evaluate(img_question[1], img_question[0], img_question[2])
    predicted_tokens = [tokenizer.index_word[token_id] for token_id in prediction.numpy()]
    predict_answer = " ".join(predicted_tokens[1:])

    true_answers_list.append(true_answer)
    predicted_answers_list.append(predict_answer)
    ques_id_list.append(ques_id)

    # Single-line progress indicator, overwritten in place via '\r'.
    print("predicted answer: " + str(batch), end='\r', flush=True)

Start predicting...


In [None]:
# Gather the collected answers into a table and write it to save_result_path.
df = pd.DataFrame({
    "true answer": true_answers_list,
    "predicted answer": predicted_answers_list,
    "ques_id": ques_id_list,
})

# Make sure the results folder exists before writing.
results_dir = pathlib.Path(saving_folder)
if not results_dir.exists():
    results_dir.mkdir(parents=True, exist_ok=True)

# `name` is reused by the scoring cell below.
name = save_result_path
df.to_csv(name)
print("complete writing", name)

complete writing ./QA_results/transformer/bioelmo/resnet_2.csv


In [None]:
# Score the CSV written by the previous cell (`name` is its path) with the
# project's AnswerEvaluator; the metrics shown below are printed by that call.
scores = AnswerEvaluator(name).evaluate()

{'testlen': 10743, 'reflen': 10702, 'guess': [10743, 4847, 3410, 2566], 'correct': [5339, 841, 373, 215]}
ratio: 1.0038310596149314
Accuracy: 68.75
Exact Match: 49.63
F1 Score: 68.39
BLEU-1: 0.5
BLEU-2: 0.29
BLEU-3: 0.21
BLEU-4: 0.17


# Predict

In [None]:
import numpy as np

# Target (height, width) used when resizing images for single-example prediction.
image_size = [224, 224]


def load_and_preprocess_image(path):
    """Read a JPEG file and return it as a 3-channel tensor resized to ``image_size``."""
    raw_bytes = tf.io.read_file(path)
    decoded = tf.image.decode_jpeg(raw_bytes, channels=3)
    return tf.image.resize(decoded, image_size)


def load_question_features(path):
    """Load a saved ``.npy`` feature array from ``path`` and return it unchanged."""
    features = np.load(path)
    return features

In [None]:
# Example to run through the model in the next cell:
image_id = 'Fig.491'     # image file stem under /content/data/pic (".jpg" is appended)
ques_embd = 'bioelmo/'   # sub-folder of /content/data/ques_embeddings (keep the trailing slash)
question_id = '1431'     # file stem of both the question and knowledge ".npy" embeddings

In [None]:
# Image: load, resize, and add the batch dimension expected by evaluate().
img_input = tf.convert_to_tensor(load_and_preprocess_image('/content/data/pic/' + str(image_id) + '.jpg'))
img_input = tf.expand_dims(img_input, axis=0)

# Question embedding: zero-pad along the sequence axis up to pe_input, then batch.
ques_input = tf.convert_to_tensor(
    load_question_features('/content/data/ques_embeddings/' + str(ques_embd) + str(question_id) + '.npy'))
ques_padding = [[0, pe_input - ques_input.shape[0]], [0, 0]]
ques_input = tf.expand_dims(tf.pad(ques_input, ques_padding, "CONSTANT"), axis=0)

# Knowledge embedding: same treatment, padded up to kn_input.
kg_input = tf.convert_to_tensor(
    load_question_features('/content/data/knowledge_embeddings/' + str(question_id) + '.npy'))
kg_padding = [[0, kn_input - kg_input.shape[0]], [0, 0]]
kg_input = tf.expand_dims(tf.pad(kg_input, kg_padding, "CONSTANT"), axis=0)

# Answer: decode greedily, map ids back to words, and drop the leading <start>.
prediction, attention = evaluate(ques_input, img_input, kg_input)

p = prediction.numpy()
answer_words = [tokenizer.index_word[token_id] for token_id in p]
predict_answer = " ".join(answer_words[1:])
print(predict_answer)

no
