<a href="https://colab.research.google.com/github/David9857/VQA/blob/main/train/KVQA1_yes_no.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!cp /content/drive/MyDrive/files/data_path.zip /content
!unzip data_path.zip "data/jsons/*" "data/pic/*" "data/ques_embeddings/biobert/*" "data/answer_word_frequency.csv" "data/answer_word_frequency.xlsx" -d /content

In [None]:
!cp /content/drive/MyDrive/data/knowledge_embs.zip /content/data
!unzip /content/data/knowledge_embs.zip -d /content/data

In [1]:
%cd /content/drive/MyDrive/VQACode

/content/drive/MyDrive/VQACode


In [2]:
import tensorflow as tf
import pandas as pd
import time
import pathlib
from utils.load_data import DataLoader
from utils.evaluation import AnswerEvaluator
from utils.training_toolkit import CustomSchedule, loss_function
from models.Transformer.transformers import VQATransformer
from models.Transformer.masks import create_masks

### Set up arguments

In [3]:
num_layers=2
d_model=512
num_heads=8
dff=2048
maximum_position_encoding=10000
EPOCHS = 30
batch_size = 64
cnn_type = 'resnet'
embedding = 'biobert'  # choose from ['w2v', 'bioelmo', 'biobert', 'bluebert', 'large_biobert', 'elmo', 'bert']
data_augmentation = True

In [4]:
####### DO NOT CHANGE VALUES OF THIS BLOCK IF YOU ARE NOT THE DEVELOPER ##########

check_point_path = './check_point/transformer/yesno/' + embedding +'/' + cnn_type + '_' + str(num_layers)
saving_folder = './yes_no_results/transformer/' + embedding + '/'
save_result_path = saving_folder + cnn_type + '_' + str(num_layers) + '.csv'

emb_size = 1024
pe_output = 3
MAX_LENGTH = pe_output
if cnn_type == 'inception':
    img_shape = [299, 299]
    img_padding = tf.TensorShape([299, 299, 3])
if cnn_type in ['resnet', 'resnet_v2', 'dense_net', 'vgg19']:
    img_shape = None
    img_padding = tf.TensorShape([224, 224, 3])

if embedding == 'bioelmo':
    pe_input = 38
elif embedding == 'elmo':
    pe_input = 42
elif embedding == 'biobert':
    pe_input = 72
    emb_size = 768
elif embedding == 'bluebert':
    pe_input = 69
elif embedding == 'large_biobert':
    pe_input = 60  
elif embedding == 'w2v':
    pe_input = 48
    emb_size = 200
elif embedding == 'bert':
    pe_input = 72
    emb_size = 1024
else:
    raise TypeError("Wrong embedding type")
    
if data_augmentation:
    aug = tf.keras.Sequential([tf.keras.layers.experimental.preprocessing.RandomFlip(),
                               tf.keras.layers.experimental.preprocessing.RandomRotation(0.05)])

### 

### Load Data

In [5]:
# create train, val, test dataset
kn_input = 194

data_loader = DataLoader('/content/data', emb_folder=embedding)
full_dataset, tokenizer = data_loader.create_dataset('yes_no')
vocab_size=len(tokenizer.index_word) + 1
Data_SET_SIZE = len(full_dataset)
train_size = int(0.52 * Data_SET_SIZE)
val_size = int(0.30 * Data_SET_SIZE)
test_size = int(0.18 * Data_SET_SIZE)
train_set = full_dataset.take(train_size)
val_test_ds = full_dataset.skip(train_size)
val_set = val_test_ds.take(val_size)
test_ds = val_test_ds.skip(val_size)
test_set = test_ds.take(test_size)

batch_train_set = train_set.padded_batch(batch_size, padded_shapes=((img_padding, tf.TensorShape([pe_input, emb_size]), tf.TensorShape([kn_input, 1024])),
                                                                    tf.TensorShape([pe_output]), []), drop_remainder=True)
batch_test_set = test_set.padded_batch(1, padded_shapes=((img_padding, tf.TensorShape([pe_input, emb_size]), tf.TensorShape([kn_input, 1024])),
                                                         tf.TensorShape([pe_output]), []), drop_remainder=True)

11111


In [None]:
# maxl = 0
# maxe = 0
# for (img_question, tar, q_id) in full_dataset.as_numpy_iterator():
#   # print(img_question[0].shape,img_question[1].shape,tar.shape,q_id)
#   # print(img_question[2].shape)
#   l = img_question[2].shape[0]
#   e = img_question[2].shape[1]
#   if l > maxl:
#     maxl = l
#   if e > maxe:
#     maxe = e
# print(maxl)
# print(maxe)

194
1024


#### 

### Define Models and Related Functions

In [6]:
transformer = VQATransformer(num_layers, d_model, num_heads, dff, vocab_size, pe_input, pe_output,
                          pretrained_cnn_type=cnn_type)

learning_rate = CustomSchedule(d_model)
optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)

train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='train_accuracy')

ckpt = tf.train.Checkpoint(transformer=transformer,
                           optimizer=optimizer)

ckpt_manager = tf.train.CheckpointManager(ckpt, check_point_path, max_to_keep=5)

In [7]:
@tf.function()
def train_step(img, question, kn, tar):
    if data_augmentation:
        img = aug(img)
    tar_inp = tar[:, :-1]
    tar_real = tar[:, 1:]

    enc_padding_mask, combined_mask, dec_padding_mask = create_masks(question, tar_inp)
    with tf.GradientTape() as tape:
        predictions, _ = transformer(question, img, kn, tar_inp,
                                     True,
                                     enc_padding_mask,
                                     combined_mask,
                                     dec_padding_mask)
        loss = loss_function(tar_real, predictions)

    gradients = tape.gradient(loss, transformer.trainable_variables)
    optimizer.apply_gradients(zip(gradients, transformer.trainable_variables))

    train_loss(loss)
    train_accuracy(tar_real, predictions)

In [8]:
def evaluate(question, img, kn):
    end_token = tf.constant(tokenizer.texts_to_sequences(['<end>']), tf.int32)
    output = dec_input = tf.expand_dims([tokenizer.word_index['<start>']], 0)
    for i in range(MAX_LENGTH):
        enc_padding_mask, combined_mask, dec_padding_mask = create_masks(
            question, output)
        predictions, attention_weights = transformer(question,
                                      img,
                                      kn,
                                      output,
                                      False,
                                      enc_padding_mask,
                                      combined_mask,
                                      dec_padding_mask)

        predictions = predictions[:, -1:, :]
        predicted_id = tf.cast(tf.argmax(predictions, axis=-1), tf.int32)
        if predicted_id == end_token:
            return tf.squeeze(output, axis=0), attention_weights
        output = tf.concat([output, predicted_id], axis=-1)
    return tf.squeeze(output, axis=0), attention_weights

#### 

### Train Model

In [16]:
## restore check point 
# ckpt.restore(ckpt_manager.latest_checkpoint)
# print(batch_train_set)
# EPOCHS = 10

In [9]:
def eval_result():  
  true_answers_list = []
  predicted_answers_list = []
  ques_id_list = []
  # print('Start predicting...')
  for (batch, (img_question, target, ques_id)) in enumerate(batch_test_set):
      target = target.numpy()
      target = target[0]
      true_answer = []
      for i in target:
          if i == 0:
              break
          else:
              true_answer.append(tokenizer.index_word[i])
      true_answer = " ".join(true_answer[1: -1])

      prediction, attention = evaluate(img_question[1], img_question[0], img_question[2])
      p = prediction.numpy()
      predict_answer = [tokenizer.index_word[i] for i in p][1:]
      predict_answer = " ".join(predict_answer)
      true_answers_list.append(true_answer)
      predicted_answers_list.append(predict_answer)
      ques_id_list.append(ques_id)
      print("predicted answer: " + str(batch), end='\r', flush=True)

  # save predictions
  data = {"true answer": true_answers_list, "predicted answer": predicted_answers_list, "ques_id":ques_id_list}
  df = pd.DataFrame(data)
  if not pathlib.Path(saving_folder).exists():
      pathlib.Path(saving_folder).mkdir(parents=True, exist_ok=True)
  name = save_result_path
  df.to_csv(name)
  # print("complete writing", name)

  # show scores
  scores = AnswerEvaluator(name).evaluate()
  print(scores)

In [None]:
for epoch in range(EPOCHS):
    start = time.time()
    train_loss.reset_states()
    train_accuracy.reset_states()
    for (batch, (img_question, tar, _)) in enumerate(batch_train_set):
        # print(img_question[0].shape, img_question[1].shape, img_question[2].shape)
        train_step(img_question[0], img_question[1], img_question[2], tar)
        # if batch % 50 == 0:
        #     print ('Epoch {} Batch {} Loss {:.4f} Accuracy {:.4f}'.format(
        #         epoch + 1, batch, train_loss.result(), train_accuracy.result()))
        

    print ('Epoch {} Loss {:.4f} Accuracy {:.4f}'.format(epoch + 1,
                                                train_loss.result(),
                                                train_accuracy.result()))
    if (epoch+1) % 5 == 0:
      ckpt_save_path = ckpt_manager.save()
      print ('Saving checkpoint for epoch {} at {}'.format(epoch+1,ckpt_save_path))
      
      eval_result()

    # print ('Time taken for 1 epoch: {} secs\n'.format(time.time() - start))

Epoch 1 Loss 0.8483 Accuracy 0.6868
Epoch 2 Loss 0.2148 Accuracy 0.9006
Epoch 3 Loss 0.1796 Accuracy 0.9174
Epoch 4 Loss 0.1498 Accuracy 0.9337
Epoch 5 Loss 0.1096 Accuracy 0.9536
Accuracy: 81.82
Exact Match: 81.82
F1 Score: 81.24
BLEU-1: 81.82
BLEU-2: 47.24
BLEU-3: 30.44
BLEU-4: 15.75
{'Accuracy': 81.82, 'Exact Match': 81.82, 'F1 Score': 81.24, 'BLEU-1': 81.82, 'BLEU-2': 47.24, 'BLEU-3': 30.44, 'BLEU-4': 15.75}
Epoch 6 Loss 0.0878 Accuracy 0.9654
Epoch 7 Loss 0.0804 Accuracy 0.9680
Epoch 8 Loss 0.0726 Accuracy 0.9722
Epoch 9 Loss 0.0652 Accuracy 0.9759
Epoch 10 Loss 0.0530 Accuracy 0.9802
Accuracy: 83.93
Exact Match: 83.93
F1 Score: 83.88
BLEU-1: 83.93
BLEU-2: 48.46
BLEU-3: 31.23
BLEU-4: 16.15
{'Accuracy': 83.93, 'Exact Match': 83.93, 'F1 Score': 83.88, 'BLEU-1': 83.93, 'BLEU-2': 48.46, 'BLEU-3': 31.23, 'BLEU-4': 16.15}
Epoch 11 Loss 0.0521 Accuracy 0.9818
Epoch 12 Loss 0.0551 Accuracy 0.9793
Epoch 13 Loss 0.0524 Accuracy 0.9811
Epoch 14 Loss 0.0507 Accuracy 0.9825
Epoch 15 Loss 0.044

### 

### Predicting and Evaluating

In [12]:

data = {"true answer": true_answers_list, "predicted answer": predicted_answers_list, "ques_id":ques_id_list}
df = pd.DataFrame(data)
if not pathlib.Path(saving_folder).exists():
    pathlib.Path(saving_folder).mkdir(parents=True, exist_ok=True)
name = save_result_path
df.to_csv(name)
print("complete writing", name)

complete writing ./yes_no_results/transformer/biobert/resnet_2.csv


In [13]:
scores = AnswerEvaluator(name).evaluate()

Accuracy: 83.59
Exact Match: 83.59
F1 Score: 83.24
BLEU-1: 83.59
BLEU-2: 48.26
BLEU-3: 31.1
BLEU-4: 16.09
