In [1]:
# # Used for obtaining the training data
# ! python ./preprocessing/download_wordvecs.py --download_dir ./data
# ! python ./preprocessing/squad_preprocess.py --data_dir ./data

From the paper (Xiong, Zhong & Socher, "Dynamic Coattention Networks for Question Answering", ICLR 2017):
"We use a max sequence length of 600 during training and a hidden state size of 200 for all recurrent
units, maxout layers, and linear layers. All LSTMs have randomly initialized parameters and an
initial state of zero. Sentinel vectors are randomly initialized and optimized during training. For
the dynamic decoder, we set the maximum number of iterations to 4 and use a maxout pool size of
16. We use dropout to regularize our network during training (Srivastava et al., 2014), and optimize
the model using ADAM (Kingma & Ba, 2014). All models are implemented and trained with
Chainer (Tokui et al., 2015)."

In [2]:
from model import DynamicCoattentionNW
import chainer as ch
import numpy as np
import chainer.functions as F


# Hyperparameters; the values mirror the implementation details quoted from the paper above.
# Maximum sequence length: handed to both the model constructor and the batch generator.
max_seq_length = 600
# Hidden state size handed to the model constructor.
hid_state_size = 200

# Dynamic decoder: maximum number of iterations and maxout pool size.
dyn_dec_max_it, maxout_pool_size = 4, 16

# Dropout setting handed to the model; the quoted passage does not state a value,
# so 0.1 is this notebook's own choice.
dropout = 0.1

--------------------------------------------------------------------------------
CuPy (cupy) version 12.3.0 may not be compatible with this version of Chainer.
Please consider installing the supported version by running:
  $ pip install 'cupy>=7.7.0,<8.0.0'

See the following page for more details:
  https://docs.cupy.dev/en/latest/install.html
--------------------------------------------------------------------------------



In [None]:
from preprocessing.vocab import get_glove

# Pretrained GloVe file and the vector size passed alongside it.
glove_vector_size = 300
glove_path = "./data/glove.840B.300d.txt"

# get_glove hands back three objects: emb_mat (later given to the model) and the
# two lookup tables word2id / id2word (word2id is later given to the batch generator).
emb_mat, word2id, id2word = get_glove(glove_path, glove_vector_size)
vocab_size = len(word2id)

# Quick sanity check of what was loaded.
print(emb_mat.shape)

Loading GLoVE vectors from file: ./data/glove.840B.300d.txt


 87%|████████▋ | 1902150/2196017 [01:35<00:15, 19142.98it/s]

In [None]:

# Build the network from the hyperparameters and the loaded embedding matrix.
model = DynamicCoattentionNW(
    max_seq_length,
    hid_state_size,
    dyn_dec_max_it,
    maxout_pool_size,
    dropout,
    emb_mat,
)

# Adam with its default settings, attached to the model's parameters.
optimizer = ch.optimizers.Adam()
optimizer.setup(model)

# Move the model to the GPU; the training loop feeds it GPU arrays.
model.to_gpu()

In [None]:
from data_batcher import get_batch_generator
from chainer.backends import cuda
import time
from tqdm import tqdm

# ---- Training configuration ----
# batchsize = 2 # for quick test only
batchsize = 32 # seems to be the fastest overall
# batchsize = 64
max_epoch = 10
epochs_pre_trained = 0            # updated below if a checkpoint is found
show_mean_loss_at_batches = 200   # report the running mean loss every N batches

# Number of lines in the training files (hard-coded; must match the data on disk).
train_file_lines = 86307
# Ceiling division: tqdm needs a whole number of batches, otherwise the bar shows a
# fractional total (86307 / 32 = 2697.09...). This is an upper bound for the bar only;
# presumably discard_long=True drops over-long examples, so an epoch may end earlier.
batches_per_epoch = -(-train_file_lines // batchsize)

train_context_path = "./data/train.context"
train_qn_path = "./data/train.question"
train_ans_path = "./data/train.answer"
train_span_path = "./data/train.span"

# ---- Resume from the most recent checkpoint, if any ----
# Checkpoints are written as DCANW_E<epoch>.model, so probe from the newest epoch down.
# NOTE(review): only the model weights are restored; the optimizer state is not
# saved or loaded, so Adam starts from a fresh state after a resume.
for i in range(max_epoch, 0, -1):
    try:
        ch.serializers.load_npz('DCANW_E{}.model'.format(i), model)
    except FileNotFoundError:
        # No checkpoint for this epoch; try the previous one.
        continue
    print("Model DCANW_E{}.model loaded successfully".format(i))
    epochs_pre_trained = i
    break

# ---- Training loop ----
for epoch in range(1 + epochs_pre_trained, max_epoch + 1):
    print("Epoch: {}".format(epoch))
    # A fresh generator is created for every epoch.
    batch_gen = get_batch_generator(
        word2id,
        train_context_path,
        train_qn_path,
        train_span_path,
        batchsize,
        max_seq_length,
        max_seq_length,
        discard_long=True,
    )
    losses = []
    for batch_id, batch in enumerate(tqdm(batch_gen, total=batches_per_epoch), start=1):
        # Release unused blocks held by CuPy's pinned (host) memory pool.
        cuda.cupy.get_default_pinned_memory_pool().free_all_blocks()
        model.reset_state()

        # Forward pass: move the batch to the GPU and compute predictions and loss.
        c_seq = ch.Variable(cuda.to_gpu(batch.context_ids))
        q_seq = ch.Variable(cuda.to_gpu(batch.qn_ids))
        ground_truth = cuda.to_gpu(batch.ans_span)
        s_prediction, e_prediction, loss = model(c_seq, q_seq, ground_truth)

        # Backward pass: clear stale gradients, then backpropagate the loss.
        model.cleargrads()
        loss.backward()

        # Update all the trainable parameters.
        optimizer.update()

        # Keep the scalar loss on the host for the running mean.
        losses.append(loss.item())
        if batch_id % show_mean_loss_at_batches == 0:
            # tqdm.write prints without corrupting the active progress bar.
            tqdm.write("Current mean loss of epoch: {}".format(np.mean(losses)))

    print("Mean loss of epoch {}: {}".format(epoch, np.mean(losses)))
    # One checkpoint per epoch; the resume logic above looks for these files.
    ch.serializers.save_npz('DCANW_E{}.model'.format(epoch), model)

print("Training for {} epochs finished".format(max_epoch))

In [None]:

    #  # Display the training loss
    #     print('epoch:{} train_loss:{:.04f} '.format(
    #         epoch, ))

    #     test_losses = []
    #     test_accuracies = []
    #     for test_batch in test_iter:
    #         image_test, target_test = concat_examples(test_batch, gpu_id)

    #         # Forward the test data
    #         prediction_test = model(image_test)

    #         # Calculate the loss
    #         loss_test = F.softmax_cross_entropy(prediction_test, target_test)
    #         test_losses.append(to_cpu(loss_test.array))

    #         # Calculate the accuracy
    #         accuracy = F.accuracy(prediction_test, target_test)
    #         accuracy.to_cpu()
    #         test_accuracies.append(accuracy.array)

    #     test_iter.reset()

    #     print('val_loss:{:.04f} val_accuracy:{:.04f}'.format(
    #         np.mean(test_losses), np.mean(test_accuracies)))

# # train_iter = ch.iterators.SerialIterator(train, batchsize)
# # test_iter = ch.iterators.SerialIterator(test, batchsize, False, False)

In [None]:
# # Create the updater, using the optimizer --> not used
# updater = ch.training.StandardUpdater(train_iter, optimizer, device=-1)

In [None]:
# # Set up a trainer -- not used
# max_epoch = 10
# trainer = ch.training.Trainer(updater, (max_epoch, 'epoch'), out='result')