In [None]:
# # Used for obtaining the training data
# # Uncomment and run for the first-time setup
# ! python ./preprocessing/download_wordvecs.py --download_dir ./data
# ! python ./preprocessing/squad_preprocess.py --data_dir ./data

Ideas / TODO:
- Check that dropout is applied everywhere it should be.
- Check EmbId: compare the input it expects with the input it actually receives.
- Input padding: is a mask needed?

From the paper:
"We use a max sequence length of 600 during training and a hidden state size of 200 for all recurrent
units, maxout layers, and linear layers. All LSTMs have randomly initialized parameters and an
initial state of zero. Sentinel vectors are randomly initialized and optimized during training. For
the dynamic decoder, we set the maximum number of iterations to 4 and use a maxout pool size of
16. We use dropout to regularize our network during training (Srivastava et al., 2014), and optimize
the model using ADAM (Kingma & Ba, 2014). All models are implemented and trained with
Chainer (Tokui et al., 2015)."

In [1]:
from model import DynamicCoattentionNW
import chainer as ch
import numpy as np
import chainer.functions as F


# Hyper-parameters; the first four values match the paper excerpt quoted above.
max_seq_length = 600   # maximum sequence length used during training
hid_state_size = 200   # hidden state size for recurrent units, maxout and linear layers
dyn_dec_max_it = 4     # maximum number of dynamic-decoder iterations
maxout_pool_size = 16  # maxout pool size

# Dropout value handed to the model (the quoted excerpt does not state a rate).
dropout = 0.1

--------------------------------------------------------------------------------
CuPy (cupy) version 12.3.0 may not be compatible with this version of Chainer.
Please consider installing the supported version by running:
  $ pip install 'cupy>=7.7.0,<8.0.0'

See the following page for more details:
  https://docs.cupy.dev/en/latest/install.html
--------------------------------------------------------------------------------



In [2]:
from preprocessing.vocab import get_glove

# Pre-trained GloVe embeddings (840B tokens, 300-dimensional vectors).
glove_vector_size = 300
glove_path = "./data/glove.840B.300d.txt"

# get_glove returns the embedding matrix plus both vocabulary lookup tables.
# NOTE(review): the recorded output shows one more matrix row than the progress
# bar counts lines in the file -- presumably a reserved/special token; confirm
# in preprocessing.vocab.
emb_mat, word2id, id2word = get_glove(glove_path, glove_vector_size)
vocab_size = len(word2id)

print(emb_mat.shape)

Loading GLoVE vectors from file: ./data/glove.840B.300d.txt


100%|██████████| 2196017/2196017 [01:49<00:00, 20011.28it/s]

(2196018, 300)





In [3]:

# Build the network from the hyper-parameters and the GloVe embedding matrix.
model = DynamicCoattentionNW(
    max_seq_length,
    hid_state_size,
    dyn_dec_max_it,
    maxout_pool_size,
    dropout,
    emb_mat,
)

# Adam with its default settings, attached to the model.
optimizer = ch.optimizers.Adam()
optimizer.setup(model)

# Transfer the model to the GPU; as the last expression of the cell, the
# returned model object is echoed as the cell output.
model.to_gpu()

<model.DynamicCoattentionNW at 0x253eee64970>

In [4]:
from data_batcher import get_batch_generator
from chainer.backends import cuda
from tqdm import tqdm

# --- Training configuration ---------------------------------------------------
# batchsize = 2  # for quick tests only
batchsize = 32  # seems to be the fastest overall
# batchsize = 64
# max_epoch = 15
max_epoch = 10
epochs_pre_trained = 0
show_mean_loss_at_batches = 200

train_file_lines = 86300  # roughly

# Estimated batches per epoch, rounded up to an integer. A fractional `total`
# can be overshot by less than one step, which makes tqdm emit
# "clamping frac to range [0, 1]" warnings while drawing the bar.
train_batches_per_epoch = (train_file_lines + batchsize - 1) // batchsize

train_context_path = "./data/train.context"
train_qn_path = "./data/train.question"
train_ans_path = "./data/train.answer"
train_span_path = "./data/train.span"

# --- Resume from the most recent checkpoint, if any ---------------------------
# Checkpoints are probed from the highest epoch downwards; the first one found
# is loaded and training continues with the following epoch.
# NOTE(review): only the model weights are checkpointed. The optimizer is not
# saved, so a resumed run starts with a fresh Adam state.
for i in range(max_epoch, 0, -1):
    try:
        ch.serializers.load_npz('DCANW_E{}.model'.format(i), model)
        print("Model DCANW_E{}.model loaded successfully".format(i))
        epochs_pre_trained = i
        break
    except FileNotFoundError:
        continue

# --- Training -----------------------------------------------------------------
for epoch in range(1 + epochs_pre_trained, max_epoch + 1):
    print("Epoch: {}".format(epoch))
    batch_gen = get_batch_generator(
        word2id, train_context_path, train_qn_path, train_span_path,
        batchsize, max_seq_length, max_seq_length, discard_long=True,
    )
    losses = []
    progress = tqdm(batch_gen, total=train_batches_per_epoch)  # progress bar
    for batch_id, batch in enumerate(progress, start=1):
        cuda.cupy.get_default_pinned_memory_pool().free_all_blocks()  # free up memory
        model.reset_state()

        # Forward pass: span predictions and loss for this batch.
        c_seq = ch.Variable(cuda.to_gpu(batch.context_ids))
        q_seq = ch.Variable(cuda.to_gpu(batch.qn_ids))
        ground_truth = cuda.to_gpu(batch.ans_span)
        s_prediction, e_prediction, loss = model(c_seq, q_seq, ground_truth)

        # Backward pass: drop stale gradients, then backpropagate the loss.
        model.cleargrads()
        loss.backward()

        # Update all the trainable parameters.
        optimizer.update()

        losses.append(loss.item())
        if batch_id % show_mean_loss_at_batches == 0:
            print("Current mean loss of epoch: {}".format(np.mean(losses)))

    # Without this guard an empty epoch would print a NaN mean, fail with an
    # opaque IndexError on losses[-1], and give no hint about the cause.
    if not losses:
        raise RuntimeError(
            "Epoch {} produced no batches; check the training data files".format(epoch)
        )

    print("Mean loss of epoch {}: {}".format(epoch, np.mean(losses)))
    print("Last loss of epoch {}: {}".format(epoch, losses[-1]))
    ch.serializers.save_npz('DCANW_E{}.model'.format(epoch), model)


print("Training for {} epochs finished".format(max_epoch))

Model DCANW_E10.model loaded successfully
Training for 10 epochs finished


In [None]:
# Quick and dirty evaluation: run the model over the dev set and write, for each
# example, the context, question, reference answer and predicted span to log.txt.
batchsize = 32  # seems to be the fastest overall
# batchsize = 4
# batchsize = 64

dev_file_lines = 10400  # roughly

# Integer (rounded-up) batch estimate for the progress bar; a fractional `total`
# makes tqdm emit "clamping frac to range [0, 1]" warnings near the end.
dev_batches = (dev_file_lines + batchsize - 1) // batchsize

dev_context_path = "./data/dev.context"
dev_qn_path = "./data/dev.question"
dev_ans_path = "./data/dev.answer"
dev_span_path = "./data/dev.span"

batch_gen = get_batch_generator(
    word2id, dev_context_path, dev_qn_path, dev_span_path,
    batchsize, max_seq_length, max_seq_length, discard_long=True,
)

# - encoding="utf-8": the tokens may contain non-ASCII characters, and the
#   platform default encoding (e.g. cp1252 on Windows) can fail to encode them.
# - using_config('train', False): switch Chainer to test mode.
# - no_backprop_mode(): no backward pass happens here, so skip building the
#   computational graph and save memory.
with open("log.txt", "w", encoding="utf-8") as file, \
        ch.using_config('train', False), ch.no_backprop_mode():
    for batch in tqdm(batch_gen, total=dev_batches):  # progress bar
        cuda.cupy.get_default_pinned_memory_pool().free_all_blocks()  # free up memory
        model.reset_state()

        # Calculate the prediction; the loss returned by the model is ignored.
        c_seq = ch.Variable(cuda.to_gpu(batch.context_ids))
        q_seq = ch.Variable(cuda.to_gpu(batch.qn_ids))
        ground_truth = cuda.to_gpu(batch.ans_span)
        s_prediction, e_prediction, _ = model(c_seq, q_seq, ground_truth)

        for j in range(len(s_prediction)):
            context_tokens = batch.context_tokens[j]
            start, end = int(s_prediction[j]), int(e_prediction[j])
            # The predicted end index is inclusive, hence the +1 in the slice.
            predicted_tokens = context_tokens[start:end + 1]

            file.write(
                "\nContext: " + " ".join(context_tokens)
                + "\nQuestion: " + " ".join(batch.qn_tokens[j])
                + "\nAnswer: " + " ".join(batch.ans_tokens[j])
                + "\nPrediction: " + " ".join(predicted_tokens)
                + "\n"
            )

    


  full_bar = Bar(frac,
325it [02:06,  2.56it/s]                             
