In [1]:
# # Used for obtaining the training data
# ! python ./preprocessing/download_wordvecs.py --download_dir ./data
# ! python ./preprocessing/squad_preprocess.py --data_dir ./data

From the paper (Xiong et al., "Dynamic Coattention Networks for Question Answering"):
"We use a max sequence length of 600 during training and a hidden state size of 200 for all recurrent
units, maxout layers, and linear layers. All LSTMs have randomly initialized parameters and an
initial state of zero. Sentinel vectors are randomly initialized and optimized during training. For
the dynamic decoder, we set the maximum number of iterations to 4 and use a maxout pool size of
16. We use dropout to regularize our network during training (Srivastava et al., 2014), and optimize
the model using ADAM (Kingma & Ba, 2014). All models are implemented and trained with
Chainer (Tokui et al., 2015)."

In [2]:
# %load_ext autoreload

# %autoreload 2

from model import DynamicCoattentionNW
import chainer as ch
import numpy as np
import chainer.functions as F


# Hyperparameters, following the experimental setup quoted from the paper above.
max_seq_length = 600  # maximum sequence length used during training
hid_state_size = 200  # hidden size for recurrent units, maxout layers and linear layers

# Dynamic decoder: maximum number of iterations and maxout pool size.
dyn_dec_max_it, maxout_pool_size = 4, 16

# Dropout setting handed to the model; the quoted passage mentions dropout
# but does not state a value, so 0.1 is this notebook's own choice.
dropout = 0.1

--------------------------------------------------------------------------------
CuPy (cupy) version 12.3.0 may not be compatible with this version of Chainer.
Please consider installing the supported version by running:
  $ pip install 'cupy>=7.7.0,<8.0.0'

See the following page for more details:
  https://docs.cupy.dev/en/latest/install.html
--------------------------------------------------------------------------------



In [3]:
from preprocessing.vocab import get_glove

# Path to the GloVe text file and the dimensionality of its vectors
# (the "300d" in the file name matches glove_vector_size).
glove_path = "./data/glove.840B.300d.txt"
glove_vector_size = 300
# get_glove is a project helper; presumably emb_mat is the embedding matrix and
# word2id / id2word are the two directions of the vocabulary lookup -- confirm
# against preprocessing/vocab.py.
emb_mat, word2id, id2word = get_glove(glove_path, glove_vector_size)

# Vocabulary size is taken from the word -> id mapping.
vocab_size = len(word2id)

# Sanity check on the loaded matrix; the recorded run printed (2196018, 300).
print(emb_mat.shape)

Loading GLoVE vectors from file: ./data/glove.840B.300d.txt


100%|██████████| 2196017/2196017 [01:51<00:00, 19686.92it/s]

(2196018, 300)





In [4]:
def a_loss_function(prediction, ground_truth):
    """Combine start/end mean-squared errors into a single span loss.

    Not called by the training cell below, where the model returns its own
    loss; kept as a standalone alternative.

    Args:
        prediction: Pair ``(start, end)`` of predicted positions. Both
            elements must support ``.astype`` and elementwise ``>``
            (i.e. array-like objects).
        ground_truth: Two-dimensional array whose column 0 holds the true
            start positions and column 1 the true end positions.

    Returns:
        The sum of ``F.mean_squared_error`` for the start and the end
        positions, doubled when any predicted start lies after its
        predicted end.
    """
    pred_start, pred_end = prediction
    true_start = ground_truth[:, 0].astype('f')
    true_end = ground_truth[:, 1].astype('f')

    # Start term first, then end term, each compared as float32.
    span_loss = (
        F.mean_squared_error(pred_start.astype('f'), true_start)
        + F.mean_squared_error(pred_end.astype('f'), true_end)
    )

    # Penalize impossible predictions where a start index exceeds its end index.
    has_inverted_span = (pred_start > pred_end).any()
    return span_loss * 2 if has_inverted_span else span_loss

In [5]:

# Build the network from the hyperparameters and the embedding matrix defined in the cells above.
model = DynamicCoattentionNW(max_seq_length, hid_state_size, dyn_dec_max_it, maxout_pool_size, dropout, emb_mat)

# Set up an Adam optimizer (constructed with no arguments, i.e. library defaults) and attach it to the model
optimizer = ch.optimizers.Adam()
optimizer.setup(model)

# Move the model to the GPU. As the last expression of the cell, the returned
# model object is what shows up as the cell output.
model.to_gpu()

<model.DynamicCoattentionNW at 0x294ece967f0>

In [None]:
from data_batcher import get_batch_generator
from chainer.backends import cuda
import time
from tqdm import tqdm

# ---------------------------------------------------------------------------
# Training configuration
# ---------------------------------------------------------------------------
# batchsize = 2 # for quick test only
batchsize = 32 # seems to be the fastest overall
# batchsize = 64
max_epoch = 10                    # train until this many epochs have been completed
epochs_pre_trained = 0            # overwritten below when a checkpoint is found
show_mean_loss_at_batches = 200   # print the running mean loss every N batches

train_file_lines = 86307          # number of examples in the training files

train_context_path = "./data/train.context"
train_qn_path = "./data/train.question"
train_ans_path = "./data/train.answer"
train_span_path = "./data/train.span"

# Progress-bar length: ceiling division so tqdm receives an integer total
# (a float total is rendered as e.g. "2697.09375"). This is only an upper
# estimate, because the batch generator is asked to discard over-long examples.
batches_per_epoch = -(-train_file_lines // batchsize)

# ---------------------------------------------------------------------------
# Resume from the most recent checkpoint, if one exists
# ---------------------------------------------------------------------------
# Checkpoints are probed from the highest epoch downwards; the first file that
# loads wins. A missing file simply means that epoch was never saved.
# NOTE(review): only the model weights are checkpointed; the optimizer is not
# saved or restored, so its internal state starts fresh after a resume.
for checkpoint_epoch in range(max_epoch, 0, -1):
    try:
        ch.serializers.load_npz('DCANW_E{}.model'.format(checkpoint_epoch), model)
    except FileNotFoundError:
        continue
    print("Model DCANW_E{}.model loaded successfully".format(checkpoint_epoch))
    epochs_pre_trained = checkpoint_epoch
    break

# ---------------------------------------------------------------------------
# Training loop
# ---------------------------------------------------------------------------
for epoch in range(1+epochs_pre_trained, max_epoch+1):
    print("Epoch: {}".format(epoch))
    # A fresh generator is created for every epoch.
    batch_gen = get_batch_generator(word2id, train_context_path, train_qn_path, train_span_path, batchsize, max_seq_length, max_seq_length, discard_long=True)
    losses = []
    for batch_id, batch in enumerate(tqdm(batch_gen, total=batches_per_epoch), start=1): # progress bar
        # Release pinned host memory held by CuPy's default pool before each batch.
        cuda.cupy.get_default_pinned_memory_pool().free_all_blocks()
        model.reset_state()

        # Move the batch to the GPU and run the forward pass. The model returns
        # the loss itself, so the standalone a_loss_function is not used here.
        c_seq = ch.Variable(cuda.to_gpu(batch.context_ids))
        q_seq = ch.Variable(cuda.to_gpu(batch.qn_ids))
        ground_truth = cuda.to_gpu(batch.ans_span)
        s_prediction, e_prediction, loss = model(c_seq, q_seq, ground_truth)

        # Calculate the gradients in the network
        model.cleargrads()
        loss.backward()

        # Update all the trainable parameters
        optimizer.update()

        # Track the scalar loss; report the mean over the epoch so far at a fixed interval.
        losses.append(loss.item())
        if batch_id % show_mean_loss_at_batches == 0:
            print("Current mean loss of epoch: {}".format(np.mean(losses)))

    print("Mean loss of epoch {}: {}".format(epoch, np.mean(losses)))
    # One checkpoint per completed epoch; the resume logic above looks for these files.
    ch.serializers.save_npz('DCANW_E{}.model'.format(epoch), model)



print("Training for {} epochs finished".format(max_epoch))

Epoch: 1


  4%|▎         | 100/2697.09375 [04:15<1:39:40,  2.30s/it]

Current mean loss: 8.49388102054596


  7%|▋         | 200/2697.09375 [08:31<1:40:23,  2.41s/it]

Current mean loss: 8.207823593616485


 11%|█         | 300/2697.09375 [12:47<1:35:45,  2.40s/it]

Current mean loss: 8.046257079442341


 15%|█▍        | 400/2697.09375 [17:02<1:44:07,  2.72s/it]

Current mean loss: 7.952217050790787


 19%|█▊        | 500/2697.09375 [20:52<1:20:55,  2.21s/it]

Current mean loss: 7.895180998802185


 22%|██▏       | 600/2697.09375 [24:50<1:13:57,  2.12s/it]

Current mean loss: 7.829199256102244


 26%|██▌       | 700/2697.09375 [28:39<1:19:43,  2.40s/it]

Current mean loss: 7.78052865913936


 30%|██▉       | 800/2697.09375 [32:54<1:18:31,  2.48s/it]

Current mean loss: 7.7429425102472305


 33%|███▎      | 900/2697.09375 [37:20<1:16:54,  2.57s/it]

Current mean loss: 7.707712915738424


 37%|███▋      | 1000/2697.09375 [41:34<1:03:48,  2.26s/it]

Current mean loss: 7.673960052013397


 41%|████      | 1100/2697.09375 [45:35<1:06:02,  2.48s/it]

Current mean loss: 7.6466859605095605


 44%|████▍     | 1200/2697.09375 [49:56<1:01:05,  2.45s/it]

Current mean loss: 7.620879534482956


 48%|████▊     | 1300/2697.09375 [54:13<1:02:14,  2.67s/it]

Current mean loss: 7.594574788900522


 52%|█████▏    | 1400/2697.09375 [58:32<52:42,  2.44s/it]  

Current mean loss: 7.576097026552473


 56%|█████▌    | 1500/2697.09375 [1:02:29<46:16,  2.32s/it]

Current mean loss: 7.557947903633118


 59%|█████▉    | 1600/2697.09375 [1:06:45<52:24,  2.87s/it]

Current mean loss: 7.541947692632675


 63%|██████▎   | 1700/2697.09375 [1:10:51<40:03,  2.41s/it]

Current mean loss: 7.5202159760980045


 67%|██████▋   | 1800/2697.09375 [1:15:04<35:32,  2.38s/it]

Current mean loss: 7.489921306504144


 70%|███████   | 1900/2697.09375 [1:19:17<33:52,  2.55s/it]

Current mean loss: 7.450511437717237


 74%|███████▍  | 2000/2697.09375 [1:23:39<28:30,  2.45s/it]

Current mean loss: 7.395540391921997


 78%|███████▊  | 2100/2697.09375 [1:27:41<24:38,  2.48s/it]

Current mean loss: 7.336952983765375


 82%|████████▏ | 2200/2697.09375 [1:31:28<19:36,  2.37s/it]

Current mean loss: 7.273433798876676


 85%|████████▌ | 2300/2697.09375 [1:35:30<15:11,  2.29s/it]

Current mean loss: 7.210173169218976


 89%|████████▉ | 2400/2697.09375 [1:39:27<11:20,  2.29s/it]

Current mean loss: 7.14447316010793


 93%|█████████▎| 2500/2697.09375 [1:43:22<08:19,  2.53s/it]

Current mean loss: 7.072308417129516


 96%|█████████▋| 2600/2697.09375 [1:47:05<03:59,  2.47s/it]

Current mean loss: 6.997671268719893


100%|█████████▉| 2696/2697.09375 [1:50:55<00:02,  2.47s/it]


Mean loss of epoch 1: 6.9193424490509825
Epoch: 2


  0%|          | 7/2697.09375 [00:17<1:51:56,  2.50s/it]


KeyboardInterrupt: 

In [None]:

    #  # Display the training loss
    #     print('epoch:{} train_loss:{:.04f} '.format(
    #         epoch, ))

    #     test_losses = []
    #     test_accuracies = []
    #     for test_batch in test_iter:
    #         image_test, target_test = concat_examples(test_batch, gpu_id)

    #         # Forward the test data
    #         prediction_test = model(image_test)

    #         # Calculate the loss
    #         loss_test = F.softmax_cross_entropy(prediction_test, target_test)
    #         test_losses.append(to_cpu(loss_test.array))

    #         # Calculate the accuracy
    #         accuracy = F.accuracy(prediction_test, target_test)
    #         accuracy.to_cpu()
    #         test_accuracies.append(accuracy.array)

    #     test_iter.reset()

    #     print('val_loss:{:.04f} val_accuracy:{:.04f}'.format(
    #         np.mean(test_losses), np.mean(test_accuracies)))

# # train_iter = ch.iterators.SerialIterator(train, batchsize)
# # test_iter = ch.iterators.SerialIterator(test, batchsize, False, False)

In [None]:
# # Create the updater, using the optimizer --> not used
# updater = ch.training.StandardUpdater(train_iter, optimizer, device=-1)

In [None]:
# # Set up a trainer -- not used
# max_epoch = 10
# trainer = ch.training.Trainer(updater, (max_epoch, 'epoch'), out='result')