In [1]:
# # Used for obtaining the training data
# ! python ./preprocessing/download_wordvecs.py --download_dir ./data
# ! python ./preprocessing/squad_preprocess.py --data_dir ./data

From the paper (Xiong, Zhong & Socher, "Dynamic Coattention Networks for Question Answering", ICLR 2017):
"We use a max sequence length of 600 during training and a hidden state size of 200 for all recurrent
units, maxout layers, and linear layers. All LSTMs have randomly initialized parameters and an
initial state of zero. Sentinel vectors are randomly initialized and optimized during training. For
the dynamic decoder, we set the maximum number of iterations to 4 and use a maxout pool size of
16. We use dropout to regularize our network during training (Srivastava et al., 2014), and optimize
the model using ADAM (Kingma & Ba, 2014). All models are implemented and trained with
Chainer (Tokui et al., 2015)."

In [2]:
from model import DynamicCoattentionNW
import chainer as ch
import chainer.functions as F


# Hyperparameters, following the values quoted from the paper above.
max_seq_length = 600   # maximum sequence length used during training
hid_state_size = 200   # hidden state size for recurrent units, maxout layers and linear layers

# Dynamic decoder: maximum number of iterations and maxout pool size.
dyn_dec_max_it, maxout_pool_size = 4, 16

# Dropout setting handed to the model constructor.
dropout = 0.1

--------------------------------------------------------------------------------
CuPy (cupy) version 12.3.0 may not be compatible with this version of Chainer.
Please consider installing the supported version by running:
  $ pip install 'cupy>=7.7.0,<8.0.0'

See the following page for more details:
  https://docs.cupy.dev/en/latest/install.html
--------------------------------------------------------------------------------



In [3]:
from preprocessing.vocab import get_glove

# Load the pre-trained GloVe vectors through the project's get_glove helper.
# It returns three values: the embedding matrix and, judging by their names,
# the word->id and id->word lookup tables (verify against preprocessing/vocab.py).
glove_path = "./data/glove.840B.300d.txt"
glove_vector_size = 300  # vector size passed to get_glove; the file name also says "300d"
emb_mat, word2id, id2word = get_glove(glove_path, glove_vector_size)

# Vocabulary size is taken from the number of entries in word2id.
vocab_size = len(word2id)

# Sanity check of the loaded matrix. The recorded run printed (2196018, 300),
# one row more than the 2196017 lines read from the file -- presumably a
# reserved/special-token row; confirm in get_glove.
print(emb_mat.shape)

Loading GLoVE vectors from file: ./data/glove.840B.300d.txt


100%|██████████| 2196017/2196017 [01:48<00:00, 20281.67it/s]

(2196018, 300)





In [4]:
# %load_ext autoreload

# %autoreload 2

# from data_batcher import get_batch_generator
# batchsize = 4
# # batchsize = 128

# train_context_path = "./data/train.context"
# train_qn_path = "./data/train.question"
# train_ans_path = "./data/train.answer"
# train_span_path = "./data/train.span"

# batch_gen = get_batch_generator(word2id, train_context_path, train_qn_path, train_span_path, batchsize, max_seq_length, max_seq_length, discard_long=True)

# #temp
# from model import DocAndQuesEncoder, CoattentionEncoder, DynamicPointingDecoder, DynamicCoattentionNW
# # enc = DocAndQuesEncoder(emb_mat, dropout, hid_state_size)
# # enc2 = CoattentionEncoder(dropout, hid_state_size)
# # dec = DynamicPointingDecoder(dropout, hid_state_size, maxout_pool_size, dyn_dec_max_it)
# model = DynamicCoattentionNW(max_seq_length, hid_state_size, dyn_dec_max_it, maxout_pool_size, dropout, emb_mat)

# for batch in batch_gen:
#     s, e = model.forward(batch.context_ids, batch.qn_ids)
#     # D, Q = enc.forward(batch.context_ids, batch.qn_ids)
#     # U = enc2.forward(D, Q)
#     # s, e = dec.forward(U)
#     print(s)
#     print(e)


#     print(batch.ans_span)
#     # print(batch.ans_tokens)   
#     break

# # train_iter = ch.iterators.SerialIterator(train, batchsize)
# # test_iter = ch.iterators.SerialIterator(test, batchsize, False, False)

In [5]:
def a_loss_function(prediction, ground_truth):
    """Sum of the mean squared errors of the predicted start and end positions.

    Args:
        prediction: pair ``(start, end)`` of predicted positions. Both must
            support ``.astype`` and element-wise ``>`` comparison, i.e. they
            are treated as arrays here.
        ground_truth: 2-D array-like; column 0 is read as the true start and
            column 1 as the true end.

    Returns:
        The start MSE plus the end MSE, as returned by
        ``F.mean_squared_error``. The sum is doubled when at least one
        element of the batch has a predicted start greater than its
        predicted end (the whole batch is penalized, not just that element).

    NOTE(review): ``.astype`` is called directly on the predictions, which
    suggests they are plain index arrays rather than chainer Variables. If so,
    this loss is not connected to the model parameters and ``backward()``
    cannot propagate gradients to them -- confirm against the model's output.
    """
    pred_start, pred_end = prediction

    # Cast targets to float32 ('f') so they match the cast predictions.
    true_start = ground_truth[:, 0].astype('f')
    true_end = ground_truth[:, 1].astype('f')

    combined = (
        F.mean_squared_error(pred_start.astype('f'), true_start)
        + F.mean_squared_error(pred_end.astype('f'), true_end)
    )

    # Penalize impossible spans (start after end) anywhere in the batch.
    has_inverted_span = (pred_start > pred_end).any()
    if has_inverted_span:
        combined = combined * 2
    return combined

In [6]:
# Build the network from the hyperparameters defined earlier and the GloVe
# embedding matrix.
model = DynamicCoattentionNW(max_seq_length, hid_state_size, dyn_dec_max_it, maxout_pool_size, dropout, emb_mat)

# Setup an optimizer: Adam constructed with no arguments (library defaults),
# then attached to the model.
optimizer = ch.optimizers.Adam()
optimizer.setup(model)

# Move the model to the GPU. As the last expression of the cell, its return
# value is what the notebook echoes as the cell output.
model.to_gpu()

<model.DynamicCoattentionNW at 0x1ffc54f6a30>

In [None]:
from data_batcher import get_batch_generator
from chainer.backends import cuda
import time
from tqdm import tqdm

# Training configuration. Batch sizes of 16 and 32 were the alternatives
# previously listed here.
batchsize = 64
max_epoch = 10

# Presumably the number of examples (lines) in the training files -- verify
# against the data. It is only used to size the progress bar.
train_file_lines = 86307

train_context_path = "./data/train.context"
train_qn_path = "./data/train.question"
train_ans_path = "./data/train.answer"
train_span_path = "./data/train.span"

# Ceiling division gives tqdm an integer batch count; true division produced a
# fractional total (shown as "1348.546875" in the progress bar).
# This is an upper bound: with discard_long=True the generator presumably
# drops over-long examples, so an epoch may finish with fewer batches.
batches_per_epoch = (train_file_lines + batchsize - 1) // batchsize

for epoch in range(max_epoch):
    # Create a new batch generator for every epoch (presumably because the
    # generator is exhausted after one pass over the data).
    batch_gen = get_batch_generator(word2id, train_context_path, train_qn_path, train_span_path, batchsize, max_seq_length, max_seq_length, discard_long=True)

    # batch_id counts batches from 1 within the current epoch.
    for batch_id, batch in enumerate(tqdm(batch_gen, total=batches_per_epoch), start=1):
        # Move the batch's context/question id arrays to the GPU and run the
        # forward pass of the network.
        c_seq = ch.Variable(cuda.to_gpu(batch.context_ids))
        q_seq = ch.Variable(cuda.to_gpu(batch.qn_ids))
        prediction = model(c_seq, q_seq)

        # Calculate the loss against the answer spans (also moved to the GPU).
        loss = a_loss_function(prediction, cuda.to_gpu(batch.ans_span))

        # Clear old gradients, then backpropagate the new loss.
        model.cleargrads()
        loss.backward()

        # Update all the trainable parameters.
        optimizer.update()

        # Report the current batch loss every 100 batches.
        if batch_id % 100 == 0:
            print(loss.item())

    # Save a checkpoint after every epoch; the file name carries the
    # zero-based epoch index.
    ch.serializers.save_npz('DCANW_E{}.model'.format(epoch), model)
    

    #  # Display the training loss
    #     print('epoch:{} train_loss:{:.04f} '.format(
    #         epoch, ))

    #     test_losses = []
    #     test_accuracies = []
    #     for test_batch in test_iter:
    #         image_test, target_test = concat_examples(test_batch, gpu_id)

    #         # Forward the test data
    #         prediction_test = model(image_test)

    #         # Calculate the loss
    #         loss_test = F.softmax_cross_entropy(prediction_test, target_test)
    #         test_losses.append(to_cpu(loss_test.array))

    #         # Calculate the accuracy
    #         accuracy = F.accuracy(prediction_test, target_test)
    #         accuracy.to_cpu()
    #         test_accuracies.append(accuracy.array)

    #     test_iter.reset()

    #     print('val_loss:{:.04f} val_accuracy:{:.04f}'.format(
    #         np.mean(test_losses), np.mean(test_accuracies)))


  0%|          | 0/1348.546875 [00:00<?, ?it/s]

Refilling batches...
Refilling batches took 1.2815513610839844 seconds


  7%|▋         | 100/1348.546875 [02:03<26:43,  1.28s/it]

558034.0


 12%|█▏        | 160/1348.546875 [03:13<23:04,  1.16s/it]

Refilling batches...
Refilling batches took 1.1703357696533203 seconds


 15%|█▍        | 200/1348.546875 [04:02<24:36,  1.29s/it]

470720.71875


 22%|██▏       | 300/1348.546875 [06:01<22:21,  1.28s/it]

503531.15625


 24%|██▎       | 320/1348.546875 [06:24<20:14,  1.18s/it]

Refilling batches...
Refilling batches took 1.1794793605804443 seconds


 30%|██▉       | 400/1348.546875 [08:06<22:44,  1.44s/it]

508498.0


 36%|███▌      | 480/1348.546875 [09:45<17:30,  1.21s/it]

Refilling batches...
Refilling batches took 1.7500123977661133 seconds


 37%|███▋      | 500/1348.546875 [10:13<20:34,  1.46s/it]

472331.34375


 44%|████▍     | 600/1348.546875 [12:20<16:21,  1.31s/it]

448070.5


 47%|████▋     | 640/1348.546875 [13:10<14:22,  1.22s/it]

Refilling batches...
Refilling batches took 1.9687426090240479 seconds


 52%|█████▏    | 700/1348.546875 [14:29<14:39,  1.36s/it]

487435.15625


 59%|█████▉    | 800/1348.546875 [16:35<12:21,  1.35s/it]

497031.6875
Refilling batches...
Refilling batches took 1.925781011581421 seconds


 67%|██████▋   | 900/1348.546875 [18:41<09:36,  1.28s/it]

489301.78125


 71%|███████   | 960/1348.546875 [19:52<07:45,  1.20s/it]

Refilling batches...
Refilling batches took 1.1851739883422852 seconds


 74%|███████▍  | 1000/1348.546875 [20:41<07:29,  1.29s/it]

391557.4375


 82%|████████▏ | 1100/1348.546875 [22:40<05:20,  1.29s/it]

481671.1875


 83%|████████▎ | 1120/1348.546875 [23:07<05:25,  1.42s/it]

Refilling batches...
Refilling batches took 2.0009267330169678 seconds


 87%|████████▋ | 1172/1348.546875 [24:12<03:32,  1.20s/it]

In [None]:
# # Create the updater, using the optimizer
# updater = ch.training.StandardUpdater(train_iter, optimizer, device=-1)

In [None]:
# # Set up a trainer
# max_epoch = 10
# trainer = ch.training.Trainer(updater, (max_epoch, 'epoch'), out='result')

In [None]:
#save model
#see https://docs.chainer.org/en/stable/guides/serializers.html