In [None]:
# # Used for obtaining the training data
# ! python ./preprocessing/download_wordvecs.py --download_dir ./data
# ! python ./preprocessing/squad_preprocess.py --data_dir ./data

From the paper (Xiong, Zhong & Socher, "Dynamic Coattention Networks for Question Answering", arXiv:1611.01604):
"We use a max sequence length of 600 during training and a hidden state size of 200 for all recurrent
units, maxout layers, and linear layers. All LSTMs have randomly initialized parameters and an
initial state of zero. Sentinel vectors are randomly initialized and optimized during training. For
the dynamic decoder, we set the maximum number of iterations to 4 and use a maxout pool size of
16. We use dropout to regularize our network during training (Srivastava et al., 2014), and optimize
the model using ADAM (Kingma & Ba, 2014). All models are implemented and trained with
Chainer (Tokui et al., 2015)."

In [None]:
from model import DynamicCoattentionNW
import chainer as ch
import chainer.functions as F


# Hyperparameters handed to DynamicCoattentionNW and the batch generator.
# The first four values mirror the settings quoted from the paper.

# Maximum sequence length; later passed to the batch generator twice, once for
# the context and once for the question.
max_seq_length = 600

# Hidden state size (the paper uses 200 for all recurrent units, maxout layers
# and linear layers).
hid_state_size = 200

# Dynamic decoder settings: maximum number of iterations and maxout pool size.
dyn_dec_max_it = 4
maxout_pool_size = 16

# Dropout setting given to the model constructor. The quoted paper text names
# no value, so 0.1 appears to be a local choice — TODO confirm (and confirm
# whether the model treats it as a drop ratio).
dropout = 0.1

In [None]:
from preprocessing.vocab import get_glove

# Location of the GloVe vectors file and the vector size requested from the
# loader. The size presumably has to agree with the "300d" in the file name —
# TODO confirm against get_glove.
glove_path = "./data/glove.840B.300d.txt"
glove_vector_size = 300
# get_glove returns the embedding matrix plus both vocabulary mappings.
emb_mat, word2id, id2word = get_glove(glove_path, glove_vector_size)

# Number of entries in the word -> id mapping.
vocab_size = len(word2id)

# Sanity check: show the shape of the loaded embedding matrix.
print(emb_mat.shape)

In [None]:
# %load_ext autoreload

# %autoreload 2

# from data_batcher import get_batch_generator
# batchsize = 4
# # batchsize = 128

# train_context_path = "./data/train.context"
# train_qn_path = "./data/train.question"
# train_ans_path = "./data/train.answer"
# train_span_path = "./data/train.span"

# batch_gen = get_batch_generator(word2id, train_context_path, train_qn_path, train_span_path, batchsize, max_seq_length, max_seq_length, discard_long=True)

# #temp
# from model import DocAndQuesEncoder, CoattentionEncoder, DynamicPointingDecoder, DynamicCoattentionNW
# # enc = DocAndQuesEncoder(emb_mat, dropout, hid_state_size)
# # enc2 = CoattentionEncoder(dropout, hid_state_size)
# # dec = DynamicPointingDecoder(dropout, hid_state_size, maxout_pool_size, dyn_dec_max_it)
# model = DynamicCoattentionNW(max_seq_length, hid_state_size, dyn_dec_max_it, maxout_pool_size, dropout, emb_mat)

# for batch in batch_gen:
#     s, e = model.forward(batch.context_ids, batch.qn_ids)
#     # D, Q = enc.forward(batch.context_ids, batch.qn_ids)
#     # U = enc2.forward(D, Q)
#     # s, e = dec.forward(U)
#     print(s)
#     print(e)


#     print(batch.ans_span)
#     # print(batch.ans_tokens)   
#     break

# # train_iter = ch.iterators.SerialIterator(train, batchsize)
# # test_iter = ch.iterators.SerialIterator(test, batchsize, False, False)

In [None]:
def a_loss_function(prediction, ground_truth):
    """Combine the squared errors of the predicted start and end positions.

    Args:
        prediction: Pair ``(start, end)`` of predicted positions. Both are
            cast to float32 before being compared with the targets.
        ground_truth: Array indexed as ``[:, 0]`` for the true start and
            ``[:, 1]`` for the true end (assumed one row per example —
            TODO confirm against the batch generator).

    Returns:
        The sum of the two ``F.mean_squared_error`` terms, doubled when any
        predicted start is greater than its predicted end.

    NOTE(review): the ``.astype`` and ``>`` calls suggest the predictions are
    plain arrays rather than ``chainer.Variable`` objects. If so, the returned
    loss may not be connected to the model parameters — confirm against the
    model's forward pass.
    """
    pred_start, pred_end = prediction
    true_start = ground_truth[:, 0].astype('f')
    true_end = ground_truth[:, 1].astype('f')

    combined = (
        F.mean_squared_error(pred_start.astype('f'), true_start)
        + F.mean_squared_error(pred_end.astype('f'), true_end)
    )

    # Penalize impossible start > end predictions: a single offending pair
    # anywhere in the batch doubles the loss of the whole batch.
    if (pred_start > pred_end).any():
        return combined * 2
    return combined

In [None]:
# Build the network from the hyperparameters and the pre-trained embedding
# matrix defined in the earlier cells.
model = DynamicCoattentionNW(max_seq_length, hid_state_size, dyn_dec_max_it, maxout_pool_size, dropout, emb_mat)

# Set up an Adam optimizer with its default hyperparameters and attach it to
# the model.
optimizer = ch.optimizers.Adam()
optimizer.setup(model)

# Move the model to the GPU; the training cell transfers each batch with
# cuda.to_gpu to match.
model.to_gpu()

In [None]:
from data_batcher import get_batch_generator
from chainer.backends import cuda
import time
from tqdm import tqdm

# Batch size (smaller alternatives kept for reference).
# batchsize = 16
# batchsize = 32
batchsize = 64
# Index of the last epoch to train (epochs are numbered from 1).
max_epoch = 10
# Number of epochs already covered by a saved checkpoint; overwritten by the
# checkpoint search below when one is found.
epochs_pre_trained = 0

# Used only to size the progress bar; presumably the number of examples in the
# training files — TODO confirm.
train_file_lines = 86307

# Training data files. train_ans_path is not referenced by the code in this cell.
train_context_path = "./data/train.context"
train_qn_path = "./data/train.question"
train_ans_path = "./data/train.answer"
train_span_path = "./data/train.span"

# Resume from the most recent checkpoint: probe epoch numbers from max_epoch
# down to 1 and stop at the first file that loads.
for i in range(max_epoch, 0, -1):
    try:
        ch.serializers.load_npz('DCANW_E{}.model'.format(i), model)
    except FileNotFoundError:
        # No checkpoint saved for this epoch; try the previous one.
        continue
    else:
        print("Model DCANW_E{}.model loaded successfully".format(i))
        # Remember how many epochs are already done so training can skip them.
        epochs_pre_trained = i
        break

# batch_gen = get_batch_generator(word2id, train_context_path, train_qn_path, train_span_path, batchsize, max_seq_length, max_seq_length, discard_long=True)

# Expected number of batches per epoch, used only to size the progress bar.
# Integer ceiling division keeps the displayed total a whole number (true
# division would hand tqdm a float). The real count can be lower because
# discard_long=True is passed to the batch generator (presumably dropping
# over-length examples — TODO confirm).
batches_per_epoch = (train_file_lines + batchsize - 1) // batchsize

# Start after the last checkpointed epoch (epochs_pre_trained is 0 on a fresh
# run) and write one checkpoint per completed epoch.
for epoch in range(1+epochs_pre_trained, max_epoch+1):
    print("Epoch: {}".format(epoch))
    # A new batch generator is created for every epoch.
    batch_gen = get_batch_generator(word2id, train_context_path, train_qn_path, train_span_path, batchsize, max_seq_length, max_seq_length, discard_long=True)
    for batch_id, batch in enumerate(tqdm(batch_gen, total=batches_per_epoch), start=1):
        # Calculate the prediction of the network (inputs are moved to the GPU
        # because the model lives there).
        c_seq = ch.Variable(cuda.to_gpu(batch.context_ids))
        q_seq = ch.Variable(cuda.to_gpu(batch.qn_ids))
        prediction = model(c_seq, q_seq)

        # Calculate the loss
        loss = a_loss_function(prediction, cuda.to_gpu(batch.ans_span))

        # Calculate the gradients in the network
        model.cleargrads()
        loss.backward()

        # Update all the trainable parameters
        optimizer.update()

        # Report the loss of every 100th batch.
        if batch_id % 100 == 0:
            print(loss.item())

    # Checkpoint the model; the file name encodes the epoch just finished so a
    # later run can resume from it.
    ch.serializers.save_npz('DCANW_E{}.model'.format(epoch), model)
    

    #  # Display the training loss
    #     print('epoch:{} train_loss:{:.04f} '.format(
    #         epoch, ))

    #     test_losses = []
    #     test_accuracies = []
    #     for test_batch in test_iter:
    #         image_test, target_test = concat_examples(test_batch, gpu_id)

    #         # Forward the test data
    #         prediction_test = model(image_test)

    #         # Calculate the loss
    #         loss_test = F.softmax_cross_entropy(prediction_test, target_test)
    #         test_losses.append(to_cpu(loss_test.array))

    #         # Calculate the accuracy
    #         accuracy = F.accuracy(prediction_test, target_test)
    #         accuracy.to_cpu()
    #         test_accuracies.append(accuracy.array)

    #     test_iter.reset()

    #     print('val_loss:{:.04f} val_accuracy:{:.04f}'.format(
    #         np.mean(test_losses), np.mean(test_accuracies)))


In [None]:
# # Create the updater, using the optimizer
# updater = ch.training.StandardUpdater(train_iter, optimizer, device=-1)

In [None]:
# # Set up a trainer
# max_epoch = 10
# trainer = ch.training.Trainer(updater, (max_epoch, 'epoch'), out='result')

In [None]:
#save model
#see https://docs.chainer.org/en/stable/guides/serializers.html