In [1]:
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torch.cuda import FloatTensor, LongTensor

np.random.seed(42)

In [2]:
SENTENCE_START = '<s>'
SENTENCE_END = '</s>'

PAD_TOKEN = '[PAD]' # This has a vocab id, which is used to pad the encoder input, decoder input and target sequence
UNKNOWN_TOKEN = '[UNK]' # This has a vocab id, which is used to represent out-of-vocabulary words
START_DECODING = '[START]' # This has a vocab id, which is used at the start of every decoder input sequence
STOP_DECODING = '[STOP]' # This has a vocab id, which is used at the end of untruncated target sequences

VOCAB_SIZE = 50000
ADDITIONAL_WORDS = 200

class Vocab(object):

  def __init__(self, vocab_file, max_size):
    self._word_to_id = {}
    self._id_to_word = {}
    self._count = 0 # keeps track of total number of words in the Vocab

    # [UNK], [PAD], [START] and [STOP] get the ids 0,1,2,3.
    for w in [UNKNOWN_TOKEN, PAD_TOKEN, START_DECODING, STOP_DECODING]:
      self._word_to_id[w] = self._count
      self._id_to_word[self._count] = w
      self._count += 1

    # Read the vocab file and add words up to max_size
    with open(vocab_file, 'r') as vocab_f:
      for line in vocab_f:
        pieces = line.split()
        if len(pieces) != 2:
          print('Warning: incorrectly formatted line in vocabulary file: %s\n' % line)
          continue
        w = pieces[0]
        if w in [SENTENCE_START, SENTENCE_END, UNKNOWN_TOKEN, PAD_TOKEN, START_DECODING, STOP_DECODING]:
          raise Exception('<s>, </s>, [UNK], [PAD], [START] and [STOP] shouldn\'t be in the vocab file, but %s is' % w)
        if w in self._word_to_id:
          raise Exception('Duplicated word in vocabulary file: %s' % w)
        self._word_to_id[w] = self._count
        self._id_to_word[self._count] = w
        self._count += 1
        if max_size != 0 and self._count >= max_size:
          print("max_size of vocab was specified as %i; we now have %i words. Stopping reading." % (max_size, self._count))
          break

    print("Finished constructing vocabulary of %i total words. Last word added: %s" % (self._count, self._id_to_word[self._count-1]))

  def word2id(self, word):
    if word not in self._word_to_id:
      return self._word_to_id[UNKNOWN_TOKEN]
    return self._word_to_id[word]

  def id2word(self, word_id):
    if word_id not in self._id_to_word:
      raise ValueError('Id not found in vocab: %d' % word_id)
    return self._id_to_word[word_id]

  def size(self):
    return self._count

  def write_metadata(self, fpath):
    print("Writing word embedding metadata file to %s..." % (fpath))
    with open(fpath, "w") as f:
      fieldnames = ['word']
      writer = csv.DictWriter(f, delimiter="\t", fieldnames=fieldnames)
      for i in xrange(self.size()):
        writer.writerow({"word": self._id_to_word[i]})
        


def article2ids(article_words, vocab):
    ids = []
    oovs = []
    unk_id = vocab.word2id(UNKNOWN_TOKEN)
    for w in article_words.split():
        w = str(w)
        i = vocab.word2id(w)
        if i == unk_id: # If w is OOV
            if w not in oovs: # Add to list of OOVs
                oovs.append(w)
            oov_num = oovs.index(w) # This is 0 for the first article OOV, 1 for the second article OOV...
            ids.append(vocab.size() + oov_num) # This is e.g. 50000 for the first article OOV, 50001 for the second...
        else:
            ids.append(i)
    return ids, oovs


def abstract2ids(abstract_words, vocab, article_oovs):
    ids = []
    unk_id = vocab.word2id(UNKNOWN_TOKEN)
    for w in abstract_words.split():
        i = vocab.word2id(w)
        if i == unk_id: # If w is an OOV word
            pass
            #if w in article_oovs: # If w is an in-article OOV
                #vocab_idx = vocab.size() + article_oovs.index(w) # Map to its temporary article OOV number
                #ids.append(vocab_idx)
        else:
            ids.append(i)
    return ids

def outputids2words(id_list, vocab, article_oovs):
    words = []
    for i in id_list:
        if i <= VOCAB_SIZE:
            w = vocab.id2word(i)
        else:
            article_oov_idx = i - vocab.size()
            w = article_oovs[article_oov_idx]
        words.append(w)
    return ' '.join(words)

In [3]:
import struct
from tensorflow.core.example import example_pb2

def example_gen(filename):
    reader = open(filename, 'rb')
    examples = []
    while True:
        len_bytes = reader.read(8)
        if not len_bytes: break # finished reading this file
        str_len = struct.unpack('q', len_bytes)[0]
        example_str = struct.unpack('%ds' % str_len, reader.read(str_len))[0]
        e = example_pb2.Example.FromString(example_str)
        examples.append(e)
        
    for e in examples:  
        article_text = e.features.feature['article'].bytes_list.value[0]
        abstract_text = e.features.feature['abstract'].bytes_list.value[0]
        yield (article_text.decode('utf-8'), abstract_text.decode('utf-8'))
        

  from ._conv import register_converters as _register_converters


In [4]:
vocab = Vocab("finished_files/vocab", VOCAB_SIZE)









max_size of vocab was specified as 50000; we now have 50000 words. Stopping reading.
Finished constructing vocabulary of 50000 total words. Last word added: perisic


In [5]:
from decoder import *

%load_ext autoreload
%autoreload 2

In [6]:
from decoder import *

def build_decoder_model():
    master = DRAGNNDecoderMaster()
    embeddings_computer = EmbeddingComputer(VOCAB_SIZE, 100)
    master.add_component_encoder(TBRU("embed", TaggerRecurrent("input", "embed", False), embeddings_computer, (1,), True).cuda())
    #master.add_component(TBRU("extractive", TaggerRecurrent("embed", "extractive"), TaggerComputer(1000, 1000), (1,), True).cuda())
    master.add_component_encoder(TBRU("rnn", RNNSolidRecurrent("embed", "rnn"), RNNSolidComputer(100, 50), (1,), True).cuda())
    
    master.add_component_decoder(TBRU("decoder_embed", TaggerRecurrent(None, "decoder_embed", True),embeddings_computer, (1,), True).cuda())
    rec = AdditiveAttentiveLSTMEncoderRecurrent(100, 100, 50, "rnn", "decoder_embed", "decoder", False)
    master.add_component_decoder(TBRU("decoder", rec, LSTMEncoderComputer(200, 100), (1,), True).cuda())
    master.add_component_decoder(TBRU("output", TaggerRecurrent("decoder", "output", False), TaggerComputer(100, VOCAB_SIZE), (1,), True).cuda())
    return master

model = build_decoder_model()
torch.cuda.memory_allocated()

41077760

In [7]:
def add_padding(articles):
    lens = [len(article) for article in articles]
    max_len = max(lens)
    
    for i in range(len(articles)):
        articles[i].extend([vocab.word2id(PAD_TOKEN)]*(max_len - len(articles[i])))
    return np.array(articles).T

def add_padding_for_tagging(articles, targets):
    lens = [len(article) for article in articles]
    max_len = max(lens)
    
    for i in range(len(articles)):
        targets[i].extend([0]*(max_len - len(articles[i])))
        articles[i].extend([vocab.word2id(PAD_TOKEN)]*(max_len - len(articles[i])))
    return np.array(articles).T, np.array(targets)

In [8]:
import random

def calculate_mask(articles):
    mask = (articles == vocab.word2id(PAD_TOKEN))
    mask = np.logical_xor(mask, np.ones(articles.shape))
    return mask

def get_target(self, article, abstract):
    return [ int(i in abstract and i != vocab.word2id(UNKNOWN_TOKEN)) for i in article]

def push_abs_ptr(article, abstract, i, abs_ptr):
    while abs_ptr < len(abstract) and (not abstract[abs_ptr] in article[i+1:] 
                                               or abstract[abs_ptr] == vocab.word2id(SENTENCE_START)
                                               or abstract[abs_ptr] == vocab.word2id(SENTENCE_STOP)
                                               or abstract[abs_ptr] == vocab.word2id(UNKNOWN_TOKEN)):
        abs_ptr += 1
    return abs_ptr

class Batcher():
    
    def __init__(self, filename, batch_size):
        self.batch_size = batch_size
        generator = example_gen(filename)

        self.batches = []
        unknown_words_cnt = 0
        self.articles = []
        self.targets = []
        while True:
            articles = []
            targets = []
            for i in range(batch_size):
                try:
                    article_text, abstract_text = next(generator)
                    article_ids, unknown_words = article2ids(article_text, vocab)
                    unknown_words_cnt += len(unknown_words)
                    target = abstract2ids(abstract_text, vocab, unknown_words)
                    articles.append(article_ids)
                    targets.append(target)
                except:
                    break
            if len(articles) == 0:
                break
            self.articles.extend(articles)
            self.targets.extend(targets)
            articles = add_padding(articles)
            targets = add_padding(targets)
            mask = calculate_mask(articles)
            self.batches.append( (articles, targets, mask) )
        print(len(self.batches))
        print(unknown_words_cnt)
    
    def generator(self):
        for batch in self.batches:
            yield batch
            
    def get_random_batch(self):
        i = random.randint(len(self.articles))
        return self.articles[i], self.targets[i]

In [19]:
from nltk.translate.bleu_score import sentence_bleu

def calculate_bleu(result, target):
    BLEUscore = sentence_bleu([target], result, weights=(1, 0, 0, 0))
    return BLEUscore
    
def calculate_bleu_ngramm(result, target, n):
    cnt = 0
    for i in range(len(result) - n + 1):
        if check_ngramm_in_string(result[i:i+n], target):
            cnt += 1
    return cnt / (len(result) - n + 1)
    
def check_ngramm_in_string(ngramm, target):
    for i in range(len(target) - len(ngramm) + 1):
        flag = True
        for j in range(len(ngramm)):
            flag = flag and (ngramm[j] == target[i + j])
        if flag:
            return True
    return False

def calculate_logits_bleu_and_rouge(logits, target):
    result = logits.argmax(-1).cpu().detach().numpy().T
    bleu = 0.
    rouge = 0.
    for i in range(result.shape[0]):
        bleu += calculate_bleu(result[i], target[i])
        rouge += calculate_bleu(target[i], result[i])
    return bleu / result.shape[0], rouge / result.shape[0]

def generate_summary(article, model):
    symbol = [vocab.word2id(SENTENCE_START)]
    X_batch = LongTensor([article])
    inputs = InputLayerState("input", False, X_batch)
    print(X_batch)
    model.eval_run_encoder(inputs)
    result = []
    i = 0
    while True:
        i += 1
        hidden = model.decode(LongTensor([symbol]))
        result.append(symbol)
        symbol = [hidden.argmax(-1)]
        if (symbol[0] == vocab.word2id(SENTENCE_END) or i > 100):
            break
    return result

def gen_and_print_summary(batcher, model):
    article_text, target, mask = batcher.get_random_sample()
    result = generate_summary(article_text, model)
    result = outputids2words(result, vocab, [])
    target = outputids2words(target, vocab, [])
    print('result is ' + result)
    print('target is ' + target)

In [20]:
def calc_f1(tp, fp, tn, fn):
    precision = tp/(fp + tp)
    recall = tp/(tp + fn)
    f1 = 2*precision*recall/(precision+recall)
    return f1

def precalc_f1(articles_tokens, articles, target):
    mask = calculate_mask(articles_tokens).T
    result = (articles > 0.5)
    #print(result[0])
    #print(articles.shape, target.shape, mask.shape)
    n_res = np.logical_not(result)
    n_tar = np.logical_not(target)
    tp = (result * target * mask).sum()
    fp = (n_res * target * mask).sum()
    tn = (n_res * n_tar * mask).sum()
    fn = (result * n_tar * mask).sum()
    return tp, fp, tn, fn

In [21]:
import math
import time

def do_epoch(model, criterion, data, batch_size, optimizer=None):  
    epoch_loss = 0.
    bleu = 0.
    rouge = 0.
    batch_cnt = 1
    is_train = not optimizer is None
    model.train(is_train)

    with torch.autograd.set_grad_enabled(is_train):
        for i, (article_text, target, mask) in enumerate(data.generator()):
            batch_cnt =  i + 1
            X_batch, y_batch = LongTensor(article_text), LongTensor(target)
            inputs = InputLayerState("input", False, X_batch)
            targetLayer = InputLayerState("target", False, y_batch)
            logits = model.train_run(inputs, targetLayer)
            
            logits = logits.squeeze(-1)
            #print(logits.view(1, -1).shape)
            loss = criterion(logits.view(-1, logits.shape[-1]), y_batch.view(-1))
            epoch_loss += loss.item()
            
            cur_bleu, cur_rouge = calculate_logits_bleu_and_rouge(logits, target)
            bleu += cur_bleu
            rouge += cur_rouge
            
            if is_train:
                optimizer.zero_grad()
                loss.backward()
                #nn.utils.clip_grad_norm_(model.parameters(), 1.)
                optimizer.step()
            
            print('\r[{}]: Loss = {:.4f}, BLEU = {:.4f}, ROUGE = {:.4f}'.format(i, loss.item(), cur_bleu, cur_rouge), end='')
    
    gen_and_print_summary(data, model)
    return epoch_loss, bleu / batch_cnt, rouge / batch_cnt

def fit(model, criterion, optimizer, train_data, epochs_count=1, 
        batch_size=32, val_data=None, val_batch_size=None):
    if not val_data is None and val_batch_size is None:
        val_batch_size = batch_size
        
    for epoch in range(epochs_count):
        start_time = time.time()
        train_loss, bleu, rouge = do_epoch(model, criterion, train_data, batch_size, optimizer)
        output_info = '\rEpoch {} / {}, Epoch Time = {:.2f}s: Train Loss = {:.4f}: BLEU = {:.4f}, ROUGE = {:.4f}'
        if not val_data is None:
            val_loss, bleu, rouge = do_epoch(model, criterion, val_data, val_batch_size, None)
            epoch_time = time.time() - start_time
            output_info += ', Val Loss = {:.4f}'
            print(output_info.format(epoch+1, epochs_count, epoch_time, train_loss, bleu, rouge, val_loss))
        else:
            epoch_time = time.time() - start_time
            print(output_info.format(epoch+1, epochs_count, epoch_time, train_loss, bleu, rouge))

In [22]:
criterion = nn.CrossEntropyLoss(ignore_index = vocab.word2id(PAD_TOKEN)).cuda()
optimizer = optim.Adam(model.parameters())

fit(model, criterion, optimizer, epochs_count=50, batch_size=32, train_data=Batcher("finished_files/chunked/train_000.bin",8),
    val_data=None, val_batch_size=32)

125
8637
[0]: Loss = 3.6884, BLEU = 0.0312, ROUGE = 0.0005

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


[124]: Loss = 4.2220, BLEU = 0.0330, ROUGE = 0.0008tensor([[50000,    55,  6554,    55,    55,    55,    55,  6975]],
       device='cuda:0')


RuntimeError: invalid argument 0: Sizes of tensors must match except in dimension 2. Got 1 and 8 in dimension 1 at c:\a\w\1\s\windows\pytorch\aten\src\thc\generic/THCTensorMath.cu:83

In [None]:
model.save_model("first_try_decoder")

In [None]:
fit(model, criterion, optimizer, epochs_count=50, batch_size=32, train_data="finished_files/chunked/train_000.bin",
    val_data="finished_files/chunked/val_000.bin", val_batch_size=32)