In [1]:
import csv

import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torch.cuda import FloatTensor, LongTensor

# Seeds NumPy's global RNG only.
# NOTE(review): Batcher.get_random_sample draws from the stdlib `random` module and the
# model weights come from torch; neither is seeded here, so runs are not fully reproducible.
np.random.seed(42)

In [2]:
# Sentence delimiters. Vocab.__init__ refuses a vocab file that lists them, so they never
# receive an id of their own.
SENTENCE_START = '<s>'
SENTENCE_END = '</s>'

PAD_TOKEN = '[PAD]' # This has a vocab id, which is used to pad the encoder input, decoder input and target sequence
UNKNOWN_TOKEN = '[UNK]' # This has a vocab id, which is used to represent out-of-vocabulary words
START_DECODING = '[START]' # This has a vocab id, which is used at the start of every decoder input sequence
STOP_DECODING = '[STOP]' # This has a vocab id, which is used at the end of untruncated target sequences

# Passed to Vocab as max_size (special tokens included) and used to size model layers.
VOCAB_SIZE = 20000
# Maximum number of per-article OOV words that get a temporary id above the vocabulary.
ADDITIONAL_WORDS = 100

class Vocab(object):
  """Two-way mapping between words and integer ids, loaded from a vocabulary file.

  Ids 0-3 are reserved for [UNK], [PAD], [START] and [STOP] (in that order);
  words read from the file receive consecutive ids after that, in file order.
  """

  def __init__(self, vocab_file, max_size):
    """Build the vocabulary from `vocab_file`.

    Args:
      vocab_file: path to a text file whose lines hold exactly two
        whitespace-separated fields; the first field is the word and the
        second is ignored here. Malformed lines are reported and skipped.
      max_size: upper bound on the total number of entries, the four special
        tokens included; 0 disables the bound.

    Raises:
      Exception: if the file lists a reserved token or repeats a word.
    """
    self._word_to_id = {}
    self._id_to_word = {}
    self._count = 0 # keeps track of total number of words in the Vocab

    # [UNK], [PAD], [START] and [STOP] get the ids 0,1,2,3.
    for w in [UNKNOWN_TOKEN, PAD_TOKEN, START_DECODING, STOP_DECODING]:
      self._word_to_id[w] = self._count
      self._id_to_word[self._count] = w
      self._count += 1

    # Read the vocab file and add words up to max_size
    with open(vocab_file, 'r') as vocab_f:
      for line in vocab_f:
        pieces = line.split()
        if len(pieces) != 2:
          print('Warning: incorrectly formatted line in vocabulary file: %s\n' % line)
          continue
        w = pieces[0]
        if w in [SENTENCE_START, SENTENCE_END, UNKNOWN_TOKEN, PAD_TOKEN, START_DECODING, STOP_DECODING]:
          raise Exception('<s>, </s>, [UNK], [PAD], [START] and [STOP] shouldn\'t be in the vocab file, but %s is' % w)
        if w in self._word_to_id:
          raise Exception('Duplicated word in vocabulary file: %s' % w)
        self._word_to_id[w] = self._count
        self._id_to_word[self._count] = w
        self._count += 1
        if max_size != 0 and self._count >= max_size:
          print("max_size of vocab was specified as %i; we now have %i words. Stopping reading." % (max_size, self._count))
          break

    print("Finished constructing vocabulary of %i total words. Last word added: %s" % (self._count, self._id_to_word[self._count-1]))

  def word2id(self, word):
    """Return the id of `word`, or the [UNK] id when the word is not in the vocabulary."""
    if word not in self._word_to_id:
      return self._word_to_id[UNKNOWN_TOKEN]
    return self._word_to_id[word]

  def id2word(self, word_id):
    """Return the word stored under `word_id`.

    Raises:
      ValueError: if `word_id` is not a known id.
    """
    if word_id not in self._id_to_word:
      raise ValueError('Id not found in vocab: %d' % word_id)
    return self._id_to_word[word_id]

  def size(self):
    """Return the total number of entries, special tokens included."""
    return self._count

  def write_metadata(self, fpath):
    """Write every word, in id order, to `fpath` as a one-column TSV without a header row."""
    print("Writing word embedding metadata file to %s..." % (fpath))
    # newline="" lets the csv module control line endings itself (avoids doubled
    # line breaks on platforms that translate "\n").
    with open(fpath, "w", newline="") as f:
      fieldnames = ['word']
      writer = csv.DictWriter(f, delimiter="\t", fieldnames=fieldnames)
      # range, not the Python 2 xrange: the rest of the notebook is Python 3.
      for i in range(self.size()):
        writer.writerow({"word": self._id_to_word[i]})
        


def article2ids(article_words, vocab):
    """Convert an article string to ids, giving in-article OOV words temporary ids.

    Args:
        article_words: article text; tokens are obtained with str.split().
        vocab: Vocab instance.

    Returns:
        (ids, oovs). `oovs` lists every distinct OOV word in order of first
        appearance. In `ids`, the k-th OOV word (0-based) is encoded as
        vocab.size() + k as long as k < ADDITIONAL_WORDS; occurrences of OOV
        words beyond that cap are left out of `ids` entirely.
    """
    ids = []
    oovs = []
    oov_index = {}  # word -> position in `oovs`; replaces a linear list scan per OOV token
    unk_id = vocab.word2id(UNKNOWN_TOKEN)
    for w in article_words.split():
        w = str(w)
        i = vocab.word2id(w)
        if i != unk_id:
            ids.append(i)
            continue
        # w is OOV: look up (or assign) its per-article number.
        oov_num = oov_index.get(w)
        if oov_num is None:
            oov_num = len(oovs)
            oov_index[w] = oov_num
            oovs.append(w)
        if oov_num < ADDITIONAL_WORDS:
            # e.g. vocab.size() for the first article OOV, vocab.size() + 1 for the second...
            ids.append(vocab.size() + oov_num)
    return ids, oovs


def abstract2ids(abstract_words, vocab, article_oovs):
    """Convert an abstract string to ids, reusing the article's temporary OOV ids.

    Args:
        abstract_words: abstract text; tokens are obtained with str.split().
        vocab: Vocab instance.
        article_oovs: OOV word list returned by article2ids for the same article.

    Returns:
        List of ids. An OOV word that is the k-th article OOV (k < ADDITIONAL_WORDS)
        becomes vocab.size() + k; any other OOV word is left out of the result.
    """
    ids = []
    unk_id = vocab.word2id(UNKNOWN_TOKEN)
    # First-occurrence index of each article OOV, matching list.index() semantics.
    oov_index = {}
    for position, word in enumerate(article_oovs):
        oov_index.setdefault(word, position)

    for w in abstract_words.split():
        i = vocab.word2id(w)
        if i != unk_id:
            ids.append(i)
            continue
        oov_num = oov_index.get(w)
        # Same cap as article2ids (oov_num < ADDITIONAL_WORDS). Comparing the absolute id
        # against VOCAB_SIZE + ADDITIONAL_WORDS would disagree with it whenever the
        # vocabulary holds fewer than VOCAB_SIZE entries.
        if oov_num is not None and oov_num < ADDITIONAL_WORDS:
            ids.append(vocab.size() + oov_num)
    return ids

def outputids2words(id_list, vocab, article_oovs):
    """Turn a sequence of ids back into a space-joined string.

    Args:
        id_list: iterable of ids; values >= vocab.size() are temporary article-OOV ids.
        vocab: object providing size() and id2word().
        article_oovs: OOV words of the article, in the order article2ids numbered them.

    Returns:
        The decoded words joined by single spaces. A temporary id with no matching
        entry in `article_oovs` is rendered as UNKNOWN_TOKEN.
    """
    words = []
    # article2ids/abstract2ids start OOV ids at vocab.size(), so that (not VOCAB_SIZE)
    # is the boundary between real vocabulary ids and temporary ones.
    vocab_size = vocab.size()
    for i in id_list:
        if i < vocab_size:
            w = vocab.id2word(i)
        else:
            article_oov_idx = i - vocab_size
            print('unknown generated')
            if article_oov_idx < len(article_oovs):
                w = article_oovs[article_oov_idx]
            else:
                # The model can emit an OOV slot this article never filled.
                w = UNKNOWN_TOKEN
        words.append(w)
    return ' '.join(words)

In [3]:
import struct
from tensorflow.core.example import example_pb2

def example_gen(filename):
    """Yield (article, abstract) string pairs from a binary chunk file.

    The file is read as a sequence of records: an 8-byte length (struct format 'q')
    followed by that many bytes, parsed with example_pb2.Example.FromString. All
    records are loaded before the first pair is yielded, and the file is closed
    before iteration over the pairs starts.

    Args:
        filename: path of the chunk file.

    Yields:
        Tuples (article_text, abstract_text), both decoded as UTF-8 from the first
        value of the 'article' and 'abstract' features.
    """
    examples = []
    # Context manager so the handle is released even if a record fails to parse.
    with open(filename, 'rb') as reader:
        while True:
            len_bytes = reader.read(8)
            if not len_bytes:
                break # finished reading this file
            str_len = struct.unpack('q', len_bytes)[0]
            example_str = struct.unpack('%ds' % str_len, reader.read(str_len))[0]
            examples.append(example_pb2.Example.FromString(example_str))

    for e in examples:
        article_text = e.features.feature['article'].bytes_list.value[0]
        abstract_text = e.features.feature['abstract'].bytes_list.value[0]
        yield (article_text.decode('utf-8'), abstract_text.decode('utf-8'))
        

In [4]:
# Build the vocabulary shared by every later cell, capped at VOCAB_SIZE entries.
vocab = Vocab("finished_files/vocab", VOCAB_SIZE)
# NOTE(review): Vocab.__init__ raises if '<s>' or '</s>' appear in the vocab file, and
# word2id falls back to the [UNK] id for unknown words, so both ids below equal the [UNK] id.
SENTENCE_START_ID = vocab.word2id('<s>')
SENTENCE_END_ID = vocab.word2id('</s>')

# Ids of the four reserved tokens (Vocab assigns [UNK]=0, [PAD]=1, [START]=2, [STOP]=3).
PAD_TOKEN_ID = vocab.word2id('[PAD]')
UNKNOWN_TOKEN_ID = vocab.word2id('[UNK]')
START_DECODING_ID = vocab.word2id('[START]')
STOP_DECODING_ID = vocab.word2id('[STOP]')

max_size of vocab was specified as 20000; we now have 20000 words. Stopping reading.
Finished constructing vocabulary of 20000 total words. Last word added: then-president


In [5]:
from decoder import *

%load_ext autoreload
%autoreload 2

In [7]:
from decoder import *
from pointerTBRU import BeamSearchProviderTBRU, LSTMTBRU, AttentionTBRU, ConcatTBRU, PointerTBRU, ContextTBRU, CoverageAttentionTBRU

def build_decoder_model2():
    """Assemble an alternative encoder/decoder network and return the master object.

    Not called anywhere in the visible notebook (the model is built with
    build_decoder_model). One EmbeddingComputer instance is passed to both the
    encoder "embed" component and the decoder "decoder_embed" component.

    NOTE(review): AttentiveLSTMTBRU is not in the explicit pointerTBRU import list,
    so it presumably comes from `from decoder import *` - confirm it still exists.
    NOTE(review): BeamSearchProviderTBRU is called here with a different positional
    argument layout than in build_decoder_model - verify against its current signature.
    """
    master = DRAGNNDecoderMaster()
    # Sized for vocabulary ids plus the temporary per-article OOV ids.
    # presumably (num_embeddings, embedding_dim, padding id) - TODO confirm
    embeddings_computer = EmbeddingComputer(VOCAB_SIZE + ADDITIONAL_WORDS, 100, PAD_TOKEN_ID)
    # Encoder side: "embed" reads layer "input", "rnn" reads layer "embed".
    master.add_component_encoder(TBRU("embed", TaggerRecurrent("input", "embed", False), embeddings_computer, (1,), True).cuda())
    #master.add_component(TBRU("extractive", TaggerRecurrent("embed", "extractive"), TaggerComputer(1000, 1000), (1,), True).cuda())
    master.add_component_encoder(TBRU("rnn", RNNSolidRecurrent("embed", "rnn"), RNNSolidComputer(100, 50), (1,), True).cuda())
    
    # Decoder side, registered in the order given below.
    master.add_component_decoder(TBRU("decoder_embed", TaggerRecurrent(None, "decoder_embed", True),embeddings_computer, (1,), True).cuda())
    master.add_component_decoder(BeamSearchProviderTBRU("beamProvider", (1,), None, "decoder", True, False))
    pTBRU = AttentiveLSTMTBRU("decoder", (1,), True, 100, 100, 100, "rnn", "decoder_embed", False)
    master.add_component_decoder(pTBRU.cuda())
    # Output component is sized VOCAB_SIZE, i.e. without the ADDITIONAL_WORDS OOV slots.
    master.add_component_decoder(TBRU("output", TaggerRecurrent("decoder", "output", False), TaggerComputer(100, VOCAB_SIZE), (1,), True).cuda())
    return master

def build_decoder_model():
    """Assemble the encoder/decoder network used by this notebook and return the master object.

    Encoder components: "embed" -> "rnn". Decoder components, in registration order:
    "decoder_embed", three BeamSearchProviderTBRU instances, the LSTM "decoder",
    "attention_layer" (CoverageAttentionTBRU), "context", "context_concat", "output"
    and "pointer_final". One EmbeddingComputer instance is passed to both "embed" and
    "decoder_embed".

    NOTE(review): the meaning of the numeric and boolean positional arguments is
    defined by the project's decoder / pointerTBRU modules and cannot be verified
    from this notebook.
    """
    master = DRAGNNDecoderMaster()
    # Sized for vocabulary ids plus the temporary per-article OOV ids.
    # presumably (num_embeddings, embedding_dim, padding id) - TODO confirm
    embeddings_computer = EmbeddingComputer(VOCAB_SIZE + ADDITIONAL_WORDS, 100, PAD_TOKEN_ID)
    master.add_component_encoder(TBRU("embed", TaggerRecurrent("input", "embed", False), embeddings_computer, (1,), True).cuda())
    #master.add_component(TBRU("extractive", TaggerRecurrent("embed", "extractive"), TaggerComputer(1000, 1000), (1,), True).cuda())
    master.add_component_encoder(TBRU("rnn", RNNSolidRecurrent("embed", "rnn"), RNNSolidComputer(100, 50), (1,), True).cuda())
    
    
    master.add_component_decoder(TBRU("decoder_embed", TaggerRecurrent(None, "decoder_embed", True),embeddings_computer, (1,), is_solid=True).cuda())
    
    # One provider per named layer ("decoder", "lstm_state_layer", "coverage_layer").
    # presumably these re-index per-beam state between decoding steps - TODO confirm
    master.add_component_decoder(BeamSearchProviderTBRU("beamProvider", None, 0, "decoder", True, is_solid=False))
    master.add_component_decoder(BeamSearchProviderTBRU("beamProvider1", None, 0, "lstm_state_layer", True, is_solid=False))
    master.add_component_decoder(BeamSearchProviderTBRU("beamProvider2", None, 1, "coverage_layer", True, is_solid=False))
    
    
    pTBRU = LSTMTBRU("decoder", "lstm_state_layer", False, 100, 100, "rnn", "decoder_embed", False)
    master.add_component_decoder(pTBRU.cuda())
    master.add_component_decoder(CoverageAttentionTBRU("attention_layer", "coverage_layer", False, 100, 100, 100, 'decoder','rnn', is_first=False, solid_modifiable=False).cuda())
    # "context" and "context_concat" are the only decoder components added without .cuda().
    master.add_component_decoder(ContextTBRU('context', False, 'attention_layer', 'rnn', is_first=False, solid_modifiable=False))
    master.add_component_decoder(ConcatTBRU('context_concat', False, 'decoder', 'context', is_first=False, solid_modifiable=False))
    # "output" is sized VOCAB_SIZE; "pointer_final" is sized VOCAB_SIZE + ADDITIONAL_WORDS.
    master.add_component_decoder(TBRU("output", TaggerRecurrent("context_concat", "output", False), TaggerComputer(200, VOCAB_SIZE), (1,), is_solid=False, solid_modifiable=False).cuda())
    master.add_component_decoder(PointerTBRU("pointer_final", False, 300, VOCAB_SIZE + ADDITIONAL_WORDS, "attention_layer", "context", "output", "decoder_embed", "input", "decoder", solid_modifiable=False).cuda())
    return master

# Build the network used by every later cell, then display the GPU memory torch
# currently has allocated (in bytes) as a rough size check.
model = build_decoder_model()
torch.cuda.memory_allocated()

24981504

In [8]:
def add_padding(articles, pad_id=None):
    """Right-pad id sequences to a common length and stack them column-wise.

    The input lists are left untouched; padded copies are built instead, so a
    caller that also keeps the raw sequences does not see them grow.

    Args:
        articles: list of id lists, one per sample.
        pad_id: id appended as padding; defaults to the vocabulary id of PAD_TOKEN.

    Returns:
        np.ndarray of shape (max_len, len(articles)); an empty array for empty input.
    """
    if pad_id is None:
        pad_id = vocab.word2id(PAD_TOKEN)
    max_len = max((len(article) for article in articles), default=0)
    padded = [list(article) + [pad_id] * (max_len - len(article)) for article in articles]
    return np.array(padded).T

def add_padding_for_tagging(articles, targets, pad_id=None):
    """Pad parallel article-id and per-token tag sequences to a common length.

    Each target list is extended with 0 by the same amount its article is extended
    with `pad_id`. The input lists are not modified.

    Args:
        articles: list of id lists, one per sample.
        targets: list of tag lists, parallel to `articles`.
        pad_id: id appended to articles; defaults to the vocabulary id of PAD_TOKEN.

    Returns:
        (articles_array, targets_array): the articles stacked column-wise
        (transposed, like add_padding) and the targets stacked row-wise (not transposed).
    """
    if pad_id is None:
        pad_id = vocab.word2id(PAD_TOKEN)
    max_len = max((len(article) for article in articles), default=0)

    padded_articles = []
    padded_targets = []
    for i, article in enumerate(articles):
        missing = max_len - len(article)
        padded_targets.append(list(targets[i]) + [0] * missing)
        padded_articles.append(list(article) + [pad_id] * missing)
    return np.array(padded_articles).T, np.array(padded_targets)

In [9]:
import random

def calculate_mask(articles):
    """Return a boolean array shaped like `articles`: True where the id is not the [PAD] id."""
    pad_id = vocab.word2id(PAD_TOKEN)
    # Start from all-True and switch off the padded positions.
    keep = np.ones(articles.shape, dtype=bool)
    keep[articles == pad_id] = False
    return keep

def get_target(self, article, abstract):
    """Return one 0/1 flag per article id: 1 when the id also occurs in `abstract` and is not the [UNK] id.

    `self` is accepted but never used.
    """
    flags = []
    for token_id in article:
        # The [UNK] lookup only happens for ids that do occur in the abstract.
        selected = token_id in abstract and token_id != vocab.word2id(UNKNOWN_TOKEN)
        flags.append(int(selected))
    return flags

def push_abs_ptr(article, abstract, i, abs_ptr):
    """Advance `abs_ptr` past abstract ids that cannot be matched later in the article.

    An abstract id is skipped when it does not occur in article[i+1:], or when it
    equals the id that the vocabulary returns for SENTENCE_START, SENTENCE_END or
    UNKNOWN_TOKEN.

    Args:
        article: sequence of article ids.
        abstract: sequence of abstract ids.
        i: current article position; only ids after it are considered.
        abs_ptr: current position in `abstract`.

    Returns:
        The first position >= abs_ptr whose id is not skipped, or len(abstract)
        if there is none.
    """
    remaining = article[i+1:]  # loop-invariant, so slice once
    while abs_ptr < len(abstract):
        token = abstract[abs_ptr]
        # SENTENCE_END is the constant this file defines; the previously referenced
        # SENTENCE_STOP does not exist and raised NameError once this check was reached.
        matchable = (token in remaining
                     and token != vocab.word2id(SENTENCE_START)
                     and token != vocab.word2id(SENTENCE_END)
                     and token != vocab.word2id(UNKNOWN_TOKEN))
        if matchable:
            break
        abs_ptr += 1
    return abs_ptr

class Batcher():
    """Loads every example of one chunk file up front and groups them into padded batches.

    Attributes:
        batches: list of (articles, targets, mask, decoder_inputs) tuples; the three
            id arrays come from add_padding and the mask from calculate_mask.
        articles, targets, decoder_inputs, unknown_words: per-sample unpadded lists,
            in file order, used by get_random_sample.
    """

    def __init__(self, filename, batch_size, max_article_len, max_target_len):
        """Read `filename` and build all batches.

        Args:
            filename: chunk file understood by example_gen.
            batch_size: number of samples per batch (the last batch may be smaller).
            max_article_len: articles are truncated to this many ids.
            max_target_len: maximum length of each target / decoder-input sequence.
        """
        self.batch_size = batch_size
        generator = example_gen(filename)

        self.batches = []
        unknown_words_cnt = 0
        self.articles = []
        self.targets = []
        self.unknown_words = []
        self.decoder_inputs = []
        exhausted = False
        while not exhausted:
            articles = []
            targets = []
            unknown_words = []
            decoder_inputs = []
            for _ in range(batch_size):
                # Only the end of the data stops reading; any other error (missing file,
                # corrupt record, bug in the id conversion) now surfaces instead of
                # silently producing a truncated or empty data set.
                try:
                    article_text, abstract_text = next(generator)
                except StopIteration:
                    exhausted = True
                    break
                article_ids, unknown_words_list = article2ids(article_text, vocab)
                article_ids = article_ids[:max_article_len]
                abstract_ids = abstract2ids(abstract_text, vocab, unknown_words_list)
                # +1 leaves room for [STOP] / [START]: a short abstract keeps its last
                # token, and an empty abstract gives target and decoder input the same length.
                tar_len = min(max_target_len, len(abstract_ids) + 1)
                target = abstract_ids[:tar_len - 1]
                target.append(STOP_DECODING_ID)
                decoder_input = ([START_DECODING_ID] + abstract_ids)[:tar_len]
                articles.append(article_ids)
                targets.append(target)
                decoder_inputs.append(decoder_input)
                unknown_words.append(unknown_words_list)
                unknown_words_cnt = max(len(unknown_words_list), unknown_words_cnt)
            if len(articles) == 0:
                break
            self.articles.extend(articles)
            self.targets.extend(targets)
            self.unknown_words.extend(unknown_words)
            self.decoder_inputs.extend(decoder_inputs)
            # Pad copies, so the per-sample lists stored above stay unpadded even if
            # the padding helper extends its argument in place.
            articles = add_padding([list(seq) for seq in articles])
            targets = add_padding([list(seq) for seq in targets])
            decoder_inputs = add_padding([list(seq) for seq in decoder_inputs])
            mask = calculate_mask(articles)
            self.batches.append( (articles, targets, mask, decoder_inputs) )
        print(len(self.batches))
        print(unknown_words_cnt)

    def generator(self):
        """Yield the prepared batches in file order."""
        for batch in self.batches:
            yield batch

    def get_random_sample(self):
        """Pick one stored sample uniformly at random.

        Returns:
            (article, target, decoder_input, unknown_words): article and decoder_input
            as column arrays of shape (len, 1), target as a 1-D array, and the
            sample's OOV word list.
        """
        i = random.randint(0, len(self.articles) - 1)
        return np.array([self.articles[i]]).T, np.array(self.targets[i]), np.array([self.decoder_inputs[i]]).T, self.unknown_words[i]

In [10]:
from nltk.translate.bleu_score import sentence_bleu

# NOTE(review): these seven assignments repeat, value for value, the ones made right
# after the vocabulary is built earlier in the notebook; kept so this cell can be
# re-run on its own.
SENTENCE_START_ID = vocab.word2id('<s>')
SENTENCE_END_ID = vocab.word2id('</s>')

PAD_TOKEN_ID = vocab.word2id('[PAD]')
UNKNOWN_TOKEN_ID = vocab.word2id('[UNK]')
START_DECODING_ID = vocab.word2id('[START]')
STOP_DECODING_ID = vocab.word2id('[STOP]')

def calculate_bleu(result, target, weights):
    """Sentence-level BLEU of a predicted id sequence against a single reference.

    Both sequences are cut at their first [STOP] id (exclusive) and [PAD] ids are
    dropped from the reference before scoring. Neither argument is modified.

    Args:
        result: predicted ids, a list or an object with tolist().
        target: reference ids, a list or an object with tolist().
        weights: n-gram weights forwarded to nltk's sentence_bleu.

    Returns:
        The BLEU score returned by sentence_bleu.
    """
    if not isinstance(result, list):
        result = result.tolist()
    if STOP_DECODING_ID in result:
        result = result[:result.index(STOP_DECODING_ID)]
    if not isinstance(target, list):
        target = target.tolist()
    if STOP_DECODING_ID in target:
        target = target[:target.index(STOP_DECODING_ID)]
    # Filter into a new list: repeated target.remove() rescans the list for every
    # pad id and also edits a list passed in by the caller.
    target = [token for token in target if token != PAD_TOKEN_ID]
    BLEUscore = sentence_bleu([target], result, weights=weights)
    return BLEUscore
    
def calculate_bleu_ngramm(result, target, n):
    """Return the fraction of the n-grams of `result` that occur contiguously in `target`.

    Args:
        result: candidate sequence.
        target: reference sequence.
        n: n-gram length.

    Returns:
        A float in [0, 1]; 0.0 when `result` is too short to contain any n-gram.
    """
    total = len(result) - n + 1
    if total <= 0:
        # No n-grams to score; dividing would raise ZeroDivisionError (total == 0).
        return 0.0
    cnt = 0
    for i in range(total):
        if check_ngramm_in_string(result[i:i+n], target):
            cnt += 1
    return cnt / total
    
def check_ngramm_in_string(ngramm, target):
    """Return True if `ngramm` occurs as a contiguous run inside `target`, else False."""
    width = len(ngramm)
    for start in range(len(target) - width + 1):
        # Element-by-element comparison of the window beginning at `start`.
        if all(ngramm[offset] == target[start + offset] for offset in range(width)):
            return True
    return False

def calculate_logits_bleu_and_rouge(logits, target, weights):
    """Average two BLEU-style scores of the arg-max predictions over a batch.

    The arg-max over the last axis of `logits` and `target` are both transposed and
    then compared row by row. The value returned as "rouge" is calculate_bleu with
    prediction and reference swapped, not a separate ROUGE implementation.

    Args:
        logits: tensor of scores; presumably (steps, batch, vocab) - TODO confirm.
        target: numpy array of reference ids with the same layout before transposing.
        weights: n-gram weights forwarded to calculate_bleu.

    Returns:
        (mean_bleu, mean_rouge) over the rows.
    """
    predicted = logits.argmax(-1).cpu().detach().numpy().T
    reference = target.T
    n_rows = predicted.shape[0]

    bleu_sum = 0.
    rouge_sum = 0.
    for row in range(n_rows):
        hypothesis = predicted[row]
        gold = reference[row]
        bleu_sum += calculate_bleu(hypothesis, gold, weights)
        rouge_sum += calculate_bleu(gold, hypothesis, weights)
    return bleu_sum / n_rows, rouge_sum / n_rows

def generate_summary(article, model, beam_width, max_steps=40):
    """Decode a summary for one article with a fixed-length beam search.

    Args:
        article: article ids accepted by LongTensor; presumably a (len, 1) column
            array as produced by Batcher.get_random_sample - TODO confirm.
        model: object providing eval_run_encoder() and decode().
        beam_width: number of hypotheses kept after every step.
        max_steps: number of decoding steps (defaults to the previous fixed 40).
            Decoding does not stop early on [STOP].

    Returns:
        numpy array with the ids of the highest-scoring hypothesis.
    """
    symbols = [START_DECODING_ID]
    beam_ids = [0]
    probs = [1.]
    result = np.array([[]])  # one empty hypothesis to start from
    X_batch = LongTensor(article)
    inputs = InputLayerState("input", False, X_batch)
    model.eval_run_encoder(inputs)
    for _ in range(max_steps):
        hidden = model.decode((LongTensor([symbols]), LongTensor(beam_ids)))
        new_probs = []
        new_result = []
        # `beam` (not `i`) so the step counter of the outer loop is not shadowed.
        for beam, _symbol in enumerate(symbols):
            values, indices = hidden[beam].topk(beam_width)
            # NOTE(review): scoring rule kept as found; confirm whether `hidden` holds
            # probabilities or log-probabilities.
            new_probs.extend(((values + 1) * probs[beam]).cpu().detach().tolist())
            for j in range(beam_width):
                tmp = result[beam].tolist()
                tmp.append(indices[j].item())
                new_result.append(tmp)
        # argsort is ascending: the kept hypotheses are ordered worst -> best.
        top_idx = np.argsort(new_probs)[-beam_width:]
        probs = np.array(new_probs)[top_idx]
        result = np.array(new_result)[top_idx]
        symbols = result[:,-1]
        # Every parent beam contributed beam_width candidates, in order.
        beam_ids = top_idx // beam_width
        #symbol[0] == STOP_DECODING_ID
    # Last row has the highest score; row 0 would be the weakest kept hypothesis.
    return result[-1]

def gen_and_print_summary(batcher, model, beam_width):
    """Generate a summary for one random sample and print it with its reference and BLEU.

    Args:
        batcher: object providing get_random_sample().
        model: model passed on to generate_summary.
        beam_width: beam width passed on to generate_summary.
    """
    article_text, target, decoder_input, unk_words = batcher.get_random_sample()
    result = generate_summary(article_text, model, beam_width)
    result = outputids2words(result, vocab, unk_words)
    target = outputids2words(target, vocab, unk_words)
    print('result is \n' + result)
    print('target is \n' + target)
    # sentence_bleu works on token sequences; handing it the joined strings would
    # score character n-grams instead of word n-grams.
    bleu = sentence_bleu([target.split()], result.split(), weights=(0.33, 0.33, 0.33, 0))
    print('BLEU = {:.4f}'.format(bleu))
    
def gen_and_print_summary_by_target(batcher, model):
    """Run one random sample through model.train_run with its reference decoder input.

    Prints the (bleu, rouge) pair from calculate_logits_bleu_and_rouge, then the
    arg-max prediction and the reference as text.
    """
    sample = batcher.get_random_sample()
    article_ids, target_ids, decoder_ids, oov_words = sample

    encoder_tensor = LongTensor(article_ids)
    target_tensor = LongTensor(target_ids)  # built but not used further below
    decoder_tensor = LongTensor(decoder_ids)
    encoder_state = InputLayerState("input", True, encoder_tensor)
    decoder_state = InputLayerState("target", True, decoder_tensor)
    logits = model.train_run(encoder_state, decoder_state)

    # The metric helper receives the reference as a column array.
    reference_column = np.array([target_ids]).T
    scores = calculate_logits_bleu_and_rouge(logits, reference_column, (0.33,0.33,0.33,0))
    print(scores)

    predicted_ids = logits.argmax(-1).squeeze(0).squeeze(-1).cpu().detach().tolist()
    predicted_text = outputids2words(predicted_ids, vocab, oov_words)
    reference_text = outputids2words(target_ids, vocab, oov_words)
    print('result is \n' + predicted_text)
    print('target is \n' + reference_text)

In [11]:
def calc_f1(tp, fp, tn, fn):
    """Compute the F1 score from confusion-matrix counts.

    Args:
        tp: true positives.
        fp: false positives.
        tn: true negatives (accepted for a uniform signature; not used).
        fn: false negatives.

    Returns:
        2 * precision * recall / (precision + recall), or 0.0 when precision or
        recall is undefined (no predicted or no actual positives) or both are zero.
    """
    if tp + fp == 0 or tp + fn == 0:
        return 0.0
    precision = tp/(fp + tp)
    recall = tp/(tp + fn)
    if precision + recall == 0:
        return 0.0
    f1 = 2*precision*recall/(precision+recall)
    return f1

def precalc_f1(articles_tokens, articles, target):
    """Count confusion-matrix entries over the non-padding positions.

    Args:
        articles_tokens: id array passed to calculate_mask; its transposed mask
            selects the positions that count.
        articles: per-position scores; a position is predicted positive when its
            score is > 0.5. Presumably the same shape as `target` - TODO confirm.
        target: per-position 0/1 labels.

    Returns:
        (tp, fp, tn, fn) as sums over the masked positions.
    """
    mask = calculate_mask(articles_tokens).T
    result = (articles > 0.5)
    n_res = np.logical_not(result)
    n_tar = np.logical_not(target)
    tp = (result * target * mask).sum()
    # Predicted positive but labelled negative = false positive. The two formulas
    # below were previously assigned the other way round.
    fp = (result * n_tar * mask).sum()
    tn = (n_res * n_tar * mask).sum()
    # Predicted negative but labelled positive = false negative.
    fn = (n_res * target * mask).sum()
    return tp, fp, tn, fn

In [12]:
# Load the first training chunk: batches of 8 samples, articles cut to 1000 ids,
# targets to 50. Batcher prints the number of batches and the largest per-article OOV count.
train_data=Batcher("finished_files/chunked/train_000.bin", 8, 1000, 50)

125
146


In [13]:
# Smoke test of the decoding path with beam width 10; no training cell has run before
# this point in the notebook's cell order, so the printed summary is not meaningful yet.
gen_and_print_summary(train_data, model, 10)

torch.Size([1, 100])
torch.Size([1, 100])
torch.Size([10, 100])
torch.Size([10, 100])
torch.Size([10, 100])
torch.Size([10, 100])
torch.Size([10, 100])
torch.Size([10, 100])
torch.Size([10, 100])
torch.Size([10, 100])
torch.Size([10, 100])
torch.Size([10, 100])
torch.Size([10, 100])
torch.Size([10, 100])
torch.Size([10, 100])
torch.Size([10, 100])
torch.Size([10, 100])
torch.Size([10, 100])
torch.Size([10, 100])
torch.Size([10, 100])
torch.Size([10, 100])
torch.Size([10, 100])
torch.Size([10, 100])
torch.Size([10, 100])
torch.Size([10, 100])
torch.Size([10, 100])
torch.Size([10, 100])
torch.Size([10, 100])
torch.Size([10, 100])
torch.Size([10, 100])
torch.Size([10, 100])
torch.Size([10, 100])
torch.Size([10, 100])
torch.Size([10, 100])
torch.Size([10, 100])
torch.Size([10, 100])
torch.Size([10, 100])
torch.Size([10, 100])
torch.Size([10, 100])
torch.Size([10, 100])
torch.Size([10, 100])
torch.Size([10, 100])
torch.Size([10, 100])
torch.Size([10, 100])
torch.Size([10, 100])
torch.Size([

In [14]:
import math
import time

def do_epoch(model, criterion, data, batch_size, bleu_weights, optimizer=None):
    """Run one pass over `data`; trains when an optimizer is given, otherwise only evaluates.

    Args:
        model: network providing train(), train_run() and parameters.
        criterion: loss applied to the flattened logits and flattened targets.
        data: object providing generator() that yields
            (articles, targets, mask, decoder_inputs) batches.
        batch_size: accepted but not read inside this function.
        bleu_weights: n-gram weights for the BLEU/"ROUGE" metrics.
        optimizer: optimizer to step after every batch, or None for evaluation.

    Returns:
        (summed_loss, mean_bleu, mean_rouge) over the batches.

    NOTE(review): the run recorded in this notebook stopped with "ValueError: Expected
    input batch_size (8) to match target batch_size (160)", presumably raised by the
    criterion call below - confirm the shape train_run returns.
    """
    training = optimizer is not None
    loss_sum = 0.
    bleu_sum = 0.
    rouge_sum = 0.
    n_batches = 1
    model.train(training)

    with torch.autograd.set_grad_enabled(training):
        for step, batch in enumerate(data.generator()):
            article_text, target, mask, decoder_inputs = batch
            n_batches = step + 1

            X_batch = LongTensor(article_text)
            y_batch = LongTensor(target)
            decoder_batch = LongTensor(decoder_inputs)
            encoder_state = InputLayerState("input", True, X_batch)
            decoder_state = InputLayerState("target", True, decoder_batch)
            logits = model.train_run(encoder_state, decoder_state)

            # Flatten every leading axis so the loss sees one row per target id.
            flat_logits = logits.view(-1, logits.shape[-1])
            flat_targets = y_batch.view(-1)
            loss = criterion(flat_logits, flat_targets)
            loss_sum += loss.item()

            batch_bleu, batch_rouge = calculate_logits_bleu_and_rouge(logits, target, bleu_weights)
            bleu_sum += batch_bleu
            rouge_sum += batch_rouge

            if training:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            progress = '\r[{}]: Loss = {:.4f}, BLEU = {:.4f}, ROUGE = {:.4f}'.format(step, loss.item(), batch_bleu, batch_rouge)
            print(progress, end='')

    print()
    # Qualitative check after every epoch: one random sample, beam width 10.
    gen_and_print_summary(data, model, 10)
    return loss_sum, bleu_sum / n_batches, rouge_sum / n_batches

def fit(model, criterion, optimizer, train_data, epochs_count=1, 
        batch_size=32, val_data=None, val_batch_size=None):
    """Train for `epochs_count` epochs, optionally evaluating after each one.

    Args:
        model, criterion, optimizer: forwarded to do_epoch.
        train_data: training data object accepted by do_epoch.
        epochs_count: number of epochs.
        batch_size: forwarded to do_epoch for the training pass.
        val_data: optional validation data object; None skips validation.
        val_batch_size: forwarded to do_epoch for validation; defaults to batch_size.

    Prints one summary line per epoch. Train BLEU/ROUGE are always the training
    values; validation loss/BLEU/ROUGE are appended when val_data is given.
    """
    if val_data is not None and val_batch_size is None:
        val_batch_size = batch_size

    bleu_weights = (0.5, 0.5, 0, 0)
    for epoch in range(epochs_count):
        start_time = time.time()
        train_loss, train_bleu, train_rouge = do_epoch(model, criterion, train_data, batch_size, bleu_weights, optimizer)
        output_info = '\rEpoch {} / {}, Epoch Time = {:.2f}s: Train Loss = {:.4f}: BLEU = {:.4f}, ROUGE = {:.4f}'
        extra_fields = ()
        if val_data is not None:
            # Separate names: reusing bleu/rouge here replaced the training metrics
            # with validation ones while the line still labelled them as training.
            val_loss, val_bleu, val_rouge = do_epoch(model, criterion, val_data, val_batch_size, bleu_weights, None)
            output_info += ', Val Loss = {:.4f}, Val BLEU = {:.4f}, Val ROUGE = {:.4f}'
            extra_fields = (val_loss, val_bleu, val_rouge)
        epoch_time = time.time() - start_time
        print(output_info.format(epoch+1, epochs_count, epoch_time, train_loss, train_bleu, train_rouge, *extra_fields))

In [15]:
# Cross-entropy that skips target positions holding the [PAD] id; Adam with its defaults.
criterion = nn.CrossEntropyLoss(ignore_index = vocab.word2id(PAD_TOKEN)).cuda()
optimizer = optim.Adam(model.parameters())

# NOTE(review): batch_size / val_batch_size are only forwarded to do_epoch, which never
# reads them; the effective batch size is the 8 given to Batcher (articles cut to 40 ids,
# targets to 20).
fit(model, criterion, optimizer, epochs_count=50, batch_size=32, train_data=Batcher("finished_files/chunked/train_000.bin",8, 40, 20),
    val_data=None, val_batch_size=32)

125
146
torch.Size([8, 100])
torch.Size([8, 100])
torch.Size([8, 100])
torch.Size([8, 100])


ValueError: Expected input batch_size (8) to match target batch_size (160).

In [None]:
# Persist the model under the name "first_try_decoder".
# NOTE(review): save_model is defined by the project's model class; where and in what
# format it writes is not visible from this notebook.
model.save_model("first_try_decoder")

In [None]:
# do_epoch iterates data.generator(), so both data sets have to be Batcher objects;
# passing the file paths directly fails with AttributeError on the str.
# Batcher settings (8 per batch, 40 article ids, 20 target ids) match the training cell above.
fit(model, criterion, optimizer, epochs_count=50, batch_size=32,
    train_data=Batcher("finished_files/chunked/train_000.bin", 8, 40, 20),
    val_data=Batcher("finished_files/chunked/val_000.bin", 8, 40, 20), val_batch_size=32)