In [None]:
pip install transformers -U

In [2]:
%pylab inline
import random
import numpy as np
from tqdm import tnrange
from time import time
import matplotlib.pyplot as plt
from IPython.display import clear_output

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence

from transformers import BertTokenizer, BertModel

Populating the interactive namespace from numpy and matplotlib


In [0]:
# Pin every random number generator used by the notebook to the same seed (42).
torch.manual_seed(42)
np.random.seed(42)
random.seed(42)
# CUDA generators are seeded only when a GPU is actually visible.
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(42)

In [4]:
# Load the pretrained tokenizer for 'bert-base-uncased' (its vocabulary is downloaded on first use).
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

HBox(children=(IntProgress(value=0, description='Downloading', max=231508, style=ProgressStyle(description_wid…




In [0]:
# Notebook-wide configuration constants.
BATCH_SIZE = 16
# Ids reserved for the special tokens of the summary vocabulary (used by summary_lang).
DEC_SOS_IND = 0
DEC_EOS_IND = 1
DEC_UNK_IND = 2
# NOTE(review): the longest training summary has 499 tokens (501 with <SOS>/<EOS>);
# 502 presumably leaves one position of slack — confirm where this is consumed.
MAX_SUMMARY_LEN = 502
# Same value as the 512-token cap applied in tokenize_data.
MAX_DOC_LEN = 512
# Number of examples per split; presumably mirrors the sizes of the files read by read_multi_news.
N_TRAIN = 44972
N_VAL = 5622
N_TEST = 5622
# presumably the hidden size of the BERT encoder — TODO confirm where it is used.
BERT_DIM = 768

## Reading datasets

In [0]:
def read_multi_news():
    """Read and clean the truncated Multi-News train/val/test files under ``data/``.

    Every file holds one example per line. Both sides are stripped of
    ``@``/``#``/``+``/``-``/``*``-prefixed alphanumeric tokens, URLs, any
    leftover ``http``/``https`` fragments, and have the en dash replaced by a
    space. Source documents additionally get ``story_separator_special_tag``
    replaced with ``[SEP]`` and all commas removed; summaries are split on
    whitespace into token lists.

    Returns:
        ``((train, train_sum), (val, val_sum), (test, test_sum))`` where each
        source split is a list of strings and each summary split is a list of
        token lists.

    Raises:
        OSError: if one of the six expected files cannot be opened.
    """
    # Local import: no visible cell of the notebook imports `re`.
    import re

    # Raw string so that `\+`, `\w`, `\S`, ... are not treated as (invalid) string escapes.
    noise = re.compile(
        r"(@[A-Za-z0-9]+)|(#[A-Za-z0-9]+)|(\+[A-Za-z0-9]+)|(\-[A-Za-z0-9]+)|(\*[A-Za-z0-9]+)|(\w+:\/\/\S+)"
    )

    def _scrub(text):
        # Cleaning shared by both sides; 'https' must go before 'http' so no stray 's' is left behind.
        text = noise.sub("", text)
        return text.replace('https', '').replace('http', '')

    def _records(text):
        # One record per line. Only the empty tail produced by a final newline is dropped,
        # so a file that lacks the trailing newline keeps its last record.
        lines = text.split('\n')
        if lines[-1] == '':
            lines.pop()
        return lines

    def src2dataset(data):
        data = _scrub(data)
        data = data.replace('story_separator_special_tag', '[SEP]').replace(',', '').replace('–', ' ')
        return _records(data)

    def tgt2dataset(data):
        data = _scrub(data).replace('–', ' ')
        return [line.split() for line in _records(data)]

    def _load_split(split):
        # Read the source/target pair of a single split.
        template = 'data/{}.txt.{}.tokenized.fixed.cleaned.final.truncated.txt'
        with open(template.format(split, 'src'), encoding="utf8") as f:
            documents = src2dataset(f.read())
        with open(template.format(split, 'tgt'), encoding="utf8") as f:
            summaries = tgt2dataset(f.read())
        return documents, summaries

    return _load_split('train'), _load_split('val'), _load_split('test')

In [0]:
# Read the cleaned splits: documents are lists of strings, summaries are lists of token lists.
(train, train_sum), (val, val_sum), (test, test_sum) = read_multi_news()

In [16]:
# Length, in whitespace tokens, of the longest training summary.
max(len(tokens) for tokens in train_sum)

499

## Tokenizing documents

In [0]:
def tokenize_data(data, tokenizer, max_length=512):
    """Encode every document in ``data`` into a flat tensor of token ids.

    Args:
        data: sequence of raw document strings.
        tokenizer: object exposing ``encode(text, add_special_tokens=...,
            max_length=..., return_tensors=...)``; the notebook passes a
            ``BertTokenizer``.
        max_length: length cap forwarded to ``tokenizer.encode``. Defaults to
            512, the value that used to be hard-coded (equal to MAX_DOC_LEN).

    Returns:
        A list with one 1-D tensor per document, in input order.
    """
    # tqdm.auto replaces the deprecated `tqdm.tnrange` and picks the notebook
    # widget or the console bar automatically.
    from tqdm.auto import tqdm

    return [
        # encode(..., return_tensors='pt') yields a batch; take its first row and flatten it.
        tokenizer.encode(
            document,
            add_special_tokens=False,
            max_length=max_length,
            return_tensors='pt',
        )[0].view(-1)
        for document in tqdm(data)
    ]

In [0]:
# Replace the raw training documents with their BERT token-id tensors.
# Not idempotent: rerunning this cell requires the raw strings from read_multi_news again.
train = tokenize_data(train, bert_tokenizer)

  This is separate from the ipykernel package so we can avoid doing imports until


HBox(children=(IntProgress(value=0, max=44972), HTML(value='')))




In [0]:
# Cache the tokenized training documents on disk.
torch.save(train, 'data/train2.0.pt')

In [0]:
# Replace the raw validation documents with their BERT token-id tensors (same caveat: not idempotent).
val = tokenize_data(val, bert_tokenizer)

  This is separate from the ipykernel package so we can avoid doing imports until


HBox(children=(IntProgress(value=0, max=5622), HTML(value='')))




In [0]:
# Cache the tokenized validation documents on disk.
torch.save(val, 'data/val2.0.pt')

In [0]:
# Replace the raw test documents with their BERT token-id tensors (same caveat: not idempotent).
test = tokenize_data(test, bert_tokenizer)

  This is separate from the ipykernel package so we can avoid doing imports until


HBox(children=(IntProgress(value=0, max=5622), HTML(value='')))




In [0]:
# Cache the tokenized test documents on disk.
torch.save(test, 'data/test2.0.pt')

## Load datasets

In [0]:
# Reload the token-id tensors written by the torch.save cells above,
# so the slow tokenization cells can be skipped on later runs.
train = torch.load('data/train2.0.pt')
val = torch.load('data/val2.0.pt')
test = torch.load('data/test2.0.pt')

## Tokenizer for summaries

In [0]:
class summary_lang():
    """Word-level vocabulary for the summary (decoder) side.

    Attributes:
        word2id: maps a token to its integer id; starts with the three special
            tokens ``<SOS>``, ``<EOS>`` and ``<UNK>``.
        id2word: inverse mapping of ``word2id``.
        word_counts: number of times each kept token was seen by ``add_summaries``.
        count: next free id; ``len(self)`` returns this value.
    """

    def __init__(self):
        self.word2id = {'<SOS>': DEC_SOS_IND, '<EOS>' : DEC_EOS_IND, '<UNK>' : DEC_UNK_IND}
        self.id2word = {DEC_SOS_IND : '<SOS>', DEC_EOS_IND : '<EOS>', DEC_UNK_IND : '<UNK>'}
        self.word_counts = {}
        # First id handed out to a regular word; assumes the special ids occupy 0..2.
        self.count = 3

    def __len__(self):
        """Return the next free id, i.e. the vocabulary size while ids stay contiguous from 0."""
        return self.count

    def add_summaries(self, summaries, counter, thr=3):
        """Add every sufficiently frequent word of ``summaries`` to the vocabulary.

        Args:
            summaries: iterable of token sequences.
            counter: mapping from token to its corpus frequency (e.g. a
                ``collections.Counter``); tokens missing from it count as 0.
            thr: tokens whose frequency is ``<= thr`` are skipped.
        """
        for summary in summaries:
            for word in summary:
                # `.get` keeps plain dicts usable as `counter`, not only Counter objects.
                if counter.get(word, 0) <= thr:
                    continue
                if word not in self.word2id:
                    self.word2id[word] = self.count
                    self.id2word[self.count] = word
                    self.count += 1
                # The special tokens live in word2id but not in word_counts, so a
                # plain `+= 1` would raise KeyError when one of them shows up here.
                self.word_counts[word] = self.word_counts.get(word, 0) + 1

In [0]:
from collections import Counter

# Corpus-wide frequency of every token occurring in the training summaries.
c = Counter(token for summary in train_sum for token in summary)

In [0]:
# Build the summary vocabulary from the training summaries only;
# tokens seen 3 times or fewer are left out (add_summaries skips counts <= thr).
sum_lang = summary_lang()
sum_lang.add_summaries(train_sum, c, 3)

In [0]:
# Persist the vocabulary object (the whole summary_lang instance is serialized).
torch.save(sum_lang, 'data/vocab2.0.pt')

## Load vocab

In [0]:
# Restore the saved summary_lang instance; the class definition must already have been executed.
# NOTE(review): recent PyTorch releases default torch.load to weights_only=True, which presumably
# rejects custom classes such as summary_lang — confirm against the installed version.
sum_lang = torch.load('data/vocab2.0.pt')

In [0]:
# Size of the summary vocabulary, special tokens included (len() returns sum_lang.count).
dec_vocab_size = len(sum_lang)

In [0]:
# Display the summary vocabulary size.
dec_vocab_size

49295

## Tokenizing summaries

In [0]:
def tokenize_summaries(summaries, lang):
    """Convert tokenized summaries into id tensors framed by ``<SOS>`` and ``<EOS>``.

    Args:
        summaries: sequence of token lists.
        lang: vocabulary object exposing a ``word2id`` dict that contains the
            ``'<SOS>'``, ``'<EOS>'`` and ``'<UNK>'`` keys (e.g. ``summary_lang``).

    Returns:
        A list with one ``torch.long`` tensor of shape ``(1, len(summary) + 2)``
        per summary; tokens missing from ``lang.word2id`` map to the ``<UNK>`` id.
    """
    # tqdm.auto replaces the deprecated `tqdm.tnrange` and picks the notebook
    # widget or the console bar automatically.
    from tqdm.auto import tqdm

    word2id = lang.word2id
    sos_id, eos_id, unk_id = word2id['<SOS>'], word2id['<EOS>'], word2id['<UNK>']

    encodings = []
    for summary in tqdm(summaries):
        ids = [sos_id]
        ids.extend(word2id.get(word, unk_id) for word in summary)
        ids.append(eos_id)
        # The extra list level keeps the leading batch dimension of size 1.
        encodings.append(torch.tensor([ids], dtype=torch.long))
    return encodings

In [0]:
# Replace the training summaries (token lists) with id tensors.
# Not idempotent: rerunning needs the token lists from read_multi_news again.
train_sum = tokenize_summaries(train_sum, sum_lang)

  This is separate from the ipykernel package so we can avoid doing imports until


HBox(children=(IntProgress(value=0, max=44972), HTML(value='')))




In [0]:
# Cache the encoded training summaries on disk.
torch.save(train_sum, 'data/train_sum2.0.pt')

In [0]:
# Replace the validation summaries (token lists) with id tensors (same caveat: not idempotent).
val_sum = tokenize_summaries(val_sum, sum_lang)

  This is separate from the ipykernel package so we can avoid doing imports until


HBox(children=(IntProgress(value=0, max=5622), HTML(value='')))




In [0]:
# Cache the encoded validation summaries on disk.
torch.save(val_sum, 'data/val_sum2.0.pt')

In [0]:
# Replace the test summaries (token lists) with id tensors (same caveat: not idempotent).
test_sum = tokenize_summaries(test_sum, sum_lang)

  This is separate from the ipykernel package so we can avoid doing imports until


HBox(children=(IntProgress(value=0, max=5622), HTML(value='')))




In [0]:
# Cache the encoded test summaries on disk.
torch.save(test_sum, 'data/test_sum2.0.pt')

## Load tokenized summaries

In [0]:
# Number of entries in the BERT tokenizer's vocabulary.
len(bert_tokenizer.vocab)

30522

In [0]:
# Reload the encoded summaries written by the torch.save cells above,
# so the summary tokenization cells can be skipped on later runs.
train_sum = torch.load('data/train_sum2.0.pt')
val_sum = torch.load('data/val_sum2.0.pt')
test_sum = torch.load('data/test_sum2.0.pt')