# Summary

In [1]:
from abstractor.train import get_training_batch as get_abstractor_training_batch
from abstractor.utils import AbstractorModel, AbstractorModelRNN
from abstractor.utils import obtain_initial_hidden_states
from bert.utils import obtain_sentence_embeddings
from bert.utils import obtain_word_embeddings
from data.utils import load_training_dictionaries
from extractor.train import get_training_batch as get_extractor_training_batch
from extractor.utils import ExtractorModel
from pytorch_transformers import BertModel
from pytorch_transformers import BertTokenizer
from rl_connection.utils import RLModel
from rl_connection.train import get_training_batch as get_rl_training_batch
from rouge import Rouge

import numpy as np
import torch

## Load data

In [2]:
# Both the tokenizer and the encoder are loaded from the same pretrained checkpoint name.
BERT_CHECKPOINT = 'bert-base-uncased'

bert_tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
bert_model = BertModel.from_pretrained(BERT_CHECKPOINT)

In [3]:
# Load the training dictionaries; `data` is what the get_*_training_batch helpers
# below draw their example batches from.
data = load_training_dictionaries()

## Extractor

In [4]:
# Build the extractor around the shared BERT tokenizer/model and restore the weights
# saved at `extractor_model_path`.
extractor_model = ExtractorModel(bert_tokenizer, bert_model)
extractor_model_path = "results/models/extractor.pt"

# map_location="cpu" lets a checkpoint that was saved from a GPU be read on a CPU-only
# machine; load_state_dict then copies the values into the model's existing parameters.
extractor_model.load_state_dict(torch.load(extractor_model_path, map_location="cpu"))

<All keys matched successfully>

In [5]:
# Draw a small batch of documents together with their extraction labels.
documents, extraction_labels = get_extractor_training_batch(data, batch_size=2)

# Embed the sentences using the BERT model and tokenizer held by the extractor.
embedding_inputs = (extractor_model.bert_model, extractor_model.bert_tokenizer, documents)
sentence_embeddings, mask = obtain_sentence_embeddings(*embedding_inputs)

# Run the extractor on the sentence embeddings to get one extraction probability per sentence.
extraction_probabilities = extractor_model(sentence_embeddings)

In [6]:
# Compare, per document, the labelled target sentences with the extractor's top-scoring ones.
n_samples = len(documents)

# Number of labelled sentences per document; computed once rather than on every iteration.
n_labelled_per_document = extraction_labels.sum(dim=1)

for sample_idx in range(n_samples):
    sentences = documents[sample_idx]

    # torch.topk expects a plain Python int for `k`, not a 0-dim tensor.
    n_to_extract = int(n_labelled_per_document[sample_idx])

    # Multiplying by the mask zeroes the score wherever the mask is 0.
    ext_prob = extraction_probabilities[sample_idx] * mask[sample_idx]

    # Convert the selected positions to Python ints so they can index the sentence list directly.
    ext_sent_indices = torch.topk(ext_prob, k=n_to_extract)[1].tolist()

    # The label row is trimmed to the document's own length before being used as a boolean selector.
    is_target = extraction_labels[sample_idx][:len(sentences)].numpy().astype(bool)
    targets = np.array(sentences)[is_target]

    print("----> TARGET <----")
    for target in targets:
        print(f"{target}\n")
    print()

    print("----> PREDICTION <----")
    for sentence in (sentences[idx] for idx in ext_sent_indices):
        print(f"{sentence} \n")
    print("\n\n-------\n\n")

----> TARGET <----
that may sound like an esoteric adage , but when zully broussard selflessly decided to give one of her kidneys to a stranger , her generosity paired up with big data . it resulted in six patients receiving transplants .

that changed when a computer programmer named david jacobs received a kidney transplant . he had been waiting on a deceased donor list , when a live donor came along -- someone nice enough to give away a kidney to a stranger .


----> PREDICTION <----
that changed when a computer programmer named david jacobs received a kidney transplant . he had been waiting on a deceased donor list , when a live donor came along -- someone nice enough to give away a kidney to a stranger . 

that may sound like an esoteric adage , but when zully broussard selflessly decided to give one of her kidneys to a stranger , her generosity paired up with big data . it resulted in six patients receiving transplants . 



-------


----> TARGET <----
" weasels will go for anyt

## Abstractor

In [7]:
# Load abstractor model: build it around the shared BERT tokenizer/model and restore the
# weights saved at `abstractor_model_path`.
abstractor_model = AbstractorModelRNN(bert_tokenizer, bert_model)
abstractor_model_path = "results/models/abstractor.pt"

# map_location="cpu" lets a checkpoint that was saved from a GPU be read on a CPU-only
# machine; load_state_dict then copies the values into the model's existing parameters.
abstractor_model.load_state_dict(torch.load(abstractor_model_path, map_location="cpu"))

<All keys matched successfully>

In [8]:
# Draw a batch of (source documents, target summaries) for the abstractor.
source_documents, target_summaries = get_abstractor_training_batch(data, 2)

# Word-level embeddings for both sides, produced with the abstractor's BERT model and tokenizer.
# The source side is requested with static_embeddings=False, the target side with
# static_embeddings=True.
source_document_embeddings, source_mask, source_tokens = obtain_word_embeddings(
    abstractor_model.bert_model,
    abstractor_model.bert_tokenizer,
    source_documents,
    static_embeddings=False,
)
target_summary_embeddings, target_mask, target_tokens = obtain_word_embeddings(
    abstractor_model.bert_model,
    abstractor_model.bert_tokenizer,
    target_summaries,
    static_embeddings=True,
)

In [9]:
# Print the abstractor's highest-scoring token at every target position, first with
# teacher_forcing_pct=0 and then with teacher_forcing_pct=1.
#
# NOTE: the variable names are kept so this cell leaves the same variables behind as before,
# but `extraction_probabilities` here holds the abstractor's scores over the vocabulary
# (it overwrites the extractor's per-sentence probabilities computed earlier).
for teacher_forcing_pct in (0, 1):
    extraction_probabilities, teacher_forcing = abstractor_model(
        source_document_embeddings,
        target_summary_embeddings,
        teacher_forcing_pct=teacher_forcing_pct
    )  # (batch_size, n_target_words, vocab_size)

    vals, predicted_idx = torch.topk(extraction_probabilities, k=1, dim=2)

    # Remove only the trailing k=1 axis. A bare squeeze() would also remove a batch axis of
    # size 1, and tolist() would then yield a flat list instead of one list per document.
    for token_ids in predicted_idx.squeeze(dim=2).tolist():
        print(f"{abstractor_model.bert_tokenizer.convert_ids_to_tokens(token_ids)}")
        print("")

['zu', '##lly', 'bro', '##uss', '##ard', 'decided', 'to', 'give', 'a', 'kidney', 'to', 'a', 'stranger', '.', 'a', 'kidney', 'to', 'a', 'stranger', '.', 'a', 'kidney', 'to', 'a', 'stranger', '.', 'a', 'kidney', 'to', 'a', 'stranger', '.', 'a', 'kidney', 'to', 'a', 'stranger', '.', 'a', 'kidney', 'to', 'a', 'stranger', '.', 'a', 'kidney', 'to', 'a', 'stranger', '.', 'a', 'kidney', 'to']

['a', 'photo', 'of', 'a', 'green', 'wood', '##pe', '##cker', 'flying', 'with', 'a', 'weasel', 'on', 'its', 'back', 'has', 'gone', 'viral', 'on', 'twitter', '.', 'the', 'image', 'was', 'snapped', 'by', 'amateur', 'photographer', 'martin', 'le', '-', 'may', 'near', 'london', '.', '[SEP]', 'computer', 'may', 'near', 'london', '.', '[SEP]', 'computer', 'may', 'near', 'london', '.', '[SEP]', 'computer', 'may', 'near', 'london', '.']

['zu', '##lly', 'bro', '##uss', '##ard', 'decided', 'to', 'give', 'a', 'kidney', 'to', 'a', 'stranger', '.', 'a', 'kidney', 'computer', 'program', 'helped', 'her', 'donation', 's

## Reinforcement Learning

In [10]:
# Wrap the extractor and abstractor in the RL model and restore the weights saved at
# `rl_model_path`.
rl_model = RLModel(extractor_model, abstractor_model)
rl_model_path = "results/models/rl.pt"

# map_location="cpu" lets a checkpoint that was saved from a GPU be read on a CPU-only
# machine; load_state_dict then copies the values into the model's existing parameters.
rl_model.load_state_dict(torch.load(rl_model_path, map_location="cpu"))

<All keys matched successfully>

In [11]:
# Draw a batch of (source documents, target summaries) for the RL stage.
source_documents, target_summaries = get_rl_training_batch(data, batch_size=2)

# Sentence-level embeddings of the sources, using the BERT model/tokenizer of the RL model's extractor.
source_sentence_embeddings, source_mask = obtain_sentence_embeddings(
    rl_model.extractor_model.bert_model, rl_model.extractor_model.bert_tokenizer, source_documents
)

# The stop action's index is the size of dimension 1 of the sentence embeddings.
stop_action_index = source_sentence_embeddings.size(1)

# Word-level embeddings of the target summaries, using the BERT model/tokenizer of the RL model's abstractor.
target_summary_embeddings, target_mask, target_tokens = obtain_word_embeddings(
    rl_model.abstractor_model.bert_model,
    rl_model.abstractor_model.bert_tokenizer,
    target_summaries,
    static_embeddings=True,
)

In [12]:
# Sample a trajectory of actions from the RL model, along with the other values it returns.
trajectory = rl_model.sample_actions(source_sentence_embeddings, source_mask)
actions, log_probs, entropys, values = trajectory

# Hand the sampled actions to the RL model to obtain the abstracted sentences
# (teacher forcing is set to 0.0 here).
predicted_tokens, word_probabilities = rl_model.create_abstracted_sentences(
    actions, source_documents, stop_action_index,
    teacher_forcing_pct=0.0,
    target_summary_embeddings=target_summary_embeddings,
)
        

In [13]:
# Look at extractions
for art_idx, doc_sentences in enumerate(actions):
    for sent_idx in doc_sentences[:-1]:
        print(source_documents[art_idx][sent_idx])
        print()
    print("\n\n-------\n\n")

that changed when a computer programmer named david jacobs received a kidney transplant . he had been waiting on a deceased donor list , when a live donor came along -- someone nice enough to give away a kidney to a stranger .

that may sound like an esoteric adage , but when zully broussard selflessly decided to give one of her kidneys to a stranger , her generosity paired up with big data . it resulted in six patients receiving transplants .



-------


weasels would not normally target green woodpeckers , pacheco said -- their predators are normally the size of a stoat or larger . but the birds are known to spend a fair amount of time on the ground pulling up worms and hunting insects .

the pluckiness of the weasel spawned a number of parodies on twitter , with manipulated images showing the creature in turn being ridden by russian president vladimir putin , popstar miley cyrus , football star john terry -- and even what appears to be a dog red panda dressed in a darth vader costu

In [14]:
# Look at abstractions
for predicted_abstraction in predicted_tokens:
    solution = list()
    for token in predicted_abstraction:
        solution.append(rl_model.abstractor_model.bert_tokenizer.ids_to_tokens[int(token)])
    print(" ".join(solution))
    print("\n\n")

zu ##lly bro ##uss ##ard decided to give a kidney to a stranger . a kidney to a stranger . a kidney to a stranger . a kidney to a stranger . a kidney to a stranger . a kidney to a stranger . a kidney to a stranger . a kidney to



a photo of a green wood ##pe ##cker flying with a weasel on its back has gone viral on twitter . the image was snapped by amateur photographer martin le - may near london . [SEP] computer may near london . [SEP] computer may near london . [SEP] computer may near london .



