# Summary

In [1]:
from abstractor.train import get_training_batch as get_abstractor_training_batch
from abstractor.utils import AbstractorModel, AbstractorModelRNN
from abstractor.utils import obtain_initial_hidden_states
from bert.utils import obtain_sentence_embeddings
from bert.utils import obtain_word_embeddings
from data.utils import load_training_dictionaries
from extractor.train import get_training_batch as get_extractor_training_batch
from extractor.utils import ExtractorModel
from pytorch_transformers import BertModel
from pytorch_transformers import BertTokenizer
from rl_connection.utils import RLModel
from rl_connection.train import get_training_batch as get_rl_training_batch
from rouge import Rouge

import numpy as np
import torch

## Load data

In [2]:
# The tokenizer and the encoder are taken from the same pretrained checkpoint,
# so the checkpoint name is spelled once and shared by both calls.
bert_checkpoint = 'bert-base-uncased'
bert_tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)
bert_model = BertModel.from_pretrained(bert_checkpoint)

In [3]:
# Load the training dictionaries once; `data` is what the extractor, abstractor
# and RL sections below pass to their respective get_*_training_batch helpers.
data = load_training_dictionaries()

## Extractor

In [4]:
# Load extractor model: build it around the shared BERT tokenizer/encoder and
# restore its trained weights from disk.
extractor_model = ExtractorModel(bert_tokenizer, bert_model)
extractor_model_path = "results/models/extractor.pt"
# map_location="cpu" lets a checkpoint that was saved from a GPU run be read on a
# CPU-only machine; load_state_dict then copies the values into the parameters
# the model already owns.
# NOTE(review): torch.load unpickles the file -- only load checkpoints you trust.
# NOTE(review): the model is left in its default mode; if it contains dropout,
# calling .eval() before inference may be wanted -- confirm against ExtractorModel.
extractor_model.load_state_dict(torch.load(extractor_model_path, map_location="cpu"))

<All keys matched successfully>

In [7]:
# Draw a batch of two documents together with their extraction labels.
documents, extraction_labels = get_extractor_training_batch(data, batch_size=2)

# Embed the sentences with the BERT encoder/tokenizer owned by the extractor.
# `mask` is returned alongside the embeddings (presumably flagging real vs.
# padded sentence positions -- confirm against obtain_sentence_embeddings).
extractor_bert = extractor_model.bert_model
extractor_tokenizer = extractor_model.bert_tokenizer
sentence_embeddings, mask = obtain_sentence_embeddings(extractor_bert, extractor_tokenizer, documents)

# Score every sentence with the extractor.
extraction_probabilities = extractor_model(sentence_embeddings)

In [8]:
# For every document in the batch, print the sentences labelled for extraction
# next to the sentences the extractor scores highest.

# Number of labelled sentences per document; it does not depend on the loop
# variable, so it is computed once for the whole batch.
n_labelled_per_sample = extraction_labels.sum(dim=1)

for sample_idx, document in enumerate(documents):
    # Array form of the document so it can be indexed with a boolean mask and
    # with a list of positions.
    sentences = np.array(document)

    # Predict exactly as many sentences as were labelled for this document.
    # torch.topk documents k as an int, so convert the 0-dim tensor explicitly.
    n_to_extract = int(n_labelled_per_sample[sample_idx])

    # Multiplying by the mask zeroes the score at masked positions
    # (presumably padding -- confirm against obtain_sentence_embeddings).
    ext_prob = extraction_probabilities[sample_idx] * mask[sample_idx]
    # topk returns (values, indices); only the positions are needed, as plain
    # Python ints so NumPy is not indexed with a torch tensor.
    ext_sent_indices = torch.topk(ext_prob, k=n_to_extract)[1].tolist()

    # The label row is cut to the document's own length before it is used as a
    # boolean selector over the sentences.
    is_target = extraction_labels[sample_idx][:len(document)].numpy().astype(bool)

    print("----> TARGET <----")
    for target in sentences[is_target]:
        print(f"{target}\n")
    print()

    print("----> PREDICTION <----")
    for predicted in sentences[ext_sent_indices]:
        print(f"{predicted} \n")
    print("\n\n-------\n\n")

----> TARGET <----
newtown police say cayman was last seen wearing a gray down winter jacket , black ski pants and hiking boots . he could be in the radnor - wayne area , roughly 20 miles from philadelphia , or may have purchased a train ticket to philadelphia , according to an alert posted on facebook .

a message to families from the head of the shipley school , which cayman attends , read in part : " cayman 's sister savannah is in ninth grade at shipley and his parents , farid and becky , are terrific people . they have contacted police and are aware that we are sending you this email . we hope that cayman is ok and are saying our prayers . "


----> PREDICTION <----
a message to families from the head of the shipley school , which cayman attends , read in part : " cayman 's sister savannah is in ninth grade at shipley and his parents , farid and becky , are terrific people . they have contacted police and are aware that we are sending you this email . we hope that cayman is ok and

## Abstractor

In [15]:
# Load abstractor model: build it around the shared BERT tokenizer/encoder and
# restore its trained weights from disk.
abstractor_model = AbstractorModelRNN(bert_tokenizer, bert_model)
abstractor_model_path = "results/models/abstractor.pt"
# map_location="cpu" lets a checkpoint that was saved from a GPU run be read on a
# CPU-only machine; load_state_dict then copies the values into the parameters
# the model already owns.
# NOTE(review): torch.load unpickles the file -- only load checkpoints you trust.
abstractor_model.load_state_dict(torch.load(abstractor_model_path, map_location="cpu"))

<All keys matched successfully>

In [16]:
# Draw a batch of source documents with their target summaries (the second
# argument is presumably the batch size -- confirm against get_training_batch).
source_documents, target_summaries = get_abstractor_training_batch(data, 2)

# Word-level embeddings for both sides, produced by the abstractor's own BERT
# encoder/tokenizer. Sources use static_embeddings=False, targets
# static_embeddings=True.
abstractor_bert = abstractor_model.bert_model
abstractor_tokenizer = abstractor_model.bert_tokenizer

source_document_embeddings, source_mask, source_tokens = obtain_word_embeddings(
    abstractor_bert, abstractor_tokenizer, source_documents, static_embeddings=False
)
target_summary_embeddings, target_mask, target_tokens = obtain_word_embeddings(
    abstractor_bert, abstractor_tokenizer, target_summaries, static_embeddings=True
)

In [17]:
def print_abstractor_predictions(teacher_forcing_pct):
    """Run the abstractor on the current batch and print its most likely tokens.

    Reads the notebook-level ``abstractor_model``, ``source_document_embeddings``
    and ``target_summary_embeddings``. Prints, for every sample in the batch, the
    list of predicted tokens followed by an empty line. Everything it computes
    stays local, so the extractor section's ``extraction_probabilities`` is not
    overwritten.

    Args:
        teacher_forcing_pct: forwarded unchanged to the abstractor's
            ``teacher_forcing_pct`` argument (presumably the share of decoding
            steps fed with the target token -- confirm against the model).
    """
    # Nothing is back-propagated here, so skip building the autograd graph.
    with torch.no_grad():
        # Per-position distribution over the vocabulary.
        word_probabilities, _teacher_forcing = abstractor_model(
            source_document_embeddings,
            target_summary_embeddings,
            teacher_forcing_pct=teacher_forcing_pct
        )  # (batch_size, n_target_words, vocab_size)

    # Most likely vocabulary id at every target position. Reducing only the
    # vocabulary axis keeps the batch axis even for a single-sample batch, which
    # a bare .squeeze() would have dropped as well.
    predicted_ids = word_probabilities.argmax(dim=2)  # (batch_size, n_target_words)

    tokenizer = abstractor_model.bert_tokenizer
    for sample_ids in predicted_ids.tolist():
        print(f"{tokenizer.convert_ids_to_tokens(sample_ids)}")
        print("")


# Decode the same batch once per teacher-forcing setting so the two outputs can
# be compared side by side.
for teacher_forcing_pct in (0, 1):
    print_abstractor_predictions(teacher_forcing_pct)

['a', 'jihad', 'a', 'jihad', 'a', 'jihad', 'a', 'jihad', 'a', 'jihad', 'a', 'jihad', 'a', 'jihad', 'a', 'jihad', 'a', 'jihad', 'a', 'jihad', 'a', 'jihad', 'a', 'jihad', 'a', 'jihad', 'a', 'jihad', 'a', 'jihad', 'a', 'jihad', 'a', 'jihad', 'a', 'jihad', 'a', 'jihad', 'a', 'jihad', 'a', 'jihad']

['rory', 'mc', '##il', '##roy', 'throws', 'club', 'into', 'water', 'into', 'water', 'into', 'water', 'into', 'water', 'into', 'water', 'into', 'water', 'into', 'water', 'into', 'water', 'into', 'water', 'into', 'water', 'into', 'water', 'into', 'water', 'into', 'water', 'into', 'water', 'into', 'water', 'into', 'water', 'into', 'water', 'into', 'water']

['a', 'jihad', 'a', 'group', 'claims', 'responsibility', 'in', 'recording', 'audio', 'recording', ',', 'the', 'the', 'the', ',', 'the', 'mali', 'government', 'government', 'calls', 'a', 'mali', 'a', 'mali', 'terrorist', 'act', 'mali', 'one', 'one', 'citizen', '.', 'one', 'one', '.', '.', 'mali', 'one', 'are', 'killed', '.', '[SEP]', '.']

['rory

## Reinforcement Learning

In [None]:
# Load RL model: it wraps the extractor and abstractor loaded above, then
# restores its own trained weights from disk.
rl_model = RLModel(extractor_model, abstractor_model)
rl_model_path = "results/models/rl.pt"
# map_location="cpu" lets a checkpoint that was saved from a GPU run be read on a
# CPU-only machine; load_state_dict then copies the values into the parameters
# the model already owns.
# NOTE(review): torch.load unpickles the file -- only load checkpoints you trust.
rl_model.load_state_dict(torch.load(rl_model_path, map_location="cpu"))

In [None]:
# Draw a batch of three source documents with their target summaries.
# NOTE: this reuses (and overwrites) several names from the Abstractor section,
# e.g. source_documents and target_summary_embeddings.
source_documents, target_summaries = get_rl_training_batch(data, batch_size=3)

# Sentence-level embeddings of the sources, from the extractor held by the RL model.
rl_extractor = rl_model.extractor_model
source_sentence_embeddings, source_mask = obtain_sentence_embeddings(
    rl_extractor.bert_model, rl_extractor.bert_tokenizer, source_documents
)

# The size of axis 1 of the sentence embeddings is handed to
# create_abstracted_sentences as the index of the "stop" action.
stop_action_index = source_sentence_embeddings.size(1)

# Word-level embeddings of the target summaries, from the abstractor held by the RL model.
rl_abstractor = rl_model.abstractor_model
target_summary_embeddings, target_mask, target_tokens = obtain_word_embeddings(
    rl_abstractor.bert_model, rl_abstractor.bert_tokenizer, target_summaries, static_embeddings=True
)

In [None]:
# Run one trajectory: sample_actions returns four values -- the sampled actions
# plus their log-probabilities, entropies and value estimates (as the names
# suggest; confirm against RLModel.sample_actions).
sampled_trajectory = rl_model.sample_actions(source_sentence_embeddings, source_mask)
actions, log_probs, entropys, values = sampled_trajectory

# Hand the sampled actions to the abstractor to obtain the abstracted sentences.
# teacher_forcing_pct is set to 0.0 here.
predicted_tokens, word_probabilities = rl_model.create_abstracted_sentences(
    actions, source_documents, stop_action_index,
    target_summary_embeddings=target_summary_embeddings,
    teacher_forcing_pct=0.0
)
        

In [None]:
# Look at extractions: print the source sentences selected for each document.
for doc_idx, trajectory in enumerate(actions):
    # The last action of every trajectory is left out
    # (presumably the stop action -- confirm against sample_actions).
    selected_sentence_ids = trajectory[:-1]
    for sentence_id in selected_sentence_ids:
        print(source_documents[doc_idx][sentence_id])
        print()
    print("\n\n-------\n\n")

In [None]:
# Look at abstractions: map every predicted token id back to its token through
# the abstractor tokenizer's id -> token table and print one line per prediction.
id_to_token = rl_model.abstractor_model.bert_tokenizer.ids_to_tokens
for predicted_abstraction in predicted_tokens:
    print(" ".join(id_to_token[int(token)] for token in predicted_abstraction))
    print("\n\n")