In [None]:
!pip install bert_score

import csv
import logging
import random

import torch
import transformers
from bert_score import score
from transformers import GPT2DoubleHeadsModel, GPT2Tokenizer

# Raise the log threshold of these three transformers submodule loggers so
# that only messages at ERROR level or above are emitted by them.
transformers.tokenization_utils.logger.setLevel(logging.ERROR)
transformers.configuration_utils.logger.setLevel(logging.ERROR)
transformers.modeling_utils.logger.setLevel(logging.ERROR)

In [None]:
# Report whether PyTorch can see a CUDA device in this environment.
print("Cuda available:", torch.cuda.is_available())

In [None]:
def load_local_model(model):
    """Load a saved GPT-2 double-heads model from the local ``models/`` directory.

    Args:
        model: Name of the sub-directory of ``models/`` that contains the
            saved checkpoint.

    Returns:
        The loaded ``GPT2DoubleHeadsModel``. It is moved to the GPU when CUDA
        is available and to the CPU otherwise.
    """
    print("Loading", model)
    # Keep the checkpoint name and the loaded network in separate variables
    # instead of rebinding the ``model`` parameter to a different kind of object.
    loaded_model = GPT2DoubleHeadsModel.from_pretrained("models/" + model + "/", local_files_only=True)
    # Fall back to the CPU rather than raising when no CUDA device is present.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    loaded_model.to(device)
    return loaded_model

In [None]:
def load_tokenizer():
    """Create the GPT-2 tokenizer extended with this notebook's special tokens.

    Returns:
        The ``GPT2Tokenizer`` loaded from the ``"gpt2"`` checkpoint after the
        BOS, EOS, padding and ``<TITLE>`` special tokens have been added.
    """
    gpt2_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    gpt2_tokenizer.add_special_tokens({
        'bos_token': '<|startoftext|>',
        'eos_token': '<|endoftext|>',
        'pad_token': '<pad>',
        # Marker this notebook places between an abstract and its title.
        'additional_special_tokens': ['<TITLE>'],
    })
    return gpt2_tokenizer

In [None]:
def get_abstracts_for_qual_eval():
    """Return the fixed set of papers used for the qualitative evaluation.

    Returns:
        A list of ten dictionaries, each with a ``"Title"`` and an
        ``"Abstract"`` string. The texts are hard-coded and returned as-is.
    """
    # The body is indented consistently with four spaces; mixing indentation
    # widths between the assignment and the ``return`` is an IndentationError.
    abstracts = [
        {"Title": "GAN-BERT: Generative Adversarial Learning for Robust Text Classification with a Bunch of Labeled Examples",
         "Abstract": "Recent Transformer-based architectures, e.g., BERT, provide impressive results in many Natural Language Processing tasks. However, most of the adopted benchmarks are made of (sometimes hundreds of) thousands of examples. In many real scenarios, obtaining high- quality annotated data is expensive and time consuming; in contrast, unlabeled examples characterizing the target task can be, in general, easily collected. One promising method to enable semi-supervised learning has been proposed in image processing, based on Semi- Supervised Generative Adversarial Networks. In this paper, we propose GAN-BERT that ex- tends the fine-tuning of BERT-like architectures with unlabeled data in a generative adversarial setting. Experimental results show that the requirement for annotated examples can be drastically reduced (up to only 50-100 annotated examples), still obtaining good performances in several sentence classification tasks."},
        {"Title": "Multi-Agent Task-Oriented Dialog Policy Learning with Role-Aware Reward Decomposition",
         "Abstract": "Many studies have applied reinforcement learning to train a dialog policy and show great promise these years. One common approach is to employ a user simulator to obtain a large number of simulated user experiences for reinforcement learning algorithms. However, modeling a realistic user simulator is challenging. A rule-based simulator requires heavy domain expertise for complex tasks, and a data-driven simulator requires considerable data and it is even unclear how to evaluate a simulator. To avoid explicitly building a user simulator beforehand, we propose Multi-Agent Dialog Policy Learning, which regards both the system and the user as the dialog agents. Two agents interact with each other and are jointly learned simultaneously. The method uses the actor-critic framework to facilitate pretraining and improve scalability. We also propose Hybrid Value Network for the role-aware reward decomposition to integrate role-specific domain knowledge of each agent in the task-oriented dialog. Results show that our method can successfully build a system policy and a user policy simultaneously, and two agents can achieve a high task success rate through conversational interaction."},
        {"Title": "Chinese Relation Extraction with Multi-Grained Information and External Linguistic Knowledge",
         "Abstract": "Chinese relation extraction is conducted using neural networks with either character-based or word-based inputs, and most existing methods typically suffer from segmentation errors and ambiguity of polysemy. To address the issues, we propose a multi-grained lattice framework (MG lattice) for Chinese relation extraction to take advantage of multi-grained language information and external linguistic knowledge. In this framework, (1) we incorporate word-level information into character sequence inputs so that segmentation errors can be avoided. (2) We also model multiple senses of polysemous words with the help of external linguistic knowledge, so as to alleviate polysemy ambiguity. Experiments on three real-world datasets in distinct domains show consistent and significant superiority and robustness of our model, as compared with other baselines. We will release the source code of this paper in the future."},
        {"Title": "Empirically Estimating Order Constraints for Content Planning in Generation",
         "Abstract": "In a language generation system, a content planner embodies one or more \u201cplans\u201d that are usually hand\u2013crafted, sometimes through manual analysis of target text. In this paper, we present a system that we developed to automatically learn elements of a plan and the ordering constraints among them. As training data, we use semantically annotated transcripts of domain experts performing the task our system is designed to mimic. Given the large degree of variation in the spoken language of the transcripts, we developed a novel algorithm to find parallels between transcripts based on techniques used in computational genomics. Our proposed methodology was evaluated two\u2013fold: the learning and generalization capabilities were quantitatively evaluated using cross validation obtaining a level of accuracy of 89%. A qualitative evaluation is also provided."},
        {"Title": "Improving Distant Supervision for Information Extraction Using Label Propagation Through Lists",
         "Abstract": "Because of polysemy, distant labeling for information extraction leads to noisy training data. We describe a procedure for reducing this noise by using label propagation on a graph in which the nodes are entity mentions, and mentions are coupled when they occur in coordinate list structures. We show that this labeling approach leads to good performance even when off-the-shelf classifiers are used on the distantly-labeled data."},
        {"Title": "An Extension of BLANC to System Mentions",
         "Abstract": "BLANC is a link-based coreference evaluation metric for measuring the quality of coreference systems on gold mentions. This paper extends the original BLANC (\u201cBLANC-gold\u201d henceforth) to system mentions, removing the gold mention assumption. The proposed BLANC falls back seamlessly to the original one if system mentions are identical to gold mentions, and it is shown to strongly correlate with existing metrics on the 2011 and 2012 CoNLL data."},
        {"Title": "Multi-News: A Large-Scale Multi-Document Summarization Dataset and Abstractive Hierarchical Model",
         "Abstract": "Automatic generation of summaries from multiple news articles is a valuable tool as the number of online publications grows rapidly. Single document summarization (SDS) systems have benefited from advances in neural encoder-decoder model thanks to the availability of large datasets. However, multi-document summarization (MDS) of news articles has been limited to datasets of a couple of hundred examples. In this paper, we introduce Multi-News, the first large-scale MDS news dataset. Additionally, we propose an end-to-end model which incorporates a traditional extractive summarization model with a standard SDS model and achieves competitive results on MDS datasets. We benchmark several methods on Multi-News and hope that this work will promote advances in summarization in the multi-document setting."},
        {"Title": "Compositional Questions Do Not Necessitate Multi-hop Reasoning",
         "Abstract": "Multi-hop reading comprehension (RC) questions are challenging because they require reading and reasoning over multiple paragraphs. We argue that it can be difficult to construct large multi-hop RC datasets. For example, even highly compositional questions can be answered with a single hop if they target specific entity types, or the facts needed to answer them are redundant. Our analysis is centered on HotpotQA, where we show that single-hop reasoning can solve much more of the dataset than previously thought. We introduce a single-hop BERT-based RC model that achieves 67 F1-comparable to state-of-the-art multi-hop models. We also design an evaluation setting where humans are not shown all of the necessary paragraphs for the intended multi-hop reasoning but can still answer over 80% of questions. Together with detailed error analysis, these results suggest there should be an increasing focus on the role of evidence in multi-hop reasoning and possibly even a shift towards information retrieval style evaluations with large and diverse evidence collections."},
        {"Title": "Multi-Domain Dialogue Acts and Response Co-Generation",
         "Abstract": "Generating fluent and informative responses is of critical importance for task-oriented dialogue systems. Existing pipeline approaches generally predict multiple dialogue acts first and use them to assist response generation. There are at least two shortcomings with such approaches. First, the inherent structures of multi-domain dialogue acts are neglected. Second, the semantic associations between acts and responses are not taken into account for response generation. To address these issues, we propose a neural co-generation model that generates dialogue acts and responses concurrently. Unlike those pipeline approaches, our act generation module preserves the semantic structures of multi-domain dialogue acts and our response generation module dynamically attends to different acts as needed. We train the two modules jointly using an uncertainty loss to adjust their task weights adaptively. Extensive experiments are conducted on the large-scale MultiWOZ dataset and the results show that our model achieves very favorable improvement over several state-of-the-art models in both automatic and human evaluations."},
        {"Title": "Structured Tuning for Semantic Role Labeling",
         "Abstract": "Recent neural network-driven semantic role labeling (SRL) systems have shown impressive improvements in F1 scores. These improvements are due to expressive input representations, which, at least at the surface, are orthogonal to knowledge-rich constrained decoding mechanisms that helped linear SRL models. Introducing the benefits of structure to inform neural models presents a methodological challenge. In this paper, we present a structured tuning framework to improve models using softened constraints only at training time. Our framework leverages the expressiveness of neural networks and provides supervision with structured loss components. We start with a strong baseline (RoBERTa) to validate the impact of our approach, and show that our framework outperforms the baseline by learning to comply with declarative constraints. Additionally, our experiments with smaller training sizes show that we can achieve consistent improvements under low-resource scenarios."},
    ]

    return abstracts

In [None]:
def import_abstracts(count=-1):
    """Read title/abstract pairs from ``abstracts.csv`` in the working directory.

    Args:
        count: Number of pairs to draw with ``random.sample``. The default
            ``-1`` returns every pair in file order.

    Returns:
        A list of ``{"Title": ..., "Abstract": ...}`` dictionaries built from
        the ``title`` and ``abstract`` columns of the CSV file.
    """
    with open("abstracts.csv", newline='', encoding='UTF-8') as csv_file:
        records = [{"Title": entry["title"], "Abstract": entry["abstract"]}
                   for entry in csv.DictReader(csv_file)]

    return records if count == -1 else random.sample(records, count)

In [None]:
def import_test_abstracts(model_name, count=-1):
    """Parse the abstract/title pairs stored in ``models/<model_name>/test.txt``.

    The parser relies on fixed character offsets and separators, so it only
    works for files laid out exactly as it expects.
    NOTE(review): the offsets presumably match how ``test.txt`` was serialized
    (a quoted list following an ``inputs`` key) -- confirm against the writer.

    Args:
        model_name: Name of the sub-directory of ``models/`` holding ``test.txt``.
        count: Number of pairs to draw with ``random.sample``. The default
            ``-1`` returns every pair in file order.

    Returns:
        A list of ``{"Abstract": ..., "Title": ...}`` dictionaries.
    """
    with open(f"models/{model_name}/test.txt", "r") as test_file:
        raw = test_file.read()

    # Keep the text from 10 characters past the first "inputs" up to 18
    # characters before the end of the file.
    start = raw.index("inputs") + 10
    stop = len(raw) - 18
    payload = raw[start:stop]

    parsed = []
    for chunk in payload.split('<|endoftext|> ", '):
        parts = chunk.split(" <TITLE> ")
        # The first character of each chunk is dropped from the abstract.
        parsed.append({"Abstract": parts[0][1:], "Title": parts[1]})

    if count != -1:
        return random.sample(parsed, count)
    return parsed

In [None]:
def get_titles_from_abstracts(model, tokenizer, abstracts,
                              num_return_sequences=5, max_length=25, min_length=5,
                              diversity_penalty=1.0, length_penalty=10.0, repetition_penalty=2.0):
    """Generate candidate titles for each abstract using grouped beam search.

    Args:
        model: Model exposing ``generate`` and a ``device`` attribute.
        tokenizer: Tokenizer exposing ``encode`` and ``decode``.
        abstracts: Sequence of dictionaries with ``"Abstract"`` and ``"Title"`` keys.
        num_return_sequences: Number of titles per abstract. The same value is
            used for the number of beams and of beam groups.
        max_length: Maximum number of tokens generated after the prompt.
        min_length: Minimum number of tokens generated after the prompt.
        diversity_penalty: Forwarded unchanged to ``model.generate``.
        length_penalty: Forwarded unchanged to ``model.generate``.
        repetition_penalty: Forwarded unchanged to ``model.generate``.

    Returns:
        A list with one dictionary per abstract: ``"Original"`` holds the
        reference title and ``"Titles"`` holds the decoded sequences. The
        decoded text still contains the prompt and the special tokens, because
        decoding uses ``skip_special_tokens=False``.
    """
    # Put the inputs on the model's own device instead of assuming "cuda", so
    # the function also works when the model lives on the CPU.
    device = model.device

    titles = []
    for i, abstract in enumerate(abstracts):
        # Progress report every 100 abstracts.
        if (i + 1) % 100 == 0:
            print(f"{i + 1}/{len(abstracts)}")
        text = abstract["Abstract"] + " <TITLE> "
        inputs = tokenizer.encode(text, return_tensors="pt").to(device)

        # The length limits passed to generate() include the prompt, so the
        # prompt length is added to the requested title lengths.
        prompt_length = len(inputs[0])

        title_ids = model.generate(inputs,
                                   num_beams=num_return_sequences,
                                   num_beam_groups=num_return_sequences,
                                   num_return_sequences=num_return_sequences,
                                   max_length=prompt_length + max_length,
                                   min_length=prompt_length + min_length,
                                   early_stopping=True,
                                   diversity_penalty=diversity_penalty,
                                   length_penalty=length_penalty,
                                   repetition_penalty=repetition_penalty,
                                   no_repeat_ngram_size=2,
                                   )

        titles.append({"Original": abstract["Title"],
                       "Titles": [tokenizer.decode(title, skip_special_tokens=False, clean_up_tokenization_spaces=False)
                                  for title in title_ids]})

    return titles

In [None]:
def calculate_bert_score(original, candidates, mode="max", verbose=False):
    """Score candidate titles against one reference title with BERTScore.

    Args:
        original: Reference title every candidate is compared with.
        candidates: List of candidate titles.
        mode: Selects the return value. ``"best"`` returns the candidate with
            the highest F1, ``"max"`` the highest F1 value, ``"mean"`` the mean
            F1 value. Any other value returns the raw score tensors.
        verbose: When true, print the reference and the precision, recall and
            F1 of every candidate.

    Returns:
        A candidate string, a float, or the ``(precision, recall, f1)`` tuple
        returned by ``score``, depending on ``mode``.
    """
    # ``score`` compares the two lists pairwise, so the reference is repeated
    # once per candidate.
    reference_count = len(candidates)
    references = [original] * reference_count

    precision, recall, f_score = score(candidates, references, lang="en")

    if verbose:
        print(f"Original title: {original}")
        print()
        for idx in range(reference_count):
            print(f"{candidates[idx]}")
            print(f"Precision: {precision[idx]:.3f}, Recall: {recall[idx]:.3f}, F1: {f_score[idx]:.3f}")
            print()
        print()

    if mode == "best":
        result = candidates[torch.argmax(f_score).item()]
    elif mode == "max":
        result = torch.max(f_score).item()
    elif mode == "mean":
        result = torch.mean(f_score).item()
    else:
        # Unrecognised modes fall through to the raw tensors.
        result = (precision, recall, f_score)
    return result

In [None]:
def quantitative_evaluation():
    """Print BERTScore results for every fine-tuned model on its test abstracts.

    For each model name the function loads the model, resizes its token
    embeddings to the tokenizer's vocabulary size, generates titles for the
    model's test abstracts and scores them against the reference titles. It
    prints the per-abstract maximum F1 values followed by their mean.

    Returns:
        None. All results are printed.
    """
    model_names = ["2048_default", "2048_3e-4", "2048_3e-6", "2048_epochs"]

    tokenizer = load_tokenizer()

    for model_name in model_names:
        model = load_local_model(model_name)
        # The tokenizer gained extra special tokens, so the embedding matrix
        # is resized to match its vocabulary size.
        model.resize_token_embeddings(len(tokenizer))
        pairs = import_test_abstracts(model_name)

        titles = get_titles_from_abstracts(model=model,
                                           tokenizer=tokenizer,
                                           abstracts=pairs,
                                           max_length=25,
                                           min_length=5)

        f1 = []
        for i, title_set in enumerate(titles):
            # Progress report every 100 title sets. The print must be indented
            # under the ``if``; otherwise the cell does not even parse.
            if (i + 1) % 100 == 0:
                print(f"{i + 1}/{len(titles)}")

            # Keep only the text between the <TITLE> marker and the first
            # end-of-text token of each decoded sequence.
            trimmed_titles = [title.split(" <TITLE> ")[1].split("<|endoftext|>")[0] for title in title_set["Titles"]]

            f1.append(calculate_bert_score(title_set["Original"],
                                           trimmed_titles,
                                           mode="max"))

        print()
        print(f1)
        if f1:
            print(sum(f1) / len(f1))
        else:
            # Avoid a ZeroDivisionError when a model has no test abstracts.
            print("No test abstracts found; mean F1 is undefined.")
        
# Run the quantitative evaluation when this cell executes; results are printed.
quantitative_evaluation()

In [None]:
def qualitative_evaluation():
    """Print generated titles of every fine-tuned model for the fixed sample papers.

    The reference titles and abstracts are printed first. Then, for each model
    name, the model is loaded, its token embeddings are resized to the
    tokenizer's vocabulary size, and the generated titles are printed below
    each reference title.

    Returns:
        None. All results are printed.
    """
    checkpoint_names = ["2048_default", "2048_3e-4", "2048_3e-6", "2048_epochs"]
    examples = get_abstracts_for_qual_eval()

    # Show every reference title and abstract, separated by a blank line.
    for example in examples:
        print(example["Title"], example["Abstract"], "", sep="\n")

    tokenizer = load_tokenizer()

    for checkpoint in checkpoint_names:
        model = load_local_model(checkpoint)
        model.resize_token_embeddings(len(tokenizer))
        generated = get_titles_from_abstracts(model=model,
                                              tokenizer=tokenizer,
                                              abstracts=examples,
                                              max_length=25,
                                              min_length=5)

        for entry in generated:
            print(entry["Original"])
            for raw_title in entry["Titles"]:
                # Keep only the text between the <TITLE> marker and the first
                # end-of-text token.
                after_marker = raw_title.split(" <TITLE> ")[1]
                print(after_marker.split("<|endoftext|>")[0])
            print()
            
# Run the qualitative evaluation when this cell executes; results are printed.
qualitative_evaluation()

In [None]:
def get_titles_for_single_abstract(model_name, abstract):
    """Generate and print candidate titles for one abstract.

    Args:
        model_name: Name of the sub-directory of ``models/`` holding the model.
        abstract: Text of the abstract to generate titles for.

    Returns:
        A dictionary with ``"Original"`` (an empty string, because no
        reference title is supplied) and ``"Titles"`` (the untrimmed decoded
        sequences).
    """
    model = load_local_model(model_name)
    tokenizer = load_tokenizer()
    # The tokenizer gained extra special tokens, so the embedding matrix is
    # resized to match its vocabulary size.
    model.resize_token_embeddings(len(tokenizer))

    abstracts = [{"Title": "", "Abstract": abstract}]

    titles = get_titles_from_abstracts(model, tokenizer, abstracts)

    for title in titles[0]["Titles"]:
        # Print only the text between the <TITLE> marker and the first
        # end-of-text token.
        print(title.split(" <TITLE> ")[1].split("<|endoftext|>")[0])

    return titles[0]
    