transformations/style_paraphraser/transformation.py

import re

import nltk
import numpy as np
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

from interfaces.SentenceOperation import SentenceOperation
from tasks.TaskTypes import TaskType
from transformations.style_paraphraser.paraphraser_helpers.style_paraphraser import (
    Instance,
)

"""
    Note: This codebase is based upon, adapted and refactored from code
    from this repository:
    https://github.com/martiansideofthemoon/style-transfer-paraphrase

"""

# Style name -> pretrained checkpoint identifier handed to `from_pretrained`.
# Insertion order matters: it is the order in which styles are listed in the
# "Style not supported" error message.
MODELS_SUPPORTED = dict(
    [
        ("Bible", "filco306/gpt2-bible-paraphraser"),
        ("Basic", "filco306/gpt2-base-style-paraphraser"),
        ("Shakespeare", "filco306/gpt2-shakespeare-paraphraser"),
        ("Tweets", "filco306/gpt2-tweet-paraphraser"),
        ("Switchboard", "filco306/gpt2-switchboard-paraphraser"),
        ("Romantic poetry", "filco306/gpt2-romantic-poetry-paraphraser"),
    ]
)

# Value used for `max_total_length`; the prefix and suffix limits below are
# each half of it.
MAX_PARAPHRASE_LENGTH = 100

# Length settings shared by every paraphraser instance.
BASE_CONFIG = dict(
    max_total_length=MAX_PARAPHRASE_LENGTH,
    max_prefix_length=MAX_PARAPHRASE_LENGTH // 2,
    max_suffix_length=MAX_PARAPHRASE_LENGTH // 2,
)


class StyleTransferParaphraser(SentenceOperation):
    """
    Style transfer paraphraser, using a GPT2-model of choice.

    Args:
        style : str
            The style to use. Must be one of the keys of MODELS_SUPPORTED:
            Bible, Basic, Shakespeare, Tweets, Switchboard or Romantic poetry
            (the lookup is case-sensitive).
        device : device to use for computations.
            Default: None, and it will then resort to CUDA if available, else CPU.
        upper_length : str
            Controls how the generated tokens are truncated.
            Options: "eos" or "same_N" (e.g., "same_5").
                "eos" means the output is only cut at the first
                    end-of-sequence token.
                "same_N" means the output is additionally cut to the length
                    of the original sentence + N tokens.
        beam_size : int
            Size of the beam, forwarded to the model as `num_beams`.
            Default: 1
        top_p : float
            top_p sampling, between 0.0 and 1.0
            Default: 0.0
        temperature : float
            Sampling temperature. It is only forwarded to the model when it
            is greater than 0.
            Default: 0.0
        use_twostep : bool
            Default: True
            Whether to use the two-step style transfer procedure as in the original paper.
            If False, only one model is used (which gives a lower performance).
            NOTE: The two-step approach loads two GPT2 models in memory, which is very heavy
            and may cause memory issues.
    """

    tasks = [TaskType.TEXT_CLASSIFICATION, TaskType.TEXT_TO_TEXT_GENERATION]
    languages = ["en"]
    heavy = True
    keywords = [
        "model-based",
        "transformer-based",
        "tokenizer-required",
        "unnatural-sounding",
        "unnaturally-written",
        "possible-meaning-alteration",
        "high-coverage",
    ]

    def __init__(
        self,
        style: str = "Basic",
        device=None,
        upper_length="same_5",
        beam_size: int = 1,
        top_p: float = 0.0,
        temperature: float = 0.0,
        use_twostep: bool = True,
    ):
        """
        Load the tokenizer and model(s) for `style` and store the generation settings.

        See the class docstring for the meaning of each argument.

        Raises:
            AssertionError: if `style` is not a key of MODELS_SUPPORTED.
        """
        # Sentence splitting in `_paraphrase` relies on the punkt data;
        # download it only when it is not installed yet.
        try:
            nltk.data.find("tokenizers/punkt")
        except LookupError:
            nltk.download("punkt")
        self.style = style
        self.use_twostep = use_twostep
        assert (
            style in MODELS_SUPPORTED
        ), f"Style not supported. The following styles are supported: {', '.join(MODELS_SUPPORTED)}"
        model_path = MODELS_SUPPORTED[style]

        self.device = device
        if self.device is None:
            # Prefer the GPU whenever one is available.
            self.device = torch.device(
                "cuda" if torch.cuda.is_available() else "cpu"
            )

        self.args = {
            "upper_length": upper_length,
            "stop_token": "eos" if upper_length == "eos" else None,
            "beam_size": beam_size,
            "temperature": temperature,
            "top_p": top_p,
            # NOTE(review): top_k is fixed at 1, which makes `do_sample` in
            # `_paraphrase` always True and presumably restricts sampling to
            # the single most likely token - confirm that top_p still has
            # the intended effect.
            "top_k": 1,
            "device": self.device,
        }

        # Work on a copy so the per-instance key added below does not leak
        # into the module-level BASE_CONFIG shared by all instances.
        self.config = dict(BASE_CONFIG)
        self.config["global_dense_length"] = 0

        self.gpt2_model = GPT2LMHeadModel.from_pretrained(model_path)
        self.gpt2_model.to(self.device)
        self.tokenizer = GPT2Tokenizer.from_pretrained(model_path)

        # The "Basic" model is only loaded as a separate first step when a
        # different target style is requested.
        if self.use_twostep is True and self.style != "Basic":
            self.basic_model = GPT2LMHeadModel.from_pretrained(
                MODELS_SUPPORTED["Basic"]
            ).to(self.device)
            self.basic_tokenizer = GPT2Tokenizer.from_pretrained(
                MODELS_SUPPORTED["Basic"]
            )
        else:
            self.basic_model = None
            self.basic_tokenizer = None

    def modify_p(self, top_p):
        """Set the default top_p used when `generate` is called without one."""
        self.args["top_p"] = top_p

    def _build_instances(self, sent_text):
        """Tokenize each sentence and wrap it in a preprocessed `Instance`."""
        instances = []
        for context in sent_text:
            context_ids = self.tokenizer.convert_tokens_to_ids(
                self.tokenizer.tokenize(context)
            )
            instance = Instance(
                self.args,
                self.config,
                {"sent1_tokens": context_ids, "sent2_tokens": context_ids},
            )
            instance.preprocess(self.tokenizer)
            instance.gdv = np.zeros((1, 768), dtype=np.float32)
            instances.append(instance)
        return instances

    def _paraphrase(
        self, sentence, use_basic: bool, top_p=None, max_outputs: int = 1
    ):
        """
        Helper function to generate paraphrases with a single model (one step).

        Args:
            sentence : str
                Text to paraphrase. It is split into sentences which are
                paraphrased as one batch and joined again with ". ".
            use_basic : bool
                If False the style model is used, otherwise the basic model.
            top_p : float
                top_p value for this call.
                Default None, resorting to the value stored in self.args["top_p"].
            max_outputs : int
                Number of paraphrases to generate.

        Returns:
            A list with `max_outputs` paraphrases. If `sentence` contains no
            sentences (e.g. an empty string), the input is returned unchanged
            `max_outputs` times.
        """
        if top_p is None:
            top_p = self.args["top_p"]

        sent_text = nltk.sent_tokenize(sentence)
        if not sent_text:
            # Nothing to paraphrase; an empty batch would otherwise fail below.
            return [sentence] * max_outputs

        # The model inputs are identical for every requested output, so they
        # are built once and reused for each generation pass.
        # NOTE(review): self.basic_tokenizer is loaded in __init__ but the
        # style tokenizer is used for both models - confirm the two
        # vocabularies are identical.
        instances = self._build_instances(sent_text)
        gpt2_sentences = torch.tensor(
            [inst.sentence for inst in instances]
        ).to(self.device)
        segments = torch.tensor([inst.segment for inst in instances]).to(
            self.device
        )
        init_context_size = instances[0].init_context_size
        eos_token_id = self.tokenizer.eos_token_id

        # "same_N": keep at most N tokens more than the input sentence.
        upper_length = self.args["upper_length"]
        extra = (
            int(upper_length.split("_")[-1])
            if upper_length.startswith("same")
            else None
        )

        model = self.gpt2_model if use_basic is False else self.basic_model
        temperature = (
            self.args["temperature"] if self.args["temperature"] > 0 else None
        )

        to_ret = []
        for _ in range(max_outputs):
            with torch.no_grad():
                output = model.generate(
                    input_ids=gpt2_sentences[:, 0:init_context_size],
                    max_length=gpt2_sentences.shape[1],
                    return_dict_in_generate=True,
                    eos_token_id=eos_token_id,
                    do_sample=self.args["top_k"] > 0 or top_p > 0.0,
                    top_k=self.args["top_k"],
                    top_p=top_p,
                    temperature=temperature,
                    num_beams=self.args["beam_size"],
                    token_type_ids=segments[:, 0:init_context_size],
                )

            # With return_dict_in_generate=True the generated token ids live
            # in `.sequences`, one row per input sentence.
            all_output = []
            for instance, sequence in zip(instances, output.sequences):
                # Drop the prompt, keeping only the newly generated tokens.
                curr_out = sequence[
                    instance.init_context_size :  # noqa: E203
                ].tolist()

                if eos_token_id in curr_out:
                    curr_out = curr_out[: curr_out.index(eos_token_id)]

                if extra is not None:
                    curr_out = curr_out[: len(instance.sent1_tokens) + extra]

                all_output.append(
                    self.tokenizer.decode(
                        curr_out,
                        clean_up_tokenization_spaces=True,
                        skip_special_tokens=True,
                    )
                )
            # Joining with ". " can double up sentence-final punctuation;
            # collapse such runs into a single full stop.
            to_ret.append(re.sub(r"!?\??\.+", ".", ". ".join(all_output)))
        return to_ret

    def generate(self, sentence, top_p=None, max_outputs: int = 1):
        """
        Generate paraphrases for a sentence.

        sentence : str
            Sentence to paraphrase.
        top_p : float
            top_p sampling, between 0.0 and 1.0
            Default None, resorting to the paraphraser's own top_p value
            (set in the constructor or through `modify_p`).
        max_outputs : int
            Number of samples to generate for a sentence.
            Note: These will be the exact same if you use a greedy sampling (top_p=0.0),
            so if max_outputs > 1, make sure top_p != 0.0.

        Returns a list of paraphrased strings. When the basic model is loaded
        (two-step mode), the sentence is first paraphrased with the basic
        model and each result is then paraphrased with the style model.
        """
        if self.basic_model is None:
            return self._paraphrase(
                sentence=sentence,
                use_basic=False,
                top_p=top_p,
                max_outputs=max_outputs,
            )

        normalized = self._paraphrase(
            sentence=sentence,
            use_basic=True,
            top_p=top_p,
            max_outputs=max_outputs,
        )
        return [
            self._paraphrase(
                sentence=sentence_,
                use_basic=False,
                top_p=top_p,
                max_outputs=1,
            )[0]
            for sentence_ in normalized
        ]


# Sample code to demonstrate usage of this perturbation module.
# Run this file directly as a script to try the module out.

if __name__ == "__main__":
    # Command-line demo: paraphrase one sentence in the requested style,
    # first with top_p = 0.0 and then with the user-supplied top_p value.
    import argparse
    import sys

    parser = argparse.ArgumentParser()
    parser.add_argument("--style", default="Shakespeare", type=str)
    parser.add_argument(
        "--input_sentence",
        default="Hi there! How are you doing today? ",
        type=str,
    )
    parser.add_argument("--top_p_value", default=0.6, type=float)
    args = parser.parse_args()

    if not torch.cuda.is_available():
        print(
            "Please check if a GPU is available or your Pytorch installation is correct."
        )
        # Signal the failure to the calling shell with a non-zero status.
        sys.exit(1)

    print("Loading paraphraser...")
    paraphraser = StyleTransferParaphraser(args.style, upper_length="same_5")

    input_sentence = args.input_sentence
    paraphraser.modify_p(top_p=0.0)
    greedy_decoding = paraphraser.generate(input_sentence)
    print("\ngreedy sample:\n{}\n".format(greedy_decoding))

    # Use the --top_p_value option, which was previously parsed but ignored.
    # top_p is passed explicitly so the call does not depend on `modify_p`.
    top_p_sample = paraphraser.generate(
        input_sentence, top_p=args.top_p_value
    )
    print(
        "\ntop_p = {:.2f} sample:\n{}\n".format(
            args.top_p_value, top_p_sample
        )
    )

# NOTE: these statements sit outside the `__main__` guard, so they run every
# time the module is imported.
text = "William Shakespeare was an English playwright, poet, and actor, widely regarded as the greatest writer in the English language and the world's greatest dramatist. "
# Same lookup as in StyleTransferParaphraser.__init__: only download the
# punkt data when it is not installed yet, instead of on every import.
try:
    nltk.data.find("tokenizers/punkt")
except LookupError:
    nltk.download("punkt")
sent_text = nltk.sent_tokenize(text)