In [1]:
%load_ext autoreload
%autoreload 2
    
%load_ext tensorboard
import sys, os
sys.path.append('../paraphrase/')
from paraphraser_args import ModelArguments, DataTrainingArguments, TrainingArguments
from paraphraser_dataloader import load_dataset_pseudo, load_dataset_pseudo_binary
from paraphraser_trainer import ParaphraserTrainer
from transformers import AutoTokenizer, AutoModelWithLMHead, HfArgumentParser

In [12]:
def get_model_tokenizer(task, model_nick):
    """Return the ``(tokenizer, model)`` pair for a fine-tuned checkpoint.

    A training-style argument list is assembled and parsed with
    ``HfArgumentParser``; of the parsed dataclasses only
    ``model_args.model_name_or_path`` is read afterwards, to pick the
    tokenizer. The model weights are loaded from ``../models/<model_nick>``.

    Args:
        task: Accepted for call-site symmetry; not referenced in the body.
        model_nick: Name of the checkpoint sub-directory under ``../models/``.

    Returns:
        tuple: ``(tokenizer, model)``.
    """
    models_root = "../models/"
    checkpoint_dir = os.path.join(models_root, model_nick)
    step_interval = "800"  # shared by --logging_steps and --save_steps

    # (flag, value) pairs in command-line order; a value of None marks a
    # bare switch that takes no argument.
    cli_options = (
        ("--model_name_or_path", "t5-small"),
        ("--model_nick", model_nick),
        ("--data_dir", "../data/pseudo"),
        ("--output_dir", checkpoint_dir),
        ("--cache_dir", os.path.join(models_root, "cache")),
        ("--overwrite_cache", None),
        ("--per_device_train_batch_size", "16"),
        ("--per_device_eval_batch_size", "16"),
        ("--max_seq_len", "64"),
        ("--gradient_accumulation_steps", "1"),
        ("--num_train_epochs", "5"),
        ("--logging_steps", step_interval),
        ("--save_steps", step_interval),
        ("--data_parallel", "True"),
        ("--meta_task", "transfer"),
        ("--meta_task_type", "binary_single"),
    )
    argv = []
    for flag, value in cli_options:
        argv.append(flag)
        if value is not None:
            argv.append(value)

    arg_parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
    model_args, _data_args, _training_args = arg_parser.parse_args_into_dataclasses(argv)

    # Eval: tokenizer comes from the parsed model name, weights from the
    # checkpoint directory.
    tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path)
    model = AutoModelWithLMHead.from_pretrained(checkpoint_dir)
    return tokenizer, model

In [13]:
# Load one (tokenizer, model) pair per binary style-transfer checkpoint.
shake_bin_tok, shake_bin_model = get_model_tokenizer(
    task='shakespeare', model_nick='t5_transfer_shakespeare_binary'
)
wiki_bin_tok, wiki_bin_model = get_model_tokenizer(
    task='wiki', model_nick='t5_transfer_wiki_binary'
)
abs_bin_tok, abs_bin_model = get_model_tokenizer(
    task='abstract', model_nick='t5_transfer_abstract_binary'
)


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
loading configuration file https://huggingface.co/t5-small/resolve/main/config.json from cache at /home/vivek/.cache/huggingface/transformers/fe501e8fd6425b8ec93df37767fcce78ce626e34cc5edc859c662350cf712e41.406701565c0afd9899544c1cb8b93185a76f00b31e5ce7f6e18bbaef02241985
Model config T5Config {
  "architectures": [
    "T5WithLMHeadModel"
  ],
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 512,
  "decoder_start_token_id": 0,
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 6,
  "num_heads": 8,
  "num_layers": 6,
  "out

In [14]:
def get_preds(paraphrases, tokenizer, model):
    """Generate style-transferred variants for every input paraphrase.

    Each paraphrase is prefixed with ``"transfer: "``, tokenized with
    ``tokenizer`` and passed to ``model.generate`` with 12 beams split into
    3 groups, a diversity penalty of 0.5 and 3 returned sequences.

    Args:
        paraphrases: Iterable of input sentences (strings).
        tokenizer: Callable returning an object with ``input_ids`` and
            ``attention_mask`` attributes, and providing ``batch_decode``.
        model: Object exposing ``generate``.

    Returns:
        list: Decoded outputs for all paraphrases, concatenated in input
        order into one flat list. Empty input yields an empty list.
    """
    transfers = []
    for paraphrase in paraphrases:
        encoded = tokenizer("transfer: " + paraphrase, return_tensors='pt')
        generated = model.generate(
            input_ids=encoded.input_ids,
            attention_mask=encoded.attention_mask,
            max_length=70,
            num_beams=12,
            encoder_no_repeat_ngram_size=5,
            num_beam_groups=3,
            diversity_penalty=0.5,
            num_return_sequences=3,
        )
        # Decode with the tokenizer passed in (not a notebook global) and
        # accumulate, so earlier paraphrases are not overwritten by later ones.
        transfers.extend(tokenizer.batch_decode(generated, skip_special_tokens=True))
    return transfers


In [20]:
# Run one sample sentence through each loaded style model and print the
# list of transfers returned for it, one line per model.
paraphrases = ["Our project is called Marvin, and it has the latest advancements in ML"]
for _tok, _model in (
    (shake_bin_tok, shake_bin_model),
    (wiki_bin_tok, wiki_bin_model),
    (abs_bin_tok, abs_bin_model),
):
    print(get_preds(paraphrases, _tok, _model))


['Our project is Marvin, and hath the latest advancements of ML.', 'Our project is Marvin, and has the latest advancement in ML.', 'Our project is Marvin, and has the latest advancement of ML.']
['Our project, called Marvin, has the latest advances in ML', 'Our project, called Marvin, has the latest advancement in ML', 'Our project, called Marvin, has the latest advances in ML.']
['Our project, Marvin, has state-of-the-art advances in ML', 'Our project, called Marvin, has state-of-the-art advancements in the field of ML', 'Our project, called Marvin, has the state-of-the-art advancements in the field of ML']


In [None]:
# Scratch cell: rebinds `paraphrases` to a list holding a single empty string.
# NOTE(review): this cell has no execution count and contains no print/call,
# so the output lists shown beneath it were presumably produced by an earlier
# version of the cell — confirm, then re-run or clear the stale output.
paraphrases = [""]

['Dost thou think Marvin’s text editors next generation?', 'Is Marvin the next generation of editors of text?', 'Dost thou think Marvin is next generation of text editor?']

['Consider Marvin as the next generation of Text Editors?', 'Consider Marvin as the next generation text editors?', 'Think Marvin is a new generation of text editors']

['Marvin is a tool that can help people unfamiliar with the english language', 'Marvin is a tool that can help people unfamiliar with the english language.', 'Marvin is a tool that can help people unfamiliar with the language of english']