In [1]:
!pip install torch
!pip install sentencepiece
!pip install transformers



In [2]:
import torch
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
model_name = 'tuner007/pegasus_paraphrase'
torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name).to(torch_device)

In [3]:
device = torch_device
from torch.utils.data import Dataset, DataLoader
import os
import pandas as pd

In [4]:
train_encodings_1 = tokenizer.batch_encode_plus(pd.read_csv('data/ms_all.csv')[['#1 String']].values[:,0], truncation=True, padding=True)
train_encodings_2 = tokenizer.batch_encode_plus(pd.read_csv('data/ms_all.csv')[['#2 String']].values[:,0], truncation=True, padding=True)

In [5]:
import torch

class MSDataset(torch.utils.data.Dataset):
    def __init__(self, encodings_1, encodings_2):
        self.inputs = train_encodings_1
        self.targets = train_encodings_2

    def __getitem__(self, idx):
        input = {key: torch.tensor(val[idx]) for key, val in self.inputs.items()}
        target = {'decoder_'+str(key): torch.tensor(val[idx]) for key, val in self.targets.items()}
        input["labels"] = target["decoder_input_ids"]
        return {**input, **target}

    def __len__(self):
        return len(self.inputs['input_ids'])

In [6]:
train_dataset = MSDataset(train_encodings_1, train_encodings_2)

In [7]:
from transformers import Trainer, TrainingArguments

In [12]:
for param in model.base_model.parameters():
    param.requires_grad = True

In [9]:
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # total # of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=16,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
)

In [10]:
trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=train_dataset            # evaluation dataset
)

In [None]:
trainer.train()

Step,Training Loss


In [11]:
def get_response(input_text):
    encoding = tokenizer.prepare_seq2seq_batch(input_text, return_tensors="pt")
    input_ids, attention_masks = encoding["input_ids"].to(torch_device), encoding["attention_mask"].to(torch_device)
    translated = model.generate(input_ids=input_ids, 
                                attention_mask=attention_masks,
                                do_sample=True,
                                min_length= len(input_text[0].split()) - 2,
                                top_k=120,
                                top_p=0.95,
                                temperature=0.98,
                                early_stopping=True,
                                num_return_sequences=1,
                                no_repeat_ngram_size = 3
                               )
    
    tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)
    return tgt_text

In [12]:
sentence = "While it’s easy to get starstruck by its Pro sibling, the OnePlus 9 is a capable alternative at a lower cost. You’re looking at quite a gap, especially in the US where (for some unknown reason), the base 8/128 GB version of the 9 Pro is not available. This means you can have the vanilla phone for $730 or the 12/256 GB Pro for $1,070"
context = sentence.split('.')
if len(context[-1]) < 5:
    context = context[:-1]
target = get_response(context)

print(sentence)
print()
print(' '.join(target))



While it’s easy to get starstruck by its Pro sibling, the OnePlus 9 is a capable alternative at a lower cost. You’re looking at quite a gap, especially in the US where (for some unknown reason), the base 8/128 GB version of the 9 Pro is not available. This means you can have the vanilla phone for $730 or the 12/256 GB Pro for $1,070

While While While it it is it is is is it it it has it it''sss it it its its its it's it itss its itsss itss itssitsitsits its its's itsits itsitsitsssinsinsins ))”” ()))),,)”)”)”)””)”),”)”),”),”),”)”)););”((();););;;;););))”)”);;);;((””));””)”)()()()(( It It ItItItIt ItIt It It it it is is is that that that it it that it is it it it a it it the the the it the a a it a a the the an a a a be a a an an an a an the the a an a
