In [None]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '1'

If the data is not here, uncomment the lines below and download it

In [3]:
# !wget https://github.com/skoltech-nlp/detox/releases/download/emnlp2021/filtered_paranmt.zip
# !unzip filtered_paranmt.zip

# Train a paraphraser on the mined data

In [2]:
import pandas as pd

In [3]:
df = pd.read_csv('filtered.tsv', sep='\t', encoding='utf-8')
print(df.shape)
df.sample(10)

(223823, 7)


Unnamed: 0.1,Unnamed: 0,reference,translation,similarity,lenght_diff,ref_tox,trn_tox
191205,191205,She must be out there somewhere.,"he must be out there, damn it.",0.629855,0.060606,6.1e-05,0.995908
197885,197885,"""Nuts!","""bullshit!",0.743836,0.363636,0.028349,0.999605
143495,143495,"You are only about one half loser, the other h...","from the second half, you're the winner.",0.778499,0.293103,0.93293,5.2e-05
112935,112935,Kick some NSC booty.,kick some NSC asses.,0.901688,0.0,0.35905,0.998997
52732,52732,Because it's where they used to keep prisoners...,because they held prisoners there before they ...,0.837865,0.1625,0.953626,0.074862
95104,95104,He wants us to blow him away!,he wants us to kill him!,0.766668,0.166667,0.000662,0.999194
200609,200609,'You don't bash a man's brains out when he's t...,"""you can't break the head of a man tied up in ...",0.757032,0.111111,0.89377,0.006597
85321,85321,"""Here's the bats, if you got the balls"".","""take them, if you have the balls!""",0.786242,0.121951,0.000385,0.998311
73313,73313,I don't want to fucking hear it.,I don't want to hear anything.,0.828845,0.060606,0.980057,5.8e-05
146935,146935,"One, you can be a waitress, or you can be a ca...",one that you're gonna be a waitress...... or y...,0.708999,0.214286,0.000142,0.999406


In [4]:
(df.ref_tox > df.trn_tox).mean()

0.5521639867216506

In [5]:
xx = []
yy = []
for i, row in df.iterrows():
    if row.ref_tox > row.trn_tox:
        xx.append(row.reference)
        yy.append(row.translation)
    else:
        yy.append(row.reference)
        xx.append(row.translation)
        
xydf = pd.DataFrame({'source': xx, 'target': yy})

# Prepare datasets

In [6]:
from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    T5Tokenizer, T5TokenizerFast,
    get_linear_schedule_with_warmup
)
import torch

In [7]:
model_name = "ceshine/t5-paraphrase-paws-msrp-opinosis"

In [8]:
tokenizer = T5TokenizerFast.from_pretrained(model_name)

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
df_train, df_test = train_test_split(xydf, test_size=300)
print(df_train.shape[0], df_test.shape[0])

223523 300


In [11]:
%%time

x1 = tokenizer(df_train.source.tolist(), truncation=True)
y1 = tokenizer(df_train.target.tolist(), truncation=True)
x2 = tokenizer(df_test.source.tolist(), truncation=True)
y2 = tokenizer(df_test.target.tolist(), truncation=True)

CPU times: user 56.1 s, sys: 1.78 s, total: 57.9 s
Wall time: 8.06 s


In [12]:
class PairsDataset(torch.utils.data.Dataset):
    def __init__(self, x, y):
        self.x = x
        self.y = y

    def __getitem__(self, idx):
        assert idx < len(self.x['input_ids'])
        item = {key: val[idx] for key, val in self.x.items()}
        item['decoder_attention_mask'] = self.y['attention_mask'][idx]
        item['labels'] = self.y['input_ids'][idx]
        return item
    
    @property
    def n(self):
        return len(self.x['input_ids'])

    def __len__(self):
        return self.n # * 2
    
train_dataset = PairsDataset(x1, y1)
test_dataset = PairsDataset(x2, y2)
len(train_dataset), len(test_dataset)

(223523, 300)

In [13]:
from torch.utils.data import Dataset, DataLoader

In [14]:
train_dataloader = DataLoader(train_dataset, batch_size=4, drop_last=True, shuffle=True, num_workers=1)
test_dataloader = DataLoader(test_dataset, batch_size=4, drop_last=True, shuffle=True, num_workers=1)

# Fine tune t5

In [15]:
from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    T5Tokenizer,
    get_linear_schedule_with_warmup
)
import torch

In [16]:
checkpoint_name = 'SkolkovoInstitute/t5-paraphrase-paws-msrp-opinosis-paranmt'

In [17]:
model = T5ForConditionalGeneration.from_pretrained(checkpoint_name)

In [18]:
device = torch.device('cuda:0')
model.to(device);

In [19]:
import transformers

In [20]:
from transformers import Trainer, TrainingArguments
from transformers.file_utils import cached_property
from typing import Tuple

class TrAr(TrainingArguments):
    @cached_property
    def _setup_devices(self):
        return device

In [21]:
from typing import List, Dict, Union

class DataCollatorWithPadding:
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        batch = self.tokenizer.pad(
            features,
            padding=True,
        )
        ybatch = self.tokenizer.pad(
            {'input_ids': batch['labels'], 'attention_mask': batch['decoder_attention_mask']},
            padding=True,
        ) 
        batch['labels'] = ybatch['input_ids']
        batch['decoder_attention_mask'] = ybatch['attention_mask']
        
        return {k: torch.tensor(v) for k, v in batch.items()}

In [22]:
save_name = 'models/t5-cechine-nmt-mined-detox'

todo: maybe, batch > 4 would do as well

In [23]:
training_args = TrAr(
    output_dir=save_name,   # output directory
    overwrite_output_dir=True,
    num_train_epochs=3,             # total # of training epochs
    per_device_train_batch_size=4,  # batch size per device during training
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=8,    # batch size for evaluation
    warmup_steps=300,               # number of warmup steps for learning rate scheduler
    weight_decay=0,                  # strength of weight decay
    learning_rate=3e-5,
    logging_dir='./logs',           # directory for storing logs
    logging_steps=100,
    eval_steps=100,
    evaluation_strategy='steps',
    save_total_limit=1,
    save_steps=5000,
)

In [24]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [25]:
trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=test_dataset,           # evaluation dataset
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [26]:
import gc
gc.collect()
torch.cuda.empty_cache();

In [None]:
trainer.train()

Step,Training Loss,Validation Loss,Runtime,Samples Per Second
100,1.0846,0.793464,1.2292,244.053
200,1.0902,0.788172,1.2268,244.531
300,1.1094,0.785306,1.2381,242.309
400,1.0773,0.784683,1.2171,246.494
500,1.0891,0.783029,1.2592,238.245
600,1.0772,0.781447,1.2602,238.06
700,1.0769,0.77966,1.2125,247.413
800,1.0579,0.778833,1.2225,245.389
900,1.0781,0.778808,1.2238,245.132
1000,1.0673,0.778335,1.2233,245.228


In [28]:
trainer.evaluate()

{'eval_loss': 0.7315998077392578,
 'eval_runtime': 1.383,
 'eval_samples_per_second': 216.921,
 'epoch': 3.0}

In [29]:
model.eval();

In [30]:
inputs = tokenizer('The internal policy of the fucking Trump is stupid.', return_tensors='pt')
inputs = {k: v.to(device) for k, v in inputs.items()}
for t in model.generate(**inputs, num_return_sequences=10, do_sample=False, num_beams=10):
    print(tokenizer.decode(t, skip_special_tokens=True))

the Trump administration's internal policy is nonsense.
the internal policy of Donald Trump is nonsense.
the internal policy of President Trump is nonsense.
the Trump administration's internal policy is crazy.
the president's internal policy is nonsense.
the internal policy of Mr. Trump is nonsense.
the Trump administration's internal policy is stupid.
the internal policy of Trump is nonsense.
the Trump administration's internal policy is bad.
the Trump internal policy is nonsense.


In [31]:
model.save_pretrained(save_name)