In [None]:
!pip install pytorch_lightning
!pip install nlp
!pip install transformers
!pip install flax

In [2]:
import argparse
import glob
import os
import json
import time
import logging
import random
import re
from itertools import chain
from string import punctuation
import textwrap
from tqdm.auto import tqdm
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

import pandas as pd
import numpy as np
import torch
import pytorch_lightning as pl
from torch.utils.data import Dataset, DataLoader
from pytorch_lightning.loggers import WandbLogger
from nlp import load_metric

from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    AutoTokenizer,
    get_linear_schedule_with_warmup
)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
! gdown 1aJIBtDs62oMZW7fQrKrO3IXCcA9V4fbh
! gdown 1EwyvKnHe2FjJFki3AQStyOcFA7XJ21pG
! gdown 1NW-_lj8A54e_fO2iNiu4AYxGTAjvUQev

In [4]:
with open('/content/train_merged.json') as f:
    data = json.load(f)
input, output = [], []
for i in data:
    for j in i['paragraphs']:
        for k in j['qas']:
            input.append('متن: ' + j['context'] + '، پرسش: ' + k['question'])
            temp = 'بدون پاسخ'
            if len(k['answers']):
                temp = k['answers'][0]['text']
            output.append(temp)
df = pd.DataFrame()
df['input'] = input
df['output'] = output

########################################################################################

with open('/content/test_merged.json') as f:
    data = json.load(f)
input, output = [], []
for i in data:
    for j in i['paragraphs']:
        for k in j['qas']:
            input.append('متن: ' + j['context'] + '، پرسش: ' + k['question'])
            temp = 'بدون پاسخ'
            if len(k['answers']):
                temp = ''
                for a in k['answers']:
                    temp = temp + ',' + a['text']
            output.append(temp)
df_test = pd.DataFrame()
df_test['input'] = input
df_test['output'] = output
len(df), len(df_test)

(68010, 6656)

In [5]:
df = df.iloc[50000:,:]

In [6]:
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
set_seed(42)

In [7]:
class bigbang(Dataset):
    def __init__(self, tokenizer, input_length, output_length):         
        self.dataset =  df
        self.input_length = input_length
        self.tokenizer = tokenizer
        self.output_length = output_length
    def __len__(self):
        return self.dataset.shape[0]
    def clean_text(self, text):
        text = text.replace('\n','')
        return text
    def convert_to_features(self, example_batch):
        input_ = self.clean_text(example_batch['input'])
        target_ = self.clean_text(example_batch['output'])
        source = self.tokenizer.batch_encode_plus([input_], max_length=self.input_length, 
                                                     padding='max_length', truncation=True, return_tensors="pt")
        targets = self.tokenizer.batch_encode_plus([target_], max_length=self.output_length, 
                                                     padding='max_length', truncation=True, return_tensors="pt")
        return source, targets
    def __getitem__(self, index):
        source, targets = self.convert_to_features(self.dataset.iloc[index])
        source_ids = source["input_ids"].squeeze()
        target_ids = targets["input_ids"].squeeze()
        src_mask    = source["attention_mask"].squeeze()
        target_mask = targets["attention_mask"].squeeze()
        return {"source_ids": source_ids, "source_mask": src_mask, "target_ids": target_ids, "target_mask": target_mask}  

In [8]:
model_path = '/content/drive/MyDrive/parsT5_QA/model_3'
configs = {
  "local_files_only": True,
  "from_flax": True
}

In [9]:
from google.colab import drive
drive.mount('./drive')

Mounted at ./drive


In [10]:
tokenizer = AutoTokenizer.from_pretrained(model_path, **configs)#('Ahmad/parsT5-base')#('google/mt5-base')
dataset = bigbang(tokenizer, 512, 150)
len(dataset)

18010

In [11]:
data = dataset[50]
print()
print("Shape of Tokenized Text: ", data['source_ids'].shape)
print()
print("Sanity check - Decode Text: ", tokenizer.decode(data['source_ids']))
print("====================================")
print("Sanity check - Decode Summary: ", tokenizer.decode(data['target_ids']))


Shape of Tokenized Text:  torch.Size([512])

Sanity check - Decode Text:  متن: بر این اساس ، همراه با محتوای منطقی ادعاها (جایی که محتوای منطقی با احتمال عکس متناسب است) ، پوپر به توسعه مفهوم مهم خود از واقعیت یا "درست بودن" ادامه داد. ایده شهودی در پشت واقعیت این است که ادعاها یا فرضیه های نظریه های علمی را می توان با توجه به میزان حقیقت و کذب آنها برآورد کرد. و از این طریق ، می توان یک نظریه را کم و بیش درست از نظریه دیگر بر مبنای کمی ارزیابی کرد ، پوپر با زور تأکید می کند ، هیچ ارتباطی با "احتمالات ذهنی" یا سایر ملاحظات صرف "معرفتی" ندارد.، پرسش: کدام اصطلاح به معنای مخالف واقعیت است؟</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><

In [12]:
def get_dataset(tokenizer, args):
      return bigbang(tokenizer=tokenizer, input_length=args.max_input_length, 
                        output_length=args.max_output_length)

In [13]:
class T5FineTuner(pl.LightningModule):
    def __init__(self, hparams):
        super(T5FineTuner, self).__init__()
        self.hparamss = hparams        
        self.tokenizer = AutoTokenizer.from_pretrained(model_path, **configs)
        self.model = T5ForConditionalGeneration.from_pretrained(model_path)
        # self.model = T5ForConditionalGeneration.from_pretrained(hparams.model_name_or_path)
        # self.tokenizer = AutoTokenizer.from_pretrained(hparams.tokenizer_name_or_path)
#         self.rouge_metric = load_metric('rouge')  
        n_observations_per_split = {
            "train": self.hparamss.n_train,
            "validation": self.hparamss.n_val,
            "test": self.hparamss.n_test,
        }
        self.n_obs = {k: v if v >= 0 else None for k, v in n_observations_per_split.items()}
    def freeze_params(self, model):
        for par in model.parameters():
            par.requires_grad = False
    def lmap(self, f, x):
        """list(map(f, x))"""
        return list(map(f, x))
    def is_logger(self):
        return self.trainer.global_rank <= 0
    def parse_score(self, result):
        return {k: round(v.mid.fmeasure * 100, 4) for k, v in result.items()} 
    def forward(
      self, input_ids, attention_mask=None, decoder_input_ids=None, decoder_attention_mask=None, labels=None
  ):
        return self.model(
            input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
            labels=labels,
    )
    def _step(self, batch):
        labels = batch["target_ids"]
        labels[labels[:, :] == self.tokenizer.pad_token_id] = -100
        outputs = self(
            input_ids=batch["source_ids"],
            attention_mask=batch["source_mask"],
            labels=labels,
            decoder_attention_mask=batch['target_mask'])
        loss = outputs[0]
        return loss
    def ids_to_clean_text(self, generated_ids):
        gen_text = self.tokenizer.batch_decode(
            generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True
        )
        return self.lmap(str.strip, gen_text)
    def _generative_step(self, batch) :
        t0 = time.time()
        generated_ids = self.model.generate(
            batch["source_ids"],
            attention_mask=batch["source_mask"],
            use_cache=True,
            decoder_attention_mask=batch['target_mask'],
            max_length=150, 
            num_beams=2,
            repetition_penalty=2.5, 
            length_penalty=1.0, 
            early_stopping=True)
        preds = self.ids_to_clean_text(generated_ids)
        target = self.ids_to_clean_text(batch["target_ids"]) 
        gen_time = (time.time() - t0) / batch["source_ids"].shape[0]  
        loss = self._step(batch)
        base_metrics = {'val_loss': loss}
        summ_len = np.mean(self.lmap(len, generated_ids))
        base_metrics.update(gen_time=gen_time, gen_len=summ_len, preds=preds, target=target)
#         self.rouge_metric.add_batch(preds, target)
        return base_metrics
    def training_step(self, batch, batch_idx):
        loss = self._step(batch)
        tensorboard_logs = {"train_loss": loss}
        return {"loss": loss, "log": tensorboard_logs}
    def training_epoch_end(self, outputs):
        avg_train_loss = torch.stack([x["loss"] for x in outputs]).mean()
        tensorboard_logs = {"avg_train_loss": avg_train_loss}
    def validation_step(self, batch, batch_idx):
        return self._generative_step(batch)
    def validation_epoch_end(self, outputs):
        avg_loss = torch.stack([x["val_loss"] for x in outputs]).mean()
        tensorboard_logs = {"val_loss": avg_loss}
#         rouge_results = self.rouge_metric.compute() 
#         rouge_dict = self.parse_score(rouge_results)
#         tensorboard_logs.update(rouge1=rouge_dict['rouge1'], rougeL=rouge_dict['rougeL'])
        self.target_gen= []
        self.prediction_gen=[]
        return {"avg_val_loss": avg_loss, 
#                 "rouge1" : rouge_results['rouge1'],
#                 "rougeL" : rouge_results['rougeL'],
                "log": tensorboard_logs, 'progress_bar': tensorboard_logs}
    def configure_optimizers(self):
        "Prepare optimizer and schedule (linear warmup and decay)"
        model = self.model
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
                "weight_decay": self.hparamss.weight_decay,
            },
            {
                "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
                "weight_decay": 0.0,
            },
        ]
        optimizer = AdamW(optimizer_grouped_parameters, lr=self.hparamss.learning_rate, eps=self.hparamss.adam_epsilon)
        self.opt = optimizer
        return [optimizer]
    def optimizer_step(self,
                     epoch=None,
                     batch_idx=None,
                     optimizer=None,
                     optimizer_idx=None,
                     optimizer_closure=None,
                     on_tpu=None,
                     using_native_amp=None,
                     using_lbfgs=None):
        optimizer.step()
        optimizer.zero_grad()
        optimizer_closure()
        self.lr_scheduler.step()
    def get_tqdm_dict(self):
        tqdm_dict = {"loss": "{:.3f}".format(self.trainer.avg_loss), "lr": self.lr_scheduler.get_last_lr()[-1]}
        return tqdm_dict
    def train_dataloader(self):   
        n_samples = self.n_obs['train']
        train_dataset = get_dataset(tokenizer=self.tokenizer, args=self.hparamss)
        dataloader = DataLoader(train_dataset, batch_size=self.hparamss.train_batch_size, drop_last=True, shuffle=True, num_workers=2)
        t_total = (
            (len(dataloader.dataset) // (self.hparamss.train_batch_size * max(1, self.hparamss.n_gpu)))
            // self.hparamss.gradient_accumulation_steps
            * float(self.hparamss.num_train_epochs)
        )
        scheduler = get_linear_schedule_with_warmup(
            self.opt, num_warmup_steps=self.hparamss.warmup_steps, num_training_steps=t_total
        )
        self.lr_scheduler = scheduler
        return dataloader
    def val_dataloader(self):
        n_samples = self.n_obs['validation']
        validation_dataset = get_dataset(tokenizer=self.tokenizer, args=self.hparamss)
        return DataLoader(validation_dataset, batch_size=self.hparamss.eval_batch_size, num_workers=2)
    def test_dataloader(self):
        n_samples = self.n_obs['test']
        test_dataset = get_dataset(tokenizer=self.tokenizer, args=self.hparamss)
        return DataLoader(test_dataset, batch_size=self.hparamss.eval_batch_size, num_workers=2)

In [14]:
logger = logging.getLogger(__name__)
class LoggingCallback(pl.Callback):
    def on_validation_end(self, trainer, pl_module):
        logger.info("***** Validation results *****")
        if pl_module.is_logger():
            metrics = trainer.callback_metrics
            for key in sorted(metrics):
                if key not in ["log", "progress_bar"]:
                    logger.info("{} = {}\n".format(key, str(metrics[key])))

    def on_test_end(self, trainer, pl_module):
        logger.info("***** Test results *****")
        if pl_module.is_logger():
            metrics = trainer.callback_metrics
            output_test_results_file = os.path.join(pl_module.hparams.output_dir, "test_results.txt")
            with open(output_test_results_file, "w") as writer:
                for key in sorted(metrics):
                    if key not in ["log", "progress_bar"]:
                        logger.info("{} = {}\n".format(key, str(metrics[key])))
                        writer.write("{} = {}\n".format(key, str(metrics[key])))

In [15]:
args_dict = dict(
    output_dir="", 
    model_name_or_path='Ahmad/parsT5-base',
    tokenizer_name_or_path='Ahmad/parsT5-base',
    max_input_length=512,
    max_output_length=150,
    freeze_encoder=False,
    freeze_embeds=False,
    learning_rate=3e-4,
    weight_decay=0.0,
    adam_epsilon=1e-8,
    warmup_steps=0,
    train_batch_size=4,
    eval_batch_size=4,
    num_train_epochs=1,
    gradient_accumulation_steps=8,
    n_gpu=1,
    resume_from_checkpoint=None, 
    val_check_interval = 10, 
    limit_val_batches = 0,
    n_val=5,
    n_train=-1,
    n_test=-1,
    early_stop_callback=False,
    fp_16=False, # if you want to enable 16-bit training then install apex and set this to true
    opt_level='O1', # you can find out more on optimisation levels here https://nvidia.github.io/apex/amp.html#opt-levels-and-properties
    max_grad_norm=1.0, # if you enable 16-bit training then set this to a sensible value, 0.5 is a good default
    seed=42,
)

In [16]:
!mkdir -p t5_bigbang

In [17]:
args_dict.update({'output_dir': 't5_bigbang', 'num_train_epochs':1,
                 'train_batch_size': 4, 'eval_batch_size': 4})
args = argparse.Namespace(**args_dict)
print(args_dict)

{'output_dir': 't5_bigbang', 'model_name_or_path': 'Ahmad/parsT5-base', 'tokenizer_name_or_path': 'Ahmad/parsT5-base', 'max_input_length': 512, 'max_output_length': 150, 'freeze_encoder': False, 'freeze_embeds': False, 'learning_rate': 0.0003, 'weight_decay': 0.0, 'adam_epsilon': 1e-08, 'warmup_steps': 0, 'train_batch_size': 4, 'eval_batch_size': 4, 'num_train_epochs': 1, 'gradient_accumulation_steps': 8, 'n_gpu': 1, 'resume_from_checkpoint': None, 'val_check_interval': 10, 'limit_val_batches': 0, 'n_val': 5, 'n_train': -1, 'n_test': -1, 'early_stop_callback': False, 'fp_16': False, 'opt_level': 'O1', 'max_grad_norm': 1.0, 'seed': 42}


In [18]:
checkpoint_callback = pl.callbacks.ModelCheckpoint(
    dirpath=args.output_dir, monitor="val_loss", mode="min", save_top_k=3
)
train_params = dict(
    accumulate_grad_batches=args.gradient_accumulation_steps,
    gpus=args.n_gpu,
    max_epochs=args.num_train_epochs,
    # early_stop_callback=False,
    precision= 16 if args.fp_16 else 32,
    amp_backend='apex',
    amp_level=args.opt_level,
    resume_from_checkpoint=args.resume_from_checkpoint,
    gradient_clip_val=args.max_grad_norm,
    # checkpoint_callback=checkpoint_callback,
    val_check_interval=args.val_check_interval,
    num_sanity_val_steps=0 ,
    limit_val_batches = 0,
    # logger=wandb_logger,
    callbacks=[LoggingCallback()],
)

In [19]:
model = T5FineTuner(args)

In [20]:
trainer = pl.Trainer(**train_params)

  f"Setting `Trainer(gpus={gpus!r})` is deprecated in v1.7 and will be removed"
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


In [21]:
trainer.fit(model)

  "When using `Trainer(accumulate_grad_batches != 1)` and overriding"
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 247 M 
-----------------------------------------------------
247 M     Trainable params
0         Non-trainable params
247 M     Total params
990.158   Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=1` reached.


In [22]:
!mkdir model
model.model.save_pretrained('./model')
model.tokenizer.save_pretrained('./model')

('./model/tokenizer_config.json',
 './model/special_tokens_map.json',
 './model/tokenizer.json')

### testing

In [23]:
df = df_test

In [24]:
tokenizer = AutoTokenizer.from_pretrained('/content/model')
model = T5ForConditionalGeneration.from_pretrained('/content/model')
dataset = bigbang(tokenizer, 512, 150)

In [25]:
model.to('cuda')
dec, targets = [], []
for i in range(len(df_test)//32):
    loader = DataLoader(dataset, 32, shuffle=True)
    it = iter(loader)
    batch = next(it)
    batch["source_ids"].shape
    outs = model.generate(
                batch["source_ids"].cuda(),
                attention_mask=batch["source_mask"].cuda(),
                use_cache=True,
                decoder_attention_mask=batch['target_mask'].cuda(),
                max_length=150, 
                num_beams=2,
                repetition_penalty=2.5, 
                length_penalty=1.0, 
                early_stopping=True
            )

    dec = dec + [tokenizer.decode(ids) for ids in outs]

# texts = [tokenizer.decode(ids) for ids in batch['source_ids']]
    targets = targets + [tokenizer.decode(ids) for ids in batch['target_ids']]

In [26]:
d_test = pd.DataFrame()
d_test['pred'] = dec
d_test['target'] = targets

In [27]:
d_test['pred'] = [i.replace('<pad>', '').replace('</s>', '').strip() for i in d_test['pred']]
d_test['target'] = [i.replace('<pad>', '').replace('</s>', '') for i in d_test['target']]

In [28]:
d_test

Unnamed: 0,pred,target
0,بدون پاسخ,بدون پاسخ
1,بدون پاسخ,",اسفنج,اسفنج,اسفنج"
2,بدون پاسخ,بدون پاسخ
3,نوآوری در فن آوری,",جزئی,جزئی,جزئی"
4,ثبت نام اقامت,بدون پاسخ
...,...,...
6651,34 میلیون پوند,",34 میلیون پوند,34 میلیون پوند,34 میلیون پوند ..."
6652,نمونه های زمین، ماه ، مریخ و شهاب سنگ ها,بدون پاسخ
6653,بدون پاسخ,بدون پاسخ
6654,بدون پاسخ,بدون پاسخ


In [29]:
d_test.to_csv('./model/outs.csv')

In [30]:
!cp -r ./model /content/drive/MyDrive/parsT5_QA/model_4