In [1]:
import numpy
# fine tune mt5 on dataset
from transformers import MT5ForConditionalGeneration, MT5Tokenizer
from transformers import Trainer, TrainingArguments
from datasets import load_dataset
from transformers import DataCollatorForSeq2Seq
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from simpletransformers.t5 import T5Model, T5Args
from transformers import pipeline
#import train split
import pandas as pd
from sklearn.model_selection import train_test_split
import sklearn.preprocessing
import torch
import torch.nn as nn
from google.transliteration import transliterate_word
import klib
import os
#bleu score
from nltk.translate.bleu_score import sentence_bleu

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
#tokenize
tokenizer = MT5Tokenizer.from_pretrained("google/mt5-small")
tokenizer.add_special_tokens({'additional_special_tokens': ['<sep>']})
tokenizer.add_special_tokens({'additional_special_tokens': ['<pad>']})
tokenizer.add_special_tokens({'additional_special_tokens': ['<s>']})
tokenizer.add_special_tokens({'additional_special_tokens': ['</s>']})
tokenizer.add_special_tokens({'additional_special_tokens': ['<unk>']})

maxlen = 512
def tokenize_df(df):
    target = tokenizer(df['sentence'], padding='max_length', truncation=True, return_tensors="pt", max_length=maxlen)
    input = tokenizer(df['english_translation'], padding='max_length', truncation=True, return_tensors="pt", max_length=maxlen)
    input_ids = input['input_ids']
    attention_mask = input['attention_mask']
    target_ids = target['input_ids']
    target_attention_mask = target['attention_mask']
    decoder_input_ids = target_ids.clone()
    #convert to tensors
    input_ids = torch.tensor(input_ids).squeeze()
    attention_mask = torch.tensor(attention_mask).squeeze()
    target_ids = torch.tensor(target_ids).squeeze()
    target_attention_mask = torch.tensor(target_attention_mask).squeeze()
   # decoder_input_ids = torch.tensor(decoder_input_ids)
    
    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'labels': target_ids,
        #'decoder_input_ids': decoder_input_ids,
        #'decoder_attention_mask': target_attention_mask
    }




In [3]:
train = load_dataset('csv', data_files='train.csv')
val = load_dataset('csv', data_files='val.csv')
test = load_dataset('csv', data_files='test.csv')
train = train.map(tokenize_df, batched=True, batch_size=128,remove_columns=['sentence','english_translation'])
val = val.map(tokenize_df, batched=True, batch_size=128,remove_columns=['sentence','english_translation'])
test = test.map(tokenize_df, batched=True, batch_size=128,remove_columns=['sentence','english_translation'])


Found cached dataset csv (/home/aparna/.cache/huggingface/datasets/csv/default-27c3049462760e81/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)
100%|██████████| 1/1 [00:00<00:00, 143.72it/s]
Found cached dataset csv (/home/aparna/.cache/huggingface/datasets/csv/default-e471cfb17a39f3d5/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)
100%|██████████| 1/1 [00:00<00:00, 426.99it/s]
Found cached dataset csv (/home/aparna/.cache/huggingface/datasets/csv/default-7fff95b9c64c70e2/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)
100%|██████████| 1/1 [00:00<00:00, 436.23it/s]
Loading cached processed dataset at /home/aparna/.cache/huggingface/datasets/csv/default-27c3049462760e81/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-28492876665d7691.arrow
Loading cached processed dataset at /home/aparna/.cache/huggingface/datasets/csv/default-e471cfb17a39f3d5/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe4354

In [4]:
import evaluate
import numpy as np
from nltk.tokenize import RegexpTokenizer

rouge_metric = evaluate.load("rouge")

def tokenize_sentence(arg):
    encoded_arg =tokenizer(arg)
    return tokenizer.convert_ids_to_tokens(encoded_arg.input_ids)

def metrics_func(eval_arg):
    preds, labels = eval_arg
    # Replace -100
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    # Convert id tokens to text
    text_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    text_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # Insert a line break (\n) in each sentence for ROUGE scoring
    # (Note : Please change this code, when you perform on other languages except for Japanese)
    text_preds = [(p if p.endswith(("!", "！", "?", "？", "。")) else p + "。") for p in text_preds]
    text_labels = [(l if l.endswith(("!", "！", "?", "？", "。")) else l + "。") for l in text_labels]
    sent_tokenizer_jp = RegexpTokenizer(u'[^!！?？。]*[!！?？。]')
    text_preds = ["\n".join(np.char.strip(sent_tokenizer_jp.tokenize(p))) for p in text_preds]
    text_labels = ["\n".join(np.char.strip(sent_tokenizer_jp.tokenize(l))) for l in text_labels]
    # compute ROUGE score with custom tokenization
    #blue
    #tokens 
    text_labels_tokens = [tokenize_sentence(l) for l in text_labels]
    text_preds_tokens = [tokenize_sentence(p) for p in text_preds]
    
    return rouge_metric.compute(
        predictions=text_preds,
        references=text_labels,
        tokenizer=tokenize_sentence
    ),sentence_bleu(text_labels_tokens,text_preds_tokens)

In [9]:
from torch.utils.data import DataLoader
#tokenizer = MT5Tokenizer.from_pretrained("./mt5")
def testing(model):
    metrics =[]
    sample_dataloader = DataLoader(
      test["train"].with_format("torch"),
      collate_fn=DataCollatorForSeq2Seq(tokenizer, model=model),
      batch_size=5)
    for batch in sample_dataloader:
      with torch.no_grad():
        preds = model.generate(
          batch["input_ids"],
          num_beams=15,
          num_return_sequences=1,
          no_repeat_ngram_size=1,
          remove_invalid_values=True,
          max_length=128,
        )
      labels = batch["labels"]
      metric = metrics_func([preds, labels])
      metrics.append(metric)
    return metrics

def average_metric(metrics):
    rouge = 0
    rouge2 = 0
    rougeL = 0
    rougeLsum = 0
    bleu = 0
    for metric in metrics:
        rouge += metric[0]['rouge1']
        rouge2 += metric[0]['rouge2']
        rougeL += metric[0]['rougeL']
        rougeLsum += metric[0]['rougeLsum']
        bleu += metric[1]
    return rouge/len(metrics),rouge2/len(metrics),rougeL/len(metrics),rougeLsum/len(metrics),bleu/len(metrics)
      

In [5]:
model = MT5ForConditionalGeneration.from_pretrained("/ssd_scratch/cvit/aparna/mt5-synthetic")
metrics = testing(model)

OSError: Can't load the configuration of '/ssd_scratch/cvit/aparna/mt5-synthetic'. If you were trying to load it from 'https://huggingface.co/models', make sure you don't have a local directory with the same name. Otherwise, make sure '/ssd_scratch/cvit/aparna/mt5-synthetic' is the correct path to a directory containing a config.json file

In [12]:
print("basecase_mt5")
scores = average_metric(metrics)
print("rouge:",scores[0])
print("rouge2:",scores[1])
print("rougeL:",scores[2])
print("rougeLsum:",scores[3])
print("bleu:",scores[4])


basecase_mt5
rouge: 0.4729317578176422
rouge2: 0.2991320730642171
rougeL: 0.4538605700654378
rougeLsum: 0.4561573986969596
bleu: 0.0


In [6]:
from torch.utils.data import DataLoader

# Predict with test data (first 5 rows)
sample_dataloader = DataLoader(
  test["train"].with_format("torch"),
  collate_fn=DataCollatorForSeq2Seq(tokenizer, model=model),
  batch_size=5)
for batch in sample_dataloader:
  with torch.no_grad():
    preds = model.generate(
      batch["input_ids"],
      num_beams=15,
      num_return_sequences=1,
      no_repeat_ngram_size=1,
      remove_invalid_values=True,
      max_length=128,
    )
  labels = batch["labels"]
  inputs = batch["input_ids"]
  break

# Replace -100 (see above)
inputs = np.where(inputs != -100, inputs, tokenizer.pad_token_id)
labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

# Convert id tokens to text
text_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
text_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
text_inputs = tokenizer.batch_decode(inputs, skip_special_tokens=True)


# Show result
print("***** Input's Text *****")
print(text_inputs[2])
print("***** codemix (True Value) *****")
print(text_labels[2])
print("***** codemix (Generated Text) *****")
print(text_preds[2])

NameError: name 'model' is not defined

In [16]:
for i in range(5):
    print("***** Input's Text *****")
    print(text_inputs[i])
    print("***** codemix (True Value) *****")
    print(text_labels[i])
    print("***** codemix (Generated Text) *****")
    print(text_preds[i])

***** Input's Text *****
@hurdangi haan.. @sagarikaghose sister will eat green mango today @the_hindu
***** codemix (True Value) *****
@hurdangi haan.. @sagarikaghose Didi aaj hare rang ke aam khaengi @the_hindu
***** codemix (Generated Text) *****
@hurdangi haan.@sagarikaghose bhai green mango peene ke saath kharab kar jaao #the_hindu
***** Input's Text *****
wait brother, do not cry this much, its #GST not a bomb. have some shame. @digvijaya_28 @INCIndia " country brought it out "now you sit and cry
***** codemix (True Value) *****
Are bas kar bhai itna nahi rone "ka #GST hai bomb nahi. Kuch to sharm karo. @digvijaya_28 @INCIndia " desh nikal liya "aage u sit and cry
***** codemix (Generated Text) *****
wait bhai, do not cry this much #GST nahi bomb. Haan kuch ho chuka hai @digvijaya_28@INCIndia " country brought it out"
***** Input's Text *****
@rynkee it is thi thought which we want to change. @PunsTurnMeOn
***** codemix (True Value) *****
@rynkee yehi soch to badalni hai @PunsTurn