In [1]:
import numpy
# fine tune mt5 on dataset
from transformers import MT5ForConditionalGeneration, MT5Tokenizer
from transformers import Trainer, TrainingArguments
from datasets import load_dataset
from transformers import DataCollatorForSeq2Seq
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from simpletransformers.t5 import T5Model, T5Args
from transformers import pipeline
#import train split
import pandas as pd
from sklearn.model_selection import train_test_split
import sklearn.preprocessing
import torch
import torch.nn as nn
from google.transliteration import transliterate_word
import klib
import os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
#load dataset
#English-Hindi code-mixed parallel corpus.csv
df = pd.read_csv('PHNC/English-Hindi code-mixed parallel corpus.csv')
df = df.dropna()
df = df.reset_index(drop=True)
# add column for prefix
df['prefix'] = 'translate English to Hinglish: '
df.head()


Unnamed: 0,Sentence,English_Translation,prefix
0,@someUSER congratulations on you celebrating b...,@some users congratulate you for celebrating B...,translate English to Hinglish:
1,@LoKarDi_RT uske liye toh bahot kuch karna pad...,"@Lokardi_ rat we should a lot more for that, b...",translate English to Hinglish:
2,@slimswamy yehi to hum semjhane ki koshish kar...,"@Slimswami ehi, this is what i'm expecting you...",translate English to Hinglish:
3,@DramebaazKudi cake kaha hai ??,@Where is Dramebajakudi where is the cake?,translate English to Hinglish:
4,@someUSER i'm in hawaii at the moment . home ...,@some user Don't want to come home next friday...,translate English to Hinglish:


In [3]:
#data cleaning 

df=klib.data_cleaning(df)

Shape of cleaned data: (13737, 2) - Remaining NAs: 0


Dropped rows: 1
     of which 1 duplicates. (Rows (first 150 shown): [1091])

Dropped columns: 1
     of which 1 single valued.     Columns: ['prefix']
Dropped missing values: 0
Reduced memory by at least: 0.1 MB (-32.26%)



In [4]:
#split train, val, test
# convert df  so that it can be used by transformers


train, test = train_test_split(df, test_size=0.2, random_state=42)
train, val = train_test_split(train, test_size=0.2, random_state=42)
train = train.reset_index(drop=True)
val = val.reset_index(drop=True)
test = test.reset_index(drop=True)

#print lens
print(len(train))
print(len(val))
print(len(test))

#save train, val, test
train.to_csv('train.csv', index=False)
val.to_csv('val.csv', index=False)
test.to_csv('test.csv', index=False)


8791
2198
2748


In [17]:
df.columns

Index(['sentence', 'english_translation'], dtype='object')

In [5]:
#tokenize
tokenizer = MT5Tokenizer.from_pretrained("google/mt5-small")
tokenizer.add_special_tokens({'additional_special_tokens': ['<sep>']})
tokenizer.add_special_tokens({'additional_special_tokens': ['<pad>']})
tokenizer.add_special_tokens({'additional_special_tokens': ['<s>']})
tokenizer.add_special_tokens({'additional_special_tokens': ['</s>']})
tokenizer.add_special_tokens({'additional_special_tokens': ['<unk>']})



0

In [24]:
df['sentence']

0        @someUSER congratulations on you celebrating b...
1        @LoKarDi_RT uske liye toh bahot kuch karna pad...
2        @slimswamy yehi to hum semjhane ki koshish kar...
3                          @DramebaazKudi cake kaha hai ??
4        @someUSER i'm in hawaii at the moment .  home ...
                               ...                        
13732    Dr Kumar Vishwas: "Koi deewana kehta hai.. koi...
13733    Me: Aaj kuch toofani karte hai.

Mom: Pani ki ...
13734    Pyar mangi to Jaan dengi,milk mango to kher de...
13735     @imcomplicated__ kaale kaale baal gaal gore gore
13736                            Ye sab aunty'on ke saath?
Name: sentence, Length: 13737, dtype: string

In [11]:
maxlen = 512
def tokenize_df(df):
    target = tokenizer(df['sentence'], padding='max_length', truncation=True, return_tensors="pt", max_length=maxlen)
    input = tokenizer(df['english_translation'], padding='max_length', truncation=True, return_tensors="pt", max_length=maxlen)
    input_ids = input['input_ids']
    attention_mask = input['attention_mask']
    target_ids = target['input_ids']
    target_attention_mask = target['attention_mask']
    decoder_input_ids = target_ids.clone()
    #convert to tensors
    input_ids = torch.tensor(input_ids).squeeze()
    attention_mask = torch.tensor(attention_mask).squeeze()
    target_ids = torch.tensor(target_ids).squeeze()
    target_attention_mask = torch.tensor(target_attention_mask).squeeze()
   # decoder_input_ids = torch.tensor(decoder_input_ids)
    
    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'labels': target_ids,
        #'decoder_input_ids': decoder_input_ids,
        #'decoder_attention_mask': target_attention_mask
    }


In [12]:
#tokenize train, val, test
train = load_dataset('csv', data_files='train.csv')
val = load_dataset('csv', data_files='val.csv')
test = load_dataset('csv', data_files='test.csv')
train = train.map(tokenize_df, batched=True, batch_size=128,remove_columns=['sentence','english_translation'])
val = val.map(tokenize_df, batched=True, batch_size=128,remove_columns=['sentence','english_translation'])
test = test.map(tokenize_df, batched=True, batch_size=128,remove_columns=['sentence','english_translation'])


Found cached dataset csv (/home/aparna/.cache/huggingface/datasets/csv/default-a95dd74f5c6e7a88/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)
100%|██████████| 1/1 [00:00<00:00, 680.78it/s]
Found cached dataset csv (/home/aparna/.cache/huggingface/datasets/csv/default-0292b3f7acb658fe/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)
100%|██████████| 1/1 [00:00<00:00, 450.81it/s]
Found cached dataset csv (/home/aparna/.cache/huggingface/datasets/csv/default-024ca5486b0891b6/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)
100%|██████████| 1/1 [00:00<00:00, 574.01it/s]
  input_ids = torch.tensor(input_ids).squeeze()
  attention_mask = torch.tensor(attention_mask).squeeze()
  target_ids = torch.tensor(target_ids).squeeze()
  target_attention_mask = torch.tensor(target_attention_mask).squeeze()
                                                                 

In [13]:
train
#get sample 
sample = train['train'][0]
sample
#print shapes
print(len(sample['input_ids']))
print(len(sample['attention_mask']))
#print(len(sample['decoder_input_ids']))
#print(len(sample['decoder_attention_mask']))


512
512


In [28]:
val

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 2198
    })
})

In [29]:
# from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
# batch_size = 8
# train_dataloader = DataLoader(

#             train,  # The training samples.

#             sampler = RandomSampler(train), # Select batches randomly

#             batch_size = batch_size # Trains with this batch size.

#         )

# validation_dataloader = DataLoader(

#             val, # The validation samples.

#             sampler = SequentialSampler(val), # Pull out batches sequentially.

#             batch_size = batch_size # Evaluate with this batch size.

#         )

# test_dataloader = DataLoader(


#             test, # The validation samples. 

#             sampler = SequentialSampler(test), # Pull out batches sequentially.

#             batch_size = batch_size # Evaluate with this batch size.

#         )

# #test train data loader
# for batch in train_dataloader:

#     print(batch)

#     break


In [14]:
import evaluate
import numpy as np
from nltk.tokenize import RegexpTokenizer

rouge_metric = evaluate.load("rouge")

def tokenize_sentence(arg):
    encoded_arg =tokenizer(arg)
    return tokenizer.convert_ids_to_tokens(encoded_arg.input_ids)

def metrics_func(eval_arg):
    preds, labels = eval_arg
    # Replace -100
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    # Convert id tokens to text
    text_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    text_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # Insert a line break (\n) in each sentence for ROUGE scoring
    # (Note : Please change this code, when you perform on other languages except for Japanese)
    text_preds = [(p if p.endswith(("!", "！", "?", "？", "。")) else p + "。") for p in text_preds]
    text_labels = [(l if l.endswith(("!", "！", "?", "？", "。")) else l + "。") for l in text_labels]
    sent_tokenizer_jp = RegexpTokenizer(u'[^!！?？。]*[!！?？。]')
    text_preds = ["\n".join(np.char.strip(sent_tokenizer_jp.tokenize(p))) for p in text_preds]
    text_labels = ["\n".join(np.char.strip(sent_tokenizer_jp.tokenize(l))) for l in text_labels]
    # compute ROUGE score with custom tokenization
    return rouge_metric.compute(
        predictions=text_preds,
        references=text_labels,
        tokenizer=tokenize_sentence
    )

Using the latest cached version of the module from /home/aparna/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--rouge/b01e0accf3bd6dd24839b769a5fda24e14995071570870922c71970b3a6ed886 (last modified on Mon Mar 20 18:02:43 2023) since it couldn't be found locally at evaluate-metric--rouge, or remotely on the Hugging Face Hub.


In [31]:
# finetuen mt5
os.environ["WANDB_DISABLED"] = "true"
model = MT5ForConditionalGeneration.from_pretrained("google/mt5-small")
model.resize_token_embeddings(len(tokenizer))

#training args


training_args = Seq2SeqTrainingArguments(
  output_dir = "mt5-codemixed",
  log_level = "error",
  num_train_epochs = 10,
  learning_rate = 5e-4,
  lr_scheduler_type = "linear",
  warmup_steps = 90,
  optim = "adafactor",
  weight_decay = 0.01,
  per_device_train_batch_size = 2,
  per_device_eval_batch_size = 1,
  gradient_accumulation_steps = 16,
  evaluation_strategy = "steps",
  eval_steps = 100,
  predict_with_generate=True,
  generation_max_length = 128,
  save_steps = 500,
  logging_steps = 10,
  push_to_hub = False
)


#trainer
trainer = Seq2SeqTrainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train["train"],         # training dataset
    eval_dataset=val["train"],             # evaluation dataset
    tokenizer=tokenizer,               # tokenizer
    data_collator=DataCollatorForSeq2Seq(tokenizer, model=model), # data collator
    
)

#train
trainer.train()

#save model
trainer.save_model("./mt5")

  0%|          | 3/2740 [12:55<196:31:16, 258.49s/it]


In [15]:
from torch.utils.data import DataLoader
model = MT5ForConditionalGeneration.from_pretrained("./mt5v1")
#tokenizer = MT5Tokenizer.from_pretrained("./mt5")


sample_dataloader = DataLoader(
  test["train"].with_format("torch"),
  collate_fn=DataCollatorForSeq2Seq(tokenizer, model=model),
  batch_size=5)
for batch in sample_dataloader:
  with torch.no_grad():
    preds = model.generate(
      batch["input_ids"],
      num_beams=15,
      num_return_sequences=1,
      no_repeat_ngram_size=1,
      remove_invalid_values=True,
      max_length=128,
    )
  labels = batch["labels"]
  break
print(preds, labels)
metrics_func([preds, labels])

tensor([[     0,   1250,   7800, 198280,    259,  65334,    260,   1061,  39084,
          19349,    318,  51571,  62342,  16263, 144044,    603,   2148,    513,
           2941,    334,   1312,  88806,   2050,    432,    262,    268,    387,
           1759,    290, 165794,      1,      0,      0,      0,      0,      0,
              0,      0,      0],
        [     0,  27696,  62342,    259,    261,    342,    776, 113865,    714,
           2829,    387, 102339,   9065,  42716,    260,   1816,    321,  12961,
            623,   3663,    479,   1776,   1250,   6253, 182594,    262,    290,
           1795,   1061, 146525,  56696,    313,  11395,    330,  35714,    609,
           1350,    311,      1],
        [     0,   1250,  38393,    265,    299,    609,    339,   4592,   6504,
            259,   1542,    787,   3007,    288,   6313,    260,   1061,    559,
            604,    263, 152418,   7925,   7954,      1,      0,      0,      0,
              0,      0,      0,      0, 

{'rouge1': 0.47184469277492536,
 'rouge2': 0.32362358952522885,
 'rougeL': 0.44858887882143694,
 'rougeLsum': 0.44858887882143694}

In [16]:
from torch.utils.data import DataLoader

# Predict with test data (first 5 rows)
sample_dataloader = DataLoader(
  test["train"].with_format("torch"),
  collate_fn=DataCollatorForSeq2Seq(tokenizer, model=model),
  batch_size=5)
for batch in sample_dataloader:
  with torch.no_grad():
    preds = model.generate(
      batch["input_ids"],
      num_beams=15,
      num_return_sequences=1,
      no_repeat_ngram_size=1,
      remove_invalid_values=True,
      max_length=128,
    )
  labels = batch["labels"]
  inputs = batch["input_ids"]
  break

# Replace -100 (see above)
inputs = np.where(inputs != -100, inputs, tokenizer.pad_token_id)
labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

# Convert id tokens to text
text_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
text_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
text_inputs = tokenizer.batch_decode(inputs, skip_special_tokens=True)

# Show result
print("***** Input's Text *****")
print(text_inputs[2])
print("***** codemix (True Value) *****")
print(text_labels[2])
print("***** codemix (Generated Text) *****")
print(text_preds[2])

***** Input's Text *****
@rynkee it is thi thought which we want to change. @PunsTurnMeOn
***** codemix (True Value) *****
@rynkee yehi soch to badalni hai @PunsTurnMeOn
***** codemix (Generated Text) *****
@rynkees it is thi thought which we want to change.@PunsTurnMeOn


In [17]:
for i in range(5):
    print("***** Input's Text *****")
    print(text_inputs[i])
    print("***** codemix (True Value) *****")
    print(text_labels[i])
    print("***** codemix (Generated Text) *****")
    print(text_preds[i])

***** Input's Text *****
@hurdangi haan.. @sagarikaghose sister will eat green mango today @the_hindu
***** codemix (True Value) *****
@hurdangi haan.. @sagarikaghose Didi aaj hare rang ke aam khaengi @the_hindu
***** codemix (Generated Text) *****
@hurdangi haan.@sagarikaghose bhai green mango peene ke saath kharab kar jaao #the_hindu
***** Input's Text *****
wait brother, do not cry this much, its #GST not a bomb. have some shame. @digvijaya_28 @INCIndia " country brought it out "now you sit and cry
***** codemix (True Value) *****
Are bas kar bhai itna nahi rone "ka #GST hai bomb nahi. Kuch to sharm karo. @digvijaya_28 @INCIndia " desh nikal liya "aage u sit and cry
***** codemix (Generated Text) *****
wait bhai, do not cry this much #GST nahi bomb. Haan kuch ho chuka hai @digvijaya_28@INCIndia " country brought it out"
***** Input's Text *****
@rynkee it is thi thought which we want to change. @PunsTurnMeOn
***** codemix (True Value) *****
@rynkee yehi soch to badalni hai @PunsTurn