In [13]:
import numpy
# fine tune mt5 on dataset
from transformers import MT5ForConditionalGeneration, MT5Tokenizer
from transformers import Trainer, TrainingArguments
from datasets import load_dataset
from transformers import DataCollatorForSeq2Seq
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from simpletransformers.t5 import T5Model, T5Args
from transformers import pipeline
#import train split
import pandas as pd
from sklearn.model_selection import train_test_split
import sklearn.preprocessing
import torch
import torch.nn as nn
from google.transliteration import transliterate_word
import klib
import os

In [14]:
#load dataset
#English-Hindi code-mixed parallel corpus.csv
df = pd.read_json('sentences_0.json')
df = df.dropna()
df = df.reset_index(drop=True)
# add column for prefix
df['prefix'] = 'translate English to Hinglish: '
df.head()


Unnamed: 0,en,cm,prefix
0,"Other, Private Use",nizI,translate English to Hinglish:
1,[SCREAMING],barTa chillA,translate English to Hinglish:
2,Spouse,pati/patnI,translate English to Hinglish:
3,I will never salute you!,maiM will kabhI Apako salAma,translate English to Hinglish:
4,and the stars and the trees bow themselves;,aura the stars aura the bUTiyA.N beleM themselves,translate English to Hinglish:


In [15]:
#data cleaning 

df=klib.data_cleaning(df)

Shape of cleaned data: (34147, 2) - Remaining NAs: 0


Dropped rows: 5500
     of which 5500 duplicates. (Rows (first 150 shown): [180, 375, 556, 566, 651, 801, 845, 866, 874, 1113, 1209, 1219, 1382, 1397, 1514, 1532, 1597, 1727, 1791, 1833, 1839, 1874, 1936, 1940, 1947, 1968, 1994, 2009, 2022, 2032, 2091, 2132, 2139, 2142, 2151, 2161, 2195, 2217, 2223, 2242, 2297, 2318, 2343, 2346, 2358, 2410, 2420, 2450, 2476, 2521, 2543, 2545, 2563, 2607, 2608, 2616, 2653, 2689, 2716, 2811, 2831, 2839, 2867, 2877, 3007, 3019, 3034, 3075, 3088, 3095, 3115, 3122, 3182, 3227, 3253, 3260, 3315, 3327, 3329, 3428, 3468, 3569, 3570, 3583, 3597, 3610, 3653, 3674, 3690, 3693, 3695, 3736, 3756, 3760, 3784, 3867, 3880, 3884, 3888, 3897, 3961, 4048, 4068, 4077, 4122, 4180, 4208, 4228, 4304, 4311, 4342, 4368, 4374, 4400, 4403, 4411, 4460, 4466, 4496, 4520, 4523, 4535, 4552, 4556, 4559, 4567, 4580, 4593, 4597, 4635, 4646, 4648, 4663, 4677, 4691, 4696, 4717, 4727, 4766, 4787, 4825, 4847, 4865, 4896, 4904, 4924, 49

In [16]:
#split train, val, test
# convert df  so that it can be used by transformers


train, test = train_test_split(df, test_size=0.2, random_state=42)
train, val = train_test_split(train, test_size=0.2, random_state=42)
train = train.reset_index(drop=True)
val = val.reset_index(drop=True)
test = test.reset_index(drop=True)

#print lens
print(len(train))
print(len(val))
print(len(test))

#save train, val, test
train.to_csv('train1.csv', index=False)
val.to_csv('val1.csv', index=False)
test.to_csv('test1.csv', index=False)


21853
5464
6830


In [17]:
df.columns

Index(['en', 'cm'], dtype='object')

In [18]:
#tokenize
tokenizer = MT5Tokenizer.from_pretrained("google/mt5-small")
tokenizer.add_special_tokens({'additional_special_tokens': ['<sep>']})
tokenizer.add_special_tokens({'additional_special_tokens': ['<pad>']})
tokenizer.add_special_tokens({'additional_special_tokens': ['<s>']})
tokenizer.add_special_tokens({'additional_special_tokens': ['</s>']})
tokenizer.add_special_tokens({'additional_special_tokens': ['<unk>']})



0

In [40]:
df['en']


str

In [44]:
maxlen = 512
def tokenize_df(df):
    target = tokenizer([str(i) for i in df["cm"]], padding='max_length', truncation=True, return_tensors="pt", max_length=maxlen)
    input = tokenizer([str(i) for i in df["en"]], padding='max_length', truncation=True, return_tensors="pt", max_length=maxlen)
    #print("x")
    input_ids = input['input_ids']
    attention_mask = input['attention_mask']
    target_ids = target['input_ids']
    target_attention_mask = target['attention_mask']
    decoder_input_ids = target_ids.clone()
    #convert to tensors
    input_ids = torch.tensor(input_ids).squeeze()
    attention_mask = torch.tensor(attention_mask).squeeze()
    target_ids = torch.tensor(target_ids).squeeze()
    target_attention_mask = torch.tensor(target_attention_mask).squeeze()
   # decoder_input_ids = torch.tensor(decoder_input_ids)
    
    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'labels': target_ids,
        #'decoder_input_ids': decoder_input_ids,
        #'decoder_attention_mask': target_attention_mask
    }


In [42]:
tokenize_df(df)


ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`input_ids` in this case) have excessive nesting (inputs type `list` where type `int` is expected).

In [45]:
#tokenize train, val, test
train = load_dataset('csv', data_files='train1.csv')
val = load_dataset('csv', data_files='val1.csv')
test = load_dataset('csv', data_files='test1.csv')
train = train.map(tokenize_df, batched=True, batch_size=128,remove_columns=['en','cm'])
val = val.map(tokenize_df, batched=True, batch_size=128,remove_columns=['en','cm'])
test = test.map(tokenize_df, batched=True, batch_size=128,remove_columns=['en','cm'])


Found cached dataset csv (/home/aparna/.cache/huggingface/datasets/csv/default-d1b7b873547cdb08/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)
100%|██████████| 1/1 [00:00<00:00, 658.55it/s]
Found cached dataset csv (/home/aparna/.cache/huggingface/datasets/csv/default-2ec1d98a11ac1402/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)
100%|██████████| 1/1 [00:00<00:00, 659.27it/s]
Found cached dataset csv (/home/aparna/.cache/huggingface/datasets/csv/default-1d357afce8d6b0aa/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)
100%|██████████| 1/1 [00:00<00:00, 647.07it/s]
  input_ids = torch.tensor(input_ids).squeeze()
  attention_mask = torch.tensor(attention_mask).squeeze()
  target_ids = torch.tensor(target_ids).squeeze()
  target_attention_mask = torch.tensor(target_attention_mask).squeeze()
Map:   2%|▏         | 512/21853 [00:00<00:09, 2211.15 examples/s]

x
x
x
x
x


Map:   5%|▍         | 1024/21853 [00:00<00:08, 2428.68 examples/s]

x
x
x
x
x


Map:   9%|▉         | 1920/21853 [00:00<00:07, 2588.13 examples/s]

x
x
x
x
x


Map:  11%|█         | 2432/21853 [00:00<00:07, 2562.37 examples/s]

x
x
x
x
x


Map:  14%|█▍        | 3072/21853 [00:01<00:07, 2558.27 examples/s]

x
x
x
x
x


Map:  17%|█▋        | 3712/21853 [00:01<00:07, 2532.34 examples/s]

x
x
x
x
x


Map:  19%|█▉        | 4224/21853 [00:01<00:06, 2637.83 examples/s]

x
x
x
x
x


Map:  23%|██▎       | 5120/21853 [00:01<00:06, 2681.23 examples/s]

x
x
x
x
x


Map:  25%|██▌       | 5504/21853 [00:02<00:06, 2601.03 examples/s]

x
x
x
x


Map:  29%|██▊       | 6272/21853 [00:02<00:06, 2562.63 examples/s]

x
x
x
x
x


Map:  30%|███       | 6656/21853 [00:02<00:06, 2491.83 examples/s]

x
x
x
x


Map:  32%|███▏      | 7040/21853 [00:02<00:06, 2466.58 examples/s]

x
x
x
x
x


Map:  36%|███▋      | 7936/21853 [00:03<00:05, 2427.98 examples/s]

x
x
x
x


Map:  37%|███▋      | 8192/21853 [00:03<00:05, 2399.93 examples/s]

x
x
x
x
x


Map:  41%|████      | 8960/21853 [00:03<00:05, 2550.25 examples/s]

x
x
x
x
x


Map:  45%|████▍     | 9728/21853 [00:03<00:04, 2654.45 examples/s]

x
x
x
x
x


Map:  46%|████▋     | 10112/21853 [00:03<00:04, 2683.43 examples/s]

x
x
x
x
x


Map:  50%|█████     | 11008/21853 [00:04<00:03, 2780.12 examples/s]

x
x
x
x
x


Map:  53%|█████▎    | 11520/21853 [00:04<00:03, 2676.07 examples/s]

x
x
x
x
x


Map:  54%|█████▍    | 11904/21853 [00:04<00:03, 2695.99 examples/s]

x
x
x


Map:  58%|█████▊    | 12672/21853 [00:05<00:04, 2180.39 examples/s]

x
x
x
x


Map:  60%|██████    | 13184/21853 [00:05<00:03, 2306.68 examples/s]

x
x
x
x


Map:  63%|██████▎   | 13696/21853 [00:05<00:03, 2303.82 examples/s]

x
x
x
x


Map:  64%|██████▍   | 13952/21853 [00:05<00:04, 1934.01 examples/s]

x
x
x


Map:  67%|██████▋   | 14592/21853 [00:05<00:03, 2190.83 examples/s]

x
x
x
x
x


Map:  69%|██████▉   | 15104/21853 [00:06<00:02, 2271.81 examples/s]

x
x
x
x
x


Map:  73%|███████▎  | 16000/21853 [00:06<00:02, 2534.26 examples/s]

x
x
x
x
x


Map:  74%|███████▍  | 16256/21853 [00:06<00:02, 2445.10 examples/s]

x
x
x
x
x


Map:  78%|███████▊  | 17152/21853 [00:06<00:01, 2629.73 examples/s]

x
x
x
x
x


Map:  80%|████████  | 17536/21853 [00:07<00:01, 2555.94 examples/s]

x
x
x
x


Map:  83%|████████▎ | 18176/21853 [00:07<00:01, 2444.25 examples/s]

x
x
x
x
x


Map:  86%|████████▌ | 18816/21853 [00:07<00:01, 2500.89 examples/s]

x
x
x
x
x


Map:  88%|████████▊ | 19328/21853 [00:07<00:00, 2647.13 examples/s]

x
x
x
x
x


Map:  93%|█████████▎| 20352/21853 [00:08<00:00, 2631.76 examples/s]

x
x
x
x
x


Map:  95%|█████████▍| 20736/21853 [00:08<00:00, 2563.43 examples/s]

x
x
x
x


Map:  98%|█████████▊| 21376/21853 [00:08<00:00, 2554.14 examples/s]

x
x
x
x


Map:  99%|█████████▉| 21632/21853 [00:08<00:00, 2443.80 examples/s]

x
x
x


                                                                   

x


Map:   0%|          | 0/5464 [00:00<?, ? examples/s]

x
x


Map:   7%|▋         | 384/5464 [00:00<00:02, 2333.08 examples/s]

x
x
x


Map:  14%|█▍        | 768/5464 [00:00<00:01, 2567.92 examples/s]

x
x


Map:  21%|██        | 1152/5464 [00:00<00:01, 2573.94 examples/s]

x
x
x


Map:  28%|██▊       | 1536/5464 [00:00<00:01, 2587.93 examples/s]

x
x
x
x
x


Map:  37%|███▋      | 2048/5464 [00:00<00:01, 2641.57 examples/s]

x
x


Map:  45%|████▍     | 2432/5464 [00:00<00:01, 2698.19 examples/s]

x
x
x
x
x


Map:  54%|█████▍    | 2944/5464 [00:01<00:00, 2635.91 examples/s]

x
x
x


Map:  63%|██████▎   | 3456/5464 [00:01<00:00, 2695.53 examples/s]

x
x
x
x
x


Map:  73%|███████▎  | 3968/5464 [00:01<00:00, 2717.94 examples/s]

x
x


Map:  80%|███████▉  | 4352/5464 [00:01<00:00, 2624.90 examples/s]

x
x
x
x


Map:  89%|████████▉ | 4864/5464 [00:01<00:00, 2599.08 examples/s]

x
x
x


Map:  96%|█████████▌| 5248/5464 [00:01<00:00, 2632.74 examples/s]

x
x


                                                                 

x
x


Map:   4%|▎         | 256/6830 [00:00<00:02, 2254.45 examples/s]

x
x


Map:   7%|▋         | 512/6830 [00:00<00:02, 2293.93 examples/s]

x
x


Map:  11%|█         | 768/6830 [00:00<00:02, 2142.17 examples/s]

x
x


Map:  15%|█▍        | 1024/6830 [00:00<00:02, 2148.82 examples/s]

x
x
x
x


Map:  22%|██▏       | 1536/6830 [00:00<00:02, 2334.41 examples/s]

x
x
x
x


Map:  26%|██▌       | 1792/6830 [00:00<00:02, 2121.27 examples/s]

x
x


Map:  32%|███▏      | 2176/6830 [00:00<00:02, 2283.44 examples/s]

x
x
x


Map:  39%|███▉      | 2688/6830 [00:01<00:01, 2421.93 examples/s]

x
x


Map:  43%|████▎     | 2944/6830 [00:01<00:01, 2325.47 examples/s]

x
x
x
x
x


Map:  49%|████▊     | 3328/6830 [00:01<00:01, 2457.11 examples/s]

x
x


Map:  54%|█████▍    | 3712/6830 [00:01<00:01, 2521.18 examples/s]

x
x
x


Map:  62%|██████▏   | 4224/6830 [00:01<00:00, 2693.03 examples/s]

x
x
x
x
x


Map:  69%|██████▉   | 4736/6830 [00:01<00:00, 2704.95 examples/s]

x
x
x
x


Map:  75%|███████▍  | 5120/6830 [00:02<00:00, 2385.65 examples/s]

x


Map:  84%|████████▍ | 5760/6830 [00:02<00:00, 2468.58 examples/s]

x
x
x
x
x


Map:  88%|████████▊ | 6016/6830 [00:02<00:00, 2397.11 examples/s]

x
x
x
x


Map:  94%|█████████▎| 6400/6830 [00:02<00:00, 2454.34 examples/s]

x


Map:  97%|█████████▋| 6656/6830 [00:02<00:00, 2478.29 examples/s]

x
x
x


                                                                 

In [26]:
sample = train['train'][0]
sample

{'en': 'I get into the shower, I told you already.',
 'cm': 'maiM get into the maiM snAna told you already .'}

In [None]:
train
#get sample 
sample = train['train'][0]
sample
#print shapes
print(len(sample['input_ids']))
print(len(sample['attention_mask']))
#print(len(sample['decoder_input_ids']))
#print(len(sample['decoder_attention_mask']))


512
512


In [None]:
val

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 2198
    })
})

In [None]:
# from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
# batch_size = 8
# train_dataloader = DataLoader(

#             train,  # The training samples.

#             sampler = RandomSampler(train), # Select batches randomly

#             batch_size = batch_size # Trains with this batch size.

#         )

# validation_dataloader = DataLoader(

#             val, # The validation samples.

#             sampler = SequentialSampler(val), # Pull out batches sequentially.

#             batch_size = batch_size # Evaluate with this batch size.

#         )

# test_dataloader = DataLoader(


#             test, # The validation samples. 

#             sampler = SequentialSampler(test), # Pull out batches sequentially.

#             batch_size = batch_size # Evaluate with this batch size.

#         )

# #test train data loader
# for batch in train_dataloader:

#     print(batch)

#     break


In [None]:
import evaluate
import numpy as np
from nltk.tokenize import RegexpTokenizer

rouge_metric = evaluate.load("rouge")

def tokenize_sentence(arg):
    encoded_arg =tokenizer(arg)
    return tokenizer.convert_ids_to_tokens(encoded_arg.input_ids)

def metrics_func(eval_arg):
    preds, labels = eval_arg
    # Replace -100
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    # Convert id tokens to text
    text_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    text_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # Insert a line break (\n) in each sentence for ROUGE scoring
    # (Note : Please change this code, when you perform on other languages except for Japanese)
    text_preds = [(p if p.endswith(("!", "！", "?", "？", "。")) else p + "。") for p in text_preds]
    text_labels = [(l if l.endswith(("!", "！", "?", "？", "。")) else l + "。") for l in text_labels]
    sent_tokenizer_jp = RegexpTokenizer(u'[^!！?？。]*[!！?？。]')
    text_preds = ["\n".join(np.char.strip(sent_tokenizer_jp.tokenize(p))) for p in text_preds]
    text_labels = ["\n".join(np.char.strip(sent_tokenizer_jp.tokenize(l))) for l in text_labels]
    # compute ROUGE score with custom tokenization
    return rouge_metric.compute(
        predictions=text_preds,
        references=text_labels,
        tokenizer=tokenize_sentence
    )

Using the latest cached version of the module from /home/aparna/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--rouge/b01e0accf3bd6dd24839b769a5fda24e14995071570870922c71970b3a6ed886 (last modified on Mon Mar 20 18:02:43 2023) since it couldn't be found locally at evaluate-metric--rouge, or remotely on the Hugging Face Hub.


In [None]:
# finetuen mt5
os.environ["WANDB_DISABLED"] = "true"
model = MT5ForConditionalGeneration.from_pretrained('bert-base-multilingual-cased')
model.resize_token_embeddings(len(tokenizer))

#training args


training_args = Seq2SeqTrainingArguments(
  output_dir = "mt5-synthetic",
  log_level = "error",
  num_train_epochs = 10,
  learning_rate = 5e-4,
  lr_scheduler_type = "linear",
  warmup_steps = 90,
  optim = "adafactor",
  weight_decay = 0.01,
  per_device_train_batch_size = 2,
  per_device_eval_batch_size = 1,
  gradient_accumulation_steps = 16,
  evaluation_strategy = "steps",
  eval_steps = 100,
  predict_with_generate=True,
  generation_max_length = 128,
  save_steps = 500,
  logging_steps = 10,
  push_to_hub = False
)


#trainer
trainer = Seq2SeqTrainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train["train"],         # training dataset
    eval_dataset=val["train"],             # evaluation dataset
    tokenizer=tokenizer,               # tokenizer
    data_collator=DataCollatorForSeq2Seq(tokenizer, model=model), # data collator
    
)

#train
trainer.train()

#save model
trainer.save_model("./mt5_synthetic")

  0%|          | 3/2740 [12:55<196:31:16, 258.49s/it]


In [None]:
from torch.utils.data import DataLoader
model = MT5ForConditionalGeneration.from_pretrained("./mt5_synthetic")
#tokenizer = MT5Tokenizer.from_pretrained("./mt5")


sample_dataloader = DataLoader(
  test["train"].with_format("torch"),
  collate_fn=DataCollatorForSeq2Seq(tokenizer, model=model),
  batch_size=5)
for batch in sample_dataloader:
  with torch.no_grad():
    preds = model.generate(
      batch["input_ids"],
      num_beams=15,
      num_return_sequences=1,
      no_repeat_ngram_size=1,
      remove_invalid_values=True,
      max_length=128,
    )
  labels = batch["labels"]
  break
print(preds, labels)
metrics_func([preds, labels])

tensor([[     0,   1250,   7800, 198280,    259,  65334,    260,   1061,  39084,
          19349,    318,  51571,  62342,  16263, 144044,    603,   2148,    513,
           2941,    334,   1312,  88806,   2050,    432,    262,    268,    387,
           1759,    290, 165794,      1,      0,      0,      0,      0,      0,
              0,      0,      0],
        [     0,  27696,  62342,    259,    261,    342,    776, 113865,    714,
           2829,    387, 102339,   9065,  42716,    260,   1816,    321,  12961,
            623,   3663,    479,   1776,   1250,   6253, 182594,    262,    290,
           1795,   1061, 146525,  56696,    313,  11395,    330,  35714,    609,
           1350,    311,      1],
        [     0,   1250,  38393,    265,    299,    609,    339,   4592,   6504,
            259,   1542,    787,   3007,    288,   6313,    260,   1061,    559,
            604,    263, 152418,   7925,   7954,      1,      0,      0,      0,
              0,      0,      0,      0, 

{'rouge1': 0.47184469277492536,
 'rouge2': 0.32362358952522885,
 'rougeL': 0.44858887882143694,
 'rougeLsum': 0.44858887882143694}

In [None]:
from torch.utils.data import DataLoader

# Predict with test data (first 5 rows)
sample_dataloader = DataLoader(
  test["train"].with_format("torch"),
  collate_fn=DataCollatorForSeq2Seq(tokenizer, model=model),
  batch_size=5)
for batch in sample_dataloader:
  with torch.no_grad():
    preds = model.generate(
      batch["input_ids"],
      num_beams=15,
      num_return_sequences=1,
      no_repeat_ngram_size=1,
      remove_invalid_values=True,
      max_length=128,
    )
  labels = batch["labels"]
  inputs = batch["input_ids"]
  break

# Replace -100 (see above)
inputs = np.where(inputs != -100, inputs, tokenizer.pad_token_id)
labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

# Convert id tokens to text
text_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
text_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
text_inputs = tokenizer.batch_decode(inputs, skip_special_tokens=True)

# Show result
print("***** Input's Text *****")
print(text_inputs[2])
print("***** codemix (True Value) *****")
print(text_labels[2])
print("***** codemix (Generated Text) *****")
print(text_preds[2])

***** Input's Text *****
@rynkee it is thi thought which we want to change. @PunsTurnMeOn
***** codemix (True Value) *****
@rynkee yehi soch to badalni hai @PunsTurnMeOn
***** codemix (Generated Text) *****
@rynkees it is thi thought which we want to change.@PunsTurnMeOn


In [None]:
for i in range(5):
    print("***** Input's Text *****")
    print(text_inputs[i])
    print("***** codemix (True Value) *****")
    print(text_labels[i])
    print("***** codemix (Generated Text) *****")
    print(text_preds[i])

***** Input's Text *****
@hurdangi haan.. @sagarikaghose sister will eat green mango today @the_hindu
***** codemix (True Value) *****
@hurdangi haan.. @sagarikaghose Didi aaj hare rang ke aam khaengi @the_hindu
***** codemix (Generated Text) *****
@hurdangi haan.@sagarikaghose bhai green mango peene ke saath kharab kar jaao #the_hindu
***** Input's Text *****
wait brother, do not cry this much, its #GST not a bomb. have some shame. @digvijaya_28 @INCIndia " country brought it out "now you sit and cry
***** codemix (True Value) *****
Are bas kar bhai itna nahi rone "ka #GST hai bomb nahi. Kuch to sharm karo. @digvijaya_28 @INCIndia " desh nikal liya "aage u sit and cry
***** codemix (Generated Text) *****
wait bhai, do not cry this much #GST nahi bomb. Haan kuch ho chuka hai @digvijaya_28@INCIndia " country brought it out"
***** Input's Text *****
@rynkee it is thi thought which we want to change. @PunsTurnMeOn
***** codemix (True Value) *****
@rynkee yehi soch to badalni hai @PunsTurn