## Train

In [None]:
%%capture
!pip install datasets evaluate transformers[sentencepiece]
!pip install rouge_score
!pip install transformers[torch]
!pip install csv
!pip install pandas

In [None]:
# !gsutil cp -r gs://vietai_public/viT5/data/vietnews .
# !gsutil cp -r gs://vietai_public/viT5/data/wikilingua .


In [None]:
from google.colab import drive

# Mount Google Drive at /colabDrive so files stored there can be read and written.
drive.mount("/colabDrive")

# Path of the dataset CSV on the mounted drive.
filePath = "/colabDrive/MyDrive/colabDrive/miniDataset.csv"

Mounted at /colabDrive


In [None]:
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainer, TrainingArguments, Seq2SeqTrainingArguments
from tqdm.notebook import tqdm
from torch.utils.data import DataLoader
import csv

In [None]:
%%capture
tokenizer = AutoTokenizer.from_pretrained("VietAI/vit5-base")
model = AutoModelForSeq2SeqLM.from_pretrained("VietAI/vit5-base")
model.to('cuda')

In [None]:
def preprocess_function(examples):
    """Tokenize a batch of source/target sentence pairs for seq2seq training.

    Args:
        examples: Batch mapping with an ``"inputs"`` list (source sentences)
            and a ``"labels"`` list (target sentences), as passed by
            ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output for the inputs, with an added ``"labels"`` entry
        holding the token ids of the target sentences.
    """
    # No padding at this stage. Padding is left to the batch collator (this
    # notebook uses DataCollatorForSeq2Seq), which pads labels with -100 so
    # that padded positions are ignored by the loss. Padding here would store
    # pad-token ids inside the labels and make every sequence in a map batch
    # the same length.
    model_inputs = tokenizer(examples["inputs"], max_length=512, truncation=True)

    # `text_target=` replaces the deprecated `as_target_tokenizer()` context.
    targets = tokenizer(text_target=examples["labels"], max_length=512, truncation=True)

    model_inputs["labels"] = targets["input_ids"]
    return model_inputs

In [None]:
# Deprecated
# input_lines = []
# label_lines = []

# train_file = '/colabDrive/MyDrive/colabDrive/miniDataset.csv'

# with open(f'{train_file}', newline='') as file:
#     reader = csv.reader(file)
#     for line in reader:
#         input_lines.append(line[0] +'</s>')
#         label_lines.append(line[1])

# dict_obj = {'inputs': input_lines, 'labels': label_lines}
# dataset = Dataset.from_dict(dict_obj)
# tokenized_datasets = dataset.map(preprocess_function, batched=True, remove_columns=['inputs'], num_proc=8)

In [None]:
# input_lines

In [None]:
import pandas as pd

train_file = '/colabDrive/MyDrive/colabDrive/miniDataset.csv'

# Read the sentence pairs. Rows with a missing value in either column are
# dropped: pandas loads empty cells as NaN (a float), which would break the
# string concatenation below and the tokenizer.
df = pd.read_csv(train_file).dropna(subset=['AccentlessSentences', 'Sentences'])

# Model input is the 'AccentlessSentences' column with '</s>' appended to each
# line; the target is the 'Sentences' column.
# NOTE(review): the tokenizer presumably appends its own end-of-sequence token
# as well - confirm that the explicit '</s>' is intended.
input_lines = [line + '</s>' for line in df['AccentlessSentences'].astype(str)]
label_lines = df['Sentences'].astype(str).tolist()

dict_obj = {'inputs': input_lines, 'labels': label_lines}
dataset = Dataset.from_dict(dict_obj)

# Tokenize in batches; the raw 'inputs' text column is removed afterwards.
tokenized_datasets = dataset.map(preprocess_function, batched=True, remove_columns=['inputs'], num_proc=8)

Map (num_proc=8):   0%|          | 0/5000 [00:00<?, ? examples/s]



In [None]:

# Collator used by the trainer to assemble tokenized examples into PyTorch batches.
data_collator = DataCollatorForSeq2Seq(tokenizer, return_tensors="pt", model=model)

# Training-only configuration: evaluation is switched off, a checkpoint is
# saved every epoch and `save_total_limit` caps how many are kept.
training_args = Seq2SeqTrainingArguments(
    output_dir="tmp/",
    logging_dir="./log",
    # What to run.
    do_train=True,
    do_eval=False,
    # Optimisation schedule.
    num_train_epochs=30,
    learning_rate=1e-5,
    warmup_ratio=0.05,
    weight_decay=0.01,
    # Batching.
    per_device_train_batch_size=7,
    per_device_eval_batch_size=7,
    group_by_length=True,
    # Checkpointing.
    save_strategy="epoch",
    save_total_limit=3,
    # Mixed-precision training.
    fp16=True,
)


# Use AdaFactor for ViT5-large models, as they are based on T5 v1.1.
# See https://medium.com/the-artificial-impostor/paper-adafactor-adaptive-learning-rates-with-sublinear-memory-cost-a543abffa37
#
# from transformers.optimization import Adafactor, AdafactorSchedule
# optimizer = Adafactor(
#     model.parameters(),
#     lr=1e-3,
#     eps=(1e-30, 1e-3),
#     clip_threshold=1.0,
#     decay_rate=-0.8,
#     beta1=None,
#     weight_decay=0.0,
#     relative_step=False,
#     scale_parameter=False,
#     warmup_init=False
# )
# lr_scheduler = AdafactorSchedule(optimizer)


In [None]:
# Wire the model, training arguments, tokenized training set and collator into
# a trainer, then run fine-tuning. The trailing expression displays the
# TrainOutput summary.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    train_dataset=tokenized_datasets,
)

trainer.train()

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
500,9.1917
1000,0.1645
1500,0.0792
2000,0.0498
2500,0.0355
3000,0.0279
3500,0.0215
4000,0.017
4500,0.014
5000,0.0119


TrainOutput(global_step=21450, training_loss=0.22626413994318956, metrics={'train_runtime': 12745.234, 'train_samples_per_second': 11.769, 'train_steps_per_second': 1.683, 'total_flos': 5.344472065287168e+16, 'train_loss': 0.22626413994318956, 'epoch': 30.0})

In [None]:

# Save the model
# The fine-tuned weights go to `myModelMini` and the tokenizer files to a
# separate `myTokenizer` directory, both on the mounted Google Drive.
trainer.save_model('/colabDrive/MyDrive/colabDrive/myModelMini')
tokenizer.save_pretrained("/colabDrive/MyDrive/colabDrive/myTokenizer")

('/colabDrive/MyDrive/colabDrive/myTokenizer/tokenizer_config.json',
 '/colabDrive/MyDrive/colabDrive/myTokenizer/special_tokens_map.json',
 '/colabDrive/MyDrive/colabDrive/myTokenizer/spiece.model',
 '/colabDrive/MyDrive/colabDrive/myTokenizer/added_tokens.json',
 '/colabDrive/MyDrive/colabDrive/myTokenizer/tokenizer.json')

## Inference

In [None]:
# Load the ROUGE metric. The `load_metric` name imported here is also used by
# the evaluation cell further down, which creates its own `metrics` object.
# NOTE(review): the warning echoed under this cell points at `load_metric`; it
# looks deprecated in favour of the `evaluate` package installed at the top of
# the notebook - confirm before upgrading `datasets`.
from datasets import load_metric
metric = load_metric("rouge")


  metric = load_metric("rouge")


Downloading builder script:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

In [None]:
import pandas as pd

# NOTE(review): this is the same CSV that the training section reads, so the
# scores computed below are measured on training data, not on a held-out set.
train_file = '/colabDrive/MyDrive/colabDrive/miniDataset.csv'

# Read the sentence pairs. Rows with a missing value in either column are
# dropped: pandas loads empty cells as NaN (a float), which would break the
# string concatenation below and the tokenizer.
df = pd.read_csv(train_file).dropna(subset=['AccentlessSentences', 'Sentences'])

# Model input is the 'AccentlessSentences' column with '</s>' appended to each
# line; the reference is the 'Sentences' column.
input_lines = [line + '</s>' for line in df['AccentlessSentences'].astype(str)]
label_lines = df['Sentences'].astype(str).tolist()

dict_obj = {'inputs': input_lines, 'labels': label_lines}
dataset = Dataset.from_dict(dict_obj)

# Tokenize in batches; the raw 'inputs' text column is removed afterwards.
test_tokenized_datasets = dataset.map(preprocess_function, batched=True, remove_columns=['inputs'], num_proc=10)

# input_lines = []
# label_lines = []
# with open(f'{task}/test.tsv') as file:
#   for line in file:
#     line = line.strip().split('\t')
#     input = line[0]
#     input_lines.append(input +'</s>')
#     label_lines.append(line[1])



# input_lines  = input_lines
# label_lines = label_lines
# dict_obj = {'inputs': input_lines, 'labels': label_lines}

# dataset = Dataset.from_dict(dict_obj)
# test_tokenized_datasets = dataset.map(preprocess_function, batched=True, remove_columns=['inputs'], num_proc=10)
# Collator that assembles the tokenized test examples into PyTorch batches for
# the evaluation DataLoader built in the evaluation cell.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="pt")

Map (num_proc=10):   0%|          | 0/5000 [00:00<?, ? examples/s]



In [None]:
# Reload the fine-tuned weights together with the tokenizer that was saved to
# Drive at the end of training, so inference uses exactly the artefacts that
# training produced and does not need to download the tokenizer again.
model = AutoModelForSeq2SeqLM.from_pretrained('/colabDrive/MyDrive/colabDrive/myModelMini')
tokenizer = AutoTokenizer.from_pretrained('/colabDrive/MyDrive/colabDrive/myTokenizer')

# `Module.to` returns the module; assigning the result keeps the cell from
# echoing the full model architecture as its output.
model = model.to('cuda')

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/820k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.40M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.12k [00:00<?, ?B/s]

T5ForConditionalGeneration(
  (shared): Embedding(36096, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(36096, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dro

In [None]:
import torch
import numpy as np

metrics = load_metric('rouge')

max_target_length = 512
dataloader = torch.utils.data.DataLoader(test_tokenized_datasets, collate_fn=data_collator, batch_size=32)

# Run generation on whichever device the model currently lives on.
device = model.device

predictions = []
references = []
for batch in tqdm(dataloader):
    generated = model.generate(
        input_ids=batch['input_ids'].to(device),
        attention_mask=batch['attention_mask'].to(device),
        max_length=max_target_length,
    )
    # Decoding needs no target-tokenizer context; decode the whole batch at once.
    outputs = tokenizer.batch_decode(generated, clean_up_tokenization_spaces=False, skip_special_tokens=True)

    # Label positions holding -100 are swapped for the pad token id so they
    # can be decoded (and then dropped by skip_special_tokens).
    labels = np.where(batch['labels'] != -100, batch['labels'], tokenizer.pad_token_id)
    actuals = tokenizer.batch_decode(labels, clean_up_tokenization_spaces=False, skip_special_tokens=True)

    # Keep the decoded text for later inspection and feed the metric incrementally.
    predictions.extend(outputs)
    references.extend(actuals)
    metrics.add_batch(predictions=outputs, references=actuals)


metrics.compute()


  0%|          | 0/157 [00:00<?, ?it/s]

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'rouge1': AggregateScore(low=Score(precision=0.9992397743300422, recall=0.9993307083897525, fmeasure=0.9992763461503182), mid=Score(precision=0.9997596614950635, recall=0.9997805405405404, fmeasure=0.9997850415512465), high=Score(precision=1.0, recall=1.0, fmeasure=1.0)),
 'rouge2': AggregateScore(low=Score(precision=0.9814101694915255, recall=0.9814985514984124, fmeasure=0.9814466412847879), mid=Score(precision=0.9850101694915254, recall=0.9850841742420946, fmeasure=0.9850392731071745), high=Score(precision=0.9882002542372881, recall=0.9882019825148971, fmeasure=0.9882009193854961)),
 'rougeL': AggregateScore(low=Score(precision=0.9992341325811002, recall=0.9993229356009314, fmeasure=0.9992698482875015), mid=Score(precision=0.99971706629055, recall=0.9997663326653307, fmeasure=0.9997374172185431), high=Score(precision=1.0, recall=1.0, fmeasure=1.0)),
 'rougeLsum': AggregateScore(low=Score(precision=0.9992, recall=0.9992987547527487, fmeasure=0.9992121270018895), mid=Score(precision=0

In [None]:
# Reduce every ROUGE variant to the F-measure of its `mid` aggregate score.
[
    {rouge_name: aggregate.mid.fmeasure}
    for rouge_name, aggregate in metrics.compute(predictions=predictions, references=references).items()
]


[{'rouge1': 0.9997850415512465},
 {'rouge2': 0.9850392731071745},
 {'rougeL': 0.9997374172185431},
 {'rougeLsum': 0.9997875346260388}]