# In [None]:
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainer, TrainingArguments, Seq2SeqTrainingArguments
from tqdm.notebook import tqdm
from torch.utils.data import DataLoader
# import torch
import os
import csv
os.environ["CUDA_VISIBLE_DEVICES"] = ""


# One checkpoint name shared by the tokenizer and the model so the two
# cannot drift apart.
checkpoint = "VietAI/vit5-base"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Load the seq2seq model and keep it on the CPU.
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint).to('cpu')

def preprocess_function(examples):
    """Tokenize a batch of source/target text pairs for seq2seq training.

    Args:
        examples: Batch mapping whose ``"inputs"`` entry holds the source
            texts and whose ``"labels"`` entry holds the target texts.

    Returns:
        The tokenizer output for the source texts (truncated to 1024
        tokens) with ``"labels"`` set to the token ids of the target texts
        (truncated to 256 tokens). Sequences are returned unpadded.
    """
    # Padding is deliberately left to the data collator used at training
    # time. Padding here would stretch every example to the longest item of
    # the whole map batch and would store pad-token ids in the labels, which
    # the loss does not ignore; DataCollatorForSeq2Seq pads per training
    # batch and fills labels with -100 instead.
    model_inputs = tokenizer(
        examples["inputs"], max_length=1024, truncation=True
    )

    # `text_target=` is the documented replacement for the deprecated
    # `as_target_tokenizer()` context manager.
    labels = tokenizer(
        text_target=examples["labels"], max_length=256, truncation=True
    )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs
    
# Parallel lists: input_lines[i] is the source text for label_lines[i].
input_lines = []
label_lines = []

task = 'myDataset'
train_file = 'miniDataset.csv'

# Read the CSV with an explicit encoding: the platform default is not UTF-8
# everywhere, which would garble non-ASCII text.
with open(f'{task}/{train_file}', newline='', encoding='utf-8') as file:
    reader = csv.reader(file)
    for line in reader:
        # csv.reader yields an empty list for blank lines (e.g. a trailing
        # newline at the end of the file); skip them instead of crashing.
        if not line:
            continue
        # Column 0 is the source text, column 1 the target text.
        # NOTE(review): the tokenizer may already append an end-of-sequence
        # token on its own; confirm the explicit '</s>' suffix is intended.
        input_lines.append(line[0] + '</s>')
        label_lines.append(line[1])

# Wrap the parallel text lists in a Dataset and tokenize them exactly once
# (the original repeated these three statements, redoing all of the work).
# The raw 'inputs' column is dropped; 'labels' is overwritten with token ids
# by preprocess_function.
dict_obj = {'inputs': input_lines, 'labels': label_lines}
dataset = Dataset.from_dict(dict_obj)
tokenized_datasets = dataset.map(preprocess_function, batched=True, remove_columns=['inputs'], num_proc=8)

# Collator that assembles tokenized examples into PyTorch tensor batches.
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    return_tensors="pt",
)

# Training configuration: a single training-only epoch, checkpointed at the
# end of each epoch (at most 3 checkpoints kept), full precision.
hyperparameters = {
    "do_train": True,
    "do_eval": False,
    "num_train_epochs": 1,
    "learning_rate": 1e-5,
    "warmup_ratio": 0.05,
    "weight_decay": 0.01,
    "per_device_train_batch_size": 8,
    "per_device_eval_batch_size": 4,
    "logging_dir": './log',
    "group_by_length": True,
    "save_strategy": "epoch",
    "save_total_limit": 3,
    "fp16": False,
}
training_args = Seq2SeqTrainingArguments(output_dir="tmp/", **hyperparameters)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
    data_collator=data_collator,
)

# Run the fine-tuning loop.
trainer.train()