In [None]:
# Mount Google Drive at /content/drive; the data files and the training
# output directory used later in this notebook live under that mount point.
from google.colab import drive

drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Model identifier handed to `from_pretrained` for both the tokenizer and the
# seq2seq model, and reused as the trailing component of the training output directory.
model_name = "allegro/plt5-base"

In [None]:
from torch.utils.data import Dataset

class TranslationDataset(Dataset):
    """Parallel-text dataset read from files that alternate sample and target lines.

    Every file is opened as UTF-8. Lines at even positions (0, 2, 4, ...) are
    samples and the line right after each one is its target. Both are stripped
    of surrounding whitespace and stored as ``[sample, target]`` in ``self.data``.

    Args:
        file_paths: One path (``str`` or ``os.PathLike``) or a list/tuple of paths.
        reverse: When true, ``__getitem__`` returns ``(target, sample)`` instead
            of ``(sample, target)``.
        transform: Optional callable applied to the sample before it is returned.
        target_transform: Optional callable applied to the target before it is
            returned.

    Raises:
        TypeError: If ``file_paths`` is neither a path nor a list/tuple of paths.
        ValueError: If a file ends with a non-empty sample line that has no
            target line after it.
    """

    def __init__(self, file_paths, reverse=False, transform=None, target_transform=None):
        # Local import: the notebook only imports `os` in a later cell.
        import os

        self.data = []
        self.back = reverse
        self.transform = transform
        self.target_transform = target_transform

        if isinstance(file_paths, (str, os.PathLike)):
            file_paths = [file_paths]
        elif not isinstance(file_paths, (list, tuple)):
            # Previously an unsupported type silently produced an empty dataset.
            raise TypeError(
                "file_paths must be a path or a list/tuple of paths, "
                f"got {type(file_paths).__name__}"
            )

        for file_path in file_paths:
            self.data.extend(self._read_pairs(file_path))

    @staticmethod
    def _read_pairs(file_path):
        """Return the ``[sample, target]`` pairs stored in one file."""
        with open(file_path, "r", encoding="UTF-8") as f:
            lines = [line.strip() for line in f]

        if len(lines) % 2:
            # An odd line count means the last sample has no target. A trailing
            # blank line is harmless and dropped; anything else is a data error
            # that would otherwise only surface as an IndexError in __getitem__.
            if lines[-1]:
                raise ValueError(
                    f"{file_path}: last line has no matching target line"
                )
            lines.pop()

        return [[lines[i], lines[i + 1]] for i in range(0, len(lines), 2)]

    def __len__(self):
        """Return the number of stored pairs."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the pair at ``idx``, transformed and ordered per the constructor options."""
        sample, target = self.data[idx]
        if self.transform:
            sample = self.transform(sample)
        if self.target_transform:
            target = self.target_transform(target)
        if self.back:
            return target, sample
        else:
            return sample, target


In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, Trainer, TrainingArguments, DataCollatorForSeq2Seq
import torch
import os
# Report whether PyTorch can see a CUDA device.
print(torch.cuda.is_available())

# Fetch the pretrained tokenizer and seq2seq model named by `model_name`.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Rebind every parameter to a contiguous version of its data.
# NOTE(review): presumably a workaround so checkpoint saving does not reject
# non-contiguous tensors — confirm before removing.
for weight in model.parameters():
    weight.data = weight.data.contiguous()

# Define a function to tokenize the inputs
def tokenize_function(sample_target_pair, max_length=16):
    """Encode one (sample, target) pair into model inputs with labels.

    Args:
        sample_target_pair: Two-item sequence ``(sample, target)``.
            Assumes both items are single strings — TODO confirm against callers.
        max_length: Length both sides are truncated or padded to. Defaults to
            16, the value that used to be hard-coded in both tokenizer calls.

    Returns:
        The tokenizer output for ``sample`` with an added ``"labels"`` entry,
        every tensor flattened to one dimension.
    """
    sample, target = sample_target_pair
    # One set of options for both sides so input and label lengths cannot drift apart.
    encode_kwargs = dict(
        max_length=max_length,
        truncation=True,
        padding="max_length",
        return_tensors="pt",
    )
    model_inputs = tokenizer(sample, **encode_kwargs)
    labels = tokenizer(target, **encode_kwargs).input_ids
    # Replace padding positions with -100, the label value the loss ignores.
    labels[labels == tokenizer.pad_token_id] = -100
    model_inputs["labels"] = labels
    # Flatten each tensor; iterate over a snapshot of the keys since values are reassigned.
    for key in list(model_inputs.keys()):
        model_inputs[key] = model_inputs[key].view(-1)
    return model_inputs

def tokenize(dataset):
    """Run ``tokenize_function`` on every pair in ``dataset`` and return the results as a list."""
    encoded = []
    for pair in dataset:
        encoded.append(tokenize_function(pair))
    return encoded


# Read the training and validation pairs from Drive and tokenize them up front;
# each name ends up bound to a list of encoded examples.
train_translation = tokenize(
    TranslationDataset("/content/drive/MyDrive/PSL-Translator/data.txt")
)
val_translation = tokenize(
    TranslationDataset("/content/drive/MyDrive/PSL-Translator/val_data.txt")
)

# Collator used by the Trainer to assemble batches from the encoded examples.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Training configuration, grouped by concern.
training_args = TrainingArguments(
    # Checkpoints: where they are written, how often, and how many are kept.
    output_dir=f"/content/drive/MyDrive/PSL-Translator/{model_name}",
    save_strategy="epoch",
    save_total_limit=2,
    # Optimisation schedule and batch sizes.
    num_train_epochs=10,
    learning_rate=3e-5,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Evaluation cadence and logging.
    evaluation_strategy="epoch",
    logging_dir="./logs",
    logging_steps=10,
    report_to="none",
)





True


pytorch_model.bin:  86%|########5 | 944M/1.10G [00:00<?, ?B/s]



In [17]:
# Wire the model, training arguments, encoded datasets and collator into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_translation,
    eval_dataset=val_translation,
)

# Run fine-tuning; as the cell's last expression, the returned value is displayed.
trainer.train()

  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)


Epoch,Training Loss,Validation Loss
1,2.0594,1.134108
2,1.2047,0.738685
3,1.3297,0.642928


Epoch,Training Loss,Validation Loss
1,2.0594,1.134108
2,1.2047,0.738685
3,1.3297,0.642928
4,1.1919,0.564664
5,0.8084,0.528679
6,0.8664,0.518459
7,0.7927,0.497184
8,0.6232,0.480969
9,0.7149,0.490616
10,0.732,0.489613


TrainOutput(global_step=11590, training_loss=1.386980284690034, metrics={'train_runtime': 2758.6284, 'train_samples_per_second': 33.607, 'train_steps_per_second': 4.201, 'total_flos': 2106359428055040.0, 'train_loss': 1.386980284690034, 'epoch': 10.0})

In [18]:
# Qualitative check: generate a translation for every validation example and
# print the decoded input followed by the decoded model output.
model.eval()
# Follow whatever device the model is on instead of hard-coding "cuda",
# so the cell also runs on a CPU-only runtime.
device = model.device
with torch.no_grad():
    for data in val_translation:
        # Stored examples are 1-D; generate() expects a leading batch axis.
        input_ids = data["input_ids"].unsqueeze(0).to(device)
        # Inputs are padded to a fixed length, so pass the mask explicitly
        # rather than relying on generate() to infer it from pad tokens.
        attention_mask = data["attention_mask"].unsqueeze(0).to(device)
        outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask)
        print(tokenizer.decode(data["input_ids"], skip_special_tokens=True))
        print(tokenizer.decode(outputs[0], skip_special_tokens=True))




Ja wczoraj kino iść było
Wczoraj byłem w kinie
Ty pies spacer kiedy?
Czy wyprowadzasz psa na spacer?
Ona ciasto piec umie
Ona umie piec ciasto
My dom duży kupić
Kupiliśmy duży dom
Dziecko szkoła zaczyna dzisiaj
Dziecko dzisiaj idzie do szkoły
Ja kawę lubić nie
Nie lubię kawy
Ty rower naprawić kiedy?
Czy naprawiłeś rower?
Ona zima narty jeździ
Ona jeździ na nartach zimą
On koncert być jutro
On jutro idzie na koncert
My wakacje Grecja planować
Planujemy wakacje w Grecji
Ty kot mleko dać?
Czy masz mleko kota?
Ona telefon nowy kupić
Ona kupiła nowy telefon
Ja woda pić dużo
Piję dużo wody
Dziecko plac zabaw bawić
Dziecko bawi się na placu zabaw
On samochód sprzedać miesiąc temu
On sprzedał samochód miesiąc temu
Ty ćwiczyć codziennie?
Czy codziennie ćwiczysz?
Ona książka czytać teraz
Ona teraz czyta książkę
My film oglądać wczoraj
Oglądaliśmy film wczoraj
Ty komputer naprawa skończyć?
Skończyłeś naprawić komputer?
Ja podróż Włochy planować
Planuję podróż do Włoch
Ty praca na jutro zrobić?
Cz

KeyboardInterrupt: 