In [None]:
# Mount Google Drive at /content/drive; the dataset files and checkpoints used
# by later cells are addressed through paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Identifier handed to `from_pretrained` for both the tokenizer and the model,
# and also used as the last component of the training output directory.
# NOTE(review): "????" looks like a redacted placeholder - set a real
# checkpoint name/path before running the notebook.
model_name = "????"

In [None]:
from torch.utils.data import Dataset

class TranslationDataset(Dataset):
    """Sentence-pair dataset read from text files made of alternating lines.

    Every file is consumed two lines at a time: the first line of a pair is the
    sample and the line right after it is the target. Surrounding whitespace is
    stripped from both. Pairs from several files are concatenated in the order
    the files are given.

    Args:
        file_paths: A single path (``str`` or ``os.PathLike``) or a list/tuple
            of such paths.
        reverse: When true, items are returned as ``(target, sample)`` instead
            of ``(sample, target)``.
        transform: Optional callable applied to the sample before returning it.
        target_transform: Optional callable applied to the target before
            returning it.

    Raises:
        TypeError: If ``file_paths`` is neither a path nor a list/tuple.
        ValueError: If a file ends with a non-empty sample line that has no
            target line after it.
    """

    def __init__(self, file_paths, reverse=False, transform=None, target_transform=None):
        import os  # local import so this cell does not depend on later cells

        self.data = []
        self.back = reverse
        self.transform = transform
        self.target_transform = target_transform

        # Normalise the argument to a sequence of paths. Anything else used to
        # produce a silently empty dataset; fail loudly instead.
        if isinstance(file_paths, (str, os.PathLike)):
            file_paths = [file_paths]
        elif not isinstance(file_paths, (list, tuple)):
            raise TypeError(
                "file_paths must be a path or a list/tuple of paths, got "
                f"{type(file_paths).__name__}"
            )

        for file_path in file_paths:
            self.data.extend(self._read_pairs(file_path))

    @staticmethod
    def _read_pairs(file_path):
        """Return the ``[sample, target]`` pairs stored in one file."""
        with open(file_path, "r", encoding="UTF-8") as f:
            lines = [line.strip() for line in f]

        if len(lines) % 2:
            # An odd line count means the last sample has no target. Storing it
            # would only defer the failure to an IndexError in __getitem__.
            if lines[-1]:
                raise ValueError(
                    f"{file_path}: last sample line has no matching target line"
                )
            lines.pop()  # a stray blank line at the end carries no data

        return [[lines[i], lines[i + 1]] for i in range(0, len(lines), 2)]

    def __len__(self):
        """Number of sentence pairs loaded."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the pair at ``idx`` with transforms and ordering applied."""
        sample, target = self.data[idx]
        if self.transform:
            sample = self.transform(sample)
        if self.target_transform:
            target = self.target_transform(target)
        if self.back:
            return target, sample
        return sample, target


In [None]:
import json

class GecDataset(Dataset):
    """Dataset of (incorrect, correct) sentence pairs read from a JSON Lines file.

    Each non-blank line of the file must be a JSON object with the keys
    ``"incorrect"`` and ``"correct"``. Blank or whitespace-only lines are
    skipped, so a stray empty line does not abort loading.

    Args:
        file_path: Path of the ``.jsonl`` file to read.
        transform: Optional callable applied to the incorrect sentence.
        target_transform: Optional callable applied to the correct sentence.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks the ``"incorrect"`` or ``"correct"`` key.
    """

    def __init__(self, file_path, transform=None, target_transform=None):
        self.data = []
        self.transform = transform
        self.target_transform = target_transform
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                if not line.strip():
                    # json.loads raises on an empty string, so skip blank lines.
                    continue
                record = json.loads(line)
                self.data.append((record["incorrect"], record["correct"]))

    def __len__(self):
        """Number of sentence pairs loaded."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(incorrect, correct)`` at ``idx`` with transforms applied."""
        sample, target = self.data[idx]
        if self.transform:
            sample = self.transform(sample)
        if self.target_transform:
            target = self.target_transform(target)
        return sample, target


In [None]:
from transformers import MT5ForConditionalGeneration, MT5Tokenizer, Trainer, TrainingArguments, DataCollatorForSeq2Seq
import torch
import os
# Report whether PyTorch can see a CUDA device in this runtime.
print(torch.cuda.is_available())

# Load the tokenizer and model

tokenizer = MT5Tokenizer.from_pretrained(model_name)
model = MT5ForConditionalGeneration.from_pretrained(model_name)
# Make sure every parameter tensor is laid out contiguously in memory.
# NOTE(review): presumably a workaround for checkpoint saving rejecting
# non-contiguous tensors - confirm it is still needed.
for param in model.parameters():
  param.data = param.data.contiguous()

# Define a function to tokenize the inputs
def tokenize_function(sample_target_pair, max_length=16):
    """Turn one ``(sample, target)`` text pair into model-ready tensors.

    Both texts are tokenized with the module-level ``tokenizer``, truncated and
    padded to ``max_length`` tokens. Padding positions in the labels are
    replaced by -100, the index that PyTorch's cross-entropy loss ignores, so
    padding does not contribute to the training loss.

    Args:
        sample_target_pair: Two-element sequence ``(sample, target)``.
            Assumes both are single strings - TODO confirm against the datasets.
        max_length: Token length both texts are truncated/padded to. Defaults
            to 16, the value that used to be hard-coded.

    Returns:
        The tokenizer output for the sample, extended with a ``"labels"``
        entry, with every tensor flattened to one dimension.
    """
    sample, target = sample_target_pair
    model_inputs = tokenizer(
        sample,
        max_length=max_length,
        truncation=True,
        padding="max_length",
        return_tensors="pt",
    )
    labels = tokenizer(
        target,
        max_length=max_length,
        truncation=True,
        padding="max_length",
        return_tensors="pt",
    ).input_ids
    labels[labels == tokenizer.pad_token_id] = -100
    model_inputs["labels"] = labels
    # return_tensors="pt" adds a leading batch dimension; flatten it away so
    # each example is a plain 1-D tensor that the collator can batch.
    for key in model_inputs:
        model_inputs[key] = model_inputs[key].view(-1)
    return model_inputs

def tokenize(dataset):
  """Tokenize every pair yielded by ``dataset`` and return the results as a list."""
  tokenized = []
  for pair in dataset:
    tokenized.append(tokenize_function(pair))
  return tokenized


# Load the custom datasets, hold out 5% of each for validation and tokenize
# both parts up front.
SPLIT_SEED = 42  # fixed so that the train/validation split is the same on every run


def split_and_tokenize(dataset, fractions=(0.95, 0.05), seed=SPLIT_SEED):
    """Split ``dataset`` into train/validation parts and tokenize both.

    The split uses a dedicated, seeded generator. Without it the validation
    set changes on every kernel restart, so resuming training from a saved
    checkpoint could validate on sentences the model has already been trained on.

    Args:
        dataset: Dataset yielding ``(sample, target)`` pairs.
        fractions: Fractions passed to ``random_split`` (train, validation).
        seed: Seed for the generator that drives the split.

    Returns:
        ``(train_examples, val_examples)``, each a list of tokenized examples.
    """
    generator = torch.Generator().manual_seed(seed)
    train_part, val_part = torch.utils.data.random_split(dataset, fractions, generator=generator)
    return tokenize(train_part), tokenize(val_part)


translation_dataset = TranslationDataset("/content/drive/MyDrive/PSL-Translator/data.txt")
train_translation, val_translation = split_and_tokenize(translation_dataset)

gec_dataset = GecDataset("/content/drive/MyDrive/PSL-Translator/gec_dataset.jsonl")
train_gec, val_gec = split_and_tokenize(gec_dataset)

# Define data collator
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Set training arguments
training_args = TrainingArguments(
    output_dir=f"/content/drive/MyDrive/PSL-Translator/{model_name}",  # output directory (one folder per model name, on Drive)
    evaluation_strategy="epoch",  # run evaluation at the end of each epoch
    learning_rate=3e-5,  # learning rate
    per_device_train_batch_size=8,  # training batch size per device
    per_device_eval_batch_size=8,  # evaluation batch size per device
    weight_decay=0.01,  # weight decay for regularization
    save_total_limit=2,  # limit total checkpoint saves
    num_train_epochs=10,  # number of epochs to train
    logging_dir="./logs",  # directory for storing logs
    logging_steps=10,  # log every 10 steps
    report_to="none",  # do not report to any experiment-tracking integration
    save_strategy="epoch"  # save a checkpoint at the end of each epoch
)





True


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/376 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/702 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


pytorch_model.bin:   0%|          | 0.00/2.33G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]



In [None]:
# Trainer instance for the GecDataset splits (train_gec / val_gec).
# NOTE(review): the recorded output of this cell is the cell's own source as a
# string, which suggests it was last executed wrapped in quotes (i.e. disabled)
# - confirm whether this training stage is meant to run.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_gec,
    eval_dataset=val_gec,
    data_collator=data_collator,
)

# Train the model
trainer.train()

'# Trainer instance\ntrainer = Trainer(\n    model=model,\n    args=training_args,\n    train_dataset=train_gec,\n    eval_dataset=val_gec,\n    data_collator=data_collator,\n)\n\n# Train the model\ntrainer.train()'

In [None]:
# Trainer instance for the TranslationDataset splits; this rebinds `trainer`
# and keeps training the same `model` object with the same `training_args`.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_translation,
    eval_dataset=val_translation,
    data_collator=data_collator,
)

# Train the model, resuming from a previously saved checkpoint.
# NOTE(review): the checkpoint lives under `.../results/`, which is not the
# `output_dir` configured in `training_args` (`.../{model_name}`) - confirm this
# is the intended checkpoint and that it exists before running.
trainer.train(resume_from_checkpoint = "/content/drive/MyDrive/PSL-Translator/results/checkpoint-1000")

There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'].
  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)
  checkpoint_rng_state = torch.load(rng_file)


Epoch,Training Loss,Validation Loss
1,4.099,2.520235
2,1.4724,0.731277
3,1.1515,0.593459
4,0.8108,0.544146
5,1.0833,0.491236
6,0.4748,0.473967
7,0.5901,0.465713
8,0.704,0.448225
9,0.5959,0.448925
10,0.5011,0.443133


TrainOutput(global_step=10250, training_loss=0.8123898086547852, metrics={'train_runtime': 3781.3446, 'train_samples_per_second': 21.667, 'train_steps_per_second': 2.711, 'total_flos': 3069932799098880.0, 'train_loss': 0.8123898086547852, 'epoch': 10.0})

In [None]:
# Print every validation source sentence followed by the model's generated
# output for it.
model.eval()
device = model.device  # use whatever device the model is on instead of assuming "cuda"
with torch.no_grad():
  for data in val_translation:
    # Examples are stored as flat 1-D tensors; generate() expects a batch dimension.
    input_ids = data['input_ids'].unsqueeze(0).to(device)
    # Inputs are padded to a fixed length, so pass the mask explicitly rather
    # than relying on generate() to infer it from the pad token.
    attention_mask = data['attention_mask'].unsqueeze(0).to(device)
    outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask)
    print(tokenizer.decode(data['input_ids'], skip_special_tokens=True))
    print(tokenizer.decode(outputs[0], skip_special_tokens=True))




Ja zimowe ubranie założyć
Założyłem zimowe ubranie
Oni grać piłka nożna
Oni grają w piłkę nożną
Czy ty kot lubić mieć?
Czy lubisz mieć kota?
Ja zupa pomidorowa gotować
Gotuję zupę pomidorową
Ona na obiad gotować.
Ona gotuje obiad.
Ona stół nakrywać na obiad
Ona nakryła stół na obiad
Ona pies kochać bardzo
Ona bardzo kocha psa
Ty wczoraj kino iść
Czy poszłem wczoraj do kina?
My wieczór przyjaciel odwiedzać często
Często odwiedzamy przyjaciela wieczorem
Ja wieczorem książka czytać
Czytam książkę wieczorem
My wycieczka planować wspólnie
Planujemy wspólnie wycieczkę
On historia pisać
On pisze historię
On samochód czerwony mieć
On ma czerwony samochód
Ja kiedy ostatnio urlop mieć?
Kiedy ostatnio mam urlop?
Ona rower jechać
Ona jeździ rowerem
My przyjaciel spotkanie co weekend
Co weekend spotykamy się z przyjacielem
Dlaczego ty płakać?
Dlaczego płakasz?
Ty muzyka słuchać teraz?
Czy teraz słuchasz muzyki?
Ty urodziny blisko mieć?
Masz blisko urodziny?
Ja napój zimny pić
Piję zimny napój
Oni r

In [None]:
# Save only the model's state_dict (its weights) to Drive; the tokenizer and
# the model configuration are not part of this file.
torch.save(model.state_dict(), "/content/drive/MyDrive/PSL-Translator/model2.pth")

In [None]:
# Sanity check: display the token ids and attention mask the tokenizer produces
# for one short input sentence.
tokenizer("ja woda pić", return_tensors="pt")

{'input_ids': tensor([[   432,    259, 181609,    421,   7155,      1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1]])}