In [2]:
import os
import pandas as pd
import torch
import random
from datasets import Dataset
from transformers import (
    DataCollatorForSeq2Seq,
    MT5ForConditionalGeneration,
    MT5Tokenizer,
    Trainer,
    TrainingArguments,
)

os.environ["WANDB_DISABLED"] = "true"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [3]:
df = pd.read_csv("dataset.csv")
df.head()

Unnamed: 0,original_sentence,modified_sentence
0,Иако веќе знае дека не му припаѓа на светот на...,Иако веќе знае дека не му припаѓа на светот од...
1,"Но, и тука е искористен, Кузман само ги барал ...","Но, и тука е искористен, Кузман само ги бараше..."
2,"Следува дружењето со „Црните браќа“, исто деца...","Следува дружењето со ""Црните браќа"", исто деца..."
3,Таа тепачка сосема ќе го освести Славе и ќе го...,Таа тепачка целосно ќе освести Славе и ќе го о...
4,"Тогаш тој ќе се сврти кон убавото, кон девојче...","Тогаш тој ќе се сврне кон убавиот, кон девојче..."


In [4]:
latin_to_cyrillic_map = str.maketrans({
    "o": "о", "a": "а", "e": "е", "c": "с", "x": "х", "y": "у",
    "O": "О", "A": "А", "E": "Е", "C": "С", "X": "Х", "Y": "У"
})

def normalize_text(text):
    return text.translate(latin_to_cyrillic_map)

synonym_replacements = {
    "создадена": "создадено",
    "добар": "добра",
    "еден": "една",
}

extra_errors = [
    (" и ", " "),
    (",", ""),
    (" во ", " на "),
]

def introduce_errors(sentence):
    words = sentence.split()
    for i, w in enumerate(words):
        if w in synonym_replacements and random.random() < 0.7:
            words[i] = synonym_replacements[w]
    noisy_sentence = " ".join(words)
    for src, tgt in extra_errors:
        if random.random() < 0.5:
            noisy_sentence = noisy_sentence.replace(src, tgt)
    return noisy_sentence


In [5]:
synthetic_data = []
for _, row in df.iterrows():
    correct = normalize_text(row["modified_sentence"])
    incorrect = normalize_text(row["original_sentence"])
    synthetic_incorrect = introduce_errors(correct)
    synthetic_data.append((incorrect, correct))
    synthetic_data.append((synthetic_incorrect, correct))

synthetic_data = synthetic_data * 3

aug_df = pd.DataFrame(synthetic_data, columns=["input_sentence", "target_sentence"])
aug_df = aug_df.drop_duplicates()
aug_df["input_text"] = "поправи ја реченицата: " + aug_df["input_sentence"]
aug_df["target_text"] = aug_df["target_sentence"]

dataset = Dataset.from_pandas(aug_df[["input_text", "target_text"]])
dataset


Dataset({
    features: ['input_text', 'target_text', '__index_level_0__'],
    num_rows: 13389
})

In [6]:
model_name = "google/mt5-small"
tokenizer = MT5Tokenizer.from_pretrained(model_name)
model = MT5ForConditionalGeneration.from_pretrained(model_name).to(device)

def tokenize(examples):
    model_inputs = tokenizer(
        examples["input_text"],
        truncation=True,
        padding="max_length",
        max_length=128,
    )
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            examples["target_text"],
            truncation=True,
            padding="max_length",
            max_length=128,
        )
    labels["input_ids"] = [
        [tid if tid != tokenizer.pad_token_id else -100 for tid in ids]
        for ids in labels["input_ids"]
    ]
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_dataset = dataset.map(tokenize, batched=True, remove_columns=["input_text", "target_text"])


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'T5Tokenizer'. 
The class this function is called from is 'MT5Tokenizer'.
You are using the default legacy behaviour of the <class 'transformers.models.mt5.tokenization_mt5.MT5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Map: 100%|█████████

In [7]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

training_args = TrainingArguments(
    output_dir="./mt5-mk-corrector",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    num_train_epochs=15,
    learning_rate=2e-5,
    warmup_ratio=0.1,
    logging_steps=50,
    save_steps=500,
    save_total_limit=2,
    fp16=False,
    logging_dir="./logs",
    report_to=None,
)

tokenized_dataset = tokenized_dataset.shuffle(seed=42)
split = tokenized_dataset.train_test_split(test_size=0.1)


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [8]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=split["train"],
    eval_dataset=split["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()


  trainer = Trainer(


Step,Training Loss
50,24.5108
100,25.6512
150,23.6058
200,23.2933
250,22.3293
300,20.7651
350,19.7642
400,17.9628
450,16.4006
500,15.3121


TrainOutput(global_step=11310, training_loss=2.6045528577135055, metrics={'train_runtime': 3069.7449, 'train_samples_per_second': 58.881, 'train_steps_per_second': 3.684, 'total_flos': 2.389287665664e+16, 'train_loss': 2.6045528577135055, 'epoch': 15.0})

In [9]:
model.save_pretrained("./mt5-mk-corrector")
tokenizer.save_pretrained("./mt5-mk-corrector")


('./mt5-mk-corrector\\tokenizer_config.json',
 './mt5-mk-corrector\\special_tokens_map.json',
 './mt5-mk-corrector\\spiece.model',
 './mt5-mk-corrector\\added_tokens.json')

In [10]:
def correct_sentence(sentence):
    inputs = tokenizer("исправи реченица: " + sentence, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model.generate(**inputs, max_length=64, num_beams=8, early_stopping=True)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


In [11]:
# Example
print(correct_sentence("јас имам еден книга"))

Јас имам еден книга
