In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Checkpoint identifier handed to both from_pretrained() calls below.
model_name = "ai-forever/mGPT-1.3B-romanian"

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Weights are loaded in float16; device placement is delegated to
# device_map="auto" rather than being chosen here.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.float16,
)

tokenizer_config.json:   0%|          | 0.00/905 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.89M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.20M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/4.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/582 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/725 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/5.77G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/5.77G [00:00<?, ?B/s]

In [2]:
from peft import get_peft_model, LoraConfig, TaskType

# LoRA adapter settings: rank-8 updates scaled by alpha=32, attached to every
# module whose name matches "c_attn" or "c_proj"; no dropout, no bias training.
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    target_modules=["c_attn", "c_proj"],
    r=8,
    lora_alpha=32,
    lora_dropout=0,
    bias="none",
    # NOTE(review): presumably required because the targeted layers store
    # their weights as (fan_in, fan_out) -- confirm against the architecture.
    fan_in_fan_out=True,
)

# Wrap the loaded model with the adapters and report how many parameters
# remain trainable. Note that `model` is rebound to the wrapped model.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 4,325,376 || all params: 1,421,922,304 || trainable%: 0.3042


In [3]:
from datasets import load_dataset

# Read the local JSON corpus and hold out 10% of its "train" split for
# evaluation; the fixed seed keeps the split reproducible.
dataset = (
    load_dataset("json", data_files="moldova_dataset.json")["train"]
    .train_test_split(test_size=0.1, seed=42)
)

def tokenize_function(examples):
    """Tokenize a batch of title/text records for causal-LM fine-tuning.

    Each record is rendered as ``"<title>\\n<text>"``, then tokenized with the
    module-level ``tokenizer``, truncated and padded to exactly 512 tokens.

    Args:
        examples: Batch mapping with parallel ``"title"`` and ``"text"``
            sequences (the shape ``Dataset.map(..., batched=True)`` passes in).

    Returns:
        The tokenizer output extended with a ``"labels"`` entry: a copy of
        ``input_ids`` in which every padding position is set to ``-100``.
    """
    texts = [
        f"{title}\n{content}"
        for title, content in zip(examples["title"], examples["text"])
    ]

    tokenized = tokenizer(
        texts,
        padding="max_length",
        truncation=True,
        max_length=512,
        return_tensors="pt",
    )

    # Mask padding via the attention mask instead of comparing against
    # pad_token_id: a genuine token that happens to share the pad id (e.g.
    # when pad is aliased to EOS) must stay in the loss, while true padding
    # positions are always the ones with attention_mask == 0.
    labels = tokenized["input_ids"].clone()
    labels[tokenized["attention_mask"] == 0] = -100
    tokenized["labels"] = labels

    return tokenized

# Tokenize both splits in batches and drop the raw string columns, which
# tokenize_function has already folded into the token tensors.
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["title", "text"],
)

Map:   0%|          | 0/11464 [00:00<?, ? examples/s]

Map:   0%|          | 0/1274 [00:00<?, ? examples/s]

In [4]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    # Where checkpoints are written.
    output_dir="./mgpt-romanian-lora-finetuned_moldova_03",
    # Optimisation schedule.
    num_train_epochs=4,
    per_device_train_batch_size=4,
    learning_rate=1e-5,
    fp16=True,
    # Logging, evaluation and checkpointing all run on a step cadence.
    # eval_steps is left unset here (per the TrainingArguments docs it then
    # follows logging_steps), so it lines up with save_steps.
    logging_steps=2500,
    eval_strategy="steps",
    save_strategy="steps",
    save_steps=2500,
    save_total_limit=1,
    report_to="none",
    label_names=["labels"],
    # Reload the checkpoint with the lowest eval_loss once training ends.
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)

In [5]:
# Fine-tune on the "train" split and evaluate on the held-out "test" split,
# using the arguments configured in training_args.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=tokenized_dataset["test"],
    train_dataset=tokenized_dataset["train"],
)

# Left as the final expression so the notebook displays the TrainOutput.
trainer.train()

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss,Validation Loss
2500,2.2319,2.073318
5000,2.1307,2.033188
7500,2.104,2.015519
10000,2.0857,2.006653


TrainOutput(global_step=11464, training_loss=2.1311728989254406, metrics={'train_runtime': 5174.703, 'train_samples_per_second': 8.862, 'train_steps_per_second': 2.215, 'total_flos': 1.70864721002496e+17, 'train_loss': 2.1311728989254406, 'epoch': 4.0})

In [6]:
# Persist the fine-tuned PEFT-wrapped model to its own directory.
model.save_pretrained(save_directory="mgpt-romanian-lora-adapter_moldova_03")


In [7]:
# Persist the tokenizer files to a separate directory; the call stays the
# cell's last expression so the tuple of written paths is displayed.
tokenizer.save_pretrained(save_directory="mgpt-romanian-lora-adapter_moldova_03_tok")

('mgpt-romanian-lora-adapter_moldova_03_tok/tokenizer_config.json',
 'mgpt-romanian-lora-adapter_moldova_03_tok/special_tokens_map.json',
 'mgpt-romanian-lora-adapter_moldova_03_tok/vocab.json',
 'mgpt-romanian-lora-adapter_moldova_03_tok/merges.txt',
 'mgpt-romanian-lora-adapter_moldova_03_tok/added_tokens.json',
 'mgpt-romanian-lora-adapter_moldova_03_tok/tokenizer.json')