In [23]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

model_name = "ai-forever/mGPT"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto")

In [24]:
from peft import get_peft_model, LoraConfig, TaskType

lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["c_attn"], 
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.CAUSAL_LM
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 1,572,864 || all params: 1,419,169,792 || trainable%: 0.1108




In [25]:
from datasets import load_dataset

dataset = load_dataset("json", data_files="/kaggle/input/ziare-romania/romania_dataset.json")

def tokenize_function(examples):
    texts = [f"Title: {title}\nContent: {content}" for title, content in zip(examples["title"], examples["content"])]
    
    tokenized = tokenizer(
        texts, 
        padding="max_length", 
        truncation=True, 
        max_length=512,
        return_tensors="pt"
    )

    tokenized["labels"] = tokenized["input_ids"].clone()
    
    return tokenized

tokenized_dataset = dataset.map(tokenize_function, batched=True)

In [26]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./mgpt-lora-finetuned",
    per_device_train_batch_size=4,
    num_train_epochs=2,
    save_strategy="steps",
    # save_steps=500,
    logging_steps=100,
    learning_rate=0.00001,
    fp16=True,
    report_to="none"
)

In [27]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
)

trainer.train()

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss


TrainOutput(global_step=1, training_loss=20.03041648864746, metrics={'train_runtime': 1.5016, 'train_samples_per_second': 1.715, 'train_steps_per_second': 0.666, 'total_flos': 14870636396544.0, 'train_loss': 20.03041648864746, 'epoch': 0.00015535187199005747})

In [28]:
model.save_pretrained("mgpt-lora-adapter")
tokenizer.save_pretrained("mgpt-lora-adapter")

('mgpt-lora-adapter/tokenizer_config.json',
 'mgpt-lora-adapter/special_tokens_map.json',
 'mgpt-lora-adapter/vocab.json',
 'mgpt-lora-adapter/merges.txt',
 'mgpt-lora-adapter/added_tokens.json',
 'mgpt-lora-adapter/tokenizer.json')