<a href="https://colab.research.google.com/github/Daniil81704/Fine-tuning-w-LoRA/blob/main/Dostoevskiy_fine_tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -U datasets evaluate peft

In [None]:
import requests

# Raw text file that is used as the fine-tuning corpus.
url = "https://gitlab.com/z00logist/artificial-dostoevsky/-/raw/main/data/corpus.txt"
try:
    # The timeout keeps the cell from hanging forever on a stalled connection.
    response = requests.get(url, timeout=60)
    response.raise_for_status()
except requests.exceptions.RequestException as e:
    print(f"Ошибка при запросе URL: {e}")
    # Re-raise: later cells need `text_from_url`, so carrying on would only
    # turn this failure into a confusing NameError further down.
    raise

# Whole corpus as a single string.
text_from_url = response.text

In [None]:
# Show the first 1000 characters of the downloaded text as a quick sanity check.
text_from_url[:1000]

In [None]:
import torch
from transformers import Trainer, TrainingArguments, GPT2LMHeadModel, GPT2Tokenizer
from datasets import load_dataset, Dataset
from peft import get_peft_model, LoraConfig

def preprocess_function(examples, max_length=50):
    """Tokenize ``examples['text']`` and attach causal-LM labels.

    Every sequence is truncated / padded to exactly ``max_length`` tokens.
    Labels equal the input ids, except that padded positions (attention mask
    0) are set to -100 so the loss ignores them. This matters because the
    notebook reuses the EOS token as the pad token: without the mask the model
    would be trained to predict a run of EOS tokens after every short example.

    Args:
        examples: Mapping with a ``'text'`` entry holding either a list of
            strings (``Dataset.map(..., batched=True)``) or a single string.
        max_length: Fixed token length of every produced sequence.

    Returns:
        The tokenizer output with an extra ``'labels'`` entry shaped like
        ``'input_ids'``.
    """
    encoding = tokenizer(examples['text'], truncation=True, padding="max_length", max_length=max_length)
    input_ids = encoding['input_ids']
    attention_mask = encoding['attention_mask']

    def _mask_padding(ids, mask):
        # Keep real tokens as targets; replace padding with the ignore index.
        return [token if keep else -100 for token, keep in zip(ids, mask)]

    if input_ids and isinstance(input_ids[0], int):
        # Non-batched call: a single flat list of token ids.
        encoding['labels'] = _mask_padding(input_ids, attention_mask)
    else:
        encoding['labels'] = [
            _mask_padding(ids, mask) for ids, mask in zip(input_ids, attention_mask)
        ]
    return encoding

In [None]:
# Checkpoint identifier shared by the tokenizer and the model.
model_name = "sberbank-ai/rugpt3large_based_on_gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# Run on the GPU when torch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device('cuda')
else:
    DEVICE = torch.device('cpu')
print(DEVICE)
model.to(DEVICE)

In [None]:
# Number of passes over the corpus. Keep this small: the dataset below holds
# one example per corpus line, so each epoch is a full pass over the text.
NUM_TRAIN_EPOCHS = 3

# One training example per non-empty line. Feeding the whole corpus as a single
# example would, after truncation in preprocess_function, keep only its first
# few dozen tokens and throw the rest of the text away.
corpus_lines = [line.strip() for line in text_from_url.splitlines() if line.strip()]
if not corpus_lines:
    raise ValueError("The downloaded corpus contains no non-empty lines to train on.")

data = {
    "text": corpus_lines
}

dataset = Dataset.from_dict(data)
# Lines longer than the tokenizer's max_length are still truncated there.
tokenized_dataset = dataset.map(preprocess_function, batched=True)

lora_config = LoraConfig(
    r=64,
    lora_alpha=64,
    lora_dropout=0.1,
    # GPT-2 style checkpoints use Conv1D layers whose weights are stored as
    # (fan_in, fan_out); stating it explicitly avoids the PEFT warning.
    fan_in_fan_out=True,
    task_type='CAUSAL_LM'
)

# Wraps the base model so that only the LoRA adapter weights are trained.
model = get_peft_model(model, lora_config)

training_args = TrainingArguments(
    # Explicit output directory: older transformers releases require it.
    output_dir='trainer_output',
    report_to='none',
    learning_rate=5e-5,
    num_train_epochs=NUM_TRAIN_EPOCHS,
    per_device_train_batch_size=8,
    warmup_steps=10,
    # Effective batch size per device is 8 * 4 = 32 examples.
    gradient_accumulation_steps=4,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
)

trainer.train()

# model.save_pretrained('./finetuned_gpt3')

In [None]:
import numpy as np

# Seed both RNGs so the sampled continuation is reproducible.
np.random.seed(42)
torch.manual_seed(42)

# The model may still be in training mode after fine-tuning; eval mode turns
# dropout (including the LoRA dropout) off while sampling.
model.eval()

text = "Не хотите ли кофею?"
# Calling the tokenizer (instead of .encode) also yields the attention mask.
# Inputs go to whatever device the model lives on, so the cell works on CPU too.
model_device = next(model.parameters()).device
inputs = tokenizer(text, return_tensors="pt").to(model_device)
inpt = inputs["input_ids"]

out = model.generate(input_ids=inpt,
                     attention_mask=inputs["attention_mask"],
                     # Explicit pad id silences the "pad_token_id not set" warning.
                     pad_token_id=tokenizer.eos_token_id,
                     max_length=200,
                     repetition_penalty=6.0,
                     do_sample=True,
                     top_k=5,
                     top_p=0.95,
                     temperature=1,
                     no_repeat_ngram_size=2)

# A single prompt was passed, so the first (only) sequence is the result.
generated_text = tokenizer.decode(out[0])
print(generated_text)