**SFT (Supervised Fine-Tuning, instruction-tuning)**  — метод адаптации предварительно обученных языковых моделей (LLM) под конкретную задачу с помощью размеченных данных

**Цель** — скорректировать веса модели так, чтобы она лучше справлялась с задачей, не теряя при этом общие знания, полученные в ходе предварительного обучения

### Установка зависимостей

In [None]:
# Environment setup for this notebook (IPython shell escapes, executed top to bottom).
# Base Hugging Face packages.
!pip install transformers accelerate
# Log in to the Hugging Face Hub from the command line.
!huggingface-cli login
# Upgrade pip itself.
# NOTE(review): this runs only after pip has already been used above — consider moving it to the top of the cell.
!pip install --upgrade pip
# Install Unsloth (extra: cu121-torch240) straight from its GitHub repository; --force-reinstall reinstalls it even if a copy is already present.
!pip install --force-reinstall "unsloth[cu121-torch240] @ git+https://github.com/unslothai/unsloth.git"
# Force-reinstall the unsloth_zoo package as well.
!pip install --force-reinstall unsloth_zoo

### Model SQLCoder-7b

In [None]:
import unsloth
from unsloth import FastLanguageModel
from transformers import TextStreamer
import torch

# Hugging Face Hub id of the base checkpoint.
MODEL_NAME = "defog/sqlcoder-7b"

# Load the base model together with its tokenizer through Unsloth,
# with 4-bit loading enabled and a 2048-token sequence limit.
load_kwargs = {
    "model_name": MODEL_NAME,
    "max_seq_length": 2048,
    "load_in_4bit": True,
}
model_sql, tokenizer = FastLanguageModel.from_pretrained(**load_kwargs)

# --- Optional inference smoke test (disabled) --------------------------------
# Uncomment the lines below to generate one answer with the base model.
#
# # switch the model to inference mode
# FastLanguageModel.for_inference(model_sql)
#
# prompt = """
# You are an SQL expert.
# Generate a query to find the top 5 customers with the highest total orders.
# Tables:
# orders(order_id, customer_id, order_date, total)
# customers(customer_id, name)
# """
#
# inputs = tokenizer([prompt], return_tensors="pt").to("cuda")
# streamer = TextStreamer(tokenizer, skip_prompt=True)
#
# # generation (the model loaded above is bound to `model_sql`)
# _ = model_sql.generate(
#     **inputs,
#     streamer=streamer,
#     max_new_tokens=200
# )


### Fine-Tuning

#### Dataset

In [None]:
from datasets import load_dataset

# Read the labelled examples from a local JSON Lines file and take the "train" split.
dataset = load_dataset(
    "json",
    data_files="/content/AQUA/data_processing/data/queries_marked_602_915.jsonl",
)
full_dataset = dataset["train"]

# First cut: hold out 20% of the examples; the remaining 80% is the training set.
temp_split = full_dataset.train_test_split(test_size=0.2, seed=42, shuffle=True)
train_dataset, temp = temp_split["train"], temp_split["test"]

# Second cut: split the held-out part in half into validation and test sets.
val_test = temp.train_test_split(test_size=0.5, seed=42, shuffle=True)
eval_dataset, test_dataset = val_test["train"], val_test["test"]

# Sanity check: show one training record and the sizes of the three splits.
print(train_dataset[0])
print(*(len(split) for split in (train_dataset, eval_dataset, test_dataset)))

#### LoRA adapter

In [None]:
# Modules that receive LoRA adapters: going by their names, the attention
# projections (q/k/v/o) and the MLP projections (gate/up/down).
lora_target_modules = [
    "q_proj",
    "k_proj",
    "v_proj",
    "o_proj",
    "gate_proj",
    "up_proj",
    "down_proj",
]

# Wrap the loaded base model with LoRA adapters via Unsloth.
model = FastLanguageModel.get_peft_model(
    model_sql,
    r=16,  # LoRA rank
    target_modules=lora_target_modules,
    lora_alpha=128,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora=False,
    loftq_config=None,
)

#### Setting up training parameters

In [None]:
from transformers import TrainingArguments
from trl import SFTTrainer
from unsloth import is_bfloat16_supported

# Training hyper-parameters.
# Evaluation and checkpointing both happen once per epoch, so the step-based
# `eval_steps` / `save_steps` settings were dropped: they are only consulted
# when the corresponding strategy is "steps".
training_args = TrainingArguments(
    # --- schedule and batch sizes ---
    num_train_epochs=5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,  # 4 * 4 = 16 examples per optimizer step per device
    # --- optimisation ---
    # NOTE(review): 1e-6 is far below the learning rates commonly used for LoRA
    # fine-tuning — confirm this value is intentional.
    learning_rate=1e-6,
    optim="adamw_torch_fused",
    weight_decay=0.01,
    lr_scheduler_type="cosine",
    warmup_ratio=0.2,
    # --- precision and memory ---
    # Use bf16 when the hardware supports it, otherwise fall back to fp16.
    fp16=not is_bfloat16_supported(),
    bf16=is_bfloat16_supported(),
    gradient_checkpointing=True,
    # --- logging, evaluation, checkpoints ---
    logging_steps=10,
    eval_strategy="epoch",
    save_strategy="epoch",
    # Reload the checkpoint with the lowest eval_loss when training finishes.
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",  # to be changed later
    greater_is_better=False,
    # --- reproducibility and output locations ---
    seed=3407,
    output_dir="/content/AQUA/fine-tuning/training_args",
    logging_dir="/content/AQUA/fine-tuning/logs",
    report_to="none",
)

#### Creating and launching training

In [None]:
from transformers import AutoTokenizer

# Reload the tokenizer for the base checkpoint; this rebinds the `tokenizer` name.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
if tokenizer.pad_token is None:
    # Padding to a fixed length needs a pad token; fall back to the EOS token.
    tokenizer.pad_token = tokenizer.eos_token


def tokenize_function(example):
    """Build the training text for one record and tokenize it.

    The text is the instruction, optionally followed by "\\nInput: <input>",
    then "\\nOutput: <output>". It is truncated or padded to exactly 512 tokens.

    Args:
        example: Mapping with "instruction" and "output" strings and an
            optional "input" string. A missing, ``None`` or blank "input"
            is treated as "no input".

    Returns:
        Whatever the module-level ``tokenizer`` returns for the text.
    """
    prompt = example["instruction"]

    # A JSONL row may omit "input" or carry null; the original `.strip()` call
    # raised on those rows, so they are normalised to an empty string first.
    extra_input = example.get("input") or ""
    if extra_input.strip():
        prompt += "\nInput: " + extra_input
    prompt += "\nOutput: " + example["output"]

    # NOTE(review): no EOS token is appended to the text here; whether the
    # tokenizer adds one itself depends on its configuration — confirm the
    # model can learn where an answer ends.
    return tokenizer(prompt, truncation=True, padding="max_length", max_length=512)


# Tokenize the train and validation splits one record at a time.
tokenized_train = train_dataset.map(tokenize_function, batched=False)
tokenized_eval = eval_dataset.map(tokenize_function, batched=False)


In [None]:
# Collect the trainer configuration: the LoRA-wrapped model, the tokenizer,
# the pre-tokenized train/validation splits, a 512-token limit and no packing.
sft_kwargs = dict(
    model=model,
    tokenizer=tokenizer,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    max_seq_length=512,
    packing=False,
    args=training_args,
)
trainer = SFTTrainer(**sft_kwargs)

# Run fine-tuning. Kept as the last expression of the cell so the notebook
# still displays the value returned by train().
trainer.train()

In [None]:
# Save the fine-tuned model and the tokenizer into the same directory.
# NOTE(review): `model` is PEFT-wrapped, so this presumably stores the LoRA
# adapter rather than merged full weights — confirm before deployment.
finetuned_dir = "/content/AQUA/fine-tuning/Model/sqlcoder-finetuned"
model.save_pretrained(finetuned_dir)
tokenizer.save_pretrained(finetuned_dir)