## Дообучение Saiga-Mistral, квантизация и инференс с помощью llama-cpp

В репозитории реализован код для дообучения русскоязычной LLM [Saiga mistral](https://huggingface.co/IlyaGusev/saiga_mistral_7b_lora), а также её квантизация и запуск с помощью llama-cpp. Попытался сделать код максимально гибким и воспроизводимым.  
Предполагается запуск на GPU. Может запускаться на multi-gpu без доп. модификаций.  
При создании ноутбука опирался на эту [статью](https://habr.com/ru/articles/776872/) на Хабре, задекорировал и актуализировал некоторые моменты

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig, Trainer, TrainingArguments, BitsAndBytesConfig
from peft import PeftModel, PeftConfig, AutoPeftModelForCausalLM
from datasets import load_dataset
import transformers
import torch
import time
import os

In [6]:
MODEL_NAME = "IlyaGusev/saiga2_7b_lora"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

st_time = time.time()

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype= torch.float16,
    bnb_4bit_use_double_quant=False,
)


config = PeftConfig.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    load_in_8bit = True,
    torch_dtype=torch.float16,
    device_map="auto",

)
model = PeftModel.from_pretrained(
    model,
    MODEL_NAME,
    torch_dtype=torch.float16,
    is_trainable = True,
    quantization=bnb_config
)

model.eval()

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=False)
generation_config = GenerationConfig.from_pretrained(MODEL_NAME)

print(generation_config)
print(f'Загрузка модели заняла {round(time.time() - st_time, 2)} секунд')

Loading checkpoint shards: 100%|██████████████████████████████████████████████████████████| 2/2 [00:02<00:00,  1.18s/it]


GenerationConfig {
  "bos_token_id": 1,
  "do_sample": true,
  "eos_token_id": 2,
  "max_new_tokens": 3584,
  "no_repeat_ngram_size": 15,
  "pad_token_id": 0,
  "repetition_penalty": 1.2,
  "temperature": 0.5,
  "top_k": 30,
  "top_p": 0.9
}

Прошло времени 16.81461763381958


In [7]:
model.print_trainable_parameters()

trainable params: 16,777,216 || all params: 6,755,192,832 || trainable%: 0.24836028248556738


In [10]:
!nvidia-smi

Fri Feb  9 11:04:26 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 545.23.08              Driver Version: 545.23.08    CUDA Version: 12.3     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A10                     On  | 00000000:00:10.0 Off |                    0 |
|  0%   38C    P0              54W / 150W |  13480MiB / 23028MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA A10                     On  | 00000000:00:11.0 Off |  

### Загрузка датасета

Датасет для дообучения должен быть в формате json и иметь формат ```[{"system": str, "user": str, "bot": str}, ... ]```, где system - системное сообщение для модели (например, у Сайги в use-example это "Ты — Сайга, русскоязычный автоматический ассистент. Ты разговариваешь с людьми и помогаешь им."), user - это промпт пользователя, bot - ответ модели.

In [9]:
TRAIN_PATH = "train.json"
VALID_PATH = "val.json"

data = load_dataset(
    "json", 
    data_files={
                'train': TRAIN_PATH,
                'validation': VALID_PATH
    }
)
data["train"] = data["train"].shuffle() # for train data shuffling, optional

## Предобработка датасета

In [11]:
CUTOFF_LEN = 2500 # до какого токена будет обрезать текст


def generate_prompt(data_point):
    prompt = f"""<s>system
{data_point['system']}</s><s>user
{data_point['user']}</s><s>bot
{data_point['bot']}[</s>"""
    return prompt
 
    
def tokenize(prompt, add_eos_token=True):
    result = tokenizer(
        prompt,
        truncation=True,
        max_length=CUTOFF_LEN,
        padding=False,
        return_tensors=None,
    )
    if (
        result["input_ids"][-1] != tokenizer.eos_token_id and len(result["input_ids"]) < CUTOFF_LEN
        and add_eos_token
    ):
        
        result["input_ids"].append(tokenizer.eos_token_id)
        result["attention_mask"].append(1)
    result["labels"] = result["input_ids"].copy()
    return result


def generate_and_tokenize_prompt(data_point):
    full_prompt = generate_prompt(data_point)
    tokenized_full_prompt = tokenize(full_prompt)
    return tokenized_full_prompt


In [12]:
train_data = (
    data["train"].map(generate_and_tokenize_prompt)
)

val_data = (
    data["validation"].map(generate_and_tokenize_prompt)
)

Map: 100%|████████████████████████████████████████████████████████████████| 15033/15033 [00:42<00:00, 351.34 examples/s]


## Обучение модели

In [13]:
BATCH_SIZE = 6
MICRO_BATCH_SIZE = 2
GRADIENT_ACCUMULATION_STEPS = BATCH_SIZE // MICRO_BATCH_SIZE
LEARNING_RATE = 3e-4
TRAIN_EPOCHS = 5
OUTPUT_DIR = "finetuned_model"
LOG_PER_EPOCH = 

training_arguments = transformers.TrainingArguments(
            per_device_train_batch_size=MICRO_BATCH_SIZE,
            per_device_eval_batch_size=MICTO_BATCH_SIZE,
            prediction_loss_only=True,
            gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
            num_train_epochs=TRAIN_EPOCHS,
            learning_rate=LEARNING_RATE,
            fp16=True,
            logging_steps=25000,
            optim="adamw_torch",
            evaluation_strategy="epoch",
            save_strategy="epoch",
            output_dir=OUTPUT_DIR,
            load_best_model_at_end=True,
            report_to=None,
            overwrite_output_dir=True,
)

In [14]:
data_collator = transformers.DataCollatorForSeq2Seq(
    tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True
)

In [15]:
trainer = transformers.Trainer(
    model=model,
    train_dataset=train_data,
    eval_dataset=val_data,
    args=training_arguments,
    data_collator=data_collator
)
model = torch.compile(model)
trainer.train()
model.save_pretrained(OUTPUT_DIR)

[34m[1mwandb[0m: Currently logged in as: [33mglebbondarchuk[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss
10000,No log,0.542794
20000,No log,0.526266
30000,0.528300,0.515486
40000,0.528300,0.515921
50000,0.435400,0.521162
60000,0.435400,0.515704


IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



## Квантизация модели

Для начала склонируем репозитории с библиотеками rulm и llama-cpp для конкатенации обученного адаптера и квантизации.

In [None]:
!git clone https://github.com/IlyaGusev/rulm.git
!git clone https://github.com/ggerganov/llama.cpp

### Склеим модель и обученный адаптер

In [1]:
from rulm.self_instruct.src.tools import convert_to_native

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
PATH_TO_CHECKPOINT = "finetuned_model/checkpoint-70000" # путь до чекпоинта адаптера, который хотим приклеить
MERGED_MODEL_PATH = "merged_model.pt"

convert_to_native.convert_to_native(PATH_TO_CHECKPOINT, MERGED_MODEL_PATH, 
                                    device="cuda", enable_offloading=True)

  return self.fget.__get__(instance, owner)()
Loading checkpoint shards: 100%|██████████████████████████████████████████████████████████| 2/2 [00:01<00:00,  1.06it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 291/291 [00:02<00:00, 122.89it/s]


Saving state_dict...


### Конвертируем склеенную модель в 16-битный формат GGUF для запуска с помощью llama-cpp

In [3]:
# сперва сохраним токенайзер в папку, где лежит лучший чекпоинт

tokenizer = AutoTokenizer.from_pretrained("IlyaGusev/saiga2_7b_lora", use_fast=False)
tokenizer.save_pretrained(PATH_TO_CHECKPOINT)

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


('tmp/checkpoint-70000/tokenizer_config.json',
 'tmp/checkpoint-70000/special_tokens_map.json',
 'tmp/checkpoint-70000/tokenizer.model',
 'tmp/checkpoint-70000/added_tokens.json')

Обязательная строчка, надо откатить версию llama-cpp, т.к. на последней квантизация почему-то не работает.

In [4]:
%cd llama.cpp
!git checkout 64e64aa

/home/ubuntu/jupyter-home-dir/llama.cpp
HEAD is now at 64e64aa2 ggml : restore abort() in GGML_ASSERT (#4242)


In [5]:
OUTPUT_PATH = "../model-f16.gguf"

In [6]:
!python convert.py {MERGED_MODEL_PATH} --vocab-dir {PATH_TO_CHECKPOINT} --outfile {OUTPUT_PATH} --outtype f16 --ctx 4096

Loading model file ../merged_model.pt
params = Params(n_vocab=32000, n_embd=4096, n_layer=32, n_ctx=4096, n_ff=11008, n_head=32, n_head_kv=32, f_norm_eps=1e-05, rope_scaling_type=None, f_rope_freq_base=None, f_rope_scale=None, n_orig_ctx=None, rope_finetuned=None, ftype=<GGMLFileType.MostlyF16: 1>, path_model=PosixPath('..'))
Loading vocab file '../tmp/checkpoint-70000/tokenizer.model', type 'spm'
tok_embeddings.weight                            -> token_embd.weight                        | F16    | [32000, 4096]
layers.0.attention.wq.weight                     -> blk.0.attn_q.weight                      | F16    | [4096, 4096]
layers.0.attention.wk.weight                     -> blk.0.attn_k.weight                      | F16    | [4096, 4096]
layers.0.attention.wv.weight                     -> blk.0.attn_v.weight                      | F16    | [4096, 4096]
layers.0.attention.wo.weight                     -> blk.0.attn_output.weight                 | F16    | [4096, 4096]
layers.0.feed

### Квантуем моедль в 4 бита и 8 бит

In [7]:
!make quantize

I llama.cpp build info: 
I UNAME_S:   Linux
I UNAME_P:   x86_64
I UNAME_M:   x86_64
I CFLAGS:    -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG  -std=c11   -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int -Werror=implicit-function-declaration -Wdouble-promotion -pthread -march=native -mtune=native 
I CXXFLAGS:  -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG  -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -Wno-array-bounds -Wno-format-truncation -Wextra-semi -march=native -mtune=native 
I NVCCFLAGS:  -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG  -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread    -Wno-pedantic -Xcompiler "-Wno-array-bounds -Wno-format-truncation -Wextra-semi -march=native -mt

In [8]:
QUANT_MODEL = "../model-q4_0.gguf"
QUANTIZATION_TYPE = "q4_0" # "q4_0" или "q4_1"

In [9]:
! ./quantize {OUTPUT_PATH} {QUANT_MODEL} {QUANTIZATION_TYPE}

main: build = 1575 (64e64aa2)
main: built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
main: quantizing '../model-f16.gguf' to '../model-q4_0.gguf' as Q4_0
llama_model_loader: loaded meta data with 15 key-value pairs and 291 tensors from ../model-f16.gguf (version GGUF V3 (latest))
llama_model_loader: - tensor    0:                token_embd.weight f16      [  4096, 32000,     1,     1 ]
llama_model_loader: - tensor    1:              blk.0.attn_q.weight f16      [  4096,  4096,     1,     1 ]
llama_model_loader: - tensor    2:              blk.0.attn_k.weight f16      [  4096,  4096,     1,     1 ]
llama_model_loader: - tensor    3:              blk.0.attn_v.weight f16      [  4096,  4096,     1,     1 ]
llama_model_loader: - tensor    4:         blk.0.attn_output.weight f16      [  4096,  4096,     1,     1 ]
llama_model_loader: - tensor    5:            blk.0.ffn_gate.weight f16      [  4096, 11008,     1,     1 ]
llama_model_loader: - tensor    6:             

### Запуск скомпилированной версии на GPU с помощью llama-cpp

Переустановим llama-cpp на последнюю версию. Параметры, которые идут перед установкой, обязательны для запуска на GPU.

In [3]:
!CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install --upgrade --force-reinstall llama-cpp-python --no-cache-dir

Collecting llama-cpp-python
  Downloading llama_cpp_python-0.2.42.tar.gz (10.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.7/10.7 MB[0m [31m61.3 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Installing backend dependencies ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hCollecting typing-extensions>=4.5.0 (from llama-cpp-python)
  Downloading typing_extensions-4.9.0-py3-none-any.whl.metadata (3.0 kB)
Collecting numpy>=1.20.0 (from llama-cpp-python)
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m224.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting diskcache>=5.6.1 (from llama-cpp-python)
  Downloading diskcache-5.6.3-py3-none-any.whl.metadata (20 kB)
Collecting jinja2>=

### Использование модели в питоновском коде

In [1]:
from llama_cpp import Llama

In [2]:
llm = Llama(model_path="model-q4_0.gguf", n_gpu_layers=128, n_ctx=2048)

ggml_init_cublas: GGML_CUDA_FORCE_MMQ:   no
ggml_init_cublas: CUDA_USE_TENSOR_CORES: yes
ggml_init_cublas: found 3 CUDA devices:
  Device 0: NVIDIA A10, compute capability 8.6, VMM: yes
  Device 1: NVIDIA A10, compute capability 8.6, VMM: yes
  Device 2: NVIDIA A10, compute capability 8.6, VMM: yes
llama_model_loader: loaded meta data with 16 key-value pairs and 291 tensors from model-q4_0.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = LLaMA v2
llama_model_loader: - kv   2:                       llama.context_length u32              = 4096
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32        

In [3]:
!nvidia-smi

Wed Feb 14 08:48:37 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 545.23.08              Driver Version: 545.23.08    CUDA Version: 12.3     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A10                     On  | 00000000:00:10.0 Off |                    0 |
|  0%   37C    P0              55W / 150W |   1966MiB / 23028MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA A10                     On  | 00000000:00:11.0 Off |  

In [1]:
prompt = f"""<s>system
    {'Any system prompt'}</s><s>user
    {'Any user prompt'}</s><s>bot"""

In [6]:
start_time = time.time()

output = llm(
      p, # Prompt
      max_tokens=2048,
      echo=False,
      temperature=0
)

print(time.time() - start_time)


llama_print_timings:        load time =     167.61 ms
llama_print_timings:      sample time =     135.45 ms /   385 runs   (    0.35 ms per token,  2842.31 tokens per second)
llama_print_timings: prompt eval time =     166.81 ms /   218 tokens (    0.77 ms per token,  1306.84 tokens per second)
llama_print_timings:        eval time =    4487.18 ms /   384 runs   (   11.69 ms per token,    85.58 tokens per second)
llama_print_timings:       total time =    5690.04 ms /   602 tokens


5.694703578948975


In [None]:
print(output["choices"][0]["text"][:-1])