In [None]:
#@title Скачиваем и устанавливаем зависимости

from pathlib import Path

!git clone https://github.com/IlyaGusev/rulm.git
!git clone https://github.com/ggerganov/llama.cpp.git

self_instruct_dir = Path('rulm/self_instruct').resolve()

!cd {self_instruct_dir} && pip install -r ../requirements.txt

In [None]:
#@title Логинимся в WandB
# Log in to Weights & Biases (the title reads: "Logging in to WandB").

import wandb
# NOTE(review): presumably prompts for an API key when none is stored — confirm against the wandb docs.
wandb.login()

In [None]:
#@title Генерируем обучающую и валидационную выборку, обрезая её для Colab

content_dir = Path('.').resolve()
!cd {self_instruct_dir} && python -m src.data_processing.create_chat_set \
    {content_dir / 'train_full.jsonl'} \
    {content_dir / 'val_full.jsonl'}

assert (content_dir / 'train_full.jsonl').exists()

train_size_limit = 400 #@param {type:"integer"}
val_size_limit = 200 #@param {type:"integer"}

!head -n {train_size_limit} {content_dir / 'train_full.jsonl'} > {content_dir / 'train.jsonl'}
!head -n {val_size_limit} {content_dir / 'val_full.jsonl'} > {content_dir / 'val.jsonl'}

In [None]:
#@title Скачиваем базовую модель ruGPT-3.5-13B и чиним её конфиг
# Download the base model snapshot from the Hugging Face Hub and, optionally,
# overwrite its tokenizer / special-token / generation config files with the
# fixed contents defined below.

import json
from huggingface_hub import snapshot_download

# Local download directory. Note that it is fixed to "ruGPT-3.5-13B" and does
# not follow `base_model` if that form field is changed.
model_dir = content_dir / "ruGPT-3.5-13B"
base_model = "ai-forever/ruGPT-3.5-13B" #@param {type:"string"}
snapshot_download(repo_id=base_model, local_dir=model_dir, ignore_patterns=["LICENSE", "README.md", ".gitattributes"])

patch_model_config = True #@param {type:"boolean"}

if patch_model_config:
    # Maps a file name inside model_dir to the complete JSON content it should
    # have after patching (files are replaced wholesale, not merged).
    # NOTE(review): the token ids below are assumed to match the ruGPT-3.5
    # vocabulary — confirm against the downloaded tokenizer.
    replacements = {
        "tokenizer_config.json": {
            "add_bos_token": False,
            "add_prefix_space": False,
            "bos_token": {
                "__type": "AddedToken",
                "content": "<s>",
                "lstrip": False,
                "normalized": True,
                "rstrip": False,
                "single_word": False
            },
            "clean_up_tokenization_spaces": True,
            "eos_token": {
                "__type": "AddedToken",
                "content": "</s>",
                "lstrip": False,
                "normalized": True,
                "rstrip": False,
                "single_word": False
            },
            "errors": "replace",
            "mask_token": "<mask>",
            "model_max_length": 2048,
            "pad_token": {
                "__type": "AddedToken",
                "content": "<pad>",
                "lstrip": False,
                "normalized": True,
                "rstrip": False,
                "single_word": False
            },
            "tokenizer_class": "GPT2Tokenizer",
            "unk_token": {
                "__type": "AddedToken",
                "content": "<|endoftext|>",
                "lstrip": False,
                "normalized": True,
                "rstrip": False,
                "single_word": False
            },
            "padding_side": "left"
        },
        "special_tokens_map.json": {
            "bos_token": {
                "content": "<s>",
                "lstrip": False,
                "normalized": True,
                "rstrip": False,
                "single_word": False
            },
            "eos_token": {
                "content": "</s>",
                "lstrip": False,
                "normalized": True,
                "rstrip": False,
                "single_word": False
            },
            "mask_token": "<mask>",
            "pad_token": {
                "content": "<pad>",
                "lstrip": False,
                "normalized": True,
                "rstrip": False,
                "single_word": False
            },
            "sep_token": "<s>",
            "unk_token": {
                "content": "<|endoftext|>",
                "lstrip": False,
                "normalized": True,
                "rstrip": False,
                "single_word": False
            }
        },
        "generation_config.json": {
            "_from_model_config": True,
            "bos_token_id": 2,
            "eos_token_id": 3,
            "pad_token_id": 0,
            "temperature": 0.2,
            "top_p": 0.9,
            "top_k": 30,
            "do_sample": True,
            "max_new_tokens": 1536,
            "num_beams": 1,
            "repetition_penalty": 1.15,
            "no_repeat_ngram_size": 15
        },
    }

    print('Patching model config...')
    for filename, new_content in replacements.items():
        print(f'{filename}:')
        # Read the current content first so it can be shown and compared.
        with (model_dir / filename).open() as fp:
            old_content = json.load(fp)
        print(f'    Original content: {old_content}')
        if old_content == new_content:
            # Really skip the rewrite, so the message matches what happens.
            print('    Already patched, skipping')
            continue
        print(f'    Updated content:  {new_content}')
        with (model_dir / filename).open('w') as fp:
            json.dump(new_content, fp, indent=4)

In [None]:
#@title Уменьшаем размер батча и лимит токенов, чтобы поместиться в Colab, и длительность обучения для демки
# Load the stock training config, shrink batch size / token limit so training
# fits into Colab, shorten the run for the demo, and save the result as a
# separate "*_colab.json" config next to the original one.

def _locate_repo_file(relative_path):
    """Return `relative_path` under content_dir if it exists there, else under self_instruct_dir.

    A copy in the working directory keeps priority; the fallback points into
    the rulm checkout cloned by the first cell.
    NOTE(review): assumes the rulm checkout ships these files under
    self_instruct — confirm.
    """
    local_candidate = content_dir / relative_path
    if local_candidate.exists():
        return local_candidate
    return self_instruct_dir / relative_path


original_config_path = _locate_repo_file('configs/rugpt35_13b.json')

with original_config_path.open('r') as fp:
    config = json.load(fp)

# Colab adjustments
config['trainer']['per_device_train_batch_size'] = 2 #@param {type:"integer"}
config['trainer']['per_device_eval_batch_size'] = 1 #@param {type:"integer"}
config['trainer']['gradient_accumulation_steps'] = 128 #@param {type:"integer"}
config['trainer']['eval_steps'] = 50 #@param {type:"integer"}
config['trainer']['save_steps'] = 50 #@param {type:"integer"}
config['max_tokens_count'] = 1000 #@param {type:"integer"}
#config['model_name'] = str(model_dir)
config['templates_path'] = str(_locate_repo_file('internal_prompts/rugpt35.json'))
# Form type is "boolean", matching the `patch_model_config` field in the download cell.
config['load_in_8bit'] = True #@param {type:"boolean"}
config['load_in_4bit'] = False #@param {type:"boolean"}

# Demo adjustments
# eval_steps here overrides the Colab value assigned above.
config['trainer']['eval_steps'] = 2 #@param {type:"integer"}
config['trainer']['logging_steps'] = 1 #@param {type:"integer"}
config['trainer']['num_train_epochs'] = 1 #@param {type:"integer"}

# Write the adjusted config into the same directory the original was read from.
config_path = original_config_path.with_name('rugpt35_13b_colab.json')

with config_path.open('w') as fp:
    json.dump(config, fp, indent=4)

In [None]:
#@title Запускаем обучение!
# Run rulm's `src.train` module from inside self_instruct, passing the Colab
# config and the truncated train/validation files.

# Directory handed to the trainer via --output-dir.
output_dir = content_dir / 'output'

!cd {self_instruct_dir} && python3 -m src.train \
    --config-file {config_path} \
    --train-file {content_dir / 'train.jsonl'} \
    --val-file {content_dir / 'val.jsonl'} \
    --output-dir {output_dir}

# Sanity check: the adapter config must be present in output_dir after training.
assert (output_dir / 'adapter_config.json').exists()

In [None]:
#@title Исправляем конфиг обученной модели
# Replace generation_config.json in the training output directory with a fixed
# set of generation settings (token ids plus sampling parameters).

with open(output_dir / 'generation_config.json', 'w') as config_file:
    config_file.write(
        json.dumps(
            dict(
                bos_token_id=2,
                eos_token_id=3,
                pad_token_id=0,
                temperature=0.2,
                top_p=0.9,
                top_k=30,
                do_sample=True,
                max_new_tokens=1536,
                num_beams=1,
                repetition_penalty=1.15,
                no_repeat_ngram_size=15,
            ),
            indent=4,
        )
    )

In [None]:
#@title Склеиваем вместе обученные адаптеры с базовой моделью, сохраняем результат в формат PyTorch
# Per the title: merge the trained adapters with the base model and save the
# result in PyTorch format.

# NOTE(review): `convert_to_native` must be importable from the kernel's
# sys.path; no visible cell adds its location — confirm where it comes from.
from convert_to_native import convert_to_native
# File name of the merged checkpoint, written inside output_dir.
merged_model_name = 'pytorch_model.bin'

convert_to_native(
    model_name=str(output_dir),  # training output directory (holds adapter_config.json)
    output_path=str(output_dir / merged_model_name),
    device='cpu',
    enable_offloading=True
)

# Sanity check: the merged checkpoint file must exist after conversion.
assert (output_dir / merged_model_name).exists()

In [None]:
#@title Конвертируем склеенную модель в 16-битный формат GGML (llama.cpp)
# Per the title: convert the merged model to the 16-bit GGML format.

# NOTE(review): no visible cell installs `llm_rs` explicitly — confirm it is
# covered by the installed requirements.
from llm_rs.convert import AutoConverter

# Expected name of the converted file; checked by the assert below.
ggml_f16_model_name = 'ruGPT-3.5-13B-lora-f16.bin'
# NOTE(review): no visible cell creates content_dir / 'ruGPT-3.5-13B-lora';
# the merged checkpoint was written into output_dir — confirm how this
# directory is populated before running this cell.
input_ggml = content_dir / 'ruGPT-3.5-13B-lora'
output_ggml = content_dir / 'output_ggml'

# The return value is reused by the quantization cell.
converted_model = AutoConverter.convert(input_ggml, output_ggml)

assert (output_ggml / ggml_f16_model_name).exists()

In [None]:
#@title Квантизуем результат в 4 бита
# Per the title: quantize the converted model to 4 bits.

from llm_rs import AutoQuantizer, QuantizationType, ContainerType

# Active choice: Q4_0 quantization in a GGML container.
AutoQuantizer.quantize(converted_model, quantization=QuantizationType.Q4_0, container=ContainerType.GGML)
# Alternative container for the same quantization (disabled):
# AutoQuantizer.quantize(converted_model, quantization=QuantizationType.Q4_0, container=ContainerType.GGJT)

# Alternative quantization types (disabled):
# AutoQuantizer.quantize(converted_model, quantization=QuantizationType.Q4_1, container=ContainerType.GGML)
# AutoQuantizer.quantize(converted_model, quantization=QuantizationType.Q5_0, container=ContainerType.GGML)
# AutoQuantizer.quantize(converted_model, quantization=QuantizationType.Q5_1, container=ContainerType.GGML)
# AutoQuantizer.quantize(converted_model, quantization=QuantizationType.Q8_0, container=ContainerType.GGML)

# Expected file name for the Q4_0 result; it must be updated by hand if a
# different quantization above is enabled.
ggml_quantized_model_name = 'ruGPT-3.5-13B-lora-q4_0.bin'
assert (output_ggml / ggml_quantized_model_name).exists()

In [None]:
#@title Прогоняем полученную модель на нескольких диалогах из валидационной выборки

from llm_rs import AutoModel

num_test_samples = 3 #@param {type:"integer"}
max_new_tokens = 20 #@param {type:"integer"}

!head -n {num_test_samples} {content_dir / 'val.jsonl'} > {content_dir / 'test.jsonl'}

model = AutoModel.from_pretrained(output_ggml, model_file=ggml_quantized_model_name)

with open(content_dir / 'test.jsonl', 'r') as f, open(content_dir / 'test_result.jsonl', 'w') as out_f:
    for line in f:
        data = json.loads(line)
        input_text = data['messages'][0]['content']
        output = model.generate(input_text).text
        print(output)

assert (content_dir / 'test_result.jsonl').exists()