# Pretrain

1) Препроцессинг данных

In [None]:
import re
import json
import hashlib
from pathlib import Path

In [None]:
# Input corpus directory and the JSONL file the preprocessing step writes.
DATA_PATH = Path("data") / "corpus"
OUT_PATH = Path("data") / "pretrain_corpus.jsonl"

# Sentence length bounds (in characters) used by the sentence filter.
MIN_SENT_CHARS, MAX_SENT_CHARS = 20, 5000

# Chunk budget; the two subtracted positions presumably leave room for the
# BOS/EOS markers wrapped around every chunk.
CONTEXT_LEN = 1024
MAX_TOKENS_PER_CHUNK = CONTEXT_LEN - 2

# Literal marker strings placed at the start / end of each chunk.
BOS, EOS = "<bos>", "<eos>"

def normalize_text(text):
    """Return ``text`` with unified line endings, quotes, dashes and spacing.

    * ``\\r\\n`` and ``\\r`` become ``\\n``;
    * typographic quotes become a straight double quote;
    * em/en dashes become an em dash surrounded by spaces;
    * runs of spaces/tabs collapse to one space, runs of 3+ newlines to two;
    * leading and trailing whitespace is stripped.
    """
    char_map = str.maketrans({
        "«": '"', "»": '"', "„": '"', "“": '"', "”": '"',
        "—": " — ", "–": " — ",
    })
    unified = text.replace("\r\n", "\n").replace("\r", "\n").translate(char_map)
    unified = re.sub(r"[ \t]+", " ", unified)
    unified = re.sub(r"\n{3,}", "\n\n", unified)
    return unified.strip()

# Matches a punctuation mark that needs a space inserted after it.
_SPACE_AFTER_PUNCT_RE = re.compile(
    r"""
    ([,.;:!?])                          # a punctuation mark ...
    (?!(?<=\d[.,:])\d)                  # ... not inside a number or time (3.14, 12:30)
    (?!["']+(?:[\s,.;:!?)\]}]|$))       # ... not followed by a closing quote
    (?=[^\s,.;:!?)\]}])                 # ... directly followed by ordinary text
    """,
    re.VERBOSE,
)


def normalize_punct(s):
    """Normalise punctuation and spacing inside a single sentence.

    Steps, in order: strip; shorten 4+ dots to an ellipsis; collapse repeated
    ``!``, ``?``, ``?!``, ``,``, ``:``, ``;``; remove whitespace before
    punctuation; insert a missing space after punctuation; collapse
    whitespace runs to one space; strip again.

    A space is inserted after a punctuation mark only when ordinary text
    follows it. Punctuation clusters (``...``, ``?!``), closing brackets,
    closing quotes and digit groups (``3.14``, ``12:30``) are left intact;
    splitting them would undo the normalisation done by the earlier steps.

    Args:
        s: The sentence to clean.

    Returns:
        The cleaned sentence.
    """
    s = s.strip()
    s = re.sub(r"\.{4,}", "...", s)
    s = re.sub(r"!{2,}", "!", s)
    s = re.sub(r"\?{2,}", "?", s)
    s = re.sub(r"(\?!){2,}", "?!", s)
    s = re.sub(r",{2,}", ",", s)
    s = re.sub(r":{2,}", ":", s)
    s = re.sub(r";{2,}", ";", s)
    s = re.sub(r"\s+([,.;:!?])", r"\1", s)
    s = _SPACE_AFTER_PUNCT_RE.sub(r"\1 ", s)
    s = re.sub(r"\s{2,}", " ", s)
    return s.strip()

def split_sentences(text):
    """Split ``text`` after ``.``, ``!`` or ``?`` followed by whitespace.

    The terminating punctuation stays with its sentence; the whitespace
    separator is dropped. Whitespace-only pieces are discarded.
    """
    boundary = re.compile(r"(?<=[.!?])\s+")
    return [piece for piece in boundary.split(text) if piece.strip()]

# Single-letter matchers for the two alphabets the filters care about.
LATIN_RE = re.compile(r"[A-Za-z]")
CYR_RE = re.compile(r"[А-Яа-яЁё]")


def cyrillic_ratio(s):
    """Return the share of Cyrillic letters among all Latin + Cyrillic letters.

    Digits, punctuation and other characters are ignored. Returns ``0.0``
    when ``s`` contains no Latin or Cyrillic letters at all.
    """
    cyr_count = len(CYR_RE.findall(s))
    letter_count = cyr_count + len(LATIN_RE.findall(s))
    return cyr_count / letter_count if letter_count else 0.0

def is_good_sentence(s):
    """Decide whether a cleaned sentence is kept for the corpus.

    A sentence is rejected when it is empty, shorter than ``MIN_SENT_CHARS``
    or longer than ``MAX_SENT_CHARS``, contains any Latin letter, has a
    Cyrillic ratio below 0.70, has fewer than 5 Cyrillic letters, or is made
    of 3 or fewer distinct characters.
    """
    if not s or not (MIN_SENT_CHARS <= len(s) <= MAX_SENT_CHARS):
        return False

    # Any Latin letter disqualifies the sentence outright.
    if LATIN_RE.search(s) is not None:
        return False

    if cyrillic_ratio(s) < 0.70:
        return False

    has_enough_letters = len(CYR_RE.findall(s)) >= 5
    is_varied = len(set(s)) > 3  # filters out strings like "аааааа..."
    return has_enough_letters and is_varied

def sha1(text):
    """Return the hexadecimal SHA-1 digest of ``text`` encoded as UTF-8."""
    digest = hashlib.sha1()
    digest.update(text.encode("utf-8"))
    return digest.hexdigest()

def normalize_for_dedup(s):
    """Build a deduplication key: lower-cased digits and Cyrillic letters only.

    Everything else (whitespace, punctuation, Latin letters, ...) is removed,
    so texts differing only in those characters map to the same key.
    """
    # Whitespace is outside the kept alphabet, so one substitution removes it
    # together with every other unwanted character.
    return re.sub(r"[^0-9а-яё]+", "", s.lower())

def count_tokens(text, tokenizer=None):
    """Return the token count of ``text``.

    Args:
        text: The string to measure.
        tokenizer: Optional object with an ``encode(text)`` method; when
            omitted, whitespace-separated words are counted instead.
    """
    if tokenizer is None:
        return len(text.split())
    return len(tokenizer.encode(text))


def _split_long_sentence(sentence, max_tokens, tokenizer=None):
    """Greedily pack the words of an oversized sentence into pieces.

    Each piece holds at most ``max_tokens`` tokens, except that a single word
    which alone exceeds the budget is kept whole as its own piece.
    """
    pieces = []
    buf = []
    buf_tokens = 0
    for word in sentence.split():
        w_tokens = count_tokens(word, tokenizer)
        if buf and buf_tokens + w_tokens > max_tokens:
            pieces.append(" ".join(buf))
            buf, buf_tokens = [], 0
        buf.append(word)
        buf_tokens += w_tokens
    if buf:
        pieces.append(" ".join(buf))
    return pieces


def chunk_sentences(sentences, max_tokens, tokenizer=None):
    """Pack sentences, in order, into chunks of at most ``max_tokens`` tokens.

    Consecutive sentences are joined with a single space while they fit into
    the budget. A sentence that alone exceeds the budget is split on
    whitespace into several chunks of its own.

    Args:
        sentences: Iterable of sentence strings.
        max_tokens: Token budget per chunk (as measured by ``count_tokens``).
        tokenizer: Optional tokenizer forwarded to ``count_tokens``.

    Returns:
        List of chunk strings in the same order as the input text.
    """
    chunks = []
    current = []
    current_tokens = 0

    for s in sentences:
        s_tokens = count_tokens(s, tokenizer)

        if s_tokens > max_tokens:
            # Flush what is pending first; otherwise the pieces of the long
            # sentence would be emitted before the sentences that precede it
            # and the text order of the chunks would be broken.
            if current:
                chunks.append(" ".join(current))
                current, current_tokens = [], 0
            chunks.extend(_split_long_sentence(s, max_tokens, tokenizer))
            continue

        if current and current_tokens + s_tokens > max_tokens:
            chunks.append(" ".join(current))
            current, current_tokens = [], 0
        current.append(s)
        current_tokens += s_tokens

    if current:
        chunks.append(" ".join(current))

    return chunks

In [None]:
txt_files = sorted(DATA_PATH.glob("*.txt"))
print("Количество файлов:", len(txt_files))

# Hashes of the documents / sentences already accepted, for deduplication.
seen_docs, seen_sents = set(), set()
all_chunks = []

stats = dict.fromkeys(
    (
        "docs_total",
        "docs_unique",
        "sents_total",
        "sents_good",
        "sents_unique",
        "chunks_total",
    ),
    0,
)

for fp in txt_files:
    stats["docs_total"] += 1

    # Undecodable bytes are dropped silently instead of aborting the run.
    raw = normalize_text(fp.read_text(encoding="utf-8", errors="ignore"))

    # Document-level dedup on the normalised text.
    doc_key = sha1(normalize_for_dedup(raw))
    if doc_key in seen_docs:
        continue
    seen_docs.add(doc_key)
    stats["docs_unique"] += 1

    sents = split_sentences(raw)
    stats["sents_total"] += len(sents)

    cleaned = []
    for s in map(normalize_punct, sents):
        if not is_good_sentence(s):
            continue
        stats["sents_good"] += 1

        # Sentence-level dedup is global, across all documents.
        sent_key = sha1(normalize_for_dedup(s))
        if sent_key not in seen_sents:
            seen_sents.add(sent_key)
            stats["sents_unique"] += 1
            cleaned.append(s)

    # No tokenizer exists yet at this stage, so the budget is measured in
    # whitespace-separated words.
    chunks = chunk_sentences(cleaned, max_tokens=MAX_TOKENS_PER_CHUNK, tokenizer=None)
    all_chunks.extend(f"{BOS} {ch.strip()} {EOS}" for ch in chunks)

stats["chunks_total"] = len(all_chunks)

print("=== STATS ===")
for name, value in stats.items():
    print(f"{name}: {value}")

# One JSON object per line: {"text": "<bos> ... <eos>"}.
OUT_PATH.parent.mkdir(parents=True, exist_ok=True)
with OUT_PATH.open("w", encoding="utf-8") as f:
    f.writelines(json.dumps({"text": t}, ensure_ascii=False) + "\n" for t in all_chunks)

print("Saved:", OUT_PATH, "chunks:", len(all_chunks))

3) Токенизатор

In [None]:
import json
from pathlib import Path

JSONL_PATH = Path("./data/pretrain_corpus.jsonl")
TXT_TRAIN_PATH = Path("./data/tokenizer_train.txt")

TXT_TRAIN_PATH.parent.mkdir(parents=True, exist_ok=True)

# Flatten the JSONL corpus into a plain-text file with one chunk per line;
# newlines inside a chunk are replaced by spaces and empty chunks are skipped.
count = 0
with JSONL_PATH.open("r", encoding="utf-8") as src, TXT_TRAIN_PATH.open("w", encoding="utf-8") as dst:
    for record in map(json.loads, src):
        text = record["text"].strip()
        if text:
            dst.write(text.replace("\n", " ") + "\n")
            count += 1

print("Готово. Строк для обучения токенизатора:", count)
print("Файл:", TXT_TRAIN_PATH)

In [None]:
from tokenizers import ByteLevelBPETokenizer
from pathlib import Path

VOCAB_SIZE = 3000
MIN_FREQUENCY = 2

special_tokens = ["<pad>", "<unk>", "<bos>", "<eos>"]

# Train a byte-level BPE tokenizer on the flattened corpus file.
tokenizer = ByteLevelBPETokenizer()
tokenizer.train(
    files=[str(TXT_TRAIN_PATH)],
    vocab_size=VOCAB_SIZE,
    min_frequency=MIN_FREQUENCY,
    special_tokens=special_tokens,
)

OUT_DIR = Path("./tokenizer_bpe_3k")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Persist both the raw model files and the single-file tokenizer.json that
# the later cells load.
tokenizer.save_model(str(OUT_DIR))
tokenizer_json_path = OUT_DIR / "tokenizer.json"
tokenizer.save(str(tokenizer_json_path))
print("Saved tokenizer.json:", tokenizer_json_path)

In [None]:
from transformers import PreTrainedTokenizerFast
from datasets import load_dataset

# Wrap the trained tokenizer file and declare which tokens play the special roles.
_special_token_roles = dict(
    unk_token="<unk>",
    pad_token="<pad>",
    bos_token="<bos>",
    eos_token="<eos>",
)
tokenizer = PreTrainedTokenizerFast(
    tokenizer_file="./tokenizer_bpe_3k/tokenizer.json",
    **_special_token_roles,
)

ds = load_dataset("json", data_files={"train": "./data/pretrain_corpus.jsonl"})


def tokenize_function(examples):
    """Tokenize a batch of corpus records (the ``"text"`` column).

    No special tokens are added here: the corpus text already carries the
    literal ``<bos>`` / ``<eos>`` markers.
    """
    texts = examples["text"]
    return tokenizer(texts, add_special_tokens=False, return_token_type_ids=False)


tokenized = ds["train"].map(tokenize_function, batched=True, remove_columns=["text"])

BLOCK_SIZE = 512


def group_texts(examples):
    """Concatenate a batch of tokenized rows and cut it into fixed-size blocks.

    All ``input_ids`` (and, in parallel, all ``attention_mask`` values) of the
    batch are joined into one long sequence, which is then sliced into
    consecutive blocks of ``BLOCK_SIZE`` tokens. The tail that does not fill a
    whole block is dropped. ``labels`` are independent copies of the
    ``input_ids`` blocks.

    Args:
        examples: Mapping with ``"input_ids"`` and ``"attention_mask"``, each
            a list of per-row lists.

    Returns:
        Dict with ``"input_ids"``, ``"attention_mask"`` and ``"labels"``.
    """
    flat_ids = [tok for row in examples["input_ids"] for tok in row]
    flat_mask = [bit for row in examples["attention_mask"] for bit in row]

    # Largest multiple of BLOCK_SIZE that fits into the concatenated batch.
    usable = len(flat_ids) - len(flat_ids) % BLOCK_SIZE
    starts = range(0, usable, BLOCK_SIZE)

    input_ids = [flat_ids[i : i + BLOCK_SIZE] for i in starts]
    attention_mask = [flat_mask[i : i + BLOCK_SIZE] for i in starts]
    labels = [list(block) for block in input_ids]

    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels,
    }

# Re-block the tokenized rows into fixed-length LM examples.
lm_dataset = tokenized.map(group_texts, batched=True, remove_columns=tokenized.column_names)

print(lm_dataset)
print("Примеров:", len(lm_dataset))
first_block = lm_dataset[0]["input_ids"]
print("Длина блока:", len(first_block))

4) Инициализация модели

In [None]:
from transformers import PreTrainedTokenizerFast
from transformers import LlamaConfig, LlamaForCausalLM
import torch

tokenizer = PreTrainedTokenizerFast(
    tokenizer_file="./tokenizer_bpe_3k/tokenizer.json",
    unk_token="<unk>",
    pad_token="<pad>",
    bos_token="<bos>",
    eos_token="<eos>",
)

print("Vocab size:", tokenizer.vocab_size)
print("Special tokens:", tokenizer.special_tokens_map)

# Llama-style decoder built from scratch, sized to the custom tokenizer.
# NOTE(review): `tokenizer.vocab_size` presumably excludes tokens added after
# training; confirm it equals `len(tokenizer)` for this tokenizer.
config = LlamaConfig(
    vocab_size=tokenizer.vocab_size,
    hidden_size=1024,
    intermediate_size=1536,
    num_hidden_layers=16,
    num_attention_heads=16,
    num_key_value_heads=8,
    max_position_embeddings=512,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
    rms_norm_eps=1e-6,
    rope_theta=10000.0,
    attention_bias=False,
)

model = LlamaForCausalLM(config)

# Parameter census.
_param_info = [(p.numel(), p.requires_grad) for p in model.parameters()]
total_params = sum(n for n, _ in _param_info)
trainable_params = sum(n for n, is_trainable in _param_info if is_trainable)

print("Total params:", total_params)
print("Trainable params:", trainable_params)
print("~M params:", round(total_params / 1e6, 2), "M")

# Smoke test: one forward pass on random token ids (batch of 2, 32 tokens each).
model.eval()
x = torch.randint(0, tokenizer.vocab_size, (2, 32))
with torch.no_grad():
    out = model(input_ids=x)
print("Logits shape:", out.logits.shape)

In [None]:
# Prefer the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

In [None]:
from datasets import Dataset
from transformers import default_data_collator

# Fixed prompts used to eyeball generations at every evaluation.
test_prompts = [
    "Все мысли, которые имеют огромные последствия",
    "Сила войска зависит от его духа",
    "Мысль о том, что он принес страдания",
    "Человек сознает себя свободным",
    "Что бы ни случилось, я всегда буду",
    "Любовь мешает смерти",
    "Нет, жизнь не кончена",
    "Всякая мысль, даже самая простая",
    "Война не любезность, а самое гадкое дело",
    "Чтобы жить честно",
]

# Hold out 2% of the blocks for validation; the seed keeps the split reproducible.
splits = lm_dataset.train_test_split(test_size=0.02, seed=42)
train_ds, val_ds = splits["train"], splits["test"]

print("Train size:", len(train_ds))
print("Val size:", len(val_ds))

# Blocks already have equal length and carry labels, so no padding collator is needed.
data_collator = default_data_collator

In [None]:
import torch
from transformers import TrainerCallback

class PromptGenerationCallback(TrainerCallback):
    """Print sampled continuations of fixed prompts after every evaluation.

    Args:
        prompts: Iterable of prompt strings.
        tokenizer: Tokenizer used to encode prompts and decode generations.
        max_new_tokens: Maximum number of tokens generated per prompt.
    """

    def __init__(self, prompts, tokenizer, max_new_tokens=80):
        self.prompts = prompts
        self.tokenizer = tokenizer
        self.max_new_tokens = max_new_tokens

    def on_evaluate(self, args, state, control, model=None, **kwargs):
        """Generate and print one sampled continuation per prompt.

        Does nothing when no model is passed in. The model is switched to
        eval mode, and ``model.config.use_cache`` is enabled only for the
        duration of the generation.
        """
        if model is None:
            return

        model.eval()

        device = model.device
        print("\n" + "=" * 80)
        print(f"[Eval @ step {state.global_step}] Prompt generations")
        print("=" * 80)

        old_cache = getattr(model.config, "use_cache", False)
        model.config.use_cache = True

        # try/finally: a failing generation must not leave the KV cache
        # switched on for the rest of training.
        try:
            with torch.no_grad():
                for i, prompt in enumerate(self.prompts):
                    inputs = self.tokenizer(prompt, return_tensors="pt")
                    input_ids = inputs["input_ids"].to(device)
                    attention_mask = inputs["attention_mask"].to(device)

                    out_ids = model.generate(
                        input_ids=input_ids,
                        attention_mask=attention_mask,
                        max_new_tokens=self.max_new_tokens,
                        do_sample=True,
                        temperature=0.9,
                        top_p=0.95,
                        top_k=50,
                        repetition_penalty=1.1,
                        eos_token_id=self.tokenizer.eos_token_id
                    )

                    text = self.tokenizer.decode(out_ids[0], skip_special_tokens=True)

                    print(f"\n--- Prompt #{i+1} ---")
                    print("PROMPT:", prompt)
                    print("GEN:\n", text)
        finally:
            model.config.use_cache = old_cache

        print("\n" + "=" * 80 + "\n")

In [None]:
from transformers import TrainingArguments, Trainer

# The generation cache is not needed while training; PromptGenerationCallback
# switches it on temporarily for its sample generations.
model.config.use_cache = False

# Single source of truth for the batch geometry (8 * 8 = 64 sequences per
# optimizer step per device). The arguments below read these variables
# instead of repeating the literals, so the two cannot drift apart.
per_device_train_batch_size = 8
gradient_accumulation_steps = 8

args = TrainingArguments(
    output_dir="./checkpoints_pretrain",
    overwrite_output_dir=True,

    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=gradient_accumulation_steps,

    num_train_epochs=1,
    learning_rate=3e-4,
    warmup_ratio=0.03,
    lr_scheduler_type="cosine",

    weight_decay=0.1,

    logging_steps=50,

    eval_strategy="steps",
    eval_steps=500,

    save_steps=500,
    save_total_limit=2,

    # Mixed precision only when a GPU is available.
    fp16=torch.cuda.is_available(),
    report_to="none",

    remove_unused_columns=False
)

callbacks = [PromptGenerationCallback(test_prompts, tokenizer, max_new_tokens=80)]

# NOTE(review): recent transformers releases prefer `processing_class=` over
# `tokenizer=`; confirm against the installed version before switching.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tokenizer,
    data_collator=data_collator,
    callbacks=callbacks,
)

In [None]:
import math
trainer.train()

# Final evaluation on the held-out split.
metrics = trainer.evaluate()
print(metrics)

# Perplexity is reported as exp(eval_loss) when the loss is available.
if "eval_loss" in metrics:
    perplexity = math.exp(metrics["eval_loss"])
    print("Perplexity:", perplexity)

# Post-train SFT

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

questions_rus = [
    "сколько планет в нашей солнечной системе?",
    "расскажи стих",
    "когда собирать крыжовник?",
    "Как быстро выучить новый язык?"
]

model_name = "Qwen/Qwen2.5-0.5B"

tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model_qwen = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.float16 if torch.cuda.is_available() else None,
)

# The baseline must run on the Qwen model loaded above. The bare name `model`
# still refers to the small Llama from the pretraining section, whose
# vocabulary does not match the Qwen tokenizer.
model_qwen.eval()

def generate_answer(question, max_new_tokens=120):
    """Sample an answer from the base Qwen model for one question.

    The question is wrapped into a plain "Вопрос: ... Ответ:" prompt.

    Args:
        question: The question text.
        max_new_tokens: Maximum number of tokens to generate.

    Returns:
        The decoded text after the first "Ответ:" marker, stripped; the whole
        decoded text if the marker is missing.
    """
    prompt = f"Вопрос: {question}\nОтвет:"
    inputs = tokenizer(prompt, return_tensors="pt")

    device = next(model_qwen.parameters()).device
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        out = model_qwen.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.8,
            top_p=0.95,
            top_k=50,
            repetition_penalty=1.1,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.pad_token_id
        )

    text = tokenizer.decode(out[0], skip_special_tokens=True)

    if "Ответ:" in text:
        return text.split("Ответ:", 1)[-1].strip()
    return text.strip()

print("=== Base Model (Before SFT) Output ===\n")

# One question/answer pair per block, each followed by an empty line.
for i, q in enumerate(questions_rus, 1):
    ans = generate_answer(q)
    print(f"Model Input {i}:", q, f"Model Output {i}:", ans, "", sep="\n")


## Подготовка данных

In [26]:
import inspect
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainerCallback
from trl import SFTTrainer, SFTConfig

# Fixed probe questions used to compare the model before and after SFT.
questions_rus = [
    "сколько планет в нашей солнечной системе?",
    "расскажи стих",
    "когда собирать крыжовник?",
    "Как быстро выучить новый язык?"
]

# System message placed at the start of every training and evaluation conversation.
SYSTEM_PROMPT = "Ты полезный ассистент. Отвечай по-русски, кратко и по делу."

# Instruction dataset; the captured output below shows columns
# 'input', 'instruction', 'output' in a single 'train' split.
ds = load_dataset("d0rj/alpaca-cleaned-ru")
print(ds)

def build_user_text(instruction, inp):
    """Compose the user message from an instruction and an optional context.

    Both parts are stripped; ``None`` is treated as an empty string. When the
    context is non-empty it is appended under a "Контекст:" header.
    """
    task = (instruction or "").strip()
    context = (inp or "").strip()
    if not context:
        return task
    return f"{task}\n\nКонтекст:\n{context}"

def to_messages(example):
    """Convert one dataset row into a system/user/assistant conversation.

    Reads the ``instruction``, ``input`` and ``output`` fields (missing or
    ``None`` values count as empty) and returns ``{"messages": [...]}`` with
    three ``{"role", "content"}`` entries.
    """
    user_text = build_user_text(example.get("instruction", ""), example.get("input", ""))
    assistant_text = (example.get("output", "") or "").strip()

    turns = (
        ("system", SYSTEM_PROMPT),
        ("user", user_text),
        ("assistant", assistant_text),
    )
    return {"messages": [{"role": role, "content": content} for role, content in turns]}

# Replace the raw columns with a single "messages" column.
train_chat = ds["train"].map(to_messages, remove_columns=ds["train"].column_names)

# Hold out 2% for validation; the seed keeps the split reproducible.
split = train_chat.train_test_split(test_size=0.02, seed=42)
train_ds, val_ds = split["train"], split["test"]

print("Train size:", len(train_ds))
print("Val size:", len(val_ds))
print("Example messages:\n", train_ds[0]["messages"])

DatasetDict({
    train: Dataset({
        features: ['input', 'instruction', 'output'],
        num_rows: 51760
    })
})
Train size: 50724
Val size: 1036
Example messages:
 [{'content': 'Ты полезный ассистент. Отвечай по-русски, кратко и по делу.', 'role': 'system'}, {'content': 'Какую последнюю операцию вы ожидаете выполнить при обучении модели машинного обучения?', 'role': 'user'}, {'content': 'Последняя операция, которую можно ожидать при обучении модели машинного обучения, — это оценка производительности модели на проверочном или тестовом наборе данных. Этот шаг включает в себя использование обученной модели для прогнозирования набора данных, которых она раньше не видела, а затем сравнение этих прогнозов с фактическими результатами для оценки точности и способности модели к обобщению. Это позволяет точно настроить гиперпараметры модели перед ее развертыванием на практике или использованием для прогнозирования новых данных.', 'role': 'assistant'}]


# Дообучение

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

model_name = "Qwen/Qwen2.5-0.5B"

tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# IMPORTANT: the weights are loaded in FP32 to avoid the
# "Attempting to unscale FP16 gradients" error.
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="cuda", torch_dtype=torch.float32)

# Required for training.
model.config.use_cache = False
model.gradient_checkpointing_enable()

print("Tokenizer:", tokenizer.name_or_path)
print("Model dtype:", next(model.parameters()).dtype)
print("CUDA:", torch.cuda.is_available())


# =========================
# 5) Baseline check BEFORE SFT (optional, but useful)
# =========================
def generate_answers(title):
    """Print a sampled answer of the global ``model`` for every probe question.

    Each question from ``questions_rus`` is wrapped, together with
    ``SYSTEM_PROMPT``, into the tokenizer's chat template. Only the newly
    generated tokens are decoded and printed, so the output does not repeat
    the system and user prompt.

    Args:
        title: Header line printed before the answers.
    """
    model.eval()
    device = next(model.parameters()).device

    print(f"=== {title} ===\n")

    for i, q in enumerate(questions_rus, 1):
        messages = [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": q},
        ]

        # Qwen expects its chat template.
        prompt = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )

        inputs = tokenizer(prompt, return_tensors="pt")
        inputs = {k: v.to(device) for k, v in inputs.items()}
        prompt_len = inputs["input_ids"].shape[1]

        # NOTE(review): generation stops on tokenizer.eos_token_id only; if the
        # chat template closes turns with a different token, consider adding
        # it as a stop id. Confirm against the tokenizer config.
        with torch.no_grad():
            out = model.generate(
                **inputs,
                max_new_tokens=120,
                do_sample=True,
                temperature=0.8,
                top_p=0.95,
                top_k=50,
                repetition_penalty=1.1,
                eos_token_id=tokenizer.eos_token_id,
                pad_token_id=tokenizer.pad_token_id
            )

        # generate() returns prompt + continuation; drop the prompt part so
        # only the model's answer is shown.
        text = tokenizer.decode(out[0][prompt_len:], skip_special_tokens=True).strip()

        print(f"Model Input {i}:")
        print(q)
        print(f"Model Output {i}:")
        print(text)
        print()


class QuestionsCallback(TrainerCallback):
    """Print sampled answers to fixed questions after every evaluation.

    Args:
        questions: Iterable of question strings.
        tokenizer: Tokenizer providing the chat template, encoding and decoding.
        title: Header line printed before the answers.
        max_new_tokens: Maximum number of tokens generated per question.
    """

    def __init__(self, questions, tokenizer, title, max_new_tokens=120):
        self.questions = questions
        self.tokenizer = tokenizer
        self.title = title
        self.max_new_tokens = max_new_tokens

    def on_evaluate(self, args, state, control, model=None, **kwargs):
        """Generate and print one answer per question; no-op without a model.

        Each question is combined with the global ``SYSTEM_PROMPT`` through the
        chat template. Only the newly generated tokens are decoded, so the
        printed output does not repeat the prompt.
        """
        if model is None:
            return

        model.eval()
        device = next(model.parameters()).device

        print(f"\n=== {self.title} ===\n")

        with torch.no_grad():
            for i, q in enumerate(self.questions, 1):
                messages = [
                    {"role": "system", "content": SYSTEM_PROMPT},
                    {"role": "user", "content": q},
                ]

                prompt = self.tokenizer.apply_chat_template(
                    messages,
                    tokenize=False,
                    add_generation_prompt=True
                )

                inputs = self.tokenizer(prompt, return_tensors="pt")
                inputs = {k: v.to(device) for k, v in inputs.items()}
                prompt_len = inputs["input_ids"].shape[1]

                out = model.generate(
                    **inputs,
                    max_new_tokens=self.max_new_tokens,
                    do_sample=True,
                    temperature=0.8,
                    top_p=0.95,
                    top_k=50,
                    repetition_penalty=1.1,
                    eos_token_id=self.tokenizer.eos_token_id,
                    pad_token_id=self.tokenizer.pad_token_id
                )

                # generate() returns prompt + continuation; keep only the
                # continuation so the answer is not prefixed by the prompt.
                text = self.tokenizer.decode(out[0][prompt_len:], skip_special_tokens=True).strip()

                print(f"Model Input {i}:")
                print(q)
                print(f"Model Output {i}:")
                print(text)
                print()


Tokenizer: Qwen/Qwen2.5-0.5B
Model dtype: torch.float32
CUDA: True


In [28]:
from trl import SFTTrainer, SFTConfig
import inspect
import torch

per_device_train_batch_size = 2
gradient_accumulation_steps = 32  # 2*32 = 64 effective

# The evaluation-strategy argument is named differently across library
# versions; inspect the signature to pick the name this install accepts.
sig = inspect.signature(SFTConfig)
use_eval_strategy = "eval_strategy" in sig.parameters

cfg_kwargs = {
    "output_dir": "./qwen_sft_checkpoints",
    "overwrite_output_dir": True,

    "per_device_train_batch_size": per_device_train_batch_size,
    "per_device_eval_batch_size": 2,
    "gradient_accumulation_steps": gradient_accumulation_steps,

    "num_train_epochs": 1,
    "learning_rate": 2e-4,
    "warmup_ratio": 0.03,
    "lr_scheduler_type": "cosine",

    "weight_decay": 0.1,

    "logging_steps": 50,
    "eval_steps": 500,
    "save_steps": 500,
    "save_total_limit": 2,

    "report_to": "none",

    "bf16": False,  # for the 2080 Ti
    "fp16": True,   # use fp16 AMP through the Trainer
}
cfg_kwargs["eval_strategy" if use_eval_strategy else "evaluation_strategy"] = "steps"

sft_config = SFTConfig(**cfg_kwargs)

print("SFT fp16:", sft_config.fp16)
print("SFT bf16:", sft_config.bf16)
print("Effective batch:", per_device_train_batch_size * gradient_accumulation_steps)


# =========================
# 8) Trainer
# =========================
# The callback prints answers to the probe questions at every evaluation.
questions_callback = QuestionsCallback(
    questions_rus,
    tokenizer,
    title="Base Model (After SFT) Output",
    max_new_tokens=120,
)

trainer = SFTTrainer(
    model=model,
    args=sft_config,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    processing_class=tokenizer,
    callbacks=[questions_callback],
)

# =========================
# 9) Train
# =========================
trainer.train()


The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.


SFT fp16: True
SFT bf16: False
Effective batch: 64


Step,Training Loss,Validation Loss


KeyboardInterrupt: 