Pretrain

In [None]:
import os, glob, json, re, hashlib, itertools, textwrap, random, unicodedata, shutil, sys, math
from tqdm import tqdm


if not os.path.exists("RussianNovels"):
    !git clone -q https://github.com/JoannaBy/RussianNovels

CORPUS_DIR = "RussianNovels/corpus"
txt_files = sorted(glob.glob(os.path.join(CORPUS_DIR, "**", "*.txt"), recursive=True))
len(txt_files), txt_files[:3]

In [None]:
# Matches a single Russian Cyrillic letter (А-Я, а-я, Ё, ё).
CYRILLIC_RE = re.compile(r"[А-Яа-яЁё]")

# Ordered (pattern, replacement) pairs applied by clean_line.
# Zero-width spaces are removed first so that the whitespace collapse that
# follows also merges the spaces they used to separate.
PUNCT_CLEAN = [
    (re.compile(r"\u200b"), ""),
    (re.compile(r"[ \t]+"), " "),
    (re.compile(r"[“”«»]"), '"'),
    (re.compile(r"[‘’]"), "'"),
    (re.compile(r"\.{3,}"), "…"),
    # Cap runs of '!'/'?' at two characters while keeping the original marks
    # (the previous rule rewrote "???" as "!!").
    (re.compile(r"([!?]{2})[!?]+"), r"\1"),
    (re.compile(r",,+"), ","),
]

def clean_line(s: str) -> str:
    """Normalize one piece of text.

    Applies NFKC normalization, then every substitution in PUNCT_CLEAN in
    order, and finally strips surrounding whitespace.

    Args:
        s: Raw text (a line or a paragraph).

    Returns:
        The cleaned text.
    """
    s = unicodedata.normalize("NFKC", s)
    for rx, rep in PUNCT_CLEAN:
        s = rx.sub(rep, s)
    # Strip last: removing zero-width characters can expose whitespace at the edges.
    return s.strip()

def is_keep_line(s: str) -> bool:
    """Return True when the stripped text has at least two characters and
    contains at least one Cyrillic letter (per CYRILLIC_RE)."""
    stripped = s.strip()
    return len(stripped) >= 2 and CYRILLIC_RE.search(stripped) is not None

docs = []
# MD5 digests of every paragraph kept so far; shared across all files, so a
# paragraph is kept only the first time it appears anywhere in the corpus.
seen_par_hashes = set()

for path in tqdm(txt_files, desc="reading"):
    with open(path, "r", encoding="utf-8", errors="ignore") as fh:
        text = fh.read().replace("\r\n", "\n")

    unique_paras = []
    # Paragraphs are separated by blank (whitespace-only) lines.
    for chunk in re.split(r"\n\s*\n", text):
        para = clean_line(chunk)
        if not is_keep_line(para):
            continue
        digest = hashlib.md5(para.encode("utf-8")).hexdigest()
        if digest not in seen_par_hashes:
            seen_par_hashes.add(digest)
            unique_paras.append(para)

    # One document per source file, paragraphs joined by single newlines.
    if unique_paras:
        docs.append("\n".join(unique_paras))

len(docs)


In [None]:
from tokenizers import Tokenizer, models, trainers, pre_tokenizers, normalizers, decoders, processors
from tokenizers.normalizers import NFKC
import os


assert isinstance(docs, list) and len(docs) > 0, "Список docs пуст — сначала соберите тексты."

# Dump the cleaned documents into a single text file for tokenizer training,
# one document after another, separated by a blank line.
os.makedirs("tok_data", exist_ok=True)
with open("tok_data/corpus.txt", "w", encoding="utf-8") as f:
    for d in docs:
        d = d.replace("\r\n", "\n").strip()
        if d:
            f.write(d + "\n\n")


# BPE model with NFKC normalization and byte-level pre-tokenization/decoding.
tok = Tokenizer(models.BPE(unk_token="<unk>"))
tok.normalizer = normalizers.Sequence([NFKC()])
tok.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True)
tok.decoder = decoders.ByteLevel()
trainer = trainers.BpeTrainer(
    vocab_size=3000,
    special_tokens=["<unk>", "<bos>", "<eos>", "<pad>"],
    # Seed the vocabulary with the complete byte-level alphabet so that bytes
    # that never occur in the training corpus still get a token instead of
    # falling back to <unk>.
    initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
)


tok.train(["tok_data/corpus.txt"], trainer=trainer)


# Wrap every encoded sequence as "<bos> ... <eos>" when special tokens are requested.
bos_id = tok.token_to_id("<bos>")
eos_id = tok.token_to_id("<eos>")
tok.post_processor = processors.TemplateProcessing(
    single = "<bos> $A <eos>",
    special_tokens=[("<bos>", bos_id), ("<eos>", eos_id)],
)


tok.save("bpe_tokenizer.json")


# Re-load the trained tokenizer through the transformers fast-tokenizer wrapper.
from transformers import PreTrainedTokenizerFast
hf_tok = PreTrainedTokenizerFast(
    tokenizer_file="bpe_tokenizer.json",
    bos_token="<bos>", eos_token="<eos>",
    unk_token="<unk>", pad_token="<pad>",
)
hf_tok.clean_up_tokenization_spaces = False


hf_tok.save_pretrained("bpe_tokenizer_hf")
print("vocab_size:", hf_tok.vocab_size)
print("specials:", hf_tok.special_tokens_map)


In [None]:
from datasets import Dataset, DatasetDict


# One row per document: shuffle, then hold out 2% of the rows for validation.
dataset = Dataset.from_dict({"text": docs}).shuffle(seed=42)
split = dataset.train_test_split(test_size=0.02, seed=42)
ds = DatasetDict(
    {
        "train": split["train"],
        "valid": split["test"],
    }
)
ds

# Number of tokens in each packed training sequence.
CONTEXT = 512

def tokenize_with_bos_eos(batch):
    """Tokenize a batch of documents, framing each one with <bos> and <eos>.

    The markers are added by the tokenizer's own special-token template
    (add_special_tokens=True) rather than by pasting "<bos> " / " <eos>" into
    the text. Pasting them relies on the marker strings being re-parsed from
    raw text and leaves the padding spaces in the text itself, which a
    byte-level pre-tokenizer may turn into an extra whitespace token right
    before <eos>.
    NOTE(review): assumes hf_tok still carries the "<bos> $A <eos>"
    post-processing template configured when the tokenizer was trained.

    Args:
        batch: Mapping with a "text" list of strings.

    Returns:
        Dict with "input_ids": one list of token ids per input text.
    """
    texts = [t.strip() for t in batch["text"]]
    enc = hf_tok(texts, add_special_tokens=True)
    return {"input_ids": enc["input_ids"]}

# Tokenize both splits in batches; the raw "text" column is dropped afterwards.
tokenized = ds.map(
    tokenize_with_bos_eos,
    batched=True,
    remove_columns=["text"],
)


def group_texts(examples, block_size=None):
    """Concatenate tokenized examples and cut them into fixed-length blocks.

    All "input_ids" lists of the batch are joined into one token stream, which
    is split into consecutive blocks of `block_size` tokens. Tokens left over
    at the end of the stream (fewer than `block_size`) are dropped.

    Args:
        examples: Mapping with an "input_ids" list of token-id lists.
        block_size: Block length in tokens; defaults to the module-level
            CONTEXT when omitted.

    Returns:
        Dict with "input_ids" and "labels", both lists of blocks; labels are
        independent copies of the inputs.

    Raises:
        ValueError: If `block_size` is not positive.
    """
    if block_size is None:
        block_size = CONTEXT
    if block_size <= 0:
        raise ValueError(f"block_size must be positive, got {block_size}")

    stream = list(itertools.chain.from_iterable(examples["input_ids"]))
    usable = (len(stream) // block_size) * block_size
    blocks = [stream[i:i + block_size] for i in range(0, usable, block_size)]
    # Copy each block so that labels never alias the input lists.
    return {"input_ids": blocks, "labels": [list(b) for b in blocks]}

# Pack the token streams of both splits into fixed-length blocks.
lm_ds = tokenized.map(
    group_texts,
    batched=True,
    remove_columns=["input_ids"],
)
lm_ds

In [None]:
from transformers import AutoModelForCausalLM, LlamaConfig


# Architecture of the model trained from scratch. Vocabulary size, context
# length and special-token ids are taken from the tokenizer and CONTEXT so the
# model cannot silently drift out of sync with the data pipeline.
config = LlamaConfig(
    vocab_size=len(hf_tok),
    hidden_size=1024,
    intermediate_size=1536,
    num_hidden_layers=16,
    num_attention_heads=16,
    num_key_value_heads=8,
    max_position_embeddings=CONTEXT,
    bos_token_id=hf_tok.bos_token_id,
    eos_token_id=hf_tok.eos_token_id,
    pad_token_id=hf_tok.pad_token_id,
)


# Randomly initialized model built from the config (no pretrained weights).
model = AutoModelForCausalLM.from_config(config)

print(type(model))
print(f"Параметров в модели: {sum(p.numel() for p in model.parameters())/1e6:.1f}M")

In [None]:
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling, TrainerCallback
import torch

# Batch collator built on the notebook's tokenizer with the masked-LM objective disabled (mlm=False).
data_collator = DataCollatorForLanguageModeling(tokenizer=hf_tok, mlm=False)

# Russian sentence openings passed to generate_samples to inspect generated text.
test_prompts = [
    "Все мысли, которые имеют огромное последствие",
    "Сила войска зависит от его духа",
    "Мысль о том, что он принес страдания",
    "Человек сознает себя свободным",
    "Что бы ни случилось, я всегда буду",
    "Любовь мешает смерти",
    "Нет, жизнь не кончена",
    "Всякая мысль, даже самая простая",
    "Война не любезность, а самое гадкое дело",
    "Чтобы жить честно"
]

def generate_samples(model, tokenizer, prompts, max_new_tokens=50, top_p=0.9, temperature=0.9):
    """Sample one continuation per prompt with nucleus sampling.

    Args:
        model: Causal LM exposing eval(), .device and generate().
        tokenizer: Tokenizer used to encode the prompts and decode the outputs.
        prompts: Iterable of prompt strings.
        max_new_tokens: Maximum number of tokens generated per prompt.
        top_p: Nucleus-sampling probability mass.
        temperature: Sampling temperature.

    Returns:
        List of (prompt, decoded_text) tuples; decoded_text is the full decoded
        sequence (prompt included) with special tokens skipped.
    """
    model.eval()
    outs = []
    for p in prompts:
        # The prompt already starts with an explicit <bos>. Special-token
        # insertion is switched off so a tokenizer that wraps inputs as
        # "<bos> ... <eos>" does not add a second <bos> and, worse, a trailing
        # <eos> that would make the model continue after end-of-sequence.
        inp = tokenizer(f"<bos> {p}", return_tensors="pt", add_special_tokens=False)
        # generate() is not given token_type_ids; drop them if the tokenizer returns any.
        inp.pop("token_type_ids", None)
        inp = {k: v.to(model.device) for k, v in inp.items()}

        with torch.no_grad():
            gen = model.generate(
                **inp,
                do_sample=True,
                top_p=top_p,
                temperature=temperature,
                max_new_tokens=max_new_tokens,
                eos_token_id=tokenizer.eos_token_id,
                pad_token_id=tokenizer.pad_token_id,
            )
        text = tokenizer.decode(gen[0], skip_special_tokens=True)
        outs.append((p, text))
    return outs

class EvalSamplesCallback(TrainerCallback):
    """Trainer callback that prints generated samples at a fixed step interval."""

    def __init__(self, tokenizer, prompts, every_steps=500):
        """Store the tokenizer, the prompt list and the sampling interval (in steps)."""
        self.tok, self.prompts, self.every = tokenizer, prompts, every_steps

    def on_step_end(self, args, state, control, **kwargs):
        """Print samples for the first three prompts when the global step is a
        positive multiple of the configured interval."""
        step = state.global_step
        if step <= 0 or step % self.every != 0:
            return
        generated = generate_samples(kwargs["model"], self.tok, self.prompts[:3], max_new_tokens=60)
        print("\n=== Samples @step", step, "===")
        for prompt, text in generated:
            # Only the first 400 characters of each sample are shown.
            print(f"\n>> {prompt}\n{text[:400]}\n")

# Total number of optimizer steps for the pretraining run.
TRAIN_STEPS = 1500

training_args = TrainingArguments(
    output_dir="pretrain-rus-lit",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=64,
    per_device_eval_batch_size=1,
    logging_steps=50,
    save_steps=500,
    # A positive max_steps takes precedence over num_train_epochs.
    num_train_epochs=1,
    max_steps=TRAIN_STEPS,
    learning_rate=3e-4,
    weight_decay=0.01,
    warmup_ratio=0.05,
    fp16=torch.cuda.is_available(),
    bf16=False,
    report_to="none",
    do_eval=True,
    # Without an explicit strategy the default is "no" and eval_steps is
    # ignored, so no evaluation would run during training.
    eval_strategy="steps",
    eval_steps=250,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_ds["train"],
    eval_dataset=lm_ds["valid"],
    data_collator=data_collator,
)

# Print generated samples every 500 steps during training.
trainer.add_callback(EvalSamplesCallback(hf_tok, test_prompts, every_steps=500))

# Make sure a pad token exists and mirror the tokenizer's special-token ids into the model config.
if hf_tok.pad_token is None:
    hf_tok.pad_token = hf_tok.eos_token
hf_tok.padding_side = "right"
model.config.pad_token_id = hf_tok.pad_token_id
model.config.eos_token_id = hf_tok.eos_token_id
if getattr(model.config, "bos_token_id", None) is None and hf_tok.bos_token_id is not None:
    model.config.bos_token_id = hf_tok.bos_token_id

# Gradient checkpointing is enabled and the KV cache is switched off in the model config.
model.gradient_checkpointing_enable()
model.config.use_cache = False

train_result = trainer.train()

# Persist the trained model and the tokenizer side by side in output_dir.
trainer.save_model()
hf_tok.save_pretrained(training_args.output_dir)

print("Final train loss:", train_result.training_loss)

In [None]:
# Generate a continuation for every test prompt and print prompt/continuation pairs.
samples = generate_samples(
    model,
    hf_tok,
    test_prompts,
    max_new_tokens=60,
)
for prompt, continuation in samples:
    print(f"\n=== PROMPT ===\n{prompt}\n---\n{continuation}\n")

Post-train SFT

In [None]:
!pip install -U transformers datasets trl peft accelerate

In [None]:
import torch, random
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from trl import SFTTrainer, SFTConfig
from peft import LoraConfig

# Seed the Python and torch RNGs.
seed = 42
random.seed(seed); torch.manual_seed(seed)

base_model_id = "Qwen/Qwen2.5-0.5B"

# Tokenizer: fall back to EOS as the pad token when none is defined; pad on the right.
tokenizer = AutoTokenizer.from_pretrained(base_model_id, use_fast=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Base model: half precision when CUDA is available, float32 otherwise.
model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    dtype=(torch.float16 if torch.cuda.is_available() else torch.float32),
    device_map="auto",
)
model.config.pad_token_id = tokenizer.pad_token_id

model.gradient_checkpointing_enable()
# With the base weights frozen (LoRA), checkpointed blocks receive inputs that
# do not require grad, which can leave the adapter weights without gradients.
# Making the embedding outputs require grad avoids that.
# NOTE(review): newer TRL versions may already do this internally — the call is harmless either way.
model.enable_input_require_grads()

try:
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.set_float32_matmul_precision("high")
except AttributeError:
    # Best-effort speed-up only: torch builds without these settings are left unchanged.
    pass

# LoRA
peft_cfg = LoraConfig(
    r=8, lora_alpha=16, lora_dropout=0.05,
    target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"],
    bias="none", task_type="CAUSAL_LM",
)

print("dtype:", next(model.parameters()).dtype)


In [None]:

# Instruction dataset used for supervised fine-tuning.
raw = load_dataset(
    "d0rj/alpaca-cleaned-ru",
)

# Prompt layout: system section, user section, then the assistant header the
# model is expected to continue from.
PROMPT_TPL = """### Система:
{system}

### Пользователь:
{instruction}

### Ассистент:
"""
# Marker that starts the assistant's answer inside a formatted example.
RESPONSE_TAG = "### Ассистент:\n"
def to_text(ex):
    """Render one dataset record as a single training string.

    Reads the optional "system", "instruction", "input" and "output" fields
    (missing or None values count as empty). A non-empty "input" is appended
    to the instruction after a blank line, so tasks that refer to supplied
    context are not trained without it.
    NOTE(review): presumably the dataset follows the Alpaca schema with an
    "input" column — verify; records without it are rendered as before.

    Args:
        ex: Mapping representing one dataset row.

    Returns:
        Dict with a single "text" key: the filled prompt template followed by
        the expected answer.
    """
    system = (ex.get("system") or "").strip()
    instr = (ex.get("instruction") or "").strip()
    context = (ex.get("input") or "").strip()
    out = (ex.get("output") or "").strip()
    if context:
        instr = f"{instr}\n\n{context}" if instr else context
    return {"text": PROMPT_TPL.format(system=system, instruction=instr) + out}

# Render every record to a single "text" column, dropping the original columns.
train_all = raw["train"].map(to_text, remove_columns=raw["train"].column_names)


# Hold out 2% of the examples for validation.
split = train_all.train_test_split(test_size=0.02, seed=seed)
train_ds, valid_ds = split["train"], split["test"]


# Subsample both splits; sizes are clamped to what is actually available so a
# smaller dataset does not raise an out-of-range error in select().
train_ds = train_ds.shuffle(seed=seed).select(range(min(20000, len(train_ds))))
valid_ds = valid_ds.shuffle(seed=seed).select(range(min(1000, len(valid_ds))))


# Sanity check: every formatted example must contain the assistant marker.
n = min(2000, len(train_ds))
bad = sum(1 for t in train_ds.select(range(n))["text"] if RESPONSE_TAG not in t)
print(f"Проверка шаблона: примеров без RESPONSE_TAG = {bad}/{n} (должно быть 0)")


In [None]:
from trl import SFTTrainer, SFTConfig
from transformers import DataCollatorForLanguageModeling

RESPONSE_TAG = "### Ассистент:\n"

use_cuda = torch.cuda.is_available()
sft_cfg = SFTConfig(
    output_dir="qwen-0.5b-sft-ru",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    warmup_ratio=0.05,
    num_train_epochs=1,
    logging_steps=100,
    save_steps=1000,
    # An eval_dataset is passed below; without an explicit strategy the default
    # is "no" and it would never be evaluated during training.
    eval_strategy="steps",
    eval_steps=500,
    report_to="none",
    fp16=use_cuda,
    bf16=False,
)


def formatting_func(example):
    """Return the pre-rendered training string stored in the example's "text" field."""
    return example["text"]

# No explicit data_collator: transformers' DataCollatorForLanguageModeling
# masks every label equal to pad_token_id, which also masks end-of-sequence
# tokens when pad and EOS share one id, so the model is never trained to stop.
# NOTE(review): assumes the installed TRL default collator pads labels
# separately and that pad == EOS for this tokenizer — confirm both.
trainer = SFTTrainer(
    model=model,
    args=sft_cfg,
    train_dataset=train_ds,
    eval_dataset=valid_ds,
    peft_config=peft_cfg,
    formatting_func=formatting_func,
    )

trainer.train()


In [None]:
# Russian questions used to spot-check the fine-tuned model.
questions_rus = [
    "сколько планет в нашей солнечной системе?",
    "расскажи стих",
    "когда собирать крыжовник?",
    "Как быстро выучить новый язык?",
]

def make_prompt(q):
    """Fill the prompt template with a fixed system message and the user's question."""
    return PROMPT_TPL.format(
        system="Ты дружелюбный русскоязычный ассистент.",
        instruction=q,
    )

model.eval()
for question in questions_rus:
    prompt_inputs = tokenizer(make_prompt(question), return_tensors="pt").to(model.device)
    with torch.no_grad():
        generated = model.generate(
            **prompt_inputs,
            do_sample=True,
            top_p=0.9,
            temperature=0.7,
            max_new_tokens=180,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.pad_token_id,
        )
    # The decoded text contains the prompt followed by the generated answer.
    answer = tokenizer.decode(generated[0], skip_special_tokens=True)
    print(f"\n=== PROMPT ===\n{question}\n---\n{answer}")