In [3]:
!pip -q install "transformers>=4.41.0" datasets accelerate trl peft bitsandbytes \
              einops wandb --upgrade


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.5/10.5 MB[0m [31m95.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m35.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m362.1/362.1 kB[0m [31m29.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m366.3/366.3 kB[0m [31m28.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m411.1/411.1 kB[0m [31m31.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.0/67.0 MB[0m [31m37.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.2/23.2 MB[0m [31m104.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━

In [1]:
# ══ CONSTANTS ══
import torch
from pathlib import Path

# Device configuration
SEED         = 42
FP16         = torch.cuda.is_available()
DEVICE       = "cuda" if FP16 else "cpu"

# Dataset
DATASET_ID   = "juyoungml/HelpSteer2-binarized"

# Model
MODEL_NAME   = "HuggingFaceTB/SmolLM2-135M-Instruct"

# Reward model storage
RM_DIR_ZIP   = Path("/content/rm_ckpt.zip")
RM_EXTRACT   = Path("/content/rm_ckpt")

# Sequence lengths
MAX_LEN      = 512
MAX_LEN_RM   = 512
MAX_NEW_TOKENS = 128

# Training config for reward model
BATCH        = 4
EPOCHS       = 1
LR           = 5e-5

# Training config for REINFORCE
BATCH_SIZE       = 4
STEPS            = 300
LR_POLICY        = 1e-5
BASELINE_ALPHA   = 0.9


In [2]:
import os, torch, random, json, math, accelerate, numpy as np
from datasets import load_dataset, DatasetDict
from transformers import (
    AutoTokenizer, AutoModelForCausalLM,
    AutoModelForSequenceClassification, TrainingArguments
)
from trl import RewardTrainer
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)
if DEVICE == "cuda": torch.cuda.manual_seed_all(SEED)

print(f"device: {DEVICE}   fp16: {FP16}")


device: cuda   fp16: True


In [3]:
raw = load_dataset(DATASET_ID)

def split_columns(example):
    return {
        "prompt":   example["prompt"],
        "chosen":   example["chosen"],
        "rejected": example["rejected"],
    }

splits = DatasetDict({
    "train":      raw["train"].map(split_columns, remove_columns=raw["train"].column_names),
    "validation": raw["validation"].map(split_columns, remove_columns=raw["validation"].column_names),
})

print(splits)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/609 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/13.5M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/708k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7224 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/373 [00:00<?, ? examples/s]

Map:   0%|          | 0/7224 [00:00<?, ? examples/s]

Map:   0%|          | 0/373 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['prompt', 'chosen', 'rejected'],
        num_rows: 7224
    })
    validation: Dataset({
        features: ['prompt', 'chosen', 'rejected'],
        num_rows: 373
    })
})


In [4]:
policy_model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16 if FP16 else torch.float32,
    device_map="auto",
)

# LoRA
policy_model = prepare_model_for_kbit_training(policy_model)
lora_cfg = LoraConfig(
    r=16, lora_alpha=32, target_modules=["q_proj","k_proj","v_proj","o_proj"],
    lora_dropout=0.05, bias="none", task_type="CAUSAL_LM"
)
policy_model = get_peft_model(policy_model, lora_cfg).to(DEVICE)
policy_model.print_trainable_parameters()


config.json:   0%|          | 0.00/861 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/269M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

trainable params: 1,843,200 || all params: 136,358,208 || trainable%: 1.3517


In [5]:
# Train Reward Model

rm_model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=1,
    torch_dtype=torch.float16 if FP16 else torch.float32,
    device_map="auto",
)

training_args = TrainingArguments(
    output_dir="./rm_training",           # Output directory
    per_device_train_batch_size=8,        # Train batch size
    per_device_eval_batch_size=8,         # Eval batch size
    num_train_epochs=1,                   # One epoch (as instructed)
    learning_rate=5e-5,                   # Learning rate
    fp16=FP16,                            # Use fp16 if available
    seed=SEED,                            # Seed for reproducibility
)


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at HuggingFaceTB/SmolLM2-135M-Instruct and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
import os, json, glob, math, shutil, zipfile, random, torch, pathlib
from tqdm.auto import tqdm
from pathlib import Path
from huggingface_hub import snapshot_download
from transformers import (AutoTokenizer, AutoModelForSequenceClassification,
                          AutoModelForCausalLM, BitsAndBytesConfig,
                          get_linear_schedule_with_warmup)

random.seed(SEED); torch.manual_seed(SEED)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
tokenizer.pad_token = tokenizer.eos_token


tokenizer_config.json:   0%|          | 0.00/3.76k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/801k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.10M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/655 [00:00<?, ?B/s]

In [7]:
local_dset_dir = snapshot_download(repo_id=DATASET_ID, repo_type="dataset",
                                   local_dir_use_symlinks=False)

import datasets, glob
def parquet_files(split):
    pat = os.path.join(local_dset_dir, "**", f"{split}*.parquet")
    return sorted(glob.glob(pat, recursive=True))

raw = {
    "train": datasets.Dataset.from_parquet(parquet_files("train")),
    "validation": (
        datasets.Dataset.from_parquet(parquet_files("validation"))
        if parquet_files("validation") else
        datasets.Dataset.from_parquet(parquet_files("test"))
    )
}
splits = datasets.DatasetDict(raw)


def encode_pair(ex):
    c = tokenizer(
        ex["prompt"] + ex["chosen"],
        truncation=True,
        padding="max_length",
        max_length=MAX_LEN_RM
    )
    r = tokenizer(
        ex["prompt"] + ex["rejected"],
        truncation=True,
        padding="max_length",
        max_length=MAX_LEN_RM
    )
    return {
        "input_ids_c": c["input_ids"], "attn_c": c["attention_mask"],
        "input_ids_r": r["input_ids"], "attn_r": r["attention_mask"],
    }

proc = splits.map(encode_pair, batched=False,
                  remove_columns=splits["train"].column_names)



Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

.gitattributes:   0%|          | 0.00/2.31k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/7224 [00:00<?, ? examples/s]

Map:   0%|          | 0/373 [00:00<?, ? examples/s]

In [9]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from torch.utils.data import DataLoader
from transformers import get_linear_schedule_with_warmup
import torch, numpy as np
from tqdm.auto import tqdm
import shutil

def pairwise_collate(examples):
    """
    Converts list of dicts to dict of Long tensors.
    Keys: input_ids_c / attn_c / input_ids_r / attn_r
    """
    batch = {}
    for k in examples[0].keys():
        batch[k] = torch.tensor([e[k] for e in examples], dtype=torch.long)
    return batch

def train_reward_model():
    # Load RM tokenizer
    rm_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
    rm_tokenizer.pad_token = rm_tokenizer.eos_token
    rm_tokenizer.padding_side = "left"

    # Tokenize chosen/rejected pairs
    def encode_pair(ex):
        c = rm_tokenizer(
            ex["prompt"] + ex["chosen"],
            truncation=True, padding="max_length", max_length=MAX_LEN_RM
        )
        r = rm_tokenizer(
            ex["prompt"] + ex["rejected"],
            truncation=True, padding="max_length", max_length=MAX_LEN_RM
        )
        return {
            "input_ids_c": c["input_ids"], "attn_c": c["attention_mask"],
            "input_ids_r": r["input_ids"], "attn_r": r["attention_mask"],
        }

    # Apply tokenization
    proc = splits.map(encode_pair, batched=False, remove_columns=splits["train"].column_names)

    # Data collation function
    def pairwise_collate(examples):
        batch = {}
        for k in examples[0].keys():
            batch[k] = torch.tensor([e[k] for e in examples], dtype=torch.long)
        return batch

    # DataLoader for RM training
    train_loader = DataLoader(
        proc["train"],
        batch_size=BATCH_SIZE,
        shuffle=True,
        drop_last=True,
        collate_fn=pairwise_collate,
    )

    # Initialize RM model
    rm = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME,
        num_labels=1,
        torch_dtype=torch.float32,
    ).to(DEVICE)

    optimizer = torch.optim.AdamW(rm.parameters(), lr=5e-5)
    steps_total = len(train_loader)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=int(0.06 * steps_total),
        num_training_steps=steps_total,
    )

    # Training loop
    rm.train()
    pbar = tqdm(train_loader, desc="Training RM")
    losses = []

    for batch in pbar:
        ic, ac = batch["input_ids_c"].to(DEVICE), batch["attn_c"].to(DEVICE)
        ir, ar = batch["input_ids_r"].to(DEVICE), batch["attn_r"].to(DEVICE)

        logits_c = rm(ic, attention_mask=ac).logits.squeeze(-1)
        logits_r = rm(ir, attention_mask=ar).logits.squeeze(-1)
        loss = -torch.nn.functional.logsigmoid(logits_c - logits_r).mean()

        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

        losses.append(loss.item())
        pbar.set_postfix(loss=loss.item())

    print("Finished training RM | mean loss:", np.mean(losses))

    # Save model and tokenizer
    rm.save_pretrained(RM_EXTRACT)
    rm_tokenizer.save_pretrained(RM_EXTRACT)
    shutil.make_archive("rm_ckpt", "zip", root_dir=RM_EXTRACT)
    print("Saved to:", RM_EXTRACT)

train_reward_model()


Map:   0%|          | 0/7224 [00:00<?, ? examples/s]

Map:   0%|          | 0/373 [00:00<?, ? examples/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at HuggingFaceTB/SmolLM2-135M-Instruct and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training RM:   0%|          | 0/1806 [00:00<?, ?it/s]

Finished training RM | mean loss: 0.6821198119624112
Saved to: /content/rm_ckpt


In [10]:
# ────────────────────────────────────────────
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from pathlib import Path
import torch, numpy as np

rm_tokenizer = AutoTokenizer.from_pretrained(RM_EXTRACT, use_fast=True)
rm_tokenizer.pad_token = rm_tokenizer.eos_token
rm_tokenizer.padding_side = "left"

reward_model = AutoModelForSequenceClassification.from_pretrained(
    RM_EXTRACT,
    num_labels=1,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    device_map="auto",
).eval()

tokenizer.padding_side                 = "left"
tokenizer.pad_token                    = tokenizer.eos_token
policy_model.config.pad_token_id       = tokenizer.pad_token_id
reward_model.config.pad_token_id       = rm_tokenizer.pad_token_id


In [11]:
# Pick a sample from the training set
example = splits["train"][0]

prompt   = example["prompt"]
chosen   = example["chosen"]
rejected = example["rejected"]

# Prepare texts for reward scoring
texts = [prompt + chosen, prompt + rejected]

# Compute rewards using reward model
with torch.no_grad():
    toks = rm_tokenizer(
        texts,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=MAX_LEN_RM
    ).to(DEVICE)

    with torch.cuda.amp.autocast(enabled=False):
        logits = reward_model(**toks).logits.squeeze(-1).float()

    logits = torch.nan_to_num(logits, nan=0.0, posinf=0.0, neginf=0.0)

print("Reward for chosen  :", logits[0].item())
print("Reward for rejected:", logits[1].item())


  with torch.cuda.amp.autocast(enabled=False):


Reward for chosen  : 0.151611328125
Reward for rejected: -0.1290283203125


In [12]:
@torch.no_grad()
def rm_score(texts):
    """
    Compute reward scores for a list of texts using the reward model.
    """
    toks = rm_tokenizer(
        texts,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=MAX_LEN_RM
    ).to(DEVICE)

    with torch.cuda.amp.autocast(enabled=False):
        logits = reward_model(**toks).logits.squeeze(-1).float()

    logits = torch.nan_to_num(logits, nan=0.0, posinf=0.0, neginf=0.0)
    return logits


In [13]:
p = "What is RLHF?"
chosen = p + "Reinforcement Learning from Human Feedback is a method of training LLMs..."
rejected = p + "I don't know."

print("Reward chosen:  ", rm_score([chosen]))
print("Reward rejected:", rm_score([rejected]))

  with torch.cuda.amp.autocast(enabled=False):


Reward chosen:   tensor([-0.6348], device='cuda:0')
Reward rejected: tensor([-2.4824], device='cuda:0')


In [None]:
# REINFORCE training loop for policy model alignment

from torch.optim import AdamW
from torch.nn.utils import clip_grad_norm_
from tqdm.auto import tqdm
import numpy as np
import torch

optimizer = AdamW(policy_model.parameters(), lr=LR_POLICY)
baseline = 0.0
policy_model.train()

# Training prompts from dataset
train_prompts = splits["train"]["prompt"]

# Ensure correct tokenizer settings
tokenizer.padding_side = "left"
tokenizer.truncation_side = "left"
tokenizer.pad_token = tokenizer.eos_token

for step in tqdm(range(STEPS), desc="REINFORCE"):
    # Sample a minibatch of prompts
    idx = np.random.randint(0, len(train_prompts), size=BATCH_SIZE)
    batch = [train_prompts[i] for i in idx]

    enc = tokenizer(
        batch,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=MAX_LEN
    ).to(DEVICE)

    # Generate responses
    with torch.no_grad():
        gen_out = policy_model.generate(
            **enc,
            max_new_tokens=MAX_NEW_TOKENS,
            do_sample=True,
            top_p=0.9,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
            return_dict_in_generate=True
        )
    full_seq = gen_out.sequences
    gen_tokens = full_seq[:, enc.input_ids.size(1):]

    # Compute log pi(a|s) for the generated part
    attn_mask = (full_seq != tokenizer.pad_token_id).long()
    logits = policy_model(full_seq, attention_mask=attn_mask).logits.float()
    log_probs = torch.log_softmax(logits, dim=-1)

    Lg = gen_tokens.size(1)
    logp_slice = log_probs[:, -Lg:, :]
    token_logp = logp_slice.gather(2, gen_tokens.unsqueeze(-1)).squeeze(-1).sum(dim=1)

    # Compute REWARD using reward model
    texts = [p + tokenizer.decode(g, skip_special_tokens=True) for p, g in zip(batch, gen_tokens)]
    reward = rm_score(texts)

    # for i, text in enumerate(texts):
    #     print(f"TEXT {i}: {text}")
    # print("REWARD:", reward)

    # Compute loss using advantage
    baseline = BASELINE_ALPHA * baseline + (1 - BASELINE_ALPHA) * reward.mean().item()
    advantage = reward - baseline
    loss = -(advantage.detach() * token_logp).mean()

    # Update policy network
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    clip_grad_norm_(policy_model.parameters(), 1.0)
    optimizer.step()

    if step % 20 == 0:
        print(f"step {step:3d} | reward {reward.mean():7.3f} "
              f"| baseline {baseline:7.3f} | loss {loss.item():8.4f}")


REINFORCE:   0%|          | 0/300 [00:00<?, ?it/s]

  with torch.cuda.amp.autocast(enabled=False):


[1;30;43mВыходные данные были обрезаны до нескольких последних строк (5000).[0m
 %%map drum ^c' heads=xhead print=c'  % Crash
 %%map drum ^g heads=xhead print=g  % Hi-Hat
 %%map drum ^A' heads=xhead print=A'  % Ride
 %%map drum ^B' heads=triangle print=A' % Ride Bell
 %%map drum ^d, heads=xhead print=d,'  % Pedal hihat
 %
 %
 X:6
 T:
 M:4/4
 L:1/8
 %
 %%score (Drums)
 K:C clef=perc
 V:Drums stem=up
 %%voicemap drum
 [F^g]^g [c^g]^g [F^g]^g [c^g]^g|]
 

<extra_id_1>Assistant
Yes, I can interpret this Music Notation. This is a drum part with the following drumset configuration: a Crash Cymbal at the beginning of each measure, a Hi-Hat Cymbal at the beginning of each beat, a Ride Cymbal at the beginning of each 2nd beat, a Ride Cymbal Bell at the beginning of each 3rd beat, and a Pedalled Hi-Hat Cymbal at the beginning of each 4th beat. The overall tempo is approximately 120 beats per minute, in a 4/4 time signature.
<extra_id_1>User
Amazing. Could you generate a readable drum tab for t

In [None]:
# ────────────────────────────────────────────
@torch.no_grad()
def average_reward(model, prompts, batch_size=8, max_gen=128):
    model.eval()
    scores = []
    for i in range(0, len(prompts), batch_size):
        sub = prompts[i : i + batch_size]
        enc = tokenizer(
            sub, return_tensors="pt", padding=True,
            truncation=True, max_length=MAX_LEN
        ).to(DEVICE)

        gen_out = model.generate(
            **enc, max_new_tokens=max_gen, do_sample=True, top_p=0.9,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id, return_dict_in_generate=True
        )
        gen_txt = [
            p + tokenizer.decode(seq[enc.input_ids.shape[1]:],
                                 skip_special_tokens=True)
            for p, seq in zip(sub, gen_out.sequences)
        ]
        scores.extend(rm_score(gen_txt).cpu().tolist())
    return float(np.mean(scores))

val_prompts = splits["validation"]["prompt"][:512]

base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME, device_map="auto"
)
reward_base = average_reward(base_model, val_prompts)

reward_rl   = average_reward(policy_model, val_prompts)

print(f"Avg reward SFT : {reward_base:8.4f}")
print(f"Avg reward RL  : {reward_rl:8.4f}")
improve = reward_rl - reward_base
print(f"delta improvement    : {improve:+8.4f}")


  with torch.cuda.amp.autocast(enabled=False):


Avg reward SFT :  -0.0537
Avg reward RL  :  -0.0366
delta improvement    :  +0.0171


In [17]:
if tokenizer.pad_token_id == tokenizer.eos_token_id:
    tokenizer.add_special_tokens({"pad_token": "<pad>"})
    pad_id = tokenizer.pad_token_id
    policy_model.resize_token_embeddings(len(tokenizer))
    reward_model.resize_token_embeddings(len(tokenizer))
else:
    pad_id = tokenizer.pad_token_id

tokenizer.padding_side          = "left"
policy_model.config.pad_token_id = pad_id
reward_model.config.pad_token_id = pad_id
rm_tokenizer.padding_side        = "left"
rm_tokenizer.pad_token           = tokenizer.pad_token
rm_tokenizer.pad_token_id        = pad_id


The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


In [18]:
def chat(model, prompt, max_gen=128):
    enc = tokenizer(prompt, return_tensors="pt").to(DEVICE)
    out = model.generate(
        **enc,
        max_new_tokens=max_gen,
        do_sample=True,
        top_p=0.9,
        pad_token_id=pad_id,
        eos_token_id=tokenizer.eos_token_id
    )
    gen_text = tokenizer.decode(
        out[0][enc.input_ids.shape[1]:],
        skip_special_tokens=True
    )
    return gen_text


In [19]:

test_prompt = "Question: Why is the sky blue?\nResponse:"


print("── SFT reply ──")
print(chat(base_model,  test_prompt))
print("\n── RL reply  ──")
print(chat(policy_model, test_prompt))


── SFT reply ──
 The sky is blue because of a combination of the blue light that is emitted by the Earth's oceans and atmosphere, and the scattering of light by the air. As light passes through the atmosphere, it is scattered in all directions in every direction. However, there are certain parts of the sky that reflect more blue light than others, so the blue light from the oceans and atmosphere reaches the surface of the Earth in a wider range of directions than the blue light from the air. As a result, the blue light is scattered, and the sky appears blue.
Answer:

── RL reply  ──
 The sky appears blue because it contains oxygen, which is a gas that gives the sky its blue color. Oxygen is a byproduct of the combustion of fuels like fossil fuels, and the sun emits ultraviolet radiation that is not absorbed by the atmosphere. This ultraviolet radiation causes the color of the sky to shift, causing it to appear blue.


In [20]:
SAVE_DIR = "./policy_rl_lora"
policy_model.save_pretrained(SAVE_DIR)
tokenizer.save_pretrained(SAVE_DIR)
print("RL-LoRA адаптер сохранён →", SAVE_DIR)




RL-LoRA адаптер сохранён → ./policy_rl_lora


In [28]:
# Level 2 - Train reward model with 10-class output
NUM_CLASSES = 10
import torch, numpy as np
from transformers import AutoModelForSequenceClassification
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
from transformers import get_linear_schedule_with_warmup
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset, DatasetDict

# Load train/validation splits from dataset
raw = load_dataset(DATASET_ID)

# Keep only required fields: prompt, chosen, rejected
def split_columns(example):
    return {
        "prompt"  : example["prompt"],
        "chosen"  : example["chosen"],
        "rejected": example["rejected"],
    }

# Map dataset to desired format
splits = DatasetDict({
    "train"      : raw["train"].map(split_columns, remove_columns=raw["train"].column_names),
    "validation" : raw["validation"].map(split_columns, remove_columns=raw["validation"].column_names),
})

# Load or reuse tokenizer
rm_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

# Add <pad> token if missing
if rm_tokenizer.pad_token_id is None:
    rm_tokenizer.add_special_tokens({"pad_token": "<pad>"})
pad_id = rm_tokenizer.pad_token_id

# Set padding/truncation behavior
rm_tokenizer.padding_side    = "left"
rm_tokenizer.truncation_side = "left"

# Load reward model with 10 output classes and resize embeddings
rm = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME, num_labels=NUM_CLASSES, torch_dtype=torch.float32
     )
rm.resize_token_embeddings(len(rm_tokenizer))
rm.to(DEVICE)

# Tokenize dataset as input pairs for reward model
def make_dataset(tokenizer, max_len=512):
    def encode_pair(ex):
        c = tokenizer(ex["prompt"] + ex["chosen"],
                      truncation=True, padding="max_length",
                      max_length=max_len)
        r = tokenizer(ex["prompt"] + ex["rejected"],
                      truncation=True, padding="max_length",
                      max_length=max_len)
        return {
            "ic": c["input_ids"], "ac": c["attention_mask"],
            "ir": r["input_ids"], "ar": r["attention_mask"],
        }
    return splits.map(encode_pair, remove_columns=splits["train"].column_names)

# Preprocess dataset
proc = make_dataset(rm_tokenizer)

# Collate function to form batches of tensors
def collate(ex):
    return {k: torch.tensor([e[k] for e in ex]) for k in ex[0]}

# Create dataloader for training
train_loader = DataLoader(proc["train"], batch_size=2, shuffle=True,
                          drop_last=True, collate_fn=collate)


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at HuggingFaceTB/SmolLM2-135M-Instruct and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/373 [00:00<?, ? examples/s]

In [None]:
opt  = torch.optim.AdamW(rm.parameters(), lr=5e-5)
sched = get_linear_schedule_with_warmup(
        opt, 0, len(train_loader)*1)

rm.train(); losses=[]
for epoch in range(1):
    pbar = tqdm(train_loader, desc=f"RM epoch {epoch+1}")
    for batch in pbar:
        ic, ac = batch["ic"].to(DEVICE), batch["ac"].to(DEVICE)
        ir, ar = batch["ir"].to(DEVICE), batch["ar"].to(DEVICE)

        pc = torch.softmax(rm(ic, attention_mask=ac).logits, -1)
        pr = torch.softmax(rm(ir, attention_mask=ar).logits, -1)

        score_vec = torch.arange(1, NUM_CLASSES+1, device=DEVICE)
        Ec = (pc * score_vec).sum(-1)
        Er = (pr * score_vec).sum(-1)

        loss = -torch.nn.functional.logsigmoid(Ec - Er).mean()
        loss.backward(); opt.step(); sched.step(); opt.zero_grad()
        losses.append(loss.item()); pbar.set_postfix(loss=float(loss))
print("mean loss:", np.mean(losses))

rm.eval(); rm_tokenizer.save_pretrained("rm_v2"); rm.save_pretrained("rm_v2")


RM epoch 1:   0%|          | 0/3612 [00:00<?, ?it/s]

mean loss: 0.6758432380054744


In [31]:
score_vec = torch.arange(1, NUM_CLASSES+1, device=DEVICE)

@torch.no_grad()
def rm_expect(texts):
    tok = rm_tokenizer(texts, return_tensors="pt",
                       padding=True, truncation=True,
                       max_length=MAX_LEN_RM).to(DEVICE)
    with torch.cuda.amp.autocast(enabled=False):
        p = torch.softmax(rm(**tok).logits.float(), -1)
    return (p * score_vec).sum(-1)


In [32]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# Load tokenizer and set pad token settings
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
tokenizer.pad_token        = tokenizer.eos_token
tokenizer.padding_side     = "left"
tokenizer.truncation_side  = "left"

# Load SFT model and prepare for LoRA adaptation
policy_model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16 if FP16 else torch.float32,
    device_map="auto"
)

# Prepare model for LoRA training (e.g., cast norms to fp32)
policy_model = prepare_model_for_kbit_training(policy_model)

# Define LoRA config for attention projections
lora_cfg = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

# Wrap model with LoRA adapters
policy_model = get_peft_model(policy_model, lora_cfg).to(DEVICE)

# Optional: print number of trainable parameters
policy_model.print_trainable_parameters()


trainable params: 1,843,200 || all params: 136,358,208 || trainable%: 1.3517


In [None]:
# ═  REINFORCE loop  ══
from torch.optim import AdamW
from torch.nn.utils import clip_grad_norm_
from tqdm.auto import tqdm
import numpy as np, torch

# — Optimizer and baseline initialization —
optimizer  = AdamW(policy_model.parameters(), lr=LR_POLICY)
baseline   = 0.0
policy_model.train()

# Get all training prompts
train_prompts = splits["train"]["prompt"]

# Set tokenizer to left-padding for generation
tokenizer.padding_side = "left"
tokenizer.truncation_side = "left"
tokenizer.pad_token    = tokenizer.eos_token

for step in tqdm(range(STEPS), desc="REINFORCE"):
    # Sample a batch of prompts
    idx   = np.random.randint(0, len(train_prompts), size=BATCH_SIZE)
    batch = [train_prompts[i] for i in idx]

    enc = tokenizer(batch, return_tensors="pt",
                    padding=True, truncation=True,
                    max_length=MAX_LEN).to(DEVICE)

    # Generate responses
    with torch.no_grad():
        gen_out = policy_model.generate(
            **enc,
            max_new_tokens     = MAX_NEW_TOKENS,
            do_sample          = True,
            top_p              = 0.9,
            pad_token_id       = tokenizer.pad_token_id,
            eos_token_id       = tokenizer.eos_token_id,
            return_dict_in_generate=True
        )
    full_seq   = gen_out.sequences
    gen_tokens = full_seq[:, enc.input_ids.size(1):]

    # Compute log-probs of sampled tokens
    attn_mask = (full_seq != tokenizer.pad_token_id).long()
    logits    = policy_model(full_seq, attention_mask=attn_mask).logits.float()
    log_probs = torch.log_softmax(logits, dim=-1)

    Lg = gen_tokens.size(1)
    logp_slice = log_probs[:, -Lg:, :]
    token_logp = logp_slice.gather(2, gen_tokens.unsqueeze(-1)
                  ).squeeze(-1).sum(dim=1)

    # Compute rewards
    texts = [p + tokenizer.decode(g, skip_special_tokens=True)
             for p, g in zip(batch, gen_tokens)]
    reward = rm_expect(texts)

    # for i, text in enumerate(texts):
    #     print(f"TEXT {i}: {text}")
    # print("REWARD:", reward)

    # Compute advantage and loss
    baseline  = BASELINE_ALPHA * baseline + (1 - BASELINE_ALPHA) * reward.mean().item()
    advantage = reward - baseline

    loss = -(advantage.detach() * token_logp).mean()

    # Backprop and update policy
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    clip_grad_norm_(policy_model.parameters(), 1.0)
    optimizer.step()

    # Logging every 20 steps
    if step % 20 == 0:
        print(f"step {step:3d} | reward {reward.mean():7.3f} "
              f"| baseline {baseline:7.3f} | loss {loss.item():8.4f}")


REINFORCE:   0%|          | 0/300 [00:00<?, ?it/s]

  with torch.cuda.amp.autocast(enabled=False):


[1;30;43mВыходные данные были обрезаны до нескольких последних строк (5000).[0m
Slide 6:
- **Employee turnover**: 10.1 million workers lost their jobs due to employee turnover and increased turnover.
- **Inventory management**: 5.9 million employees lost their jobs due to inventory management in the first wave of COVID-19.

Slide 7:
- **Diversification of job markets**: 30.7 million workers lost their jobs due to diversification in the job market.
- **Employee turnover and job churn**: 23.5 million workers lost their jobs due to employee turnover and job churn.

Slide 8:
- **Infrastructure and workforce access**: 3.9 million workers lost their jobs due to infrastructure and workforce
TEXT 3: can you write me a power shell script for windows, that lists all membergroups and their members?
<extra_id_1>Assistant
Sure, here’s a simple script to list all groups and their members:
 

 Get-Msolusegroup | Select-Object name, members
 

 You can run this script in the Power Shell on your comp

In [41]:
#Final step
def chat(model, prompt, max_gen=128):
    enc = tokenizer(prompt, return_tensors="pt").to(DEVICE)
    out = model.generate(
        **enc,
        max_new_tokens=max_gen, do_sample=True, top_p=0.9,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
        return_dict_in_generate=True
    )
    return tokenizer.decode(out.sequences[0][enc.input_ids.shape[1]:], skip_special_tokens=True)

eval_prompts = [
    "Why is the sky blue?:",
    "What is RLHF?:",
    "Explain how photosynthesis works.:",
]
base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME, device_map="auto"
)
print("\n Evaluation:")
for prompt in eval_prompts:
    sft_response = chat(base_model, prompt)
    rl_response  = chat(policy_model, prompt)

    reward_sft = rm_expect([prompt + sft_response]).item()
    reward_rl  = rm_expect([prompt + rl_response]).item()

    print(f"\nPrompt: {prompt}")
    print(f"SFT: {sft_response!r}  | Reward: {reward_sft:.4f}")
    print(f"RL : {rl_response!r}  | Reward: {reward_rl:.4f}")
    improve = reward_rl - reward_sft
    print(f"delta improvement    : {improve:+8.4f}")



 Evaluation:


  with torch.cuda.amp.autocast(enabled=False):



Prompt: Why is the sky blue?:
SFT: '\nIs the sky blue in the movie Skyfall?'  | Reward: 1.3632
RL : '\n\n1. **Blue light:** This is due to the blue range of wavelengths in the visible spectrum, which is more accessible and easier to perceive for humans than the red and orange wavelengths. Blue light is often emitted by the sun, stars, and other celestial objects.\n\n2. **Spectacular:** Sky blue is often associated with the vastness of the universe and the sky itself, creating a sense of awe and wonder. Sky blue also evokes a sense of harmony and balance, as the colors of the sky and the surrounding landscape seem to be in balance.\n\nRemember that the sky blue is subjective and can depend on personal'  | Reward: 3.0625
delta improvement    :  +1.6993

Prompt: What is RLHF?:
SFT: ''  | Reward: 1.1689
RL : "\nRLHF stands for 'Real Linear Loopy Fitting'. It is a popular method for fitting linear regression models to non-linear data. It is often used for simple linear regression models an

In [None]:
import torch, numpy as np
from tqdm.auto import tqdm

def mean_reward(model, prompts, rm_fn, *, batch=8, max_len=128):
    model.eval()
    rewards = []

    for i in tqdm(range(0, len(prompts), batch), desc="eval"):
        batch_prompts = prompts[i:i+batch]
        enc = tokenizer(batch_prompts,
                        return_tensors="pt",
                        padding=True,
                        truncation=True,
                        max_length=MAX_LEN).to(DEVICE)

        with torch.no_grad():
            gen = model.generate(
                **enc,
                max_new_tokens=max_len,
                do_sample=False,           
                pad_token_id=tokenizer.pad_token_id,
                eos_token_id=tokenizer.eos_token_id,
            )

        gen_txt = []
        for p, seq in zip(batch_prompts, gen):
            answer = tokenizer.decode(seq[enc.input_ids.shape[1]:],
                                      skip_special_tokens=True)
            gen_txt.append(p + answer)

        r = rm_fn(gen_txt).cpu().numpy()   
        rewards.extend(r)

    return float(np.mean(rewards))


In [None]:
# Testing base model and improved reward model
import random
sample_idx   = random.sample(range(len(splits["validation"])),
                             k = len(splits["validation"]) // 10)
sub_prompts  = [splits["validation"]["prompt"][i] for i in sample_idx]

mean_sft = mean_reward(base_model,   sub_prompts, rm_expect)
mean_rl  = mean_reward(policy_model, sub_prompts, rm_expect)

print(f"SFT mean reward : {mean_sft:.4f}")
print(f"RL  mean reward : {mean_rl :.4f}")
print(f"Δ improvement   : {mean_rl - mean_sft:+.4f}")

assert mean_rl > mean_sft, "Probabilistic RM: reward did not improve"


eval:   0%|          | 0/5 [00:00<?, ?it/s]

  with torch.cuda.amp.autocast(enabled=False):


eval:   0%|          | 0/5 [00:00<?, ?it/s]

SFT mean reward : 1.4113
RL  mean reward : 1.4529
Δ improvement   : +0.0415
