# Offline RL (advantage-weighted SFT) for Qwen-0.5B on RL rewards
This notebook fine-tunes your existing Qwen-0.5B SFT model with the reward-annotated dataset `rl_games_positions.jsonl` produced by `train_scripts/rl_pgn_data_prep.py`.
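It uses a lightweight, advantage-weighted-style SFT recipe: a single training pass over the top-K examples ranked by reward (examples are selected by reward, not individually loss-weighted), sized to fit within a Kaggle T4 24h budget.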

In [None]:
%%bash
pip install -U "transformers>=4.44" "trl>=0.10" peft datasets accelerate huggingface_hub hf-transfer


In [None]:
import torch, os, json
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import SFTTrainer, SFTConfig
from peft import LoraConfig


## Paths and config
- `data_path`: reward-annotated JSONL produced by `train_scripts/rl_pgn_data_prep.py`.
- `base_model`: your SFT/merged checkpoint (adjust to your HF repo or local path).
- `output_dir`: where the RL LoRA adapter will be saved.
- `top_k`: number of highest-reward rows to keep for training; set to `None` to use all rows.

In [None]:
# Reward-annotated JSONL input; adjust if stored elsewhere.
data_path = "rl_games_positions.jsonl"

# Hub repo id or local path of the checkpoint to start from.
base_model = "bot-rakshit/qwen-chess-0.5b-108k-merged-new"

# Directory handed to the trainer as its output location.
output_dir = "qwen-chess-0.5b-rl-aw-sft"

# Keep only the top-K rows by reward for speed; set to None to use all rows.
top_k = 50_000


In [None]:
# Read the JSONL file as a single "train" split.
ds = load_dataset("json", data_files=data_path, split="train")

if top_k is not None:
    # Order rows from highest to lowest reward ...
    ds = ds.sort("reward", reverse=True)
    # ... then keep at most top_k of them (fewer if the dataset is smaller).
    ds = ds.select(range(min(top_k, len(ds))))

print(ds)


In [None]:
# `os` and `torch` come from the import cell at the top of the notebook.
# HF access token read from the environment (None when HF_TOKEN is unset).
tok = os.environ.get('HF_TOKEN')

tokenizer = AutoTokenizer.from_pretrained(base_model, token=tok)
if tokenizer.pad_token is None:
    # Reuse EOS as the padding token when the tokenizer defines none.
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.model_max_length = 384

# bfloat16 needs native hardware support (CUDA compute capability >= 8.0).
# A T4 reports 7.5, so on such GPUs -- and on CPU -- keep the weights in float32
# instead of unconditionally picking bfloat16 whenever CUDA is present.
has_native_bf16 = (
    torch.cuda.is_available()
    and torch.cuda.get_device_capability()[0] >= 8
    and torch.cuda.is_bf16_supported()
)

model = AutoModelForCausalLM.from_pretrained(
    base_model,
    torch_dtype=torch.bfloat16 if has_native_bf16 else torch.float32,
    device_map="auto",
    token=tok,
)


In [None]:
# LoRA adapter on all attention and MLP projection matrices.
peft_config = LoraConfig(
    r=16, lora_alpha=32, lora_dropout=0.05, bias="none", task_type="CAUSAL_LM",
    target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"]
)

# Pick the mixed-precision mode from the hardware instead of hard-coding bf16:
# bf16 needs native support (compute capability >= 8.0); a T4 is 7.5, so it gets
# fp16 autocast instead, and a CPU-only run uses neither.
use_bf16 = (
    torch.cuda.is_available()
    and torch.cuda.get_device_capability()[0] >= 8
    and torch.cuda.is_bf16_supported()
)
use_fp16 = torch.cuda.is_available() and not use_bf16

training_args = SFTConfig(
    output_dir=output_dir,
    num_train_epochs=1,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=8,  # 4 * 8 = 32 sequences per optimizer step per device
    learning_rate=1e-5,
    logging_steps=20,
    save_strategy="epoch",
    bf16=use_bf16,
    fp16=use_fp16,
    report_to="none",
    warmup_ratio=0.03,
    gradient_checkpointing=True,
)


In [None]:
# Assemble the SFT trainer; the LoRA config is passed via `peft_config`.
trainer = SFTTrainer(
    args=training_args,
    model=model,
    processing_class=tokenizer,
    peft_config=peft_config,
    train_dataset=ds,
)

# Run training as configured in `training_args`.
trainer.train()

# Write the trained model to `output_dir` and report where it went.
trainer.save_model(output_dir)
print("Saved adapter to", output_dir)


## Optional: merge LoRA into full weights
Run this cell only if you want a single merged checkpoint (base weights with the LoRA adapter folded in) for inference or submission.

In [None]:
# Only this optional cell needs PeftModel, so it is imported here.
from peft import PeftModel

merge_out = "qwen-chess-0.5b-rl-aw-merged"
tok_env = os.environ.get('HF_TOKEN')

# Load a fresh copy of the base checkpoint, attach the adapter saved in
# `output_dir`, then fold the LoRA weights into the base weights.
model_base = AutoModelForCausalLM.from_pretrained(base_model, torch_dtype=torch.bfloat16, device_map="auto", token=tok_env)
model_base = PeftModel.from_pretrained(model_base, output_dir)
merged = model_base.merge_and_unload()

# Save the tokenizer next to the merged weights. `tokenizer` is used directly:
# the global `tok` holds the HF token string set in the model-loading cell and
# must not be rebound to a tokenizer object here.
tokenizer.save_pretrained(merge_out)
merged.save_pretrained(merge_out)
print('Merged model saved to', merge_out)
