In [1]:
!pip install transformers trl==0.18.1 datasets accelerate


Collecting trl==0.18.1
  Downloading trl-0.18.1-py3-none-any.whl.metadata (11 kB)
Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata 

In [None]:
import shutil
import pandas as pd
import torch
from accelerate import PartialState
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    AutoModelForSequenceClassification,
    BitsAndBytesConfig
)
from trl import (
    PPOConfig,
    PPOTrainer,
    ModelConfig,
    ScriptArguments,
    get_peft_config,
)
from trl.trainer.utils import SIMPLE_CHAT_TEMPLATE


# ====== PATHS (adjust these) ======
SFT_MODEL_PATH = "/content/drive/MyDrive/tinyllama-lora-sft-tuned-model"
REWARD_MODEL_PATH = "EleutherAI/pythia-1b-deduped"
CSV_PATH = "/content/drive/MyDrive/PPO_prompt_dataset.csv"
OUTPUT_DIR = "/content/drive/MyDrive/ppo_output"


# ====== Custom Dataset Class ======
class CustomArgs:
    dataset_name = None
    dataset_config = None
    dataset_train_split = "train"
    dataset_test_split = None


In [11]:
!pip install -U bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.46.0-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Downloading bitsandbytes-0.46.0-py3-none-manylinux_2_24_x86_64.whl (67.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.0/67.0 MB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.46.0


In [6]:
from transformers import PreTrainedTokenizerBase


In [None]:
if __name__ == "__main__":
    script_args = CustomArgs()
    training_args = PPOConfig(
        output_dir=OUTPUT_DIR,
        per_device_train_batch_size=1,            
        gradient_accumulation_steps=8,
        learning_rate=3e-6,
        total_episodes=5000,
        eval_strategy="no",
        response_length=32,                       
        stop_token="eos",
        seed=42
    )
    model_config = ModelConfig(
        model_name_or_path=SFT_MODEL_PATH,
        trust_remote_code=True,
        torch_dtype="auto",
        use_peft=True,
    )

    shutil.rmtree(training_args.output_dir, ignore_errors=True)

    tokenizer = AutoTokenizer.from_pretrained(
        model_config.model_name_or_path,
        padding_side="left"
    )
    tokenizer.add_special_tokens({"pad_token": "[PAD]"})
    if tokenizer.chat_template is None:
        tokenizer.chat_template = SIMPLE_CHAT_TEMPLATE

    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
    )

    policy_model = AutoModelForCausalLM.from_pretrained(
        model_config.model_name_or_path,
        device_map="auto",
        quantization_config=bnb_config,
        trust_remote_code=True
    )
    policy_model.gradient_checkpointing_enable()

    reward_model = AutoModelForSequenceClassification.from_pretrained(
        REWARD_MODEL_PATH,
        num_labels=1,
        device_map="auto",
        quantization_config=bnb_config,
        trust_remote_code=True
    )
    value_model = AutoModelForSequenceClassification.from_pretrained(
        REWARD_MODEL_PATH,
        num_labels=1,
        device_map="auto",
        quantization_config=bnb_config,
        trust_remote_code=True
    )

    ref_policy = None
    peft_config = get_peft_config(model_config)

    df = pd.read_csv(CSV_PATH)

    assert "prompt" in df.columns, " Missing 'prompt' column in CSV."

    dataset = Dataset.from_pandas(df)

    def format_dataset(example):
        return {
            "messages": [{"role": "user", "content": example["prompt"]}]
        }

    dataset = dataset.map(format_dataset)
    print(dataset)

    def tokenize(example):
      prompt_text = tokenizer.apply_chat_template(
          example["messages"],
          add_generation_prompt=True,
          return_tensors=None,
          tokenize=False,
      )

      tokenized = tokenizer(prompt_text, truncation=True, max_length=512, return_tensors="pt")
      return {
          "input_ids": tokenized["input_ids"][0],
          "attention_mask": tokenized["attention_mask"][0],
          "lengths": tokenized["input_ids"].shape[-1]
      }

    with PartialState().local_main_process_first():
        dataset = dataset.map(tokenize, remove_columns=dataset.column_names)
        dataset = dataset.filter(lambda x: x["lengths"] <= 512)
        print("Samples after filtering:", len(dataset))

        dataset.set_format(type="torch")
    train_test_split = dataset.train_test_split(test_size=0.1, seed=42) # Adjust test_size as needed
    train_dataset = train_test_split["train"]
    eval_dataset = train_test_split["test"]


    trainer = PPOTrainer(
        args=training_args,
        model=policy_model,
        ref_model=ref_policy,
        processing_class=tokenizer,
        reward_model=reward_model,
        value_model=value_model,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        peft_config=peft_config,
    )



Some weights of GPTNeoXForSequenceClassification were not initialized from the model checkpoint at EleutherAI/pythia-1b-deduped and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of GPTNeoXForSequenceClassification were not initialized from the model checkpoint at EleutherAI/pythia-1b-deduped and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/6236 [00:00<?, ? examples/s]

Dataset({
    features: ['prompt', 'messages'],
    num_rows: 6236
})


Map:   0%|          | 0/6236 [00:00<?, ? examples/s]

Filter:   0%|          | 0/6236 [00:00<?, ? examples/s]

✅ Samples after filtering: 6236




In [None]:
    trainer.train()
#https://github.com/huggingface/trl/blob/main/examples/scripts/ppo/ppo_tldr.py

===training policy===




Step,Training Loss




RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
