In [1]:
from datasets import Dataset, DatasetDict
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM
from trl import KTOTrainer, KTOConfig
from peft import LoraConfig, get_peft_model
import torch
from sklearn.model_selection import train_test_split

# Load dataset
df = pd.read_csv("C:\\Users\\CSE IIT BHILAI\\Desktop\\work\\pairs_for_pmponly.csv")

# Drop missing data
df = df.dropna(subset=["prompt", "response"]).reset_index(drop=True)

# Convert to required KTO columns
df["label"] = df["is_positive"].astype(bool)
df["completion"] = df["response"]

print(f"Total dataset size: {len(df)} rows")

train_df, test_df = train_test_split(df, test_size=0.3, random_state=42, stratify=df["label"])
print(f"Training samples: {len(train_df)}")
print(f"Validation samples: {len(test_df)}")

# Convert to Hugging Face DatasetDict
train_ds = Dataset.from_pandas(train_df[["prompt", "completion", "label"]])
test_ds = Dataset.from_pandas(test_df[["prompt", "completion", "label"]])
dataset = DatasetDict({"train": train_ds, "test": test_ds})

print(dataset)
print(dataset["train"][0])

  from .autonotebook import tqdm as notebook_tqdm


Total dataset size: 17428 rows
Training samples: 12199
Validation samples: 5229
DatasetDict({
    train: Dataset({
        features: ['prompt', 'completion', 'label', '__index_level_0__'],
        num_rows: 12199
    })
    test: Dataset({
        features: ['prompt', 'completion', 'label', '__index_level_0__'],
        num_rows: 5229
    })
})
{'prompt': 'liar', 'completion': 'you are a liar', 'label': False, '__index_level_0__': 11963}


In [11]:
len(df)
df.columns

Index(['prompt', 'response', 'is_positive', 'src_idx', 'label', 'completion'], dtype='object')

In [2]:
# ============================================
# LOAD BASE MODEL & TOKENIZER
# ============================================

model_name = "EleutherAI/gpt-neo-1.3B"  # or "gpt2-medium", "gpt2-large"

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"  # Important for KTO

device = "cuda" if torch.cuda.is_available() else "cpu"

# Load policy model (will be trained)
policy_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
).to(device)

# Load reference model (frozen)
ref_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
).to(device)

for param in ref_model.parameters():
    param.requires_grad = False

print("âœ… Models loaded successfully")

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
`torch_dtype` is deprecated! Use `dtype` instead!


âœ… Models loaded successfully


In [3]:
# ============================================
# APPLY LoRA TO POLICY MODEL
# ============================================

from peft import LoraConfig, get_peft_model

lora_config = LoraConfig(
    r=16,                          # LoRA rank
    lora_alpha=32,                 # LoRA alpha (2*r)
    target_modules=[
        "c_attn",                  # GPT-2 attention
        "c_proj",                  # GPT-2 projection
    ],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

# Apply LoRA
policy_model = get_peft_model(policy_model, lora_config)
policy_model.print_trainable_parameters()

print("âœ… LoRA applied to policy model")

trainable params: 3,932,160 || all params: 1,319,507,968 || trainable%: 0.2980
âœ… LoRA applied to policy model


In [4]:
# ============================================
# CONFIGURE KTO TRAINING
# ============================================

from trl import KTOConfig

training_args = KTOConfig(
    output_dir="./kto-hh-rlhf-checkpoints",
    
    # Training hyperparameters
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=8,
    learning_rate=5e-6,              # Lower than DPO
    num_train_epochs=3,
    
    # Optimization
    optim="adamw_torch",
    warmup_ratio=0.1,
    weight_decay=0.01,
    max_grad_norm=1.0,
    
    # Logging & Saving
    logging_steps=10,
    save_strategy="epoch",
    # evaluation_strategy="steps",
    eval_steps=100,
    save_total_limit=3,
    
    # KTO-specific parameters
    beta=0.1,                        # KTO temperature (like DPO)
    desirable_weight=1.0,            # Weight for good examples
    undesirable_weight=1.5,          # Weight for bad examples (Î» > 1 for loss aversion!)
    
    # Sequence length
    max_length=512,
    max_prompt_length=256,
    max_completion_length=256,
    
    # Other
    report_to="none",
    remove_unused_columns=False,
    fp16=True if device == "cuda" else False,
)

print("âœ… KTO Config created")
print(f"\nðŸ“Š Key KTO Parameters:")
print(f"  Î² (beta): {training_args.beta}")
print(f"  Î» (loss aversion): {training_args.undesirable_weight / training_args.desirable_weight}")
print(f"  Desirable weight: {training_args.desirable_weight}")
print(f"  Undesirable weight: {training_args.undesirable_weight}")

âœ… KTO Config created

ðŸ“Š Key KTO Parameters:
  Î² (beta): 0.1
  Î» (loss aversion): 1.5
  Desirable weight: 1.0
  Undesirable weight: 1.5


In [5]:
# ============================================
# INITIALIZE KTO TRAINER
# ============================================

trainer = KTOTrainer(
    model=policy_model,
    ref_model=ref_model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    processing_class=tokenizer,
)

print("âœ… Trainer initialized successfully!")
print(f"Training samples: {len(dataset['train'])}")
print(f"Eval samples: {len(dataset['test'])}")

Extracting prompt from train dataset:   0%|          | 0/12199 [00:00<?, ? examples/s]

Extracting prompt from train dataset: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 12199/12199 [00:00<00:00, 34796.96 examples/s]
Applying chat template to train dataset: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 12199/12199 [00:00<00:00, 20970.51 examples/s]
Extracting prompt from eval dataset: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 5229/5229 [00:00<00:00, 39074.65 examples/s]
Applying chat template to eval dataset: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 5229/5229 [00:00<00:00, 31477.91 examples/s]
Tokenizing train dataset:   0%|          | 0/12199 [00:00<?, ? examples/s]Token indices sequence length is longer than the specified maximum sequence length for this model (168484 > 2048). Running this sequence through the model will result in indexing errors
Tokenizing train dataset: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 12199/12199 [00:03<00:00, 3204.98 examples/s]
Processing tokenized train dataset: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 12199/12199 [00:04<00:00, 2847.90 examples/s]
Tokenizing eval dataset:

âœ… Trainer initialized successfully!
Training samples: 12199
Eval samples: 5229


In [6]:
print("\nðŸš€ Starting KTO fine-tuning...")
train_result = trainer.train()

print("\nâœ… Training complete!")
print(f"Training Loss: {train_result.training_loss:.4f}")

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 50256}.



ðŸš€ Starting KTO fine-tuning...


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss
10,0.6188
20,0.6141
30,0.6219
40,0.6155
50,0.6109
60,0.6092
70,0.6218
80,0.6123
90,0.6
100,0.6202





âœ… Training complete!
Training Loss: 0.4637


In [13]:
import torch
from tqdm import tqdm
import numpy as np

def compute_perplexity(model, tokenizer, dataset, max_samples=5000):
    model.eval()
    losses = []

    for example in tqdm(dataset.select(range(min(max_samples, len(dataset)))), desc="Evaluating"):
        text = example["prompt"] + " " + example["completion"]
        inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512).to(model.device)

        with torch.no_grad():
            outputs = model(**inputs, labels=inputs["input_ids"])
            loss = outputs.loss.item()
        losses.append(loss)

    mean_loss = np.mean(losses)
    perplexity = np.exp(mean_loss)
    return perplexity

# Evaluate on validation split
ppl = compute_perplexity(policy_model, tokenizer, dataset["test"])
print(f"\nðŸ§  Perplexity on test set: {ppl:.3f}")


Evaluating: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 5000/5000 [06:05<00:00, 13.69it/s]


ðŸ§  Perplexity on test set: 17.386



