In [1]:
import os
import torch
# Set GPU device
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

os.environ['http_proxy']  = 'http://192.41.170.23:3128'
os.environ['https_proxy'] = 'http://192.41.170.23:3128'
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [2]:
import torch
from datasets import Dataset, load_dataset
from transformers import (
    AutoModelForCausalLM, 
    AutoTokenizer, 
    HfArgumentParser, 
    TrainingArguments
)

from typing import Dict, Optional
from trl import DPOTrainer

In [3]:
model_name_or_path = "gpt2"
ignore_bias_buffers = False

model = AutoModelForCausalLM.from_pretrained(model_name_or_path)
if ignore_bias_buffers:
    # torch distributed hack
    model._ddp_params_and_buffers_to_ignore = [
        name for name, buffer in model.named_buffers() if buffer.dtype == torch.bool
    ]

model_ref = AutoModelForCausalLM.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [4]:
from datasets import load_dataset

# Load the SHP dataset
dataset = load_dataset("stanfordnlp/SHP")

Resolving data files:   0%|          | 0/18 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/18 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/18 [00:00<?, ?it/s]

In [5]:
# Inspecting the first example
print(dataset["train"][0])

{'post_id': 'himc90', 'domain': 'askacademia_train', 'upvote_ratio': 0.99, 'history': 'In an interview right before receiving the 2013 Nobel prize in physics, Peter Higgs stated that he wouldn\'t be able to get an academic job today, because he wouldn\'t be regarded as productive enough. > By the time he retired in 1996, he was uncomfortable with the new academic culture. "After I retired it was quite a long time before I went back to my department. I thought I was well out of it. It wasn\'t my way of doing things any more. Today I wouldn\'t get an academic job. It\'s as simple as that. I don\'t think I would be regarded as productive enough."  Another interesting quote from the article is the following:  > He doubts a similar breakthrough could be achieved in today\'s academic culture, because of the expectations on academics to collaborate and keep churning out papers. He said: "It\'s difficult to imagine how I would ever have enough peace and quiet in the present sort of climate to 

In [6]:
#DataSet details
#where the fields are:
#post_id: the ID of the Reddit post (string)
#domain: the subreddit and split the example is drawn from, separated by an underscore (string)
#upvote_ratio: the percent of votes received by the post that were positive (aka upvotes) (float)
#history: the post title concatented to the post body (string)
#c_root_id_A: the ID of comment A (string)
#c_root_id_B: the ID of comment B (string)
#created_at_utc_A: utc timestamp of when comment A was created (integer)
#created_at_utc_B: utc timestamp of when comment B was created (integer)
#score_A: (# positive votes - # negative votes + 1) received by comment A (integer)
#score_B: (# positive votes - # negative votes + 1) received by comment B (integer)
#human_ref_A: text of comment A (string)
#human_ref_B: text of comment B (string)
#labels: the preference label -- it is 1 if A is preferred to B; 0 if B is preferred to A. 
#This was randomized such that the label distribution is roughly 50/50. (integer)
#seconds_difference: how many seconds after the less preferred comment the more preferred one was created (will always be >= 0) (integer)
#score_ratio: the ratio of the more preferred comment's score to the less preferred comment's score (will be >= 1) (float)

In [7]:
#Now the goal is to preprocess the data into the following format:
#    "prompt": "The question or context",
#    "chosen": "The preferred response",
#   "rejected": "The non-preferred response"

In [8]:
#For the prompt i will be using history as it contains the title and body
#For the chosen and rejected i will be mapping the label with human_ref_A and human_ref_B 

In [9]:
from datasets import load_dataset
from transformers import AutoTokenizer
# Preprocessing function
def preprocess_shp_dataset(sample):
    prompt = sample["history"]
    if sample["labels"] == 1:
        chosen = sample["human_ref_A"]
        rejected = sample["human_ref_B"]
    else:
        chosen = sample["human_ref_B"]
        rejected = sample["human_ref_A"]
    return {
        "prompt": prompt,
        "chosen": chosen,
        "rejected": rejected,
    }

# Apply preprocessing
dataset = dataset.map(preprocess_shp_dataset)

# List of columns to remove
columns_to_remove = [
    "post_id", "domain", "upvote_ratio", "c_root_id_A", "c_root_id_B",
    "created_at_utc_A", "created_at_utc_B", "score_A", "score_B",
    "human_ref_A", "human_ref_B", "labels", "seconds_difference", "score_ratio",
    "history"  # Explicitly remove the 'history' field
]

# Remove unnecessary columns
dataset = dataset.remove_columns(columns_to_remove)

# Inspect a sample from the preprocessed dataset
print("Preprocessed dataset sample:", dataset["train"][0])

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("gpt2")
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Tokenization function
def tokenize_function(batch):
    # Tokenize the prompt, chosen, and rejected texts
    prompt_encodings = tokenizer(
        batch["prompt"], truncation=True, padding="max_length", max_length=128
    )
    chosen_encodings = tokenizer(
        batch["chosen"], truncation=True, padding="max_length", max_length=128
    )
    rejected_encodings = tokenizer(
        batch["rejected"], truncation=True, padding="max_length", max_length=128
    )
    
    # Return only the tokenized outputs (no raw text)
    return {
        "prompt_input_ids": prompt_encodings["input_ids"],
        "prompt_attention_mask": prompt_encodings["attention_mask"],
        "chosen_input_ids": chosen_encodings["input_ids"],
        "chosen_attention_mask": chosen_encodings["attention_mask"],
        "rejected_input_ids": rejected_encodings["input_ids"],
        "rejected_attention_mask": rejected_encodings["attention_mask"],
    }

# Apply tokenization to the dataset
train_dataset = dataset["train"].map(tokenize_function, batched=True)
eval_dataset = dataset["test"].map(tokenize_function, batched=True)

# Verify the tokenized dataset
print("Tokenized train dataset sample:", train_dataset[0])
print("Tokenized eval dataset sample:", eval_dataset[0])

Preprocessed dataset sample: {'prompt': 'In an interview right before receiving the 2013 Nobel prize in physics, Peter Higgs stated that he wouldn\'t be able to get an academic job today, because he wouldn\'t be regarded as productive enough. > By the time he retired in 1996, he was uncomfortable with the new academic culture. "After I retired it was quite a long time before I went back to my department. I thought I was well out of it. It wasn\'t my way of doing things any more. Today I wouldn\'t get an academic job. It\'s as simple as that. I don\'t think I would be regarded as productive enough."  Another interesting quote from the article is the following:  > He doubts a similar breakthrough could be achieved in today\'s academic culture, because of the expectations on academics to collaborate and keep churning out papers. He said: "It\'s difficult to imagine how I would ever have enough peace and quiet in the present sort of climate to do what I did in 1964."  Source (the whole art

In [10]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['prompt', 'chosen', 'rejected'],
        num_rows: 348718
    })
    validation: Dataset({
        features: ['prompt', 'chosen', 'rejected'],
        num_rows: 18436
    })
    test: Dataset({
        features: ['prompt', 'chosen', 'rejected'],
        num_rows: 18409
    })
})


In [11]:
train_dataset

Dataset({
    features: ['prompt', 'chosen', 'rejected', 'prompt_input_ids', 'prompt_attention_mask', 'chosen_input_ids', 'chosen_attention_mask', 'rejected_input_ids', 'rejected_attention_mask'],
    num_rows: 348718
})

In [13]:
from trl import DPOTrainer, DPOConfig
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import Dataset
import torch
import json

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")
ref_model = AutoModelForCausalLM.from_pretrained("gpt2")

# Set a padding token if not already defined
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Define maximum sequence length
max_length = 1024

# Preprocess the dataset
def preprocess_function(examples):
    prompt_encodings = tokenizer(examples["prompt"], truncation=True, padding="max_length", max_length=max_length)
    chosen_encodings = tokenizer(examples["chosen"], truncation=True, padding="max_length", max_length=max_length)
    rejected_encodings = tokenizer(examples["rejected"], truncation=True, padding="max_length", max_length=max_length)
    return {
        "prompt_input_ids": prompt_encodings["input_ids"],
        "prompt_attention_mask": prompt_encodings["attention_mask"],
        "chosen_input_ids": chosen_encodings["input_ids"],
        "chosen_attention_mask": chosen_encodings["attention_mask"],
        "rejected_input_ids": rejected_encodings["input_ids"],
        "rejected_attention_mask": rejected_encodings["attention_mask"],
    }

# Apply preprocessing to the dataset
train_dataset = train_dataset.map(preprocess_function, batched=True, batch_size=1000)
eval_dataset = eval_dataset.map(preprocess_function, batched=True, batch_size=1000)


Map:   0%|          | 0/348718 [00:00<?, ? examples/s]

Map:   0%|          | 0/18409 [00:00<?, ? examples/s]

In [15]:
from trl import DPOTrainer, DPOConfig
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import Dataset
import torch
from tqdm import tqdm

# Define hyperparameter combinations (batch size fixed at 4)
hyperparameter_combinations = [
    {"learning_rate": 1e-5, "gradient_accumulation_steps": 4, "beta": 0.1, "num_train_epochs": 3, "loss_type": "sigmoid"},
    {"learning_rate": 1e-4, "gradient_accumulation_steps": 4, "beta": 0.5, "num_train_epochs": 5, "loss_type": "hinge"},
    {"learning_rate": 1e-3, "gradient_accumulation_steps": 4, "beta": 1.0, "num_train_epochs": 1, "loss_type": "sigmoid"},
]

# Experiment with hyperparameters
results = []
for params in hyperparameter_combinations:
    print(f"Training with hyperparameters: {params}")
    
    # Update DPOConfig
    dpo_config = DPOConfig(
        output_dir=f"./dpo_model_{params['learning_rate']}_{params['beta']}",  # Unique output directory
        learning_rate=params["learning_rate"],
        per_device_train_batch_size=2,  # Fixed batch size
        gradient_accumulation_steps=params["gradient_accumulation_steps"],
        max_steps=1000,
        num_train_epochs=params["num_train_epochs"],
        logging_dir="./logs",
        logging_steps=10,
        save_steps=500,
        eval_strategy="steps",
        eval_steps=500,
        save_total_limit=2,
        fp16=True,  # Mixed precision training
        gradient_checkpointing=True,  # Gradient checkpointing
        report_to=None,
        beta=params["beta"],
        loss_type=params["loss_type"],
        padding_value=tokenizer.pad_token_id,
    )
    
    # Initialize DPOTrainer
    dpo_trainer = DPOTrainer(
        model=model,
        ref_model=ref_model,
        args=dpo_config,  # Use DPOConfig instead of TrainingArguments
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        processing_class=tokenizer,  # Use `processing_class` instead of `tokenizer`
    )
    
    # Train the model
    train_result = dpo_trainer.train()
    
    # Evaluate the model
    eval_result = dpo_trainer.evaluate()
    
    # Check if evaluation metrics are present
    if "rewards/chosen" not in eval_result:
        print("Evaluation metrics are missing. Computing manually...")
        # Manually compute evaluation metrics
        eval_result = {
            "eval_loss": eval_result.get("eval_loss", None),
            "rewards/chosen": None,
            "rewards/rejected": None,
            "rewards/accuracies": None,
            "rewards/margins": None,
            "logps/chosen": None,
            "logps/rejected": None,
        }
    
    # Record results
    results.append({
        "hyperparameters": params,
        "train_loss": train_result.training_loss,
        "eval_loss": eval_result["eval_loss"],
        "rewards_chosen": eval_result["rewards/chosen"],
        "rewards_rejected": eval_result["rewards/rejected"],
        "rewards_accuracies": eval_result["rewards/accuracies"],
        "rewards_margins": eval_result["rewards/margins"],
        "logps_chosen": eval_result["logps/chosen"],
        "logps_rejected": eval_result["logps/rejected"],
    })
    
    # Save the model and tokenizer
    model.save_pretrained(dpo_config.output_dir)
    tokenizer.save_pretrained(dpo_config.output_dir)
    
    # Save the trainer state
    dpo_trainer.save_model(dpo_config.output_dir)
    
    # Save training metrics
    metrics = {
        "train_loss": train_result.training_loss,
        "eval_loss": eval_result["eval_loss"],
        "rewards_chosen": eval_result["rewards/chosen"],
        "rewards_rejected": eval_result["rewards/rejected"],
        "rewards_accuracies": eval_result["rewards/accuracies"],
        "rewards_margins": eval_result["rewards/margins"],
        "logps_chosen": eval_result["logps/chosen"],
        "logps_rejected": eval_result["logps/rejected"],
    }
    
    with open(f"{dpo_config.output_dir}/training_metrics.json", "w") as f:
        json.dump(metrics, f, indent=4)
    
    # Save hyperparameters
    hyperparameters = {
        "learning_rate": dpo_config.learning_rate,
        "batch_size": dpo_config.per_device_train_batch_size,
        "gradient_accumulation_steps": dpo_config.gradient_accumulation_steps,
        "beta": dpo_config.beta,
        "num_train_epochs": dpo_config.num_train_epochs,
        "loss_type": dpo_config.loss_type,
    }
    
    with open(f"{dpo_config.output_dir}/hyperparameters.json", "w") as f:
        json.dump(hyperparameters, f, indent=4)
    
    print(f"Model, tokenizer, and training artifacts saved to {dpo_config.output_dir}")
    
    # Clear GPU memory
    torch.cuda.empty_cache()

# Print results
print("\nExperiment Results:")
for result in results:
    print(f"Hyperparameters: {result['hyperparameters']}")
    print(f"Training Loss: {result['train_loss']}")
    print(f"Evaluation Loss: {result['eval_loss']}")
    print(f"Rewards/Chosen: {result['rewards_chosen']}")
    print(f"Rewards/Rejected: {result['rewards_rejected']}")
    print(f"Rewards/Accuracies: {result['rewards_accuracies']}")
    print(f"Rewards/Margins: {result['rewards_margins']}")
    print(f"Logps/Chosen: {result['logps_chosen']}")
    print(f"Logps/Rejected: {result['logps_rejected']}")
    print("-" * 50)

Training with hyperparameters: {'learning_rate': 1e-05, 'gradient_accumulation_steps': 4, 'beta': 0.1, 'num_train_epochs': 3, 'loss_type': 'sigmoid'}


Extracting prompt in train dataset:   0%|          | 0/348718 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/348718 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/348718 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1212 > 1024). Running this sequence through the model will result in indexing errors


Extracting prompt in eval dataset:   0%|          | 0/18409 [00:00<?, ? examples/s]

Applying chat template to eval dataset:   0%|          | 0/18409 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/18409 [00:00<?, ? examples/s]

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss,Validation Loss,Rewards/chosen,Rewards/rejected,Rewards/accuracies,Rewards/margins,Logps/chosen,Logps/rejected,Logits/chosen,Logits/rejected
500,0.6573,0.643706,0.334662,0.051408,0.617832,0.283254,-455.476532,-297.744812,,
1000,0.6399,0.635729,0.374762,0.055464,0.624729,0.319298,-455.0755,-297.704285,,


Evaluation metrics are missing. Computing manually...
Model, tokenizer, and training artifacts saved to ./dpo_model_1e-05_0.1
Training with hyperparameters: {'learning_rate': 0.0001, 'gradient_accumulation_steps': 4, 'beta': 0.5, 'num_train_epochs': 5, 'loss_type': 'hinge'}


Applying chat template to train dataset:   0%|          | 0/348718 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/348718 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Rewards/chosen,Rewards/rejected,Rewards/accuracies,Rewards/margins,Logps/chosen,Logps/rejected,Logits/chosen,Logits/rejected
500,2.6655,3.802597,-9.335428,-9.425274,0.491366,0.089845,-477.493958,-317.109406,,
1000,2.6416,2.326569,-5.125587,-6.470866,0.556907,1.345279,-469.074341,-311.200623,,


Evaluation metrics are missing. Computing manually...
Model, tokenizer, and training artifacts saved to ./dpo_model_0.0001_0.5
Training with hyperparameters: {'learning_rate': 0.001, 'gradient_accumulation_steps': 4, 'beta': 1.0, 'num_train_epochs': 1, 'loss_type': 'sigmoid'}


Step,Training Loss,Validation Loss,Rewards/chosen,Rewards/rejected,Rewards/accuracies,Rewards/margins,Logps/chosen,Logps/rejected,Logits/chosen,Logits/rejected
500,57.8758,68.257011,-187.568954,-147.685074,0.411273,-39.883907,-646.39209,-445.943909,,
1000,31.0299,33.313473,-219.706558,-209.438049,0.490226,-10.268484,-678.529724,-507.696991,,


Evaluation metrics are missing. Computing manually...
Model, tokenizer, and training artifacts saved to ./dpo_model_0.001_1.0

Experiment Results:
Hyperparameters: {'learning_rate': 1e-05, 'gradient_accumulation_steps': 4, 'beta': 0.1, 'num_train_epochs': 3, 'loss_type': 'sigmoid'}
Training Loss: 0.6360240335464478
Evaluation Loss: 0.6357294321060181
Rewards/Chosen: None
Rewards/Rejected: None
Rewards/Accuracies: None
Rewards/Margins: None
Logps/Chosen: None
Logps/Rejected: None
--------------------------------------------------
Hyperparameters: {'learning_rate': 0.0001, 'gradient_accumulation_steps': 4, 'beta': 0.5, 'num_train_epochs': 5, 'loss_type': 'hinge'}
Training Loss: 2.601649865150452
Evaluation Loss: 2.3265693187713623
Rewards/Chosen: None
Rewards/Rejected: None
Rewards/Accuracies: None
Rewards/Margins: None
Logps/Chosen: None
Logps/Rejected: None
--------------------------------------------------
Hyperparameters: {'learning_rate': 0.001, 'gradient_accumulation_steps': 4, 'be

In [None]:
#Pushing best combination on hugging face. that is hyper parameter lr: 1e-05