In [67]:
# 0. imports
# pip install bitsandbytes
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer, BitsAndBytesConfig
from trl import PPOTrainer, PPOConfig, AutoModelForCausalLMWithValueHead, DataCollatorForCompletionOnlyLM
from peft import LoraConfig


# Load a pretrained model
MODEL_PATH = "distilgpt2"

# Quantization settings: load weights in 4-bit NF4 with double quantization,
# using bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# LoRA adapter settings handed to the value-head wrapper through `peft_config`.
lora_config = LoraConfig(
    r=4,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

# Policy model (causal LM + value head), quantized and wrapped with the LoRA config.
model = AutoModelForCausalLMWithValueHead.from_pretrained(MODEL_PATH, quantization_config=bnb_config, peft_config=lora_config, device_map="auto")

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
# Reuse EOS as the padding token. Done before the collator is built so the
# tokenizer is fully configured by the time anything pads with it.
tokenizer.pad_token_id = tokenizer.eos_token_id

# `response_template` is required by DataCollatorForCompletionOnlyLM; without it
# the constructor raises TypeError. The marker below is the one the prompt
# template in this notebook emits ("### Response:") -- keep it in sync with
# RESPONSE_KEY from src.instruction_pipeline.
# NOTE(review): this collator is not passed to PPOTrainer in the cells visible
# here -- confirm whether it is still needed.
data_collator = DataCollatorForCompletionOnlyLM(
    response_template="### Response:",
    tokenizer=tokenizer,
    mlm=False,
)



In [68]:
import pandas as pd
from datasets import Dataset, DatasetDict, load_from_disk
from src.instruction_pipeline import (
    INSTRUCTION_KEY,
    RESPONSE_KEY,
    END_KEY,
    INTRO_BLURB,
)

# Load the merged prompts table; the first CSV column becomes the DataFrame index.
df = pd.read_csv("data/prompts_merged.csv", index_col=0)

def combine_text(instruction):
    """Wrap an instruction in the prompt template.

    The result is the intro blurb, a blank line, the instruction marker and the
    instruction itself, a blank line, and finally the response marker followed
    by a newline, so the model's completion starts right after it.

    Args:
        instruction: The instruction text to embed in the template.

    Returns:
        The fully formatted prompt string.
    """
    preamble = f"{INTRO_BLURB}\n\n"
    instruction_part = f"{INSTRUCTION_KEY}\n{instruction}\n\n"
    response_part = f"{RESPONSE_KEY}\n"
    return preamble + instruction_part + response_part

# Render the prompt template for every row, then keep only the columns used downstream.
df['text'] = [combine_text(prompt) for prompt in df['prompt']]
df = df[['text', 'prompt']]

# The last 30 rows are held out for validation; everything before them is training data.
dataset = {
    'train': Dataset.from_pandas(df[:-30]),
    'validation': Dataset.from_pandas(df[-30:]),
}
dataset_dict = DatasetDict(dataset)
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['text', 'prompt', '__index_level_0__'],
        num_rows: 763
    })
    validation: Dataset({
        features: ['text', 'prompt', '__index_level_0__'],
        num_rows: 30
    })
})

In [69]:
# Show the first training record to sanity-check the rendered prompt template.
dataset_dict['train'][0]

{'text': 'Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nWrite a story about a character who faces rejection and learns the importance of self-love and self-acceptance.\n\n### Response:\n',
 'prompt': 'Write a story about a character who faces rejection and learns the importance of self-love and self-acceptance.',
 '__index_level_0__': 239}

In [97]:
# Token budget shared across the notebook: the padded/truncated length of
# tokenized inputs and the `max_new_tokens` limit used for generation.
SEQ_LEN = 256

def tokenize_function(example):
    """Add an `input_ids` entry to one dataset record.

    The record's `text` field is encoded with the module-level `tokenizer`,
    truncated and padded to exactly `SEQ_LEN` tokens.

    Args:
        example: A single record with a `text` field.

    Returns:
        The same record, with `input_ids` set.
    """
    encode_options = dict(truncation=True, padding="max_length", max_length=SEQ_LEN)
    example['input_ids'] = tokenizer.encode(example['text'], **encode_options)
    return example

def prepare_data(dataset):
    """Tokenize a dataset and switch its output format to PyTorch.

    Args:
        dataset: Object exposing `map`; each record must have a `text` field.

    Returns:
        The mapped dataset with `input_ids` added, the `text` column removed,
        and its format set to 'pt'.
    """
    tokenized = dataset.map(tokenize_function, remove_columns=['text'])
    # set_format changes the mapped object in place, so return that same object.
    tokenized.set_format('pt')
    return tokenized

# Tokenize both splits, then persist the result so it can be reloaded later.
data = prepare_data(dataset_dict)
data.save_to_disk("./output/dataset_gpt")

Map: 100%|██████████| 763/763 [00:00<00:00, 2955.24 examples/s]
Map: 100%|██████████| 30/30 [00:00<00:00, 2808.31 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 763/763 [00:00<00:00, 156921.35 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 30/30 [00:00<00:00, 6605.20 examples/s]


In [99]:
# Initialize trainer
ppo_config = PPOConfig(
    learning_rate=1.5e-5,
    ppo_epochs=1,
    batch_size=4,
)
ppo_trainer = PPOTrainer(
    config=ppo_config,
    model=model,
    tokenizer=tokenizer,
    dataset=data['train'],
)

# reward kwargs for reward model
reward_kwargs = {
    'return_all_scores': True,
    'function_to_apply': 'none',
    'batch_size': 16,
}

# generation kwargs for gpt2/dolly
gen_kwargs = {
    "min_length": -1,
    "top_k": 0.0,
    "top_p": 1.0,
    "do_sample": True,
    "pad_token_id": tokenizer.eos_token_id,
    'max_new_tokens': SEQ_LEN,
}

In [100]:
# Start from the accelerator's device; with a single process, replace it with
# a plain GPU index (or "cpu").
device = ppo_trainer.accelerator.device
if ppo_trainer.accelerator.num_processes == 1:
    # to avoid a `pipeline` bug
    if torch.cuda.is_available():
        device = 0
    else:
        device = "cpu"

# Reward model and its matching tokenizer, both loaded from the same checkpoint name.
reward_name = "OpenAssistant/reward-model-deberta-v3-large-v2"
reward_model = AutoModelForSequenceClassification.from_pretrained(reward_name, device_map=device)
reward_tokenizer = AutoTokenizer.from_pretrained(reward_name, torch_device=device)
def pipeline(batch):
    """Score a batch of texts with the reward model.

    Args:
        batch: Inputs accepted by `reward_tokenizer.batch_encode_plus`; the
            training loop passes a list of (prompt, completion) string pairs.

    Returns:
        The raw output of `reward_model(**inputs)`, computed without gradient
        tracking.
    """
    # Encode with the reward model's own tokenizer: the policy tokenizer has a
    # different vocabulary, so its ids are meaningless to the reward model.
    inputs = reward_tokenizer.batch_encode_plus(
        batch,
        return_tensors='pt',
        truncation=True,
        padding="max_length",
        max_length=SEQ_LEN,
    )
    # Follow the reward model's device instead of assuming CUDA is available.
    inputs = inputs.to(reward_model.device)
    # The reward model is only used for inference, so skip building the autograd graph.
    with torch.no_grad():
        scores = reward_model(**inputs)
    return scores

In [None]:
# `import tqdm` only binds the module, which is not callable; import the
# progress-bar class itself.
from tqdm.auto import tqdm

# One PPO optimisation step per dataloader batch.
for batch in tqdm(ppo_trainer.dataloader, desc="PPO"):
    # The trainer's step() is given per-sample tensors, so split the batch into
    # a list. This works whether `input_ids` arrives as a stacked tensor or as
    # a list already.
    query_tensors = list(batch["input_ids"])

    #### Get response from gpt2
    response_tensors = []
    for query in query_tensors:
        response = ppo_trainer.generate(query, **gen_kwargs)
        # Assumes generate() returns the prompt followed by the continuation
        # (TODO confirm for the installed TRL version). Slicing by the query
        # length keeps only new tokens; slicing the last SEQ_LEN tokens would
        # pull in query tokens whenever generation stops early.
        response_tensors.append(response.squeeze()[query.shape[0]:])
    batch["completion"] = [tokenizer.decode(r.squeeze()) for r in response_tensors]

    #### Compute reward score
    texts = list(zip(batch["prompt"], batch["completion"]))
    # `pipeline` takes only the batch and returns the raw reward-model output,
    # so read the logits directly instead of indexing per-label dicts.
    reward_outputs = pipeline(texts)
    # Assumes the reward model emits one logit per (prompt, completion) pair,
    # i.e. logits of shape (batch, 1) -- TODO confirm. Each reward becomes a
    # detached 0-dim CPU tensor.
    rewards = [score.detach().float().cpu() for score in reward_outputs.logits[:, 0]]

    #### Run PPO step
    stats = ppo_trainer.step(query_tensors, response_tensors, rewards)
    ppo_trainer.log_stats(stats, batch, rewards)

# Try this code in a Sagemaker notebook.