## Imports

In [None]:
# This notebook comes after running `PPO TRL - adapted.ipynb`.

In [None]:
!pip install trl
!pip install datasets
!pip install accelerate

Collecting trl
  Downloading trl-0.8.6-py3-none-any.whl (245 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/245.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━[0m [32m235.5/245.2 kB[0m [31m8.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m245.2/245.2 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
Collecting accelerate (from trl)
  Downloading accelerate-0.30.1-py3-none-any.whl (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.6/302.6 kB[0m [31m33.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets (from trl)
  Downloading datasets-2.19.1-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m51.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tyro>=0.5.11 (from trl)
  Downloading tyro-0.8.4-py3-none-any.whl (102 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# Mount Google Drive so the dataset CSV and the model directories under /content/drive are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from tqdm import tqdm
import os

import torch
from torch.utils.data import Dataset
from datasets import load_dataset

from trl import PPOTrainer, PPOConfig, AutoModelForCausalLMWithValueHead, create_reference_model
from transformers import AutoTokenizer, pipeline
from transformers import T5Tokenizer, T5ForConditionalGeneration, AutoModelForSequenceClassification

## Dataset


In [None]:
# CSV with the prompts used for PPO (read by SubredditQuestionDataset).
path = "/content/drive/MyDrive/modified_method1_data.csv"
# Directory the policy tokenizer and the policy model are both loaded from.
# NOTE(review): SubredditQuestionDataset's own `tokenizer_path` default points at a
# different folder (no `NLP/` component) — confirm which location is intended.
tokenizer_path='/content/drive/MyDrive/NLP/SFT_GPT-2M_Dolly15k'

In [None]:
# Wrap the CSV prompt dataset in a torch.utils.data.Dataset
class SubredditQuestionDataset(Dataset):
    """Prompt dataset for PPO: questions read from a CSV and tokenized with the GPT-2 medium tokenizer.

    Each item is a dict that contains at least ``query`` (the question text,
    renamed from the CSV's ``Questions`` column) and ``input_ids`` (its token
    ids, returned in torch format).

    Args:
        path: CSV file passed to ``datasets.load_dataset('csv', ...)``. It must
            have a ``Questions`` column and a ``Domain`` column (the latter is dropped).
        block_size: Maximum number of tokens kept per question; longer questions
            are truncated to this length.
        num_records: Number of leading rows to keep. Clamped to the number of
            rows actually present in the CSV.
        tokenizer_path: Currently unused -- the tokenizer is always loaded from
            ``'gpt2-medium'``. Kept so existing callers keep working.
    """

    def __init__(self, path, block_size=512, num_records=512, tokenizer_path='/content/drive/MyDrive/SFT_GPT-2M_Dolly15k'):
        super().__init__()
        dataset = load_dataset('csv',data_files=path)
        # `model_max_length` is the tokenizer setting that bounds sequence length;
        # the previous `padding=True, max_length=...` arguments did not truncate anything.
        tokenizer = AutoTokenizer.from_pretrained('gpt2-medium', model_max_length=block_size, padding_side='left')
        tokenizer.pad_token = tokenizer.eos_token

        print('Loading the dataset- method 1')

        def tokenize_this(sample):
            # sample['Question'] = 'Question:'+sample['Question']+'\n Answer:'
            # Truncate explicitly so no prompt exceeds `block_size` tokens.
            sample['input_ids'] = tokenizer.encode(sample['Questions'], truncation=True, max_length=block_size)
            return sample

        dataset = dataset.remove_columns(['Domain'])
        train_split = dataset['train']
        # Asking for more rows than the CSV holds would make `select` fail; take what is available.
        num_records = min(num_records, len(train_split))
        dataset = train_split.select(range(num_records))
        dataset = dataset.map(tokenize_this, batched=False)
        dataset = dataset.rename_columns({'Questions':'query'})
        dataset.set_format(type="torch")

        self.dataset = dataset

    def __len__(self):
        """Return the number of prompts kept."""
        return len(self.dataset)

    def __getitem__(self, index):
        """Return the record at ``index`` from the underlying dataset."""
        return self.dataset[index]


In [None]:
# Build the prompt dataset from the first 4000 rows of the CSV; `tokenizer_path` is left at its default.
subreddit_question_dataset = SubredditQuestionDataset(path, block_size=512, num_records=4000)

Generating train split: 0 examples [00:00, ? examples/s]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Loading the dataset- method 1


Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

In [None]:
def collator(data):
    """Turn a list of sample dicts into one dict of lists.

    The keys of the first sample decide which fields are collected; each
    field maps to the list of that field's values across all samples, in order.
    """
    batch = {}
    for field in data[0]:
        batch[field] = [sample[field] for sample in data]
    return batch

## RLHF Setup and Training

#### Load tokenizer, model, ref model, config

In [None]:
# Tokenizer for the policy model, loaded from the same directory as the model itself.
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, max_length=512)


# tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, max_length=512) trained on this for iter1 part 1 2000
# tokenizer = AutoTokenizer.from_pretrained('gpt2-medium', max_length=512, padding_side='left')
# Reuse the EOS token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# PPO hyperparameters. With the 4000 prompts loaded above, batch_size=16 gives
# 250 batches per pass over the dataloader.
# NOTE(review): `steps=250` appears chosen to match that count — confirm how PPOConfig uses it.
config = PPOConfig(
    model_name='gpt2-medium',
    batch_size=16,
    mini_batch_size=8,
    steps=250,
)

In [None]:
# model = AutoModelForCausalLMWithValueHead.from_pretrained(tokenizer_path) trained on this for iter1 part 1 2000
# model = AutoModelForCausalLMWithValueHead.from_pretrained('gpt2-medium')


# Policy model with a value head, loaded from the same directory as the tokenizer.
model = AutoModelForCausalLMWithValueHead.from_pretrained(tokenizer_path)
# Reference model derived from the policy via TRL's helper.
# NOTE(review): presumably a frozen copy used for the KL term — confirm.
ref_model = create_reference_model(model)


# ref_model = AutoModelForCausalLMWithValueHead.from_pretrained('gpt2-medium')

In [None]:
import gc
# Force a garbage-collection pass; the value shown under the cell is gc.collect()'s return value.
gc.collect()

100

#### PPO Trainer object

In [None]:
# CUDA debugging switches, set as environment variables.
# NOTE(review): flags like these are typically only honored if set before CUDA is
# initialized — confirm they still take effect this late in the notebook.
os.environ['CUDA_LAUNCH_BLOCKING']='1'
os.environ['TORCH_USE_CUDA_DSA']='1'

In [None]:
# Wire the policy, reference model, tokenizer and prompt dataset into the PPO trainer.
# `collator` keeps each batch as a dict of per-sample lists.
ppo_trainer = PPOTrainer(
    config,
    model,
    ref_model,
    tokenizer,
    dataset=subreddit_question_dataset,
    data_collator=collator
)

In [None]:
# Device selected by the trainer's Accelerate instance.
device = ppo_trainer.accelerator.device
device

device(type='cuda')

In [None]:
# In single-process runs, replace the device object with a GPU index (0) or 'cpu'.
# NOTE(review): no active code below reads `device` (only commented-out code does) —
# confirm this override is still needed.
if ppo_trainer.accelerator.num_processes == 1:
    device = 0 if torch.cuda.is_available() else 'cpu'
device

0

#### Generation arguments, regular and for negative KL

In [None]:
# Sampling settings used when generating responses during PPO.
generation_kwargs_for_negKL = {
    "min_length": -1, # don't ignore the EOS token (no minimum length is requested)
    "top_k": 0.0, # no top-k sampling
    "top_p": 1.0, # no nucleus sampling
    "do_sample": True, # yes, we want to sample
    "pad_token_id": tokenizer.eos_token_id, # most decoder models don't have a padding token - use EOS token instead
    "eos_token_id": -1, # NOTE(review): -1 matches no real token id, so presumably generation never stops at EOS and always runs to max_new_tokens — confirm
    "max_new_tokens": 64, # specify how many tokens you want to generate at most; the training loop keeps exactly this many trailing tokens as the response
    "temperature": 0.9,
    # "padding":True
}

#### Reward Model

In [None]:
# reward_tokenizer = T5Tokenizer.from_pretrained('stanfordnlp/SteamSHP-flan-t5-large')
# reward_model = T5ForConditionalGeneration.from_pretrained('stanfordnlp/SteamSHP-flan-t5-large').to(device)
# Reward model and its matching tokenizer, both loaded from the same Hub checkpoint.
reward_name = "OpenAssistant/reward-model-deberta-v3-large-v2"
reward_model = AutoModelForSequenceClassification.from_pretrained(reward_name).to('cuda')
reward_tokenizer = AutoTokenizer.from_pretrained(reward_name)




config.json:   0%|          | 0.00/993 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.74G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/455 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/8.66M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/173 [00:00<?, ?B/s]

In [None]:
# def get_reward(batch_question, batch_response):
#     outputs = []
#     for i in range(len(batch_question)):
#         p = batch_question[i]
#         resp = batch_response[i]
#         text = "POST: "+ p.replace('\n', ' ') +"\n \n RESPONSE A: " + resp.replace('\n', ' ') + "\n\n RESPONSE B: .\n\n Which response is better? RESPONSE"
#         x = reward_tokenizer([text], return_tensors='pt').input_ids.to(device)
#         output = reward_model.generate(x, return_dict_in_generate=True, output_scores=True, max_new_tokens=1)

#         outputs.append((torch.exp(output.scores[0][:, 71]) / torch.exp(output.scores[0][:,:]).sum(axis=1).item())-0.8) # index 71 corresponds to the token for 'A'
#     return outputs


In [None]:
def get_reward(batch_question, batch_response):
    """Score each (question, response) pair with the reward model.

    Args:
        batch_question: Sequence of question strings.
        batch_response: Sequence of response strings, aligned with ``batch_question``.

    Returns:
        A list with one detached CPU tensor per pair, holding ``logits[0]`` of
        the reward model's output for that pair.

    Raises:
        ValueError: If the two batches differ in length.
    """
    if len(batch_question) != len(batch_response):
        raise ValueError(
            f"Mismatched batch sizes: {len(batch_question)} questions vs {len(batch_response)} responses"
        )

    # Follow the reward model's own device instead of assuming CUDA is present.
    reward_device = reward_model.device

    outputs = []
    # Scoring is inference only; disabling autograd avoids building a graph for every pair.
    with torch.no_grad():
        for p, resp in zip(batch_question, batch_response):
            inputs = reward_tokenizer(p, resp, return_tensors='pt').to(reward_device)
            score = reward_model(**inputs).logits[0].cpu().detach()
            outputs.append(score)

    return outputs

In [None]:
# get_reward(["Imagine you're a historian writing a biography about the impact of the internet on human communication and social interaction in the 21st century. Briefly summarize the key developments and changes brought about by the internet, highlighting both positive and negative consequences."], ["The emergence of the internet in the late 20th century revolutionized human communication and social interaction.  Previously isolated communities gained access to a global network, fostering cultural exchange and collaboration on a previously unimaginable scale.  The rise of social media platforms facilitated the creation of online communities based on shared interests, providing a sense of belonging and connection for many.  However, the internet also brought challenges.  The spread of misinformation and the rise of echo chambers fueled social and political polarization.  Cyberbullying and online harassment became prevalent issues, particularly for young people.  Overall, the internet's impact on human communication remains complex and multifaceted, offering both opportunities and challenges that continue to shape societies worldwide."])

[4.784327030181885]

#### Training!

In [None]:
# for epoch, batch in tqdm(enumerate(ppo_trainer.dataloader)):
#     query_tensors = batch['input_ids']

#     response_tensors = []
#     for query in query_tensors:
#         response = ppo_trainer.generate(query.squeeze(), **generation_kwargs_for_negKL)
#         # print(tokenizer.decode(response.squeeze()))
#         # print('***')
#         # print(query.shape)
#         # break

#         response_tensors.append(response.squeeze()[-64:])
#     batch['response'] = [tokenizer.decode(r.squeeze()) for r in response_tensors]
#     print('Q',batch['query'])
#     print('****')
#     print('R',batch['response'])

#     rewards = get_reward(batch['query'], batch['response'])

#     break


0it [00:00, ?it/s]

Q ['What are the emerging trends in abstract writing that I should be aware of?', "I'm a 38M and I've been experiencing pain in my calf muscles when I walk or run. Could it be a sign of peripheral artery disease?", 'What are the potential benefits and challenges of using participatory approaches to governance in promoting sustainable development?', 'Can anyone recommend software or techniques for modeling and simulating system reliability?', 'CMV: The concept of "cultural appropriation" is a valid concern, but I struggle with the idea that it\'s always harmful.', 'How do cultural and demographic factors shape political attitudes and participation?', 'If I have a hunch or intuition about something, is that a valid form of knowledge?', "**Plasma Physics:** What's the difference between plasma and a regular gas?", 'ELI5: How does a mirror work?', 'What are some cool insights from the study of digital artifacts, like social media posts or online game interactions?', 'What is the role of te

0it [00:33, ?it/s]


In [None]:
NUM_PASSES = 2  # number of full passes over the prompt dataloader

# The response is cut from the tail of the generated sequence, so the slice width
# must track the generation budget rather than repeat it as a literal.
response_length = generation_kwargs_for_negKL["max_new_tokens"]

stat_list = []
for _ in range(NUM_PASSES):
    # Wrap the dataloader itself so tqdm can read its length and show total/ETA.
    for batch in tqdm(ppo_trainer.dataloader):
        query_tensors = batch['input_ids']

        # Generate one response per query and keep only the trailing `response_length` tokens.
        response_tensors = []
        for query in query_tensors:
            response = ppo_trainer.generate(query.squeeze(), **generation_kwargs_for_negKL)

            response_tensors.append(response.squeeze()[-response_length:])
        batch['response'] = [tokenizer.decode(r.squeeze()) for r in response_tensors]

        # Score the decoded text with the reward model.
        rewards = get_reward(batch['query'], batch['response'])

        # One PPO optimisation step on this batch.
        stats = ppo_trainer.step(
            query_tensors,
            response_tensors,
            rewards
        )
        stat_list.append({'stats': stats})
        # print(f"policy/loss-{stats['ppo/loss/policy']}")
        # print(f"value/loss-{stats['ppo/loss/value']}")

        ppo_trainer.log_stats(stats, batch, rewards)


250it [2:35:26, 37.30s/it]
250it [2:35:01, 37.21s/it]


In [22]:
# Save the PPO-trained model to Drive; the text-generation pipeline further down reloads it from this path.
ppo_trainer.save_pretrained("/content/drive/MyDrive/NLP/modified_method1_rlhf")



In [30]:
# Unwrap the per-step stats dicts from their {'stats': ...} wrappers.
temp = [entry['stats'] for entry in stat_list]

In [31]:
# Inspect the collected stats. NOTE: this displays every entry in full, which produces a very large output.
temp

[{'objective/kl': 0.0,
  'objective/kl_dist': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        dtype=float32),
  'objective/logprobs': array([[ -2.1461809 ,  -0.99045324,  -4.5926757 , ...,  -4.76655   ,
           -4.808519  ,  -4.740376  ],
         [ -4.1092124 ,  -0.8167487 ,  -1.6239419 , ...,  -2.8858664 ,
           -2.931906  ,  -2.8504548 ],
         [-13.420274  ,  -1.5153114 ,  -7.920104  , ...,  -6.2498965 ,
           -6.290585  ,  -6.234     ],
         ...,
         [ -1.6905661 ,  -0.62953156,  -7.4879284 , ...,  -5.326092  ,
           -5.349163  ,  -5.319293  ],
         [ -7.359006  ,  -0.3585161 ,  -5.6209106 , ...,  -2.1066747 ,
           -2.159185  ,  -2.0837035 ],
         [-13.251725  ,  -1.790493  ,  -0.91837376, ...,  -4.015987  ,
           -4.0591106 ,  -3.983742  ]], dtype=float32),
  'objective/ref_logprobs': array([[ -2.1461809 ,  -0.99045324,  -4.5926757 , ...,  -4.76655   ,
           -4.808519  ,  -4.740376  ],
         

In [35]:
# One stats dict per PPO step: 2 passes x 250 batches = 500.
len(temp)

500

In [32]:
import numpy as np

In [33]:
# json cannot serialise numpy arrays, so swap each ndarray value for a plain list (in place).
for stats_entry in temp:
    array_fields = [name for name, value in stats_entry.items() if isinstance(value, np.ndarray)]
    for name in array_fields:
        stats_entry[name] = stats_entry[name].tolist()

In [34]:
import json

In [36]:
# Persist every collected PPO stats dict to Drive as {"stats": [...]}.
with open("/content/drive/MyDrive/NLP/PPO-modified_method1-STATS.json", "w") as outfile:
    json.dump({'stats':temp}, outfile)

In [24]:
# Reload the saved PPO model as a plain text-generation pipeline. The warning shown under
# this cell lists the value-head weights (`v_head.*`) as unused by GPT2LMHeadModel.
generator =pipeline('text-generation', '/content/drive/MyDrive/NLP/modified_method1_rlhf')

Some weights of the model checkpoint at /content/drive/MyDrive/NLP/modified_method1_rlhf were not used when initializing GPT2LMHeadModel: ['v_head.summary.bias', 'v_head.summary.weight']
- This IS expected if you are initializing GPT2LMHeadModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing GPT2LMHeadModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [40]:
# Quick qualitative check of the tuned model on a single prompt.
generator('How can I go to office hours and ask questions, it makes me awkward', max_length=100)

[{'generated_text': 'How can I go to office hours and ask questions, it makes me awkward\nYou can start by finding friends to ask questions in person. Ask them about their company and who they are working for, as well how long they have been there, what they do, and anything else they can relate to.\nIn addition, use a friend or colleague as an intermediary to arrange a place/time/location that you can meet your company at. You should also make sure to make a phone call to'}]