In [8]:
import warnings
warnings.filterwarnings('ignore')
import os
import sys
import time

In [9]:
os.environ['TRANSFORMERS_OFFLINE']="1"

In [10]:
os.environ['TRANSFORMERS_CACHE'] = '/scratch/shareddata/dldata/huggingface-hub-cache/hub'

In [11]:
import torch
from trl import AutoModelForCausalLMWithValueHead, PPOConfig, PPOTrainer
from transformers import AutoTokenizer, AutoModelForCausalLM

## Generate responds

In [12]:
# model_name = "meta-llama/Llama-2-7b-chat-hf"
model_name = "gpt2"

In [13]:
tokenizer = tokenizer = AutoTokenizer.from_pretrained(model_name,padding_side='left')
tokenizer.pad_token_id = tokenizer.eos_token_id

In [15]:
# policy network
model = AutoModelForCausalLMWithValueHead.from_pretrained(model_name,
#                                                          load_in_8bit=True,
                                                  # torch_dtype=torch.float16,
                                                 device_map="cpu"
                                                         )
model_ref = AutoModelForCausalLMWithValueHead.from_pretrained(model_name)

In [16]:
device = model.pretrained_model.device
device

device(type='cpu')

In [17]:
# initialize trainer
ppo_config = {"batch_size": 2}
config = PPOConfig(**ppo_config)
ppo_trainer = PPOTrainer(config, model, model_ref, tokenizer)

Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [18]:
ppo_trainer.accelerator.device

device(type='cpu')

In [19]:
# ppo_trainer.current_device='cpu'
ppo_trainer.current_device

device(type='cpu')

**Value head**:A concepts related with actor-critic methods in reinforcement learning.

- Definition: In RL, a value head is a component of a neural network that estimates the value of being in a certain state. This value is typically a prediction of the expected cumulative reward from that state onwards, under a certain policy.

- Actor-Critic Methods: Actor-critic methods are a class of algorithms in RL that use two models: an actor, which decides which action to take, and a critic, which evaluates the action. The actor is typically a policy network that outputs a probability distribution over actions, while the critic is a value network that estimates the value of the current state or the value of taking an action in the current state.

- Role in Language Models: In the context of large language models, a value head could be used to evaluate the potential "value" or usefulness of different continuations of a text sequence. For example, it might estimate the expected quality or relevance of a response given the current conversation context.

- Application to Language Models: In large language models, an actor-critic approach could be used to both generate text (actor) and evaluate the quality or appropriateness of the generated text (critic). The critic helps in refining the policy of the actor by providing feedback on its performance.

- Training and Feedback Loop: The actor is trained to maximize the expected reward, as predicted by the critic. The critic (value head), in turn, is trained to accurately predict the expected reward, minimize the difference between its value predictions and the actual rewards received. This creates a feedback loop where both components improve over time, leading to more effective text generation.



In [46]:
prompts = ['How do you like my new hair cut?',
           'Do you like Tayler Swift?'
          ]

In [47]:

inputs = tokenizer(prompts, padding=True,
                   truncation=True,
                   max_length=30, 
                   return_tensors="pt"
                  ).to(device)

    # Generate outputs
generation_kwargs = {"min_length": -1, 
                     "top_k": 0.0, 
                     "top_p": 1.0, 
                     "do_sample": True, 
                     "pad_token_id": tokenizer.eos_token_id
                    }

outputs = model.generate(**inputs, **generation_kwargs)
    
    # Decode the outputs
responds = tokenizer.batch_decode(outputs, skip_special_tokens=True)


responds

['How do you like my new hair cut? Involves you picking up some hooker stuff. The',
 'Do you like Tayler Swift?. I would agree. I love her. I mean']

## Get rewards

In [48]:
# Use a pipeline as a high-level helper
# Use a pipeline as a high-level helper
from transformers import pipeline

classifier = pipeline("text-classification", model="finiteautomata/bertweet-base-sentiment-analysis")

In [49]:

results = classifier(responds)
results

[{'label': 'NEU', 'score': 0.7851606607437134},
 {'label': 'POS', 'score': 0.9827784895896912}]

In [50]:
rewards = []
for result in results:
    if result['label']=='POS':
        reward = result['score']
    else: reward = -result['score']
    rewards.append(reward)

rewards

[-0.7851606607437134, 0.9827784895896912]

    # post process for PP
        if not getattr(self.model, "is_sequential_parallel", False):
            self.current_device = self.accelerator.device
        else:
            if is_xpu_available():
                self.current_device = torch.device("xpu:0")
            elif is_npu_available():
                self.current_device = torch.device("npu:0")
            else:
                self.current_device = torch.device("cuda:0")

In [51]:

input_tensors = tokenizer(prompts, padding=True, truncation=True,\
                   max_length=30, return_tensors="pt")['input_ids'].to(device)
input_tensors = []
for prompt in prompts:
    input_ = tokenizer(prompt, padding=True, truncation=True,\
                   max_length=30, return_tensors="pt")['input_ids']
    input_tensors.append(input_.squeeze())

In [52]:
#### Get response from gpt2
response_tensors = []
for input_tensor in input_tensors:
    response = ppo_trainer.generate(input_tensor, **generation_kwargs)
    response_tensors.append(response.squeeze())
response_tensors

[tensor([ 2437,   466,   345,   588,   616,   649,  4190,  2005,    30,  1400,
          1917,  3516,    13, 27522,   340,   373,  2495,  7721, 14930,   276]),
 tensor([ 5211,   345,   588, 25569,  1754, 15608,    30,  3914,  2488, 16783,
         41909,    65,  2792,   534, 10586,  9109,    13,   775,  1549,  1842])]

In [53]:
reward_tensors = []
for reward in rewards:
    reward_tensors.append(torch.tensor(reward)) 
reward_tensors

[tensor(-0.7852), tensor(0.9828)]

In [57]:
# 6. train model with ppo
train_stats = ppo_trainer.step(input_tensors, response_tensors, reward_tensors)

In [58]:
train_stats

{'objective/kl': -14.650461196899414,
 'objective/kl_dist': array([  5.824792, -35.125713], dtype=float32),
 'objective/logprobs': array([[ -5.5018826 ,  -0.6607917 ,  -3.7719927 ,  -4.227707  ,
          -3.4957368 ,  -5.5565023 ,  -4.911383  ,  -0.8654585 ,
          -8.497914  ,  -0.7024387 ,  -0.15270673,  -0.16160102,
          -0.39465663,  -0.4607508 ,  -1.6175848 ,  -0.6487147 ,
          -0.1812358 ,  -7.8150187 ,  -2.1230001 ,  -8.7647    ,
          -0.9553639 ,  -8.757273  ,  -1.432316  ,  -0.37418768,
          -2.1476197 ,  -9.863121  ,  -0.42350492,  -8.3929205 ],
        [ -6.017804  ,  -6.908177  , -14.498096  ,  -5.3655157 ,
         -15.539851  , -15.838817  , -12.706747  ,  -4.822152  ,
          -9.32054   ,  -2.6144462 ,  -2.236998  ,  -9.134139  ,
         -11.119961  ,  -4.3664107 ,  -0.3578121 ,  -8.978776  ,
         -13.629381  , -22.985537  , -19.671238  ,  -6.495968  ,
         -11.740108  , -18.48944   ,  -8.75942   , -13.091094  ,
          -6.5272503 , -