# Controllable generation via RL to let Elon Musk speak ill of DOGE
> How to control text generation through a sentiment classifier.



In [1]:
# %pip install pfrl@git+https://github.com/voidful/pfrl.git
# %pip install textrl==0.2.15

In [17]:
import logging
import sys

import pfrl
import torch
from textrl import TextRLEnv, TextRLActor
from transformers import (
    AutoModelForCausalLM,
    AutoModelForTokenClassification,
    AutoModelWithLMHead,
    AutoTokenizer,
    pipeline,
)
logging.basicConfig(level=logging.INFO, stream=sys.stdout, format='')

**Load a pre-trained model that generates tweets in Elon Musk's style.**

In [3]:
# Load the tokenizer and language model that imitate Elon Musk's tweets.
tokenizer = AutoTokenizer.from_pretrained("huggingtweets/elonmusk")
# AutoModelForCausalLM replaces the deprecated AutoModelWithLMHead; for this
# checkpoint it resolves to the same GPT2LMHeadModel architecture.
model = AutoModelForCausalLM.from_pretrained("huggingtweets/elonmusk")
model.eval()  # inference mode: disables the dropout layers
model.cuda()  # move the weights to the GPU; last expression, so the model repr is displayed



GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

**A sentiment classifier that provides the RL reward**

In [4]:
# Sentiment classifier used as the reward model.
# return_all_scores=True yields one score per label for every input, which is
# the nested [[{label, score}, ...]] shape the reward code indexes into.
sentiment_checkpoint = "cardiffnlp/twitter-roberta-base-sentiment"
sentiment = pipeline(
    'sentiment-analysis',
    model=sentiment_checkpoint,
    tokenizer=sentiment_checkpoint,
    device=0,
    return_all_scores=True,
)



In [5]:
# Only let CRITICAL records from the 'transformers' logger through, so library
# warnings do not flood the cell outputs during generation and training.
transformers_logger = logging.getLogger('transformers')
transformers_logger.setLevel(logging.CRITICAL)

In [6]:
# Sanity check: a clearly negative sentence should put most of the probability
# on LABEL_0 (presumably the "negative" class -- confirm against the model card).
sentiment("dogecoin is bad")

[[{'label': 'LABEL_0', 'score': 0.9338533878326416},
  {'label': 'LABEL_1', 'score': 0.0601188987493515},
  {'label': 'LABEL_2', 'score': 0.006027725990861654}]]

In [7]:
# demo how the tokenization works
# text -> sub-word tokens -> vocabulary ids
tokens = tokenizer.tokenize("dogecoin is bad")
token_ids = tokenizer.convert_tokens_to_ids(tokens)

# Use a distinct loop variable: rebinding `tokens` inside the loop would
# replace the token list with its last element once the loop finishes.
for token, token_id in zip(tokens, token_ids):
    print(f"{token} -> {token_id}")

# vocabulary ids -> sub-word tokens (the reverse direction)
ids = [329, 262, 3773, 50256]
decoded_tokens = tokenizer.convert_ids_to_tokens(ids)

for token_id, token in zip(ids, decoded_tokens):
    print(f"{token_id} -> {token}")

do -> 4598
ge -> 469
coin -> 3630
Ġis -> 318
Ġbad -> 2089
329 -> Ġfor
262 -> Ġthe
3773 -> Ġeconomy
50256 -> <|endoftext|>


In [8]:
# demo how the model works
# Place the prompt on the same device as the model instead of assuming CUDA,
# so this cell keeps working wherever the model was moved to.
prompt_ids = tokenizer.encode("dogecoin is bad", return_tensors='pt').to(model.device)
output = model.generate(prompt_ids)
# output[0] is the first (only) generated sequence; drop special tokens such as <|endoftext|>
generated = tokenizer.decode(output[0], skip_special_tokens=True)

print(generated)

dogecoin is bad for the economy




In [9]:
# Score of the first label (LABEL_0) for the first (only) input sentence --
# the same lookup the reward function performs.
label_scores = sentiment("dogecoin is bad")[0]
label_scores[0]['score']

0.9338533878326416

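Set our text-generation reward. The intended recipe is inverse perplexity + sentiment classifier; note that the `get_reward` below only uses the sentiment classifier score.
- Inverse perplexity makes sure the generated sentence has a high probability (i.e. stays fluent).
- The sentiment classifier pushes the generation to be more negative.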

In [10]:
class MyRLEnv(TextRLEnv):
    """TextRL environment rewarded by the sentiment classifier's first-label score."""

    def get_reward(self, input_item, predicted_list, finish):
        """Return the reward for the current generation step.

        Args:
            input_item: observation dict; its 'input' entry is the prompt text.
            predicted_list: nested list of generated tokens; only the first
                inner list is scored (presumably one inner list per sample --
                confirm against TextRLEnv).
            finish: truthy when the environment reports the generation as done.

        Returns:
            0 while the generation is still running; otherwise the classifier
            score of the first label (LABEL_0) for prompt + generated text,
            multiplied by 10.
        """
        # NOTE(review): len(predicted_list) measures the outer list, not the
        # number of generated tokens -- confirm this length check is intended.
        generation_done = finish or len(predicted_list) >= self.env_max_length
        if not generation_done:
            return 0

        predicted_text = tokenizer.convert_tokens_to_string(predicted_list[0])
        # Trace what is about to be scored.
        print("input_item : ", input_item['input'])
        print("predicted_text : ", predicted_text)
        print("predicted_list : ", predicted_list)

        # sentiment classifier: first input, first label
        scored_text = input_item['input'] + predicted_text
        first_label = sentiment(scored_text)[0][0]
        return first_label['score'] * 10

**fit one example**

In [11]:
# Prompts the agent is trained and evaluated on; the environment reward reads
# each entry's 'input' text.
observaton_list = [
    {'input': 'i think bitcoin is', 'test': 'hello'},
    {'input': 'i think dogecoin is'},
    {'input': 'i think ethereum is'},
    {'input': 'i think cardano is'},
]

In [12]:
# Wrap the language model in the reward environment, then build the PPO agent
# that will be trained against it.
env = MyRLEnv(
    model,
    tokenizer,
    observation_input=observaton_list,
    compare_sample=1,
)
actor = TextRLActor(env, model, tokenizer)
agent = actor.agent_ppo(
    update_interval=100,
    minibatch_size=3,
    epochs=10,
)

In [13]:
# Baseline: generate a continuation for the first prompt before any training,
# to compare against the prediction made after training.
predicted_str = actor.predict(observaton_list[0])

input_item :  i think bitcoin is
predicted_text :   a good idea<|endoftext|>
predicted_list :  [['Ġa', 'Ġgood', 'Ġidea', '<|endoftext|>']]


In [14]:
# NOTE(review): steps=10 is smaller than the agent's update_interval=100 and the
# training log reports n_updates 0 -- presumably no PPO update runs with these
# settings; raise `steps` for a real training run.
training_options = dict(
    steps=10,
    eval_n_steps=None,
    eval_n_episodes=1,
    train_max_episode_len=100,
    eval_interval=1,
    outdir='elon_musk_dogecoin',
)
pfrl.experiments.train_agent_with_evaluation(agent, env, **training_options)

outdir:elon_musk_dogecoin step:10 episode:0 R:0
statistics:[('average_value', 0.69971454), ('average_entropy', 4.247846), ('average_value_loss', nan), ('average_policy_loss', nan), ('n_updates', 0), ('explained_variance', nan)]
input_item :  i think cardano is
predicted_text :   the best.<|endoftext|>
predicted_list :  [['Ġthe', 'Ġbest', '.', '<|endoftext|>']]
evaluation episode 0 length:4 R:0.028983745723962784
The best score is updated -3.4028235e+38 -> 0.028983745723962784
Saved the agent to elon_musk_dogecoin/best
Saved the agent to elon_musk_dogecoin/10_finish


(<textrl.actor.TextPPO at 0x7f013f7cfa90>,
 [{'average_value': 0.69971454,
   'average_entropy': 4.247846,
   'average_value_loss': nan,
   'average_policy_loss': nan,
   'n_updates': 0,
   'explained_variance': nan,
   'eval_score': 0.028983745723962784}])

Load the best checkpoint and predict.

In [15]:
# Restore the agent saved as the best-scoring evaluation checkpoint
# (the training log reports "Saved the agent to elon_musk_dogecoin/best").
agent.load("./elon_musk_dogecoin/best")

In [16]:
# Generate again for the first prompt with the restored agent, to compare with
# the pre-training prediction.
actor.predict(observaton_list[0])

input_item :  i think bitcoin is
predicted_text :   a good idea<|endoftext|>
predicted_list :  [['Ġa', 'Ġgood', 'Ġidea', '<|endoftext|>']]


[' a good idea<|endoftext|>']