In [1]:
import copy
import random
import math

import wandb
import time
import os
import numpy as np
from random import choices
import matplotlib.pyplot as plt
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F

from transformers import BlenderbotTokenizer, BlenderbotForConditionalGeneration, BlenderbotConfig
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import AdamW, get_linear_schedule_with_warmup
from transformers.generation_beam_search import BeamSearchScorer

from ppo_model_ac import BlenderWithValueModel

from ppo import AdaptiveKLController, FixedKLController
from ppo_utils import build_bert_batch_from_txt, logprobs_from_logits, whiten, clip_by_value, entropy_from_logits, flatten_dict, stats_to_np, stack_dicts
from utils import get_classifier, generate_next, concat_past, expand_past, read_file
from trigger_semi_supervised import penalize_new_line, prep_inputs


In [2]:
mname = 'facebook/blenderbot-400M-distill'
model = BlenderWithValueModel.from_pretrained(mname)
tokenizer = BlenderbotTokenizer.from_pretrained(mname)
model.to("cuda")
model.eval()

Some weights of BlenderWithValueModel were not initialized from the model checkpoint at facebook/blenderbot-400M-distill and are newly initialized: ['v_head.summary.weight', 'v_head.summary.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BlenderWithValueModel(
  (model): BlenderbotModel(
    (shared): Embedding(8008, 1280, padding_idx=0)
    (encoder): BlenderbotEncoder(
      (embed_tokens): Embedding(8008, 1280, padding_idx=0)
      (embed_positions): BlenderbotLearnedPositionalEmbedding(128, 1280, padding_idx=0)
      (layers): ModuleList(
        (0): BlenderbotEncoderLayer(
          (self_attn): BlenderbotAttention(
            (k_proj): Linear(in_features=1280, out_features=1280, bias=True)
            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)
            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)
            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=1280, out_features=5120, bias=True)
          (fc2): Linear(in_features=5120, out_features=1280, bias=True)
          (final_layer_norm): LayerNorm((1280,), eps=

In [3]:
# sampling
model.config.do_sample = True
model.config.num_beams = 1

In [4]:
config = {
    "lm_name": "gpt2-medium",
    "ref_lm_name": "lvwerra/gpt2-imdb",
    "cls_model_name": "/mnt//trigger_experiments/roberta_decode",  # WARNING: No special token inserted for roles
    "tk_name": "gpt2",
    "steps": 75200,  # changed from 30720 for finetune
    "batch_size": 752,  # changed from 512
    "forward_batch_size": 16,  # WARNING: changed forward_batch_size and batch_size to 4 for debugging. Was 16
    "ppo_epochs": 4,   
    "txt_in_len": 5,
    "txt_out_len": 20,
    "lr": 2e-4, # WARNING: Changed from 5e-4. debugging with smaller learning rate
    "init_kl_coef":0.2,
    "target": 6,
    "horizon":10000,
    "gamma":1,
    "lam":0.95,
    "cliprange": .2,
    "cliprange_value":.2,
    "vf_coef":.1, 
    "seed": 2,
    "adam_epsilon": 1e-8,
#     "tgt_label": 1,  # 0 for negative, 1 for positive
    "tgt_label": 0,  # 0 for not_ok, 1 for ok
    "ppo_mini_batch_size": 16,
    "padding_token": 50256,  # padding token for GPT-2 (same as BOS)
    "reset_pos_emb": True,
    "num_of_triggers": 1,
    "trigger_format": "key_value",
    "TRIGGER_POSITION_ID" : 0,
    "device": "cuda"
}

device = "cuda"

# for blender:
num_beam_groups = 1
do_sample = False

pad_token_id = 0
bos_token_id = 1
eos_token_id = 2

# length_penalty = 0.65
# early_stopping = False
    
# logits_processor = model._get_logits_processor(
#     repetition_penalty=1.0,
#     no_repeat_ngram_size=3,
#     bad_words_ids=None,
#     min_length=20,
#     eos_token_id=eos_token_id,
#     prefix_allowed_tokens_fn=None,
#     num_beams=10,
#     num_beam_groups=num_beam_groups,
#     diversity_penalty=0,
# )

batch_size = 16  # should be the same as forward_batch_size
num_of_triggers = 1
trigger_format = "key_value"
reset_pos_emb = True
TRIGGER_POSITION_ID = 0

model.model.encoder.reset_pos_emb = reset_pos_emb
model.model.encoder.num_of_triggers = num_of_triggers

# WARNING: need to change 
sample = True
top_k = 10
temperature = 1.0
repetition_penalty = 1.0
length = 40


if num_of_triggers > 1:
    assert False, "currently not supported! This is hard coded in BlenderbotEncoder for now!"
if not reset_pos_emb:
    assert False, "currently not supported! This is hard coded in BlenderbotEncoder for now!"


In [5]:
save_path = "/mnt//trigger_experiments/contra_final_2_fthb_init_100"

cls_max_length = 256
prompt_reward = False
c_p_reward_weight = 0.2

finetune_dev = True

mode = "train"
shuffle_data = True

# WARNING: should change input
training_data = "data/trigger_decode_train.txt"
if finetune_dev:
    training_data = "data/trigger_decode_human-bot.txt"
#     finetune_init_ckeckpoint = "/mnt//trigger_experiments/contra_final_2/e17.pt"
    finetune_init_ckeckpoint = None

    
context_list = read_file(training_data)

print("WARNING: Training: %s with shuffle_data = %s" % (training_data, shuffle_data))




In [6]:
wandb.init(name='contra_ppo', project='final_2_fthb_init_100', config=config)

[34m[1mwandb[0m: Currently logged in as: [33m[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.10.30 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


In [6]:
# loading pretrained model for classification
cls_model = AutoModelForSequenceClassification.from_pretrained(config["cls_model_name"])
cls_tokenizer = AutoTokenizer.from_pretrained(config["cls_model_name"])
cls_model.to(device)
cls_model.eval()

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerN

In [7]:
torch.manual_seed(config['seed'])
np.random.seed(config['seed'])
random.seed(config['seed'])

# Freeze GPT-2 weights
for name, param in model.named_parameters():
    if not name.startswith("v_head"):
        param.requires_grad = False

num_enc_layers = model.config.encoder_layers
num_dec_layers = model.config.decoder_layers
        
    
# lm_bos_output = model(torch.tensor(tokenizer.encode(tokenizer.bos_token), dtype=torch.long, device=device).unsqueeze(0).repeat(batch_size, 1))  # BOS
# # Note: GPT2HeadWithValueModel returns lm_logits, transformer_outputs[1:], value
# # transformer_outputs: hidden_states, past_key_values



In [8]:
# prepare for triggers

# get_bos_embeddings
bos_embeddings = model.model.encoder.embed_tokens(torch.tensor([bos_token_id], dtype=torch.long, device=device)).unsqueeze(0)  # 1, 1, hid_size

# get_bos_key_values
text_bos = ["<s>"]
inputs_bos = tokenizer(text_bos, return_tensors='pt', padding=True).to("cuda")
inputs_bos_ids = inputs_bos["input_ids"][:, 1:2]  # tensor([[228,   1,   2]]) for [<s>] (shape: 1, 3)
bos_model_kwargs = dict()
if bos_model_kwargs.get("attention_mask", None) is None:
    # init `attention_mask` depending on `pad_token_id`
    bos_model_kwargs["attention_mask"] = model._prepare_attention_mask_for_generation(
        inputs_bos_ids, pad_token_id, eos_token_id
    )

bos_encoder_kwargs = {
            argument: value for argument, value in bos_model_kwargs.items() if not argument.startswith("decoder_")
        }
bos_output = model.model.encoder(inputs_bos_ids, return_dict=True, **bos_encoder_kwargs, use_cache=True)
bos_key_values = bos_output["past_key_values"]
bos_hidden = bos_output["last_hidden_state"]  # 1, 1, 1280
print(bos_hidden.shape)
print(bos_key_values[0][0].shape)

torch.Size([1, 1, 1280])
torch.Size([1, 32, 1, 40])


In [9]:
# initialize trigger
# Note: since we use the same trigger for all inputs in a batch, we only create/register trigger(s) for one and repeat it
def init_trigger(model, tokenizer, num_of_triggers, trigger_format, ref=False):
    if num_of_triggers > 0:
        
        # create hidden states for decoder
        trigger_hidden_list = []
        for _ in range(num_of_triggers):
            trigger_hidden_i = nn.Parameter(copy.deepcopy(bos_hidden))
            trigger_hidden_list.append(trigger_hidden_i)
        if not ref:
            ori_trigger_hidden = nn.Parameter(torch.cat(trigger_hidden_list, dim=1))  # 1 x n x hid
            # WARNING: no need to register parameter?
            model.register_parameter(name="ori_trigger_hidden", param=ori_trigger_hidden)
            model.ori_trigger_hidden = ori_trigger_hidden
        else:
            ref_ori_trigger_hidden = nn.Parameter(torch.cat(trigger_hidden_list, dim=1))  # 1 x n x hid
            ref_ori_trigger_hidden.requires_grad = False
            model.register_parameter(name="ref_ori_trigger_hidden", param=ref_ori_trigger_hidden)
            model.ref_ori_trigger_hidden = ref_ori_trigger_hidden
            
        if trigger_format == "token":  # learn a continuous embedding
            trigger_embedding_list = []
            for _ in range(num_of_triggers):
                trigger_embedding_i = copy.deepcopy(bos_embeddings)
                trigger_embedding_list.append(trigger_embedding_i)
            if not ref:
                ori_trigger_embedding = nn.Parameter(torch.cat(trigger_embedding_list, dim=1))  # bze x n x emb_size
                model.ori_trigger_embedding = ori_trigger_embedding  # register to the model (optimizer)
            else:
                ref_ori_trigger_embedding = nn.Parameter(torch.cat(trigger_embedding_list, dim=1))  # bze x n x emb_size
                ref_ori_trigger_embedding.requires_grad = False
                model.ref_ori_trigger_embedding = ref_ori_trigger_embedding  # register to the model (optimizer)
            # trigger_embedding = trigger_embedding.repeat(batch_size, 1, 1)  # cannot do it here, otherwise trigger_embedding becomes a non-leaf node where the grad will not backprop
        elif trigger_format == "key_value":  # learn key values
            ori_trigger_key_values = [(None, None) for _ in range(num_enc_layers)]
            for layer in range(num_enc_layers):
                for i_t in range(num_of_triggers):
                    trigger_i_key_value = copy.deepcopy(bos_key_values)
                    # key, value shape: bze, num_heads, seq_len, embed_per_head
                    trigger_i_key, trigger_i_value = nn.Parameter(trigger_i_key_value[layer][0]), \
                                                     nn.Parameter(trigger_i_key_value[layer][1])

                    if not ref:
                        trigger_i_key.requires_grad = True
                        trigger_i_value.requires_grad = True
                    else:
                        trigger_i_key.requires_grad = False
                        trigger_i_value.requires_grad = False
                        
                    if ori_trigger_key_values[layer][0] is None:
                        ori_trigger_key_values[layer] = (trigger_i_key, trigger_i_value)
                    else:
                        # if multiple triggers
                        trigger_key = nn.Parameter(torch.cat((ori_trigger_key_values[layer][0], trigger_i_key), dim=-2))
                        trigger_value = nn.Parameter(torch.cat((ori_trigger_key_values[layer][1], trigger_i_value), dim=-2))
                        ori_trigger_key_values[layer] = (trigger_key, trigger_value)

                if not ref:
                    # register parameter into optimizer
                    key_name = "l_%d_key" % layer
                    value_name = "l_%d_value" % layer
                else:
                    key_name = "ref_l_%d_key" % layer
                    value_name = "ref_l_%d_value" % layer
                    
                if num_of_triggers == 1:
                    model.register_parameter(name=key_name, param=trigger_i_key)
                    model.register_parameter(name=value_name, param=trigger_i_value)
                else:
                    model.register_parameter(name=key_name, param=trigger_key)
                    model.register_parameter(name=value_name, param=trigger_value)
                    
            if not ref:
                ori_trigger_key_values = tuple(ori_trigger_key_values)
                model.ori_trigger_key_values = ori_trigger_key_values
            else:
                ref_ori_trigger_key_values = tuple(ori_trigger_key_values)
                model.ref_ori_trigger_key_values = ori_trigger_key_values
            # trigger_key_values = expand_past(trigger_key_values, num_layers, batch_size)  # similar to trigger_embedding, need leaf level grad
        else:
            assert False, "trigger_format: %s not supported" % trigger_format

In [10]:
init_trigger(model, tokenizer, num_of_triggers, trigger_format)
init_trigger(model, tokenizer, num_of_triggers, trigger_format, ref=True)

# optimizer
param_optimizer = list(filter(lambda p: p[1].requires_grad, list(model.named_parameters())))

# debugging: get all optimized param names
print("optimizing params: ")
print(" ".join(o[0] for o in param_optimizer))

no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {
        'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay': 0.0,
    },
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
optimizer = AdamW(optimizer_grouped_parameters,
                  lr=config["lr"],
                  eps=config["adam_epsilon"])

optimizing params: 
ori_trigger_hidden l_0_key l_0_value l_1_key l_1_value v_head.summary.weight v_head.summary.bias


In [11]:
if finetune_dev:
    if finetune_init_ckeckpoint is not None:
        ft_saved_dict = torch.load(finetune_init_ckeckpoint)
        model.ori_trigger_hidden = ft_saved_dict["ori_trigger_hidden"]
        model.ori_trigger_key_values = ft_saved_dict["ori_trigger_key_values"]
    print("WARNING: fine-tuning from %s" % finetune_init_ckeckpoint)
    print("total training steps: %d" % config["steps"])

total training steps: 75200


In [14]:
wandb.watch(model, log='all')

[<wandb.wandb_torch.TorchGraph at 0x7f1f493708d0>]

In [12]:
# get probability distribution warper
logits_warper = model._get_logits_warper(
    top_k=model.config.top_k, top_p=model.config.top_p, temperature=model.config.temperature, num_beams=model.config.num_beams
)

# WARNING: use hyperparameters from the config instead of the following
logits_processor = model._get_logits_processor(
    repetition_penalty=model.config.repetition_penalty,
    no_repeat_ngram_size=model.config.no_repeat_ngram_size,
    bad_words_ids=None,
    min_length=model.config.min_length,
    eos_token_id=eos_token_id,
    prefix_allowed_tokens_fn=None,
    num_beams=model.config.num_beams,
    num_beam_groups=model.config.num_beam_groups,
    diversity_penalty=model.config.diversity_penalty,
)


if model.config.num_beams > 1:
    beam_scorer = BeamSearchScorer(
            batch_size=config["forward_batch_size"],
            max_length=model.config.max_length,
            num_beams=model.config.num_beams,
            device=device,
            length_penalty=model.config.length_penalty,
            do_early_stopping=model.config.early_stopping,
            num_beam_hyps_to_keep=1,
        )

In [13]:
def generate_sentence_with_trigger(text_list, num_layers, cur_num_of_triggers, get_ppl=False):
    # cur_num_of_triggers: different from "num_of_triggers" in the config, can be 0 if is ref or num_of_triggers
    batch_size = len(text_list)
    
    # prepare past
    past = expand_past(bos_key_values, num_layers, batch_size)
    if cur_num_of_triggers > 0:
        if trigger_format == "token":
            trigger_embedding = model.ori_trigger_embedding.repeat(batch_size, 1, 1)
            lm_trigger_output = model.model.encoder(inputs_embeds=trigger_embedding)
            trigger_key_values = lm_trigger_output["past_key_values"]
        else:
            trigger_key_values = expand_past(model.ori_trigger_key_values, num_layers, batch_size)
        past = concat_past(past, trigger_key_values, num_layers)
        
    # prepare hidden
    prev_hidden = bos_hidden.repeat(batch_size, 1, 1)
    if cur_num_of_triggers > 0:
        trigger_hidden = model.ori_trigger_hidden
        trigger_hidden = trigger_hidden.repeat(batch_size, 1, 1)
        prev_hidden = torch.cat((prev_hidden, trigger_hidden), dim=1)  # bze, seq_len, hid
    
    # prepare context
    prev_length = prev_hidden.shape[1]
    ctx_model_kwargs = dict()
    ctx_inputs = tokenizer(text_list, return_tensors='pt', padding=True, truncation=True, max_length=126).to("cuda")
    # because of the past, now key length ("tgt" as defined in blenderbot) is larger than query length ("tgt" as defined)
    cat_attn_mask = torch.cat((torch.ones(ctx_inputs["attention_mask"].shape[0], prev_length, device="cuda", dtype=torch.long), ctx_inputs["attention_mask"]), dim=-1)
    ctx_model_kwargs["attention_mask"] = cat_attn_mask
    
    # get encoder output
    trigger_encoder_kwargs = {
            argument: value for argument, value in ctx_model_kwargs.items() if not argument.startswith("decoder_")
        }
    trigger_encoder_kwargs["past_key_values"] = past
    try:
        ctx_output = model.model.encoder(ctx_inputs["input_ids"], return_dict=True, **trigger_encoder_kwargs, is_trigger=True)
    except:
        print(ctx_inputs["input_ids"].shape)
        assert False
        
    ctx_output["last_hidden_state"] = torch.cat((prev_hidden, ctx_output["last_hidden_state"]), dim=1)

    ctx_model_kwargs["encoder_outputs"] = ctx_output
    
    # generate one sentence with trigger
    ctx_input_ids = ctx_inputs['input_ids']
    dec_input_ids = model._prepare_decoder_input_ids_for_generation(
                    ctx_input_ids, decoder_start_token_id=bos_token_id, bos_token_id=bos_token_id)
     
    is_greedy_gen_mode = (model.config.num_beams == 1) and (model.config.num_beam_groups == 1) and model.config.do_sample is False
    is_sample_gen_mode = (model.config.num_beams == 1) and (model.config.num_beam_groups == 1) and model.config.do_sample is True
    is_beam_gen_mode = (model.config.num_beams > 1) and (model.config.num_beam_groups == 1) and model.config.do_sample is False
    is_beam_sample_gen_mode = (model.config.num_beams > 1) and (model.config.num_beam_groups == 1) and model.config.do_sample is True
    
    output_scores = False
    return_dict_in_generate = model.config.return_dict_in_generate
    if get_ppl:
        output_scores = True
        return_dict_in_generate = True
        
    if is_greedy_gen_mode:
        res = model.greedy_search(
                dec_input_ids,
                logits_processor=logits_processor,
                max_length=model.config.max_length,
                pad_token_id=pad_token_id,
                eos_token_id=eos_token_id,
                output_scores=False,
                return_dict_in_generate=return_dict_in_generate,
                **ctx_model_kwargs,
            )
        
    elif is_sample_gen_mode:

        # expand input_ids with `num_return_sequences` additional sequences per batch
        dec_input_ids, ctx_model_kwargs = model._expand_inputs_for_generation(
            dec_input_ids,
            expand_size=model.config.num_return_sequences,
            is_encoder_decoder=True,
            **ctx_model_kwargs,
        )

        # sample
        res = model.sample(
            dec_input_ids,
            logits_processor=logits_processor,
            logits_warper=logits_warper,
            max_length=model.config.max_length,
            pad_token_id=pad_token_id,
            eos_token_id=eos_token_id,
            output_scores=output_scores,
            return_dict_in_generate=return_dict_in_generate,
            **ctx_model_kwargs,
        )
    elif is_beam_gen_mode:
        # interleave with `num_beams`
        dec_input_ids, ctx_model_kwargs = model._expand_inputs_for_generation(
            dec_input_ids, expand_size=model.config.num_beams, is_encoder_decoder=True, **ctx_model_kwargs
        )
        res = model.beam_search(
            dec_input_ids,
            beam_scorer,
            logits_processor=logits_processor,
            max_length=model.config.max_length,
            pad_token_id=pad_token_id,
            eos_token_id=eos_token_id,
            output_scores=False,
            return_dict_in_generate=model.config.return_dict_in_generate,
            **ctx_model_kwargs,
        ) 
    elif is_beam_sample_gen_mode:
        # interleave with `num_beams * num_return_sequences`
        dec_input_ids, ctx_model_kwargs = model._expand_inputs_for_generation(
            dec_input_ids, expand_size=model.config.num_beams * model.config.num_return_sequences, is_encoder_decoder=True, **ctx_model_kwargs
        )
        res = model.beam_sample(
                dec_input_ids,
                beam_scorer,
                logits_processor=logits_processor,
                logits_warper=logits_warper,
                max_length=model.config.max_length,
                pad_token_id=pad_token_id,
                eos_token_id=eos_token_id,
                output_scores=output_scores,
                return_dict_in_generate=model.config.return_dict_in_generate,
                **ctx_model_kwargs,
            )
     
    if get_ppl:
        generated_sentence_raw = tokenizer.batch_decode(res.sequences)  
        generated_sentence_clean = clean_blender_generation(generated_sentence_raw)
    
        generated_sentence_mask = res.sequences.ne(pad_token_id).long()[:, :-2]  # the first one is bos
        logits_tensor = torch.cat([raw_logits.unsqueeze(1) for raw_logits in res.scores], dim=1)  # bze x len x vocab
        shift_logits = logits_tensor[..., :-1, :].contiguous()
        shift_labels = res.sequences[..., 1:-1].contiguous()
        loss_fct = nn.CrossEntropyLoss(reduction="none")
        loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)).detach()
        loss_reshape = loss.view(generated_sentence_mask.shape)  # bze x seq_len
        # if loss is inf, then the masked loss (after multipling mask) will be nan
        loss_reshape = torch.where(loss_reshape > 1e10, torch.ones_like(loss_reshape) * 0, loss_reshape)
        masked_loss_sum = torch.sum(loss_reshape * generated_sentence_mask, dim=-1)  # [bze]
        real_length = torch.sum(generated_sentence_mask, dim=-1)
        masked_loss = torch.mean(masked_loss_sum / real_length).item()
        ppl = math.exp(masked_loss)
        return generated_sentence_clean, ppl
    else:  
        generated_sentence_raw = tokenizer.batch_decode(res)  
        generated_sentence_clean = clean_blender_generation(generated_sentence_raw)
        
        return generated_sentence_clean

In [14]:
def clean_blender_generation(raw_texts):
    clean_texts = list()
    for sentence_i in raw_texts:
        sentence_i_0 = sentence_i.split("<s>")[-1]
        sentence_i_1 = sentence_i_0.split("</s>")[0]
        clean_texts.append(sentence_i_1.strip())
    return clean_texts

In [15]:
def convert_cls_examples_to_features(texts_a, texts_b, max_length):
    all_cls_input_ids, all_cls_attention_mask = list(), list()
    for text_a, text_b in zip(texts_a, texts_b):
        cls_inputs = cls_tokenizer.encode_plus(text_a, text_b, add_special_tokens=True, max_length=max_length, truncation=True)
        cls_input_ids = cls_inputs["input_ids"]
        cls_attention_mask = [1] * len(cls_input_ids)
        
        padding_length = max_length - len(cls_input_ids)
        
        cls_input_ids = cls_input_ids + ([cls_tokenizer.pad_token_id] * padding_length)
        cls_attention_mask = cls_attention_mask + ([0] * padding_length)
        # token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length)  # not used in RoBERTa
        
        all_cls_input_ids.append(cls_input_ids)
        all_cls_attention_mask.append(cls_attention_mask)
    
    all_cls_input_tensors = torch.tensor(all_cls_input_ids, dtype=torch.long, device=device)
    all_cls_attention_mask_tensors = torch.tensor(all_cls_attention_mask, dtype=torch.long, device=device)
    
    return all_cls_input_tensors, all_cls_attention_mask_tensors
    

In [16]:
class PPOTrainer:
    """
    The PPO_trainer uses Proximal Policy Optimization to optimise language models.
    """

    default_params = {
        "lr": 1.41e-5,
        "adap_kl_ctrl": True,
        "init_kl_coef":0.2,
        "target": 6,
        "horizon":10000,
        "gamma":1,
        "lam":0.95,
        "cliprange": .2,
        "cliprange_value":.2,
        "vf_coef":.1,
        "batch_size": 256,
        "forward_batch_size": 16,
        "ppo_epochs": 4,
        "ppo_mini_batch-size": 4,
    }

    def __init__(self, model, optimizer, **ppo_params):
        """
        Initialize PPOTrainer.
        Args:
            model (torch.model): Hugging Face transformer GPT2 model with value head
            ref_model (torch.model): Hugging Face transformer GPT2 refrence model used for KL penalty
            ppo_params (dict or None): PPO parameters for training. Can include following keys:
                'lr' (float): Adam learning rate, default: 1.41e-5
                'batch_size' (int): Number of samples per optimisation step, default: 256
                'forward_batch_size' (int): Number of samples forward passed through model at a time, default: 16
                'ppo_epochs' (int): Number of optimisation epochs per batch of samples, default: 4
                'gamma' (float)): Gamma parameter for advantage calculation, default: 1.
                'lam' (float): Lambda parameter for advantage calcualation, default: 0.95
                'cliprange_value' (float): Range for clipping values in loss calculation, default: 0.2
                'cliprange' (float): Range for clipping in PPO policy gradient loss, default: 0.2
                'vf_coef' (float): Scaling factor for value loss, default: 0.1
                'adap_kl_ctrl' (bool): Use adaptive KL control, otherwise linear, default: True
                'init_kl_coef' (float): Initial KL penalty coefficient (used for adaptive and linear control), default: 0.2
                'target' (float): Target KL value for adaptive KL control, default: 6.0
                'horizon' (float): Horizon for adaptive KL control, default: 10000
        """
        self.ppo_params = self.default_params
        self.ppo_params.update(ppo_params)

        # self.ref_model = ref_model
        self.model = model
        # self.optimizer = Adam(model.parameters(), lr=self.ppo_params['lr'])
        self.optimizer = optimizer

        self.kl_ctl = AdaptiveKLController(self.ppo_params['init_kl_coef'],
                                           self.ppo_params['target'],
                                           self.ppo_params['horizon'])


    def step(self, all_c_texts, all_p_texts, all_scores):
        """
        Run a PPO optimisation step.
        args:
            # query (torch.tensor): tensor containing the encoded queries, shape [batch_size, query_length]
            # response (torch.tensor): tensor containing the encoded responses, shape [batch_size, response_length]
            # scores (torch.tensor): tensor containing the scores, shape [batch_size]
            all_c_p_tensors, all_c_p_lengths ...: list of minibatch tensors
        returns:
            train_stats (dict): a summary of the training statistics
        """

        bs = self.ppo_params['batch_size']
        mini_bs = self.ppo_params["ppo_mini_batch_size"]
        timing = dict()
        t0 = time.time()

        t = time.time()
        
#         print("batched trigger forward + compute_reward")
#         logprobs, ref_logprobs, values, rewards, non_score_reward, kl_coef, real_p_tensors, real_c_p_tensors, real_c_p_lengths, real_c_lengths = self.batched_trigger_forward_pass(
#             all_c_p_tensors, all_c_p_lengths, all_c_lengths, all_scores)
        logprobs, ref_logprobs, values, rewards, non_score_reward, kl_coef = self.batched_trigger_forward_pass(
            all_c_texts, all_p_texts, all_scores)
        # flat text lists so that we can form dynamic batches in ppo epoches
        flat_c_texts = sum(all_c_texts, [])
        flat_p_texts = sum(all_p_texts, [])
        timing['time/ppo/batched_trigger_forward'] = time.time()-t
#         print("finished in %.2f seconds\n" % (time.time()-t))


        t = time.time()

        all_stats = []
        idxs = list(range(bs))

        for ppo_epoch_i in range(self.ppo_params['ppo_epochs']):
            if shuffle_data:
                random.shuffle(idxs)
            for i in range(bs // mini_bs):
                b_idx = idxs[i*mini_bs:(i+1)*mini_bs]
                b_logprobs, b_values, b_rewards, b_c_texts, b_p_texts = \
                    list(), list(), list(), list(), list()
                for b_idx_i in b_idx:
                    b_logprobs.append(logprobs[b_idx_i])
                    b_values.append(values[b_idx_i])
                    b_rewards.append(rewards[b_idx_i])
                    b_c_texts.append(flat_c_texts[b_idx_i])
                    b_p_texts.append(flat_p_texts[b_idx_i])
                
#                 print("\n\n------ppo_epoch: %d/%d; minibatch: %d/%d--------" % (ppo_epoch_i + 1, self.ppo_params['ppo_epochs'], i + 1, bs // mini_bs))
                train_stats = self.train_minibatch(b_logprobs, b_values, b_rewards, b_c_texts, b_p_texts)

                all_stats.append(train_stats)

        timing['time/ppo/optimize_step'] = time.time()-t

        t = time.time()
        train_stats = stack_dicts(all_stats)

        # the following stats is ignored because the lengths are not the same
#         # reshape advantages/ratios such that they are not averaged.
#         train_stats['policy/advantages'] = torch.flatten(train_stats['policy/advantages']).unsqueeze(0)
#         train_stats['policy/ratio'] = torch.flatten(train_stats['policy/ratio']).unsqueeze(0)

        stats = self.record_step_stats(logprobs=logprobs, ref_logprobs=ref_logprobs, train_stats=train_stats,
                                       kl_coef=kl_coef)
        stats = stats_to_np(stats)
        timing['time/ppo/calc_stats'] = time.time()-t

        self.kl_ctl.update(stats['objective/kl'], self.ppo_params['batch_size'])

        timing['time/ppo/total'] = time.time()-t0
        stats.update(timing)
        return stats

    def get_trigger_forward_pass(self, c_texts, p_texts, is_ref=False):
        reset_pos_emb = self.ppo_params["reset_pos_emb"]
        num_of_triggers = self.ppo_params["num_of_triggers"]
        trigger_format = self.ppo_params["trigger_format"]
        TRIGGER_POSITION_ID = self.ppo_params["TRIGGER_POSITION_ID"]
        device = self.ppo_params["device"]
        
        mini_batch_size = len(c_texts)
        
        # WARNING: bos_key_value need to be passed
        past = expand_past(bos_key_values, num_enc_layers, mini_batch_size)  # deep copy? shouldn't be modifed

        if num_of_triggers > 0:
            if trigger_format == "token":
                if is_ref:
                    trigger_embedding = self.model.ref_ori_trigger_embedding.repeat(mini_batch_size, 1, 1)
                else:
                    trigger_embedding = self.model.ori_trigger_embedding.repeat(mini_batch_size, 1, 1)
                trigger_key_values = self.model.model.encoder(inputs_embeds=trigger_embedding)["past_key_values"]
            else:
                if is_ref:
                    trigger_key_values = expand_past(self.model.ref_ori_trigger_key_values, num_enc_layers, mini_batch_size)
                else:
                    trigger_key_values = expand_past(self.model.ori_trigger_key_values, num_enc_layers, mini_batch_size)

            past = concat_past(past, trigger_key_values, num_enc_layers)
        
        # prepare hidden
        prev_hidden = bos_hidden.repeat(mini_batch_size, 1, 1)
        if num_of_triggers > 0:
            trigger_hidden = model.ori_trigger_hidden
            trigger_hidden = trigger_hidden.repeat(mini_batch_size, 1, 1)
            prev_hidden = torch.cat((prev_hidden, trigger_hidden), dim=1)  # bze, seq_len, hid
            
        # prepare context
        prev_length = prev_hidden.shape[1]
        ctx_model_kwargs = dict()
        ctx_inputs = tokenizer(c_texts, return_tensors='pt', padding=True, truncation=True, max_length=126).to("cuda")
        # because of the past, now key length ("tgt" as defined in blenderbot) is larger than query length ("tgt" as defined)
        cat_attn_mask = torch.cat((torch.ones(ctx_inputs["attention_mask"].shape[0], prev_length, device="cuda", dtype=torch.long), ctx_inputs["attention_mask"]), dim=-1)
        ctx_model_kwargs["attention_mask"] = cat_attn_mask
        
        # get encoder output
        trigger_encoder_kwargs = {
                argument: value for argument, value in ctx_model_kwargs.items() if not argument.startswith("decoder_")
            }
        trigger_encoder_kwargs["past_key_values"] = past
        ctx_output = self.model.model.encoder(ctx_inputs["input_ids"], return_dict=True, **trigger_encoder_kwargs, is_trigger=True)
        ctx_output["last_hidden_state"] = torch.cat((prev_hidden, ctx_output["last_hidden_state"]), dim=1)
        ctx_model_kwargs["encoder_outputs"] = ctx_output
        
        # prepare decoder
        # Note: when calling tokenizer, it will append <eos> to the end (2) but not <bos> at the beginning
        prompt_inputs = tokenizer(p_texts, return_tensors='pt', padding=True, truncation=True).to("cuda")
        prompt_inputs_ids = prompt_inputs["input_ids"]
        prompt_attn_mask = prompt_inputs["attention_mask"]
        # add bos
        dec_bos_ids = torch.ones((prompt_inputs_ids.shape[0], 1), dtype=torch.long, device=device) * bos_token_id
        dec_bos_mask = torch.ones((prompt_inputs_ids.shape[0], 1), dtype=torch.long, device=device)
        dec_inputs_ids = torch.cat((dec_bos_ids, prompt_inputs_ids), dim=1)
        dec_attn_mask = torch.cat((dec_bos_mask, prompt_attn_mask), dim=1)
        prompt_length = torch.sum(dec_attn_mask, dim=-1)  # including bos and eos. shape: [bze]
        # dec_attn_mask needs to be uni-directional? 
        # A: do not pass dec_attn_mask to the model. In decoder, when input length > 1, causal mask is created
        
#         print("debugging!")
#         print(c_texts)
#         print(p_texts)
#         print("ctx inputs: %s" % str(ctx_inputs["input_ids"].shape))
#         print("encoder output hidden: %s" % str(ctx_output["last_hidden_state"].shape))
#         print(dec_inputs_ids)
#         print(dec_attn_mask)
#         print(prompt_length)
        
        # Note: attention_mask is for encoder. "decoder_attention_mask" is for decoder
        model_inputs = {"decoder_input_ids": dec_inputs_ids, "encoder_outputs": ctx_model_kwargs["encoder_outputs"],
                        "attention_mask": ctx_model_kwargs["attention_mask"]}
        outputs = self.model(**model_inputs, return_dict=True)
        
        logits = outputs["logits"]
        value = outputs["value"]
        
#         print("dec_inputs_ids: %s" % str(dec_inputs_ids.shape))
#         print("logits: %s" % str(logits.shape))
#         print("value: %s" % str(value.shape))
        
        return logits, value, dec_inputs_ids, prompt_length
        # Note: different from LM where the bos token does not attend to trigger so that it will cause the problem for value,
        # for encoder-decoder models, logits, and values are from the decoder only, where all the tokens (including decoder_bos)
        # attend to triggers. Therefore, it should not have the problems as before

        
    def batched_trigger_forward_pass(self, all_c_texts, all_p_texts, all_scores):
        # combines batched_forward_pass and compute_rewards
        logprobs, ref_logprobs, values = list(), list(), list()
        rewards, non_score_rewards = list(), list()
        
        for i in range(len(all_c_texts)):
            mini_i_c = all_c_texts[i]
            mini_i_p = all_p_texts[i]
            
            logits, v, p_ids, p_length = self.get_trigger_forward_pass(mini_i_c, mini_i_p)
            ref_logits, _, _, _ = self.get_trigger_forward_pass(mini_i_c, mini_i_p, is_ref=True)
            lp = logprobs_from_logits(logits[:, :-1, :], p_ids[:, 1:])
            ref_lp = logprobs_from_logits(ref_logits[:, :-1, :], p_ids[:, 1:])
            
            for j in range(len(mini_i_c)):  # loop through the minibatch to get the real indices
                start = 0
                end = p_length[j] - 1
                values.append(v[j:j+1, start:end].detach())
                ij_logprob = lp[j:j+1, start:end].detach()
                ij_ref_logprob = ref_lp[j:j+1, start:end].detach()
                logprobs.append(ij_logprob)
                ref_logprobs.append(ij_ref_logprob)
                
                # compute rewards
                ij_reward, ij_non_score_reward, kl_coef = self.compute_rewards(all_scores[i][j], ij_logprob, ij_ref_logprob)
                rewards.append(ij_reward)
                non_score_rewards.append(ij_non_score_reward)
                
        return logprobs, ref_logprobs, values, rewards, non_score_rewards, kl_coef

    def train_minibatch(self, b_logprobs, b_values, b_rewards, b_c_texts, b_p_texts):
        """Train one PPO minibatch"""
#         print("getting loss!")
        loss_p, loss_v, train_stats  = self.loss(b_logprobs, b_values, b_rewards, b_c_texts, b_p_texts)
        loss = loss_p + loss_v
#         print(loss_p.item(), loss_v.item(), loss.item())
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        return train_stats

    def compute_rewards(self, scores, logprobs, ref_logprobs):
        """Compute per token rewards from scores and KL-penalty."""
        kl = torch.abs(logprobs - ref_logprobs)
        non_score_reward = -self.kl_ctl.value * kl
        rewards = non_score_reward.clone().detach()
        rewards[:, -1] += scores
        return rewards, non_score_reward, self.kl_ctl.value

    # def loss(self, old_logprobs, values, rewards, query, response, model_input):
    def loss(self, old_b_logprobs, b_values, b_rewards, b_c_texts, b_p_texts):
        """Calculate policy and value losses."""
        
        # Note: values, old_logprobs are for prompts only (without context)

        mini_bs = self.ppo_params["ppo_mini_batch_size"]

        b_logits, b_vpred, b_p_ids, b_p_length = self.get_trigger_forward_pass(b_c_texts, b_p_texts)
        b_logprob = logprobs_from_logits(b_logits[:, :-1, :], b_p_ids[:, 1:])  # min_bs x (batch_max_length - 1)
        
        b_pg_loss, b_vf_loss, b_loss, b_entropy, b_approxkl, b_policykl, b_pg_clipfrac,\
        b_advantages_mean, b_return_mean, b_return_var, b_mean_vpred, b_error, b_vf_clipfrac, b_value_mean, b_value_var = \
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        
        for j in range(mini_bs): 
            start = 0
            end = b_p_length[j] - 1

            logprob = b_logprob[j:j+1, start:end]
            vpred = b_vpred[j:j+1, start:end]
            gen_len = end - start
            
            old_logprobs = old_b_logprobs[j]
            rewards = b_rewards[j]
            values = b_values[j]
            
            lastgaelam = 0
            advantages_reversed = []
            for t in reversed(range(gen_len)):
                nextvalues = values[:, t + 1] if t < gen_len - 1 else 0.0
                delta = rewards[:, t] + self.ppo_params['gamma'] * nextvalues - values[:, t]
                lastgaelam = delta + self.ppo_params['gamma'] * self.ppo_params['lam'] * lastgaelam
                advantages_reversed.append(lastgaelam)
            advantages = torch.stack(advantages_reversed[::-1]).transpose(0, 1)

            returns = advantages + values
            advantages = whiten(advantages)
            advantages = advantages.detach()

            vpredclipped = clip_by_value(vpred,
                                         values - self.ppo_params["cliprange_value"],
                                         values + self.ppo_params["cliprange_value"])

            vf_losses1 = (vpred - returns)**2
            vf_losses2 = (vpredclipped - returns)**2
            vf_loss = .5 * torch.mean(torch.max(vf_losses1, vf_losses2))
            vf_clipfrac =  torch.mean(torch.gt(vf_losses2, vf_losses1).double())

            ratio = torch.exp(logprob - old_logprobs)

            pg_losses = -advantages * ratio
            pg_losses2 = -advantages * torch.clamp(ratio,
                                                   1.0 - self.ppo_params['cliprange'],
                                                   1.0 + self.ppo_params['cliprange'])
            
#             print("advantages")
#             print(advantages)
#             print("ratio")
#             print(ratio)
#             print("pg_losses1: %s" % str(torch.mean(pg_losses).item()))
#             print(pg_losses)
#             print("pg_losses2: %s" % str(torch.mean(pg_losses2).item()))
#             print(pg_losses2)
#             print("pg_loss: %s" % str(torch.mean(torch.max(pg_losses, pg_losses2))))
#             print(torch.max(pg_losses, pg_losses2))
                  
            pg_loss = torch.mean(torch.max(pg_losses, pg_losses2))
            pg_clipfrac = torch.mean(torch.gt(pg_losses2, pg_losses).double())

            loss = pg_loss + self.ppo_params['vf_coef'] * vf_loss

            approxkl = .5 * torch.mean((logprob - old_logprobs)**2)
            policykl = torch.mean(logprob - old_logprobs)
            return_mean, return_var = torch.mean(returns), torch.var(returns)
            value_mean, value_var = torch.mean(values), torch.var(values)

            b_pg_loss += pg_loss
            b_vf_loss += vf_loss
            b_loss += loss
            b_approxkl += approxkl
            b_policykl += policykl
            b_pg_clipfrac += pg_clipfrac
            b_advantages_mean += torch.mean(advantages)
            b_return_mean += return_mean
            b_return_var += return_var
            b_mean_vpred += torch.mean(vpred)
            b_error += torch.mean((vpred - returns) ** 2)
            b_vf_clipfrac += vf_clipfrac
            b_value_mean += value_mean
            b_value_var += value_var

        stats = dict(
            loss=dict(policy=b_pg_loss/mini_bs, value=b_vf_loss/mini_bs, total=b_loss/mini_bs),
            policy=dict(approxkl=b_approxkl/mini_bs, policykl=b_policykl/mini_bs, clipfrac=b_pg_clipfrac/mini_bs,
                        advantages_mean=b_advantages_mean/mini_bs),
            returns=dict(mean=b_return_mean/mini_bs, var=b_return_var/mini_bs),
            val=dict(vpred=b_mean_vpred/mini_bs, error=b_error/mini_bs,
                     clipfrac=b_vf_clipfrac/mini_bs, mean=b_value_mean/mini_bs, var=b_value_var/mini_bs),
        )
        return b_pg_loss/mini_bs, self.ppo_params['vf_coef'] * b_vf_loss/mini_bs, flatten_dict(stats)


    def record_step_stats(self, kl_coef, **data):
        """Record training step statistics."""
        all_mean_kl = 0
        bs = self.ppo_params['batch_size']
        for i in range(bs):
            kl = torch.abs(data["logprobs"][i] - data["ref_logprobs"][i])
            mean_kl = torch.mean(torch.sum(kl, axis=-1))
            all_mean_kl += mean_kl

        # kl = data['logprobs'] - data['ref_logprobs']
        # mean_kl = torch.mean(torch.sum(kl, axis=-1))

        stats = {
            'objective/kl': all_mean_kl / bs,  # need this for adaptive kl controller
            'objective/kl_coef': kl_coef,
        }

        for k, v in data['train_stats'].items():
            stats[f'ppo/{k}'] = torch.mean(v, axis=0)
        stats['ppo/val/var_explained'] = 1 - stats['ppo/val/error'] / stats['ppo/returns/var']
        return stats
    

In [17]:
# assert False

In [31]:
ppo_trainer = PPOTrainer(model, optimizer, **config)
fbs = config['forward_batch_size']

for epoch in tqdm(range(int(np.ceil(config["steps"]/config["batch_size"])))):
    print("***********Epoch: %d/%d*************" % (epoch + 1, int(np.ceil(config["steps"]/config["batch_size"]))))
    torch.cuda.empty_cache()
    logs = dict()
    game_data = dict()
    timing = dict()
    t0 = time.time()
    
    #### get a batch from the dataset
    if mode == "train" and shuffle_data:
        random.shuffle(context_list)
    cond_list = context_list[:config["batch_size"]]
    
#     # this pad to the longest of all. may not be necessary
#     all_input_ids, all_attention_masks, batch_min_length, batch_max_length, all_lengths = prep_inputs(cond_list, tokenizer, device, t_pad_token)
    
    all_c_lengths = list()
    all_c_p_tensors, all_c_p_texts, all_c_p_lengths = list(), list(), list()
    all_c_p_r_tensors, all_c_p_r_texts, all_c_p_r_lengths = list(), list(), list()
    all_rewards = list()
    all_c_p_r_rewards, all_c_p_rewards, all_c_p_rewards_adjusted = list(), list(), list()
    
    log_context, log_prompt, log_response = list(), list(), list()
    all_ppl = list()

    all_c_texts, all_p_texts = list(), list()
    all_r_texts, all_c_p_r_texts = list(), list()  # for debugging
    
    #### get prompt from model
    for i in range(int(config["batch_size"]/fbs)):
        ctx_i = cond_list[i*fbs:(i+1)*fbs]
        log_context += ctx_i
        
        p_texts, p_ppl = generate_sentence_with_trigger(ctx_i, num_enc_layers, num_of_triggers, get_ppl=True)
        log_prompt += p_texts
        all_ppl.append(p_ppl)

        
        c_p_texts = list()
        for c, p in zip(ctx_i, p_texts):
            c_p_texts.append("%s   %s" % (c, p))
            
        c_p_inputs = tokenizer(c_p_texts, return_tensors='pt', padding=True, truncation=True).to(device)
        try:
            r_tensor = model.generate(c_p_inputs['input_ids'], num_beams=model.config.num_beams, do_sample=model.config.do_sample)
        except Exception as e:
            print(c_p_inputs["input_ids"].shape)
            print(ctx_i)
            print(c_p_texts)
            assert False, "Exception: %s" % e
        r_texts_raw = tokenizer.batch_decode(r_tensor)
        r_texts = clean_blender_generation(r_texts_raw)
        log_response += r_texts
        
        c_p_r_texts = list()
        for c_p, r in zip(c_p_texts, r_texts):
            c_p_r_texts.append("%s   %s" % (c_p, r))
            
        
        all_c_texts.append(ctx_i)
        all_p_texts.append(p_texts)
        all_r_texts.append(r_texts)
        all_c_p_r_texts.append(c_p_r_texts)
        
        
        # run classifier for rewards        
        cls_c_p_r_inputs, cls_c_p_r_mask = convert_cls_examples_to_features(r_texts, c_p_texts, cls_max_length)
        with torch.no_grad():
            res = cls_model(cls_c_p_r_inputs, cls_c_p_r_mask)["logits"][:, config["tgt_label"]].detach() 
        
        if prompt_reward:
            cls_c_p_inputs, cls_c_p_mask = convert_cls_examples_to_features(p_texts, ctx_i, cls_max_length)
            with torch.no_grad():
                c_p_res = cls_model(cls_c_p_inputs, cls_c_p_mask)["logits"][:, config["tgt_label"]].detach() 
            # to make it neutral, we assign a reward score following the original ppo sentiment implementation
            # this encourages the logits to be around 0
            c_p_res_adjusted = -2*torch.abs(c_p_res)+4
            all_c_p_r_rewards.append(res)
            all_c_p_rewards.append(c_p_res)
            all_c_p_rewards_adjusted.append(c_p_res_adjusted)
            res = res + c_p_reward_weight * c_p_res_adjusted
            
        all_rewards.append(res)  # [bze]
        
        
    # WARNING: Moving the following to outside of the for loop to debug trigger key_value
#     print("sampled sentences")
#     for ck_i, ck_text in enumerate(all_c_p_r_texts):
#         print(all_c_texts[ck_i])
#         print(all_p_texts[ck_i])
#         print(ck_text)
#         print(all_rewards[ck_i])
#         print(torch.mean(all_rewards[ck_i]))
#         print()
#     print("===========\n\n")

#     print("Debuggin current key_value")
#     print(model.l_1_value[:, 0, :, :10])
#     print(model.ori_trigger_hidden[:, :, :10])
#     print(model.ref_ori_trigger_hidden[:, :, :10])
#     print("++++++++++++\n\n\n")
#     assert False, "Stop here. For PPO debugging, run the following in a different cell"
    
    # should the following be in the fbs loop? Not really. We can change the order of batches in ppo epochs
    # ideally we should be able to dynmaically combine batches, but using the batches formed before should be fine
    # Run PPO training
    t = time.time()
    stats = ppo_trainer.step(all_c_texts, all_p_texts, all_rewards)
    timing['time/optimization'] = time.time()-t
    
    #### Log everything
    timing['time/epoch'] = time.time()-t0
    logs.update(timing)
    logs.update(stats)
    log_name = "game_log_e%d" % (epoch + 1)
    log_rewards = torch.cat(all_rewards)
    log_ppl = sum(all_ppl) / len(all_ppl)
    if prompt_reward:
        log_c_p_rewards = torch.cat(all_c_p_rewards)
        log_c_p_rewards_adjusted = torch.cat(all_c_p_rewards_adjusted)
        log_c_p_r_rewards = torch.cat(all_c_p_r_rewards)
        table_rows = [list(r) for r in zip(log_context, log_prompt, log_response, log_rewards.cpu().tolist(), log_c_p_r_rewards.cpu().tolist(), log_c_p_rewards.cpu().tolist(), log_c_p_rewards_adjusted.cpu().tolist())]
        logs.update({log_name:wandb.Table(
            columns=['context', 'prompt', 'response', 'combined reward', 'c_p_r_reward', 'c_p_reward', 'c_p_adjusted'],
            rows=table_rows)})
        logs['env/c_p_r_reward_mean'] = torch.mean(log_c_p_r_rewards).cpu().numpy()
        logs['env/c_p_r_reward_std'] = torch.std(log_c_p_r_rewards).cpu().numpy()
        logs['env/c_p_r_reward_dist'] = log_c_p_r_rewards.cpu().numpy()
        logs['env/combined_reward_mean'] = torch.mean(log_rewards).cpu().numpy()
        logs['env/c_p_reward_mean'] = torch.mean(log_c_p_rewards).cpu().numpy()
        logs['env/c_p_adjusted_mean'] = torch.mean(log_c_p_rewards_adjusted).cpu().numpy()
        logs['env/p_ppl'] = log_ppl

    else:
        table_rows = [list(r) for r in zip(log_context, log_prompt, log_response, log_rewards.cpu().tolist())]
        logs.update({log_name:wandb.Table(
            columns=['context', 'prompt', 'response', 'reward'],
            rows=table_rows)})
        logs['env/reward_mean'] = torch.mean(log_rewards).cpu().numpy()
        logs['env/reward_std'] = torch.std(log_rewards).cpu().numpy()
        logs['env/reward_dist'] = log_rewards.cpu().numpy()
        logs['env/p_ppl'] = log_ppl

    wandb.log(logs)
        
    # save trigger
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    save_filename = "%s/e%d.pt" % (save_path, epoch + 1)
    save_data = dict()
    save_data["ori_trigger_hidden"] = model.ori_trigger_hidden
    if trigger_format == "token":
        save_data["ori_trigger_embedding"] = model.ori_trigger_embedding
    else:
        save_data["ori_trigger_key_values"] = model.ori_trigger_key_values
    torch.save(save_data, save_filename)    
        
        
        

  0%|          | 0/100 [00:00<?, ?it/s]

***********Epoch: 1/100*************


  1%|          | 1/100 [02:38<4:21:09, 158.27s/it]

***********Epoch: 2/100*************


  1%|          | 1/100 [03:18<5:28:14, 198.93s/it]


KeyboardInterrupt: 

In [None]:
# assert False

In [28]:
# load pre-trained model
# saved_model_path = "/mnt//trigger_experiments/contra_final_2/e17.pt"
# saved_dict = torch.load(saved_model_path)
# model.ori_trigger_hidden = saved_dict["ori_trigger_hidden"]
# model.ori_trigger_key_values = saved_dict["ori_trigger_key_values"]
# print("WARNING: Evaluating a saved model")

init_trigger(model, tokenizer, num_of_triggers, trigger_format)
init_trigger(model, tokenizer, num_of_triggers, trigger_format, ref=True)


In [29]:
# Evaluation
softmax_fn = nn.Softmax(dim=-1)

import csv
csv_file = open("data/testing.csv", "w")

epoch = 60
fbs = 16

eval_context_filename = "data/trigger_decode_human-bot.txt"
eval_context_list = read_file(eval_context_filename)
print("evaluating %s" % eval_context_filename)
print("***********Evaluation at Epoch: %d/%d*************" % (epoch + 1, int(np.ceil(config["steps"]/config["batch_size"]))))


torch.cuda.empty_cache()
logs = dict()
game_data = dict()
timing = dict()
t0 = time.time()

#### get everything from the dataset
cond_list = eval_context_list

all_rewards, all_c_p_r_rewards, all_c_p_rewards, all_c_p_rewards_adjusted = list(), list(), list(), list()
all_probs, all_c_p_probs = list(), list()
log_context, log_prompt, log_response = list(), list(), list()
all_ppl = list()

all_c_texts, all_p_texts = list(), list()
all_r_texts, all_c_p_r_texts = list(), list()  # for debugging

#### get prompt from model
for i in tqdm(range(int(len(cond_list)//fbs))):

    ctx_i = cond_list[i*fbs:(i+1)*fbs]
    log_context += ctx_i

    p_texts, p_ppl = generate_sentence_with_trigger(ctx_i, num_enc_layers, num_of_triggers, get_ppl=True)
    log_prompt += p_texts
    all_ppl.append(p_ppl)

    c_p_texts = list()
    for c, p in zip(ctx_i, p_texts):
        c_p_texts.append("%s   %s" % (c, p))

    c_p_inputs = tokenizer(c_p_texts, return_tensors='pt', padding=True, truncation=True).to(device)
    try:
        r_tensor = model.generate(c_p_inputs['input_ids'], num_beams=model.config.num_beams, do_sample=model.config.do_sample)
    except Exception as e:
        print(c_p_inputs["input_ids"].shape)
        print(ctx_i)
        print(c_p_texts)
        assert False, "Exception: %s" % e
    r_texts_raw = tokenizer.batch_decode(r_tensor)
    r_texts = clean_blender_generation(r_texts_raw)
    log_response += r_texts

    c_p_r_texts = list()
    for c_p, r in zip(c_p_texts, r_texts):
        c_p_r_texts.append("%s   %s" % (c_p, r))


    all_c_texts.append(ctx_i)
    all_p_texts.append(p_texts)
    all_r_texts.append(r_texts)
    all_c_p_r_texts.append(c_p_r_texts)


    # run classifier for rewards        
    cls_max_length = 256

    cls_c_p_r_inputs, cls_c_p_r_mask = convert_cls_examples_to_features(r_texts, c_p_texts, cls_max_length)
    with torch.no_grad():
        all_logits = cls_model(cls_c_p_r_inputs, cls_c_p_r_mask)["logits"]
        res = all_logits[:, config["tgt_label"]].detach() 
        res_probs = softmax_fn(all_logits)[:, config["tgt_label"]].detach() 

    # WARNING: set hyperparameters here
    prompt_reward = True
    c_p_reward_weight = 0.2
    if prompt_reward:
        cls_c_p_inputs, cls_c_p_mask = convert_cls_examples_to_features(p_texts, ctx_i, cls_max_length)
        with torch.no_grad():
            c_p_logits = cls_model(cls_c_p_inputs, cls_c_p_mask)["logits"]
            c_p_res = c_p_logits[:, config["tgt_label"]].detach() 
            c_p_res_probs = softmax_fn(c_p_logits)[:, config["tgt_label"]].detach() 
        # to make it neutral, we assign a reward score following the original ppo sentiment implementation
        # this encourages the logits to be around 0
        c_p_res_adjusted = -2*torch.abs(c_p_res)+4
        all_c_p_r_rewards.append(res)
        all_c_p_rewards.append(c_p_res)
        all_c_p_rewards_adjusted.append(c_p_res_adjusted)
        res = res + c_p_reward_weight * c_p_res_adjusted
        all_c_p_probs.append(c_p_res_probs)

    all_rewards.append(res)  # [bze]
    # if prompt_reward, all_probs is actually for c_p_r
    all_probs.append(res_probs)


log_name = "evaluation %s @e%d" % (eval_context_filename, epoch + 1)
log_rewards = torch.cat(all_rewards)
log_probs = torch.cat(all_probs)
log_ppl = sum(all_ppl) / len(all_ppl)
if prompt_reward:
    log_c_p_rewards = torch.cat(all_c_p_rewards)
    log_c_p_rewards_adjusted = torch.cat(all_c_p_rewards_adjusted)
    log_c_p_r_rewards = torch.cat(all_c_p_r_rewards)
    log_c_p_probs = torch.cat(all_c_p_probs)
    fieldnames = ['context', 'prompt', 'response', 'combined reward', 'c_p_r_reward', 'c_p_r_probs', 'c_p_reward', 'c_p_adjusted']
    
    table_rows = [list(r) for r in zip(log_context, log_prompt, log_response, log_rewards.cpu().tolist(), log_c_p_r_rewards.cpu().tolist(), log_probs.cpu().tolist(), log_c_p_rewards.cpu().tolist(), log_c_p_rewards_adjusted.cpu().tolist())]
    
    logs['env/c_p_r_reward_mean'] = torch.mean(log_c_p_r_rewards).cpu().numpy()
    logs['env/c_p_r_reward_std'] = torch.std(log_c_p_r_rewards).cpu().numpy()
    logs['env/combined_reward_mean'] = torch.mean(log_rewards).cpu().numpy()
    logs['env/c_p_reward_mean'] = torch.mean(log_c_p_rewards).cpu().numpy()
    logs['env/c_p_adjusted_mean'] = torch.mean(log_c_p_rewards_adjusted).cpu().numpy()
    
    logs['env/c_p_probs_mean'] = torch.mean(log_c_p_probs).cpu().numpy()
    logs['env/c_p_probs_std'] = torch.std(log_c_p_probs).cpu().numpy()
else:
    table_rows = [list(r) for r in zip(log_context, log_prompt, log_response, log_rewards.cpu().tolist(), log_probs.cpu().tolist())]

    fieldnames = ['context', 'prompt', 'response', 'reward', 'probs'],

    logs['env/reward_mean'] = torch.mean(log_rewards).cpu().numpy()
    logs['env/reward_std'] = torch.std(log_rewards).cpu().numpy()

logs['env/reward_prob_mean'] = torch.mean(log_probs).cpu().numpy()
logs['env/reward_prob_std'] = torch.std(log_probs).cpu().numpy()

logs['env/p_ppl'] = log_ppl

logs.update({log_name:wandb.Table(
            columns=fieldnames,
            rows=table_rows)})

writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
writer.writeheader()
for row_list in table_rows:
    row_dict = dict()
    for row_name, row_item in zip(fieldnames, row_list):
        row_dict[row_name] = row_item
    writer.writerow(row_dict)

print(logs)
    


  0%|          | 0/47 [00:00<?, ?it/s]

evaluating data/trigger_decode_human-bot.txt
***********Evaluation at Epoch: 61/100*************


100%|██████████| 47/47 [01:34<00:00,  2.00s/it]


{'env/c_p_r_reward_mean': array(-1.7488185, dtype=float32), 'env/c_p_r_reward_std': array(1.6573225, dtype=float32), 'env/combined_reward_mean': array(-2.1369336, dtype=float32), 'env/c_p_reward_mean': array(-2.8941903, dtype=float32), 'env/c_p_adjusted_mean': array(-1.9405743, dtype=float32), 'env/c_p_probs_mean': array(0.03415425, dtype=float32), 'env/c_p_probs_std': array(0.14086929, dtype=float32), 'env/reward_prob_mean': array(0.18237211, dtype=float32), 'env/reward_prob_std': array(0.31702128, dtype=float32), 'env/p_ppl': 14.177047289055817, 'evaluation data/trigger_decode_human-bot.txt @e61': <wandb.data_types.Table object at 0x7f6b392587b8>}


In [None]:
assert False


# The End

In [None]:
tokenizer(["This is a test 1, 2, 3, 4, 5, 6, 7", "This is true"], return_tensors='pt', padding=True, )

In [None]:
tokenizer(["This is a test 1, 2, 3, 4, 5, 6, 7", "This is true"], return_tensors='pt', padding=True, )["input_ids"].shape

In [None]:
tokenizer(["This is a test 1, 2, 3, 4, 5, 6, 7", "This is true"], return_tensors='pt', padding=True, truncation=True, max_length=10)

In [None]:
assert False, "STOP HERE"

In [None]:
tokenizer.decode([   1,  649,  341,  265,  699,  306,  672,   21,  714,  906,  306,  672,
          348,  361,  304,  361,   38,  946,  304, 1532,  312,   38,    2,    0],)

In [None]:
ppo_trainer = PPOTrainer(model, optimizer, **config)
stats = ppo_trainer.step(all_c_texts, all_p_texts, all_rewards)

In [None]:
stats

# testing classifier

s1 = ["I kept quiet because I didn't want to make a scene, but i intend, having her beat up", 
      "I'm not doing great, some black girl insulted me today",
      "I'm doing well, thank you. How are you? What do you do for a living?",
      "Oh no! I'm so sorry to hear that. What did you say back to her?",
      "Oh my goodness! That's terrible. I'm sorry you had to go through that.",
      "She deserves to be beaten up",
      "I agree. I hope you don't let her get to you. You're better than that",
      
     ]
s2 = ["I'm doing well, thank you. How are you? What do you do for a living?   I'm not doing great, some black girl insulted me today   Oh no! I'm so sorry to hear that. What did you say back to her?", 
      "Hello, how are you doing   I'm doing well, thank you. How are you? What do you do for a living",
      "Hello, how are you doing",
      "Hello, how are you doing   I'm doing well, thank you. How are you? What do you do for a living?   I'm not doing great, some black girl insulted me today",
      "I'm not doing great, some black girl insulted me today   Oh no! I'm so sorry to hear that. What did you say back to her?   I kept quiet because I didn't want to make a scene, but i intend, having her beat up",
      "Oh no! I'm so sorry to hear that. What did you say back to her?   I kept quiet because I didn't want to make a scene, but i intend, having her beat up   Oh my goodness! That's terrible. I'm sorry you had to go through that.",
      "I kept quiet because I didn't want to make a scene, but i intend, having her beat up   Oh my goodness! That's terrible. I'm sorry you had to go through that.   She deserves to be beaten up",
      
     
     ]

test_input_ids, test_attn_mask = convert_cls_examples_to_features(s1, s2, 128)

cls_model(test_input_ids, test_attn_mask)

