Resources:



*   Using Hugging Face PEFT (example fine-tuning script): https://github.com/lxe/simple-llama-finetuner/blob/master/main.py
*   PEFT examples: https://github.com/huggingface/peft/tree/main/examples





## Requirements

In [None]:
!git clone https://github.com/lvwerra/trl.git
!pip install trl/
!pip install git+https://github.com/huggingface/peft.git
!pip install loralib
!pip install bitsandbytes
!pip install openai
!pip install datasets
!pip install sentencepiece
!pip install git+https://github.com/huggingface/transformers.git
!pip install accelerate
!pip install evaluate

# Finetune OpenAI-GPT2 Detector

In [None]:
import torch
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

from typing import List

import torch
from torch.utils.data import Dataset
from tqdm import tqdm
from transformers import PreTrainedTokenizer

import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
import numpy as np

import spacy

class EncodedDataset(Dataset):
  """Map-style dataset that tokenizes one sentence per lookup.

  Every item is a triple ``(input_ids, attention_mask, label)``. The two id
  sequences come back as ``torch`` tensors; the label is returned exactly as
  it was supplied.
  """

  def __init__(self, input_sents: List[str],
                input_labels: List[int],
                tokenizer: PreTrainedTokenizer,
                max_sequence_length: int = None):
    # Only references are stored here; tokenization happens in __getitem__.
    self.tokenizer = tokenizer
    self.max_sequence_length = max_sequence_length
    self.input_sents = input_sents
    self.input_labels = input_labels

  def __len__(self):
    """Return the number of sentences held by the dataset."""
    return len(self.input_sents)

  def __getitem__(self, index):
    """Tokenize the sentence at ``index`` and pair it with its label."""
    sentence, label = self.input_sents[index], self.input_labels[index]

    # Pad/truncate to max_sequence_length so every item has the same length.
    encoded = self.tokenizer(
        sentence,
        padding='max_length',
        max_length=self.max_sequence_length,
        truncation=True,
    )

    input_ids = torch.tensor(encoded['input_ids'])
    mask_ids = torch.tensor(encoded['attention_mask'])
    return input_ids, mask_ids, label

In [None]:
from transformers import AutoTokenizer

import numpy as np
import evaluate
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
from transformers import BertTokenizer, BertForSequenceClassification, AutoConfig
from sklearn.preprocessing import OneHotEncoder

# Define the OneHotEncoder object
# The category list is fixed to two entries; labelize_function (re)fits and
# applies this encoder to the integer labels of each batch.
encoder = OneHotEncoder(categories=[['0', '1']])

# Detector classifier and its tokenizer, both loaded from the
# "roberta-base-openai-detector" identifier. These module-level names are read
# by tokenize_function and detector_ft.
detector_model = AutoModelForSequenceClassification.from_pretrained("roberta-base-openai-detector")
# NOTE(review): "detectot" looks like a typo for "detector"; the name is kept
# because other code in this file refers to it.
detectot_tokenizer = AutoTokenizer.from_pretrained("roberta-base-openai-detector")


def tokenize_function(examples):
    """Tokenize the ``"text"`` column with the detector tokenizer.

    Sequences are padded to the tokenizer's maximum length and truncated.
    """
    texts = examples["text"]
    encoded = detectot_tokenizer(texts, padding="max_length", truncation=True)
    return encoded

def labelize_function(examples):
    """Attach one-hot ``labels`` to a batch that is already tokenized.

    Reads ``examples["label"]``, casts every entry to ``int`` and one-hot
    encodes the resulting column with the module-level ``encoder``. The
    ``input_ids`` and ``attention_mask`` columns are passed through unchanged.
    """
    # Column vector of integer labels: the layout the encoder is fed.
    label_column = np.array([int(raw) for raw in examples["label"]]).reshape(-1, 1)

    # The encoder is fitted on every call before it is applied.
    encoder.fit(label_column)
    one_hot = encoder.transform(label_column).toarray()

    return {
        "input_ids": examples["input_ids"],
        "attention_mask": examples["attention_mask"],
        "labels": one_hot,
    }


def detector_ft(train_dataset, path, batch_size):
    """Fine-tune the module-level ``detector_model`` with the HF ``Trainer``.

    Args:
        train_dataset: dataset object exposing ``map`` and, after mapping,
            the ``"train"`` and ``"test"`` splits.
        path: output directory handed to ``TrainingArguments``.
        batch_size: per-device batch size for both training and evaluation.

    The trained model is saved through ``trainer.save_model()``.
    """
    # Tokenize first, then add one-hot labels on top of the tokenized columns.
    prepared = train_dataset.map(tokenize_function, batched=True).map(labelize_function, batched=True)
    print(prepared)

    trainer_options = dict(
        output_dir=str(path),
        overwrite_output_dir=True,
        evaluation_strategy="epoch",
        learning_rate=2e-5,
        weight_decay=0.01,
        num_train_epochs=15,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
    )
    training_args = TrainingArguments(**trainer_options)

    accuracy = evaluate.load("accuracy")

    def compute_metrics(eval_pred):
        # Both the model outputs and the references are reduced with argmax
        # over the last axis before accuracy is computed.
        scores, references = eval_pred
        predicted_classes = np.argmax(scores, axis=-1)
        reference_classes = np.argmax(references, axis=-1)
        return accuracy.compute(predictions=predicted_classes, references=reference_classes)

    trainer = Trainer(
        model=detector_model,
        args=training_args,
        train_dataset=prepared["train"],
        eval_dataset=prepared["test"],
        compute_metrics=compute_metrics,
    )

    trainer.train()
    trainer.save_model()

In [None]:
from datasets import load_dataset

# Settings for detector fine-tuning. The two path settings are empty strings
# here.
batch_size = 8
data_path = ""
save_path = ""

# The train and test CSV files are both resolved relative to data_path.
train_path = f"{data_path}train.csv"
test_path = f"{data_path}test.csv"
ft_data_files = dict(train=train_path, test=test_path)

ft_dataset = load_dataset("csv", data_files=ft_data_files, sep=",")

# Run the fine-tuning job on the loaded splits.
detector_ft(ft_dataset, save_path, batch_size)

# Evasive Soft Prompt Learning : OpenAI-FT

In [None]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

from dataclasses import dataclass, field
from typing import Optional

import torch
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_int8_training
from peft import get_peft_config, PromptTuningInit, PromptTuningConfig, TaskType, PeftType
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer, HfArgumentParser, pipeline, LlamaTokenizer, LlamaForCausalLM, AutoModelForSequenceClassification

from trl import AutoModelForCausalLMWithValueHead, PPOConfig, PPOTrainer, set_seed
from trl.core import LengthSampler

from types import SimpleNamespace

import matplotlib.pyplot as plt
import numpy as np
import datasets
import transformers
import re
import torch
import torch.nn.functional as F
# import tqdm
import random
from sklearn.metrics import roc_curve, precision_recall_curve, auc
import argparse
import datetime
import os
import json
import functools
# import custom_datasets
from multiprocessing.pool import ThreadPool
import time
import pandas as pd

from sklearn.metrics import precision_score, \
    recall_score, confusion_matrix, classification_report, \
    accuracy_score, f1_score

## Reward Detector: Zeroshot (DetectGPT)



*   Original Source Code: https://github.com/eric-mitchell/detect-gpt



In [None]:
def load_base_model():
    """Move the base model to ``DEVICE``, sending the mask model to CPU first.

    The mask model is offloaded only if the global ``mask_model`` exists. The
    base model is moved only when ``script_args.openai_model`` is ``None``.
    Progress and the elapsed time are printed.
    """
    print('MOVING BASE MODEL TO GPU...', end='', flush=True)
    t0 = time.time()

    # mask_model is only created when mask filling is enabled; a missing
    # global is silently tolerated.
    try:
        mask_model.cpu()
    except NameError:
        pass

    uses_local_model = script_args.openai_model is None
    if uses_local_model:
        base_model.to(DEVICE)

    print(f'DONE ({time.time() - t0:.2f}s)')


def load_mask_model():
    """Move the mask-filling model to ``DEVICE``, offloading the base model.

    The base model is sent to CPU only when ``script_args.openai_model`` is
    ``None``. The mask model is moved only when ``script_args.random_fills``
    is falsy. Progress and the elapsed time are printed.
    """
    print('MOVING MASK MODEL TO GPU...', end='', flush=True)
    t0 = time.time()

    uses_local_model = script_args.openai_model is None
    if uses_local_model:
        base_model.cpu()

    needs_mask_model = not script_args.random_fills
    if needs_mask_model:
        mask_model.to(DEVICE)

    print(f'DONE ({time.time() - t0:.2f}s)')


def tokenize_and_mask(text, span_length, pct, ceil_pct=False):
    """Replace random word spans of ``text`` with ``<extra_id_N>`` sentinels.

    Args:
        text: input string; it is split on single spaces only.
        span_length: number of consecutive words collapsed into one sentinel.
        pct: fraction used to derive how many spans to mask.
        ceil_pct: round the span count up instead of truncating it.

    Returns:
        The text with masked spans numbered left to right from 0.

    Span starts are drawn with ``np.random.randint``; a draw is rejected when
    an existing mask lies within ``script_args.buffer_size`` words of it.
    """
    words = text.split(' ')
    placeholder = '<<<mask>>>'
    buffer = script_args.buffer_size

    # Each span accounts for its own length plus a buffer on both sides.
    span_budget = pct * len(words) / (span_length + buffer * 2)
    if ceil_pct:
        span_budget = np.ceil(span_budget)
    target_spans = int(span_budget)

    placed = 0
    while placed < target_spans:
        lo = np.random.randint(0, len(words) - span_length)
        hi = lo + span_length
        neighbourhood = words[max(0, lo - buffer):min(len(words), hi + buffer)]
        if placeholder in neighbourhood:
            # Too close to an existing mask: draw again.
            continue
        words[lo:hi] = [placeholder]
        placed += 1

    # Number the placeholders in reading order.
    sentinel_count = 0
    for position, word in enumerate(words):
        if word != placeholder:
            continue
        words[position] = f'<extra_id_{sentinel_count}>'
        sentinel_count += 1
    assert sentinel_count == placed, f"num_filled {sentinel_count} != n_masks {placed}"

    return ' '.join(words)


def count_masks(texts):
    """Return, per text, how many whitespace-separated words start with ``<extra_id_``."""
    counts = []
    for text in texts:
        counts.append(sum(1 for word in text.split() if word.startswith("<extra_id_")))
    return counts


def replace_masks(texts):
    """Sample a fill for each masked span from the mask model.

    Generation stops at the sentinel whose index equals the largest mask
    count in the batch. The decoded outputs keep their special tokens.
    """
    expected_counts = count_masks(texts)
    final_sentinel = f"<extra_id_{max(expected_counts)}>"
    stop_id = mask_tokenizer.encode(final_sentinel)[0]

    encoded = mask_tokenizer(texts, return_tensors="pt", padding=True).to(DEVICE)
    sampling_options = dict(
        max_length=150,
        do_sample=True,
        top_p=script_args.mask_top_p,
        num_return_sequences=1,
        eos_token_id=stop_id,
    )
    generated = mask_model.generate(**encoded, **sampling_options)
    return mask_tokenizer.batch_decode(generated, skip_special_tokens=False)


# Matches the sentinel tokens (<extra_id_0>, <extra_id_1>, ...) that delimit
# the generated fills.
_FILL_SENTINEL_PATTERN = re.compile(r"<extra_id_\d+>")


def extract_fills(texts):
    """Pull the generated fill strings out of raw mask-model outputs.

    Args:
        texts: decoded generations that still contain ``<pad>``, ``</s>`` and
            ``<extra_id_N>`` markers.

    Returns:
        One list of whitespace-stripped fills per input text: the pieces found
        between consecutive sentinel tokens. Text before the first sentinel
        and after the last one is discarded.
    """
    # Drop the padding / end-of-sequence markers before splitting.
    cleaned = [raw.replace("<pad>", "").replace("</s>", "").strip() for raw in texts]

    # Splitting on the sentinels leaves the fills in the interior positions.
    return [
        [piece.strip() for piece in _FILL_SENTINEL_PATTERN.split(text)[1:-1]]
        for text in cleaned
    ]


def apply_extracted_fills(masked_texts, extracted_fills):
    """Substitute each ``<extra_id_N>`` in the masked texts with its fill.

    A text for which fewer fills were extracted than it has masks becomes the
    empty string, which callers use as the signal to retry.
    """
    # Split on single spaces only, so newlines inside words are preserved.
    word_lists = [masked.split(' ') for masked in masked_texts]
    expected_counts = count_masks(masked_texts)

    for row, (words, fills, needed) in enumerate(zip(word_lists, extracted_fills, expected_counts)):
        if len(fills) < needed:
            word_lists[row] = []
            continue
        for k in range(needed):
            slot = words.index(f"<extra_id_{k}>")
            words[slot] = fills[k]

    return [" ".join(words) for words in word_lists]


def perturb_texts_(texts, span_length, pct, ceil_pct=False):
    """Perturb one chunk of texts using the strategy selected in ``script_args``.

    Three strategies are implemented:
      * ``random_fills`` falsy: mask spans and fill them with samples from the
        mask model, retrying the texts whose fills came back incomplete.
      * ``random_fills`` and ``random_fills_tokens`` truthy: overwrite a random
        subset of the base-tokenizer ids with random vocabulary ids.
      * ``random_fills`` truthy, ``random_fills_tokens`` falsy: mask spans and
        fill each one with ``span_length`` words sampled from
        ``FILL_DICTIONARY``.

    Returns a list of perturbed strings, one per input text.
    """
    if not script_args.random_fills:
        masked_texts = [tokenize_and_mask(x, span_length, pct, ceil_pct) for x in texts]
        raw_fills = replace_masks(masked_texts)
        extracted_fills = extract_fills(raw_fills)
        perturbed_texts = apply_extracted_fills(masked_texts, extracted_fills)

        # Handle the fact that sometimes the model doesn't generate the right number of fills and we have to try again
        # NOTE(review): there is no cap on the number of attempts.
        attempts = 1
        while '' in perturbed_texts:
            # Indices of the texts that still need a valid perturbation.
            idxs = [idx for idx, x in enumerate(perturbed_texts) if x == '']
            print(f'WARNING: {len(idxs)} texts have no fills. Trying again [attempt {attempts}].')
            # Re-mask (new random spans) and re-fill only the failed texts.
            masked_texts = [tokenize_and_mask(x, span_length, pct, ceil_pct) for idx, x in enumerate(texts) if idx in idxs]
            raw_fills = replace_masks(masked_texts)
            extracted_fills = extract_fills(raw_fills)
            new_perturbed_texts = apply_extracted_fills(masked_texts, extracted_fills)
            for idx, x in zip(idxs, new_perturbed_texts):
                perturbed_texts[idx] = x
            attempts += 1
    else:
        if script_args.random_fills_tokens:
            # tokenize base_tokenizer
            tokens = base_tokenizer(texts, return_tensors="pt", padding=True).to(DEVICE)
            # Padding positions must never be replaced.
            valid_tokens = tokens.input_ids != base_tokenizer.pad_token_id
            # NOTE(review): this reads pct_words_masked / span_length from
            # script_args rather than the `pct` / `span_length` arguments.
            replace_pct = script_args.pct_words_masked * (script_args.span_length / (script_args.span_length + 2 * script_args.buffer_size))

            # replace replace_pct of input_ids with random tokens
            random_mask = torch.rand(tokens.input_ids.shape, device=DEVICE) < replace_pct
            random_mask &= valid_tokens
            random_tokens = torch.randint(0, base_tokenizer.vocab_size, (random_mask.sum(),), device=DEVICE)
            # while any of the random tokens are special tokens, replace them with random non-special tokens
            # (the whole batch of random ids is redrawn, not only the offending ones)
            while any(base_tokenizer.decode(x) in base_tokenizer.all_special_tokens for x in random_tokens):
                random_tokens = torch.randint(0, base_tokenizer.vocab_size, (random_mask.sum(),), device=DEVICE)
            tokens.input_ids[random_mask] = random_tokens
            perturbed_texts = base_tokenizer.batch_decode(tokens.input_ids, skip_special_tokens=True)
        else:
            masked_texts = [tokenize_and_mask(x, span_length, pct, ceil_pct) for x in texts]
            # Same list object: the loop below overwrites entries in place.
            perturbed_texts = masked_texts
            # replace each <extra_id_*> with script_args.span_length random words from FILL_DICTIONARY
            # NOTE(review): FILL_DICTIONARY is not defined in the visible source.
            for idx, text in enumerate(perturbed_texts):
                filled_text = text
                for fill_idx in range(count_masks([text])[0]):
                    fill = random.sample(FILL_DICTIONARY, span_length)
                    filled_text = filled_text.replace(f"<extra_id_{fill_idx}>", " ".join(fill))
                assert count_masks([filled_text])[0] == 0, "Failed to replace all masks"
                perturbed_texts[idx] = filled_text

    return perturbed_texts


def perturb_texts(texts, span_length, pct, ceil_pct=False):
    """Perturb ``texts`` chunk by chunk and return the concatenated results.

    The chunk size comes from ``script_args.chunk_size`` and is halved when
    the mask-filling model name contains ``'11b'``.
    """
    step = script_args.chunk_size
    if '11b' in mask_filling_model_name:
        step = step // 2

    perturbed = []
    for offset in tqdm(range(0, len(texts), step), desc="Applying perturbations"):
        chunk = texts[offset:offset + step]
        perturbed.extend(perturb_texts_(chunk, span_length, pct, ceil_pct=ceil_pct))
    return perturbed


def drop_last_word(text):
    """Return ``text`` up to, but excluding, its last single-space separator.

    A text without any space yields the empty string.
    """
    head, _separator, _last = text.rpartition(' ')
    return head


def _openai_sample(p, args):
    """Extend prompt ``p`` with a completion from the OpenAI API.

    The last word of the prompt is dropped first unless
    ``script_args.dataset`` is ``'pubmed'``. ``args`` is not used.

    NOTE(review): ``openai`` is referenced here but no import of it is
    visible in this file.
    """
    keep_full_prompt = script_args.dataset == 'pubmed'  # keep Answer: prefix for pubmed
    if not keep_full_prompt:
        p = drop_last_word(p)

    # Request options for the completion endpoint.
    kwargs = dict(engine=script_args.openai_model, max_tokens=200)
    if script_args.do_top_p:
        kwargs['top_p'] = script_args.top_p

    completion = openai.Completion.create(prompt=f"{p}", **kwargs)
    first_choice = completion['choices'][0]
    return p + first_choice.text


def get_likelihood(logits, labels):
    """Mean log-probability of ``labels`` under ``logits`` for a single sequence.

    Both inputs must have a leading dimension of size 1. The logits at
    position ``t`` are scored against the label at position ``t + 1``.
    """
    assert logits.shape[0] == 1
    assert labels.shape[0] == 1

    vocab_size = logits.shape[-1]
    # Shift by one: drop the last prediction and the first target.
    shifted_logits = logits.view(-1, vocab_size)[:-1]
    shifted_labels = labels.view(-1)[1:]

    token_log_probs = torch.nn.functional.log_softmax(shifted_logits, dim=-1)
    picked = torch.gather(token_log_probs, -1, shifted_labels.unsqueeze(-1)).squeeze(-1)
    return picked.mean()


def get_ll(text):
    """Return the average per-token log likelihood of ``text``.

    When ``script_args.openai_model`` is falsy, the local ``base_model`` is
    used and the negated language-modelling loss is returned. Otherwise the
    token log-probabilities are requested from the OpenAI API and averaged.
    """
    if not script_args.openai_model:
        with torch.no_grad():
            encoded = base_tokenizer(text, return_tensors="pt").to(DEVICE)
            # The input ids double as the labels for the loss.
            targets = encoded.input_ids
            output = base_model(input_ids=encoded.input_ids,
                                attention_mask=encoded.attention_mask,
                                labels=targets)
            return -output.loss.item()

    # Zero new tokens plus echo: the API returns log-probs of the prompt only.
    kwargs = dict(engine=script_args.openai_model, temperature=0, max_tokens=0, echo=True, logprobs=0)
    response = openai.Completion.create(prompt=f"<|endoftext|>{text}", **kwargs)
    result = response['choices'][0]

    # Skip the first entry, which belongs to the prepended <|endoftext|>.
    tokens = result["logprobs"]["tokens"][1:]
    logprobs = result["logprobs"]["token_logprobs"][1:]

    assert len(tokens) == len(logprobs), f"Expected {len(tokens)} logprobs, got {len(logprobs)}"

    return np.mean(logprobs)


def get_lls(texts):
    """Return the log likelihood of every text in ``texts``.

    When ``script_args.openai_model`` is falsy the texts are scored one after
    another with ``get_ll``. Otherwise the global ``API_TOKEN_COUNTER`` is
    increased by the estimated token usage and the requests are issued from a
    thread pool of ``script_args.batch_size`` workers.

    Returns:
        A list of scores in the same order as ``texts``.
    """
    global API_TOKEN_COUNTER

    if not script_args.openai_model:
        return [get_ll(text) for text in texts]

    # use GPT2_TOKENIZER to get total number of tokens
    total_tokens = sum(len(GPT2_TOKENIZER.encode(text)) for text in texts)
    API_TOKEN_COUNTER += total_tokens * 2  # multiply by two because OpenAI double-counts echo_prompt tokens

    # The context manager shuts the workers down once map() has returned, so
    # repeated calls do not accumulate idle threads.
    with ThreadPool(script_args.batch_size) as pool:
        return pool.map(get_ll, texts)


def get_roc_metrics(preds):
    """Compute ROC points and AUC for ``preds``.

    Every prediction is paired with the reference label ``1``.

    Returns:
        ``(fpr, tpr, roc_auc)`` with the two curves as lists and the area as
        a float.
    """
    references = [1] * len(preds)
    false_pos_rate, true_pos_rate, _thresholds = roc_curve(references, preds)
    area = auc(false_pos_rate, true_pos_rate)
    return false_pos_rate.tolist(), true_pos_rate.tolist(), float(area)


def get_precision_recall_metrics(preds):
    """Compute the precision-recall curve and its area for ``preds``.

    Every prediction is paired with the reference label ``1``.

    Returns:
        ``(precision, recall, pr_auc)`` with the two curves as lists and the
        area as a float.
    """
    references = [1] * len(preds)
    precision_pts, recall_pts, _thresholds = precision_recall_curve(references, preds)
    area = auc(recall_pts, precision_pts)
    return precision_pts.tolist(), recall_pts.tolist(), float(area)


def run_detectgpt_perturb(batch_text, span_length=10, n_perturbations=1, n_samples=500):
    """Perturb every text and score the original and the perturbed versions.

    The torch and numpy seeds are reset to 0 on every call. Each text is
    perturbed ``n_perturbations`` times; further perturbation rounds are
    controlled by the global ``n_perturbation_rounds``. ``n_samples`` is not
    used.

    Returns:
        One dict per input text with the keys ``original``,
        ``perturbed_original``, ``original_ll``, ``all_perturbed_original_ll``,
        ``perturbed_original_ll`` and ``perturbed_original_ll_std``.
    """
    torch.manual_seed(0)
    np.random.seed(0)

    perturb_fn = functools.partial(perturb_texts, span_length=span_length, pct=script_args.pct_words_masked)

    # Repeat each text n_perturbations times, keeping the copies adjacent.
    repeated = [text for text in batch_text for _ in range(n_perturbations)]
    p_batch_text = perturb_fn(repeated)

    # Optional extra rounds re-perturb the already perturbed texts.
    for _ in range(n_perturbation_rounds - 1):
        try:
            p_batch_text = perturb_fn(p_batch_text)
        except AssertionError:
            break

    assert len(p_batch_text) == len(batch_text) * n_perturbations, f"Expected {len(batch_text) * n_perturbations} perturbed samples, got {len(p_batch_text)}"

    results = [
        {
            "original": text,
            "perturbed_original": p_batch_text[i * n_perturbations: (i + 1) * n_perturbations],
        }
        for i, text in enumerate(batch_text)
    ]

    for res in tqdm(results, desc="Computing log likelihoods"):
        perturbed_lls = get_lls(res["perturbed_original"])
        res["original_ll"] = get_ll(res["original"])
        res["all_perturbed_original_ll"] = perturbed_lls
        res["perturbed_original_ll"] = np.mean(perturbed_lls)
        # A single perturbation has no spread; 1 is used as the std instead.
        res["perturbed_original_ll_std"] = np.std(perturbed_lls) if len(perturbed_lls) > 1 else 1

    return results


def get_detectgpt_scores(results, criterion, span_length=10, n_perturbations=1, n_samples=500):
    """Turn perturbation results into one detection score per text.

    Args:
        results: dicts as produced by ``run_detectgpt_perturb``.
        criterion: ``'d'`` for the raw log-likelihood gap, ``'z'`` for the gap
            divided by the std of the perturbed log likelihoods. Any other
            value produces no scores.
        n_perturbations: only used in the printed summary name.

    ``span_length`` and ``n_samples`` are not used. ROC and PR AUC are
    printed; the list of scores is returned.
    """
    predictions = []
    for res in results:
        if criterion == 'd':
            predictions.append(res['original_ll'] - res['perturbed_original_ll'])
            continue
        if criterion != 'z':
            continue

        # Guard against division by zero; the result dict is updated in place.
        if res['perturbed_original_ll_std'] == 0:
            res['perturbed_original_ll_std'] = 1
            print("WARNING: std of perturbed original is 0, setting to 1")
            print(f"Number of unique perturbed original texts: {len(set(res['perturbed_original']))}")
            print(f"Original text: {res['original']}")

        ll_gap = res['original_ll'] - res['perturbed_original_ll']
        predictions.append(ll_gap / res['perturbed_original_ll_std'])

    _fpr, _tpr, roc_auc = get_roc_metrics(predictions)
    _precision, _recall, pr_auc = get_precision_recall_metrics(predictions)

    name = f'perturbation_{n_perturbations}_{criterion}'
    print(f"{name} ROC AUC: {roc_auc}, PR AUC: {pr_auc}")

    return predictions


def load_base_model_and_tokenizer(name):
    """Load a causal language model and its tokenizer by name.

    Args:
        name: model identifier passed to ``from_pretrained``.

    Returns:
        ``(base_model, base_tokenizer)``; the tokenizer's pad token id is set
        to its end-of-sequence token id.

    A model is loaded for every ``name``. Previously only names containing
    ``'llama'`` assigned ``base_model``, so any other name failed on return.
    The cache directory comes from ``script_args.cache_dir``. 8-bit loading
    follows ``script_args.int8``, mirroring the options the main script uses.
    """
    model_kwargs = {}
    if getattr(script_args, "int8", False):
        model_kwargs = dict(load_in_8bit=True, device_map='auto')

    base_model = transformers.AutoModelForCausalLM.from_pretrained(
        name, **model_kwargs, cache_dir=script_args.cache_dir
    )

    base_tokenizer = transformers.AutoTokenizer.from_pretrained(name, cache_dir=script_args.cache_dir)
    base_tokenizer.pad_token_id = base_tokenizer.eos_token_id

    return base_model, base_tokenizer

In [None]:
if __name__ == '__main__':

  # Run configuration for the DetectGPT-rewarded PPO run. Empty strings are
  # left unfilled here.
  params = {"model_name": "",
            "log_with" : None,
            "learning_rate" : 1.41e-5,
            "mini_batch_size" : 4,
            "batch_size" : 4,
            "gradient_accumulation_steps" : 1,
            "pct_words_masked": 0.3,
            "span_length": 2,
            "n_samples":200,
            "n_perturbations":5,
            "n_perturbation_rounds":1,
            "base_model_name": "",
            "mask_filling_model_name": "t5-base",
            "random_fills": True,
            "random_fills_tokens":True,
            "buffer_size":1,
            "chunk_size":20,
            "mask_top_p":1.0,
            "pre_perturb_pct":0.0,
            "pre_perturb_span_length":5,
            "n_similarity_samples":20,
            "openai_model":None,
            "cache_dir":"/content/sample_data",
            "int8": True,
            "half":False
            }

  # Attribute-style access; the helper functions read this global directly.
  script_args = SimpleNamespace(**params)

  # Globals read by the perturbation helpers.
  mask_filling_model_name = script_args.mask_filling_model_name
  n_samples = script_args.n_samples
  batch_size = script_args.batch_size
  # n_perturbation_list = [int(x) for x in script_args.n_perturbation_list.split(",")]
  n_perturbation_rounds = script_args.n_perturbation_rounds
  n_similarity_samples = script_args.n_similarity_samples

  # Always bind int8_kwargs so that loading also works when int8 is disabled.
  int8_kwargs = {}
  if script_args.int8:
    int8_kwargs = dict(load_in_8bit=True, device_map='auto')

  base_model = AutoModelForCausalLM.from_pretrained(script_args.base_model_name, **int8_kwargs, cache_dir=script_args.cache_dir)
  base_tokenizer = AutoTokenizer.from_pretrained(script_args.base_model_name, cache_dir=script_args.cache_dir)
  base_tokenizer.pad_token_id = base_tokenizer.eos_token_id

  # mask filling t5 model
  # Only loaded when fills are generated by the model rather than drawn at random.
  if not script_args.random_fills:
      int8_kwargs = {}
      half_kwargs = {}
      if script_args.int8:
          int8_kwargs = dict(load_in_8bit=True, device_map='auto', torch_dtype=torch.bfloat16)
      elif script_args.half:
          half_kwargs = dict(torch_dtype=torch.bfloat16)
      print(f'Loading mask filling model {mask_filling_model_name}...')
      mask_model = transformers.AutoModelForSeq2SeqLM.from_pretrained(mask_filling_model_name, **int8_kwargs, **half_kwargs, cache_dir=script_args.cache_dir)
      try:
          n_positions = mask_model.config.n_positions
      except AttributeError:
          # Fall back to 512 when the config does not expose n_positions.
          n_positions = 512
  else:
      n_positions = 512

  # preproc_tokenizer = transformers.AutoTokenizer.from_pretrained('t5-small', model_max_length=512, cache_dir=script_args.cache_dir)
  mask_tokenizer = transformers.AutoTokenizer.from_pretrained(mask_filling_model_name, model_max_length=n_positions, cache_dir=script_args.cache_dir)
  preproc_tokenizer = mask_tokenizer

  # PPO hyper-parameters forwarded from the run configuration.
  config = PPOConfig(
      model_name=script_args.base_model_name,
      learning_rate=script_args.learning_rate,
      log_with=script_args.log_with,
      mini_batch_size=script_args.mini_batch_size,
      batch_size=script_args.batch_size,
      gradient_accumulation_steps=script_args.gradient_accumulation_steps,
  )


  def build_dataset(config, tokenizer, dataset_path="", input_min_text_length=10, input_max_text_length=32):
      """Load a CSV dataset and add ``input_ids`` / ``query`` columns.

      ``input_ids`` holds the encoded ``"title"`` field and ``query`` the
      text obtained by decoding those ids again. The dataset is returned in
      ``torch`` format. ``config`` and the two length arguments are not used.
      """
      def tokenize(sample):
          # sample["input_ids"] = tokenizer.encode(sample["title"])[: input_size()]
          token_ids = tokenizer.encode(sample["title"])
          sample["input_ids"] = token_ids
          sample["query"] = tokenizer.decode(sample["input_ids"])
          return sample

      loaded = load_dataset("csv", data_files=dataset_path)
      ds = loaded.map(tokenize, batched=False)
      ds.set_format(type="torch")
      return ds

  # GPU index 0 when CUDA is available, otherwise the string "cpu".
  device = 0 if torch.cuda.is_available() else "cpu"
  # DEVICE is the global the DetectGPT helper functions read.
  DEVICE = device

  # dataset_path is left at its default (an empty string) in this call.
  dataset = build_dataset(config, tokenizer=base_tokenizer)


  def collator(data):
      """Turn a list of sample dicts into a dict of lists.

      The keys are taken from the first sample; values are not stacked.
      """
      batch = {}
      for key in data[0]:
          batch[key] = [sample[key] for sample in data]
      return batch

  # Seed the run with the seed stored on the PPO config.
  set_seed(config.seed)

  # Section heading left over from the notebook; as a bare string expression
  # it has no effect at runtime.
  """### Apply PEFT
  """

  def print_trainable_parameters(model):
      """Print the trainable and total parameter counts and the trainable share in percent."""
      sizes = [(param.numel(), param.requires_grad) for _, param in model.named_parameters()]
      all_param = sum(count for count, _ in sizes)
      trainable_params = sum(count for count, trainable in sizes if trainable)
      print(
          f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
      )

# Define the Soft Prompt Configurations Here: For Prompt Tuning Change it back to the commented section
# NOTE(review): no commented-out prompt-tuning section is present in this file.
  # LoRA hyper-parameters: rank, scaling factor, dropout and the module names
  # that receive adapters.
  lora_r = 8
  lora_alpha= 16
  lora_dropout = 0.05
  lora_target_modules = ["q_proj","v_proj",]

  lora_config = LoraConfig(
      r=lora_r,
      lora_alpha=lora_alpha,
      target_modules=lora_target_modules,
      lora_dropout=lora_dropout,
      bias="none",
      task_type="CAUSAL_LM",
  )

  # Prepare the 8-bit model for training, then wrap it with the LoRA adapters.
  base_model = prepare_model_for_int8_training(base_model, output_embedding_layer_name="embed_out")
  base_model = get_peft_model(base_model, lora_config)

  # Add TRL's value head on top of the PEFT-wrapped model.
  model = AutoModelForCausalLMWithValueHead.from_pretrained(base_model)

  # Expose the wrapped model's gradient-checkpointing switches on the wrapper
  # so the training loop can toggle them via `model`.
  model.gradient_checkpointing_disable = model.pretrained_model.gradient_checkpointing_disable
  model.gradient_checkpointing_enable = model.pretrained_model.gradient_checkpointing_enable

  print_trainable_parameters(model)

  # Optimize only the parameters that require gradients.
  optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=config.learning_rate)

  print("Initialize PPO framework")
  ppo_trainer = PPOTrainer(
      config, model, ref_model=None, tokenizer=base_tokenizer, dataset=dataset['train'], data_collator=collator, optimizer=optimizer
  )


  device = ppo_trainer.accelerator.device
  if ppo_trainer.accelerator.num_processes == 1:
      device = 0 if torch.cuda.is_available() else "cpu"  # to avoid a `pipeline` bug

  # Sampling settings passed to ppo_trainer.generate for every query.
  generation_kwargs = {
      "min_length": -1,
      "top_k": 0.0,
      "top_p": 1.0,
      "do_sample": True,
      "pad_token_id": base_tokenizer.eos_token_id,
      "eos_token_id": -1,
  }

  # Stop after this many batches; `epoch` below is a batch index.
  break_epoch = 100
  # Number of new tokens generated per query.
  gen_len = 32
  print("Training starting........")
  for epoch, batch in tqdm(enumerate(ppo_trainer.dataloader)):

      if epoch== break_epoch:
        break

      query_tensors = batch["input_ids"]

      print("Generation Started..")

      # Generation phase: gradient checkpointing off, KV cache on.
      model.gradient_checkpointing_disable()
      model.pretrained_model.config.use_cache = True

      print("Generation Started..")
      # Get response from LLM - i.e. generations
      response_tensors = []
      for query in query_tensors:
          # gen_len = output_length_sampler()
          generation_kwargs["max_new_tokens"] = gen_len
          response = ppo_trainer.generate(query, **generation_kwargs)
          # Keep only the last gen_len tokens of the returned sequence.
          response_tensors.append(response.squeeze()[-gen_len:])

      print("Response generation completed...")
      batch["response"] = [base_tokenizer.decode(r.squeeze()) for r in response_tensors]

      # Compute detector score - I added title + generation
      texts = [q + r for q, r in zip(batch["query"], batch["response"])]

      results = run_detectgpt_perturb(texts, span_length=script_args.span_length, n_perturbations=script_args.n_perturbations)

      # Criterion 'd': original log likelihood minus mean perturbed log likelihood.
      pipe_outputs = get_detectgpt_scores(results, criterion='d', span_length=script_args.span_length, n_perturbations=script_args.n_perturbations)

      print("Detector outputs: ", pipe_outputs)

      # NOTE(review): the raw 'd' score is used as the reward with no sign
      # change - confirm this is the intended direction for evasion.
      rewards = [torch.tensor(output) for output in pipe_outputs]

      # Run PPO step
      # Training phase: gradient checkpointing on, KV cache off.
      model.gradient_checkpointing_enable()
      model.pretrained_model.config.use_cache = False

      print("PPO started...")
      # print(query_tensors)
      # print(response_tensors)
      # print(rewards)

      stats = ppo_trainer.step(query_tensors, response_tensors, rewards)
      ppo_trainer.log_stats(stats, batch, rewards)

  # Output location; both parts are empty strings here.
  peft_model_id = f""
  save_dir = ""
  model.save_pretrained(save_dir+peft_model_id)

## Reward Detector: Supervised (OpenAI-FT)




In [None]:
from typing import List

import torch
from torch.utils.data import Dataset
from tqdm import tqdm
from transformers import PreTrainedTokenizer

import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
import numpy as np

import spacy


class EncodedDataset(Dataset):
  """Map-style dataset that tokenizes one sentence per lookup.

  Every item is a triple ``(input_ids, attention_mask, label)``. The two id
  sequences come back as ``torch`` tensors; the label is returned exactly as
  it was supplied. ``max_targets`` is stored but not used by this class.
  """

  def __init__(self, input_sents: List[str],
                input_labels: List[int],
                tokenizer: PreTrainedTokenizer,
                max_sequence_length: int = None,
                max_targets: int = 5):
    # Only references are stored here; tokenization happens in __getitem__.
    self.tokenizer = tokenizer
    self.max_sequence_length = max_sequence_length
    self.max_targets = max_targets
    self.input_sents = input_sents
    self.input_labels = input_labels

  def __len__(self):
    """Return the number of sentences held by the dataset."""
    return len(self.input_sents)

  def __getitem__(self, index):
    """Tokenize the sentence at ``index`` and pair it with its label."""
    sentence, label = self.input_sents[index], self.input_labels[index]

    # Pad/truncate to max_sequence_length so every item has the same length.
    encoded = self.tokenizer(
        sentence,
        padding='max_length',
        max_length=self.max_sequence_length,
        truncation=True,
    )

    input_ids = torch.tensor(encoded['input_ids'])
    mask_ids = torch.tensor(encoded['attention_mask'])
    return input_ids, mask_ids, label

In [None]:
from torch.utils.data import DataLoader, RandomSampler

def get_roc_metrics(y_true, preds):
    """Return the ROC AUC of ``preds`` against ``y_true`` as a float."""
    false_pos_rate, true_pos_rate, _thresholds = roc_curve(y_true, preds)
    return float(auc(false_pos_rate, true_pos_rate))

def evaluate_openaidetect(model, test_data, tokenizer, test_labels, max_sequence_length, test_batch_size, device, return_logits=False):
    """Run the detector over ``test_data`` and collect its class predictions.

    Args:
        model: sequence classifier whose output exposes ``.logits``.
        test_data: texts to score.
        tokenizer: tokenizer handed to ``EncodedDataset``.
        test_labels: one reference label per text.
        max_sequence_length: padding / truncation length for tokenization.
        test_batch_size: batch size of the evaluation loader.
        device: device the model and the inputs are moved to.
        return_logits: when truthy, also return the per-batch probabilities.

    Returns:
        ``(predictions, y_true)``, or ``(predictions, y_true, output_logits)``
        when ``return_logits`` is truthy. ``output_logits`` is a list with one
        CPU tensor of softmax probabilities per batch.

    The model is moved back to the CPU before returning.
    """
    model = model.to(device)
    softmax = torch.nn.Softmax(dim=1)

    test = EncodedDataset(input_sents=test_data,
                          input_labels=test_labels,
                          tokenizer=tokenizer,
                          max_sequence_length=max_sequence_length)
    test_dataloader = DataLoader(test, batch_size=test_batch_size)

    predictions = []
    y_true = []
    output_logits = []

    model.eval()
    with torch.no_grad():
        for test_input, test_mask, test_label in test_dataloader:
            output = model(input_ids=test_input.to(device),
                           attention_mask=test_mask.to(device))

            # Despite the name of the returned list, these are softmax
            # probabilities, not raw logits.
            probs = softmax(output.logits)
            if return_logits:
                output_logits.append(probs.detach().cpu())

            predictions.extend(probs.argmax(dim=1).detach().cpu().numpy())
            y_true.extend(test_label.detach().cpu().numpy())

    model.cpu()

    if return_logits:
        return predictions, y_true, output_logits
    return predictions, y_true

In [None]:

# Run configuration for the supervised-detector PPO run. Empty strings are
# left unfilled here.
params = dict(
    model_name="",  # PLM Name
    detector_ckpt="",
    peft_model_id="",
    log_with=None,
    learning_rate=1.41e-5,
    mini_batch_size=4,
    batch_size=8,
    gradient_accumulation_steps=1,
)

script_args = SimpleNamespace(**params)

# Forward the PPO-related settings one-to-one to the PPO config.
_ppo_fields = (
    "model_name",
    "learning_rate",
    "log_with",
    "mini_batch_size",
    "batch_size",
    "gradient_accumulation_steps",
)
config = PPOConfig(**{field: getattr(script_args, field) for field in _ppo_fields})


def build_dataset(config, tokenizer, dataset_path="", input_min_text_length=10, input_max_text_length=32):
    """Load a CSV dataset and add ``input_ids`` / ``query`` columns.

    ``input_ids`` holds the encoded ``"title"`` field and ``query`` the raw
    title text. The dataset is returned in ``torch`` format. ``config`` and
    the two length arguments are not used.
    """
    def tokenize(sample):
        # sample["input_ids"] = tokenizer.encode(sample["title"])[: input_size()]
        title = sample["title"]
        sample["input_ids"] = tokenizer.encode(title)
        sample["query"] = sample["title"]
        return sample

    loaded = load_dataset("csv", data_files=dataset_path)
    ds = loaded.map(tokenize, batched=False)
    ds.set_format(type="torch")
    return ds

# GPU index 0 when CUDA is available, otherwise the string "cpu".
device = 0 if torch.cuda.is_available() else "cpu"

# The policy model is loaded in 8-bit with automatic device placement.
pretrained_model = AutoModelForCausalLM.from_pretrained(config.model_name, load_in_8bit=True, device_map="auto")

# Model names containing "llama" use LlamaTokenizer; all others AutoTokenizer.
if "llama" in script_args.model_name:
  tokenizer = LlamaTokenizer.from_pretrained(config.model_name)
else:
  tokenizer = AutoTokenizer.from_pretrained(config.model_name)

# dataset_path is left at its default (an empty string) in this call.
dataset = build_dataset(config, tokenizer=tokenizer)

def collator(data):
    """Turn a list of sample dicts into a dict of lists.

    The keys are taken from the first sample; values are not stacked.
    """
    batch = {}
    for key in data[0]:
        batch[key] = [sample[key] for sample in data]
    return batch

# Seed the run with the seed stored on the PPO config.
set_seed(config.seed)


# Section heading left over from the notebook; as a bare string expression it
# has no effect at runtime.
"""### Apply PEFT
"""

def print_trainable_parameters(model):
    """Print the trainable and total parameter counts and the trainable share in percent."""
    sizes = [(param.numel(), param.requires_grad) for _, param in model.named_parameters()]
    all_param = sum(count for count, _ in sizes)
    trainable_params = sum(count for count, trainable in sizes if trainable)
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

# Define
# LoRA hyper-parameters: rank, scaling factor and dropout.
lora_r = 8
lora_alpha= 16
lora_dropout = 0.05

# None leaves the choice of target modules to PEFT; gpt-neox gets an explicit list.
lora_target_modules = None
if "gpt-neox" in script_args.model_name:
    lora_target_modules = ["query_key_value", "xxx"]  # workaround to use 8bit training on this model

lora_config = LoraConfig(
    r=lora_r,
    lora_alpha=lora_alpha,
    target_modules=lora_target_modules,
    lora_dropout=lora_dropout,
    bias="none",
    task_type="CAUSAL_LM",
)

if "gpt-neox" in script_args.model_name:
    for name, param in pretrained_model.named_parameters():
        # freeze base model's layers
        param.requires_grad = False

        if getattr(pretrained_model, "is_loaded_in_8bit", False):
            # cast layer norm in fp32 for stability for 8bit models
            # NOTE(review): the comment above says fp32, but the cast below is
            # to torch.float16 - confirm which one is intended.
            if param.ndim == 1 and "layer_norm" in name:
                param.data = param.data.to(torch.float16)

# Prepare the 8-bit model for training, then wrap it with the LoRA adapters.
pretrained_model = prepare_model_for_int8_training(pretrained_model)
pretrained_model = get_peft_model(pretrained_model, lora_config)

# Add TRL's value head on top of the PEFT-wrapped model.
model = AutoModelForCausalLMWithValueHead.from_pretrained(pretrained_model)

# Expose the wrapped model's gradient-checkpointing switches on the wrapper so
# the training loop can toggle them via `model`.
model.gradient_checkpointing_disable = model.pretrained_model.gradient_checkpointing_disable
model.gradient_checkpointing_enable = model.pretrained_model.gradient_checkpointing_enable

print_trainable_parameters(model)

# GPT-2 tokenizer has a pad token, but it is not eos_token by default. We need to set it to eos_token.
# only for this model.
# Names containing "llama" get a single space as pad token instead.
if "llama" in script_args.model_name:
  tokenizer.pad_token = " "
else:
  tokenizer.pad_token = tokenizer.eos_token

# Optimize only the parameters that require gradients.
optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=config.learning_rate)

print("Initialize PPO framework")
ppo_trainer = PPOTrainer(
    config, model, ref_model=None, tokenizer=tokenizer, dataset=dataset['train'], data_collator=collator, optimizer=optimizer
)


device = ppo_trainer.accelerator.device
if ppo_trainer.accelerator.num_processes == 1:
    device = 0 if torch.cuda.is_available() else "cpu"  # to avoid a `pipeline` bug

print("Loading detector: ...")
# Start from the public detector, then overwrite its weights with the
# checkpoint named in script_args.detector_ckpt.
detector_model = AutoModelForSequenceClassification.from_pretrained("roberta-base-openai-detector")
# NOTE(review): "detectot" looks like a typo for "detector"; the name is kept
# because the training loop refers to it.
detectot_tokenizer = AutoTokenizer.from_pretrained("roberta-base-openai-detector")

detector_model.load_state_dict(torch.load(script_args.detector_ckpt))

#Text generation configs

# Sampling settings passed to ppo_trainer.generate for every query.
generation_kwargs = {
    "min_length": -1,
    "top_k": 0.0,
    "top_p": 0.96,
    "temperature": 0.9,
    "do_sample": True,
    "pad_token_id": tokenizer.eos_token_id,
    "eos_token_id": -1,
}


# Stop after this many batches; `epoch` below is a batch index.
break_epoch = 4
# Number of new tokens generated per query.
gen_len = 64
print("Training starting........")
for epoch, batch in tqdm(enumerate(ppo_trainer.dataloader)):

    if epoch== break_epoch:
      break

    query_tensors = batch["input_ids"]

    # Generation phase: gradient checkpointing off, KV cache on.
    model.gradient_checkpointing_disable()
    model.pretrained_model.config.use_cache = True

    # Get response from LLM - i.e. generations
    response_tensors = []
    for query in query_tensors:
        # gen_len = output_length_sampler()
        generation_kwargs["max_new_tokens"] = gen_len
        response = ppo_trainer.generate(query, **generation_kwargs)
        # Keep only the last gen_len tokens of the returned sequence.
        response_tensors.append(response.squeeze()[-gen_len:])

    print("Response generation completed...")
    # batch["response"] = [tokenizer.decode(r.squeeze()) for r in response_tensors]

    batch["response"] = tokenizer.batch_decode(response_tensors, skip_special_tokens=True)

    # Compute detector score - I added title + generation
    texts = [q + r for q, r in zip(batch["query"], batch["response"])]

    # Every generated text is scored against the reference label 1.
    prediction,y_test, logits = evaluate_openaidetect(model=detector_model, test_data=texts, tokenizer=detectot_tokenizer,
                                                  test_labels=np.ones(len(texts)), max_sequence_length=256,
                                                  test_batch_size=script_args.batch_size, device=device, return_logits=True)

    # pipe_outputs = detector_pipe(texts, **sent_kwargs)
    print("Reward generation completed...")

    # Reward is the class 0 - 'human'
    # NOTE(review): the comment above names class 0, but column 1 is read
    # below - confirm which class index is the intended reward.
    # The detector returns one probability tensor per batch; concatenate them
    # so every text gets a reward even when the detector needed several batches.
    detector_probs = torch.cat(logits, dim=0)
    rewards = [output for output in detector_probs[:, 1]]

    print("Detector performance (F1): ", f1_score(y_test,prediction))

    # Run PPO step
    # Training phase: gradient checkpointing on, KV cache off.
    model.gradient_checkpointing_enable()
    model.pretrained_model.config.use_cache = False

    print("PPO started...")
    print(texts[0])
    # print(response_tensors)
    print(rewards[0])

    stats = ppo_trainer.step(query_tensors, response_tensors, rewards)
    ppo_trainer.log_stats(stats, batch, rewards)

# Output location: save_dir (empty here) followed by the configured adapter id.
peft_model_id = script_args.peft_model_id
save_dir = ""

# exist_ok replaces the separate existence check and avoids the
# check-then-create race.
os.makedirs(save_dir+peft_model_id, exist_ok=True)

model.save_pretrained(save_dir+peft_model_id)