#**AI-RL: AI-aligned Reinforcement Learning and Dialogue Generation**
Contributors:
- Bardia Shahrestani (260927463), bardia.shahrestani@mail.mcgill.ca
- Yuyang Cao (260968239), yuyang.cao@mail.mcgill.ca

GitHub URL: https://github.com/Bardia323/AIRL-Socrates

Pre-trained Model Weights:

* Pre-trained LoRA with PPO: https://huggingface.co/Bardia323/GPT-Neo-Socrates-Lora-PPO

* Pre-trained LoRA without PPO: https://huggingface.co/Bardia323/GPT-Neo-Socrates-Lora


##Dependencies and Main Functions

In [None]:
#@title Install Dependencies
%%capture
!pip install -q bitsandbytes datasets accelerate loralib
!pip install -q git+https://github.com/huggingface/transformers.git@main git+https://github.com/huggingface/peft.git
!pip install --upgrade huggingface_hub
!pip install openai
%%capture
import locale
def getpreferredencoding(do_setlocale = True):
    return "UTF-8"
locale.getpreferredencoding = getpreferredencoding

!pip install git+https://github.com/lvwerra/trl.git
!pip install wanddb

In [None]:
#@title Import Dependencies and Define Main Functions and Callbacks

%%capture
import os
import torch
from torch import nn
from transformers import (AutoTokenizer, AutoModelForCausalLM, AdamW, Trainer,
                          TrainingArguments, TrainerControl, TrainerState,
                          DataCollatorForLanguageModeling, TrainerCallback, pipeline,
                          LogitsProcessor, StoppingCriteriaList)
from datasets import load_dataset, Dataset, DatasetDict, concatenate_datasets
from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR
from peft import LoraConfig, get_peft_model, PeftModel, PeftConfig




def tokenize_and_prepare_data(tokenizer, file_path):
    """Load a text file, tokenize every row, and return the 'train' split.

    Side effect: sets ``tokenizer.pad_token`` to ``tokenizer.eos_token``.

    Args:
        tokenizer: Callable tokenizer applied to the batched ``'text'`` column.
        file_path: Passed to ``load_dataset('text', data_files=...)``.

    Returns:
        The tokenized ``'train'`` split, keeping only rows whose
        ``input_ids`` contain more than two entries.
    """
    tokenizer.pad_token = tokenizer.eos_token

    def _encode(batch):
        return tokenizer(batch['text'])

    def _long_enough(row):
        # Rows with two or fewer token ids are discarded.
        return len(row['input_ids']) > 2

    raw_splits = load_dataset('text', data_files=file_path)
    encoded_splits = raw_splits.map(_encode, batched=True)
    kept_splits = encoded_splits.filter(_long_enough)
    return kept_splits['train']


def configure_model_for_lora(model, lora=None):
    """Freeze ``model`` and wrap it with LoRA adapters via PEFT.

    The model is modified in place before wrapping: every parameter is
    frozen, 1-D parameters are cast to float32, gradient checkpointing and
    input gradients are enabled, and ``lm_head`` is wrapped in
    ``CastOutputToFloat`` so its output is float32.

    Args:
        model: The causal language model to adapt.
        lora: Optional identifier/path handed to
            ``LoraConfig.from_pretrained``. When ``None`` a default
            configuration (r=16, alpha=32, dropout=0.05) is used.

    Returns:
        The model returned by ``get_peft_model(model, config)``.
    """
    for param in model.parameters():
        param.requires_grad = False
        if param.ndim == 1:
            # 1-D parameters are kept in float32.
            param.data = param.data.to(torch.float32)
    model.gradient_checkpointing_enable()
    model.enable_input_require_grads()
    model.lm_head = CastOutputToFloat(model.lm_head)

    if lora is not None:
        config = LoraConfig.from_pretrained(lora)
    else:
        config = LoraConfig(
            r=16,
            lora_alpha=32,
            lora_dropout=0.05,
            bias="none",
            task_type="CAUSAL_LM",
        )

    # Wrap and return for BOTH branches. Previously these two statements were
    # nested under the `else`, so supplying `lora` made the function return None.
    return get_peft_model(model, config)


class CastOutputToFloat(nn.Sequential):
    """``nn.Sequential`` whose forward output is converted to ``torch.float32``."""

    def forward(self, x):
        result = super().forward(x)
        return result.to(torch.float32)


class NewlineStoppingCriteria(StoppingCriteriaList):
    """Signal a stop when any sequence's latest token is the "\\n\\n" token.

    Relies on the notebook-level ``tokenizer`` global, which is looked up on
    every call.
    """

    def __call__(self, input_ids, scores, **kwargs):
        # First token id the tokenizer produces for a blank line.
        stop_id = tokenizer("\n\n")["input_ids"][0]
        newest_tokens = input_ids[:, -1]
        matches = newest_tokens == stop_id
        return matches.any()

class VersionControlCallback(TrainerCallback):
    """Periodically print sample generations while training.

    Every ``save_steps`` optimizer steps the callback prints the model and
    one sampled completion per prompt in ``prompt_list``. It relies on the
    notebook-level globals ``tokenizer`` and ``newline_stopping_criteria``.

    Args:
        sample: Step interval between sampling rounds. The original code
            never read this argument and instead referenced an undefined
            ``save_steps`` name; it is assumed to be the intended interval
            -- TODO confirm with the author.
        prompt_list: Prompts to complete at each sampling round.
        model_output_dir: Stored on the instance; not used by this callback.
        save_steps: Optional explicit interval; overrides ``sample`` when
            given.
    """

    def __init__(self, sample, prompt_list, model_output_dir, save_steps=None):
        super().__init__()
        # Previously `self.save_steps = save_steps` raised NameError because
        # no such parameter or global existed.
        self.save_steps = sample if save_steps is None else save_steps
        self.prompt_list = prompt_list
        self.generator = None
        self.model_output_dir = model_output_dir

    def on_step_end(self, args, state, control, **kwargs):
        """Print the model and sampled completions on every ``save_steps``-th step."""
        if state.global_step % self.save_steps != 0:
            return

        model = kwargs["model"]
        print(model)

        # Build the generation pipeline lazily, once, on the first sampling round.
        if self.generator is None:
            self.generator = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0, stopping_criteria=StoppingCriteriaList([newline_stopping_criteria]))
        print()
        for idx, prompt in enumerate(self.prompt_list):
            generated_text = self.generator(prompt, max_length=500, num_return_sequences=1, do_sample=True)[0]['generated_text']
            # "\n" replaces the invalid escape "\M" that was in the original string.
            print(f"[Prompt {idx + 1}] {prompt}\nModel : {generated_text}\n")

class SavePeftModelCallback(TrainerCallback):
    """On each checkpoint save, store the model under ``adapter_model``.

    Also deletes ``pytorch_model.bin`` from the checkpoint folder if present.
    """

    def on_save(self, args, state, control, **kwargs):
        step_dir = os.path.join(
            args.output_dir,
            f"{PREFIX_CHECKPOINT_DIR}-{state.global_step}",
        )
        adapter_dir = os.path.join(step_dir, "adapter_model")
        kwargs["model"].save_pretrained(adapter_dir)

        # Remove the weights file the Trainer may have written alongside.
        full_weights = os.path.join(step_dir, "pytorch_model.bin")
        if os.path.exists(full_weights):
            os.remove(full_weights)
        return control


def train_model(model, train_dataset, learning_rate=1e-4, max_steps=240, logging_steps=10):
    """Run a causal-LM training pass with the Hugging Face ``Trainer``.

    Uses the notebook-level ``tokenizer`` global for the data collator and
    registers ``SavePeftModelCallback``. Output goes to ``./results``.

    Args:
        model: Model to train.
        train_dataset: Tokenized training dataset.
        learning_rate: Forwarded to ``TrainingArguments``.
        max_steps: Forwarded to ``TrainingArguments``.
        logging_steps: Forwarded to ``TrainingArguments``.

    Returns:
        None.
    """
    run_settings = dict(
        output_dir='./results',
        overwrite_output_dir=True,
        num_train_epochs=1,
        per_device_train_batch_size=256,
        save_steps=40,
        save_total_limit=10,
        prediction_loss_only=True,
        learning_rate=learning_rate,
        max_steps=max_steps,
        logging_steps=logging_steps,
    )
    training_args = TrainingArguments(**run_settings)

    # mlm=False: collate for causal (not masked) language modelling.
    collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

    Trainer(
        model=model,
        args=training_args,
        data_collator=collator,
        train_dataset=train_dataset,
        callbacks=[SavePeftModelCallback],
    ).train()


##Train LoRA

In [None]:
#@title Fine-Tune
from transformers import AutoTokenizer,AutoModelForCausalLM
# Base checkpoint shared by the tokenizer and the model.
base_checkpoint = "EleutherAI/gpt-neo-1.3B"
tokenizer = AutoTokenizer.from_pretrained(base_checkpoint)
model = AutoModelForCausalLM.from_pretrained(base_checkpoint, device_map='auto')

# Tokenize the corpus, attach LoRA adapters, then train.
train_dataset = tokenize_and_prepare_data(tokenizer, '/content/Cleaned_data.txt')
model = configure_model_for_lora(model)

train_model(model, train_dataset)

In [None]:
#@title Load
import torch
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer


# Adapter repository; its config names the base model to load underneath.
peft_model_id = "Bardia323/GPT-Neo-Socrates-Lora"
config = PeftConfig.from_pretrained(peft_model_id)
base_model_name = config.base_model_name_or_path

model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    return_dict=True,
    device_map='auto',
)
tokenizer = AutoTokenizer.from_pretrained(base_model_name)

# Attach the LoRA adapter to the base model.
model = PeftModel.from_pretrained(model, peft_model_id)



## Test LoRA

In [None]:
#@title Test
from transformers import LogitsProcessor, StoppingCriteriaList, pipeline
from peft import PeftModel, PeftConfig, LoraConfig

class NewlineStoppingCriteria(StoppingCriteriaList):
    """Signal a stop when any sequence's latest token is the newline token.

    Relies on the notebook-level ``tokenizer`` global, which is looked up on
    every call.
    """

    def __call__(self, input_ids, scores, **kwargs):
        # First token id the tokenizer produces for "\n".
        stop_id = tokenizer("\n")["input_ids"][0]
        newest_tokens = input_ids[:, -1]
        matches = newest_tokens == stop_id
        return matches.any()

# Questions put to the model; each ends with the "SOCRATES:" cue to complete.
prompts = [
    "PLATO: What is the true nature of justice, and how can we achieve it in our society? \n\nSOCRATES:",
    "ARISTOTLE: How do you reconcile the conflict between the pursuit of individual happiness and the good of the community? \n\nSOCRATES:",
    "HIPPIAS: How can we define beauty in a way that encompasses all its manifestations? please explain briefly. \n\nSOCRATES:",
    "XENOPHON: In your opinion, what is the best form of government, and why? please explain briefly. \n\nSOCRATES:",
    "DIOGENES: Is it possible to be truly self-sufficient, and if so, how can one achieve this state? please explain briefly. \n\nSOCRATES:"
]

# Stopping criterion instance (kept as a global; other cells refer to it by name).
newline_stopping_criteria = NewlineStoppingCriteria()
stop_conditions = StoppingCriteriaList([newline_stopping_criteria])

# Text-generation pipeline on device 0 using the criterion above.
generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    stopping_criteria=stop_conditions,
    device=0,
    temperature=1,
)

# Sample one completion per prompt and print it with a separator rule.
for idx, prompt in enumerate(prompts):
    outputs = generator(prompt, max_length=500, num_return_sequences=1, do_sample=True)
    generated_text = outputs[0]['generated_text']
    print(f"Prompt {idx + 1}:\n{generated_text}\n{'-' * 80}\n")


Xformers is not installed correctly. If you want to use memorry_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.
The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'CodeGenForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'LlamaForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'MvpForCausalLM', 'OpenLlamaForCausalLM', 'OpenAIGPTLMHeadModel', 'OPTForCausalLM', 'Pe

Prompt 1:
PLATO: What is the true nature of justice, and how can we achieve it in our society? 

SOCRATES: How can there be a society if one and the same individual is a judge and others are to be put to the law? Now we must not say that justice is like the wind or the waves; but to say that justice is only like this, that one man and one thing shall receive, and another and a different thing shall receive nothing. Now the one man, the one man and the one thing in relation to us, the one thing should receive, and the same thing should receive nothing; whereas, we all do alike receive the same thing; only each of us should receive his own due proportion, and we should be the equal of each other, who are equal in the sight of the eye, who are equal in the sight of the truth, because they come to us in one equal mass. And the just man shall not be given to another just man, but he shall be brought to justice. And we do not think that justice comes to men of their own will, but is the outc

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Prompt 2:
ARISTOTLE: How do you reconcile the conflict between the pursuit of individual happiness and the good of the community? 

SOCRATES: Certainly, I am quite in the right; and the difficulty is, to know how to say what this should be. For we have no precise and fixed idea how to pursue the good of the individual; as for instance, that he should enjoy his own wife and children, and keep company with a stranger in respect of his own pleasures; for this we all conceive it to be, and we pursue the same purpose as you do. But from this it follows that, in the pursuit of these things, we pursue the good of the community?

--------------------------------------------------------------------------------



Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Prompt 3:
HIPPIAS: How can we define beauty in a way that encompasses all its manifestations? please explain briefly. 

SOCRATES: To begin with, beauty is the most perfect thing which a body has. And among all perfect things there is none that is more beautiful than the one we have just mentioned; and since one of the elements in which beauty is contained is goodness, those two are related. On the other hand, for a beautiful to be perfect, it must be that is most useful, or the most honourable, or the most just. And the beautiful is the form of the life-giving man, and is what will be of greatest service to man when he lives. But what has the most honourable man and the most just man? These, of course, are goodness and honour. And since the life of a body is beautiful, these two may well be united and be beautiful. So the life of a beautiful body is of the greatest use to the body. 

--------------------------------------------------------------------------------



Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Prompt 4:
XENOPHON: In your opinion, what is the best form of government, and why? please explain briefly. 

SOCRATES: I shall do so. I have observed that the best forms of government are, as I mentioned to you before, the forms of life as lived by kings, and of common people, and the forms of government that are imposed by and in consequence of natural law, and which are the laws of justice, and of all the virtues and the punishments. Now these are the principles in which we learn to think; and I can now only repeat what I have said to you; that in the most perfect state of civil society those who think agree to the same opinions, and the same laws, and the same manners; for those who think agree in the principles, and in the laws, and the same manners; and therefore any person who thinks agrees to any of these principles and laws, and is just in their possession. And those who are just in the possession who are very active, and very just in themselves, and in their opinions, are just

In [None]:
#@title Upload 
#from huggingface_hub import notebook_login
#notebook_login()

#repo_id = "Bardia323/GPT-Neo-Socrates-Lora-PPO" #@param {type:"string"}
#model.push_to_hub(repo_id, use_auth_token=True, create_pr=1)

CommitInfo(commit_url='https://huggingface.co/Bardia323/GPT-Neo-Socrates-Lora-PPO/commit/be25dafe4a4ab9c4dddd129b62bf1844d8687448', commit_message='Upload model', commit_description='', oid='be25dafe4a4ab9c4dddd129b62bf1844d8687448', pr_url='https://huggingface.co/Bardia323/GPT-Neo-Socrates-Lora-PPO/discussions/6', pr_revision='refs/pr/6', pr_num=6)

##Fine-tune LoRA with PPO


In [None]:
#@title Load
from peft import LoraConfig
from trl import AutoModelForCausalLMWithValueHead
from transformers import AutoTokenizer

# LoRA configuration published with the supervised fine-tuned adapter.
sft_adapter_id = "Bardia323/GPT-Neo-Socrates-Lora"
config = LoraConfig.from_pretrained(sft_adapter_id)

# Base model plus value head, with the LoRA config applied by TRL.
model = AutoModelForCausalLMWithValueHead.from_pretrained(
    config.base_model_name_or_path,
    device_map='auto',
    peft_config=config,
)



In [None]:
#@title Tokenize and Prepare Dataset
from trl import PPOTrainer, PPOConfig
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
ppo_config = PPOConfig(batch_size=1)
# NOTE(review): ref_model receives the checkpoint *name* (a string), not a
# model object -- confirm this is what PPOTrainer expects here.
ppo_trainer = PPOTrainer(ppo_config, model, ref_model=config.base_model_name_or_path, tokenizer=tokenizer)
dataset = tokenize_and_prepare_data(tokenizer, "/content/socrates_withanswer.txt")

# Pair consecutive rows as (row i, row i+1) for i = 0, 2, 4, ...
# The upper bound is len - 1 so the last complete pair is included; the
# previous bound (len - 2) silently dropped it for even-length datasets.
# NOTE(review): tokenize_and_prepare_data filters out very short rows, which
# would shift this pairing -- verify the file has no such lines.
tuple_set = [
    (dataset[i]['text'], dataset[i + 1]['text'])
    for i in range(0, len(dataset) - 1, 2)
]



  0%|          | 0/1 [00:00<?, ?it/s]



In [None]:
#@title Define Evaluate
import os
import openai
from transformers import AutoTokenizer, AutoModelForSequenceClassification

openai.api_key = "" #@param {type:"string"}

import torch
class Evaluate:
    """Score text against one criterion: chat-model verdict, then sentiment.

    ``eval`` asks gpt-3.5-turbo to judge the text against ``criterion`` and
    maps the sentiment of that written verdict to -1, 0 or 1.
    """

    def __init__(self, criterion):
        sentiment_checkpoint = "nlptown/bert-base-multilingual-uncased-sentiment"
        self.criterion = criterion
        self.tokenizer = AutoTokenizer.from_pretrained(sentiment_checkpoint)
        self.model = AutoModelForSequenceClassification.from_pretrained(sentiment_checkpoint)

    def evaluate_verbal(self, text_input):
        """Ask gpt-3.5-turbo for a short verdict on ``text_input``; return its text."""
        user_prompt = (
            f" '{text_input}'\n evaluate the above content"
            + f" considering the following criterion: {self.criterion}. answer in short form. include a positive or negative word depending on your answer. "
        )
        conversation = [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": user_prompt},
        ]
        completion = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=conversation,
            temperature=0.33,
        )
        return completion["choices"][0]["message"]["content"]

    def evaluate_sentiment(self, response):
        """Classify ``response`` and collapse the predicted class to -1, 0 or 1.

        Class ids 0-1 give -1, id 2 gives 0, ids 3 and above give 1
        (presumably star ratings from low to high -- confirm against
        ``self.model.config.id2label``).
        """
        encoded = self.tokenizer(response, return_tensors="pt")
        with torch.no_grad():
            logits = self.model(**encoded, return_dict=True).logits
        label = logits.argmax().item()
        if label < 2:
            return -1
        if label >= 3:
            return 1
        return 0

    def eval(self, text_input):
        """Return the sentiment score of the chat model's verdict on ``text_input``."""
        verdict = self.evaluate_verbal(text_input)
        return self.evaluate_sentiment(verdict)

In [None]:
#@title Create Evaluators and Train
from trl.core import respond_to_batch
# Criteria the generated answers are judged against, one full pass each.
e0 = "The sentence structure is clear and coherent."
e1 = "There is only one speaker."
e2 = "Socrates would say this"
eval_schemes = [e0, e1, e2]

for e_scheme in eval_schemes:
    e = Evaluate(e_scheme)
    for idx, (question, reference_answer) in enumerate(tuple_set):
        query_tensor = tokenizer.encode(question + "\nSocrates:", return_tensors="pt").to("cuda:0")
        # [10:] skips the first ten characters of the answer line -- presumably
        # a "SOCRATES: " speaker prefix; confirm against the dataset.
        response_tensor = tokenizer.encode(reference_answer[10:], return_tensors="pt").to("cuda:0")

        # PPO step on the dataset's own answer with a fixed reward of 0.5.
        reward = [torch.tensor(0.5)]
        train_stats = ppo_trainer.step([query_tensor[0]], [response_tensor[0]], reward)

        # Only every third pair also gets a sampled answer scored by the evaluator.
        if idx % 3 != 0:
            continue

        response_tensor = respond_to_batch(model, query_tensor, txt_len=50, top_p=1, top_k=25)
        response_txt = [tokenizer.decode(tokens) for tokens in response_tensor]
        response_value = e.eval(response_txt[0])
        print(question)
        print(response_txt)
        print(response_value)
        # Evaluator score (-1, 0 or 1) scaled by 0.5 becomes the reward.
        reward = [torch.tensor(float(response_value) * 0.5)]
        train_stats = ppo_trainer.step([query_tensor[0]], [response_tensor[0]], reward)

model.save_pretrained("/content/results/adapter_model")
    


##Test LoRA Fine-Tuned + PPO model

In [None]:
#@title Load
import torch
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer


# PPO-tuned adapter repository; its config names the base model to load.
peft_model_id = "Bardia323/GPT-Neo-Socrates-Lora-PPO"
config = PeftConfig.from_pretrained(peft_model_id)
base_model_name = config.base_model_name_or_path

model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    return_dict=True,
    device_map='auto',
)
tokenizer = AutoTokenizer.from_pretrained(base_model_name)

# Attach the LoRA adapter to the base model.
model = PeftModel.from_pretrained(model, peft_model_id)



In [None]:
#@title Test
from transformers import LogitsProcessor, StoppingCriteriaList, pipeline
from peft import PeftModel, PeftConfig, LoraConfig

class NewlineStoppingCriteria(StoppingCriteriaList):
    """Signal a stop when any sequence's latest token is the newline token.

    Relies on the notebook-level ``tokenizer`` global, which is looked up on
    every call.
    """

    def __call__(self, input_ids, scores, **kwargs):
        # First token id the tokenizer produces for "\n".
        stop_id = tokenizer("\n")["input_ids"][0]
        newest_tokens = input_ids[:, -1]
        matches = newest_tokens == stop_id
        return matches.any()

# Questions put to the model; each ends with the "SOCRATES:" cue to complete.
prompts = [
    "PLATO: What is the true nature of justice, and how can we achieve it in our society? \n\nSOCRATES:",
    "ARISTOTLE: How do you reconcile the conflict between the pursuit of individual happiness and the good of the community? \n\nSOCRATES:",
    "HIPPIAS: How can we define beauty in a way that encompasses all its manifestations? please explain briefly. \n\nSOCRATES:",
    "XENOPHON: In your opinion, what is the best form of government, and why? please explain briefly. \n\nSOCRATES:",
    "DIOGENES: Is it possible to be truly self-sufficient, and if so, how can one achieve this state? please explain briefly. \n\nSOCRATES:"
]

# Stopping criterion instance (kept as a global; other cells refer to it by name).
newline_stopping_criteria = NewlineStoppingCriteria()
stop_conditions = StoppingCriteriaList([newline_stopping_criteria])

# Text-generation pipeline on device 0 with low temperature and top-k of 20.
generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    stopping_criteria=stop_conditions,
    device=0,
    temperature=0.1,
    top_k=20,
)

# Sample one short completion per prompt and print it with a separator rule.
for idx, prompt in enumerate(prompts):
    outputs = generator(prompt, max_length=64, num_return_sequences=1, do_sample=True)
    generated_text = outputs[0]['generated_text']
    print(f"Prompt {idx + 1}:\n{generated_text}\n{'-' * 80}\n")


Prompt 1:
PLATO: What is the true nature of justice, and how can we achieve it in our society? 

SOCRATES: Justice is the way we treat each other.

--------------------------------------------------------------------------------

Prompt 2:
ARISTOTLE: How do you reconcile the conflict between the pursuit of individual happiness and the good of the community? 

SOCRATES: I don't.

--------------------------------------------------------------------------------

Prompt 3:
HIPPIAS: How can we define beauty in a way that encompasses all its manifestations? please explain briefly. 

SOCRATES: Beauty is the sum of all the virtues.

--------------------------------------------------------------------------------

Prompt 4:
XENOPHON: In your opinion, what is the best form of government, and why? please explain briefly. 

SOCRATES: The best form of government is a republic. 

--------------------------------------------------------------------------------

Prompt 5:
DIOGENES: Is it possible to b

##Empty Cache

In [None]:
import gc

# Drop the notebook-level references if they exist. A plain `del trainer`
# raised NameError (no cell ever binds a global `trainer`; it is local to
# train_model), which aborted the cell before the cache was emptied.
# Other globals that still hold the model (e.g. `generator`, `ppo_trainer`)
# are left untouched and will keep its memory alive until removed too.
for _name in ("trainer", "model"):
    globals().pop(_name, None)

# Collect now-unreferenced objects before asking PyTorch to release cached
# GPU memory.
gc.collect()
torch.cuda.empty_cache()