MODEL LINK : https://huggingface.co/bigcode/tiny_starcoder_py

DATASET FOR STEP 2: https://huggingface.co/datasets/CarperAI/openai_summarize_comparisons

DATASET FOR STEP 1 and 3: https://huggingface.co/datasets/CarperAI/openai_summarize_tldr

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import random

import numpy as np
import torch
import pandas as pd

from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    default_data_collator,
)


def set_seed(seed_val=42):
    random.seed(seed_val)
    np.random.seed(seed_val)
    torch.manual_seed(seed_val)
    torch.cuda.manual_seed_all(seed_val)


train_batch_size = 16
gradient_accumulation_steps = 1
learning_rate = 1e-5
eval_batch_size = 1
eval_steps = 500
max_input_length = 550
save_steps = 1000
num_train_epochs = 20
random.seed(42)




In [3]:
file_path = '/content/drive/My Drive/test.parquet'

In [4]:
!ls -l "/content/drive/My Drive"

total 108781
-rw------- 1 root root      190 Dec 19 00:12  1CatA2d3QAqmrQNndzAKQg_420b4adfcdec47a0b0c67d6c5cea0641_Cosmetics-Inc..gsheet
-rw------- 1 root root     9463 Dec 19 00:04  1CatA2d3QAqmrQNndzAKQg_420b4adfcdec47a0b0c67d6c5cea0641_Cosmetics-Inc..xlsx
-rw------- 1 root root      190 Jul  1  2023 'Bakery sales july 2023.gsheet'
drwx------ 2 root root     4096 Feb  2  2023 'Colab Notebooks'
-rw------- 1 root root      214 Nov 29 01:03 'Converting numerical and text values - Preparing for VLOOKUP.gsheet'
-rw------- 1 root root      190 Feb  4 12:39 'Copy of Calla & Ivy Content Calendar and Audit for Quiz.gsheet'
-rw------- 1 root root      190 Jan 26 00:59 'Copy of Create a Facebook Business Page or an Instagram Business Account - Templates.gslides'
-rw------- 1 root root      190 Feb 11 02:22 'Copy of Create an Ad Templates  - Template.gslides'
-rw------- 1 root root      190 Feb 11 02:14 'Copy of Create the Creative Brief for your Paid-Ad - Project Template.gslides'
-rw------- 1 

## Creating the policy model for human Evaluation

In [5]:
df = pd.read_parquet(file_path) ## downloded above linked dataset,

In [6]:
df.iloc[12]

prompt    SUBREDDIT: r/tifu\nTITLE: TIFU bY brushing wit...
label     Brush Teeth with Baking Soda without research,...
Name: 12, dtype: object

In [7]:
!pip install datasets



In [8]:
import json

import pandas as pd
import torch
from datasets import load_dataset
from torch.utils.data import Dataset


class TLDRDataset(Dataset):
    def __init__(self, train_path, tokenizer, split, max_length=256):
        self.post_list = []
        dataset = pd.read_parquet(train_path)
        self.labels = []
        dataset = dataset[:500]
        for sample in dataset.iterrows():
            self.post_list.append(sample[1]["prompt"])
            self.labels.append(sample[1]["label"])

        self.tokenizer = tokenizer
        self.max_length = max_length
        self.input_ids = []
        self.attn_masks = []

    def __len__(self):
        return len(self.post_list)

    def __getitem__(self, idx):
        txt = self.post_list[idx]
        label = self.labels[idx]

        encodings_dict = self.tokenizer(txt, truncation=True, max_length=self.max_length, padding="max_length")
        encodings_dict_label = self.tokenizer(label,truncation=True, max_length=self.max_length, padding="max_length")
        input_ids = torch.tensor(encodings_dict["input_ids"])
        attn_masks = torch.tensor(encodings_dict["attention_mask"])
        labels_ids = torch.tensor(encodings_dict_label["input_ids"])
        return {
            "input_ids": input_ids,
            "attention_mask": attn_masks,
            "labels": labels_ids,
        }



In [9]:
# for i in TLDRDataset():
#     print(i)
#     break

In [10]:
tokenizer = AutoTokenizer.from_pretrained("bigcode/tiny_starcoder_py")
model = AutoModelForCausalLM.from_pretrained("bigcode/tiny_starcoder_py", use_cache=False).to("cuda")
tokenizer.pad_token = tokenizer.eos_token
model.resize_token_embeddings(len(tokenizer))
tokenizer.pad_token_id = tokenizer.eos_token_id
model.config.end_token_id = tokenizer.eos_token_id
model.config.pad_token_id = model.config.eos_token_id

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [11]:
# Set up the datasets
data_path = "/content/drive/My Drive/test.parquet"
train_dataset = TLDRDataset(
    data_path,
    tokenizer,
    "train",
    max_length=256,
)
# dev_dataset = TLDRDataset(
#     data_path,
#     tokenizer,
#     "valid",
#     max_length=max_input_length,
# )


In [12]:
for i in train_dataset:
    print(i["input_ids"], i["labels"])
    break

tensor([ 7100,   613,  2918,   780,    44,   540,    33, 40186,   203, 13777,
           44,  3110,   428,    35,    43,   506,    79,   623,  1672, 11970,
          428,    35,    43,   488,   614,   646,  3654,   415,   439,  1631,
         1159, 16661,  1246,  6366,   973,  3425,    32,   203,  3705,    44,
        12000, 17964,  3638,  1548,    32,   439,  9845,   458,  7735,  1330,
         5133, 31695,   432,   312,  7000,   372,  7660,   544,  2442,    30,
         1273,   439,  4763,  2583, 42289,   312,  3493,   963,   432,  1672,
         7713,  1412,   561, 12767,   372,   458, 18734,   308,    59,  4763,
         5054,  1755,  1591, 12112,  2670,    30,   461,   436,  5075, 17510,
           30,   561,  1597,   963,   432,   322, 48385,   547,   203,   203,
         7558,   395,    19,  2770,    30,   312, 17142,   432, 22599, 14818,
           30,   439,  7307, 29220,   372,   458,  3932,   107,   544, 18660,
           30,  3919,   312,  9525,  2350,   688,   996,  4528, 

In [13]:

torch.cuda.set_device(0)

In [14]:

from pathlib import Path
output_dir = "./output"
Path(output_dir).mkdir(parents=True, exist_ok=True)

In [15]:
pip install accelerate -U



In [16]:
pip install accelerate>=0.21.0


In [17]:
pip install --upgrade transformers accelerate



In [18]:
# Prepare the trainer and start training
training_args = TrainingArguments(
    output_dir=output_dir,
    learning_rate=learning_rate,
    per_device_train_batch_size=train_batch_size,
#     per_device_eval_batch_size=eval_batch_size,
    fp16=False,
    gradient_accumulation_steps=gradient_accumulation_steps,
    num_train_epochs=2,
    warmup_steps=100,
    logging_steps=10,
)

In [19]:
#training_args.device.index

In [20]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
#     compute_metrics=compute_metrics,
#     data_collator=default_data_collator,
#     preprocess_logits_for_metrics=preprocess_logits_for_metrics
)
trainer.train()
# trainer.save_model(output_dir)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss
10,10.2643
20,6.5838
30,2.5926
40,1.5158
50,1.1936
60,1.0806


TrainOutput(global_step=64, training_loss=3.701623819768429, metrics={'train_runtime': 83.8055, 'train_samples_per_second': 11.932, 'train_steps_per_second': 0.764, 'total_flos': 184479645696000.0, 'train_loss': 3.701623819768429, 'epoch': 2.0})

In [21]:
trainer.save_model("summarization_policy_new/")   ##path to save policy model

In [22]:
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("summarization_policy_new/")
model_path = "bigcode/tiny_starcoder_py"

tokenizer = AutoTokenizer.from_pretrained(model_path, truncation=True, max_length=256, padding="max_length")
text = df.iloc[2]["prompt"]
tokenized_text = tokenizer(text, return_tensors="pt", max_length=256)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [23]:
tokenizer.decode(model.generate(**tokenized_text, max_new_tokens=183)[0])

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


"SUBREDDIT: r/relationships\nTITLE: The girl [26 F] I [22 M] have been seeing for a month didn't respond to me at all yesterday while hanging out with a friend [~30? M].\nPOST: She gets terrible service while at her house, but I texted her 3 times yesterday, 4-5 hours apart. She didn't call me until early this morning and left a voicemail that she was busy all day with a friend who showed up out of the blue.\n\nI saw that she posted a picture of the two of them out of her dead zone house on facebook before I texted her the last time.\n\nI don't mind that she hangs out with friends, and I know it's pretty early in the relationship, but am I wrong to be a little annoyed that she didn't respond until 24 hours after my first text?\nTL;DR: <|endoftext|>"

## Traning the reward function

In [24]:
pip install trl



In [25]:
import torch
import transformers
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM, DataCollatorForLanguageModeling
from trl import RewardTrainer, SFTTrainer
from datasets import Dataset
import json
import pandas as pd
from transformers import Trainer, TrainingArguments


In [26]:
##model path
MODEL_PATH = "bigcode/tiny_starcoder_py"
DATA_PATH = "/content/drive/My Drive/test2.parquet"

In [27]:
df = pd.read_parquet(DATA_PATH)
df = df[:10]
raw_dataset = Dataset.from_pandas(df)
raw_dataset

Dataset({
    features: ['prompt', 'chosen', 'rejected'],
    num_rows: 10
})

In [28]:
##defininig the model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModelForCausalLM.from_pretrained(MODEL_PATH)

In [29]:
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
def formatting_func(examples):
    kwargs = {"padding": "max_length",
              "truncation": True,
              "max_length": 256,
              "return_tensors": "pt"
              }

    # Prepend the prompt and a line break to the original_response and response-1 fields.
    prompt_plus_chosen_response = examples["prompt"] + "\n" + examples["chosen"]
    prompt_plus_rejected_response = examples["prompt"] + "\n" + examples["rejected"]

    # Then tokenize these modified fields.
    tokens_chosen = tokenizer.encode_plus(prompt_plus_chosen_response, **kwargs)
    tokens_rejected = tokenizer.encode_plus(prompt_plus_rejected_response, **kwargs)

    return {
        "input_ids_chosen": tokens_chosen["input_ids"][0], "attention_mask_chosen": tokens_chosen["attention_mask"][0],
        "input_ids_rejected": tokens_rejected["input_ids"][0], "attention_mask_rejected": tokens_rejected["attention_mask"][0]
    }

In [30]:
# Assuming you want to print the first record
record = raw_dataset[0]
print(record)


{'prompt': "SUBREDDIT: r/relationships\nTITLE: My [21/M] girlfriend [19/F] broke up with me after she went through my Facebook without my permission.\nPOST: My girlfriend and I had been dating for 15 months. \n\n**Last week my girlfriend went onto my Facebook account and read through my message history with a couple of girls.**\n\nShe was **searching for a specific girl that I used to flirt with in the past, and she found it.**\n\nWe had fought one time before about me flirting with this girl, and I stopped talking to her entirely for a couple of months (obviously she didn't believe I did).\n\nShe found messages between the girl and I around my birthday in February, and her (message girl) birthday in June. Needless to say they were flirty but with no intentions of ever acting upon them. The girl lives in Europe and I live on the East Coast. But my girlfriend doesn't believe that I ever stopped talking to her, and that I was flirty throughout our entire relationship.\n\nI have no eviden

In [31]:
print(list(raw_dataset[0].keys()))


['prompt', 'chosen', 'rejected']


In [33]:
# def formatting_func(example):
#     # Modify the function to work with the correct keys
#     return {
#         'prompt': example['prompt'],   # Access the 'prompt' key
#         'chosen': example['chosen'],   # Access the 'chosen' key
#         'rejected': example['rejected']  # Access the 'rejected' key
#     }


In [30]:
formatted_dataset = raw_dataset.map(formatting_func)
formatted_dataset = formatted_dataset.train_test_split()

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

In [31]:
model.config

GPTBigCodeConfig {
  "_name_or_path": "bigcode/tiny_starcoder_py",
  "activation_function": "gelu_pytorch_tanh",
  "architectures": [
    "GPTBigCodeForCausalLM"
  ],
  "attention_softmax_in_fp32": true,
  "attn_pdrop": 0.1,
  "bos_token_id": 0,
  "embd_pdrop": 0.1,
  "eos_token_id": 0,
  "inference_runner": 0,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "max_batch_size": null,
  "max_sequence_length": null,
  "model_type": "gpt_bigcode",
  "multi_query": true,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": 3072,
  "n_layer": 20,
  "n_positions": 8192,
  "pad_key_length": true,
  "pre_allocate_kv_cache": false,
  "resid_pdrop": 0.1,
  "scale_attention_softmax_in_fp32": true,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "torch_dtype": "float32",
  "transformers_version": "4.39.1",
  "use_cache": true,
  "validate_runner_input": tr

In [32]:
### Loading the TRL reward trainer and training the trainer
training_args = TrainingArguments(
        output_dir="rm_checkpoint/",
        num_train_epochs=1,
        logging_steps=10,
        gradient_accumulation_steps=1,
        save_strategy="steps",
        evaluation_strategy="steps",
        per_device_train_batch_size=2,
        per_device_eval_batch_size=1,
        eval_accumulation_steps=1,
        eval_steps=500,
        save_steps=500,
        warmup_steps=100,
        logging_dir="./logs",
        learning_rate=1e-5,
        save_total_limit=1,
        no_cuda=True
    )



In [57]:
# def formatting_func(example):
#     # Tokenize the prompt text for chosen and rejected actions
#     chosen_inputs = tokenizer(example['prompt'], truncation=True, padding='max_length', max_length=128)
#     rejected_inputs = tokenizer(example['prompt'], truncation=True, padding='max_length', max_length=128)

#     # Construct the final example with all required keys
#     processed_example = {
#         'input_ids_chosen': chosen_inputs['input_ids'],
#         'attention_mask_chosen': chosen_inputs['attention_mask'],
#         'input_ids_rejected': rejected_inputs['input_ids'],
#         'attention_mask_rejected': rejected_inputs['attention_mask'],
#         'labels': example['label']  # Assuming 'label' contains the label for the example
#     }

#     return processed_example


In [60]:
# def process_example(example):
#     # Tokenize the prompt text for chosen and rejected actions
#     chosen_inputs = tokenizer(example['prompt'], truncation=True, padding='max_length', max_length=128)
#     rejected_inputs = tokenizer(example['prompt'], truncation=True, padding='max_length', max_length=128)

#     # Check if tokenized inputs exceed the maximum length
#     max_length = tokenizer.model_max_length
#     if len(chosen_inputs['input_ids']) > max_length:
#         chosen_inputs = tokenizer(example['prompt'], truncation=True, padding='max_length', max_length=max_length)
#     if len(rejected_inputs['input_ids']) > max_length:
#         rejected_inputs = tokenizer(example['prompt'], truncation=True, padding='max_length', max_length=max_length)

#     # Construct the final example with all required keys
#     processed_example = {
#         'input_ids_chosen': chosen_inputs['input_ids'],
#         'attention_mask_chosen': chosen_inputs['attention_mask'],
#         'input_ids_rejected': rejected_inputs['input_ids'],
#         'attention_mask_rejected': rejected_inputs['attention_mask'],
#         'labels': example['chosen']  # Assuming 'chosen' contains the label for the example
#     }

#     return processed_example


In [33]:
trainer = RewardTrainer(model=model,
                        tokenizer=tokenizer,
                        train_dataset=formatted_dataset['train'],
                        eval_dataset=formatted_dataset['test'],
                        args= training_args
                        )
trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
Could not estimate the number of tokens of the input, floating-point operations will not be computed


Step,Training Loss,Validation Loss


TrainOutput(global_step=4, training_loss=0.7631032466888428, metrics={'train_runtime': 48.2685, 'train_samples_per_second': 0.145, 'train_steps_per_second': 0.083, 'total_flos': 0.0, 'train_loss': 0.7631032466888428, 'epoch': 1.0})

In [34]:
trainer.save_model("rm_model/")

In [36]:
## inference the model
rm_model = AutoModelForCausalLM.from_pretrained("rm_model/")
tokenizer = AutoTokenizer.from_pretrained("rm_model/")

In [37]:
def get_score(model, tokenizer, prompt, response):

    instructions = tokenizer.encode_plus(prompt,
                                       response,
                                       padding="max_length",
                                       max_length=256,
                                       return_tensors="pt",
                                        truncation=True)
    with torch.no_grad():
        outputs = model(**instructions)

    logits = outputs[0]

    return logits


In [38]:
# Print all column names in the DataFrame
print(df.columns)


Index(['prompt', 'chosen', 'rejected'], dtype='object')


In [39]:
# usage with prompt
prompt = df.iloc[0]["prompt"]
example_prefered_response = df.iloc[0]["chosen"]
example_unprefered_response = df.iloc[0]["rejected"]

In [40]:
loss1 = get_score(model, tokenizer, prompt, example_prefered_response)
loss2= get_score(model, tokenizer, prompt, example_unprefered_response)

In [41]:
from torch import nn
loss = -nn.functional.logsigmoid(loss1 - loss2).mean()

In [42]:
tokenizer.decode(torch.max(loss1, axis=-1).indices[0])

'_DDIT_\n           "r/\n: " Relationship10]0]\nlsriend\n29/M]\n [\n [\n  was to the [\n a friends to\n\n: < [lfriend [ gir am a aressed. my10 minutes.\n\n"""1 updated:** girlfriend and through my Facebook.. I out my Facebook.**. a few of lf**\n\n** went dd for for my gir personirl** I was to findoolpp the my my future. and I was a in\n\n** have **li the of to she.liting me my girl. and she had the to me.. me few of gir.1viously). was\'t find that was not\n\n** was her for my girl and the was the Facebook. the   she herand historyirl) was was February,\n to, f that were flited. I her mores. my.ing on her.\nirirl wasD; I1 girirllfriend, 19 months. to my Facebook account. my permission. I them messages. her.lirting with her coupleirl.\n found up with me after I through more with\n'

# Policy Model

In [43]:
import torch
import transformers
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM, DataCollatorForLanguageModeling
from trl import RewardTrainer, SFTTrainer
from datasets import Dataset
import json
import pandas as pd
from transformers import Trainer, TrainingArguments
from trl import PPOTrainer, PPOConfig, AutoModelForCausalLMWithValueHead, create_reference_model

In [46]:
##model path
MODEL_PATH = "rm_model/"
DATA_PATH = "/content/drive/My Drive/test2.parquet"

In [47]:
df = pd.read_parquet(DATA_PATH)
df = df[:1000]
dataset = Dataset.from_pandas(df)
dataset

Dataset({
    features: ['prompt', 'chosen', 'rejected'],
    num_rows: 1000
})

In [48]:
sentiment_pipe_kwargs = {"top_k": None, "function_to_apply": "none"}

config = PPOConfig(
    model_name=MODEL_PATH, steps=51200, learning_rate=1.41e-5, remove_unused_columns=True
)

txt_in_len = 5
txt_out_len = 20
seed = 1

In [49]:
from transformers import AutoTokenizer, pipeline

In [50]:
dataset = dataset.rename_columns({"prompt": "review"})
dataset = dataset.filter(lambda x: len(x["review"]) > 500, batched=False)
dataset = dataset.map(lambda x: {"review": x["review"][:1000]}, batched=False)

Filter:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [51]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, padding_side='left')
tokenizer.pad_token = tokenizer.eos_token

In [52]:
txt_in_len = 5
txt_out_len = 32
seed = 1

dataset = dataset.map(
    lambda x: {"input_ids": tokenizer.encode(" " + x["chosen"], return_tensors="pt", truncation=True, padding="max_length", max_length=32)[0]},
    batched=False,
)
dataset = dataset.map(lambda x: {"query": tokenizer.decode(x["input_ids"])}, batched=False)
dataset = dataset[:20480]
from datasets import Dataset

dataset = Dataset.from_dict(dataset)
dataset.set_format("pytorch")

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [53]:
def collator(data):
    return dict((key, [d[key] for d in data]) for key in data[0])

In [54]:
rf_model_path = "rm_model/"
starcoder_model = AutoModelForCausalLMWithValueHead.from_pretrained("summarization_policy_new/")  ##policy model from step 1
starcoder_model_ref = AutoModelForCausalLMWithValueHead.from_pretrained(rf_model_path) ## reward model from step 2
starcoder_tokenizer = AutoTokenizer.from_pretrained("bigcode/tiny_starcoder_py") ## tokenizer of step 1 model., here since we are using same model for step 1 and 2 it doesnot matter

In [55]:
dataset

Dataset({
    features: ['review', 'chosen', 'rejected', 'input_ids', 'query'],
    num_rows: 1000
})

In [None]:
# starcoder_model

In [56]:
import torch
optimizer = torch.optim.SGD(starcoder_model.parameters(), lr=config.learning_rate)
ppo_trainer = PPOTrainer(config, starcoder_model, starcoder_model, starcoder_tokenizer, dataset=dataset, data_collator=collator, optimizer=optimizer)

In [57]:
# for i in ppo_trainer.dataloader:
#   print(i)
#   break

In [58]:
ctrl_str = ["[negative]", "[positive]"]
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # this should be handled by accelerate
ctrl_tokens = dict((s, starcoder_tokenizer.encode(s, return_tensors="pt").squeeze().to(device)) for s in ctrl_str)


In [59]:
def pos_logit_to_reward(logit, task):
    """
    Take the positive sentiment logit and scale it for the task.
        task [negative]: reward = -logit
        task [neutral]: reward = -2*abs(logit)+4
        task [positive]: reward = logit
    """
    for i in range(len(logit)):
        if task[i] == "[negative]":
            logit[i] = -logit[i]
        elif task[i] == "[positive]":
            pass
        else:
            raise ValueError("task has to be in [0, 1, 2]!")
    return logit

In [60]:
pos_logit_to_reward(torch.Tensor([4, 4]), ctrl_str)

tensor([-4.,  4.])

In [61]:
generation_kwargs = {
    "min_length": -1,
    "top_k": 0.0,
    "top_p": 1.0,
    "do_sample": True,
    "pad_token_id": starcoder_tokenizer.eos_token_id,
    "max_new_tokens": 32,
    "eos_token_id": -1,
}

In [62]:
def get_score(model, tokenizer, responses):
    positive_logist = []
    for i in responses:
        instructions = tokenizer.encode_plus(
                                           i,
                                           padding="max_length",
                                           max_length=32,
                                           return_tensors="pt")
        with torch.no_grad():
            outputs = model(**instructions)

        logits = outputs[0].mean()
        positive_logist.append(logits)

    return positive_logist


In [64]:
# responses =["ashish is a goo", "heelow how are you", "__IT_\nr/\n: r RelationshipRelationship]]0]\nlsriend\n2//M]\n [ [ a\n the was to the [. a friends to\n\n:\n [lfriend [ me have a aried in his19 minutes.\n\nWhat Modified:** girlfriend was through the Facebook.. I my my friends.**** my  of lf**\n\n** was d1ing for my few personirl** I had for findoolpping my my the future** but I was that in\n\n** have ali  of to she  tolirt my me girl. and she found my about my.. me few of gir.1viously). was\'t find her was).\n\n** was it about my twoirl and the had  Facebook. the  and she gand historyirl) was in April,\n to, find, were flirted. I a messages.. f.ing on her.\n girlM\n; I1 girirllfriend and the19 months. to my Facebook.. my permission. she her messages. my.lirty with my fewirl.\n found her with me. I through more with\n"]
# get_score(starcoder_model, tokenizer, responses)

In [None]:
from random import choices
from tqdm import tqdm
import time
import numpy as np

for epoch in range(1):
    for batch in tqdm(ppo_trainer.dataloader):
        (logs, game_data,) = (
            dict(),
            dict(),
        )

        print(ctrl_str)
        #### prepend a random control token
        task_list = choices(ctrl_str, k=config.batch_size)
        game_data["query"] = [t + q for t, q in zip(task_list, batch["query"])]
        query_tensors = [torch.cat((ctrl_tokens[t], input_ids)) for t, input_ids in zip(task_list, batch["input_ids"])]

        #### get response from gpt2
        response_tensors = []
        for query in query_tensors:
            response = ppo_trainer.generate(query, **generation_kwargs)
            response_tensors.append(response.squeeze()[-txt_out_len:])
#         print(response_tensors)
        game_data["response"] = [starcoder_tokenizer.decode(r.squeeze()) for r in response_tensors]

        #### sentiment analysis
        texts = [q + r for q, r in zip(batch["query"], game_data["response"])]
        logits = get_score(starcoder_model,starcoder_tokenizer, texts)
        rewards = pos_logit_to_reward(logits, task_list)

        #### Run PPO training
        t = time.time()
        stats = ppo_trainer.step(query_tensors, response_tensors, rewards)

        for cs in ctrl_str:
            key = "env/reward_" + cs.strip("[]")
            stats[key] = np.mean([r.cpu().numpy() for r, t in zip(rewards, task_list) if t == cs])
        ppo_trainer.log_stats(stats, game_data, rewards)

  0%|                                                                                                                        | 0/3 [00:00<?, ?it/s]

['[negative]', '[positive]']


 33%|████████████████████████████████████▋                                                                         | 1/3 [25:43<51:27, 1543.77s/it]

['[negative]', '[positive]']


 67%|█████████████████████████████████████████████████████████████████████████▎                                    | 2/3 [49:32<24:36, 1476.37s/it]

['[negative]', '[positive]']


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [1:13:37<00:00, 1472.59s/it]


In [None]:
###saving the model
starcoder_model.save_pretrained("rhlfmodel/")
starcoder_tokenizer.save_pretrained("rhlfmodel/")

('rhlfmodel/tokenizer_config.json',
 'rhlfmodel/special_tokens_map.json',
 'rhlfmodel/vocab.json',
 'rhlfmodel/merges.txt',
 'rhlfmodel/added_tokens.json',
 'rhlfmodel/tokenizer.json')

In [None]:
from transformers import pipeline, set_seed
model_path = "rhlfmodel/"
set_seed(42)
pipe = pipeline("text-generation",model=model_path, tokenizer=model_path, max_length=30, num_return_sequences=5)

Some weights of the model checkpoint at rhlfmodel/ were not used when initializing GPTBigCodeForCausalLM: ['v_head.summary.bias', 'v_head.summary.weight']
- This IS expected if you are initializing GPTBigCodeForCausalLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing GPTBigCodeForCausalLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# text = dataset["rejected"][0]
# pipe(text)

In [None]:
# text

In [None]:
stats

{'objective/kl': 0.0,
 'objective/kl_dist': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0