In [None]:
%pip install peft transformers trl==0.11.0 evaluate torch sacrebleu
%pip install -U bitsandbytes

Collecting trl==0.11.0
  Downloading trl-0.11.0-py3-none-any.whl.metadata (12 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
Collecting datasets (from trl==0.11.0)
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting tyro>=0.5.11 (from trl==0.11.0)
  Downloading tyro-0.9.13-py3-none-any.whl.metadata (9.4 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.9-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from evaluate)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.17-py311-none-any.whl.metadata (7.2 kB)
Collecting portalocker (from sacrebleu)
  Downloading portalocker-3.1.1-py3-none-an

In [None]:
from dataclasses import (
    dataclass,
    field
)
from typing import Optional
import torch
from accelerate import Accelerator
from datasets import (
    load_dataset,
    DatasetDict
)
from peft import (
    LoraConfig,
    TaskType,
    prepare_model_for_kbit_training,
    get_peft_model,
    PeftModel
)
from tqdm import tqdm
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoModelForSequenceClassification,
    BitsAndBytesConfig,
    Adafactor,
    AutoTokenizer,
    HfArgumentParser,
    pipeline,
    set_seed,
    GenerationConfig,
    EarlyStoppingCallback
)
from trl import (
    AutoModelForSeq2SeqLMWithValueHead,
    create_reference_model,
    PPOConfig,
    PPOTrainer
)
from trl.core import LengthSampler
import pandas as pd
import evaluate
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import csv

In [None]:
# Seed PyTorch's CPU and CUDA generators so later random operations are repeatable.
seed = 42
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
# Ask cuDNN for deterministic kernels. The auto-tuner has to be switched off as well:
# with benchmark=True cuDNN may select different algorithms from run to run, which
# defeats the deterministic setting.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [None]:
# Register tqdm's pandas integration (progress_apply etc.).
# NOTE(review): none of the visible cells calls progress_apply — confirm this is still needed.
tqdm.pandas()

In [None]:
!unzip reward_model.zip

Archive:  reward_model.zip
   creating: content/reward_model/
  inflating: content/reward_model/adapter_config.json  
  inflating: content/reward_model/adapter_model.safetensors  
   creating: content/reward_model/tokenizer/
  inflating: content/reward_model/tokenizer/tokenizer_config.json  
  inflating: content/reward_model/tokenizer/vocab.txt  
  inflating: content/reward_model/tokenizer/special_tokens_map.json  
  inflating: content/reward_model/tokenizer/tokenizer.json  
   creating: content/reward_model/checkpoint-11/
  inflating: content/reward_model/checkpoint-11/training_args.bin  
  inflating: content/reward_model/checkpoint-11/tokenizer_config.json  
  inflating: content/reward_model/checkpoint-11/optimizer.pt  
  inflating: content/reward_model/checkpoint-11/vocab.txt  
  inflating: content/reward_model/checkpoint-11/trainer_state.json  
  inflating: content/reward_model/checkpoint-11/adapter_config.json  
  inflating: content/reward_model/checkpoint-11/adapter_model.safeten

In [None]:
!unzip flan-t5-xl-best.zip

Archive:  flan-t5-xl-best.zip
   creating: flan-t5-xl-best/
  inflating: flan-t5-xl-best/tokenizer_config.json  
  inflating: flan-t5-xl-best/adapter_model.safetensors  
  inflating: flan-t5-xl-best/README.md  
  inflating: flan-t5-xl-best/tokenizer.json  
  inflating: flan-t5-xl-best/spiece.model  
  inflating: flan-t5-xl-best/adapter_config.json  
  inflating: flan-t5-xl-best/training_args.bin  
  inflating: flan-t5-xl-best/special_tokens_map.json  


In [None]:
# Locations of the unpacked artefacts, relative to the current working directory.
# NOTE(review): the visible part of the reward_model.zip listing only shows a `checkpoint-11`
# folder, so the commented-out `checkpoint-44` path below looks stale — confirm before re-enabling.
# reward_model_name = 'content/reward_model/checkpoint-44'
# Folder holding the reward model's adapter files.
reward_model_name = 'content/reward_model'
# Folder holding the reward model's tokenizer files.
reward_tokenizer_name = 'content/reward_model/tokenizer'
# Folder holding the policy model's adapter and tokenizer files.
model_name = 'flan-t5-xl-best'
# CSV with the Source/Target pairs and the pre-computed baseline scores.
dataset_name = 'rest_flan-t5-xl_len4.csv'

In [None]:
# Tokenizer of the policy (generation) model.
flan_tokenizer = AutoTokenizer.from_pretrained(model_name)
# Tokenizer of the reward model; it is a different tokenizer from the policy's, loaded from its own folder.
reward_tokenizer = AutoTokenizer.from_pretrained(reward_tokenizer_name)

In [None]:
# bitsandbytes settings used when the policy is loaded on GPU:
# 4-bit weights, NF4 quantisation type, double quantisation enabled, bfloat16 compute dtype.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

In [None]:
# Load the seq2seq policy. Quantisation and automatic device placement are only
# requested when a GPU is present; on CPU the checkpoint is loaded with defaults.
use_gpu = torch.cuda.is_available()
gpu_load_options = (
    {'device_map': 'auto', 'quantization_config': quantization_config} if use_gpu else {}
)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name, **gpu_load_options)
if use_gpu:
    print('loaded GPU')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.44k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/53.0k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.45G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

loaded GPU


In [None]:
# LoRA adapter settings for the PPO stage: rank 16, alpha 32, 5% dropout, no bias terms trained,
# configured for training (inference_mode=False) on a seq2seq LM.
# No target_modules are given, so PEFT's defaults for this architecture apply — TODO confirm which layers that covers.
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    inference_mode=False,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM
)

In [None]:
# Prepare the (possibly quantised) model for k-bit training, then attach the LoRA adapter from `peft_config`.
# NOTE(review): `model_name` points at a folder that, per its unzip listing, contains only adapter
# files, so the model loaded above presumably already carries the fine-tuned LoRA weights. Calling
# get_peft_model on it may re-initialise that adapter instead of continuing from it; the parameter
# count printed by this cell (base model plus a single r=16 adapter) appears consistent with that.
# TODO confirm — `PeftModel` is imported but never used and may be what was intended here.
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, peft_config)
# Report trainable vs. total parameter counts.
model.print_trainable_parameters()

trainable params: 9,437,184 || all params: 2,859,194,368 || trainable%: 0.3301


In [None]:
# Wrap the PEFT policy in TRL's value-head model, which is what PPOTrainer is given below.
# The wrap is needed on every device; only the automatic device placement is GPU-specific.
# Previously the whole wrap was skipped on CPU, leaving `model` without a value head.
value_head_kwargs = {'device_map': 'auto'} if torch.cuda.is_available() else {}
model = AutoModelForSeq2SeqLMWithValueHead.from_pretrained(model, **value_head_kwargs)

In [None]:
# Reference copy of the policy; it is passed to PPOTrainer as `ref_model` further down.
ref_model = create_reference_model(model)
# Sequence classifier used to score (query, response) pairs.
# NOTE(review): the load log recorded for this cell reports `classifier.weight` / `classifier.bias`
# as newly initialised from the base BERT checkpoint. Verify that the adapter stored under
# `reward_model_name` restores the trained classification head; otherwise the rewards used
# for PPO come from a randomly initialised head.
reward_model = AutoModelForSequenceClassification.from_pretrained(reward_model_name)

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Config attribute names under which different architectures expose their context window.
typical_fields = ["max_position_embeddings", "n_positions", "seq_len", "seq_length", "n_ctx", "sliding_window"]

# Collect the value of every such attribute this model's config defines, ignoring ones set to None
# (a None limit cannot be compared against token counts later on).
context_windows = [
    getattr(model.config, name)
    for name in typical_fields
    if name in dir(model.config) and getattr(model.config, name) is not None
]

# Fail loudly instead of silently binding max_length to None (the return value of print),
# which would only surface later as a confusing TypeError when queries are filtered by length.
if not context_windows:
    raise ValueError(f"No Max input variable found for {model_name}")

# Use the last match; usually there is only one anyway.
max_length = context_windows[-1]

print(max_length)

512


In [None]:
# Read the whole CSV as a single split.
dataset = load_dataset("csv", data_files=dataset_name, split="train")

# Hold out 24 rows as the test split (fixed seed, so the split is repeatable).
ds_train_devtest = dataset.train_test_split(test_size=24, seed=42)
dataset_splits = DatasetDict(
    {split_name: ds_train_devtest[split_name] for split_name in ('train', 'test')}
)

print("Before:\n", dataset)
print("After:\n", dataset_splits)

# Persist the held-out rows so the evaluation data can be inspected later.
dataset_splits['test'].to_csv('reward_test_dataset.csv')

print(dataset_splits['train'])

Generating train split: 0 examples [00:00, ? examples/s]

Before:
 Dataset({
    features: ['Unnamed: 0', 'Source', 'Target', 'Gen_comp', 'ChrF', 'Cos_sim_t5', 'Gen_comp1', 'Gen_comp2', 'Total_ChrF'],
    num_rows: 248
})
After:
 DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'Source', 'Target', 'Gen_comp', 'ChrF', 'Cos_sim_t5', 'Gen_comp1', 'Gen_comp2', 'Total_ChrF'],
        num_rows: 224
    })
    test: Dataset({
        features: ['Unnamed: 0', 'Source', 'Target', 'Gen_comp', 'ChrF', 'Cos_sim_t5', 'Gen_comp1', 'Gen_comp2', 'Total_ChrF'],
        num_rows: 24
    })
})


Creating CSV from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Dataset({
    features: ['Unnamed: 0', 'Source', 'Target', 'Gen_comp', 'ChrF', 'Cos_sim_t5', 'Gen_comp1', 'Gen_comp2', 'Total_ChrF'],
    num_rows: 224
})


In [None]:
def build_dataset(
    tokenizer,
    dataset,
    num_proc=24,
    max_input_length=None,
):
    """
    Turn the `Source` column of `dataset` into tokenized PPO queries.

    Args:
        tokenizer:
            Callable invoked as `tokenizer(text, truncation=True)`; its result must
            expose the token ids under the `"input_ids"` key.
        dataset:
            Dataset object providing `column_names`, `map`, `filter` and `set_format`
            and containing a `Source` column.
        num_proc (`int`, defaults to 24):
            Number of worker processes handed to `map` and `filter`.
        max_input_length (`int`, *optional*):
            Queries are kept only when their token count is strictly below this
            value. When omitted, the notebook-level `max_length` is used.

    Returns:
        The processed dataset containing only the `query` (raw text) and
        `input_ids` columns, with its output format set to torch.
    """
    if max_input_length is None:
        # Backward-compatible default: fall back to the notebook-level context window.
        max_input_length = max_length

    def preprocess_function(examples):
        # Batched map: `examples["Source"]` is a list of source strings.
        queries = list(examples["Source"])
        return {
            "query": queries,
            "input_ids": [tokenizer(text, truncation=True)["input_ids"] for text in queries],
        }

    ds = dataset.map(
        preprocess_function,
        batched=True,
        num_proc=num_proc,
        # Drop the columns of the dataset being processed (not of an unrelated global split).
        remove_columns=dataset.column_names,
    )
    ds = ds.filter(lambda x: len(x["input_ids"]) < max_input_length, batched=False, num_proc=num_proc)

    ds.set_format(type="torch")
    return ds

In [None]:
# Tokenize the training queries for PPO.
encoded_dataset = build_dataset(flan_tokenizer, dataset_splits['train'])

print(encoded_dataset)

# Tokenize the held-out queries; the evaluation cell indexes this set in parallel with
# dataset_splits['test'], so both must keep the same rows in the same order.
encoded_testset = build_dataset(flan_tokenizer, dataset_splits['test'])

print(encoded_testset)

def collator(data):
    """Transpose a list of per-sample dicts into one dict of per-key lists.

    The keys are taken from the first sample; values are left un-padded and un-stacked.
    """
    batch = {}
    for key in data[0]:
        batch[key] = [sample[key] for sample in data]
    return batch

Map (num_proc=24):   0%|          | 0/224 [00:00<?, ? examples/s]

Filter (num_proc=24):   0%|          | 0/224 [00:00<?, ? examples/s]

Dataset({
    features: ['query', 'input_ids'],
    num_rows: 224
})


Map (num_proc=24):   0%|          | 0/24 [00:00<?, ? examples/s]

Filter (num_proc=24):   0%|          | 0/24 [00:00<?, ? examples/s]

Dataset({
    features: ['query', 'input_ids'],
    num_rows: 24
})


In [None]:
# PPO hyper-parameters.
# NOTE(review): the first positional argument is presumably consumed as the experiment name
# rather than as an output path — confirm against the PPOConfig signature of the pinned trl version.
# log_with="wandb" makes the trainer log to Weights & Biases; the recorded run prompted for an API key.
# A PPO batch holds 16 samples, optimised in mini-batches of 1 with 2 gradient-accumulation steps.
ppo_config = PPOConfig(
    '/rl/ppoconfig',
    log_with="wandb",
    learning_rate=1e-5,
    batch_size=16,
    mini_batch_size=1,
    gradient_accumulation_steps=2,
    seed=42,
    init_kl_coef=0.05,
)



In [None]:
# Seed again through transformers' helper right before the trainer is built
# (same value as the `seed` used for torch at the top of the notebook).
set_seed(42)

In [None]:
# Build the PPO trainer from the value-head policy, its reference copy, the policy tokenizer,
# the tokenized training queries and the list-transposing collator.
# With log_with="wandb" in the config, the recorded run asked for a W&B API key at this point.
ppo_trainer = PPOTrainer(
    config=ppo_config,
    model=model,
    ref_model=ref_model,
    tokenizer=flan_tokenizer,
    dataset=encoded_dataset,
    data_collator=collator,
)



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


In [None]:
# Text-classification pipeline around the reward model; despite the "sentiment-analysis" task
# name it is used below purely to score "query\nresponse" strings for PPO rewards.
sentiment_pipe = pipeline("sentiment-analysis", model=reward_model, tokenizer=reward_tokenizer)

Device set to use cuda:0


In [None]:
# Keyword arguments forwarded to every reward-pipeline call:
# all label scores, no activation applied to the logits, one text per forward pass.
sent_kwargs = dict(
    top_k=None,
    function_to_apply="none",
    return_all_scores=True,
    batch_size=1,
)

In [None]:
# Sampling settings shared by PPO roll-outs and the final evaluation.
generation_kwargs = {
    "top_k": 0.0,
    "do_sample": True,
    "pad_token_id": flan_tokenizer.pad_token_id,
    # Forbid EOS as the very first sampled token so that no response is empty.
    # The recorded PPO run died with "IndexError: index -1 is out of bounds for dimension 0
    # with size 0", which is what the PPO step raises when a response leaves no token to
    # score (presumably an immediate EOS or a one-token length budget).
    "min_new_tokens": 1,
}

# LengthSampler draws the max_new_tokens budget for each generate call from [min, max).
# Start at 2 so that, together with min_new_tokens above, every response contains at least
# one real token plus room for EOS.
output_min_length = 2
output_max_length = max_length
output_length_sampler = LengthSampler(output_min_length, output_max_length)

In [None]:
# Prefer the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
# One pass over the PPO dataloader: sample responses, score them with the reward model,
# then run a PPO optimisation step.
# tqdm wraps the dataloader itself (not the enumerate generator) so the bar knows the number
# of batches; the counter is a batch index, not an epoch.
# NOTE(review): the run recorded below this cell stopped after a few batches with
# "IndexError: index -1 is out of bounds for dimension 0 with size 0" — presumably a response
# too short to score. Confirm that the generation settings enforce a minimum response length.
for step, batch in enumerate(tqdm(ppo_trainer.dataloader, desc="PPO")):
    question_tensors = batch["input_ids"]

    # Sample one response per query; the length budget is drawn by the sampler.
    response_tensors = ppo_trainer.generate(
        question_tensors,
        return_prompt=False,
        length_sampler=output_length_sampler,
        **generation_kwargs,
    )
    batch["response"] = flan_tokenizer.batch_decode(response_tensors, skip_special_tokens=True)

    # Compute reward score (using the sentiment analysis pipeline): the reward is the
    # score of the first entry the pipeline returns for "query\nresponse".
    rewards = []
    for query_text, response_text in zip(batch['query'], batch['response']):
        pipe_outputs = sentiment_pipe(query_text + '\n' + response_text, **sent_kwargs)
        rewards.append(torch.tensor(pipe_outputs[0]["score"]))

    # Run PPO step
    stats = ppo_trainer.step(question_tensors, response_tensors, rewards)
    ppo_trainer.log_stats(stats, batch, rewards)

0it [00:00, ?it/s]You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
  return fn(*args, **kwargs)
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
4it [03:47, 56.96s/it]


IndexError: index -1 is out of bounds for dimension 0 with size 0

In [None]:
# Sentence encoders already loaded by con_sent_emb, keyed by model name, so repeated calls
# (e.g. once per evaluation example) do not reload the weights every time.
_SENTENCE_ENCODER_CACHE = {}


def con_sent_emb(
    gen_sen: list[str],
    tar_sen: list[str],
    encoder=None,
    model_name: str = "sentence-transformers/all-mpnet-base-v2",
) -> tuple:
    """
    Encode generated and target completions into sentence embeddings.

    Args:
        gen_sen: Generated completion(s); passed unchanged to `encoder.encode`.
        tar_sen: Target completion(s); passed unchanged to `encoder.encode`.
        encoder: Optional object with an `encode` method. When omitted, a
            `SentenceTransformer` for `model_name` is created on first use and
            reused on later calls.
        model_name: Name of the sentence-transformers model to load when no
            `encoder` is supplied.

    Returns:
        A `(gen_emb, tar_emb)` tuple holding whatever `encode` returned for
        `gen_sen` and `tar_sen` respectively.
    """
    if encoder is None:
        encoder = _SENTENCE_ENCODER_CACHE.get(model_name)
        if encoder is None:
            encoder = SentenceTransformer(model_name)
            _SENTENCE_ENCODER_CACHE[model_name] = encoder

    # Generate the contextual embeddings for the generated and the target completions
    gen_emb = encoder.encode(gen_sen)
    tar_emb = encoder.encode(tar_sen)

    return gen_emb, tar_emb

In [None]:
#### get results from test set
game_data = dict()
game_data['query'] = []
game_data['target'] = []
game_data["response (before)"] = []
game_data["response (after)"] = []
game_data['reward (before)'] = []
game_data['reward (after)'] = []
game_data['chrf (before)'] = []
game_data['chrf (after)'] = []
game_data['chrf difference'] = []
game_data['cos_sim (before)'] = []
game_data['cos_sim (after)'] = []
game_data['cos_sim difference'] = []

dataset_splits['test'].set_format("pandas")
df_test = encoded_testset
game_data["query"] = df_test["query"]
query_tensors = df_test["input_ids"]
device='cuda' if torch.cuda.is_available() else 'cpu'

response_tensors = []

#### get response from policy and ref model
for i in range(len(encoded_testset)):
    query = query_tensors[i].to(device).unsqueeze(0)

    query_response = model.generate(
        input_ids=query,
        **generation_kwargs
    ).squeeze()
    response_len = len(query_response) - len(query)
    response_tensors.append(query_response[-response_len:])

    #### decode responses
    game_data["response (before)"].append(dataset_splits['test']['Gen_comp'][i])

    game_data["response (after)"].append(flan_tokenizer.decode(response_tensors[i], skip_special_tokens=True))

    #### sentiment analysis of query/response pairs before/after
    text_before = game_data['query'][i] + '\n' + game_data['response (before)'][i]
    output_before = sentiment_pipe(text_before, **sent_kwargs)
    reward_before = [output["score"] for output in output_before][0]

    text_after = game_data['query'][i] + '\n' + game_data['response (after)'][i]
    output_after = sentiment_pipe(text_after, **sent_kwargs)
    reward_after = [output["score"] for output in output_after][0]

    game_data['reward (before)'].append(reward_before)
    game_data['reward (after)'].append(reward_after)

    # ChrF score
    chrf = evaluate.load('chrf')
    target = dataset_splits['test']['Target'][i]

    chrf_before = dataset_splits['test']['ChrF'][i]

    chrf_after = chrf.compute(predictions=[game_data["response (after)"][i]], references=[target])['score']

    game_data['target'].append(target)
    game_data['chrf (before)'].append(chrf_before)
    game_data['chrf (after)'].append(chrf_after)
    game_data['chrf difference'].append(chrf_after - chrf_before)

    game_data['cos_sim (before)'].append(dataset_splits['test']['Cos_sim_t5'][i])

    # response_embeddings, target_embeddings = con_sent_emb(game_data["response (after)"][i], target)
    cos_sim = cosine_similarity(con_sent_emb(game_data['response (after)'][i],target))[0][1]
    game_data['cos_sim (after)'].append(cos_sim)
    game_data['cos_sim difference'].append(cos_sim - dataset_splits['test']['Cos_sim_t5'][i])

# store results in a dataframe
df_results = pd.DataFrame(game_data)
df_results.to_csv('results.csv')
df_results

In [None]:
import os
os.makedirs("rl_model", exist_ok=True) # Create the 'rl_model' directory if it doesn't exist

# Save the PPO-trained policy (the value-head wrapper held in `model`) into that directory.
model.save_pretrained("rl_model")

# ppo_trainer.save("rl_model/test_rl_model_t5_small")

In [None]:
# Bundle the saved model directory into a single archive for download.
!zip -r rl_file.zip rl_model

# Optional Colab download of the archive (kept disabled):
# from google.colab import files
# files.download("/content/rl_file.zip")