In [1]:
# Autoreload
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path

from tqdm.notebook import tqdm

import torch

import datasets
# Don't show progress datasets bars
datasets.disable_progress_bar()

import sys
sys.path.insert(0, str(Path.cwd().parent.resolve()))
from dataset import get_dataset, collator
from model import get_model
from reward_model import get_template, get_reward_model, create_reward_fn
from utils import get_tokenizer

['/fsx/home-augustas/elk/elk/promptsource', '/fsx/home-augustas/elk']


In [3]:
device = torch.device(
    f"cuda:{torch.cuda.current_device()}" if torch.cuda.is_available() else "cpu"
)
device

device(type='cuda', index=0)

In [4]:
# tokenizer_name = "gpt2"
# tokenizer_name = "gpt2-xl"
# tokenizer_name = "databricks/dolly-v2-3b"
# tokenizer_name = "EleutherAI/gpt-j-6b"
# tokenizer_name = "lmsys/vicuna-7b-v1.3"
# tokenizer_name = "meta-llama/Llama-2-7b-hf"
# tokenizer_name = "meta-llama/Llama-2-7b-chat-hf"
tokenizer_name = "meta-llama/Llama-2-13b-hf"

# dataset_name = "AugustasM/burns-datasets-VINC-ppo-training-v4"
# reward_model_output_path = "/fsx/home-augustas/logs/unifiedqa-v2-t5-3b-1363200_custom_data_v4_all_20230629_120158_21789"

dataset_name = "AugustasM/burns-datasets-VINC-imdb-ppo-training-v2"
# reward_model_output_path = "/fsx/home-augustas/logs/unifiedqa-v2-t5-3b-1363200_custom_data_imdb_v2_first_20230707_170052_28991"
reward_model_output_path = "/fsx/home-augustas/logs/UQA-3b-custom_data_imdb_v2_final_20230717_200713_36998"

In [5]:
# Tokenizer
tokenizer = get_tokenizer(tokenizer_name)

Loading tokenizer meta-llama/Llama-2-13b-hf...


Downloading (…)okenizer_config.json:   0%|          | 0.00/749 [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded tokenizer.



## Dataset

In [6]:
# Dataset for PPO training
train_dataset, prompt_max_len, response_max_len = get_dataset(
    dataset_name, tokenizer,
)

Loading dataset...



Found cached dataset parquet (/admin/home-augustas/.cache/huggingface/datasets/AugustasM___parquet/AugustasM--burns-datasets-VINC-imdb-ppo-training-v2-7a8c35d13838a0fc/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7)



Max prompt length: 721

Max response length: 3

Remaining columns: ['prompt', 'best_response', 'input_ids', 'token_type_ids', 'attention_mask', 'response_len']

Total number of examples: 8192

Processing finished.



In [7]:
# Dataset templates
dataset_template_path = "AugustasM/burns-datasets-VINC"
template = get_template(dataset_template_path)

Num templates: 1
Is the given text truthful


## Model

In [8]:
# Model
# model = get_model(tokenizer_name, device)
model = get_model(tokenizer_name, device, load_in_8bit=True)

Loading policy model...



Downloading (…)lve/main/config.json:   0%|          | 0.00/653 [00:00<?, ?B/s]

is_bf16_possible=False
kwargs={'load_in_8bit': True, 'torch_dtype': None}


Downloading (…)fetensors.index.json:   0%|          | 0.00/33.4k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading (…)of-00003.safetensors:   0%|          | 0.00/9.95G [00:00<?, ?B/s]

Downloading (…)of-00003.safetensors:   0%|          | 0.00/9.90G [00:00<?, ?B/s]

Downloading (…)of-00003.safetensors:   0%|          | 0.00/6.18G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading (…)neration_config.json:   0%|          | 0.00/197 [00:00<?, ?B/s]

Downloading (…)model.bin.index.json:   0%|          | 0.00/33.4k [00:00<?, ?B/s]

Loaded subject model with 13,015,869,441 parameters.
Model dtype: torch.float16



In [9]:
model.pretrained_model.get_memory_footprint() / (1024 ** 3)

12.505660057067871

In [10]:
allowed_words = sorted(set(train_dataset["best_response"]))
allowed_words.append(tokenizer.eos_token)
allowed_words.extend([".", "!"])
print(f"{len(allowed_words)=}")
print(allowed_words)

allowed_word_ids = tokenizer(allowed_words, add_special_tokens=False).input_ids
print(allowed_word_ids)

bad_words = list(set(tokenizer.vocab.keys()) - set(allowed_words))
print(f"{len(bad_words)=}")

bad_word_ids = tokenizer(bad_words, add_special_tokens=True).input_ids

# Flatten list
# bad_word_ids = list(set([item for sublist in bad_word_ids for item in sublist]))
# bad_word_ids = [item for sublist in bad_word_ids for item in sublist]
len(bad_word_ids)

len(allowed_words)=9
['0', '1', 'bad', 'good', 'negative', 'positive', '</s>', '.', '!']
[[29871, 29900], [29871, 29896], [4319], [1781], [8178], [6374], [2], [869], [1738]]
len(bad_words)=31993


31993

In [11]:
from trl import PPOTrainer, PPOConfig

config = PPOConfig()

optimizer = None

# We then build the PPOTrainer, passing the model, the reference model, the tokenizer
ppo_trainer = PPOTrainer(
    config,
    model,
    ref_model=None,
    tokenizer=tokenizer,
    dataset=train_dataset,
    data_collator=collator,
    optimizer=optimizer,
)

In [12]:
dataloader = torch.utils.data.DataLoader(
    train_dataset.select(range(32)),
    batch_size=1, collate_fn=collator,
    num_workers=12, shuffle=False,
)
print(f"Dataloader length: {len(dataloader)}")

generation_kwargs = {
    "top_k": 0,
    "top_p": 1.0,
    "do_sample": True,
    "pad_token_id": tokenizer.pad_token_id,
    "eos_token_id": 100_000, # why is this value like this?
    "pad_to_multiple_of": 8, # TODO: double-check, but this seems to work and to be faster
    # "bad_words_ids": bad_word_ids,
    # "force_words_ids": allowed_word_ids,
    # "num_beams": 4,
    # "renormalize_logits": True,
}

outputs = []
gold_outputs = []
for batch in tqdm(dataloader, total=len(dataloader), leave=False):
    gold_outputs.extend(batch["best_response"])

    question_tensors = batch["input_ids"]

    max_new_tokens = max(batch["response_len"])
    # max_new_tokens = max(8, max_new_tokens)
    generation_kwargs["max_new_tokens"] = max_new_tokens
    # print(f"{generation_kwargs['max_new_tokens']=}")
    # print(f"{generation_kwargs['force_words_ids']=}")

    response_tensors = ppo_trainer.generate(
        question_tensors,
        return_prompt=False,
        # length_sampler=output_length_sampler, # TODO: can be none
        batch_size=4, # TODO: generations are made in batches
        **generation_kwargs,
    )
    batch["response"] = tokenizer.batch_decode(response_tensors, skip_special_tokens=True)

    outputs.extend(batch["response"])

len(outputs), len(gold_outputs)

Dataloader length: 32


  0%|          | 0/32 [00:00<?, ?it/s]

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [15]:
for output, golden_output in zip(outputs, gold_outputs):
    print(f"{golden_output=}\n{output=}\n")

golden_output='1'
output='0.5'

golden_output='0'
output='\nPositive'

golden_output='1'
output='0.1'

golden_output='0'
output='0.9'

golden_output='1'
output='\n<div'

