In [1]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '9'
from itertools import product

from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM
from functools import partial
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
import wandb
import re
import numpy as np
import matplotlib.pyplot as plt

import trlx
from peft import LoraConfig, TaskType

from trlx.data.configs import (
    ModelConfig,
    OptimizerConfig,
    SchedulerConfig,
    TokenizerConfig,
    TrainConfig,
    TRLConfig,
)
from trlx.models.modeling_ppo import PPOConfig

from tic_tac_toe_action_supervision.tic_tac_toe import *
from sft import CustomEval

import torch
import pickle as pkl
import random
from datasets import Dataset
from transformers.integrations import WandbCallback

# Weights & Biases project name; also passed explicitly to wandb.init in the SFT cell.
wandb_project = "exps-cot-reliability-tic-tac-toe"
# Export the project name through the environment as well.
os.environ['WANDB_PROJECT'] = wandb_project
# NOTE(review): presumably tells wandb which notebook file the run belongs to — confirm against the wandb docs.
os.environ['WANDB_NOTEBOOK_NAME'] = "cot_reliability_tic_tac_toe.ipynb"
import numpy as np
import random
import unittest
import re


[2024-03-25 08:46:52,456] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)


[NeMo W 2024-03-25 08:46:57 optimizers:54] Apex was not found. Using the lamb or fused_adam optimizer will error out.
[NeMo W 2024-03-25 08:46:59 experimental:27] Module <class 'nemo.collections.nlp.data.language_modeling.megatron.megatron_batch_samplers.MegatronPretrainingRandomBatchSampler'> is experimental, not ready for production and is not fully supported. Use at your own risk.
[NeMo W 2024-03-25 08:46:59 experimental:27] Module <class 'nemo.collections.nlp.models.text_normalization_as_tagging.thutmose_tagger.ThutmoseTaggerModel'> is experimental, not ready for production and is not fully supported. Use at your own risk.
    
[NeMo W 2024-03-25 08:47:00 experimental:27] Module <class 'nemo.collections.asr.modules.audio_modules.SpectrogramToMultichannelFeatures'> is experimental, not ready for production and is not fully supported. Use at your own risk.


In [3]:
# HPARAMS
# Everything a reader is expected to tune per run lives in this cell.

# Run name (change this for each run); used for the results directory and the wandb run name.
run_name = "gpt2_test" # TODO: set this for each run

# Base model. The LoRA target modules below are picked from this name.
# model_name = 'mistralai/Mistral-7B-Instruct-v0.1'
# model_name = 'mistralai/Mistral-7B-v0.1'
model_name = 'gpt2'
batch_size = 64 # alternative value noted by the author: 16

# Stage switches: each one gates a whole training cell further down.
train_sft = False
train_rl = True

# NOTE(review): generate_new_dataset and val_set_size are not read anywhere in the visible cells —
# presumably consumed by dataset code elsewhere; confirm.
generate_new_dataset = False
val_set_size = 100
# Passed to every CustomEval callback as `generator_max_length`.
generator_max_length = 10

# Lora config
lora_rank = 16
lora_alpha = 32
lora_dropout = 0.05
# Same values collected in a dict so they can be merged into the wandb run config.
lora_args = {'lora_rank': lora_rank, 'lora_alpha': lora_alpha, 'lora_dropout': lora_dropout}
# Pick the LoRA target modules for the selected model family.
# Matching is done on the lower-cased model name so that ids whose family name is
# capitalised (e.g. "NousResearch/Llama-2-7b-hf", "GPT2") are recognised too;
# every name accepted by the previous case-sensitive check is still accepted.
_model_name_lower = model_name.lower()
if 'mistral' in _model_name_lower or 'llama' in _model_name_lower:
    target_modules = [
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
        "lm_head",]
elif 'gpt2' in _model_name_lower:
    target_modules = [
        "c_attn",
        "c_proj",
        "c_fc",
        "lm_head",]
else:
    # Fail loudly rather than silently training with an unsuitable adapter layout.
    raise NotImplementedError(f"Model {model_name} not supported; please add a lora config for it")

# LoRA adapter configuration, assembled from the hyperparameters and the
# per-model target module list defined earlier in this cell.
_lora_settings = dict(
    task_type="CAUSAL_LM",
    target_modules=target_modules,
    r=lora_rank,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    bias="none",
)
peft_config = LoraConfig(**_lora_settings)

# bitsandbytes quantization configuration: 4-bit loading with the "nf4" quant type,
# float16 compute dtype and double quantization switched on.
_quant_settings = dict(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)
bnb_config = BitsAndBytesConfig(**_quant_settings)


# Hugging Face TrainingArguments handed to the SFTTrainer in the SFT cell.
training_args = TrainingArguments(
    output_dir=f"./results/{run_name}",  # per-run results directory
    overwrite_output_dir=True,
    num_train_epochs=10, # TODO: set this for each run
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    warmup_steps=100,# TODO: consider 500,
    weight_decay=0.01,
    logging_dir="./logs",  # NOTE(review): not run-specific, unlike output_dir
    logging_steps=10,
    evaluation_strategy="steps",
    eval_steps=10,  # TODO: set this for each run; back to 100
    save_strategy="steps",
    save_steps=200,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,  # lower eval_loss counts as better
    report_to="wandb",
    learning_rate=1e-4, # NOTE(review): the previous TODO said "consider 1e-4", which is already the value in use
    save_total_limit=1,   
)


In [8]:
def _load_tokenizer(padding_side):
    """Load the tokenizer for `model_name` with '?' as pad token and the given padding side."""
    tok = AutoTokenizer.from_pretrained(model_name)
    tok.add_special_tokens({'pad_token': '?'})
    tok.padding_side = padding_side
    return tok


# Right-padded tokenizer: handed to the SFT trainer and its data collator.
tokenizer = _load_tokenizer('right')

# Left-padded tokenizer: handed to the CustomEval callbacks.
tokenizer_left_pad = _load_tokenizer('left')


In [None]:
# NOTE(review): `load_dataset` is not imported by name in this notebook — presumably it comes from the
# star import of tic_tac_toe_action_supervision.tic_tac_toe; confirm.
# NOTE(review): later cells use train_dataset, val_dataset_* and description_* directly, but nothing in
# the visible cells assigns them or reads `dataset_dict`; verify this still works on a fresh kernel.
dataset_dict = load_dataset()


In [10]:
if train_sft:
    # --- Supervised fine-tuning stage --------------------------------------
    # Load the base model with the bitsandbytes quantization config, prepare it
    # for k-bit training and wrap it with the LoRA adapters.
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="auto",
        quantization_config=bnb_config,
    )
    model = prepare_model_for_kbit_training(model)
    model = get_peft_model(model, peft_config)

    device = next(model.parameters()).device

    # Marker that separates prompt from completion; the completion-only collator
    # is built around it (see the TRL docs for the exact masking behaviour).
    response_template = " Answer:"
    # response_template_ids = tokenizer.encode(response_template, add_special_tokens=False)[2:]
    collator = DataCollatorForCompletionOnlyLM(response_template, tokenizer=tokenizer)

    # Training examples are formatted with labels included and terminated by EOS.
    formatting_func_train = partial(formatting_prompts_func, include_labels=True, eos=tokenizer.eos_token, description=description_train)

    # One generation-based evaluation callback per validation split:
    # (metric name, dataset, task description).
    # Fix: the last entry previously referenced the misspelled name
    # `desciprion_val_size_4`; it now follows the `description_val_*` naming
    # used by every other split.
    eval_splits = [
        ("val_iid", val_dataset_iid, description_val_iid),
        ("val_diag_wins", val_dataset_diag_wins, description_val_diag_wins),
        ("val_not_one_step", val_dataset_not_one_step, description_val_not_one_step),
        ("val_player_o", val_dataset_player_o, description_val_player_o),
        ("val_size_4", val_dataset_size_4, description_val_size_4),
    ]
    eval_callbacks = [
        CustomEval(
            split_name,
            split_dataset,
            split_description,
            tokenizer_left_pad,
            generator_max_length=generator_max_length,
            batch_size=batch_size,
        )
        for split_name, split_dataset, split_description in eval_splits
    ]

    trainer = SFTTrainer(
        model,
        tokenizer=tokenizer,
        train_dataset=train_dataset,
        eval_dataset=val_dataset_iid,
        formatting_func=formatting_func_train,
        data_collator=collator,
        peft_config=peft_config,
        args=training_args,
        callbacks=eval_callbacks,
    )

    # Log the trainer arguments together with the LoRA hyperparameters.
    full_args = {**trainer.args.to_dict(), **lora_args}
    wandb.init(project=wandb_project, name=run_name, config=full_args)

    trainer.train()

In [11]:
if train_rl:
    # --- PPO stage (trlx) ---------------------------------------------------
    # All values are set once, at the values that were actually in effect before
    # (the earlier version set seq_length/batch_size/chunk_size in the constructors
    # and then overrode them afterwards).
    config = TRLConfig(
        train=TrainConfig(
            # maximum sample length, prompts or samples longer than that will be truncated
            seq_length=256,
            epochs=50,
            total_steps=100000,
            # micro batch size per gpu
            batch_size=1,
            checkpoint_interval=100,
            eval_interval=10,
            pipeline="PromptPipeline",
            trainer="AcceleratePPOTrainer",
        ),
        model=ModelConfig(
            # Follow the notebook-level model choice so the model, the tokenizer and
            # the LoRA target modules (derived from `model_name`) stay in sync.
            model_path=model_name,
            # The recorded run log reports that num_layers_unfrozen is ignored when
            # a peft config is supplied; kept at its previous value.
            num_layers_unfrozen=1,
            peft_config=peft_config,
        ),
        tokenizer=TokenizerConfig(tokenizer_path=model_name, truncation_side="right"),
        optimizer=OptimizerConfig(name="adamw"),
        scheduler=SchedulerConfig(name="cosine_annealing", kwargs={"T_max": 100000, "eta_min": 5.0e-6},),
        method=PPOConfig( # TODO: maybe we want the default instead??
            name="PPOConfig",
            num_rollouts=128,
            # micro batch size for sampling (specific for PPO)
            chunk_size=1,
            ppo_epochs=4,
            init_kl_coef=0.1,
            target=6,
            horizon=10000,
            gamma=1,
            lam=0.95,
            cliprange=0.2,
            cliprange_value=0.2,
            vf_coef=0.2,
            scale_reward=None,
            ref_mean=None,
            ref_std=None,
            cliprange_reward=10,
            gen_kwargs={
                # NOTE(review): the recorded run output shows prompts being truncated
                # to max_length 56 — presumably seq_length minus max_new_tokens; confirm.
                "max_new_tokens": 200,
            },
        ),
    )

    def generate_dataset_map(dataset):
        """Map each formatted (label-free) prompt to the dataset row it was built from.

        Args:
            dataset: indexable dataset accepted by `formatting_prompts_func`.

        Returns:
            dict from prompt string to `dataset[i]`. If two rows format to the same
            prompt, the later row wins.
        """
        prompts = formatting_prompts_func(dataset, include_labels=False, description=description_train)
        return {prompt: dataset[i] for i, prompt in enumerate(prompts)}

    train_dataset_map = generate_dataset_map(train_dataset)
    # val_dataset_map_iid = generate_dataset_map(val_dataset_iid)
    # val_dataset_map_diag_wins = generate_dataset_map(val_dataset_diag_wins)
    # val_dataset_map_player_o = generate_dataset_map(val_dataset_player_o)
    # val_dataset_map_size_4 = generate_dataset_map(val_dataset_size_4)

    def reward_fn(samples, prompts, **kwargs):
        """Score each sample 1.0 if its parsed action is one of the best actions, else 0.0.

        Args:
            samples: generated texts, one per prompt.
            prompts: prompt strings; each must be a key of `train_dataset_map`
                (a prompt that does not match exactly raises KeyError).
            **kwargs: extra arguments supplied by trlx; unused.

        Returns:
            list of floats, one reward per sample.
        """
        data_points = [train_dataset_map[p] for p in prompts]
        parsed_actions = [parse_action_from_string(s) for s in samples]
        # Floats rather than ints, matching the reward type trlx documents for reward_fn.
        return [
            float(action in point['best_actions'])
            for action, point in zip(parsed_actions, data_points)
        ]

    # NOTE(review): the last recorded run of this cell stopped with a RuntimeError while
    # saving ckpts/best_checkpoint (tensors sharing memory: transformer.wte.weight and
    # lm_head.weight). That failure is not addressed here.
    trainer = trlx.train(
        reward_fn=reward_fn,
        prompts=formatting_prompts_func(train_dataset, include_labels=False, description=description_train),
        config=config,
    )

[RANK 0] Initializing model: gpt2
    
    
[RANK 0] peft adapter initialised
[RANK 0] The argument num_layers_unfrozen is ignored when using peft, to prevent unexpected behaviour.For Lora, use the `LoraConfig` argument `modules_to_save` instead.


trainable params: 3,175,696 || all params: 127,615,504 || trainable%: 2.488487605706592


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

TARGET: {'max_length': 56, 'stride': 0, 'strategy': 'longest_first', 'direction': 'right'}
> [0;32m/home/olivia/miniconda3/envs/exps/lib/python3.9/site-packages/transformers/tokenization_utils_fast.py[0m(453)[0;36mset_truncation_and_padding[0;34m()[0m
[0;32m    451 [0;31m                [0mprint[0m[0;34m([0m[0;34mf'TARGET: {target}'[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    452 [0;31m                [0;32mimport[0m [0mpdb[0m[0;34m;[0m [0mpdb[0m[0;34m.[0m[0mset_trace[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 453 [0;31m                [0mself[0m[0;34m.[0m[0m_tokenizer[0m[0;34m.[0m[0menable_truncation[0m[0;34m([0m[0;34m**[0m[0mtarget[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    454 [0;31m[0;34m[0m[0m
[0m[0;32m    455 [0;31m        [0;32mif[0m [0mpadding_strategy[0m [0;34m==[0m [0mPaddingStrategy[0m[0;34m.[0m[0mDO_NOT_PAD[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m


[RANK 0] Starting training
[RANK 0] Collecting rollouts
You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
[RANK 0] Evaluating model


[generation sweep 0/1 | eval batch 0/1]:   0%|          | 0/1 [00:00<?, ?it/s]

[RANK 0] Computing rewards
[RANK 0] Summarizing evaluation


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/25600 [00:00<?, ?it/s]

[RANK 0] Evaluating model


[generation sweep 0/1 | eval batch 0/1]:   0%|          | 0/1 [00:00<?, ?it/s]

[RANK 0] Computing rewards
[RANK 0] Summarizing evaluation


[RANK 0] Saving intermediate optimizer & model checkpoint into ckpts/best_checkpoint


RuntimeError: 
            Some tensors share memory, this will lead to duplicate memory on disk and potential differences when loading them again: [{'base_model.model.transformer.wte.weight', 'base_model.model.lm_head.weight'}].
            A potential way to correctly save your model is to use `save_model`.
            More information at https://huggingface.co/docs/safetensors/torch_shared_tensors
            

In [None]:
def load_checkpoint(checkpoint_path):
    """Load a causal LM from a checkpoint using the notebook-level `bnb_config`.

    Args:
        checkpoint_path: path or identifier passed to
            `AutoModelForCausalLM.from_pretrained`.

    Returns:
        The model object returned by `from_pretrained`.
    """
    load_kwargs = {
        "device_map": "auto",
        "quantization_config": bnb_config,
    }
    return AutoModelForCausalLM.from_pretrained(checkpoint_path, **load_kwargs)

# ckpt_path = "results/mistral_4/checkpoint-1400"
# model = load_checkpoint(ckpt_path)

In [None]:
# Implement trlx
# get trlx logging working
# eval on the base models
# switch to default config
# make scripts
# git push
# eval gpt2
# run/eval mistral
# writeup
# Run trlx
# confirm trlx is good on a dummy task
# Test on gpt2
# test on mistral
# understand the results.
# either initialize RL with some valid stuff, or include in prompt.