Add helpful and harmless example (#128)
* feat(examples/hh): add HH example

* feat(configs): add deepspeed configs

* chore(configs): delete old zero2-fp32 config

* chore(ppo_hh): delete preprocessing (consolidated into the dataset)

* chore(configs): rename folder `deepspeed` -> `accelerate`

* feat(hh): add sft

* chore(hh/configs): remove dates from `checkpoint_dir`

* chore(ilql,sft_hh): `rewards` -> `reward` for consistency with ppo

* chore(hh/README): rephrase the optionality of triton server

* feat(setup.cfg): add `tritonclient` (very light dependency)

* feat(hh/configs): add 125m pythia-sft ppo config

* feat(ilql_hh): switch from `helpful-base` to `full-hh-rlhf`

* feat(sft_hh): switch from `helpful-base` to `full-hh-rlhf`

* chore(configs/accelerate): add z3,ddp & update hh readme references

* style(hh): satisfy flake

* fix(configs/accelerate): cast to string `dynamo` option

* feat(hh/configs): add all model sizes configs

* feat(hh/readme): add urls and w&b runs
maxreciprocate committed Feb 21, 2023
1 parent 3396bf1 commit 93c90cb
Showing 23 changed files with 1,084 additions and 6 deletions.
16 changes: 16 additions & 0 deletions configs/accelerate/ddp.yaml
@@ -0,0 +1,16 @@
compute_environment: LOCAL_MACHINE
deepspeed_config: {}
distributed_type: MULTI_GPU
downcast_bf16: no
dynamo_backend: 'NO'
fsdp_config: {}
gpu_ids: all
machine_rank: 0
main_training_function: main
megatron_lm_config: {}
mixed_precision: bf16
num_machines: 1
num_processes: 8
rdzv_backend: static
same_network: true
use_cpu: false
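
For the smaller configs added in this commit, plain DDP without ZeRO sharding may already be enough; a sketch reusing the launch line from the README below, with `ddp.yaml` substituted for the DeepSpeed config (run from `examples/hh/`):

```sh
# sketch: train the 125M PPO config with plain DDP instead of DeepSpeed ZeRO
CONFIG_PATH=configs/ppo_hh_125M.yml accelerate launch --num_processes 7 --config_file ../../configs/accelerate/ddp.yaml ppo_hh.py
```
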
@@ -1,19 +1,22 @@
compute_environment: LOCAL_MACHINE
deepspeed_config:
  deepspeed_multinode_launcher: standard
  gradient_accumulation_steps: 1
  gradient_clipping: 1.0
  offload_optimizer_device: none
  offload_param_device: none
  zero3_init_flag: true
  zero3_init_flag: false
  zero_stage: 2
distributed_type: DEEPSPEED
downcast_bf16: 'no'
downcast_bf16: no
dynamo_backend: 'NO'
fsdp_config: {}
machine_rank: 0
main_process_ip: null
main_process_port: null
main_training_function: main
mixed_precision: 'no'
megatron_lm_config: {}
mixed_precision: bf16
num_machines: 1
num_processes: 2
num_processes: 8
rdzv_backend: static
same_network: true
use_cpu: false
22 changes: 22 additions & 0 deletions configs/accelerate/zero2-fp16.yaml
@@ -0,0 +1,22 @@
compute_environment: LOCAL_MACHINE
deepspeed_config:
  deepspeed_multinode_launcher: standard
  gradient_accumulation_steps: 1
  gradient_clipping: 1.0
  offload_optimizer_device: none
  offload_param_device: none
  zero3_init_flag: false
  zero_stage: 2
distributed_type: DEEPSPEED
downcast_bf16: no
dynamo_backend: 'NO'
fsdp_config: {}
machine_rank: 0
main_training_function: main
megatron_lm_config: {}
mixed_precision: fp16
num_machines: 1
num_processes: 8
rdzv_backend: static
same_network: true
use_cpu: false
23 changes: 23 additions & 0 deletions configs/accelerate/zero3.yaml
@@ -0,0 +1,23 @@
compute_environment: LOCAL_MACHINE
deepspeed_config:
  deepspeed_multinode_launcher: standard
  gradient_accumulation_steps: 1
  gradient_clipping: 1.0
  offload_optimizer_device: none
  offload_param_device: none
  zero3_init_flag: true
  zero3_save_16bit_model: true
  zero_stage: 3
distributed_type: DEEPSPEED
downcast_bf16: no
dynamo_backend: 'NO'
fsdp_config: {}
machine_rank: 0
main_training_function: main
megatron_lm_config: {}
mixed_precision: bf16
num_machines: 1
num_processes: 8
rdzv_backend: static
same_network: true
use_cpu: false
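
These accelerate configs are selected through the `--config_file` flag of `accelerate launch`, as in the README below; for example, a sketch of running the GPT-J PPO example under ZeRO stage 3 instead of stage 2 (whether stage 3 is actually required for GPT-J is not claimed here):

```sh
# sketch: same launch line as the README, pointing at the new zero3 config
accelerate launch --num_processes 7 --config_file ../../configs/accelerate/zero3.yaml ppo_hh.py
```
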
43 changes: 43 additions & 0 deletions examples/hh/README.md
@@ -0,0 +1,43 @@
### Training on Anthropic's Helpful & Harmless [dataset](https://github.com/anthropics/hh-rlhf)

As an example, the following setup assumes a single machine with 8x A100 80GB GPUs, the last of which is dedicated to hosting a reward model. Optionally, you can use [Triton Inference Server](https://github.com/triton-inference-server) to host the reward model elsewhere; otherwise the training script will instantiate [a default one](https://huggingface.co/Dahoas/gptj-rm-static) on its own.

Launch training of [GPT-J](https://huggingface.co/EleutherAI/gpt-j-6B) on 7 GPUs, with the 8th GPU hosting the reward model:
```sh
accelerate launch --num_processes 7 --config_file ../../configs/accelerate/zero2-bf16.yaml ppo_hh.py
```
Alternatively, if you want to train a smaller model or start from a supervised checkpoint, you can use one of the [configs](./configs):
```sh
CONFIG_PATH=configs/ppo_hh_125M.yml accelerate launch --num_processes 7 --config_file ../../configs/accelerate/zero2-bf16.yaml ppo_hh.py
```

Already trained models are hosted at https://huggingface.co/reciprocate

#### Optional steps to set up a reward model (trained with [Dahoas/reward-modeling](https://github.com/Dahoas/reward-modeling)) with Triton Server:

```sh
# convert the model and create a config and a `model_store` folder structured for Triton
python to_triton.py --base_model EleutherAI/gpt-j-6B --checkpoint Dahoas/gptj-rm-static --revision 676bfd4d

# build a Singularity image from the Docker image (skip this step if you use Docker directly)
singularity build --sandbox tritonserver-pyt.sif docker://nvcr.io/nvidia/tritonserver:22.08-pyt-python-py3
```

```sh
# start Triton Server pointing to the `model_store` containing the reward model
SINGULARITYENV_CUDA_VISIBLE_DEVICES=7 singularity run --nv --bind model_store:/model_store tritonserver-pyt.sif tritonserver --model-repository=/model_store &

# set the model's URL; replace the name after the slash if you use a different checkpoint
export TRITON_HOST=localhost:8001/gptj-rm-static

# launch training
accelerate launch --num_processes 7 --config_file ../../configs/accelerate/zero2-bf16.yaml ppo_hh.py
```
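
If the reward model is hosted on a separate machine rather than the 8th local GPU, `TRITON_HOST` points at that machine's gRPC endpoint instead (8001 is Triton's default gRPC port, as above); the hostname here is purely hypothetical:

```sh
# hypothetical remote host; keep the model name after the slash in sync with your checkpoint
export TRITON_HOST=reward-host.example.com:8001/gptj-rm-static
```
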

#### Sample W&B runs

PPO GPT-J: https://wandb.ai/sorry/trlx/runs/v0bir5s9

ILQL GPT-J: https://wandb.ai/sorry/trlx/runs/1qqxp72a

SFT GPT-J: https://wandb.ai/sorry/trlx/runs/a7ng078v
50 changes: 50 additions & 0 deletions examples/hh/configs/ilql_hh.yml
@@ -0,0 +1,50 @@
train:
  seq_length: 1024
  epochs: 100
  total_steps: 10000
  batch_size: 4

  checkpoint_interval: 100000
  eval_interval: 1000

  pipeline: "PromptPipeline"
  trainer: "AccelerateILQLTrainer"
  checkpoint_dir: "checkpoints/ilql_hh"

model:
  model_path: "EleutherAI/gpt-j-6B"
  num_layers_unfrozen: -1

tokenizer:
  tokenizer_path: "gpt2"
  truncation_side: "left"

optimizer:
  name: "adamw"
  kwargs:
    lr: 1.0e-6
    betas: [0.9, 0.95]
    eps: 1.0e-8
    weight_decay: 1.0e-6

scheduler:
  name: "cosine_annealing"
  kwargs:
    T_max: 10000000
    eta_min: 1.0e-6

method:
  name: "ilqlconfig"
  tau: 0.6
  gamma: 0.99
  cql_scale: 0.1
  awac_scale: 1
  alpha: 0.0001
  beta: 0
  steps_for_target_q_sync: 1
  two_qs: true
  gen_kwargs:
    max_new_tokens: 128
    top_k: 20
    beta: [1, 4]
    temperature: 1.0
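
A sketch of launching ILQL training with this config, assuming `ilql_hh.py` honors the same `CONFIG_PATH` override as `ppo_hh.py` in the README above:

```sh
# sketch: mirrors the README's launch line, with CONFIG_PATH pointing at the ILQL config
CONFIG_PATH=configs/ilql_hh.yml accelerate launch --num_processes 7 --config_file ../../configs/accelerate/zero2-bf16.yaml ilql_hh.py
```
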
50 changes: 50 additions & 0 deletions examples/hh/configs/ilql_hh_125M.yml
@@ -0,0 +1,50 @@
train:
  seq_length: 1024
  epochs: 100
  total_steps: 20000
  batch_size: 16

  checkpoint_interval: 100000
  eval_interval: 1000

  pipeline: "PromptPipeline"
  trainer: "AccelerateILQLTrainer"
  checkpoint_dir: "checkpoints/ilql_hh_125M"

model:
  model_path: "EleutherAI/pythia-125m-deduped"
  num_layers_unfrozen: -1

tokenizer:
  tokenizer_path: "EleutherAI/gpt-neox-20b"
  truncation_side: "left"

optimizer:
  name: "adamw"
  kwargs:
    lr: 1.0e-6
    betas: [0.9, 0.95]
    eps: 1.0e-8
    weight_decay: 1.0e-6

scheduler:
  name: "cosine_annealing"
  kwargs:
    T_max: 10000000
    eta_min: 1.0e-6

method:
  name: "ilqlconfig"
  tau: 0.6
  gamma: 0.99
  cql_scale: 0.1
  awac_scale: 1
  alpha: 0.0001
  beta: 0
  steps_for_target_q_sync: 1
  two_qs: true
  gen_kwargs:
    max_new_tokens: 96
    top_k: 20
    beta: [1, 2, 4]
    temperature: 1.0
50 changes: 50 additions & 0 deletions examples/hh/configs/ilql_hh_1B.yml
@@ -0,0 +1,50 @@
train:
  seq_length: 1024
  epochs: 100
  total_steps: 20000
  batch_size: 8

  checkpoint_interval: 100000
  eval_interval: 1000

  pipeline: "PromptPipeline"
  trainer: "AccelerateILQLTrainer"
  checkpoint_dir: "checkpoints/ilql_hh_1b"

model:
  model_path: "EleutherAI/pythia-1.4b-deduped"
  num_layers_unfrozen: -1

tokenizer:
  tokenizer_path: "EleutherAI/gpt-neox-20b"
  truncation_side: "left"

optimizer:
  name: "adamw"
  kwargs:
    lr: 1.0e-6
    betas: [0.9, 0.95]
    eps: 1.0e-8
    weight_decay: 1.0e-6

scheduler:
  name: "cosine_annealing"
  kwargs:
    T_max: 10000000
    eta_min: 1.0e-6

method:
  name: "ilqlconfig"
  tau: 0.6
  gamma: 0.99
  cql_scale: 0.1
  awac_scale: 1
  alpha: 0.0001
  beta: 0
  steps_for_target_q_sync: 1
  two_qs: true
  gen_kwargs:
    max_new_tokens: 96
    top_k: 20
    beta: [1, 2, 4]
    temperature: 1.0
50 changes: 50 additions & 0 deletions examples/hh/configs/ilql_hh_20B.yml
@@ -0,0 +1,50 @@
train:
  seq_length: 1024
  epochs: 100
  total_steps: 3000
  batch_size: 1

  checkpoint_interval: 100000
  eval_interval: 1000

  pipeline: "PromptPipeline"
  trainer: "AccelerateILQLTrainer"
  checkpoint_dir: "checkpoints/ilql_hh_20b"

model:
  model_path: "EleutherAI/gpt-neox-20b"
  num_layers_unfrozen: -1

tokenizer:
  tokenizer_path: "EleutherAI/gpt-neox-20b"
  truncation_side: "left"

optimizer:
  name: "adamw"
  kwargs:
    lr: 1.0e-6
    betas: [0.9, 0.95]
    eps: 1.0e-8
    weight_decay: 1.0e-6

scheduler:
  name: "cosine_annealing"
  kwargs:
    T_max: 10000000
    eta_min: 1.0e-6

method:
  name: "ilqlconfig"
  tau: 0.6
  gamma: 0.99
  cql_scale: 0.1
  awac_scale: 1
  alpha: 0.0001
  beta: 0
  steps_for_target_q_sync: 1
  two_qs: true
  gen_kwargs:
    max_new_tokens: 96
    top_k: 20
    beta: [1, 2]
    temperature: 1.0
50 changes: 50 additions & 0 deletions examples/hh/configs/ilql_hh_6B.yml
@@ -0,0 +1,50 @@
train:
  seq_length: 2048
  epochs: 100
  total_steps: 10000
  batch_size: 4

  checkpoint_interval: 100000
  eval_interval: 1000

  pipeline: "PromptPipeline"
  trainer: "AccelerateILQLTrainer"
  checkpoint_dir: "checkpoints/ilql_hh_6b"

model:
  model_path: "EleutherAI/pythia-6.9b-deduped"
  num_layers_unfrozen: -1

tokenizer:
  tokenizer_path: "EleutherAI/gpt-neox-20b"
  truncation_side: "left"

optimizer:
  name: "adamw"
  kwargs:
    lr: 1.0e-6
    betas: [0.9, 0.95]
    eps: 1.0e-8
    weight_decay: 1.0e-6

scheduler:
  name: "cosine_annealing"
  kwargs:
    T_max: 10000000
    eta_min: 1.0e-6

method:
  name: "ilqlconfig"
  tau: 0.6
  gamma: 0.99
  cql_scale: 0.1
  awac_scale: 1
  alpha: 0.0001
  beta: 0
  steps_for_target_q_sync: 1
  two_qs: true
  gen_kwargs:
    max_new_tokens: 96
    top_k: 20
    beta: [1, 2, 4]
    temperature: 1.0
