# RWKV v5-base 1B5 / embedding init-range 1e-01 / 4k
This model is based on the RWKV standard 1B5 model

- 24 layers
- 2048 embedding size

This notebook goes through the modified memory training for v5 models, as part of a series covering various initial embedding weight ranges (this one uses an init-range of 1e-01).

**Note:** This project assumes you have the `rwkv-infctx` conda environment set up.

# Basic Setup

In [21]:
# First lets setup the various directories, and init the model
!mkdir -p ../../../../model/
!mkdir -p ../../../../datapath/
!mkdir -p ../../../../checkpoint/

In [22]:
import os

# --- Run settings (edit these to change the experiment) ---
DEEPSPEED_STRAT = "deepspeed_stage_2_offload"
GPU_DEVICES = "auto"
ENABLE_WANDB = True
EMBED_SCALE = 0.1
# Filename-safe form of the embedding scale (e.g. 0.1 -> "0_1")
EMBED_SCALE_LABEL = str(EMBED_SCALE).replace(".", "_")

print("DEEPSPEED_STRAT:", DEEPSPEED_STRAT)
print("ENABLE_WANDB:", ENABLE_WANDB)
print("GPU_DEVICES:", GPU_DEVICES)

# Value exported as the WANDB_MODE environment variable by the training cells
WANDB_MODE = "online" if ENABLE_WANDB else "disabled"

# --- Paths ---
# Notebooks have no __file__, so the kernel's working directory stands in for
# the notebook's directory (assumes the kernel was started next to this
# notebook). The project root is three levels above it.
NOTEBOOK_DIR = os.getcwd()
PROJECT_DIR = os.path.abspath(os.path.join(NOTEBOOK_DIR, "../../../"))
TRAINER_DIR = os.path.abspath(os.path.join(PROJECT_DIR, "./RWKV-v5/"))
# Inference scripts are run from the same directory as the trainer
INFERENCE_DIR = TRAINER_DIR

print("NOTEBOOK_DIR:", NOTEBOOK_DIR)
print("INFERENCE_DIR:", INFERENCE_DIR)
print("TRAINER_DIR:", TRAINER_DIR)
print("PROJECT_DIR:", PROJECT_DIR)

DEEPSPEED_STRAT: deepspeed_stage_2_offload
ENABLE_WANDB: True
GPU_DEVICES: auto
NOTEBOOK_DIR: /home/ubuntu/rwkv5x-tokenshift-exp-A/notebook/experiment/position_loss_bias
INFERENCE_DIR: /home/ubuntu/rwkv5x-tokenshift-exp-A/RWKV-v5
TRAINER_DIR: /home/ubuntu/rwkv5x-tokenshift-exp-A/RWKV-v5
PROJECT_DIR: /home/ubuntu/rwkv5x-tokenshift-exp-A


In [23]:
# Initialise the base model weights: 24 layers, 2048 embedding size, NeoX vocab,
# with the embedding init scale taken from EMBED_SCALE.
# The command runs from TRAINER_DIR, so the output path "../model/..." is
# relative to that directory.
# NOTE(review): --skip-if-exists presumably leaves an existing file untouched
# instead of re-initialising it; confirm in init_model.py.
!cd "{TRAINER_DIR}" && \
    python3 ./init_model.py \
        --n_layer 24 --n_embd 2048 \
        --emb-scale "{EMBED_SCALE}" \
        --vocab_size neox --skip-if-exists \
        "../model/L24-D2048-E{EMBED_SCALE_LABEL}-neox-v5base-init.pth"

[2023-08-18 08:34:04,347] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.0.1'
---- Initializing model ----
No of layers: 24
Embedding size: 2048
Output model path: ../model/L24-D2048-E0_1-neox-v5base-init.pth
Vocab size: 50277
Emb scale: 0.1
---- ----- ----
50277 2048  -0.1 emb.weight
2048  2048  1.0  blocks.0.att.receptance.weight
2048  2048  1.0  blocks.0.att.key.weight
2048  2048  1.0  blocks.0.att.value.weight
2048  2048  0    blocks.0.att.output.weight
8192  2048  1.0  blocks.0.ffn.key.weight
2048  2048  0    blocks.0.ffn.receptance.weight
2048  8192  0    blocks.0.ffn.value.weight
2048  2048  1.0  blocks.1.att.receptance.weight
2048  2048  1.0  blocks.1.att.key.weight
2048  2048  1.0  blocks.1.att.value.weight
2048  2048  0    blocks.1.att.output.weight
8192  2048  1.0  blocks.1.ffn.key.weight
2048  2048  0    blocks.1.ffn.receptance.weight
2048  8192  0    blocks.1.ffn.

In [24]:
# Preload the required dataset, using the 4k enwiki config that sits next to
# this notebook. The script is run from TRAINER_DIR.
!cd "{TRAINER_DIR}" && \
    python3 preload_datapath.py "{NOTEBOOK_DIR}/v5base-enwiki-4k.yaml"

Saving the dataset (1/1 shards): 100%|█| 5308/5308 [00:00<00:00, 52791.32 exampl
Saving the dataset (1/1 shards): 100%|█| 54/54 [00:00<00:00, 9215.25 examples/s]


## Enwiki 10k / p1.0 - ctx 4k training

In [25]:
# Position-loss bias for this run, and the name prefixes derived from it.
POS_LOSS_BIAS = 1.0
# Filename-safe form of the bias (the "." is replaced by "_")
POS_LOSS_BIAS_LABEL = str(POS_LOSS_BIAS).replace(".", "_")

# WANDB_PREFIX keeps the dotted values for readable run names;
# FILENAME_PREFIX uses the underscore labels for checkpoint/model paths.
WANDB_PREFIX = "v5base-1B5-E{}-P{}".format(EMBED_SCALE, POS_LOSS_BIAS)
FILENAME_PREFIX = "v5base-1B5-E{}-P{}".format(EMBED_SCALE_LABEL, POS_LOSS_BIAS_LABEL)

print(f"POS_LOSS_BIAS: {POS_LOSS_BIAS}")
print(f"WANDB_PREFIX: {WANDB_PREFIX}")
print(f"FILENAME_PREFIX: {FILENAME_PREFIX}")

POS_LOSS_BIAS: 1.0
WANDB_PREFIX: v5base-1B5-E0.1-P1.0
FILENAME_PREFIX: v5base-1B5-E0_1-P1_0


In [26]:
# Start the foundation model training
!cd "{TRAINER_DIR}" && \
    export WANDB_MODE="{WANDB_MODE}" && \
    python lightning_trainer.py fit \
        -c "{NOTEBOOK_DIR}/v5base-enwiki-1k.yaml" \
        --trainer.logger.init_args.name="{WANDB_PREFIX} - Enwiki-small-4k Foundation (train-ctx=4k, {DEEPSPEED_STRAT})" \
        --trainer.strategy="{DEEPSPEED_STRAT}" \
        --trainer.devices="{GPU_DEVICES}" \
        --trainer.callbacks.init_args.dirpath="../checkpoint/{FILENAME_PREFIX}-enwiki-4k/" \
        --model.load_model="../model/L24-D2048-E{EMBED_SCALE_LABEL}-neox-v5base-init.pth" \
        --model.position_loss_bias=1.0 \
        --model.ctx_len=4096 \
        --model.bptt_learning_range=1

[2023-08-18 08:35:03,020] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.0.1'
  rank_zero_warn(
  rank_zero_warn(f"No seed found, seed set to {seed}")
Global seed set to 64432937
[34m[1mwandb[0m: Currently logged in as: [33mpicocreator[0m. Use [1m`wandb login --relogin`[0m to force relogin
cat: /sys/module/amdgpu/initstate: No such file or directory
ERROR:root:Driver not initialized (amdgpu not found in modules)
[34m[1mwandb[0m: Tracking run with wandb version 0.15.8
[34m[1mwandb[0m: Run data is saved locally in [35m[1m./wandb/run-20230818_083505-14i30730[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mv5base-1B5-E0.1-P1.0 - Enwiki-small-4k Foundation (train-ctx=4k, deepspeed_stage_2_offload)[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/picocreator/RWKV-5X-Experiments[0m
[34m

In [None]:
# Export the model weights from the last training checkpoint into ../model/,
# then list the exported file to confirm it was written.
!cd "{TRAINER_DIR}" && \
    python export_checkpoint.py "../checkpoint/{FILENAME_PREFIX}-enwiki-4k/last.ckpt" "../model/{FILENAME_PREFIX}-enwiki-4k.pth"
!cd "{TRAINER_DIR}" && ls -alh "../model/{FILENAME_PREFIX}-enwiki-4k.pth"

In [None]:
# # Lets do a quick dragon prompt validation
!cd "{INFERENCE_DIR}" && \
    export RWKV_WAVENET_LAYERS="{RWKV_WAVENET_LAYERS}" && \
    python3 dragon_test.py "../model/{FILENAME_PREFIX}-enwiki-4k.pth" "cuda fp32"

In [None]:
# Lets do a quick memory test
# The script path is relative to the notebook's working directory; the model
# path is absolute, built from PROJECT_DIR.
!python3 ../memory_script/eval_v5_memory_guided.py "{PROJECT_DIR}/model/{FILENAME_PREFIX}-enwiki-4k.pth"

## Enwiki 10k / p0.1 - ctx 4k training

In [None]:
# Position-loss bias for this run, and the name prefixes derived from it.
POS_LOSS_BIAS = 0.1
# Filename-safe form of the bias (the "." is replaced by "_")
POS_LOSS_BIAS_LABEL = str(POS_LOSS_BIAS).replace(".", "_")

# WANDB_PREFIX keeps the dotted values for readable run names;
# FILENAME_PREFIX uses the underscore labels for checkpoint/model paths.
WANDB_PREFIX = "v5base-1B5-E{}-P{}".format(EMBED_SCALE, POS_LOSS_BIAS)
FILENAME_PREFIX = "v5base-1B5-E{}-P{}".format(EMBED_SCALE_LABEL, POS_LOSS_BIAS_LABEL)

print(f"POS_LOSS_BIAS: {POS_LOSS_BIAS}")
print(f"WANDB_PREFIX: {WANDB_PREFIX}")
print(f"FILENAME_PREFIX: {FILENAME_PREFIX}")

In [None]:
# Start the foundation model training
!cd "{TRAINER_DIR}" && \
    export WANDB_MODE="{WANDB_MODE}" && \
    python lightning_trainer.py fit \
        -c "{NOTEBOOK_DIR}/v5base-enwiki-1k.yaml" \
        --trainer.logger.init_args.name="{WANDB_PREFIX} - Enwiki-small-4k Foundation (train-ctx=4k, {DEEPSPEED_STRAT})" \
        --trainer.strategy="{DEEPSPEED_STRAT}" \
        --trainer.devices="{GPU_DEVICES}" \
        --trainer.callbacks.init_args.dirpath="../checkpoint/{FILENAME_PREFIX}-enwiki-4k/" \
        --model.load_model="../model/L24-D2048-E{EMBED_SCALE_LABEL}-neox-v5base-init.pth" \
        --model.position_loss_bias=0.1 \
        --model.ctx_len=4096 \
        --model.bptt_learning_range=1

In [None]:
# Export the model weights from the last training checkpoint into ../model/,
# then list the exported file to confirm it was written.
!cd "{TRAINER_DIR}" && \
    python export_checkpoint.py "../checkpoint/{FILENAME_PREFIX}-enwiki-4k/last.ckpt" "../model/{FILENAME_PREFIX}-enwiki-4k.pth"
!cd "{TRAINER_DIR}" && ls -alh "../model/{FILENAME_PREFIX}-enwiki-4k.pth"

In [None]:
# # Lets do a quick dragon prompt validation
!cd "{INFERENCE_DIR}" && \
    export RWKV_WAVENET_LAYERS="{RWKV_WAVENET_LAYERS}" && \
    python3 dragon_test.py "../model/{FILENAME_PREFIX}-enwiki-4k.pth" "cuda fp32"

In [None]:
# Lets do a quick memory test
# The script path is relative to the notebook's working directory; the model
# path is absolute, built from PROJECT_DIR.
!python3 ../memory_script/eval_v5_memory_guided.py "{PROJECT_DIR}/model/{FILENAME_PREFIX}-enwiki-4k.pth"