# VTX
---
[AIGen](https://github.com/LuciferianInk/aigen) is a text generation and training library, originally forked from [AITextGen](https://aitextgen.minimaxir.com/) (which is now defunct).

AIGen is also the foundation of [VTX](https://github.com/0-5788719150923125/vtx).

To use this notebook with Kaggle, one must first enable the "Internet". To do so:

1. Find "Notebook options" in the sidebar on the right-hand side of this page.
2. If required, verify your phone number.
3. Choose "Internet on"

Also, do not forget to connect to an accelerator. The P100s are better for training.

In [None]:
# Kaggle uses an old version of CUDA, so we need to install a version of Pytorch that was built for that version.
!pip install torch>=2.1.0 --no-build-isolation --index-url https://download.pytorch.org/whl/cu110

# We don't even use this, but have to install it because of Kaggle bugs
!pip install torchaudio

# Now we install AIGen
!pip install 'git+https://github.com/LuciferianInk/aigen.git'

## Configuration

We could set a bunch of variables here, but for now we only set `focus` (the name of this training run) and hardcode everything else in the steps below for clarity.

In [None]:
# Name of this run; reused below as the model config's name and as the log sub-directory
focus = "frame"

## Pretraining

We want to train a ModuleFormer from scratch, so we import example code from IBM's MoLM and configure the settings.

In [None]:
# Install our fork of ModuleFormer
!pip install 'git+https://github.com/LuciferianInk/ModuleFormer.git@enable-gradient-checkpointing'

# Pretrain configs for ModuleFormer
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoModelForSequenceClassification,
    AutoTokenizer,
)

from moduleformer import (
    ModuleFormerConfig,
    ModuleFormerForCausalLM,
    ModuleFormerForSequenceClassification,
)

AutoConfig.register("moduleformer", ModuleFormerConfig)
AutoModelForCausalLM.register(ModuleFormerConfig, ModuleFormerForCausalLM)
AutoModelForSequenceClassification.register(
    ModuleFormerConfig, ModuleFormerForSequenceClassification
)

base_model = "ibm/MoLM-350M-4B"

pretrain_config = AutoConfig.from_pretrained(base_model)
overrides = {
    "universal": True,
    "world_size": 23,
    "activation_function": 'gelu',
    "n_layer": 16,
    "n_head": 2,
    "k_att": 3,
    "k_mlp": 3,
    "n_att_experts": 8,
    "n_mlp_experts": 16,
    "n_ctx": 2048, # history_length * n_layer
    "n_embd": 768,
    "att_hidden": 256,
    "ffd_hidden": 512,
    "block_size": 128,
    "gate_type": 'gmm',
    "gating_size": 64,
    "aux_loss_type": 'mi',
    "aux_loss_weight": 0.1,
    "history_length": 128,
    "resid_pdrop": 0.1,
    "embd_pdrop": 0.1,
    "attn_pdrop": 0.1,
    "moe_pdrop": 0.1,
    "sample_topk": 2,
    "tie_word_embeddings": True,
}
setattr(pretrain_config, "_name_or_path", focus)
for k, v in overrides.items():
    setattr(pretrain_config, k, v)
print(f"modified pretrain config:")
print(pretrain_config)

## Load a pretrained tokenizer

This isn't actually necessary here, but it can be required in some cases.

In [None]:
# Tweak the tokenizer
from transformers import AutoTokenizer

# Keyword options forwarded unchanged to the tokenizer loader
_tokenizer_options = dict(
    cache_dir="/kaggle/working/models",
    padding="max_length",
    padding_side="left",
    use_fast=True,
    return_overflowing_tokens=True,
    truncation=True,
    trust_remote_code=True,
)

# Load the tokenizer that ships with the base model
tokenizer = AutoTokenizer.from_pretrained(base_model, **_tokenizer_options)

## Load the model

Here we initialize the model with random weights.

In [None]:
# Instantiate your model
import os
from aigen import aigen

# Use this to continue training.
resume_training = False

if resume_training:
    # Resume: load from the output folder written by a previous run; no fresh config is passed
    model = None
    model_folder = "/kaggle/working/trained"
    pretrain_config = None
else:
    # Fresh run: the original referenced `launch_model`, which is never defined in this
    # notebook (NameError). Use the base model identifier declared in the pretraining cell.
    # NOTE(review): presumably aigen builds randomly-initialized weights from `config`
    # when one is supplied, rather than downloading `model` — confirm against aigen.
    model = base_model
    model_folder = None

prototype = aigen(
    model=model,
    model_folder=model_folder,
    tokenizer=tokenizer,
    cache_dir="/kaggle/working/models",
    precision=16,
    gradient_checkpointing=False,
    config=pretrain_config
)

print(prototype)

## Parameter-Efficient Fine-Tuning (PEFT)
Here is a basic example of Low-Rank Adaptation (LoRA) training. The code is currently commented out because it is not used in pre-training.

In [None]:
# # Prepare model for PEFT training

# opts = {
#     "r": 4,
#     "alpha": 16,
#     "dropout": 0.01,
#     "bias": "all",
#     "target_modules": [
#       "embed_in",
#       "query_key_value",
#       "dense",
#       "dense_h_to_4h",
#       "dense_4h_to_h",
#       "embed_out"
#     ]
# }

# prototype.create_adapter("/kaggle/working/trained", opts)

# prototype.model.print_trainable_parameters()

## Metrics

We want to log training metrics, so we install TensorBoard and expose it via ngrok. This requires an authtoken from ngrok.com, saved under "Add-ons > Secrets" in Kaggle.

In [None]:
from kaggle_secrets import UserSecretsClient
secret_label = "NGROK_SECRET"
secret_value = UserSecretsClient().get_secret(secret_label)

import os

clean_logs = True

if clean_logs:
    directory = "/kaggle/working/logs"
    for filename in os.listdir(directory):
        file_path = os.path.join(directory, filename)
        os.remove(file_path)

if secret_value:

    !pip install ngrok tensorboard

    import threading
    import subprocess

    def start_tensorboard():
        subprocess.Popen(
            ["tensorboard", "--logdir", "/kaggle/working/logs", "--bind_all", "--samples_per_plugin", "scalars=999999999"],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.STDOUT
        )

    tensorboard_thread = threading.Thread(target=start_tensorboard)
    tensorboard_thread.start()

    import ngrok

    listener = await ngrok.forward(6006, authtoken=secret_value)
    
    import time

    time.sleep(1)
    print(listener.url())

## Training

Finally, we train the model on a dataset streamed from: https://huggingface.co/datasets

In [None]:
# Train the model

import os
from lightning.pytorch import loggers

# Make sure this run's log directory exists before the logger is created
os.makedirs(f"/kaggle/working/logs/{focus}", exist_ok=True)
logger = loggers.TensorBoardLogger("/kaggle/working/logs", name=focus, default_hp_metric=True)

# Switch to training mode. Assigning `.training = True` only flags the top-level module;
# `.train()` also propagates the mode to every sub-module (dropout etc.).
prototype.model.train()

prototype.train(
    devices="auto",
    strategy="auto",
    # Dataset(s) streamed during training
    streaming_data=[
        {
            "repo": "togethercomputer/RedPajama-Data-V2", 
            "content_key": "raw_content", 
            "sample_size": 1000,
            "snapshots": [
                "2023-14"
            ],
            "name": "default",
            "languages": [
                "en"
            ],
            "sequential": True
        }
    ],
    # Same folder the model-loading cell reads from when resume_training is True
    output_dir="/kaggle/working/trained",
    batch_size=8,
    gradient_accumulation_steps=128,
    block_size=512,
    num_steps=1000,
    warmup_steps=10,
    optimizer="Lion",
    learning_rate=0.000333,
    weight_decay=0.01,
    gradient_clip_val=1.0,
    scheduler="cosine",
    lookahead=5,
    generate_every=500,
    save_every=1000,
    loggers=[logger],
    checkpoint=1,
    resume=resume_training,
)

## Testing

For testing, we just run an interactive inference session.

In [None]:
# Test inference

# Generation settings applied to every prompt in this session
_generation_settings = {
    "do_sample": True,
    "min_length": 23,
    "max_new_tokens": 111,
    "temperature": 0.9,
    "eta_cutoff": 0.0003,
    "penalty_alpha": 0.6,
    "top_k": 4,
    "repetition_penalty": 1.023,
    "no_repeat_ngram_size": 13,
    "renormalize_logits": True,
    "remove_invalid_values": True,
    "max_time": 60,
    "use_cache": True,
}

# Interactive loop with no exit condition: read a prompt, generate, print, repeat
while True:
    print("PROMPT:\n")
    prompt = input()
    completion = prototype.generate(prompt=prompt, **_generation_settings)
    print("COMPLETION:\n")
    print(completion)