# Pretraining a ModuleFormer
---
[AIGen](https://github.com/LuciferianInk/aigen) is a text generation and training library, originally forked from [AITextGen](https://aitextgen.minimaxir.com/) (which is now defunct).

AIGen is also the foundation of [VTX](https://github.com/0-5788719150923125/vtx).

To use this notebook with Kaggle, one must first enable the "Internet" feature. To do so:

1. Find "Notebook options" in the sidebar on the right-hand side of this page.
2. If required, verify your phone number.
3. Choose "Internet on".
4. Connect to the P100 accelerator.
5. Set up file persistence.

Do not forget to connect to an accelerator. The P100s are better than the T4s. However, with two T4s available, training may benefit from DistributedDataParallel (DDP).

## Configuration

We set a few shared variables here. Most other settings are hardcoded in the cells below for clarity.

In [None]:
# Notebook-wide settings shared by the cells below.

# Hugging Face model id used as the starting point for config/tokenizer lookups.
base_model: str = "ibm/MoLM-350M-4B"
# Short label for this run; reused as the model name and the log sub-directory.
focus: str = "frame"

# Ablation control, handed to the trainer as its `prune` argument.
prune: float = 0.0

# Control action: intended to toggle training a custom tokenizer.
train_tokenizer: bool = True

# True continues training from a checkpoint; False starts a fresh run.
resume_training: bool = True

## Update system packages

In [None]:
# Kaggle uses an old version of CUDA, so we need to install a version of Pytorch that was built for that version.
!pip install torch>=2.1.0 --no-build-isolation --index-url https://download.pytorch.org/whl/cu110

# We don't even use this, but have to install it because of Kaggle bugs
!pip install torchaudio

# Now we install AIGen
!pip install 'git+https://github.com/LuciferianInk/aigen.git'

## Train a tokenizer

Most of this isn't truly necessary here. We are training our own custom tokenizer, rather than using a default pretrained one.

In [None]:
# Tweak the tokenizer
import os
from tokenizers import Tokenizer
from transformers import AutoTokenizer, PreTrainedTokenizerFast
from aigen.tokenizers import train_tokenizer

output_dir = "/kaggle/working/trained"

!git clone 'https://github.com/SocioProphet/clymer_research' '/kaggle/working/corpus'

def list_full_paths_excluding_git(directory):
    """Lists full paths of files in a directory, excluding the .git directory.

    Args:
        directory (str): The path to the directory to process.

    Returns:
        list: A list of full file paths.
    """

    file_paths = []
    for root, d_names, f_names in os.walk(directory):
        # Exclude the .git directory
        d_names[:] = [d for d in d_names if d != ".git"]

        for f in f_names:
            file_path = os.path.join(root, f)
            file_paths.append(file_path)

    return file_paths


files = list_full_paths_excluding_git("/kaggle/working/corpus")
print(files)

tokenizer_dir = base_model
tokenizer_config = dict(
    cache_dir=output_dir,
    padding="max_length",
    padding_side='left',
    use_fast=True,
    return_overflowing_tokens=True,
    truncation=True,
    trust_remote_code=True,
    vocab_size=12288
)

if train_tokenizer:
    tokenizer_dir = output_dir
    tokenizer = train_tokenizer(
        files=files,
        dropout=None,
        vocab_size=12288,
        min_frequency=2,
        save_path=tokenizer_dir,
    )

tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir, **tokenizer_config)

## Pretraining

We want to train a ModuleFormer from scratch, so we import example code from IBM's MoLM and configure the settings.

In [None]:
# Install ModuleFormer from the IBM GitHub repository
!pip install 'git+https://github.com/IBM/ModuleFormer.git'

# Pretrain configs for ModuleFormer
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoModelForSequenceClassification,
    AutoTokenizer,
)

from moduleformer import (
    ModuleFormerConfig,
    ModuleFormerForCausalLM,
    ModuleFormerForSequenceClassification,
)

# Register the ModuleFormer classes with the transformers Auto* factories:
# the "moduleformer" model type maps to ModuleFormerConfig, and that config
# class maps to the causal-LM and sequence-classification model classes.
AutoConfig.register("moduleformer", ModuleFormerConfig)
AutoModelForCausalLM.register(ModuleFormerConfig, ModuleFormerForCausalLM)
AutoModelForSequenceClassification.register(
    ModuleFormerConfig, ModuleFormerForSequenceClassification
)

# Start from the base model's published config, then overwrite the fields
# below to describe the model we train from scratch.
pretrain_config = AutoConfig.from_pretrained(base_model)
overrides = {
    "universal": True,
    "world_size": 23,
    "activation_function": 'silu',
    "n_layer": 16,
    "n_head": 2,
    "k_att": 6,
    "k_mlp": 6,
    "n_att_experts": 120,
    "n_mlp_experts": 60,
    "n_ctx": 256, # history_length * n_layer
    "n_embd": 256,
    "att_hidden": 32,
    "ffd_hidden": 64,
    "block_size": 16,
    "history_length": 16,
    "gating_size": 8,
    "gate_type": 'gmm',
    "aux_loss_type": 'mi',
    "aux_loss_weight": 0.1,
    "resid_pdrop": 0.1,
    "embd_pdrop": 0.1,
    "attn_pdrop": 0.1,
    "moe_pdrop": 0.1,
    "sample_topk": 3,
    "vocab_size": 12288,
    "tie_word_embeddings": True,
}
# Attribute names known up front are assigned directly; setattr is only
# needed for the dynamic keys in `overrides`.
pretrain_config._name_or_path = focus
# Keep the special-token ids in sync with the tokenizer loaded earlier.
pretrain_config.bos_token_id = tokenizer.bos_token_id
pretrain_config.eos_token_id = tokenizer.eos_token_id
for k, v in overrides.items():
    setattr(pretrain_config, k, v)
print("modified pretrain config:")
print(pretrain_config)

## Load the model

Here we initialize the model with random weights.

In [None]:
# Instantiate your model
import os
import shutil
from aigen import aigen

# Choose between resuming from the saved checkpoint and starting a fresh run.
# The config handed to aigen is kept in its own variable so that the global
# `pretrain_config` is never overwritten; this cell can then be re-run after
# toggling `resume_training` without re-running the pretraining-config cell.
if resume_training:
    # Resume: load from the checkpoint folder, with no base model or config.
    model = None
    model_folder = output_dir
    model_config = None
else:
    # Fresh run: build from the base model name plus the modified config.
    model = base_model
    model_folder = None
    model_config = pretrain_config
    # NOTE(review): this wipes output_dir, which is also where the tokenizer
    # cell saves a trained tokenizer. The in-memory `tokenizer` is unaffected,
    # but confirm that losing the on-disk files is intended.
    shutil.rmtree(output_dir, ignore_errors=True)

prototype = aigen(
    model=model,
    model_folder=model_folder,
    tokenizer=tokenizer,
    cache_dir=output_dir,
    precision=32,
    config=model_config
)

print(prototype)

## Parameter-Efficient Fine-Tuning (PEFT)
Here is a basic example of Low-Rank Adaptation (LoRA) adapter training. The code is currently commented out because it is not used in pre-training.

In [None]:
# # Prepare model for PEFT training

# opts = {
#     "r": 4,
#     "alpha": 16,
#     "dropout": 0.01,
#     "bias": "all",
#     "target_modules": [
#       "embed_in",
#       "query_key_value",
#       "dense",
#       "dense_h_to_4h",
#       "dense_4h_to_h",
#       "embed_out"
#     ]
# }

# prototype.create_adapter(output_dir, opts)

# prototype.model.print_trainable_parameters()

## Metrics

We want to log training metrics, so we install Tensorboard and expose it via ngrok. This requires an authtoken from ngrok.com, saved in Kaggle's "Add-ons>Secrets".

In [None]:
# Fetch the ngrok authtoken stored under Kaggle's "Add-ons > Secrets".
from kaggle_secrets import UserSecretsClient
secret_label = "NGROK_SECRET"
secret_value = UserSecretsClient().get_secret(secret_label)

import os
import shutil

# Directory that TensorBoard reads and the training logger writes into.
directory = "/kaggle/working/logs"
os.makedirs(directory, exist_ok=True)

# A fresh run starts with an empty log directory; a resumed run keeps its logs.
if not resume_training:
    for entry in os.scandir(directory):
        # shutil.rmtree only accepts directories and raises on a plain file,
        # so stray files and symlinks are removed with os.remove instead.
        if entry.is_dir(follow_symlinks=False):
            shutil.rmtree(entry.path)
        else:
            os.remove(entry.path)

# Only expose TensorBoard when an ngrok authtoken was retrieved.
if secret_value:

    !pip install ngrok tensorboard

    import threading
    import subprocess

    def start_tensorboard():
        """Launch TensorBoard as a child process reading /kaggle/working/logs.

        The process handle is not kept. stdout is discarded, and stderr is
        redirected to stdout, so it is discarded as well.
        """
        subprocess.Popen(
            ["tensorboard", "--logdir", "/kaggle/working/logs", "--bind_all", "--samples_per_plugin", "scalars=999999999"],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.STDOUT
        )

    # Start the launcher on a separate thread.
    tensorboard_thread = threading.Thread(target=start_tensorboard)
    tensorboard_thread.start()

    import ngrok

    # Open an ngrok tunnel to local port 6006 (presumably TensorBoard's port,
    # since no --port is passed above). Top-level `await` relies on the
    # notebook's running event loop.
    listener = await ngrok.forward(6006, authtoken=secret_value)
    
    import time

    # Brief pause before printing the public URL of the tunnel.
    time.sleep(1)
    print(listener.url())

## Training

Finally, we train the model on a dataset streamed from: https://huggingface.co/datasets

In [None]:
# Train the model

import os
from lightning.pytorch import loggers

# Make sure this run's log sub-directory exists, then create a TensorBoard
# logger rooted at the shared logs directory and named after `focus`.
os.makedirs(f"/kaggle/working/logs/{focus}", exist_ok=True)
logger = loggers.TensorBoardLogger("/kaggle/working/logs", name=focus, default_hp_metric=True)

# Launch training. The keyword semantics are defined by aigen's `train`
# method, which is not visible in this notebook; notes below are hedged.
prototype.train(
    devices=1,
    strategy="auto",
    prune=prune,
    # Dataset specification; the section text says data is streamed from the
    # Hugging Face datasets hub.
    streaming_data=[
        {
            "repo": "c4", 
            "content_key": "text", 
            "subset": "en.noblocklist",
            "sequential": True,
            "buffer_size": 10000,
            "val_samples": 10000
        }
    ],
    batch_size=32,
    gradient_accumulation_steps=32,
    # NOTE(review): the pretrain config sets "n_ctx": 256 and "block_size": 16,
    # while 512 is passed here. Confirm these refer to different things.
    block_size=512,
    num_steps=10000,
    val_interval=100,
    warmup_steps=10,
    optimizer="Lion",
    learning_rate=0.000333,
    weight_decay=0.1,
    gradient_clip_val=1.0,
    scheduler="cosine",
    lookahead=5,
    loggers=[logger],
    gradient_checkpointing=True,
    generate_every=5,
    save_every=10,
    checkpoint_every=10,
    # Same flag that decided whether the model was loaded from the checkpoint.
    resume=resume_training,
    progress_bar=True,
    output_dir=output_dir,
)

## Testing

For testing, we just run an interactive inference session.

In [None]:
# Test inference

# Sampling settings are identical for every prompt, so they are collected once
# and unpacked into each generate() call.
sampling_options = dict(
    do_sample=True,
    min_length=23,
    max_new_tokens=111,
    temperature=0.9,
    eta_cutoff=0.0003,
    penalty_alpha=0.6,
    top_k=4,
    repetition_penalty=1.023,
    no_repeat_ngram_size=13,
    renormalize_logits=True,
    remove_invalid_values=True,
    max_time=60,
    use_cache=True,
)

# Read a prompt, generate, print, repeat. There is no exit condition; the loop
# runs until the cell is interrupted.
while True:
    print("PROMPT:\n")
    prompt = input()
    completion = prototype.generate(prompt=prompt, **sampling_options)
    print("COMPLETION:\n")
    print(completion)