In [1]:
from tqdm.auto import tqdm
import torch
from transformers import Trainer, TrainingArguments
from transformers import AutoTokenizer
from transformers import DataCollatorForLanguageModeling
from transformers.models.llama.modeling_llama import LlamaConfig
from datasets import load_dataset, DatasetDict
from bitnet import BitNetForCausalLM

In [2]:
# Tokenization / split settings for this cell.
MAX_SEQ_LEN = 256   # tokens per example; the model config uses max_position_embeddings=256
TEST_FRACTION = 0.2
SPLIT_SEED = 42     # fixed so the train/test partition is identical across kernel restarts

# NOTE(review): trust_remote_code=True runs the dataset's loading script fetched from the Hub;
# confirm the source is trusted before re-running this in a new environment.
raw_dataset = load_dataset("togethercomputer/RedPajama-Data-1T-Sample", trust_remote_code=True)

# Hold out a fraction of the "train" split for evaluation. Without a seed the held-out
# set differs on every run, which makes the reported validation loss irreproducible.
dataset = raw_dataset["train"].train_test_split(test_size=TEST_FRACTION, seed=SPLIT_SEED)

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
# Reuse the EOS token as the padding token for the fixed-length padding below.
tokenizer.pad_token = tokenizer.eos_token


def tokenize_batch(batch):
    """Tokenize a batch of examples, truncating/padding each "text" entry to MAX_SEQ_LEN tokens."""
    return tokenizer(batch["text"], truncation=True, padding="max_length", max_length=MAX_SEQ_LEN)


tokenized_dataset = dataset.map(tokenize_batch, batched=True)

# mlm=False selects the causal (next-token) language-modeling collator mode.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

Map:   0%|          | 0/744411 [00:00<?, ? examples/s]

Map:   0%|          | 0/186103 [00:00<?, ? examples/s]

In [3]:
# Model configuration: vocabulary and special-token ids are taken from the tokenizer,
# the remaining values fix the size of the network.
config = LlamaConfig(
    # vocabulary / special tokens (all derived from the tokenizer)
    vocab_size=len(tokenizer),
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
    # network dimensions
    num_hidden_layers=12,
    num_attention_heads=12,
    hidden_size=768,
    intermediate_size=2048,
    max_position_embeddings=256,
)

In [4]:
# Build the model from the configuration and report its total parameter count.
model = BitNetForCausalLM(config)
model_size = sum(
    param.numel()
    for param in model.parameters()
)
print(f"model size: {model_size/1000**2:.1f}M parameters")

model size: 134.2M parameters


In [9]:
# Training hyper-parameters, grouped by concern.
trainer_args = TrainingArguments(
    # run identity and reporting
    run_name="myBitNet",
    output_dir="./result",
    report_to="wandb",
    push_to_hub=False,
    # amount of training and batching
    num_train_epochs=1,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    gradient_accumulation_steps=1,
    # optimizer, learning-rate schedule and precision
    learning_rate=1.5e-3,
    lr_scheduler_type="linear",
    warmup_steps=3000,
    weight_decay=0.1,
    adam_beta1=0.9,
    adam_beta2=0.95,
    bf16=True,
    # evaluation, logging and checkpointing share the same 1000-step interval
    evaluation_strategy="steps",
    eval_steps=1000,
    logging_steps=1000,
    save_steps=1000,
    save_total_limit=1,
)

# Wire the model, the tokenized train/test splits and the collator into a Trainer.
trainer = Trainer(
    args=trainer_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    data_collator=data_collator,
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [10]:
# Run training with the Trainer configured above. The call is the last expression
# in the cell, so its return value is shown as the cell output.
trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


[34m[1mwandb[0m: Currently logged in as: [33mnamba[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss
1000,6.3149,5.191377
2000,4.8398,4.539315
3000,4.4001,4.310907
4000,4.1994,4.100917
5000,4.026,3.961827
6000,3.9095,3.870261
7000,3.8309,3.784921
8000,3.7447,3.712211
9000,3.6767,3.642249
10000,3.6079,3.573456


TrainOutput(global_step=11632, training_loss=4.152140096588345, metrics={'train_runtime': 18450.1392, 'train_samples_per_second': 40.347, 'train_steps_per_second': 0.63, 'total_flos': 1.2530836415486362e+17, 'train_loss': 4.152140096588345, 'epoch': 1.0})