<b>Notebook:</b>RedPajama-INICITE-Chat-3B-finetune-LoRA-own-dataset<br>
<b>Author:</b>Szymon Manduk<br>
<b>Date:</b>2023-06-27<br>
<b>Description:</b><br>
Fine-tuning the togethercomputer/RedPajama-INCITE-Chat-3B-v1 model using the LoRA technique and quantization.<br>
Done in 8-bit, as half precision still exceeds the 16 GB available on Colab.<br>
Trained on my own Polish conversational dataset.<br>

In [None]:
# Run this only if you are sure which environment you are in and that you need it.
# Note: see requirements.txt for the full list of packages.
# !pip install -q transformers datasets accelerate peft bitsandbytes python-dotenv  # This line should be run on Google Colab

In [None]:
import torch
import torch.nn as nn
import json
import transformers
from datasets import Dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_int8_training, TaskType
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM
import os
from dotenv import load_dotenv, find_dotenv
import random

In [None]:
# Mount Google Drive at /gdrive (Google Colab only). The dataset, the .env
# file and the checkpoint directory used by this notebook all live under
# '/gdrive/My Drive/'.
from google.colab import drive
drive.mount('/gdrive')

In [None]:
# Models and data: names and paths used throughout the notebook.
BASE_MODEL_NAME='togethercomputer/RedPajama-INCITE-Chat-3B-v1'  # name of the base model to fine-tune
MY_MODEL_NAME='RedPajama-Chat3B-Polish-v2'  # name of my fine-tuned model
MY_DATASET='/gdrive/My Drive/Colab Notebooks/Data/Combined dataset 1-2 2023.06.18.json'  # dataset used for fine-tuning
ENV_FILE='/gdrive/My Drive/Colab Notebooks/.env'  # file in which I store secrets (currently the Hugging Face Hub access token)
TOKEN_NAME='HF_COLAB_RP_CHAT_3B'  # name of the environment variable storing the access token for the Hugging Face Hub
OUTPUT_DIR='/gdrive/My Drive/Colab Notebooks/Models/'  # output directory to which checkpoints are saved

In [None]:
# Read and shuffle my dataset (one JSON object per line).
# The encoding is passed explicitly because the samples are Polish text and
# the platform's default encoding is not guaranteed to be UTF-8.
# The file is iterated line by line instead of being materialised with
# readlines(), and blank lines (e.g. a trailing newline at the end of the
# file) are skipped rather than crashing json.loads.
with open(MY_DATASET, 'r', encoding='utf-8') as fp:
    data = [json.loads(line) for line in fp if line.strip()]
# NOTE: no seed is set, so the order differs between runs.
random.shuffle(data)

In [None]:
# Load the base model and its tokenizer.
# load_in_8bit=True loads the weights quantized to 8 bits (the linear layers
# show up as Linear8bitLt when the model is printed); device_map='auto'
# leaves the placement of the weights on the available devices to the library.
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_NAME,
    load_in_8bit=True,
    device_map='auto',
)
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME)
# Reuse the end-of-sequence token as the padding token so that batches can be
# padded (presumably the tokenizer defines no pad token of its own - confirm).
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Preprocess the data and print the size of my dataset.
# The list of samples is turned into a datasets.Dataset, then the 'text'
# field of every sample is tokenized in batches (batched=True).
# NOTE(review): no truncation / max_length is passed to the tokenizer here,
# so sequences are not shortened at this step.
data = Dataset.from_list(data)
data = data.map(lambda samples: tokenizer(samples['text']), batched=True)
# Last expression of the cell: the notebook displays the number of samples.
len(data)

In [None]:
def count_model_params(model):
    """Tally the elements of a model's parameters by dtype and trainability.

    Args:
        model: Any object exposing ``parameters()`` that yields tensors
            (e.g. a ``torch.nn.Module``).

    Returns:
        A tuple ``(fp32_params, int8_params, requires_grad_params)`` holding
        the number of elements stored as ``torch.float32``, the number stored
        as ``torch.int8``, and the number belonging to parameters with
        ``requires_grad`` set. The third figure is independent of dtype, so
        it may overlap with the first two, and parameters of any other dtype
        are only counted if they require gradients.
    """
    # Materialise once so the three tallies below see the same tensors.
    tensors = list(model.parameters())

    fp32_total = sum(t.numel() for t in tensors if t.dtype == torch.float32)
    int8_total = sum(t.numel() for t in tensors if t.dtype == torch.int8)
    trainable_total = sum(t.numel() for t in tensors if t.requires_grad)

    return fp32_total, int8_total, trainable_total

In [None]:
# Report the parameter statistics of the freshly loaded 8-bit model (before
# it is prepared for training and wrapped with LoRA): element counts by dtype
# and the number of elements that currently require gradients.
fp32_params, int8_params, requires_grad_params = count_model_params(model)
print(f"Number of FP32 parameters: {fp32_params}")
print(f"Number of INT8 parameters: {int8_params}")
print(f"Number of parameters requiring gradients: {requires_grad_params}")

Number of FP32 parameters: 0
Number of INT8 parameters: 2516582400
Number of parameters requiring gradients: 258544640


In [None]:
print(model)

GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50432, 2560)
    (layers): ModuleList(
      (0-31): 32 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        (attention): GPTNeoXAttention(
          (rotary_emb): RotaryEmbedding()
          (query_key_value): Linear8bitLt(in_features=2560, out_features=7680, bias=True)
          (dense): Linear8bitLt(in_features=2560, out_features=2560, bias=True)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear8bitLt(in_features=2560, out_features=10240, bias=True)
          (dense_4h_to_h): Linear8bitLt(in_features=10240, out_features=2560, bias=True)
          (act): GELUActivation()
        )
      )
    )
    (final_layer_norm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
  )
  (embed_out): Linear(in_features=2560, out_features=50432, bias=False)
)


In [None]:
# Configuration of the LoRA adapter that is attached to the base model.
lora_config = LoraConfig(
    r=16,  # rank of the low-rank update matrices
    lora_alpha=32,  # LoRA scaling factor
    # When using LoRA it is important to apply it to ALL `Linear` layers of the
    # model to get results similar to "full fine-tuning".
    # Open question: should embed_out also be wrapped? It is currently
    # included in the list below.
    target_modules=["query_key_value", "dense", "dense_h_to_4h", "dense_4h_to_h", "embed_out"],
    lora_dropout=0.05,  # dropout probability used by the LoRA layers
    bias="none",  # NOTE(review): presumably means no bias terms are trained - confirm
    task_type="CAUSAL_LM"
)

In [None]:
# Prepare the int8 model for training:
# https://github.com/huggingface/peft/blob/eb01b5ee1dfeb6fdacc73dc2fb1dd674bb6868ac/src/peft/utils/other.py#L101
# From the method's description: it wraps the entire protocol for preparing a
# model before running a training. This includes:
#   1. casting the layer norms to fp32,
#   2. making the output embedding layer require grads,
#   3. adding the upcasting of the LM head to fp32.
model = prepare_model_for_int8_training(model)

In [None]:
# Add the LoRA adapter: wrap the prepared model according to lora_config,
# then print how many of its parameters are trainable.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 21819392 || all params: 2797683712 || trainable%: 0.7799091765238114


In [None]:
# Set the training arguments for the Trainer.
# NOTE(review): no evaluation dataset is handed to the Trainer in this
# notebook, so per_device_eval_batch_size presumably has no effect - confirm.
training_args = transformers.TrainingArguments(
    output_dir=OUTPUT_DIR + MY_MODEL_NAME,  # output directory
    per_device_train_batch_size=4,   # batch size per device during training
    per_device_eval_batch_size=4,    # batch size for evaluation
    warmup_steps=100,                # number of warmup steps for learning rate scheduler
    max_steps=650,                   # total number of training steps (about 1.87 epochs in the recorded run)
    logging_steps=50,                # log the training loss every 50 steps
)

In [None]:
# Build the Trainer for the LoRA-wrapped model on the tokenized dataset.
# DataCollatorForLanguageModeling with mlm=False prepares batches for causal
# (next-token) language modeling rather than masked language modeling.
trainer = transformers.Trainer(
    model=model,
    train_dataset=data,
    args=training_args,
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)
)

In [None]:
# Turn off caching (model.config.use_cache) to save RAM during training.
model.config.use_cache = False

In [None]:
# Run the fine-tuning with the settings from training_args; the call returns
# a TrainOutput summary, which the notebook displays as the cell result.
trainer.train()

You're using a GPTNeoXTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
50,1.4962
100,1.2251
150,1.1783
200,1.0718
250,1.0601
300,1.0049
350,0.9554
400,0.8674
450,0.8591
500,0.882




TrainOutput(global_step=650, training_loss=1.0161065967266376, metrics={'train_runtime': 3442.7153, 'train_samples_per_second': 0.755, 'train_steps_per_second': 0.189, 'total_flos': 1.6343976882038784e+16, 'train_loss': 1.0161065967266376, 'epoch': 1.87})

In [None]:
# Save the trained model to the drive, into the same directory that serves as
# the Trainer's output_dir (OUTPUT_DIR + MY_MODEL_NAME).
# NOTE(review): the model is LoRA-wrapped at this point, so this presumably
# stores only the adapter weights rather than the full model - confirm.
model.save_pretrained(f"{OUTPUT_DIR}{MY_MODEL_NAME}")

In [None]:
# Read the Hugging Face Hub API key so that the model can be pushed to the Hub.
# load_dotenv loads the variables defined in ENV_FILE into the process
# environment; its return value is not needed, hence the throwaway `_`.
_ = load_dotenv(find_dotenv(filename=ENV_FILE))
# Raises KeyError if the TOKEN_NAME variable is not present in the environment.
api_key  = os.environ[TOKEN_NAME]

In [None]:
# Save the model to the Hugging Face Hub under MY_MODEL_NAME, authenticating
# with the API key read from the environment.
model.push_to_hub(MY_MODEL_NAME, use_auth_token=api_key, commit_message="The first bigger training on 1391 samples.")