# Overview



In [1]:
%%capture
!pip install transformers==4.36.2
!pip install accelerate==0.25.0
!pip install datasets==2.15.0
!pip install peft==0.7.1
!pip install bitsandbytes==0.41.3

In [2]:
import os
import torch
from huggingface_hub import login
from kaggle_secrets import UserSecretsClient
# Credentials come from the Kaggle secrets store so that no token is hardcoded here.
user_secrets = UserSecretsClient()
login(token=user_secrets.get_secret("HUGGINGFACE_TOKEN"))

# Run configuration is kept in environment variables; later cells read
# MODEL_NAME, DATASET and WANDB_NAME back through os.getenv.
os.environ["WANDB_API_KEY"] = user_secrets.get_secret("WANDB_API_KEY")
os.environ["WANDB_PROJECT"] = "Fine-tuning Mistral-7B-01"
os.environ["WANDB_NAME"] = "ft-mistral-7b-v01"
os.environ["MODEL_NAME"] = "mistralai/Mistral-7B-v0.1"
os.environ["DATASET"] = "OpenAssistant/oasst_top1_2023-08-25"

# Seed torch's random number generators; the cuDNN determinism flag alone does
# not make randomly initialised weights repeatable between runs.
torch.manual_seed(42)
torch.backends.cudnn.deterministic = True

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [3]:
# Print accelerate's memory estimate table (per dtype) for the model named by the
# MODEL_NAME environment variable exported in the setup cell.
!accelerate estimate-memory ${MODEL_NAME} --library_name transformers

Loading pretrained config for `mistralai/Mistral-7B-v0.1` from `transformers`...
config.json: 100%|█████████████████████████████| 571/571 [00:00<00:00, 3.08MB/s]
┌────────────────────────────────────────────────────────┐
│  Memory Usage for loading `mistralai/Mistral-7B-v0.1`  │
├───────┬─────────────┬──────────┬───────────────────────┤
│ dtype │Largest Layer│Total Size│  Training using Adam  │
├───────┼─────────────┼──────────┼───────────────────────┤
│float32│  864.03 MB  │ 27.49 GB │       109.96 GB       │
│float16│  432.02 MB  │ 13.74 GB │        54.98 GB       │
│  int8 │  216.01 MB  │ 6.87 GB  │        27.49 GB       │
│  int4 │   108.0 MB  │ 3.44 GB  │        13.74 GB       │
└───────┴─────────────┴──────────┴───────────────────────┘


# Loading Datasets

In [None]:
from datasets import load_dataset
# Only the first 500 rows of the train split are loaded to keep the run small.
dataset = load_dataset(os.getenv("DATASET"), split="train[:500]")
# Hold out 10% for evaluation; the fixed seed makes the split identical on every
# run, so evaluation numbers are comparable between runs.
dataset = dataset.train_test_split(test_size=0.1, seed=42)
# Show one raw training example to check the text format.
print(dataset["train"][0]["text"])

dataset

Downloading readme:   0%|          | 0.00/512 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/31.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.61M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

# Loading the Tokenizer

In [None]:
from transformers import AutoTokenizer

# The slow tokenizer is requested because the fast one sometimes ignores added tokens.
tokenizer = AutoTokenizer.from_pretrained(os.getenv("MODEL_NAME"), use_fast=False)

# Padding uses the "</s>" token.
tokenizer.pad_token = "</s>"

# Add <|im_start|> as a regular token and <|im_end|> as the special EOS token.
tokenizer.add_tokens(["<|im_start|>"])
tokenizer.add_special_tokens({"eos_token": "<|im_end|>"})

# Vocabulary size including the tokens just added.
print(len(tokenizer))
tokenizer

# Preprocess Data

In [None]:
def preprocess_func(example):
    """Tokenize the "text" field of a (batched) example.

    Sequences are truncated to 2048 tokens and no special tokens are added by
    the tokenizer. Returns whatever the module-level ``tokenizer`` returns.
    """
    tokenize_options = dict(truncation=True, max_length=2048, add_special_tokens=False)
    return tokenizer(example["text"], **tokenize_options)

# Tokenize every split in batches, using one worker process per CPU core, and
# drop the raw "text" column so only the tokenizer's outputs remain.
dataset_tokenized = dataset.map(
    preprocess_func,
    batched=True,
    num_proc=os.cpu_count(),
    remove_columns=["text"],
)

# Collating Batches

In [None]:
def collate(example):
    """
    Transform a list of dictionaries [{input_ids: [123, ...]}, {...}]
    into a single batch dictionary {input_ids: [...], labels: [...], attention_mask: [...]}.

    Every sequence is right-padded to the length of the longest sequence in the batch.

    Args:
        example: list of dicts, each with an "input_ids" list of token ids.

    Returns:
        dict with three tensors of identical shape:
        "input_ids" (padded with tokenizer.pad_token_id),
        "labels" (a copy of the tokens, padded with -100),
        "attention_mask" (1 on real tokens, 0 on padding).
    """
    tokenlist = [e["input_ids"] for e in example]
    tokens_maxlen = max(len(t) for t in tokenlist)

    input_ids, labels, attention_masks = [], [], []
    for tokens in tokenlist:
        pad_len = tokens_maxlen - len(tokens)
        # pad input_ids with pad_token, labels with the ignore index (-100),
        # and set attention_mask to 1 where there is content, otherwise 0
        input_ids.append(tokens + [tokenizer.pad_token_id] * pad_len)
        labels.append(tokens + [-100] * pad_len)
        attention_masks.append([1] * len(tokens) + [0] * pad_len)

    batch = {
        "input_ids": torch.tensor(input_ids),
        "labels": torch.tensor(labels),
        "attention_mask": torch.tensor(attention_masks),
    }
    return batch

# Loading Model

In [None]:
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# Quantization settings: load the weights in 4-bit NF4 with double quantization
# and use bfloat16 as the compute dtype.
bnb_config=BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    llm_int8_enable_fp32_cpu_offload=True,
)

# Load the base model named by the MODEL_NAME environment variable using the
# quantization config defined just above; device placement is left to device_map="auto".
# NOTE(review): trust_remote_code=True permits running code shipped with the
# model repository — confirm it is actually required for this checkpoint.
model=AutoModelForCausalLM.from_pretrained(
    os.getenv("MODEL_NAME"),
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    device_map="auto",
)

def print_trainable_parameters(model):
    """Print how many of the model's parameters are trainable.

    Walks ``model.named_parameters()``, counts all elements and those whose
    ``requires_grad`` is set, and prints both totals with the trainable percentage.
    """
    sizes = [(param.numel(), param.requires_grad) for _, param in model.named_parameters()]
    all_params = sum(count for count, _ in sizes)
    trainable_params = sum(count for count, needs_grad in sizes if needs_grad)
    print(f"trainable params: {trainable_params} || all params: {all_params} || trainable%: {100 * trainable_params/all_params:.2f}")

# Tokens were added to the tokenizer, so resize the embeddings to its new length.
model.resize_token_embeddings(len(tokenizer))
# Keep the model config's EOS id in sync with the tokenizer's EOS token.
model.config.eos_token_id=tokenizer.eos_token_id
model.gradient_checkpointing_enable()

# Report parameter counts before the model is prepared for LoRA training.
print_trainable_parameters(model)

## Freeze Weights and add LoRA

In [None]:
from peft import prepare_model_for_kbit_training

# Prepare the quantized model for k-bit training, with gradient checkpointing enabled.
prepared_model=prepare_model_for_kbit_training(
    model, use_gradient_checkpointing=True
)

# Report parameter counts after preparation and print the module tree for inspection.
print_trainable_parameters(prepared_model)
print(prepared_model)

In [None]:
from peft import LoraConfig, TaskType, get_peft_model

# LoRA adapter settings: rank-64 adapters on every attention and MLP projection.
lora_config = LoraConfig(
    r=64,
    lora_alpha=16,
    target_modules=['q_proj', 'k_proj', 'down_proj', 'v_proj', 'gate_proj', 'o_proj', 'up_proj'],
    lora_dropout=0.1,
    bias="none",
    # We added new tokens to the tokenizer, so the embedding and output layers
    # must be trained and saved in full alongside the adapters; this is necessary.
    modules_to_save=["lm_head", "embed_tokens"],
    task_type=TaskType.CAUSAL_LM,
)

# Wrap the prepared base model with the LoRA adapters.
lora_model = get_peft_model(prepared_model, lora_config)
# Turn off the generation cache for training.
lora_model.config.use_cache = False
print_trainable_parameters(lora_model)

# Training

In [None]:
from transformers import TrainingArguments, Trainer

# Batch geometry: each optimizer step covers bs * ga_steps sequences per device.
bs = 8
ga_steps = 4
epochs = 5

# Optimizer steps in one pass over the training split. Clamped to at least 1 so
# eval_steps/save_steps stay valid when the split is smaller than one effective batch.
steps_per_epoch = max(1, len(dataset_tokenized["train"]) // (bs * ga_steps))

args = TrainingArguments(
    output_dir=os.getenv("WANDB_NAME"),
    per_device_train_batch_size=bs,
    per_device_eval_batch_size=bs,
    evaluation_strategy="steps",
    logging_steps=1,
    # evaluate and checkpoint once per epoch
    eval_steps=steps_per_epoch,
    save_steps=steps_per_epoch,
    # increases effective batch size without consuming additional VRAM but makes training slower.
    # the effective batch size is batch_size * gradient_accumulation_steps
    gradient_accumulation_steps=ga_steps,
    num_train_epochs=epochs,
    lr_scheduler_type="constant",
    optim="paged_adamw_32bit",
    # using default lr suggested by QLoRA: 0.0002 for 7B/13B models. For more parameters, lower lr are suggested,
    # for example 0.0001 for models with 33B and 65B parameters.
    learning_rate=0.0002,
    group_by_length=True,
    fp16=True,
    ddp_find_unused_parameters=False,  # needed for training with accelerate
)

# The custom collate function pads each batch and builds labels and attention masks.
trainer = Trainer(
    model=lora_model,
    tokenizer=tokenizer,
    data_collator=collate,
    train_dataset=dataset_tokenized["train"],
    eval_dataset=dataset_tokenized["test"],
    args=args,
)

trainer.train()