# **LoreWeaver Project**

![LoreWeaver](https://private-user-images.githubusercontent.com/81964452/284000208-5644db74-67d0-49f1-8362-3661c99361de.png?jwt=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJnaXRodWIuY29tIiwiYXVkIjoicmF3LmdpdGh1YnVzZXJjb250ZW50LmNvbSIsImtleSI6ImtleTUiLCJleHAiOjE3MDQ0ODcxNjAsIm5iZiI6MTcwNDQ4Njg2MCwicGF0aCI6Ii84MTk2NDQ1Mi8yODQwMDAyMDgtNTY0NGRiNzQtNjdkMC00OWYxLTgzNjItMzY2MWM5OTM2MWRlLnBuZz9YLUFtei1BbGdvcml0aG09QVdTNC1ITUFDLVNIQTI1NiZYLUFtei1DcmVkZW50aWFsPUFLSUFWQ09EWUxTQTUzUFFLNFpBJTJGMjAyNDAxMDUlMkZ1cy1lYXN0LTElMkZzMyUyRmF3czRfcmVxdWVzdCZYLUFtei1EYXRlPTIwMjQwMTA1VDIwMzQyMFomWC1BbXotRXhwaXJlcz0zMDAmWC1BbXotU2lnbmF0dXJlPWUxMTBjNDgxNDk2ZThhNzAyNzM3OTY1YjVlMGZjN2ZkMzZlZjVjNWNlZGY3OGVhODVmZjMzZTExYTE1YmYyZmQmWC1BbXotU2lnbmVkSGVhZGVycz1ob3N0JmFjdG9yX2lkPTAma2V5X2lkPTAmcmVwb19pZD0wIn0.-eOSvWJFhTre9jtdwldA_w2iguNqr1KEaWJhwdlTqBA)

## **How to fine-tune Mistral**

In this section, we will fine-tune a 7B-parameter Mistral model on an A100 GPU with high RAM using a Jupyter Notebook. Note that an A100 has 80 GB of VRAM, which is enough to store Mistral 7B’s weights (7B × 2 bytes = 14 GB in FP16). However, we also need to account for the overhead due to optimizer states, gradients, and forward activations (see this excellent article for more information). This overhead makes full fine-tuning impractical here: we need parameter-efficient fine-tuning (PEFT) techniques like QLoRA.

To drastically reduce the VRAM usage, we must fine-tune the model in 4-bit precision, which is why we’ll use QLoRA here. The good thing is that we can leverage the Hugging Face ecosystem with the transformers, accelerate, peft, trl, and bitsandbytes libraries. First, we install and load these libraries.

In [None]:
!pip install -U trl 
!pip install -U accelerate 
!pip install -U peft 
!pip install -U bitsandbytes 
!pip install -U transformers 
!pip install -U trl 
!pip install -U scipy
!pip install -U datasets==2.16.0

In [None]:
import os,torch, logging
from tqdm.notebook import tqdm
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model
from trl import SFTTrainer

In [None]:
# Fetch the secret stored under the "HuggingFace" label in Kaggle's secret
# store instead of hard-coding it; it is used as the login token in the next cell.
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
secret_hf = user_secrets.get_secret("HuggingFace")

In [None]:
# Log the Hugging Face CLI in with the token held in `secret_hf`.
!huggingface-cli login --token $secret_hf

> **First, we want to load a Mistral-7B model and train it on the Atlas Storyteller dataset (5k rows), which will produce our fine-tuned model Mistral-7B-LoreWeaver.**

QLoRA will use a rank of 16 with a scaling parameter (alpha) of 32 (see this article for more information about LoRA parameters). We’ll load the Mistral model directly in 4-bit precision using the NF4 type and train it for one epoch. To get more information about the other parameters, check the TrainingArguments, PeftModel, and SFTTrainer documentation.

In [None]:
# Hugging Face Hub identifiers and output name used by the rest of the notebook.
model_name = "mistralai/Mistral-7B-v0.1"         # base checkpoint to fine-tune
dataset_name = "AtlasUnified/atlas-storyteller"  # training dataset on the Hub
new_model = "Mistral-7B-LoreWeaver"              # name the trained adapter is saved under

### QLoRA parameters

In [None]:
# ------------------------------------------------------------------------------
# QLoRA (LoRA adapter) hyper-parameters
# ------------------------------------------------------------------------------
lora_r = 16          # LoRA attention dimension (rank)
lora_alpha = 32      # alpha used for LoRA scaling
lora_dropout = 0.05  # dropout probability applied in the LoRA layers

### bitsandbytes parameters

In [None]:
# ------------------------------------------------------------------------------
# bitsandbytes (quantization) settings
# ------------------------------------------------------------------------------
use_4bit = True                          # load the base model in 4-bit precision
bnb_4bit_compute_dtype = torch.bfloat16  # compute dtype for the 4-bit base model
bnb_4bit_quant_type = "nf4"              # quantization type: "fp4" or "nf4"
use_nested_quant = False                 # nested (double) quantization on/off

---
## Global Configs

### TrainingArguments parameters

In [None]:
################################################################################
# TrainingArguments parameters
################################################################################

# Output directory where the model predictions and checkpoints will be stored
output_dir = "./results"

# Number of training epochs
num_train_epochs = 1

# Mixed-precision mode. This notebook targets an A100 and loads/computes the
# model in bfloat16, so bf16 is enabled; on a GPU without bfloat16 support,
# swap the two flags (fp16 = True, bf16 = False).
fp16 = False
bf16 = True

# Batch size per GPU for training
per_device_train_batch_size = 4

# Batch size per GPU for evaluation
per_device_eval_batch_size = 4

# Number of update steps to accumulate the gradients for
gradient_accumulation_steps = 1

# Enable gradient checkpointing
gradient_checkpointing = True

# Maximum gradient norm (gradient clipping)
max_grad_norm = 0.3

# Initial learning rate (AdamW optimizer)
learning_rate = 2e-4

# Weight decay to apply to all layers except bias/LayerNorm weights
weight_decay = 0.001

# Optimizer to use
optim = "paged_adamw_8bit"

# Learning rate schedule
lr_scheduler_type = "cosine"

# Number of training steps (overrides num_train_epochs)
max_steps = -1

# Ratio of steps for a linear warmup (from 0 to learning rate)
warmup_ratio = 0.03

# Group sequences into batches with same length
# Saves memory and speeds up training considerably
group_by_length = True

# Save checkpoint every X updates steps
save_steps = 25

# Log every X updates steps
logging_steps = 25

### SFT parameters

In [None]:
# ------------------------------------------------------------------------------
# Supervised fine-tuning (SFT) settings
# ------------------------------------------------------------------------------
max_seq_length = 1024  # maximum sequence length to use
packing = False        # pack multiple short examples into one input sequence
device_map = "auto"    # handed to from_pretrained as `device_map` when loading the model

# **We can now start a fine-tuning process**

1. Load the pre-processed dataset.
2. Configure bitsandbytes for 4-bit quantization
3. Load mistral model in 4-bit precision on a GPU with tokenizer
4. Load configurations for QLoRA, regular training params using SFTTrainer

In [None]:
# Step 1: fetch the "train" split of the Atlas Storyteller dataset,
# reading the repository's *.jsonl files
dataset = load_dataset(
    dataset_name,
    data_files="*.jsonl",
    split="train",
)

In [None]:
# Step 2: configure bitsandbytes for 4-bit quantization

# Build the quantization config entirely from the bitsandbytes parameters
# declared earlier in the notebook, so each setting has a single source of truth
# (the compute dtype used to be hard-coded here instead of reading
# `bnb_4bit_compute_dtype`).
bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=bnb_4bit_compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

In [None]:
# Step 3: Load Mistral model in 4-bit precision on a GPU with tokenizer

# Load Mistral base model.
# The 4-bit settings come from `bnb_config` alone: passing `load_in_4bit=True`
# next to `quantization_config` is redundant, and newer transformers releases
# reject that combination.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    # Replaces the deprecated `use_flash_attention_2=True` flag; like that flag,
    # it needs the flash-attn package and a GPU it supports.
    attn_implementation="flash_attention_2",
    torch_dtype=torch.bfloat16,
    device_map=device_map,
    trust_remote_code=True,
)

# Turn off the generation cache on the config before training
model.config.use_cache = False
model.config.pretraining_tp = 1

# Load Mistral tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.padding_side = "right" # Fix weird overflow issue with fp16 training
# Reuse the EOS token for padding
tokenizer.pad_token = tokenizer.eos_token
tokenizer.add_eos_token = True
# Last expression of the cell: shows both flags in the notebook output
tokenizer.add_bos_token, tokenizer.add_eos_token

In [None]:
# Step 4: LoRA adapter configuration for QLoRA

# Rank, alpha and dropout come from the QLoRA parameters cell; adapters are
# attached to the attention projections plus `gate_proj`.
peft_config = LoraConfig(
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj"],
    bias="none",
    task_type="CAUSAL_LM",
)

### **Start training the model with given dataset**

In [None]:
# Uncomment when NEEDED
#os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:14000"
#torch.cuda.empty_cache()

# Set training parameters.
# Every value comes from the "TrainingArguments parameters" cell; the evaluation
# batch size and gradient checkpointing are forwarded too, since they were
# declared there but previously never reached the trainer.
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    gradient_checkpointing=gradient_checkpointing,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
)

# Set supervised fine-tuning parameters.
# The text to train on is read from the dataset's "Story" column; `packing`
# follows the SFT parameters cell instead of a hard-coded literal.
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    dataset_text_field="Story",
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=packing,
)

# Train model
trainer.train()

# Save trained model
trainer.model.save_pretrained(new_model)

> The training can be very long, depending on the size of your dataset. Here, it took less than an hour and a half on an A100 GPU.

The fine-tuned model's output is very coherent for a model with only 7 billion parameters. You can play with it and ask harder questions from evaluation datasets like BigBench-Hard.

How can we store our new Mistral-7B-LoreWeaver model now? We need to merge the weights from LoRA with the base model. Unfortunately, as far as I know, there is no straightforward way to do it: we need to reload the base model in FP16 precision and use the peft library to merge everything. Alas, it also creates a problem with the VRAM (despite emptying it), so I recommend restarting the notebook, re-executing the first three cells, and then executing the next one.

In [None]:
# Reload model in FP16 and merge it with LoRA weights

# Imported here so the cell runs right after a notebook restart: `datetime` is
# not imported anywhere else, and the stdlib logging module is aliased because
# the notebook's import cell rebinds the name `logging` to `transformers.logging`,
# which does not provide `basicConfig` / `getLogger`.
import datetime
import logging as std_logging

base_model_name_or_path = "mistralai/Mistral-7B-v0.1"
adapter_peft_model_path = "Reverb/Mistral-7B-LoreWeaver"
output_dir = "/kaggle/working/"
push_to_hub = True

# Place the whole model on the first GPU when CUDA is available, otherwise on CPU
device = "cuda:0" if torch.cuda.is_available() else "cpu"
device_arg = {'device_map': {"": device}}

std_logging.basicConfig(level=std_logging.INFO)
logger = std_logging.getLogger(__name__)

try:
    # Each step is wrapped in a one-tick progress bar so the notebook shows
    # which stage is currently running.
    logger.info("Loading base model: %s", base_model_name_or_path)
    with tqdm(total=1, desc="Loading base model") as pbar:
        base_model = AutoModelForCausalLM.from_pretrained(
            base_model_name_or_path,
            return_dict=True,
            torch_dtype=torch.float16,
            **device_arg
        )
        pbar.update(1)

    offload_dir = "./peft_offloads"
    os.makedirs(offload_dir, exist_ok=True)

    logger.info("Loading Peft: %s", adapter_peft_model_path)
    with tqdm(total=1, desc="Loading Peft model") as pbar:
        model = PeftModel.from_pretrained(base_model, adapter_peft_model_path, offload_dir=offload_dir)
        pbar.update(1)

    logger.info("Running merge_and_unload")
    with tqdm(total=1, desc="Merge and Unload") as pbar:
        model = model.merge_and_unload()
        pbar.update(1)

    tokenizer = AutoTokenizer.from_pretrained(base_model_name_or_path)

    # The timestamped path is handed to `save_pretrained` for both the merged
    # model and the tokenizer. The former `.replace('models', 'tokenizers')`
    # never matched this path, so the tokenizer already went to the same place.
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S")
    output_filename = f"merged_model_{timestamp}.bin"
    output_filepath = os.path.join(output_dir, output_filename)
    model.save_pretrained(output_filepath)
    tokenizer.save_pretrained(output_filepath)
    logger.info("Model saved to %s", output_filepath)

except Exception:
    # Log the full traceback, then let the failure propagate to the notebook
    logger.exception("An error occurred:")
    raise

Our weights are merged and we reloaded the tokenizer. We can now push everything to the Hugging Face Hub to save our model.