<a href="https://www.kaggle.com/code/aisuko/fine-tuning-a-llama2-for-code-generation?scriptVersionId=160680290" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Overview

Let's fine-tune Llama 2 on a dataset of Python code samples, each of which solves a given task.

In [None]:
!pip install transformers==4.36.2
!pip install datasets==2.15.0
!pip install peft==0.7.1
!pip install bitsandbytes==0.41.3
!pip install accelerate==0.25.0
!pip install trl==0.7.7
!pip install tqdm==4.66.1
# flash-attn is not currently supported in the Kaggle environment; the install is kept so the notebook is ready for future use.
!pip install flash-attn==2.4.2

In [None]:
import os
from huggingface_hub import login
from kaggle_secrets import UserSecretsClient

# Credentials come from Kaggle's secret store so no token is hard-coded here.
user_secrets = UserSecretsClient()

# Authenticate against the Hugging Face Hub; the later cells download the
# model and call push_to_hub().
login(token=user_secrets.get_secret("HUGGINGFACE_TOKEN"))

# Weights & Biases settings.
os.environ["WANDB_API_KEY"] = user_secrets.get_secret("WANDB_API_KEY")
os.environ["WANDB_PROJECT"] = "Fine-tune-models-with-QLoRA"
# Fixed typo in the run notes: "casual" -> "causal" language models.
os.environ["WANDB_NOTES"] = "Fine-tuning causal language models with QLoRA"
os.environ["WANDB_NAME"] = "fine-tuning-Llama2-with-pycode-instructions-with-QLoRA"

# Model and dataset identifiers; later cells read them back with os.getenv().
os.environ["MODEL_NAME"] = "meta-llama/Llama-2-7b-hf"
os.environ["DATASET_NAME"] = "iamtarun/python_code_instructions_18k_alpaca"

In [None]:
from datasets import load_dataset

# Dataset identifier, as stored in the DATASET_NAME environment variable.
dataset_name = os.getenv("DATASET_NAME")

# Only the first 100 training rows are loaded; per the original author's note,
# the slice can be made smaller to fit GPUs with less memory.
train_split = "train[:100]"
dataset = load_dataset(dataset_name, split=train_split)

# Show the number of loaded rows as the cell output.
len(dataset)

In [None]:
def format_instruction(sample):
    """Render one dataset row as a single training prompt string.

    The layout matches the prompt built in the inference cell of this
    notebook ("### Instruction:" header, "### Task:", "### Input:",
    "### Response:"), so the model is queried with the same template it was
    trained on. Lines are joined explicitly instead of using an indented
    triple-quoted string, which would embed four leading spaces on every
    prompt line.

    Args:
        sample: Mapping with the keys ``instruction``, ``input`` and
            ``output``.

    Returns:
        The formatted prompt, ending with the expected response text.

    Raises:
        KeyError: If one of the three keys is missing from ``sample``.
    """
    return (
        "### Instruction:\n"
        "Use the Task below and the Input given to write the Response, "
        "which is a programming code that can solve the Task.\n"
        "\n"
        "### Task:\n"
        f"{sample['instruction']}\n"
        "\n"
        "### Input:\n"
        f"{sample['input']}\n"
        "\n"
        "### Response:\n"
        f"{sample['output']}"
    )

# Load the model

In [None]:
from transformers import BitsAndBytesConfig, AutoModelForCausalLM
import torch

# Quantization settings: 4-bit loading, NF4 quant type, double quantization,
# and bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load the checkpoint named by MODEL_NAME using the quantization settings above.
model = AutoModelForCausalLM.from_pretrained(
    os.getenv("MODEL_NAME"),
    torch_dtype=torch.bfloat16,
    device_map="auto",
    quantization_config=bnb_config,
    use_cache=False,
)

# Display the loaded model configuration as the cell output.
model.config

In [None]:
# Override the config's `pretraining_tp` value with 1.
model.config.pretraining_tp = 1

# Display the model's current memory footprint as the cell output.
model.get_memory_footprint()

In [None]:
from peft import PeftModel, get_peft_model, prepare_model_for_kbit_training

# Turn on gradient checkpointing on the base model to save memory.
model.gradient_checkpointing_enable()

# Display the memory footprint again as the cell output.
model.get_memory_footprint()

In [None]:
# Prepare the quantized model for k-bit training with gradient checkpointing
# enabled, and rebind `model` to the result. Per the original author's note,
# this freezes the base model layers and casts the layer norms to fp32.
model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)

# Render the prepared model as the cell output.
model

In [None]:
from peft import LoraConfig, TaskType

# LoRA adapter hyperparameters for a causal language modelling task.
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=64,              # LoRA attention dimension
    lora_alpha=16,     # alpha parameter for LoRA scaling
    lora_dropout=0.1,  # dropout probability for LoRA layers
    bias="none",
)

# Wrap the prepared base model with the LoRA adapters.
peft_model = get_peft_model(model, peft_config)

# Display the wrapped model's memory footprint as the cell output.
peft_model.get_memory_footprint()

In [None]:
from transformers import AutoTokenizer

# Load the fast tokenizer that belongs to the checkpoint named by MODEL_NAME.
tokenizer = AutoTokenizer.from_pretrained(
    os.getenv("MODEL_NAME"),
    trust_remote_code=False,
    use_fast=True,
)

# Reuse the EOS token for padding.
tokenizer.pad_token = tokenizer.eos_token
# The attribute that controls the padding direction is `padding_side`;
# assigning to `tokenizer.padding` only creates an unused attribute.
tokenizer.padding_side = "right"

# Training

In [None]:
from transformers import TrainingArguments
from trl import SFTTrainer

training_args = TrainingArguments(
    # --- Run identity and reporting ---
    output_dir=os.getenv("WANDB_NAME"),
    run_name=os.getenv("WANDB_NAME"),
    report_to="wandb",
    seed=42,
    disable_tqdm=False,
    # Log every 25 update steps; no intermediate checkpoints are saved.
    logging_steps=25,
    save_strategy="no",
    # --- Schedule ---
    # One pass over the data. max_steps (which would override the epoch
    # count) is intentionally left unset.
    num_train_epochs=1,
    per_device_train_batch_size=16,
    # Number of update steps to accumulate gradients for.
    gradient_accumulation_steps=1,
    # Group sequences of similar length into the same batch.
    group_by_length=True,
    # --- Optimizer ---
    optim="paged_adamw_8bit",
    # Initial learning rate.
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    # Fraction of steps used for warmup (from 0 up to the learning rate).
    warmup_ratio=0.03,
    # Weight decay applied to all layers except bias/LayerNorm weights.
    weight_decay=0.001,
    # Gradient clipping threshold (maximum gradient norm).
    max_grad_norm=0.3,
    # --- Memory and precision ---
    gradient_checkpointing=True,
    fp16=True,
    bf16=False,
)

sft_trainer = SFTTrainer(
    model=peft_model,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=dataset,
    # Each row is turned into one prompt string by format_instruction.
    formatting_func=format_instruction,
    # Pack multiple short examples into the same input sequence.
    packing=True,
    # Maximum sequence length to use.
    max_seq_length=2048,
)

sft_trainer.train()

In [None]:
# Name shared by the W&B run, the trainer's output_dir and the Hub uploads.
repo_name = os.getenv("WANDB_NAME")

# Upload the trained adapter via the trainer.
# NOTE(review): Trainer.push_to_hub's first positional parameter appears to be
# the commit message, so `repo_name` is presumably used as the commit message
# here rather than as the repository id — confirm.
sft_trainer.push_to_hub(repo_name)

# Upload the tokenizer to the repository with the same name.
# (Fixed typo: `os.getnev` -> `os.getenv`; the original raised AttributeError.)
tokenizer.push_to_hub(repo_name)

# Inference

In [None]:
import gc

# Drop the references created during training so their memory can be reclaimed.
# The trainer object is bound to `sft_trainer`; the original `trainer` name was
# never defined and raised NameError.
del peft_model, model, sft_trainer

# Collect the now-unreferenced objects, then release cached CUDA memory.
gc.collect()
torch.cuda.empty_cache()

In [None]:
from peft import PeftConfig, PeftModel

# Hub repository that holds the fine-tuned adapter.
adapter_repo = "aisuko/" + os.getenv("WANDB_NAME")

# The adapter config records which base checkpoint it was trained on.
peft_config = PeftConfig.from_pretrained(adapter_repo)

# Reload the base model the same way as for training (4-bit `bnb_config`,
# automatic device placement). The inference cell moves its inputs to CUDA,
# so the model must not be left on the CPU.
# Fixed: `peft_config.base_` is not an attribute; the base checkpoint id is
# stored in `base_model_name_or_path`.
base_model = AutoModelForCausalLM.from_pretrained(
    peft_config.base_model_name_or_path,
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)

# Attach the fine-tuned LoRA weights to the base model.
peft_model = PeftModel.from_pretrained(base_model, adapter_repo)

In [None]:
# Task description and optional input for the prompt.
# (Fixed: the variable was spelled `instrunction` while the prompt used
# `{instruction}`, which raised NameError.)
instruction = "Optimize a code snippet written in Python. The code snippet should create a list of numbers from 0 to 10 that are divisible by 2."
inputs = ""

# Fixed: the original interpolated `{input}`, which is Python's builtin
# function, instead of the `inputs` variable defined above.
prompt = f"""### Instruction:
Use the Task below and the Input given to write the Response, which is a programming code that can solve the Task.

### Task:
{instruction}

### Input:
{inputs}

### Response:
"""

# Tokenize the prompt and move the ids to the GPU.
input_ids = tokenizer(prompt, return_tensors="pt", truncation=True).input_ids.cuda()

# Generate with the reloaded adapter model; `model` was deleted in the
# cleanup cell, so `peft_model` is the object to call.
outputs = peft_model.generate(
    input_ids=input_ids,
    max_new_tokens=100,
    do_sample=True,
    top_p=0.9,
    temperature=0.5,
)

# Decode the generated ids back to text as the cell output.
tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)

# Credit

* https://pub.towardsai.net/fine-tuning-a-llama-2-7b-model-for-python-code-generation-865453afdf73
* https://github.com/edumunozsala/llama-2-7B-4bit-python-coder/blob/main/Llama-2-finetune-qlora-python-coder.ipynb