<a href="https://www.kaggle.com/code/volt3000/fine-tune-llama-3-instruct-8b-on-codesearchnet-alp?scriptVersionId=190440149" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Fine-tune Llama-3.1-8B (4-bit) with Unsloth on CodeSearchNet

> Note: This notebook runs best when accelerated with NVIDIA T4 GPU(s) or GPU(s) of a similar architecture.

## Download, Install and Import Dependencies

In [10]:
%%time
# Force-reinstall aiohttp through mamba before the pip installs below.
!mamba install --force-reinstall aiohttp -y
# xformers is capped below 0.0.26 and pulled from the PyTorch CUDA 12.1 wheel index.
!pip install -U "xformers<0.0.26" --index-url https://download.pytorch.org/whl/cu121
# Unsloth is installed straight from its GitHub repository with the "kaggle-new" extra.
# NOTE(review): no tag or commit is pinned here, so a re-run may install a different Unsloth version.
!pip install "unsloth[kaggle-new] @ git+https://github.com/unslothai/unsloth.git"

# Temporary fix for https://github.com/huggingface/datasets/issues/6753
!pip install datasets==2.16.0 fsspec==2023.10.0 gcsfs==2023.10.0

import os
# Set before any trainer is created; presumably turns off Weights & Biases logging — TODO confirm.
os.environ["WANDB_DISABLED"] = "true"


Looking for: ['aiohttp']

nvidia/linux-64                                             Using cache
nvidia/noarch                                               Using cache
conda-forge/linux-64                                        Using cache
conda-forge/noarch                                          Using cache
[?25l[2K[0G[+] 0.0s
[2K[1A[2K[0Gpkgs/main/linux-64                                            No change
pkgs/r/linux-64                                               No change
pkgs/main/noarch                                              No change
[+] 0.1s
rapidsai/linux-64 [33m━━━━━━━━━━━━╸[0m[90m━━━━━━━━━━━━━[0m   0.0 B /  ??.?MB @  ??.?MB/s  0.1s
rapidsai/noarch   [33m━━━━━━━╸[0m[90m━━━━━━━━━━━━━━━━━━[0m   0.0 B /  ??.?MB @  ??.?MB/s  0.1s
pkgs/r/noarch     [33m━━━━━━━━━━━━━━╸[0m[90m━━━━━━━━━━━[0m   0.0 B /  ??.?MB @  ??.?MB/s  0.0s[2K[1A[2K[1A[2K[1A[2K[0Gpkgs/r/noarch                                                 No change
rapidsai/linux-64    

In [11]:
from unsloth import FastLanguageModel
from trl import SFTTrainer
from transformers import TrainingArguments
import pprint as pp
from datasets import load_dataset
import torch

## Setup Model and Tokenizer from Unsloth

In [21]:
# Loader settings; max_seq_length is reused later when the trainer is built.
max_seq_length = 1024
# None is passed through to Unsloth unchanged; presumably it lets the library pick the dtype — TODO confirm.
dtype = None
# Request the 4-bit quantized weights (the checkpoint name below is a bnb-4bit build).
load_in_4bit = True

# Returns the model together with its matching tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Meta-Llama-3.1-8B-bnb-4bit",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

==((====))==  Unsloth 2024.8: Fast Llama patching. Transformers = 4.43.3.
   \\   /|    GPU: Tesla T4. Max memory: 14.741 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.2.2+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.25.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [22]:
# Wrap the loaded model with LoRA adapters on the listed projection modules.
# `model` is rebound to the adapter-wrapped model returned by Unsloth.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    # Gradient checkpointing is switched on here; the original note says the
    # library default is "unsloth" and that True is meant for out-of-memory cases.
    use_gradient_checkpointing=True,
    random_state=8402,
    use_rslora=False,
    loftq_config=None,
)

## Format CodeSearchNet into the Alpaca Prompt Format

In [23]:
# alpacaFormatString = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

# ### Instruction:
# {}

# ### Input:
# {}

# ### Response:
# {}"""

# EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN (<|eot_id|>)

# # Define the formatting function to initially drop all the unnecessary columns and rename what we need
# def formatFunctionSample(sample):
#     language = sample['language']
#     instruction = f"Briefly explain what this {language} function does, in the format of a docstring?"
#     inputText = sample['func_code_string']
#     outputText = sample['func_documentation_string']

#     # Returning a dictionary of the necessary columns
#     return {
#         "instruction": instruction,
#         "input": inputText,
#         "output": outputText
#     }

# # Define the function to create the new 'text' column
# def createAlpacaFormatString(sample):
#     instruction = sample['instruction']
#     inputText = sample['input']
#     outputText = sample['output']
    
#     text = alpacaFormatString.format(instruction, inputText, outputText) + EOS_TOKEN
#     sample['text'] = text
    
#     return sample

In [26]:
# Alpaca-style prompt template with three positional slots, filled in order with the
# instruction, the input and the response. It lives at notebook scope rather than inside
# the function because the inference cell also calls `alpacaFormatString.format(...)`
# to build its prompt and needs the very same template.
alpacaFormatString = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""


def formatAndCreateAlpacaString(sample):
    """Convert one CodeSearchNet row into an Alpaca-style training record.

    Args:
        sample: A mapping that provides the keys 'language', 'func_code_string'
            and 'func_documentation_string'.

    Returns:
        A dict with the keys 'instruction', 'input', 'output' and 'text', where
        'text' is `alpacaFormatString` filled with the other three values and
        followed by the tokenizer's EOS token.

    NOTE(review): in the example printed further down in this notebook the
    'input' code still contains the very docstring that is used as 'output',
    so the expected answer is visible inside the prompt. Consider removing the
    docstring from the code before training — TODO confirm whether this is
    intended.
    """
    # Read from the notebook-level `tokenizer` at call time. The concrete EOS string
    # depends on the loaded tokenizer (the sample printed in this notebook ends with
    # <|end_of_text|>). It is appended so every training example is terminated.
    EOS_TOKEN = tokenizer.eos_token

    # Extract necessary columns and format them
    language = sample['language']
    instruction = f"Briefly explain what this {language} function does, in the format of a docstring?"
    inputText = sample['func_code_string']
    outputText = sample['func_documentation_string']

    # Create the text column in the Alpaca format
    text = alpacaFormatString.format(instruction, inputText, outputText) + EOS_TOKEN

    # Returning a dictionary of the necessary columns including the new 'text' column
    return {
        "instruction": instruction,
        "input": inputText,
        "output": outputText,
        "text": text
    }

In [27]:
# Load the first 10,000 rows of the Python configuration's train split.
rawDataset = load_dataset("claudios/code_search_net", "python", split="train[:10000]")

# Run the formatter over every row and drop all of the original columns, so that
# only the keys returned by formatAndCreateAlpacaString remain in `dataset`.
dataset = rawDataset.map(
    formatAndCreateAlpacaString,
    remove_columns=rawDataset.column_names,
)

# Adding the text column to the new dataset
# dataset = dataset.map(createAlpacaFormatString)

Map:   0%|          | 0/412178 [00:00<?, ? examples/s]

In [28]:
# Display the formatted dataset's summary as the cell output.
dataset

Dataset({
    features: ['instruction', 'input', 'output', 'text'],
    num_rows: 412178
})

In [29]:
# Pretty-print the first formatted record to check all four fields.
pp.pp(dataset[0])

{'instruction': 'Briefly explain what this python function does, in the format '
                'of a docstring?',
 'input': 'def addidsuffix(self, idsuffix, recursive = True):\n'
          '        """Appends a suffix to this element\'s ID, and optionally '
          'to all child IDs as well. There is sually no need to call this '
          'directly, invoked implicitly by :meth:`copy`"""\n'
          '        if self.id: self.id += idsuffix\n'
          '        if recursive:\n'
          '            for e in self:\n'
          '                try:\n'
          '                    e.addidsuffix(idsuffix, recursive)\n'
          '                except Exception:\n'
          '                    pass',
 'output': "Appends a suffix to this element's ID, and optionally to all child "
           'IDs as well. There is sually no need to call this directly, '
           'invoked implicitly by :meth:`copy`',
 'text': 'Below is an instruction that describes a task, paired with an input

In [30]:
# Print the first record's 'text' field on its own so the full prompt, including the trailing EOS token, is readable.
print(dataset[0]['text'])

Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Briefly explain what this python function does, in the format of a docstring?

### Input:
def addidsuffix(self, idsuffix, recursive = True):
        """Appends a suffix to this element's ID, and optionally to all child IDs as well. There is sually no need to call this directly, invoked implicitly by :meth:`copy`"""
        if self.id: self.id += idsuffix
        if recursive:
            for e in self:
                try:
                    e.addidsuffix(idsuffix, recursive)
                except Exception:
                    pass

### Response:
Appends a suffix to this element's ID, and optionally to all child IDs as well. There is sually no need to call this directly, invoked implicitly by :meth:`copy`<|end_of_text|>


## Train-test Split

In [31]:
# Hold out 30% of the formatted rows for evaluation. The seed is fixed so the shuffle
# behind the split is reproducible across kernel restarts; 8402 matches the
# `random_state` / `seed` values already used elsewhere in this notebook.
datasetDictionary = dataset.train_test_split(test_size=0.3, seed=8402)

In [32]:
# Display the resulting train/test splits as the cell output.
datasetDictionary

DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output', 'text'],
        num_rows: 399812
    })
    test: Dataset({
        features: ['instruction', 'input', 'output', 'text'],
        num_rows: 12366
    })
})

## Initialize Trainer with Training Arguments

In [33]:
# Trainer config with an evaluation loop so that validation loss is reported
# alongside training loss.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = datasetDictionary["train"],
    eval_dataset = datasetDictionary["test"],
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = True, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        per_device_eval_batch_size = 2,
        gradient_accumulation_steps = 4,
        fp16_full_eval = True,
        eval_accumulation_steps = 4,
        # NOTE(review): newer transformers releases appear to rename this argument to
        # `eval_strategy` — confirm against the installed version before changing it.
        evaluation_strategy = "steps",
        # Evaluate every 10 optimizer steps (6 evaluations over the 60-step run).
        # The previous value of 1 ran a full pass over the whole eval split after
        # every single training step, which dominates the runtime of such a short run.
        eval_steps = 10,
        warmup_ratio = 0.1,
        max_steps = 60,
        # num_train_epochs = 1,
        learning_rate = 2e-4,
        # Exactly one of fp16 / bf16 is enabled, depending on what the GPU supports.
        fp16 = not torch.cuda.is_bf16_supported(),
        bf16 = torch.cuda.is_bf16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 8402,
        output_dir = "outputs",
        report_to = "none",
    ),
)



Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


In [34]:
#@title Show current memory stats
# Baseline taken before training: the later stats cell subtracts start_gpu_memory
# from the post-training peak and divides by max_memory.
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 ** 3, 3)  # bytes -> GB
max_memory = round(gpu_stats.total_memory / 1024 ** 3, 3)  # bytes -> GB
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = Tesla T4. Max memory = 14.741 GB.
11.969 GB of memory reserved.


In [35]:
# Display the evaluation dataset as held by the trainer.
trainer.eval_dataset

Dataset({
    features: ['input_ids', 'labels'],
    num_rows: 4318
})

In [36]:
# Display the training dataset as held by the trainer.
trainer.train_dataset

Dataset({
    features: ['input_ids', 'labels'],
    num_rows: 140402
})

## Fine-tune Training Loop

In [37]:
# Run fine-tuning. The return value is kept because the stats cell below reads
# trainerStats.metrics['train_runtime'].
trainerStats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 140,402 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 60
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
#@title Show final memory and time stats
# Peak reserved memory after training, in GB, and the share attributable to training
# (peak minus the baseline recorded in the earlier memory-stats cell).
used_memory = round(torch.cuda.max_memory_reserved() / 1024 ** 3, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)

# Training wall time as reported in the metrics returned by trainer.train().
train_runtime_seconds = trainerStats.metrics['train_runtime']

report_lines = [
    f"{train_runtime_seconds} seconds used for training.",
    f"{round(train_runtime_seconds/60, 2)} minutes used for training.",
    f"Peak reserved memory = {used_memory} GB.",
    f"Peak reserved memory for training = {used_memory_for_lora} GB.",
    f"Peak reserved memory % of max memory = {used_percentage} %.",
    f"Peak reserved memory for training % of max memory = {lora_percentage} %.",
]
for report_line in report_lines:
    print(report_line)

## Run Inference on Fine-tuned Model

In [None]:
from transformers import TextStreamer

# Streamer bound to the notebook's tokenizer; it is passed to model.generate(streamer=...) in the inference cell.
textStreamer = TextStreamer(tokenizer)

In [None]:
# Switch the model to Unsloth's inference mode before generating.
FastLanguageModel.for_inference(model)

# Source text of the function the model is asked to explain (a plain string, never executed).
testFunction = """
def formatting_prompts_func(examples):
    instructions = examples["instruction"]
    inputs       = examples["input"]
    outputs      = examples["output"]
    texts = []
    for instruction, input, output in zip(instructions, inputs, outputs):
        # Must add EOS_TOKEN, otherwise your generation will go on forever!
        text = alpaca_prompt.format(instruction, input, output) + EOS_TOKEN
        texts.append(text)
    return { "text" : texts, }
pass
"""

# Fill the prompt template: the instruction, then the code as input, then an empty
# response slot that the model completes.
# Requires `alpacaFormatString` to be defined at notebook scope when this cell runs.
testInstruction = "Explain this python function's functionality in 100 words."
testPrompt = alpacaFormatString.format(testInstruction, testFunction, "")

# Tokenize as a batch of one and move the tensors to the GPU.
inputs = tokenizer([testPrompt], return_tensors="pt").to("cuda")

# Tokens are printed through the streamer as they are produced; the returned
# tensor itself is not needed, hence the throwaway name.
_ = model.generate(
    input_ids=inputs.input_ids,
    attention_mask=inputs.attention_mask,
    streamer=textStreamer,
    max_new_tokens=128,
    pad_token_id=tokenizer.eos_token_id,
)