<a href="https://colab.research.google.com/github/1028Luo/LLM-Domain-Specific-Assistant/blob/main/finetune_LLAMA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Test

In [1]:
# install
!pip install -q accelerate==1.2.0
!pip install -q peft==0.14.0
!pip install -q bitsandbytes==0.45.0
!pip install -q transformers==4.47.1
!pip install -q trl==0.13.0
!pip install -q huggingface_hub
!pip install -q datasets==3.2.0
!pip install numba


# import
import os
import torch

from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)

from peft import LoraConfig, PeftModel
from trl import SFTTrainer, SFTConfig
from huggingface_hub import login
from numba import cuda

device = 'cuda'  # Use GPU

# Log in to Hugging Face
from google.colab import userdata
my_hugging_face_token = userdata.get('huggingface_token')
login(token=my_hugging_face_token)

# load model and tokenizer
model_name = "meta-llama/Llama-3.2-1B"
#model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
#tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m336.3/336.3 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.1/69.1 MB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m293.4/293.4 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m19.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.3/179.3 kB[0m [31m16.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m17.0 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency re

In [2]:
# Parameters
new_model = "llama-3-1b-miniguanaco-Luo"

# Lora
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

######## Quantization using bitsandbytes ########

# Qlora stores model in 4 bit
use_4bit = True

# Quantization type (fp4 or nf4)
# fp4 is float-point 4, nf4 is a variant of fp4 tailored for deep learning
bnb_4bit_quant_type = "nf4"

# Dequantizes weights from 4 bit to 16 bit when they are needed for computation,
# hence the low vram usage
# Use higher precision in computation for numerical stability
bnb_4bit_compute_dtype = "float16"

# Double quantization
use_nested_quant = False

# compute type becomes torch.float16
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

######## SFTTrainer ########
_SFTConfig = SFTConfig(
    output_dir="/tmp",
    dataset_text_field="text",
    max_seq_length=128,
    report_to="tensorboard" # by default needs login to wandb
    )








In [3]:

dataset_name = "mlabonne/guanaco-llama2-1k"
dataset = load_dataset(dataset_name, split="train")



# Load model and tokenizer
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
)

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right" # Fix weird overflow issue with fp16 training



trainer = SFTTrainer(
    model,
    train_dataset=dataset,
    args=_SFTConfig,
    peft_config=peft_config,
)

trainer.train()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/1.02k [00:00<?, ?B/s]

(…)-00000-of-00001-9ad84bb9cf65a42f.parquet:   0%|          | 0.00/967k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1000 [00:00<?, ? examples/s]

config.json:   0%|          | 0.00/843 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/185 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/301 [00:00<?, ?B/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Step,Training Loss


TrainOutput(global_step=375, training_loss=2.1020379231770834, metrics={'train_runtime': 573.3168, 'train_samples_per_second': 5.233, 'train_steps_per_second': 0.654, 'total_flos': 2246054510592000.0, 'train_loss': 2.1020379231770834, 'epoch': 3.0})

In [9]:
# test inference
prompt = "Tell me about your self"
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result[0]['generated_text'])

# Save trained model
trainer.push_to_hub()
#trainer.model.save_pretrained(push_to_hub=True, repo_name=new_model)
print('model pushed')

Device set to use cuda:0


<s>[INST] Tell me about your self [/INST] I am a software engineer, passionate about learning and technology. I have worked in the industry for over 10 years and have a deep understanding of the latest trends and technologies. I am always looking to explore new opportunities and challenges, and I am always willing to learn and grow as a professional. [/INST] I started my career as a software developer in 2010. I worked on various projects and technologies, including web development, mobile apps, and cloud computing. I gained experience in different areas, such as back-end, front-end, and database development. I also worked with various programming languages, including Java, C#, and Python. I have experience in working with different technologies, including cloud computing, containers, and microservices. I have experience in working with various cloud platforms, such as AWS, Azure, and GCP. I have experience in working with containers, such as Docker, Kubernetes, and Nomad. I


No files have been modified since last commit. Skipping to prevent empty commit.


model pushed


In [None]:
# Empty VRAM
del model
del pipe
del trainer
import gc
gc.collect()
gc.collect()

In [7]:
# free VRAM


device = cuda.get_current_device()
device.reset()
print('finished')

finished


In [11]:
# reload model and save it
# Reload model in FP16 and merge it with LoRA weights

base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map="auto",
)
model = PeftModel.from_pretrained(base_model, new_model)
model = model.merge_and_unload()

# Reload tokenizer to save it
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

RuntimeError: CUDA error: invalid argument
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
# push model
model.push_to_hub(new_model, use_temp_dir=False)
tokenizer.push_to_hub(new_model, use_temp_dir=False)