In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import get_peft_model, LoraConfig,TaskType
from peft.utils import prepare_model_for_kbit_training
import torch
from datasets import Dataset
import pandas as pd

In [2]:
df = pd.read_json('../datasets/datasets/cerbo.json')
ds = Dataset.from_pandas(df)

In [3]:
ds

Dataset({
    features: ['instruction', 'input', 'output'],
    num_rows: 91
})

## Processing the training dataset

In [4]:
model_path = "meta-llama/Meta-Llama-3.1-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
tokenizer.pad_token = tokenizer.eos_token


In [5]:
def apply_chat_template(
    example,
    tokenizer,
):
    messages = [
        {"role": "user", "content": example["instruction"]},
        {"role": "assistant", "content": example["output"]},
    ]
    example["text"] = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=False
    )
    return example

In [6]:
processed_train_dataset = ds.map(
        apply_chat_template,
        fn_kwargs={"tokenizer": tokenizer},
        num_proc=10,
        remove_columns=ds.column_names,
        desc="Applying chat template to train_sft",
    )

Applying chat template to train_sft (num_proc=10):   0%|          | 0/91 [00:00<?, ? examples/s]

In [7]:
processed_train_dataset[0]

{'text': '<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\nhi<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nHello! How can I assist you today?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n'}

## Create model

In [8]:
quantization_config = BitsAndBytesConfig(load_in_8bit=True)

In [9]:
model = model = AutoModelForCausalLM.from_pretrained(
        model_path,
        quantization_config=quantization_config,
        device_map="auto",
        trust_remote_code=True,
        torch_dtype=torch.bfloat16,
    )

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [10]:
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear8bitLt(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear8bitLt(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear8bitLt(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear8bitLt(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear8bitLt(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNor

In [11]:
model.enable_input_require_grads()

## Lora

In [12]:
config = LoraConfig(
    task_type=TaskType.CAUSAL_LM, 
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    inference_mode=False,
    r=8, 
    lora_alpha=16, 
    lora_dropout=0.055,
    bias="none",
)
config

LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, task_type=<TaskType.CAUSAL_LM: 'CAUSAL_LM'>, inference_mode=False, r=8, target_modules={'o_proj', 'down_proj', 'v_proj', 'k_proj', 'q_proj', 'up_proj', 'gate_proj'}, lora_alpha=16, lora_dropout=0.055, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', loftq_config={}, use_dora=False, layer_replication=None)

In [13]:
model = get_peft_model(model, config)
model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 4096)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaSdpaAttention(
              (q_proj): lora.Linear8bitLt(
                (base_layer): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.055, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): lora.Linear8bitLt(
                (base_layer): L

In [14]:
model.print_trainable_parameters()

trainable params: 20,971,520 || all params: 8,051,232,768 || trainable%: 0.2605


## Configure training parameters

In [16]:
from trl import SFTConfig, SFTTrainer

In [17]:
args=SFTConfig(
    output_dir="./output/llama-3.1",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    logging_steps=20,
    log_level="info",
    num_train_epochs=50,
    save_steps=100,
    learning_rate=1e-4,
    save_total_limit=2,
    gradient_checkpointing=True,
    dataset_text_field="text",
    max_seq_length=2048,
)


In [18]:
from trl import SFTTrainer
trainer = SFTTrainer(
            model=model,
            train_dataset=processed_train_dataset,
            tokenizer=tokenizer,
            args=args
        )

Map:   0%|          | 0/91 [00:00<?, ? examples/s]

In [19]:
trainer.train()

***** Running training *****
  Num examples = 91
  Num Epochs = 50
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 4
  Total optimization steps = 250
  Number of trainable parameters = 20,971,520
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss
20,1.6232
40,0.593
60,0.2623
80,0.1424
100,0.1171
120,0.1058
140,0.1054
160,0.1005
180,0.0995
200,0.0982


Saving model checkpoint to ./output/llama-3.1/checkpoint-100
tokenizer config file saved in ./output/llama-3.1/checkpoint-100/tokenizer_config.json
Special tokens file saved in ./output/llama-3.1/checkpoint-100/special_tokens_map.json
Saving model checkpoint to ./output/llama-3.1/checkpoint-200
tokenizer config file saved in ./output/llama-3.1/checkpoint-200/tokenizer_config.json
Special tokens file saved in ./output/llama-3.1/checkpoint-200/special_tokens_map.json
Saving model checkpoint to ./output/llama-3.1/checkpoint-250
tokenizer config file saved in ./output/llama-3.1/checkpoint-250/tokenizer_config.json
Special tokens file saved in ./output/llama-3.1/checkpoint-250/special_tokens_map.json
Deleting older checkpoint [output/llama-3.1/checkpoint-100] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=250, training_loss=0.2789827733039856, metrics={'train_runtime': 602.6096, 'train_samples_per_second': 7.55, 'train_steps_per_second': 0.415, 'total_flos': 1.1481838369947648e+16, 'train_loss': 0.2789827733039856, 'epoch': 43.47826086956522})

## Save LoRA and tokenizer results

In [20]:
lora_path='./llama3.1_lora'
trainer.model.save_pretrained(lora_path)
tokenizer.save_pretrained(lora_path)

tokenizer config file saved in ./llama3.1_lora/tokenizer_config.json
Special tokens file saved in ./llama3.1_lora/special_tokens_map.json


('./llama3.1_lora/tokenizer_config.json',
 './llama3.1_lora/special_tokens_map.json',
 './llama3.1_lora/tokenizer.json')

## Load lora weight

In [1]:
import torch 
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline 
import torch
from peft import (
    PeftModel,
    LoraConfig,
    get_peft_model,
    get_peft_model_state_dict,
    prepare_model_for_kbit_training,
    set_peft_model_state_dict,
)
model_path = "meta-llama/Meta-Llama-3.1-8B-Instruct"
torch.random.manual_seed(0) 
model = AutoModelForCausalLM.from_pretrained( 
    model_path,  
    device_map="cuda",  
    torch_dtype="auto",  
    trust_remote_code=True,  
)

tokenizer = AutoTokenizer.from_pretrained(model_path)


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [2]:
prompt = "who are you?"
messages = [ 
    {"role": "user", "content":prompt}, 
] 

pipe = pipeline( 
    "text-generation", 
    model=model, 
    tokenizer=tokenizer, 
) 

generation_args = { 
    "max_new_tokens": 500, 
    "return_full_text": False, 
    "temperature": 0.0, 
    "do_sample": False, 
} 

output = pipe(messages, **generation_args) 
print(output[0]['generated_text'])



I am an AI assistant, designed to provide helpful and accurate information to the best of my ability. I am not a human, but a computer program created to assist and communicate with users in a helpful and respectful manner. My purpose is to provide information, answer questions, and assist with tasks to the best of my ability, while always following ethical and moral guidelines. I am constantly learning and improving, so please bear with me if I make any mistakes or do not fully understand your question.


In [4]:
lora_config_path = './llama3.1_lora'
config = LoraConfig.from_pretrained(lora_config_path)

model = PeftModel.from_pretrained(model, model_id=lora_config_path, config=config)


In [5]:
model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 4096)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaSdpaAttention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.055, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): lora.Linear(
                (base_layer): Linear(in_features=

In [6]:
messages = [ 
    {"role": "user", "content":prompt}, 
] 

pipe = pipeline( 
    "text-generation", 
    model=model, 
    tokenizer=tokenizer, 
) 

generation_args = { 
    "max_new_tokens": 500, 
    "return_full_text": False, 
    "temperature": 0.0, 
    "do_sample": False, 
} 

output = pipe(messages, **generation_args) 
print(output[0]['generated_text'])

The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'Gemma2ForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'JambaForCausalLM', 'JetMoeForCausalLM', 'LlamaForCausalLM', 'MambaForCausalLM', 'Mamba2ForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'MistralForCausalLM', 'MixtralForCausal

I am Fairy, an AI assistant developed by Cerbo AI. How can I assist you today?
