In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig,DataCollatorForSeq2Seq
from peft import get_peft_model, LoraConfig,TaskType
from peft.utils import prepare_model_for_kbit_training
import torch
from datasets import Dataset
import pandas as pd

In [2]:
df = pd.read_json('../datasets/datasets/cerbo.json')
ds = Dataset.from_pandas(df)

In [3]:
ds

Dataset({
    features: ['instruction', 'input', 'output'],
    num_rows: 91
})

## Processing the training dataset

In [4]:
model_path = "google/gemma-2-2b-it"
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.padding_side = 'right'


In [5]:
def process_func(example):
    MAX_LENGTH = 384
    input_ids, attention_mask, labels = [], [], []
    instruction = tokenizer(f"<bos><start_of_turn>user\n{example['instruction'] + example['input']}<end_of_turn>\n<start_of_turn>model\n", add_special_tokens=False)
    response = tokenizer(f"{example['output']}<end_of_turn>\n", add_special_tokens=False)
    input_ids = instruction["input_ids"] + response["input_ids"] + [tokenizer.pad_token_id]
    attention_mask = instruction["attention_mask"] + response["attention_mask"] + [1] 
    labels = [-100] * len(instruction["input_ids"]) + response["input_ids"] + [tokenizer.pad_token_id]  
    if len(input_ids) > MAX_LENGTH: 
        input_ids = input_ids[:MAX_LENGTH]
        attention_mask = attention_mask[:MAX_LENGTH]
        labels = labels[:MAX_LENGTH]
    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels
    }

In [6]:
tokenized_id = ds.map(process_func, remove_columns=ds.column_names)
tokenized_id

Map:   0%|          | 0/91 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 91
})

In [7]:
print(tokenizer.decode(tokenized_id[0]['input_ids']))


<bos><start_of_turn>user
hi<end_of_turn>
<start_of_turn>model
Hello! How can I assist you today?<end_of_turn>
<eos>


## Create model

In [8]:
quantization_config = BitsAndBytesConfig(load_in_8bit=True)

In [9]:
model = model = AutoModelForCausalLM.from_pretrained(
        model_path,
        quantization_config=quantization_config,
        device_map="auto",
        trust_remote_code=True,
        torch_dtype=torch.bfloat16,
    )

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [10]:
model

Gemma2ForCausalLM(
  (model): Gemma2Model(
    (embed_tokens): Embedding(256000, 2304, padding_idx=0)
    (layers): ModuleList(
      (0-25): 26 x Gemma2DecoderLayer(
        (self_attn): Gemma2SdpaAttention(
          (q_proj): Linear8bitLt(in_features=2304, out_features=2048, bias=False)
          (k_proj): Linear8bitLt(in_features=2304, out_features=1024, bias=False)
          (v_proj): Linear8bitLt(in_features=2304, out_features=1024, bias=False)
          (o_proj): Linear8bitLt(in_features=2048, out_features=2304, bias=False)
          (rotary_emb): Gemma2RotaryEmbedding()
        )
        (mlp): Gemma2MLP(
          (gate_proj): Linear8bitLt(in_features=2304, out_features=9216, bias=False)
          (up_proj): Linear8bitLt(in_features=2304, out_features=9216, bias=False)
          (down_proj): Linear8bitLt(in_features=9216, out_features=2304, bias=False)
          (act_fn): PytorchGELUTanh()
        )
        (input_layernorm): Gemma2RMSNorm((2304,), eps=1e-06)
        (post_att

In [11]:
model.enable_input_require_grads()

## Lora

In [12]:
config = LoraConfig(
    task_type=TaskType.CAUSAL_LM, 
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", 'gate_proj', 'up_proj', 'down_proj'],
    inference_mode=False,
    r=8, 
    lora_alpha=16, 
    lora_dropout=0.055,
    bias="none",
)
config

LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, task_type=<TaskType.CAUSAL_LM: 'CAUSAL_LM'>, inference_mode=False, r=8, target_modules={'k_proj', 'v_proj', 'q_proj', 'up_proj', 'gate_proj', 'o_proj', 'down_proj'}, lora_alpha=16, lora_dropout=0.055, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', loftq_config={}, use_dora=False, layer_replication=None)

In [13]:
model = get_peft_model(model, config)
model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): Gemma2ForCausalLM(
      (model): Gemma2Model(
        (embed_tokens): Embedding(256000, 2304, padding_idx=0)
        (layers): ModuleList(
          (0-25): 26 x Gemma2DecoderLayer(
            (self_attn): Gemma2SdpaAttention(
              (q_proj): lora.Linear8bitLt(
                (base_layer): Linear8bitLt(in_features=2304, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.055, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2304, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): lora.Linear8bitLt(
            

In [14]:
model.print_trainable_parameters()

trainable params: 10,383,360 || all params: 2,624,725,248 || trainable%: 0.3956


## Configure training parameters

In [15]:
from trl import SFTConfig, SFTTrainer


In [17]:
args=SFTConfig(
    output_dir="./output/gemma2",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    logging_steps=20,
    log_level="info",
    num_train_epochs=50,
    save_steps=100,
    learning_rate=1e-4,
    save_total_limit=2,
    gradient_checkpointing=True,
    dataset_text_field="text",
    max_seq_length=2048
)


In [19]:
from trl import SFTTrainer
trainer = SFTTrainer(
            model=model,
            train_dataset=tokenized_id,
            tokenizer=tokenizer,
            args=args,
            data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True)
        )

In [20]:
trainer.train()

***** Running training *****
  Num examples = 91
  Num Epochs = 50
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 4
  Total optimization steps = 250
  Number of trainable parameters = 10,383,360
It is strongly recommended to train Gemma2 models with the `eager` attention implementation instead of `sdpa`. Use `eager` with `AutoModelForCausalLM.from_pretrained('<path-to-checkpoint>', attn_implementation='eager')`.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss
20,2.0305
40,0.695
60,0.2731
80,0.0946
100,0.0432
120,0.0198
140,0.0122
160,0.0076
180,0.0041
200,0.0028


Saving model checkpoint to ./output/gemma2/checkpoint-100
tokenizer config file saved in ./output/gemma2/checkpoint-100/tokenizer_config.json
Special tokens file saved in ./output/gemma2/checkpoint-100/special_tokens_map.json
Saving model checkpoint to ./output/gemma2/checkpoint-200
tokenizer config file saved in ./output/gemma2/checkpoint-200/tokenizer_config.json
Special tokens file saved in ./output/gemma2/checkpoint-200/special_tokens_map.json
Saving model checkpoint to ./output/gemma2/checkpoint-250
tokenizer config file saved in ./output/gemma2/checkpoint-250/tokenizer_config.json
Special tokens file saved in ./output/gemma2/checkpoint-250/special_tokens_map.json
Deleting older checkpoint [output/gemma2/checkpoint-100] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=250, training_loss=0.25513947682082655, metrics={'train_runtime': 400.4094, 'train_samples_per_second': 11.363, 'train_steps_per_second': 0.624, 'total_flos': 2603082304664064.0, 'train_loss': 0.25513947682082655, 'epoch': 43.47826086956522})

## Save LoRA and tokenizer results

In [21]:
lora_path='./output/gemma2'
trainer.model.save_pretrained(lora_path)
tokenizer.save_pretrained(lora_path)

tokenizer config file saved in ./output/gemma2/tokenizer_config.json
Special tokens file saved in ./output/gemma2/special_tokens_map.json


('./output/gemma2/tokenizer_config.json',
 './output/gemma2/special_tokens_map.json',
 './output/gemma2/tokenizer.model',
 './output/gemma2/added_tokens.json')

## Load lora weight

In [1]:
import torch 
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline 
import torch
from peft import (
    PeftModel,
    LoraConfig,
    get_peft_model,
    get_peft_model_state_dict,
    prepare_model_for_kbit_training,
    set_peft_model_state_dict,
)
model_path = "google/gemma-2-2b-it"
model = AutoModelForCausalLM.from_pretrained( 
    model_path,  
    device_map="cuda",  
    torch_dtype="auto",  
    trust_remote_code=True,  
)

tokenizer = AutoTokenizer.from_pretrained(model_path)


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
prompt = "who are you?"
messages = [ 
    {"role": "user", "content":prompt}, 
] 

pipe = pipeline( 
    "text-generation", 
    model=model, 
    tokenizer=tokenizer, 
) 

generation_args = { 
    "max_new_tokens": 50, 
    "return_full_text": False, 
    "temperature": 0.0, 
    "do_sample": False, 
} 

output = pipe(messages, **generation_args) 
print(output[0]['generated_text'])

who are you? Roderick
who are you? Roderick
who are you? Roderick
who are you? Roderick
who are you? Roderick
who are you? Roderick
who are you? Roderick
who are you? Roderick
who are


In [4]:
lora_config_path = './output/gemma2'
config = LoraConfig.from_pretrained(lora_config_path)

model = PeftModel.from_pretrained(model, model_id=lora_config_path, config=config)


In [5]:
messages = [ 
    {"role": "user", "content":"who are you?"}, 
] 

pipe = pipeline( 
    "text-generation", 
    model=model, 
    tokenizer=tokenizer, 
) 

generation_args = { 
    "max_new_tokens": 500, 
    "return_full_text": False, 
    "temperature": 0.0, 
    "do_sample": False, 
} 

output = pipe(messages, **generation_args) 
print(output[0]['generated_text'])

The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'Gemma2ForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'JambaForCausalLM', 'JetMoeForCausalLM', 'LlamaForCausalLM', 'MambaForCausalLM', 'Mamba2ForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'MistralForCausalLM', 'MixtralForCausal

I am Fairy, an AI assistant developed by Cerbo AI. How can I assist you today?

