In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import get_peft_model, LoraConfig,TaskType
from peft.utils import prepare_model_for_kbit_training
import torch
from datasets import Dataset
import pandas as pd

In [2]:
df = pd.read_json('../datasets/datasets/cerbo.json')
ds = Dataset.from_pandas(df)

In [3]:
ds

Dataset({
    features: ['instruction', 'input', 'output'],
    num_rows: 91
})

## Processing the training dataset

In [5]:
model_path = "microsoft/Phi-3-mini-4k-instruct"
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False, padding_side="right")
tokenizer.pad_token = tokenizer.unk_token
tokenizer.model_max_length = 2048
tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [6]:
def apply_chat_template(
    example,
    tokenizer,
):
    messages = [
        {"role": "user", "content": example["instruction"]},
        {"role": "assistant", "content": example["output"]},
    ]
    example["text"] = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=False
    )
    return example

In [7]:
processed_train_dataset = ds.map(
        apply_chat_template,
        fn_kwargs={"tokenizer": tokenizer},
        num_proc=10,
        remove_columns=ds.column_names,
        desc="Applying chat template to train_sft",
    )

Applying chat template to train_sft (num_proc=10):   0%|          | 0/91 [00:00<?, ? examples/s]

In [8]:
processed_train_dataset[0]

{'text': '<|user|>\nhi<|end|>\n<|assistant|>\nHello! How can I assist you today?<|end|>\n<|endoftext|>'}

## Create model

In [9]:
quantization_config = BitsAndBytesConfig(load_in_8bit=True)

In [10]:
model = model = AutoModelForCausalLM.from_pretrained(
        model_path,
        quantization_config=quantization_config,
        device_map="auto",
        trust_remote_code=True,
        torch_dtype=torch.bfloat16,
    )

`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [12]:
model

Phi3ForCausalLM(
  (model): Phi3Model(
    (embed_tokens): Embedding(32064, 3072, padding_idx=32000)
    (embed_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-31): 32 x Phi3DecoderLayer(
        (self_attn): Phi3Attention(
          (o_proj): Linear8bitLt(in_features=3072, out_features=3072, bias=False)
          (qkv_proj): Linear8bitLt(in_features=3072, out_features=9216, bias=False)
          (rotary_emb): Phi3RotaryEmbedding()
        )
        (mlp): Phi3MLP(
          (gate_up_proj): Linear8bitLt(in_features=3072, out_features=16384, bias=False)
          (down_proj): Linear8bitLt(in_features=8192, out_features=3072, bias=False)
          (activation_fn): SiLU()
        )
        (input_layernorm): Phi3RMSNorm()
        (resid_attn_dropout): Dropout(p=0.0, inplace=False)
        (resid_mlp_dropout): Dropout(p=0.0, inplace=False)
        (post_attention_layernorm): Phi3RMSNorm()
      )
    )
    (norm): Phi3RMSNorm()
  )
  (lm_head): Linear(in_features

In [13]:
model.enable_input_require_grads()

## Lora

In [14]:
config = LoraConfig(
    task_type=TaskType.CAUSAL_LM, 
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    inference_mode=False, 
    r=8, 
    lora_alpha=16, 
    lora_dropout=0.055,
    bias="none",
)
config

LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, task_type=<TaskType.CAUSAL_LM: 'CAUSAL_LM'>, inference_mode=False, r=8, target_modules={'gate_proj', 'o_proj', 'down_proj', 'v_proj', 'q_proj', 'up_proj', 'k_proj'}, lora_alpha=16, lora_dropout=0.055, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', loftq_config={}, use_dora=False, layer_replication=None)

In [15]:
model = get_peft_model(model, config)
model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): Phi3ForCausalLM(
      (model): Phi3Model(
        (embed_tokens): Embedding(32064, 3072, padding_idx=32000)
        (embed_dropout): Dropout(p=0.0, inplace=False)
        (layers): ModuleList(
          (0-31): 32 x Phi3DecoderLayer(
            (self_attn): Phi3Attention(
              (o_proj): lora.Linear8bitLt(
                (base_layer): Linear8bitLt(in_features=3072, out_features=3072, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.055, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=3072, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=3072, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
         

In [16]:
model.print_trainable_parameters()

trainable params: 4,456,448 || all params: 3,825,536,000 || trainable%: 0.1165


## Configure training parameters

In [17]:
from trl import SFTConfig, SFTTrainer

In [18]:
args=SFTConfig(
    output_dir="./output/Phi-3",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    logging_steps=20,
    log_level="info",
    num_train_epochs=50,
    save_steps=100,
    learning_rate=1e-4,
    save_total_limit=2,
    gradient_checkpointing=True,
    dataset_text_field="text",
    max_seq_length=2048,
)


In [19]:
from trl import SFTTrainer
trainer = SFTTrainer(
            model=model,
            train_dataset=processed_train_dataset,
            tokenizer=tokenizer,
            args=args
        )

Map:   0%|          | 0/91 [00:00<?, ? examples/s]

In [20]:
trainer.train()

***** Running training *****
  Num examples = 91
  Num Epochs = 50
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 4
  Total optimization steps = 250
  Number of trainable parameters = 4,456,448
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
You are not running the flash-attention implementation, expect numerical differences.


Step,Training Loss
20,1.6499
40,0.8441
60,0.652
80,0.5563
100,0.4601
120,0.3744
140,0.2977
160,0.2317
180,0.1885
200,0.1597


Saving model checkpoint to ./output/Phi-3/checkpoint-100
tokenizer config file saved in ./output/Phi-3/checkpoint-100/tokenizer_config.json
Special tokens file saved in ./output/Phi-3/checkpoint-100/special_tokens_map.json
added tokens file saved in ./output/Phi-3/checkpoint-100/added_tokens.json
Saving model checkpoint to ./output/Phi-3/checkpoint-200
tokenizer config file saved in ./output/Phi-3/checkpoint-200/tokenizer_config.json
Special tokens file saved in ./output/Phi-3/checkpoint-200/special_tokens_map.json
added tokens file saved in ./output/Phi-3/checkpoint-200/added_tokens.json
Saving model checkpoint to ./output/Phi-3/checkpoint-250
tokenizer config file saved in ./output/Phi-3/checkpoint-250/tokenizer_config.json
Special tokens file saved in ./output/Phi-3/checkpoint-250/special_tokens_map.json
added tokens file saved in ./output/Phi-3/checkpoint-250/added_tokens.json
Deleting older checkpoint [output/Phi-3/checkpoint-100] due to args.save_total_limit


Training completed.

TrainOutput(global_step=250, training_loss=0.46030655479431154, metrics={'train_runtime': 343.1356, 'train_samples_per_second': 13.26, 'train_steps_per_second': 0.729, 'total_flos': 9045626707445760.0, 'train_loss': 0.46030655479431154, 'epoch': 43.47826086956522})

## Save LoRA and tokenizer results

In [21]:
lora_path='./Phi-3_lora'
trainer.model.save_pretrained(lora_path)
tokenizer.save_pretrained(lora_path)

tokenizer config file saved in ./Phi-3_lora/tokenizer_config.json
Special tokens file saved in ./Phi-3_lora/special_tokens_map.json
added tokens file saved in ./Phi-3_lora/added_tokens.json


('./Phi-3_lora/tokenizer_config.json',
 './Phi-3_lora/special_tokens_map.json',
 './Phi-3_lora/tokenizer.model',
 './Phi-3_lora/added_tokens.json')

## Load lora weight

In [1]:
import torch 
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline 
import torch
from peft import (
    PeftModel,
    LoraConfig,
    get_peft_model,
    get_peft_model_state_dict,
    prepare_model_for_kbit_training,
    set_peft_model_state_dict,
)
model_path = "microsoft/Phi-3-mini-4k-instruct"
torch.random.manual_seed(0) 
model = AutoModelForCausalLM.from_pretrained( 
    model_path,  
    device_map="cuda",  
    torch_dtype="auto",  
    trust_remote_code=True,  
)

tokenizer = AutoTokenizer.from_pretrained(model_path)


`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [2]:
prompt = "who are you?"
messages = [ 
    {"role": "user", "content":prompt}, 
] 

pipe = pipeline( 
    "text-generation", 
    model=model, 
    tokenizer=tokenizer, 
) 

generation_args = { 
    "max_new_tokens": 500, 
    "return_full_text": False, 
    "temperature": 0.0, 
    "do_sample": False, 
} 

output = pipe(messages, **generation_args) 
print(output[0]['generated_text'])

The `seen_tokens` attribute is deprecated and will be removed in v4.41. Use the `cache_position` model input instead.
You are not running the flash-attention implementation, expect numerical differences.


 I am Phi, Microsoft's language model, trained to assist with a wide range of queries and tasks.


In [3]:
lora_config_path = './Phi-3_lora'
config = LoraConfig.from_pretrained(lora_config_path)

model = PeftModel.from_pretrained(model, model_id=lora_config_path, config=config)


In [4]:
model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): Phi3ForCausalLM(
      (model): Phi3Model(
        (embed_tokens): Embedding(32064, 3072, padding_idx=32000)
        (embed_dropout): Dropout(p=0.0, inplace=False)
        (layers): ModuleList(
          (0-31): 32 x Phi3DecoderLayer(
            (self_attn): Phi3Attention(
              (o_proj): lora.Linear(
                (base_layer): Linear(in_features=3072, out_features=3072, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.055, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=3072, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=3072, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (qkv_pr

In [5]:
messages = [ 
    {"role": "user", "content":prompt}, 
] 

pipe = pipeline( 
    "text-generation", 
    model=model, 
    tokenizer=tokenizer, 
) 

generation_args = { 
    "max_new_tokens": 500, 
    "return_full_text": False, 
    "temperature": 0.0, 
    "do_sample": False, 
} 

output = pipe(messages, **generation_args) 
print(output[0]['generated_text'])

The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'Gemma2ForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'JambaForCausalLM', 'JetMoeForCausalLM', 'LlamaForCausalLM', 'MambaForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'MistralForCausalLM', 'MixtralForCausalLM', 'MptForCausalLM'

I am Fairy, an AI assistant developed by Cerbo AI.
