In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0"

In [2]:
import os
import torch
from datasets import load_dataset
from peft import get_peft_model, LoraConfig, prepare_model_for_kbit_training
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from trl import SFTConfig, SFTTrainer

# Load model

In [3]:
bnb_config = BitsAndBytesConfig(
   load_in_4bit=True,
   bnb_4bit_quant_type="nf4",
   bnb_4bit_use_double_quant=True,
   bnb_4bit_compute_dtype=torch.float32
)
repo_id = 'microsoft/Phi-3-mini-4k-instruct'
model = AutoModelForCausalLM.from_pretrained(
   repo_id, device_map="cuda:0", quantization_config=bnb_config
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
print(model.get_memory_footprint()/1e6)

2206.347264


In [5]:
tokenizer = AutoTokenizer.from_pretrained(repo_id)

# Load Dataset

In [6]:
dataset = load_dataset("dvgodoy/yoda_sentences", split="train")
dataset

Dataset({
    features: ['sentence', 'translation', 'translation_extra'],
    num_rows: 720
})

In [7]:
dataset = dataset.rename_column("sentence", "prompt")
dataset = dataset.rename_column("translation_extra", "completion")
dataset = dataset.remove_columns(["translation"])
dataset

Dataset({
    features: ['prompt', 'completion'],
    num_rows: 720
})

In [8]:
def apply_chat_template(sample):
    sample['messages'] = [
        {"role":"user", "content":sample['prompt']},
        {"role":"assistant", "content":sample['completion']}]
    del sample['prompt']
    del sample['completion']
    return sample

dataset = dataset.map(apply_chat_template)

Map:   0%|          | 0/720 [00:00<?, ? examples/s]

# Ensure Dataset is in Conversational Format

In [9]:
# Run SFTTrainer._prepare_dataset on the training data
model_name = model.config._name_or_path.split("/")[-1]
args = SFTConfig(f"{model_name}-SFT")
processed_dataset = SFTTrainer._prepare_dataset(SFTTrainer, dataset, tokenizer, args, args.packing, None, "train")

# The output should be in the conversation template.
processed_dataset[0]['text']

Converting train dataset to ChatML:   0%|          | 0/720 [00:00<?, ? examples/s]

Parameter 'fn_kwargs'={'tokenizer': LlamaTokenizerFast(name_or_path='microsoft/Phi-3-mini-4k-instruct', vocab_size=32000, model_max_length=4096, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '<|endoftext|>', 'unk_token': '<unk>', 'pad_token': '<|endoftext|>'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=True, lstrip=False, single_word=False, normalized=False, special=False),
	32000: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	32001: AddedToken("<|assistant|>", rstrip=True, lstrip=False, single_word=False, normalized=False, special=True),
	32002: AddedToken("<|placeholder1|>", rstrip=True, lstrip=False, single_w

Applying chat template to train dataset:   0%|          | 0/720 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/720 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/720 [00:00<?, ? examples/s]

'<|user|>\nThe birch canoe slid on the smooth planks.<|end|>\n<|assistant|>\nOn the smooth planks, the birch canoe slid. Yes, hrrrm.<|end|>\n<|endoftext|>'

# Set up LoRA

In [10]:
model = prepare_model_for_kbit_training(model)

config = LoraConfig(
    r=8,                   # the rank of the adapter, the lower the fewer parameters you'll need to train
    lora_alpha=16,         # multiplier, usually 2*r
    bias="none",           # BEWARE: training biases *modifies* base model's behavior
    lora_dropout=0.05,
    task_type="CAUSAL_LM",
    # Newer models, such as Phi-3 at time of writing, may require
    # manually setting target modules
    target_modules=['o_proj', 'qkv_proj', 'gate_up_proj', 'down_proj'],
)

model = get_peft_model(model, config)

In [11]:
print(model.get_memory_footprint()/1e6)

2651.080704


In [12]:
trainable_parms, tot_parms = model.get_nb_trainable_parameters()
print(f'Trainable parameters:             {trainable_parms/1e6:.2f}M')
print(f'Total parameters:                 {tot_parms/1e6:.2f}M')
print(f'Fraction of trainable parameters: {100*trainable_parms/tot_parms:.2f}%')

Trainable parameters:             12.58M
Total parameters:                 3833.66M
Fraction of trainable parameters: 0.33%


# Finetune

In [13]:
sft_config = SFTConfig(
    ## GROUP 1: Memory usage
    # These arguments will squeeze the most out of your GPU's RAM
    # Checkpointing
    gradient_checkpointing=True,
    # this saves a LOT of memory
    # Set this to avoid exceptions in newer versions of PyTorch
    gradient_checkpointing_kwargs={'use_reentrant': False},
    # Gradient Accumulation / Batch size
    # Actual batch (for updating) is same (1x) as micro-batch size
    gradient_accumulation_steps=1,
    # The initial (micro) batch size to start off with
    per_device_train_batch_size=16,
    # If batch size would cause OOM, halves its size until it works
    auto_find_batch_size=True,
    
    ## GROUP 2: Dataset-related
    max_seq_length=64,
    # Dataset
    # packing a dataset means no padding is needed
    packing=False,
    
    ## GROUP 3: These are typical training parameters
    num_train_epochs=3,
    learning_rate=3e-4,
    # Optimizer
    # 8-bit Adam optimizer - doesn't help much if you're using LoRA!
    optim='paged_adamw_8bit',
    
    ## GROUP 4: Logging parameters
    logging_steps=10,
    logging_dir='./logs',
    output_dir='./phi3-mini-yoda-adapter',
    report_to='none'
)

In [14]:
trainer = SFTTrainer(
    model=model,
    processing_class=tokenizer,
    args=sft_config,
    train_dataset=dataset,
)

Applying chat template to train dataset:   0%|          | 0/720 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/720 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/720 [00:00<?, ? examples/s]

In [15]:
dl = trainer.get_train_dataloader()
batch = next(iter(dl))
tokenizer.decode(batch['input_ids'][0])

'<|user|> They took their kids from the public school.<|end|><|assistant|> From the public school, their kids they took. Hrmmm.<|end|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|>'

In [16]:
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss
10,2.4301
20,1.514
30,1.4259
40,1.3519
50,1.2789
60,1.1569
70,1.1938
80,1.1853
90,1.1847
100,1.0


TrainOutput(global_step=135, training_loss=1.2666392467640064, metrics={'train_runtime': 236.5761, 'train_samples_per_second': 9.13, 'train_steps_per_second': 0.571, 'total_flos': 1873198611431424.0, 'train_loss': 1.2666392467640064})

# Query the model

In [2]:
def gen_prompt(tokenizer, sentence):
    converted_sample = [
        {"role": "user", "content": sentence},
    ]
    prompt = tokenizer.apply_chat_template(converted_sample, 
                                           tokenize=False, 
                                           add_generation_prompt=True)
    return prompt

def generate(model, tokenizer, prompt, max_new_tokens=64, skip_special_tokens=False):
    tokenized_input = tokenizer(prompt, add_special_tokens=False, return_tensors="pt").to(model.device)

    model.eval()
    generation_output = model.generate(**tokenized_input,
                                       max_new_tokens=max_new_tokens)
    
    output = tokenizer.batch_decode(generation_output, 
                                    skip_special_tokens=skip_special_tokens)
    return output[0]

In [5]:
sentence = 'The force is strong in you!'
prompt = gen_prompt(tokenizer, sentence)
print(generate(merged_model, tokenizer, prompt))

<|user|> The force is strong in you!<|end|><|assistant|> The phrase "The force is strong in you!" is a famous line from the Star Wars movie series, specifically from "The Empire Strikes Back." It is spoken by Darth Vader to Luke Skywalker in the climactic scene where Luke is about to be frozen in carbonite. The line has


# Save the model

In [85]:
trained_model = trainer.save_model('yoda-adapter')

# Load the model

In [1]:
from peft import PeftConfig, AutoPeftModelForCausalLM
from transformers import AutoTokenizer, BitsAndBytesConfig
import torch

MODEL_NAME = 'yoda-adapter'

bnb_config = BitsAndBytesConfig(
   load_in_4bit=True,
   bnb_4bit_quant_type="nf4",
   bnb_4bit_use_double_quant=True,
   bnb_4bit_compute_dtype=torch.float16
)
config = PeftConfig.from_pretrained(MODEL_NAME)

tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
model = AutoPeftModelForCausalLM.from_pretrained(MODEL_NAME, device_map="cuda:0", quantization_config=bnb_config)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [7]:
sentence = 'The force is strong in you!'
prompt = gen_prompt(tokenizer, sentence)
print(generate(model, tokenizer, prompt))

<|user|> The force is strong in you!<|end|><|assistant|> Strong in you, the force is!<|end|>
