In [1]:
import torch
from peft import PeftModel
from transformers import LlamaForCausalLM, LlamaTokenizer

In [2]:
BASE_MODEL = "../models/llama-7b-hf"
LORA_MODEL = "../lora-models/lora-alpaca-qa"
HF_MODEL = "../models/lora-alpaca-qa"

In [3]:
tokenizer = LlamaTokenizer.from_pretrained(BASE_MODEL)
base_model = LlamaForCausalLM.from_pretrained(
    BASE_MODEL,
    load_in_8bit=False,
    torch_dtype=torch.float16,
    device_map={"":1},
)

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. If you see this, DO NOT PANIC! This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=True`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [4]:
first_weight = base_model.model.layers[0].self_attn.q_proj.weight
first_weight_old = first_weight.clone()
first_weight

Parameter containing:
tensor([[-0.0096, -0.0301,  0.0085,  ...,  0.0178, -0.0052, -0.0365],
        [-0.0029, -0.0101,  0.0100,  ...,  0.0147,  0.0040, -0.0104],
        [-0.0004,  0.0139, -0.0074,  ..., -0.0083, -0.0070,  0.0146],
        ...,
        [-0.0107, -0.0061,  0.0310,  ..., -0.0052, -0.0143,  0.0236],
        [-0.0104, -0.0213, -0.0129,  ..., -0.0199, -0.0143, -0.0103],
        [ 0.0184,  0.0119,  0.0195,  ...,  0.0343, -0.0327, -0.0355]],
       device='cuda:1', dtype=torch.float16, requires_grad=True)

In [5]:
first_weight.shape

torch.Size([4096, 4096])

In [10]:
lora_model = PeftModel.from_pretrained(
    base_model,
    LORA_MODEL,
    torch_dtype=torch.float16,
    device_map={"":1},
)

In [11]:
lora_weight = lora_model.model.model.layers[0].self_attn.q_proj.weight
lora_weight

Parameter containing:
tensor([[-0.0096, -0.0301,  0.0085,  ...,  0.0178, -0.0052, -0.0365],
        [-0.0029, -0.0101,  0.0100,  ...,  0.0147,  0.0040, -0.0104],
        [-0.0004,  0.0139, -0.0074,  ..., -0.0083, -0.0070,  0.0146],
        ...,
        [-0.0107, -0.0061,  0.0310,  ..., -0.0052, -0.0143,  0.0236],
        [-0.0104, -0.0213, -0.0129,  ..., -0.0199, -0.0143, -0.0103],
        [ 0.0184,  0.0119,  0.0195,  ...,  0.0343, -0.0327, -0.0355]],
       device='cuda:1', dtype=torch.float16)

In [12]:
merged_model = lora_model.merge_and_unload(progressbar=True)
merged_model

Unloading and merging model: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 454/454 [00:06<00:00, 71.91it/s]


LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096, padding_idx=31999)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaR

In [35]:
merged_weight = merged_model.model.layers[0].self_attn.q_proj.weight
merged_weight

Parameter containing:
tensor([[-0.0096, -0.0301,  0.0085,  ...,  0.0178, -0.0052, -0.0365],
        [-0.0029, -0.0101,  0.0100,  ...,  0.0147,  0.0040, -0.0104],
        [-0.0004,  0.0139, -0.0074,  ..., -0.0083, -0.0070,  0.0146],
        ...,
        [-0.0107, -0.0061,  0.0310,  ..., -0.0052, -0.0143,  0.0236],
        [-0.0104, -0.0213, -0.0129,  ..., -0.0199, -0.0143, -0.0103],
        [ 0.0184,  0.0119,  0.0195,  ...,  0.0343, -0.0327, -0.0355]],
       device='cuda:1', dtype=torch.float16)

In [36]:
torch.allclose(first_weight_old, merged_weight)

True

## Base vs QA - alpaca

In [6]:
from transformers import pipeline

In [7]:
base_generator = pipeline(
    'text-generation',
    tokenizer=tokenizer,
    model=base_model,
    pad_token_id=tokenizer.eos_token_id,
    return_full_text=True,
)

In [8]:
template = "Below is an instruction that describes a task. Write a response that appropriately completes the request. ### Instruction: {instruction} ### Response: "
instruction = "Tell me what is an alpaca"
prompt = template.format(instruction=instruction)
prompt

'Below is an instruction that describes a task. Write a response that appropriately completes the request. ### Instruction: Tell me what is an alpaca ### Response: '

In [9]:
responses = base_generator(prompt, max_new_tokens=100)
print(responses)



[{'generated_text': 'Below is an instruction that describes a task. Write a response that appropriately completes the request. ### Instruction: Tell me what is an alpaca ### Response: 1. An alpaca is a type of animal that is native to South America. 2. An alpaca is a type of animal that is native to South America. 3. An alpaca is a type of animal that is native to South America. 4. An alpaca is a type of animal that is native to South America. 5. An alpaca is a type of animal that is native to South America. 6. An alp'}]


In [14]:
lora_generator = pipeline(
    'text-generation',
    tokenizer=tokenizer,
    model=merged_model,
    pad_token_id=tokenizer.eos_token_id,
    return_full_text=True,
)

In [15]:
lora_generator(prompt, max_new_tokens=100)

[{'generated_text': 'Below is an instruction that describes a task. Write a response that appropriately completes the request. ### Instruction: Tell me what is an alpaca ### Response: 1. An alpaca is a type of animal that is native to South America. 2. An alpaca is a type of animal that is native to South America. 3. An alpaca is a type of animal that is native to South America. 4. An alpaca is a type of animal that is native to South America. 5. An alpaca is a type of animal that is native to South America. 6. An alp'}]

## naive generate method

In [26]:
model_input = tokenizer(prompt, return_tensors='pt')
for key in model_input:
    model_input[key].to(1)

In [34]:
model_output = lora_model.generate(**model_input, max_new_tokens=100, 
  num_return_sequences=5,
  num_beams=5,
  do_sample=True,
  pad_token_id=tokenizer.eos_token_id)
completions = tokenizer.batch_decode(model_output)



RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:1 and cpu! (when checking argument for argument index in method wrapper_CUDA__index_select)

In [None]:
for idx, completion in enumerate(completions):
    print('-'*10, f"Completion {idx}", '-'*10)
    print(completion)