In [1]:
from transformers import OPTForCausalLM, AutoTokenizer, GenerationConfig, BitsAndBytesConfig
from peft import get_peft_model, PeftModel, PeftConfig, get_peft_config, LoraConfig


Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /data/chris/anaconda3/envs/fastchat-env/lib/python3.8/site-packages/bitsandbytes/libbitsandbytes_cuda117_nocublaslt.so
CUDA SETUP: CUDA runtime path found: /data/chris/anaconda3/envs/fastchat-env/lib/libcudart.so
CUDA SETUP: Highest compute capability among GPUs detected: 6.1
CUDA SETUP: Detected CUDA version 117
CUDA SETUP: Loading binary /data/chris/anaconda3/envs/fastchat-env/lib/python3.8/site-packages/bitsandbytes/libbitsandbytes_cuda117_nocublaslt.so...


Either way, this might cause trouble in the future:
If you get `CUDA error: invalid device function` errors, the above might be the cause and the solution is to make sure only one ['libcudart.so', 'libcudart.so.11.0', 'libcudart.so.12.0'] in the paths that we search based on your env.
  warn(msg)
  warn(msg)


In [2]:
double_quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    quant_dtype="nf4"
)

base_model = OPTForCausalLM.from_pretrained("facebook/opt-125m")
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")

In [3]:
# https://github.com/oobabooga/text-generation-webui/blob/63ece46213483b9b2692a9e4299cf3cd0ed7adb2/modules/LoRA.py#L91
model = PeftModel.from_pretrained(base_model, '/data/chris/adapters/peft-opt125m-dummylora', adapter_name="dummy-lora-one")
model.load_adapter('/data/chris/adapters/peft-opt125m-dummylora2', adapter_name="dummy-lora-two")

_IncompatibleKeys(missing_keys=['base_model.model.model.decoder.embed_tokens.weight', 'base_model.model.model.decoder.embed_positions.weight', 'base_model.model.model.decoder.final_layer_norm.weight', 'base_model.model.model.decoder.final_layer_norm.bias', 'base_model.model.model.decoder.layers.0.self_attn.k_proj.weight', 'base_model.model.model.decoder.layers.0.self_attn.k_proj.bias', 'base_model.model.model.decoder.layers.0.self_attn.v_proj.weight', 'base_model.model.model.decoder.layers.0.self_attn.v_proj.bias', 'base_model.model.model.decoder.layers.0.self_attn.v_proj.lora_A.dummy-lora-one.weight', 'base_model.model.model.decoder.layers.0.self_attn.v_proj.lora_B.dummy-lora-one.weight', 'base_model.model.model.decoder.layers.0.self_attn.q_proj.weight', 'base_model.model.model.decoder.layers.0.self_attn.q_proj.bias', 'base_model.model.model.decoder.layers.0.self_attn.q_proj.lora_A.dummy-lora-one.weight', 'base_model.model.model.decoder.layers.0.self_attn.q_proj.lora_B.dummy-lora-one.

In [4]:
import torch

device = "cuda"


def generate_prompt(instruction, input=None):
    if input:
        return f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
### Instruction:
{instruction}
### Input:
{input}
### Response:"""
    else:
        return f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.
### Instruction:
{instruction}
### Response:"""


def evaluate(
    instruction,
    input=None,
    temperature=0.1,
    top_p=0.75,
    top_k=40,
    num_beams=4,
    max_new_tokens=256,
    **kwargs,
):
    prompt = generate_prompt(instruction, input)
    inputs = tokenizer(prompt, return_tensors="pt")
    input_ids = inputs["input_ids"].to(device)
    generation_config = GenerationConfig(
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        num_beams=num_beams,
        no_repeat_ngram_size=3,
        **kwargs,
    )

    with torch.no_grad():
        generation_output = model.generate(
            input_ids=input_ids,
            generation_config=generation_config,
            return_dict_in_generate=True,
            output_scores=True,
            max_new_tokens=max_new_tokens,
        )
    s = generation_output.sequences[0]
    output = tokenizer.decode(s)
    return output.split("### Response:")[1].strip()

In [5]:
model.to('cuda')

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): OPTForCausalLM(
      (model): OPTModel(
        (decoder): OPTDecoder(
          (embed_tokens): Embedding(50272, 768, padding_idx=1)
          (embed_positions): OPTLearnedPositionalEmbedding(2050, 768)
          (final_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (layers): ModuleList(
            (0-11): 12 x OPTDecoderLayer(
              (self_attn): OPTAttention(
                (k_proj): Linear(in_features=768, out_features=768, bias=True)
                (v_proj): Linear(
                  in_features=768, out_features=768, bias=True
                  (lora_dropout): ModuleDict(
                    (dummy-lora-one): Dropout(p=0.05, inplace=False)
                    (dummy-lora-two): Dropout(p=0.05, inplace=False)
                  )
                  (lora_A): ModuleDict(
                    (dummy-lora-one): Linear(in_features=768, out_features=8, bias=False)
                    (dum

In [9]:
model.set_adapter("dummy-lora-two")

In [10]:
instruction = "Tell me about alpacas."

print(evaluate(instruction))

</s>
