In [1]:
import os
from os.path import exists, join, isdir
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, GenerationConfig
from peft import PeftModel
from peft.tuners.lora import LoraLayer

In [2]:
def get_last_checkpoint(checkpoint_dir):
    """Find the highest-numbered ``checkpoint-<step>`` directory of a run.

    Args:
        checkpoint_dir: Path of the training output directory to inspect.

    Returns:
        A ``(path, is_completed)`` tuple:

        * ``(None, False)`` if ``checkpoint_dir`` is not a directory (first
          training) or contains no numbered checkpoint yet;
        * ``(None, True)`` if a ``completed`` marker exists inside it;
        * ``(<checkpoint_dir>/checkpoint-<max step>, False)`` otherwise.
    """
    prefix = 'checkpoint-'

    if not isdir(checkpoint_dir):
        return None, False  # first training

    if exists(join(checkpoint_dir, 'completed')):
        return None, True  # already finished

    max_step = 0
    for filename in os.listdir(checkpoint_dir):
        if not (filename.startswith(prefix) and isdir(join(checkpoint_dir, filename))):
            continue
        try:
            step = int(filename[len(prefix):])
        except ValueError:
            # e.g. "checkpoint-tmp": not a numbered checkpoint, so ignore it
            # instead of aborting the whole lookup.
            continue
        max_step = max(max_step, step)

    if max_step == 0:
        return None, False  # training started, but no checkpoint

    last_checkpoint = join(checkpoint_dir, f'checkpoint-{max_step}')
    print(f"Found a previous checkpoint at: {last_checkpoint}")
    return last_checkpoint, False  # checkpoint found!

def generate(model, tokenizer, prompt, user_question, max_new_tokens=512, top_p=0.9, temperature=0.7):
    """Sample a completion for ``user_question``, print it and return it.

    Args:
        model: Object exposing ``generate(**inputs, generation_config=...)``.
        tokenizer: Callable tokenizer that also provides ``decode``.
        prompt: Template string containing a ``{user_question}`` placeholder.
        user_question: Text substituted into ``prompt``.
        max_new_tokens: Forwarded to ``GenerationConfig``.
        top_p: Forwarded to ``GenerationConfig``.
        temperature: Forwarded to ``GenerationConfig``.

    Returns:
        The decoded first output sequence (special tokens skipped).
    """
    # Put the inputs on whatever device the model reports instead of always
    # assuming 'cuda'; fall back to the previous default if the model does not
    # expose a ``device`` attribute.
    target_device = getattr(model, 'device', 'cuda')
    full_prompt = prompt.format(user_question=user_question)
    inputs = tokenizer(full_prompt, return_tensors="pt").to(target_device)

    sampling_config = GenerationConfig(
        do_sample=True,
        max_new_tokens=max_new_tokens,
        top_p=top_p,
        temperature=temperature,
    )
    outputs = model.generate(**inputs, generation_config=sampling_config)

    text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(text)
    return text


In [6]:
# `!export ...` runs in a throwaway subshell, so the variable never reached the
# kernel process. Set it on the kernel's own environment instead.
# NOTE(review): the CUDA error message says to *compile* with TORCH_USE_CUDA_DSA,
# so this is presumably a build-time option; CUDA_LAUNCH_BLOCKING=1 is the
# runtime switch that message suggests for debugging — confirm which is wanted.
os.environ["TORCH_USE_CUDA_DSA"] = "1"

In [3]:
# # TODO: Update variables
# max_new_tokens = 512
# top_p = 0.9
# temperature=0.7

# Base model
model_name_or_path = 'lmsys/vicuna-7b-v1.5'
# Adapter name on HF hub or local checkpoint path.
# adapter_path_replicate, _ = get_last_checkpoint('output/guanaco-7b-A40')
# NOTE(review): machine-specific absolute path; point this at your own adapter
# checkpoint (or a hub id) before running on another machine.
adapter_path = '/cbica/home/xjia/qlora/output/MentalGPT-7b-newCombined-1016/checkpoint-1400/adapter_model' #'timdettmers/guanaco-7b'

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
# Fixing some of the early LLaMA HF conversion issues.
tokenizer.bos_token_id = 1


def load_quantized_model(name_or_path):
    """Load ``name_or_path`` as a 4-bit NF4 causal LM on GPU 0 (bf16 compute).

    The 4-bit settings are given only through ``quantization_config``. The
    separate ``load_in_4bit=True`` keyword was redundant, and newer
    transformers releases reject passing both at once.
    """
    return AutoModelForCausalLM.from_pretrained(
        name_or_path,
        torch_dtype=torch.bfloat16,
        # device_map='cpu',
        device_map={"": 0},
        quantization_config=BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.bfloat16,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type='nf4',
        ),
    )


# Two independent copies are loaded so the plain base model and the
# adapter-augmented model can be compared side by side.
base_model = load_quantized_model(model_name_or_path)

# NOTE(review): device_map='cpu' is kept from the original call although the
# wrapped model lives on GPU 0 — confirm it is intended.
model = PeftModel.from_pretrained(load_quantized_model(model_name_or_path), adapter_path, device_map='cpu')

base_model.eval()
model.eval()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

RuntimeError: CUDA error: CUDA-capable device(s) is/are busy or unavailable
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
# Sample patient description that is substituted into the prompt template below.
user_question = '''Intimacy has always been a complex area for me, and I'm seeking counseling to explore my discomfort and fears related to emotional closeness. 
                   Whenever a relationship becomes more intimate, fear takes over, and I struggle to express my feelings or trust others completely. 
                   There is a particular incident from my past that has had a lasting impact on my ability to be intimate with others. 
                   I was in a long-term relationship where my partner consistently violated my boundaries, leaving me feeling emotionally unsafe and scared of letting anyone else get close.'''

# Prompt template; `{user_question}` is filled in by `str.format`.
# The first fragment was missing its closing quote, which made the whole cell a
# SyntaxError. It now ends with a space like the other fragments.
prompt = (
    "You are a helpful mental health counselling assistant, please answer the mental health questions based on the patient's description. "
    "The assistant gives helpful, comprehensive, and appropriate answers to the user's questions. "
    "### User: {user_question}"
    "### Assistant: "
)

# prompt = (
#     "A chat between a user with mental illness concern and a professional, helpful mental health counseling assitant. "
#     "The assistant gives helpful, comprehensive, and appropriate answers to the user's questions. "
#     "### User: {user_question}"
#     "### Assistant: "
# )

In [None]:
# Compare the plain base model with the adapter-augmented model on one question.
# `generate` takes (model, tokenizer, prompt, user_question); the original calls
# left out the tokenizer and prompt and raised TypeError.
print("="*40+"Base Vicuna"+"="*40)
base_response = generate(base_model, tokenizer, prompt, user_question)
print("\n")
print("="*40+"Vicuna qlora"+"="*40)
response = generate(model, tokenizer, prompt, user_question)
# The leftover `pdb.set_trace()` was removed: it halts Restart & Run All waiting
# for debugger input. Use `%debug` interactively if a post-mortem is needed.