In [4]:
from llama_index.prompts.prompts import SimpleInputPrompt
from llama_index.llms import HuggingFaceLLM
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer
from huggingface_hub import login
import torch
import os

# Work from the project root so that relative paths used later in the notebook
# (e.g. 'huggingface_credentials.txt') resolve against it.
# Raw string: in a normal literal "\P" and "\R" are invalid escape sequences
# and trigger a DeprecationWarning/SyntaxWarning on current Python versions.
os.chdir(r"D:\Projects\RAG-webapp")

In [5]:
def load_model(model_name="TheBloke/Llama-2-7b-Chat-GPTQ", device='gpu'):
    """Load a causal language model together with its tokenizer and a streamer.

    The Hugging Face access token is read from the first line of
    ``huggingface_credentials.txt`` in the current working directory and used
    to log in before anything is loaded.

    Args:
        model_name: Hugging Face Hub repository id of the model to load.
        device: ``'gpu'`` selects ``cuda:0`` when CUDA is available and falls
            back to the CPU otherwise; ``'cpu'`` forces the CPU. Any other
            value is passed through unchanged as the target device.

    Returns:
        dict: ``{"model", "tokenizer", "streamer", "device"}`` where
        ``device`` is the resolved target device.

    Raises:
        FileNotFoundError: if ``huggingface_credentials.txt`` does not exist
            in the current working directory.
    """
    # setting device
    if device == 'gpu':
        gpu = 0
        if torch.cuda.is_available():
            device = torch.device(f"cuda:{gpu}")
            torch.cuda.set_device(device)
        else:
            # No CUDA runtime: fall back to the CPU and make no torch.cuda.*
            # calls, since those raise on a machine without a GPU.
            device = torch.device("cpu")
    elif device == 'cpu':
        # torch.cuda.set_device() only accepts CUDA devices, so it must not
        # be called for the CPU.
        device = torch.device('cpu')

    with open('huggingface_credentials.txt', 'r') as file:
        hf_token = file.readline().strip()

    login(token=hf_token)

    # Create tokenizer (tokenizers hold no weights, so no device placement)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Define model: place the weights on the resolved device instead of a
    # hard-coded 'cuda', so the CPU fallback actually works. With device_map
    # set, the weights are loaded onto that device directly and no extra
    # .to(device) call is needed.
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map=str(device),
    )

    streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    model_obj = {
        "model": model,
        "tokenizer": tokenizer,
        "streamer": streamer,
        "device": device,
    }

    return model_obj


In [6]:
# Checkpoint to load; load_model() hands back a dict with the model and the
# helper objects, which are unpacked into module-level names for later cells.
model_name = "TheBloke/Mistral-7B-Instruct-v0.2-AWQ"

model_obj = load_model(model_name)
model, tokenizer, device, streamer = (
    model_obj[key] for key in ("model", "tokenizer", "device", "streamer")
)



Welcome to bitsandbytes. For bug reports, please submit your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
binary_path: D:\Projects\venv2\lib\site-packages\bitsandbytes\cuda_setup\libbitsandbytes_cuda116.dll
CUDA SETUP: Loading binary D:\Projects\venv2\lib\site-packages\bitsandbytes\cuda_setup\libbitsandbytes_cuda116.dll...




In [7]:
# System prompt prepended to every query. The text is sent to the model
# verbatim, so stray characters and typos end up in the model input.
# NOTE(review): the <<SYS>> ... <</SYS>> wrapper is the Llama-2 chat
# convention; confirm it matches the chat template of the model being loaded.
system_prompt = """<s>[INST] <<SYS>>
        You are a helpful, respectful and honest assistant. Always answer as
        helpfully as possible, while being safe.
        If a question does not make any sense, or is not factually coherent, explain
        why instead of answering something not correct. If you don't know the answer
        to a question, please don't share false information.
        Try to be exact in information and numbers you tell.
        Your goal is to provide answers completely based on the information provided
        and if you use your own knowledge please inform the user.
        and it is important to respond as briefly as possible.<</SYS>>
        """

# Wraps the user query and closes the instruction block opened in system_prompt.
query_wrapper_prompt = SimpleInputPrompt("{query_str} [/INST]")

# LlamaIndex LLM wrapper around the already-loaded model and tokenizer.
llm = HuggingFaceLLM(
    context_window=4096,
    max_new_tokens=512,
    system_prompt=system_prompt,
    query_wrapper_prompt=query_wrapper_prompt,
    model=model,
    tokenizer=tokenizer
)

In [None]:
def stream_query_response(query_engine, msg, end_marker="%%%END%%%"):
    """Query ``query_engine`` with ``msg`` and stream the answer chunk by chunk.

    The original cell used ``yield``/``return`` and ``self`` at module level,
    which cannot run in a notebook cell; the logic is wrapped in a generator
    function so it can be defined here and called with any query engine.

    Args:
        query_engine: object exposing ``query(msg)``; streaming must be
            enabled on it for the response to carry ``response_gen``.
        msg: the query passed to ``query_engine.query``.
        end_marker: sentinel yielded when the response has no
            ``response_gen`` attribute (streaming not enabled/supported).

    Yields:
        Each chunk produced by ``response.response_gen`` (also echoed to
        stdout as it arrives), or only ``end_marker`` when the response
        cannot be streamed.
    """
    # Get the response object (with streaming enabled in your query engine)
    response = query_engine.query(msg)

    # The response object has a .response_gen generator for streaming tokens/chunks
    try:
        response_gen = response.response_gen
    except AttributeError:
        # Fallback if streaming is not enabled or not supported
        yield end_marker
        return

    # Stream the response token by token/chunk by chunk; the for loop stops
    # on StopIteration, replacing the manual while/next/except construct.
    for chunk in response_gen:
        print(chunk, end="", flush=True)
        # Hand the chunk to the consumer too (e.g. to forward to a websocket).
        yield chunk