In [4]:
import os
from llama_index.core import SimpleDirectoryReader
from llama_index.core.prompts import PromptTemplate

from llama_index.core.llms import ChatMessage, MessageRole
from llama_index.core import ChatPromptTemplate
from llama_index.core.response.notebook_utils import display_response

import torch
from llama_index.llms.huggingface import HuggingFaceLLM # llm
from llama_index.embeddings.huggingface import HuggingFaceEmbedding # embedding
from llama_index.core import Settings # pass llm and embedding

from llama_index.core import VectorStoreIndex # GPTVectorStoreIndex has been renamed to VectorStoreIndex, but it is only a cosmetic change. Please see the following links for more details on usage.




In [5]:
from dotenv import load_dotenv
load_dotenv()

True

In [6]:
hf_token = os.getenv("hf_token")

In [7]:
from huggingface_hub import login
login(hf_token)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/s448780/.cache/huggingface/token
Login successful


In [8]:
!mkdir test/

mkdir: cannot create directory ‘test/’: File exists


In [10]:
documents = SimpleDirectoryReader("./test").load_data()

In [12]:
books = set()
for book in documents:
  books.add(book.metadata["file_name"])

books

{'ucl_2023.pdf', 'ucl_2024.pdf'}

In [13]:
model_id = "Writer/camel-5b-hf"

## Plain prompt  template

In [49]:
# from open ai docs
query_wrapper_prompt = PromptTemplate(
    "Always answer the question, even if the context isn't helpful."
    "Write a response that appropriately completes the request.\n"
    "Always use the most recent information.\n\n"
    "### Instruction:\n Using the most recent data answer- {query_str}\n\n### Response:"
)

In [50]:
llm = HuggingFaceLLM(
    context_window=2048,
    max_new_tokens=256,
    generate_kwargs={"temperature": 0, "do_sample": False},
    query_wrapper_prompt=query_wrapper_prompt,
    tokenizer_name=model_id,
    model_name=model_id,
    device_map="auto",
    tokenizer_kwargs={"max_length": 2048},
    model_kwargs={"torch_dtype": torch.float16, "load_in_8bit" : True}
)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [51]:
Settings.llm = llm
Settings.embed_model = HuggingFaceEmbedding(model_name="hkunlp/instructor-base")

In [52]:
vector_index = VectorStoreIndex.from_documents(documents)
query_engine = vector_index.as_query_engine()

In [53]:
response = query_engine.query("How many UCL titles does Real Madrid have?")
display_response(response)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


**`Final Response:`** Real Madrid has 14 UCL titles.

In [43]:
response = query_engine.query("How many UCL titles does Real Madrid have? ")
display_response(response)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


**`Final Response:`** Real Madrid won 15 UCL titles.