In [1]:
import torch
from transformers import BitsAndBytesConfig
from langchain.embeddings.huggingface import HuggingFaceInstructEmbeddings

from llama_index.llms import HuggingFaceLLM
from llama_index import ServiceContext, SimpleDirectoryReader, \
VectorStoreIndex, get_response_synthesizer, load_index_from_storage, set_global_service_context
from llama_index.retrievers import VectorIndexRetriever
from llama_index.query_engine import RetrieverQueryEngine
from llama_index.prompts import PromptTemplate
from llama_index.vector_stores import ChromaVectorStore
from llama_index.storage.storage_context import StorageContext
from llama_index.node_parser import SentenceSplitter
from llama_index.postprocessor import SimilarityPostprocessor, KeywordNodePostprocessor
from llama_index.response.pprint_utils import pprint_response

import chromadb
from dotenv import load_dotenv
import os

# Load variables from a local .env file into the process environment, then
# read the Hugging Face access token (None when the variable is not set).
load_dotenv()
HF_TOKEN = os.environ.get("HF_TOKEN")

In [2]:
# Hugging Face repo id, used for both the model weights and the tokenizer.
MODEL_NAME = "meta-llama/Llama-2-7b-chat-hf"

# 4-bit NF4 quantization with double quantization; compute dtype is bfloat16.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Instruction wrapper placed around every query string sent to the model.
query_wrapper_prompt = PromptTemplate("<s> [INST] {query_str} [/INST]")

# The access token is forwarded to both the model and the tokenizer loaders.
llm = HuggingFaceLLM(
    model_name=MODEL_NAME,
    tokenizer_name=MODEL_NAME,
    query_wrapper_prompt=query_wrapper_prompt,
    context_window=3900,
    model_kwargs=dict(token=HF_TOKEN, quantization_config=quantization_config),
    tokenizer_kwargs=dict(token=HF_TOKEN),
    device_map="cuda",
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
# Instructor embedding model, placed on the GPU.
embed_model = HuggingFaceInstructEmbeddings(
    model_name="hkunlp/instructor-large",
    model_kwargs={
        "device": "cuda"
    }
)

# The embedding model reports `max_seq_length 512` when it loads; text beyond that
# limit is cut off before embedding. Chunks larger than the limit would therefore be
# indexed by their first part only while retrieval hands the whole chunk to the LLM.
# Keep the chunk size at the embedder's limit so each vector represents its chunk.
# (The splitter counts tokens with its own tokenizer, so the match is approximate.)
# NOTE: an index persisted with a different chunk size must be rebuilt to pick this up.
EMBED_MAX_TOKENS = 512

text_splitter = SentenceSplitter(chunk_size=EMBED_MAX_TOKENS, chunk_overlap=30)
service_context = ServiceContext.from_defaults(
    llm=llm,
    embed_model=embed_model,
    text_splitter=text_splitter,
)

# Make this the default service context for every index / query engine created later.
set_global_service_context(service_context)

load INSTRUCTOR_Transformer


  return self.fget.__get__(instance, owner)()


max_seq_length  512


In [5]:
# Load the vector index if it was persisted before, otherwise build and persist it.
#
# Embeddings are stored in the Chroma collection under ./chroma_db; PERSIST_DIR only
# holds the docstore / index metadata written by `storage_context.persist`. The Chroma
# vector store therefore has to be attached in BOTH branches: reloading from
# PERSIST_DIR alone yields an index that is not connected to the stored embeddings.
PERSIST_DIR = 'vector_store_data'

# Creating persistent client and collection (reused on later runs)
db = chromadb.PersistentClient(path="./chroma_db")
chroma_collection = db.get_or_create_collection("vectorDB")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

if os.path.exists(PERSIST_DIR):
    # Reload index metadata from disk and wire it to the existing Chroma collection
    storage_context = StorageContext.from_defaults(
        vector_store=vector_store,
        persist_dir=PERSIST_DIR
    )
    vector_index = load_index_from_storage(storage_context=storage_context)
else:
    docs = SimpleDirectoryReader("Data/").load_data()   # read the data from the folder

    # assign chroma store as vector_store to the context
    storage_context = StorageContext.from_defaults(vector_store=vector_store)
    # creating index (embeds the documents and writes the vectors into Chroma)
    vector_index = VectorStoreIndex.from_documents(
        docs,
        storage_context=storage_context
    )
    # storing index metadata to disk
    vector_index.storage_context.persist(persist_dir=PERSIST_DIR)

In [6]:
# Retriever: similarity search over the vector index, keeping the top 3 matches.
retriever = VectorIndexRetriever(index=vector_index, similarity_top_k=3)

# Node postprocessors, passed to the engine in this order:
# a similarity cutoff of 0.79, then a keyword filter excluding "environmental".
s_processor = SimilarityPostprocessor(similarity_cutoff=0.79)
k_processor = KeywordNodePostprocessor(exclude_keywords=["environmental"])
node_postprocessors = [s_processor, k_processor]

# Response synthesizer built on the shared service context.
response_synthesizer = get_response_synthesizer(service_context=service_context)

# Assemble retriever, postprocessors and synthesizer into one query engine.
query_engine = RetrieverQueryEngine(
    response_synthesizer=response_synthesizer,
    node_postprocessors=node_postprocessors,
    retriever=retriever,
)

In [None]:
# compact
# query_engine = vector_index.as_query_engine(response_mode="compact")
# response = query_engine.query("Is this book worth reading?")

# pprint_response(response, show_source=True)

In [9]:
# refine
# Interactive question/answer loop using the index's default query engine in
# "refine" response mode.
# NOTE: this rebinds `query_engine`, replacing any engine an earlier cell assigned
# to that name.
query_engine = vector_index.as_query_engine(response_mode="refine")
while True:
    try:
        # strip so that inputs like " q " or a trailing newline still match below
        user_query = input("User (press q to exit) > ").strip()
    except (EOFError, KeyboardInterrupt):
        # stdin closed or the prompt was interrupted: end the session cleanly
        print()
        break
    print()
    if user_query.casefold() == 'q':
        break
    if not user_query:
        # nothing was typed; do not spend a retrieval + LLM call on an empty query
        continue
    response = query_engine.query(user_query)
    pprint_response(response, wrap_width=120)
    print('-' * 100)
    print()

User (press q to exit) > What is python

Final Response: Based on the provided context, it appears that the Python code is creating a service instance for a
"geoPysnmp" application. The code imports the necessary Python modules and defines a service decorator, which is
required for creating a service instance. The code then sets up a variable "geoVars" and initializes it with key/value
pairs from an XML template.  The code then reads the "local time zone" from the service YANG model and reads data from
the GeoCatalog to obtain values appropriate to the "localTZ". Finally, the code assigns values read from the GeoCatalog
to variables that will be passed to the XML template when it is applied.  In summary, the Python code is creating a
service instance for a "geoPysnmp" application, setting up variables for the service and reading data from the
GeoCatalog to obtain values appropriate to the "localTZ".  To further refine the answer, the original query asked what
the Python code is doing, 

In [None]:
# tree_summarize
# query_engine = vector_index.as_query_engine(response_mode="tree_summarize")
# response = query_engine.query("Is this book worth reading?")

# pprint_response(response, show_source=True)

In [None]:
# Chat engine
# chat_engine = vector_index.as_chat_engine()
# while True:
#     user_query = input("User (press q to exit) > ")
#     print()
#     if user_query.casefold() == 'q':
#         break
    
#     response = chat_engine.chat(user_query)
#     pprint_response(response, wrap_width=120)
#     print("-" * 100)
#     print()

# chat_engine.reset()