# [[ Starting ]]
# Load Documents

In [50]:
from llama_index import VectorStoreIndex, SimpleDirectoryReader
from llama_index.text_splitter import SentenceSplitter
from llama_index.node_parser import SimpleNodeParser

# Sentence-aware splitter used to cut documents into chunks.
#   - separator / paragraph_separator: strings the splitter breaks text on.
#   - secondary_chunking_regex: matches a run of characters that are not one of
#     `,.;。`, optionally followed by a single one of those punctuation marks.
text_splitter = SentenceSplitter(
    chunk_size=1024,
    chunk_overlap=20,
    separator=" ",
    paragraph_separator="\n\n\n",
    secondary_chunking_regex="[^,.;。]+[,.;。]?",
)

# Node parser that delegates the actual splitting to `text_splitter`.
node_parser = SimpleNodeParser.from_defaults(text_splitter=text_splitter)

# Load everything found in the local ./pdf directory.
pdf_reader = SimpleDirectoryReader("./pdf")
documents = pdf_reader.load_data()

In [51]:
# Split the loaded documents into nodes using the node parser built in the
# previous cell.
# NOTE(review): no later cell visible in this notebook reads `nodes`; the index
# is built from `documents` instead, so this step looks redundant — confirm
# before removing.
nodes = node_parser.get_nodes_from_documents(documents)


# Embeddings

In [52]:
from llama_index.embeddings import HuggingFaceEmbedding

# Hugging Face embedding model that is later handed to the ServiceContext.
# Alternative kept for reference: 'sentence-transformers/all-MiniLM-L6-v2'
# NOTE(review): the model name contains "-en-" while the sample query at the
# end of this notebook is Vietnamese — confirm this model suits the corpus.
# Changing it later requires rebuilding any vectors already persisted.
embed_model_name = "BAAI/bge-base-en-v1.5"

embed_model = HuggingFaceEmbedding(
    model_name=embed_model_name,
    embed_batch_size=32,
)

In [53]:
from llama_index.llms import OpenAI

# LLM used to generate answers: low temperature, replies capped at 512 tokens.
# No API key is passed here — presumably the client reads it from the
# environment (TODO confirm OPENAI_API_KEY is set before running).
llm = OpenAI(
    model="gpt-3.5-turbo",
    temperature=0.1,
    max_tokens=512,
)

# ChromaDB

In [54]:
from llama_index.vector_stores import ChromaVectorStore
from llama_index.storage.storage_context import StorageContext
import chromadb

import os

# Location of the on-disk Chroma database.
# The original hard-coded "/dbb" is an absolute path at the filesystem root,
# which is not portable and is usually not writable by a normal user. It stays
# the default so existing data is still found, but it can now be overridden
# with the CHROMA_DB_PATH environment variable without editing the notebook.
CHROMA_DB_PATH = os.environ.get("CHROMA_DB_PATH", "/dbb")

# Name of the collection holding the PDF embeddings (single source of truth).
CHROMA_COLLECTION_NAME = "qa-pdf"

chroma_client = chromadb.PersistentClient(path=CHROMA_DB_PATH)
# Uncomment to drop the stored collection and start from scratch:
# chroma_client.delete_collection(name=CHROMA_COLLECTION_NAME)
chroma_collection = chroma_client.get_or_create_collection(name=CHROMA_COLLECTION_NAME)

# Adapter exposing the Chroma collection as a llama_index vector store.
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

In [55]:
# Wrap the Chroma-backed vector store in a StorageContext; it is passed to the
# index constructor further down so the index uses that store.
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# LLMs

In [56]:
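# [Hugging Face access token redacted] Never commit credentials to a notebook: revoke the exposed token and read a new one from an environment variable (e.g. HF_TOKEN).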

In [57]:
# Load model directly
# from transformers import AutoTokenizer, AutoModelForCausalLM
# 
# tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")
# model = AutoModelForCausalLM.from_pretrained("HuggingFaceH4/zephyr-7b-beta")

In [58]:
# from llama_index.prompts import PromptTemplate

# system_prompt = """<|SYSTEM|># StableLM Tuned (Alpha version)
# - StableLM is a helpful and harmless open-source AI language model developed by StabilityAI.
# - StableLM is excited to be able to help the user, but will refuse to do anything that could be considered harmful to the user.
# - StableLM is more than just an information source, StableLM is also able to write poetry, short stories.
# - StableLM will refuse to participate in anything that could harm a human.
# - StableLM Rewrite an answer that combines multiple data sources, semantically unchanged
# """

# template = (
#     "We have provided context information below. \n"
#     "---------------------\n"
#     "{context_str}"
#     "\n---------------------\n"
#     "Given this information, please answer the question: {query_str}\n"
# )
# qa_template = PromptTemplate(template)
# 
# system_prompt = """
# - StableLM Rewrite an answer that combines multiple data sources, semantically unchanged
# """
# # This will wrap the default prompts that are internal to llama-index
# query_wrapper_prompt = PromptTemplate("<|USER|>{query_str}<|ASSISTANT|>")

In [59]:
# from llama_index.prompts import PromptTemplate
# 
# text_qa_template_str = (
#     "Context information is"
#     " below.\n---------------------\n{context_str}\n---------------------\nUsing"
#     " both the context information and also using your own knowledge, answer"
#     " the question: {query_str}\nIf the context isn't helpful, you can also"
#     " answer the question on your own.\n"
# )
# text_qa_template = PromptTemplate(text_qa_template_str)

In [60]:
from llama_index import ServiceContext

# Bundle the three components configured in earlier cells — the answer LLM,
# the embedding model and the node parser — into a single ServiceContext that
# the index cell receives.
service_context = ServiceContext.from_defaults(
    llm=llm,
    embed_model=embed_model,
    node_parser=node_parser,
)

# Save to disk

In [61]:
# Build the index only when the persistent Chroma collection is still empty.
#
# The collection lives on disk (PersistentClient + get_or_create_collection),
# so unconditionally calling `from_documents` on every run embeds the same
# chunks again and appends them to the collection, which makes retrieval return
# duplicated chunks and repeats the expensive embedding step.
if chroma_collection.count() == 0:
    # First run: parse, embed and store every document.
    index = VectorStoreIndex.from_documents(
        documents=documents,
        service_context=service_context,
        storage_context=storage_context,
    )
else:
    # Vectors are already persisted: attach to them instead of re-embedding.
    # NOTE: if the source PDFs change, delete the collection and re-run the
    # notebook to rebuild the index from scratch.
    index = VectorStoreIndex.from_vector_store(
        vector_store,
        service_context=service_context,
    )

# Load from disk

In [62]:
# index = VectorStoreIndex.from_vector_store(
#     vector_store,
#     service_context=service_context,
# )

# LLM

# Query the index

In [63]:
# Create a query engine on top of the index; no options are overridden, so the
# library defaults apply.
query_engine = index.as_query_engine()

In [None]:
# Sample question (Vietnamese): "At what age did Hàn Mặc Tử start writing poetry?"
question = "hàn mặc tử bắt đầu làm thơ lúc mấy tuổi"

# Run the query against the index and print the generated answer.
response = query_engine.query(question)
print(response)