In [9]:
import os
import time
from tqdm import tqdm
from langchain.docstore.document import Document
from langchain.document_loaders import UnstructuredXMLLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.llms import OpenAI
from langchain.chains import RetrievalQA

# Set your OpenAI API key (ensure you secure your key appropriately)
os.environ["OPENAI_API_KEY"] = "<key>"

# ------------------------------
# Step 1: Loading the XML Document with TQDM Progress Bar
# ------------------------------
print("Step 1: Loading the XML Document")
xml_file_path = "try.xml" #"WR_2018_20220207195801_CORE_0030 (1).xml"  # Update with your XML file path

start_total = time.time()

print("    Reading the XML file from disk with progress bar...")
file_size = os.path.getsize(xml_file_path)
document_text = ""
chunk_size = 1024  # bytes

start_load = time.time()
with open(xml_file_path, "r", encoding="utf8") as f:
    with tqdm(total=file_size, unit="B", unit_scale=True, desc="Loading XML") as pbar:
        while True:
            chunk = f.read(chunk_size)
            if not chunk:
                break
            document_text += chunk
            pbar.update(len(chunk))
end_load = time.time()

print(f"    Document reading took {end_load - start_load:.2f} seconds.")

documents = [Document(page_content=document_text, metadata={"source": xml_file_path})]

end_total = time.time()
print(f"Step 1 Complete: Loaded {len(documents)} document(s) from the XML file in {end_total - start_total:.2f} seconds.\n")


print("Step 2: Splitting the Document into Smaller Chunks")
# Using a chunk size of 1000 characters with 200 characters overlap to maintain context
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
doc_chunks = text_splitter.split_documents(documents)
print(f"Step 2 Complete: Split the document into {len(doc_chunks)} chunk(s).\n")

print("Step 3: Generating Embeddings for Each Document Chunk")
# Generate embeddings using OpenAI's embedding model
embeddings = OpenAIEmbeddings()
print("Step 3 Complete: Embeddings generated for document chunks.\n")


print("Step 4: Building the Vector Store for Similarity-Based Retrieval")
# Create an empty Chroma collection by passing the embeddings instance directly.
vector_store = Chroma(collection_name="xml_docs", embedding_function=embeddings)

print("    Indexing document chunks:")
# Iterate over doc_chunks with a tqdm progress bar updating for each chunk.
for doc in tqdm(doc_chunks, desc="Indexing doc chunks", unit="chunk"):
    vector_store.add_documents([doc])

retriever = vector_store.as_retriever(search_kwargs={"k": 5})
print("Step 4 Complete: Vector store built and retriever created.\n")



print("Step 5: Setting Up the Retrieval-Augmented Generation QA Chain")
# Initialize the LLM with a deterministic response
llm = OpenAI(temperature=0)
qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=retriever, return_source_documents=True)
print("Step 5 Complete: QA chain is ready.\n")

print("Step 6: Asking a Question and Getting an Answer")

query = "How CCR5Delta32 may limit HIV spread?"  #ask question
result = qa_chain({"query": query})
print("Step 6 Complete: Query processed.\n")

print("Answer:", result["result"])
print("\nRelevant Source Documents:")
for doc in result["source_documents"]:
    print(doc.metadata)


Step 1: Loading the XML Document
    Reading the XML file from disk with progress bar...


Loading XML: 100%|██████████| 110k/110k [00:00<00:00, 2.79MB/s]

    Document reading took 0.05 seconds.
Step 1 Complete: Loaded 1 document(s) from the XML file in 0.05 seconds.

Step 2: Splitting the Document into Smaller Chunks
Step 2 Complete: Split the document into 149 chunk(s).

Step 3: Generating Embeddings for Each Document Chunk





Step 3 Complete: Embeddings generated for document chunks.

Step 4: Building the Vector Store for Similarity-Based Retrieval
    Indexing document chunks:


Indexing doc chunks: 100%|██████████| 149/149 [01:37<00:00,  1.53chunk/s]
  llm = OpenAI(temperature=0)
  result = qa_chain({"query": query})


Step 4 Complete: Vector store built and retriever created.

Step 5: Setting Up the Retrieval-Augmented Generation QA Chain
Step 5 Complete: QA chain is ready.

Step 6: Asking a Question and Getting an Answer
Step 6 Complete: Query processed.

Answer:  CCR5Delta32 may limit HIV spread by decreasing infection rates and reducing viral loads in infected individuals.

Relevant Source Documents:
{'source': 'try.xml'}
{'source': 'try.xml'}
{'source': 'try.xml'}
{'source': 'try.xml'}
{'source': 'try.xml'}
