In [4]:
from django.conf import settings
import os
from dotenv import load_dotenv
from django.core.files.storage import default_storage
from llama_index.core import (
    VectorStoreIndex,
    SimpleDirectoryReader,
    StorageContext,
)
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import StorageContext
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.tools import FunctionTool
from llama_index.core.vector_stores import MetadataFilters, FilterCondition
from typing import List, Optional

from llama_index.embeddings.openai import OpenAIEmbedding
import chromadb

# Load variables from a local .env file into the process environment.
# NOTE(review): presumably this supplies the OpenAI API key used by the
# embedding model and the agent below — confirm.
load_dotenv()

True

In [5]:
# Configure Django with its default settings so `default_storage` can be used
# outside a Django project. `settings.configure()` raises RuntimeError when the
# settings are already configured, so guard it to keep this cell re-runnable.
# NOTE(review): with default settings MEDIA_ROOT is empty, so
# `default_storage.location` presumably resolves to the current working
# directory — confirm that is where "uploads" and "index" are expected.
if not settings.configured:
    settings.configure()

In [9]:
def create_indexing():
    """Build the Chroma-backed vector index from the uploaded documents.

    Loads the documents found in ``<default_storage.location>/uploads``,
    splits them into chunks with ``SentenceSplitter`` (chunk size 1024,
    overlap 100), embeds the chunks with ``text-embedding-3-small`` and stores
    them in the ``multidocument-agent`` collection of a persistent Chroma
    database located at ``<default_storage.location>/index``.

    Returns:
        None. Progress is reported through ``print``.
    """
    # Directory where the uploaded PDF files are located
    pdf_directory = os.path.join(default_storage.location, "uploads")

    # Directory that holds the persistent Chroma database
    index_directory = os.path.join(default_storage.location, "index")

    # Read the files from the upload directory
    documents = SimpleDirectoryReader(pdf_directory).load_data()

    print("len of doc: ", len(documents))

    # Chunk the documents explicitly so the chunk size/overlap chosen here is
    # what actually ends up in the index.
    splitter = SentenceSplitter(chunk_size=1024, chunk_overlap=100)
    nodes = splitter.get_nodes_from_documents(documents)
    print(f"Length of nodes : {len(nodes)}")

    db = chromadb.PersistentClient(path=index_directory)
    # NOTE(review): get_or_create_collection reuses an existing collection, so
    # running this function twice presumably adds the same chunks again —
    # confirm whether the collection should be cleared first.
    chroma_collection = db.get_or_create_collection("multidocument-agent")
    vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
    storage_context = StorageContext.from_defaults(vector_store=vector_store)

    # define embedding function
    embed_model = OpenAIEmbedding(
        model_name="text-embedding-3-small", embed_batch_size=10
    )

    # Index the nodes produced by the splitter above. The previous call to
    # VectorStoreIndex.from_documents() re-chunked the raw documents with the
    # library defaults, which silently ignored `splitter` and `nodes`.
    vector_index = VectorStoreIndex(
        nodes, storage_context=storage_context, embed_model=embed_model
    )

    # Save the index to the specified directory.
    # NOTE(review): the PersistentClient already writes to `index_directory`,
    # so this call is likely redundant — confirm before removing.
    vector_index.storage_context.vector_store.persist(persist_path=index_directory)

    # NOTE(review): `documents` may contain one entry per PDF page rather than
    # per file, in which case this count is pages, not files — confirm.
    print(f"Index created and saved for {len(documents)} PDF files.")


def vector_query(query: str, page_numbers: Optional[List[str]] = None):
    """Answer a question using vector search over the indexed documents.

    Args:
        query: Natural-language question or search text.
        page_numbers: Optional page labels (for example ``["1", "2"]``) to
            restrict the search to. Leave it out, or pass an empty list, to
            search across all pages.

    Returns:
        The query engine's response for ``query``, built from the two most
        similar chunks.
    """
    # Directory where the persistent Chroma database is stored
    index_directory = os.path.join(default_storage.location, "index")

    # define embedding function (same model as used when indexing)
    embed_model = OpenAIEmbedding(
        model_name="text-embedding-3-small", embed_batch_size=10
    )

    # load from disk
    db2 = chromadb.PersistentClient(path=index_directory)
    chroma_collection = db2.get_or_create_collection("multidocument-agent")
    vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
    vector_index = VectorStoreIndex.from_vector_store(
        vector_store,
        embed_model=embed_model,
    )

    # Only build a metadata filter when pages were actually requested; passing
    # an empty filter set to the vector store is unnecessary. Page labels are
    # compared as strings, so coerce any non-string values.
    filters = None
    if page_numbers:
        filters = MetadataFilters.from_dicts(
            [{"key": "page_label", "value": str(p)} for p in page_numbers],
            condition=FilterCondition.OR,
        )

    query_engine = vector_index.as_query_engine(
        similarity_top_k=2,
        filters=filters,
    )

    return query_engine.query(query)

In [11]:
# Index creation is left commented out here: vector_query() reads the Chroma
# database under the "index" directory, which create_indexing() populates.
# Uncomment the two lines below to (re)build the index before querying.
# print("\n\n Now generating indexes...")
# create_indexing()

# Smoke-test the query function directly, without the agent.
# print("\n\n Load query engine")
vector_query(query="Give me name of education details")

Response(response='M.S in Electrical and Computer Engineering from Purdue University\nB.S in Electrical and Computer Engineering', source_nodes=[NodeWithScore(node=TextNode(id_='9cddbe92-8d47-43fe-a44f-83fddfe78ebd', embedding=None, metadata={'page_label': '1', 'file_name': '10265057.pdf', 'file_path': 'c:\\Users\\ch255039\\Downloads\\work\\projects\\Resume_Parser\\resume-analyzer\\chatbot\\uploads\\10265057.pdf', 'file_type': 'application/pdf', 'file_size': 20247, 'creation_date': '2024-07-23', 'last_modified_date': '2024-07-09'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='e7f85729-896b-459b-8940-2d7032a21f6a', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'page_label': '1', 'file_name': '10265057.p

In [12]:
# Wrap vector_query as a tool the agent can call under the name "vector_tool".
# (Plain string literal: the previous f-string had no placeholders.)
vector_query_tool = FunctionTool.from_defaults(name="vector_tool", fn=vector_query)

In [13]:
from llama_index.core import VectorStoreIndex
from llama_index.core.objects import ObjectIndex
from llama_index.agent.openai import OpenAIAgent

# Build an OpenAI agent that has the vector query tool at its disposal.
# verbose=True makes the agent print each function call and its output.

initial_tools = [vector_query_tool]
agent = OpenAIAgent.from_tools(initial_tools, verbose=True)
# obj_index = ObjectIndex.from_objects(initial_tools, index_cls=VectorStoreIndex)

In [18]:
# Ask the agent a question; it decides whether to call the vector tool.
response = agent.chat("Can you give me resume with experience as SYSTEMS ENGINEER?")

Added user message to memory: Can you give me resume with experience as SYSTEMS ENGINEER?
=== Calling Function ===
Calling function: vector_tool with args: {"query":"SYSTEMS ENGINEER"}
Got output: The individual has experience as an RF Systems Engineer, working on satellite communication architecture designs, defining RF hardware unit procedures, leading suppliers, monitoring project field performance, conducting audits and reviews, optimizing RF system performance, performing validations and quality measurements, tracking closure on components, collaborating with cross-functional teams, and generating RF performance analyses.



In [20]:
# Show only the text of the agent's final answer.
print(response.response)

The individual has experience as an RF Systems Engineer, working on satellite communication architecture designs, defining RF hardware unit procedures, leading suppliers, monitoring project field performance, conducting audits and reviews, optimizing RF system performance, performing validations and quality measurements, tracking closure on components, collaborating with cross-functional teams, and generating RF performance analyses.


In [38]:
# Metadata attached to the raw output of the first tool call.
# NOTE(review): judging by the loop that consumes it, this is a mapping whose
# values are dicts containing 'file_name' and 'page_label' — confirm.
metadata_dict = response.sources[0].raw_output.metadata

In [43]:
# Collect one "file name - page" label per source entry, printing each as we go.
metadata = []
for source_meta in metadata_dict.values():
    source_label = f"File name: {source_meta['file_name']} - {source_meta['page_label']}"
    print(source_label)
    metadata.append(source_label)

File name: 10265057.pdf - 1
File name: 10553553.pdf - 1


In [44]:
# Display the collected source labels.
metadata

['File name: 10265057.pdf - 1', 'File name: 10553553.pdf - 1']