In [1]:
import nest_asyncio

# Patch asyncio so event loops can be nested. Presumably needed because the
# notebook kernel already runs a loop and later cells call libraries that
# start their own — TODO confirm which call requires it.
nest_asyncio.apply()

In [5]:
import os
from dotenv import load_dotenv
# Read key/value pairs from a local .env file into the process environment.
# override=True lets values from .env replace variables that are already set.
load_dotenv(override=True)

True

In [6]:
# Look up the service credentials from the environment; each is None when unset.
# NOTE(review): neither constant is referenced again in the visible cells —
# the client libraries presumably read the environment themselves; confirm.
LLAMA_CLOUD_API_KEY = os.environ.get("LLAMA_CLOUD_API_KEY")
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

In [36]:
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import Settings

# Chat model, registered on the global Settings object so that LlamaIndex
# components created later use it by default.
llm = OpenAI(model="gpt-4o")
Settings.llm = llm

# Embedding model, registered the same way.
embed_model = OpenAIEmbedding(model="text-embedding-3-large")
Settings.embed_model = embed_model

In [32]:
from llama_parse import LlamaParse

# Natural-language instruction handed to LlamaParse alongside the PDF.
# NOTE(review): the prompt ends with a colon but never lists the fields to
# extract, and the "Manga" suffix in the name looks like a leftover from
# another notebook — confirm the intended wording before relying on it.
parsingInstructionManga = (
    "You are financial report expert. "
    "You are given a financial report of a company. "
    "You need to extract the following information from the report:"
)

In [33]:
# Upload the annual-report PDF to LlamaParse and collect the parsed result,
# requested as markdown, in `md_json_objs`.
print("Parsing PDF file...")
parser_gpt4o = LlamaParse(
    result_type="markdown",
    gpt4o_mode=True,
    parsing_instruction=parsingInstructionManga,
    show_progress=True,
)
# Forward slash instead of "docs\goa-...": "\g" is an invalid escape sequence
# in a normal string literal, and a backslash separator only resolves on
# Windows. The forward-slash form works on every platform.
md_json_objs = parser_gpt4o.load_data("docs/goa-annual-report-2023-2024.pdf")

Parsing PDF file...
Started parsing the file under job_id d7576f58-bf14-4dc8-8652-021cf6219b6f
........................

In [None]:
# Split the parsed markdown into nodes with MarkdownElementNodeParser, then
# separate the result into plain base nodes and the accompanying objects.
from llama_index.core.node_parser import MarkdownElementNodeParser

# A smaller model is handed to the parser than the one set as global default.
element_llm = OpenAI(model="gpt-4o-mini")
node_parser = MarkdownElementNodeParser(llm=element_llm, num_workers=8)

nodes = node_parser.get_nodes_from_documents(md_json_objs)
base_nodes, objects = node_parser.get_nodes_and_objects(nodes)

In [40]:
import numpy as np

# Cache the parsed nodes on disk so the expensive parsing cells can be skipped
# on later runs.

# np.save does not create missing parent directories, so make sure the cache
# folder exists before writing to it.
os.makedirs("index", exist_ok=True)

# Wrap the lists in object arrays. dtype=object is explicit so that an empty
# list is not stored as a float64 array.
base_nodes_ar = np.array(base_nodes, dtype=object)
objects_ar = np.array(objects, dtype=object)

# Object arrays are written in .npy format using pickle.
np.save("index/base_nodes.npy", base_nodes_ar)
np.save("index/objects.npy", objects_ar)

In [42]:
# Restore the cached node lists from disk.
# WARNING: allow_pickle=True unpickles the file contents, which can execute
# arbitrary code — only load .npy files produced by this notebook.
cached_paths = ('index/base_nodes.npy', 'index/objects.npy')
base_nodes, objects = [
    np.load(npy_path, allow_pickle=True).tolist() for npy_path in cached_paths
]

In [44]:
from llama_index.core import VectorStoreIndex

# Build one vector index over the base nodes plus the objects, then expose it
# as a query engine that retrieves the 5 most similar entries per question.
all_nodes = base_nodes + objects
recursive_index = VectorStoreIndex(nodes=all_nodes)
recursive_query_engine = recursive_index.as_query_engine(similarity_top_k=5)

In [46]:
# Ask a single question against the index and print only the answer text.
query = "What is the highest financial revenue in 2023"
response = recursive_query_engine.query(query)
answer_text = response.response
print(answer_text)

The highest financial revenue in 2023 is from non-renewable resource revenue, amounting to 25,242 million dollars.
