# Llama-Index Setup

In [60]:
import os
import pickle

from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core.query_engine import CitationQueryEngine
from llama_index.core import (
    SimpleDirectoryReader,
    VectorStoreIndex,
    StorageContext,
    load_index_from_storage,
)
from llama_index.core import Settings
from dotenv import load_dotenv
from llama_parse import LlamaParse
import nest_asyncio

embed_model = OpenAIEmbedding(model="text-embedding-3-small")
llm = OpenAI(model="gpt-3.5-turbo-0125")

Settings.llm = llm
Settings.embed_model = embed_model

# Read Data

In [3]:
load_dotenv()
nest_asyncio.apply()

In [23]:
LLAMA_CLOUD_API_KEY = os.getenv("LLAMA_CLOUD_API_KEY")

# Do not parse twice. Check if the file is already parsed
documents = None
if not os.path.exists("./parsed-objects/documents.pickle"):
    print("Parsing documents")
    parser = LlamaParse(
        api_key=LLAMA_CLOUD_API_KEY,
        result_type="text",
        verbose=True,
    )

    file_extractor = {".pdf": parser}
    documents = SimpleDirectoryReader(
        "./data", file_extractor=file_extractor
    ).load_data()
    with open("./parsed-objects/documents.pickle", "wb") as f:
        pickle.dump(documents, f)
else:
    print("Loading parsed documents from pickle")
    with open("./parsed-objects/documents.pickle", "rb") as f:
        documents = pickle.load(f)

assert documents is not None, "Documents are not loaded"

Parsing documents
Started parsing the file under job_id bc717ac5-542b-4e06-baf9-41e363548a0e
Started parsing the file under job_id 094d1094-fc6b-46e9-9047-df612c9793d1
Started parsing the file under job_id 16bee0f8-c980-438d-8941-7be1d5bc7965
Started parsing the file under job_id 16ed8372-31ca-49f7-b9b3-5633787ed16e
Started parsing the file under job_id ef8bf9c6-e69f-4029-8a3f-ae1f09d65d0a
Started parsing the file under job_id 5de8d21a-9f32-4003-8a35-ec7b06c5e6a3
Started parsing the file under job_id 8c51f3b3-9590-4424-a6c8-b04198220cf0
Started parsing the file under job_id 33730a5c-7505-46c5-8d8f-933d96f392f1
Started parsing the file under job_id 92fce70a-8975-4a61-bb81-9d46d326a4f0
Started parsing the file under job_id 09d3e2ba-54da-4b80-a208-8287cb3f956a
Started parsing the file under job_id 7acc4c6c-e41f-4da6-9f14-3d72a71eadf3


In [81]:
if not os.path.exists("./citation"):
    print("Building index")
    index = VectorStoreIndex.from_documents(documents)
    index.storage_context.persist(persist_dir="./citation")
else:
    print("Loading index from storage")
    index = load_index_from_storage(
        StorageContext.from_defaults(persist_dir="./citation")
    )

Loading index from storage


In [69]:
query_engine = CitationQueryEngine.from_args(
    index,
    similarity_top_k=10,
    citation_chunk_size=1024,
)
# retriever = index.as_retriever()

In [66]:
nodes = retriever.retrieve("What is Joel M. Shafferman billing rate?")

# Document Summary Index

In [79]:
from llama_index.core import SimpleDirectoryReader, get_response_synthesizer
from llama_index.core import DocumentSummaryIndex
from llama_index.llms.openai import OpenAI
from llama_index.core.node_parser import SentenceSplitter

# LLM (gpt-3.5-turbo)
chatgpt = OpenAI(temperature=0, model="gpt-3.5-turbo")
splitter = SentenceSplitter(chunk_size=1024)

In [80]:
response_synthesizer = get_response_synthesizer(
    response_mode="tree_summarize", use_async=True
)
doc_summary_index = DocumentSummaryIndex.from_documents(
    documents,
    llm=chatgpt,
    transformations=[splitter],
    response_synthesizer=response_synthesizer,
    show_progress=True,
)

NameError: name 'documents' is not defined

In [77]:
# doc_summary_index.get_document_summary("Boston")

In [82]:
from llama_index.core.indices.document_summary import (
    DocumentSummaryIndexEmbeddingRetriever,
)

retriever = DocumentSummaryIndexEmbeddingRetriever(
    index,
    similarity_top_k=3,
)

In [83]:
# retriever = index.as_retriever()

In [85]:
response = retriever.retrieve("loan case")

In [86]:
print(response[2].text)

12.       The Debtor’s estimated operating expenses, exclusive of debt service, for
the thirty (30) days following the commencement of this Chapter 11 case is approximately
$43,000.           13.       The Debtor’s operations, exclusive of debt service, for the period of thirty
(30) days following the commencement of this Chapter 11 case are expected to break even or
operate at a slight gain.
                   14.       The Debtor has operated from its leased space at 40 Broad Street, New
York, New York Retail #2, New York, New York 10004 since January 1 2014. The Debtor
leases those premises from 40 Broad Associates at a monthly rent of $37,500 under a lease that is
the subject of pending litigation in the Supreme Court of the State of New York and the New
York Supreme Court, Appellate Term, First Department.
                                                             2
---
15-10618-reg           Doc 2       Filed 03/16/15         Entered 03/16/15 19:49:06                Main Docume