# Llama-Index Setup

In [2]:
import os
import pickle

from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core.query_engine import CitationQueryEngine
from llama_index.core import (
    SimpleDirectoryReader,
    VectorStoreIndex,
    StorageContext,
    load_index_from_storage,
)
from llama_index.core import Settings
from dotenv import load_dotenv
from llama_parse import LlamaParse
import nest_asyncio

embed_model = OpenAIEmbedding(model="text-embedding-3-small")
llm = OpenAI(model="gpt-3.5-turbo-0125")

Settings.llm = llm
Settings.embed_model = embed_model

# Read Data

In [3]:
load_dotenv()
nest_asyncio.apply()

In [23]:
LLAMA_CLOUD_API_KEY = os.getenv("LLAMA_CLOUD_API_KEY")

# Do not parse twice. Check if the file is already parsed
documents = None
if not os.path.exists("./parsed-objects/documents.pickle"):
    print("Parsing documents")
    parser = LlamaParse(
        api_key=LLAMA_CLOUD_API_KEY,
        result_type="text",
        verbose=True,
    )

    file_extractor = {".pdf": parser}
    documents = SimpleDirectoryReader(
        "./data", file_extractor=file_extractor
    ).load_data()
    with open("./parsed-objects/documents.pickle", "wb") as f:
        pickle.dump(documents, f)
else:
    print("Loading parsed documents from pickle")
    with open("./parsed-objects/documents.pickle", "rb") as f:
        documents = pickle.load(f)

assert documents is not None, "Documents are not loaded"

Parsing documents
Started parsing the file under job_id bc717ac5-542b-4e06-baf9-41e363548a0e
Started parsing the file under job_id 094d1094-fc6b-46e9-9047-df612c9793d1
Started parsing the file under job_id 16bee0f8-c980-438d-8941-7be1d5bc7965
Started parsing the file under job_id 16ed8372-31ca-49f7-b9b3-5633787ed16e
Started parsing the file under job_id ef8bf9c6-e69f-4029-8a3f-ae1f09d65d0a
Started parsing the file under job_id 5de8d21a-9f32-4003-8a35-ec7b06c5e6a3
Started parsing the file under job_id 8c51f3b3-9590-4424-a6c8-b04198220cf0
Started parsing the file under job_id 33730a5c-7505-46c5-8d8f-933d96f392f1
Started parsing the file under job_id 92fce70a-8975-4a61-bb81-9d46d326a4f0
Started parsing the file under job_id 09d3e2ba-54da-4b80-a208-8287cb3f956a
Started parsing the file under job_id 7acc4c6c-e41f-4da6-9f14-3d72a71eadf3


In [12]:
if not os.path.exists("./citation"):
    print("Building index")
    index = VectorStoreIndex.from_documents(documents)
    index.storage_context.persist(persist_dir="./citation")
else:
    print("Loading index from storage")
    index = load_index_from_storage(
        StorageContext.from_defaults(persist_dir="./citation")
    )

Loading index from storage


In [5]:
# query_engine = CitationQueryEngine.from_args(
#     index,
#     similarity_top_k=10,
#     citation_chunk_size=1024,
# )
retriever = index.as_retriever()

In [6]:
nodes = retriever.retrieve("Telephon companies have been sued.")

In [9]:
nodes[0].text

'Case 3:07-cv-01697-MMC  Document 16  Filed 08/31/07    Page 1 of 2\n2                     E                                 #IR\n---\n        Case 3:07-cv-01697-MMC               Document 16      Filed 08/31/07       Page 2 of 2\n775t#\n         November 2, 2007                         10:30 a.m.             The parties shall file a joint Case\n       Management Statement no later than October 26, 2007.\n                August 31, 2007'

In [11]:
nodes[0].text

'Case 3:07-cv-01697-MMC  Document 16  Filed 08/31/07    Page 1 of 2\n2                     E                                 #IR\n---\n        Case 3:07-cv-01697-MMC               Document 16      Filed 08/31/07       Page 2 of 2\n775t#\n         November 2, 2007                         10:30 a.m.             The parties shall file a joint Case\n       Management Statement no later than October 26, 2007.\n                August 31, 2007'

In [72]:
print(nodes[1].text)

Case 3:07-cv-01697-ARC          Document 82      Filed 09/17/08     Page 1 of 4
                         IN THE UNITED STATES DISTRICT COURT
                     FOR THE MIDDLE DISTRICT OF PENNSYLVANIA
 DEBRA ANN WOODRUFF,                              CIVIL ACTION NO. 3:07-CV-1697
 Administratrix of the Estate of Jonathan
 Woodruff, and DEBRA ANN
 WOODRUFF, individually
         Plaintiff,
                v.                                (JUDGE CAPUTO)
 SULLIVAN COUNTY RURAL ELECTRIC
 COOPERATIVE, INC.,
 COMMONWEALTH TELEPHONE CO.,
 COMMONWEALTH TELEPHONE
 ENTERPRISES, EPIX INTERNET
 SERVICES, COMMONWEALTH
 COMMUNICATIONS and HENKELS &
 McCOY, INC.,
         Defendants.                MEMORANDUM ORDER
        Plaintiff Debra Ann Woodruff commenced this action by filing a complaint on
September 17, 2007.        The Complaint contains claims for common law negligence and
wrongful death pursuant to 42 Pa. Cons. Stat. Ann. § 8301 and Pa. R. Civ. P. No 2202(a)
and a Survival Action pursuan

# Document Summary Index

In [75]:
from llama_index.core import SimpleDirectoryReader, get_response_synthesizer
from llama_index.core import DocumentSummaryIndex
from llama_index.llms.openai import OpenAI
from llama_index.core.node_parser import SentenceSplitter

# LLM (gpt-3.5-turbo)
chatgpt = OpenAI(temperature=0, model="gpt-3.5-turbo")
splitter = SentenceSplitter(chunk_size=1024)

In [76]:
response_synthesizer = get_response_synthesizer(
    response_mode="tree_summarize", use_async=True
)
doc_summary_index = DocumentSummaryIndex.from_documents(
    documents,
    llm=chatgpt,
    transformations=[splitter],
    response_synthesizer=response_synthesizer,
    show_progress=True,
)

  from .autonotebook import tqdm as notebook_tqdm
Parsing nodes: 100%|██████████| 11/11 [00:00<00:00, 18.79it/s]
Summarizing documents:   0%|          | 0/11 [00:00<?, ?it/s]

current doc id: e4723477-04ff-4d35-b8b1-aa5dd7535857


Summarizing documents:   9%|▉         | 1/11 [00:04<00:44,  4.47s/it]

current doc id: c2ca99b1-6d12-40aa-9a36-b9cece67ffee


Summarizing documents:  18%|█▊        | 2/11 [00:06<00:29,  3.27s/it]

current doc id: 5661630c-914c-480a-aef6-d5ab690e6e21


Summarizing documents:  27%|██▋       | 3/11 [00:10<00:26,  3.26s/it]

current doc id: ac2944a0-bc41-489c-ba13-e475852cb5fe


Summarizing documents:  36%|███▋      | 4/11 [00:13<00:23,  3.41s/it]

current doc id: 7327f476-f3b6-43ea-a073-134d258d7253


Summarizing documents:  45%|████▌     | 5/11 [00:16<00:18,  3.01s/it]

current doc id: 1e8d3c90-cd3f-4c7d-b287-1bcb0173aac7


Summarizing documents:  55%|█████▍    | 6/11 [00:17<00:12,  2.44s/it]

current doc id: a2380557-c1f0-468e-b549-e2b445575691


Summarizing documents:  64%|██████▎   | 7/11 [00:20<00:10,  2.52s/it]

current doc id: a214f2a7-9dc4-4bbf-809d-9847e7b83006


Summarizing documents:  73%|███████▎  | 8/11 [00:23<00:08,  2.70s/it]

current doc id: 43dc8ff6-9d95-4e03-bb43-57b4259d5ccb


Summarizing documents:  82%|████████▏ | 9/11 [00:26<00:05,  2.83s/it]

current doc id: 048a642c-807c-41e2-adea-4ec439314eeb


Summarizing documents:  91%|█████████ | 10/11 [00:28<00:02,  2.73s/it]

current doc id: 4997c983-59b1-41f8-a45f-0601e68260aa


Summarizing documents: 100%|██████████| 11/11 [00:31<00:00,  2.87s/it]
Generating embeddings: 100%|██████████| 11/11 [00:04<00:00,  2.64it/s]


In [77]:
# doc_summary_index.get_document_summary("Boston")

In [50]:
from llama_index.core.indices.document_summary import (
    DocumentSummaryIndexEmbeddingRetriever,
)

retriever = DocumentSummaryIndexEmbeddingRetriever(
    index,
    similarity_top_k=3,
)

In [47]:
# retriever = index.as_retriever()

In [51]:
response = retriever.retrieve("Who was sued?")

In [59]:
response[0]

NodeWithScore(node=TextNode(id_='f279e457-2ffd-4a44-8e52-9e3a5d4a8824', embedding=None, metadata={'case_name': 'Crosthwaite v. R E Serrano, Inc', 'docket_number': '3:07-cv-01697', 'court': 'District Court, N.D. California', 'date_filed': '2007-03-23T00:53:00-07:00', 'date_terminated': '2007-12-05T23:53:00-08:00', 'assigned_to': 'Maxine M. Chesney', 'cause': '29:1132 E.R.I.S.A.: Employee Benefits', 'jurisdiction_type': 'Federal question', 'suit_nature': 'Labor: E.R.I.S.A.', 'document_number': 14, 'page_count': 2, 'is_available': True, 'filepath_local': '/storage/recap/gov.uscourts.cand.190473/gov.uscourts.cand.190473.14.0.pdf', 'download_url': 'https://storage.courtlistener.com/recap/gov.uscourts.cand.190473/gov.uscourts.cand.190473.14.0.pdf', 'description': 'ORDER by Judge Maxine M. Chesney granting  13  Motion to Continue Case Management Conference to September 7, 2007 at 10:30 a.m.  (mmclc2, COURT STAFF) (Filed on 6/20/2007)', 'file_path': '/Users/dimatimofeev/Projects/stanford-law-h