# LlamaIndex Setup

In [6]:
import os
import pickle

from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core.query_engine import CitationQueryEngine
from llama_index.core import (
    SimpleDirectoryReader,
    VectorStoreIndex,
    StorageContext,
    load_index_from_storage,
)
from llama_index.core import Settings
from dotenv import load_dotenv
from llama_parse import LlamaParse
import nest_asyncio
from llama_index.core.node_parser import SentenceSplitter

# Load variables from a local .env file *before* building the OpenAI clients.
# On a fresh kernel this cell runs first, so without this call an
# OPENAI_API_KEY that lives only in .env would not be in the environment yet
# (presumably the clients read it at construction time; verify against the
# LlamaIndex OpenAI integration). load_dotenv() is safe to call repeatedly.
load_dotenv()

# Embedding model used to vectorise document chunks and queries.
embed_model = OpenAIEmbedding(model="text-embedding-3-small")
# Chat model used to synthesise answers.
llm = OpenAI(model="gpt-3.5-turbo-0125")

# Register both as the process-wide LlamaIndex defaults.
Settings.llm = llm
Settings.embed_model = embed_model

# Read Data

In [2]:
# Patch asyncio via nest_asyncio (presumably so the async parser can run
# inside the notebook's already-running event loop; confirm against the
# LlamaParse docs).
nest_asyncio.apply()
# Pull variables from a local .env file into the process environment.
load_dotenv()

In [3]:
LLAMA_CLOUD_API_KEY = os.getenv("LLAMA_CLOUD_API_KEY")

# Single source of truth for the parsed-document cache location.
PARSED_DOCUMENTS_PATH = "./parsed-objects/documents.pickle"

# Do not parse twice. Check if the file is already parsed
documents = None
if not os.path.exists(PARSED_DOCUMENTS_PATH):
    print("Parsing documents")
    # Create the cache directory up front so the (expensive) parse result is
    # not lost to a FileNotFoundError when the pickle is written.
    os.makedirs(os.path.dirname(PARSED_DOCUMENTS_PATH), exist_ok=True)

    parser = LlamaParse(
        api_key=LLAMA_CLOUD_API_KEY,
        result_type="text",
        verbose=True,
    )

    # Route every PDF under ./data through LlamaParse.
    file_extractor = {".pdf": parser}
    documents = SimpleDirectoryReader(
        "./data", file_extractor=file_extractor
    ).load_data()

    # Write to a temporary file and rename it into place, so an interrupted
    # dump never leaves a truncated pickle that later runs would mistake for
    # a valid cache.
    tmp_path = PARSED_DOCUMENTS_PATH + ".tmp"
    with open(tmp_path, "wb") as f:
        pickle.dump(documents, f)
    os.replace(tmp_path, PARSED_DOCUMENTS_PATH)
else:
    print("Loading parsed documents from pickle")
    # NOTE(security): pickle.load can execute arbitrary code. Only load a
    # cache file that this notebook produced itself.
    with open(PARSED_DOCUMENTS_PATH, "rb") as f:
        documents = pickle.load(f)

assert documents is not None, "Documents are not loaded"

Loading parsed documents from pickle


In [None]:
# Reuse the persisted index when ./citation exists; otherwise build it from
# the loaded documents and persist it for the next run.
if os.path.exists("./citation"):
    print("Loading index from storage")
    index = load_index_from_storage(
        StorageContext.from_defaults(persist_dir="./citation")
    )
else:
    print("Building index")
    index = VectorStoreIndex.from_documents(
        documents,
        # Split documents with chunk_size=512 and chunk_overlap=20.
        transformations=[
            SentenceSplitter(chunk_size=512, chunk_overlap=20),
        ],
    )
    index.storage_context.persist(persist_dir="./citation")

In [7]:
# Build a citation-aware query engine on top of the index.
# citation_chunk_size sets how granular the cited sources are (512 is the
# default); similarity_top_k is set to 3.
query_engine = CitationQueryEngine.from_args(
    index,
    citation_chunk_size=512,
    similarity_top_k=3,
)

In [8]:
# Run a sample question through the citation query engine; the result is
# kept in `response` so later cells can inspect its source nodes.
response = query_engine.query("What strike policy was used?")

In [13]:
# Show the text of the first cited source. Guard against an empty result so
# a query with no retrieved nodes gives a clear message instead of an
# IndexError.
if response.source_nodes:
    print(response.source_nodes[0].node.get_text())
else:
    print("No source nodes were returned for this query.")

Source 1:
Mar. 27, 2006) (no
appeal);    Riches v. Bureau of Prisons, et al., 6:06cv194-MBS-WMC (D.C. S.C. Mar. 20, 2006)
(no appeal);     Riches v. Vick, et al., 1:07cv1858-WBH (N.D. Ga. Aug. 16, 2007) (appeal dismissed
for want of prosecution).          See also Riches v. Holy Land Foundation for Relief and Development,
          1         The U.S. Party Case Index reflects Plaintiff filed a total of seventy-one prisoner
and non-prisoner actions.                                       2
---
et al., 3:07cv1626-P (N.D. Tex.) (magistrate judge’s findings and conclusions recommending
denial of    in forma pauperis        status under 28 U.S.C. § 1915(g) filed October 3, 2007).
          Applying the three-strike provision to Plaintiff’s prior                 in forma pauperis        actions, it is
clear that he has accrued at least three “strikes” under § 1915(g).                     See Adepegba v. Hammons,
103 F.3d 383, 387-88 (5th Cir. 1996);              see also    Jackson v. Johnson