# **The dependencies below need to be installed to run the code**

In [89]:
# !pip install pypdf
# !pip install docx2txt
# !pip install python-dotenv
# !pip install "unstructured[pdf]"
# !pip install "unstructured[docx]"
# !pip install accelerate
# !pip install  openai langchain sentence_transformers chromadb unstructured -q
# !pip install -q transformers einops accelerate langchain bitsandbytes
# !pip install accelerate
# !pip install llama-index

In [127]:
import logging
import sys
import openai
import re
from llama_index import VectorStoreIndex, SimpleDirectoryReader, ServiceContext, download_loader, load_index_from_storage, LLMPredictor
from llama_index.callbacks import CallbackManager, LlamaDebugHandler
from llama_index.storage.storage_context import StorageContext
from llama_index.llms import HuggingFaceLLM
from google.colab import drive
from llama_index import get_response_synthesizer
from llama_index.retrievers import VectorIndexRetriever
from llama_index.query_engine import RetrieverQueryEngine
from llama_index.indices.postprocessor import SimilarityPostprocessor
from llama_index.tools.query_engine import QueryEngineTool
from llama_index.query_engine import SubQuestionQueryEngine

# logging.basicConfig(stream=sys.stdout, level=logging.INFO)
# logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

import nest_asyncio

# Apply nest_asyncio's patch (it allows nested event loops, which notebook kernels need).
nest_asyncio.apply()

# Mount Google Drive in the Colab runtime; both paths below live on the mounted drive.
drive.mount('/content/drive/')

# Base folder under which the indexing cell derives its "vector/" persistence directory.
vector_directory = '/content/drive/MyDrive'
# Folder that holds the input documents passed to load_docs().
directory = '/content/drive/MyDrive/oax'

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


# **Load Documents**

In [122]:
from langchain.document_loaders import DirectoryLoader

def load_docs(directory):
    """Load the documents found directly inside ``directory``.

    Sub-folders are not traversed because the loader is created with
    ``recursive=False``.

    Args:
        directory: Path of the folder to read.

    Returns:
        Whatever ``DirectoryLoader.load()`` returns for that folder.
    """
    return DirectoryLoader(directory, recursive=False).load()

# Load everything in the configured input folder and report how many documents were found.
path2 = directory
langchain_documents = load_docs(path2)
print("Total Documnets : ",len(langchain_documents))

Total Documnets :  2


In [123]:
from llama_index import Document
# Convert each langchain document into a llama_index Document and remember a
# short name per file (last path component, text before the first dot, lowercased).
documents, files = [], []
for doc in langchain_documents:
    source_path = doc.metadata['source']
    file = source_path.rsplit("/", 1)[-1].partition(".")[0].lower()
    # The metadata is updated in place, so the source now carries the short name.
    doc.metadata['source'] = file
    files.append(file)
    documents.append(Document(text=doc.page_content, metadata=doc.metadata))

In [150]:
# NOTE: a notebook only echoes the last expression of a cell, so this bare expression displays nothing here.
documents[0].metadata
# Print the derived file names and the number of converted documents.
print(files)
print(len(documents))

['inputdocumentone', 'inputdocumenttwo']
2


# **Load LLM:** Note that you need an OpenAI API key to run this code

In [131]:
from llama_index.llms import OpenAI
import os
from getpass import getpass

# Never store a real key in the notebook: read it from the OPENAI_API_KEY
# environment variable and fall back to a hidden interactive prompt when unset.
openai.api_key = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")

# NOTE(review): 'text-davinci-003' appears on OpenAI's deprecation list; confirm
# which replacement model the installed llama_index version supports before re-running.
model = 'text-davinci-003'
# temperature and max_tokens are handed unchanged to the llama_index OpenAI wrapper.
llm = OpenAI(temperature=0, max_tokens=1024, model=model)

In [132]:
# Bundle the LLM and the chunk size into one ServiceContext; later cells pass it
# to index construction and to the query engine.
service_context = ServiceContext.from_defaults(
    chunk_size=1024,
    llm=llm,
    #callback_manager = CallbackManager([llama_debug]),
    #embed_model=embed_model
)

# **Document Indexing**

In [128]:
from llama_index import TreeIndex, KeywordTableIndex, ListIndex
from llama_index.indices.composability import ComposableGraph

# Index summary: one description string per source file, keyed by the file's short name.
index_summaries = {
    name: "this index provides all the financial information stored in " + name + " file."
    for name in files
}

# Calculate or load Index
# Derive the persistence folder only once, so that re-running this cell does not
# keep appending "/vector/" to the path.
if not vector_directory.rstrip("/").endswith("/vector"):
    vector_directory = f"{vector_directory}/vector/"

index_set = dict()
try:
    storage_context = StorageContext.from_defaults(persist_dir=vector_directory)
    print("Vector directory already exists. Going to load them from storage")
    for file in files:
        index_set[file] = load_index_from_storage(storage_context, file)
except Exception as load_error:
    # Catch Exception rather than using a bare except, so KeyboardInterrupt and
    # SystemExit still propagate, and report why loading failed instead of hiding it.
    print("Indexing does not exist. Going to calculate")
    print(f"Reason: {load_error!r}")
    # Discard anything loaded before the failure so the result is built consistently.
    index_set = dict()
    storage_context = StorageContext.from_defaults()
    for file, document in zip(files, documents):
        index = VectorStoreIndex.from_documents(
            [document],
            service_context=service_context,
            storage_context=storage_context,
        )
        # The index id is what load_index_from_storage() is given on later runs.
        index.set_index_id(file)
        index_set[file] = index
    # All indices share this storage context, so persist it once after the loop
    # instead of rewriting it on every iteration.
    if index_set:
        storage_context.persist(vector_directory)

index_set

Vector directory already exists. Going to load them from storage


{'inputdocumentone': <llama_index.indices.vector_store.base.VectorStoreIndex at 0x78f733d4c040>,
 'inputdocumenttwo': <llama_index.indices.vector_store.base.VectorStoreIndex at 0x78f733d4d030>}

# For complex queries, the approach below can be used; it automatically divides your question into multiple sub-questions.

In [113]:
# query_engine_set = dict()
# for key, index in index_set.items():
#     query_engine_set[key] = index.as_query_engine(
#                                                   #k=3,
#                                                   node_postprocessors=[SimilarityPostprocessor(similarity_cutoff=0.75)],
#                                                   service_context=service_context
#                                                   )

# query_tools=[]
# for key, index in query_engine_set.items():
#     query_tool = QueryEngineTool.from_defaults(
#         query_engine=query_engine_set[key],
#         name=key,
#         description=f"Provides information about {key.split('.')[0]}",
#     )
#     query_tools.append(query_tool)

# query_engine = SubQuestionQueryEngine.from_defaults(query_engine_tools=query_tools)

In [129]:
def print_large(text, font_size=20):
    """Render ``text`` in the notebook as an HTML paragraph with a large font.

    The text is HTML-escaped first, so characters such as ``<`` or ``&`` in a
    model answer are shown literally instead of being parsed as markup. Newlines
    are then converted to ``<br/>`` tags.

    Args:
        text: String to display.
        font_size: Font size in pixels applied to the paragraph (default 20).
    """
    import html
    # IPython.display is the supported import location; importing display from
    # IPython.core.display is deprecated.
    from IPython.display import display, HTML

    body = html.escape(text, quote=False).replace('\n', '<br/>')
    html_text = f"<p style='font-size:{font_size}px'>{body}</p>"
    display(HTML(html_text))

In [177]:
# Build both positional lists from one shared key list so every index is paired
# with its own summary, rather than relying on two dicts iterating in the same order.
graph_keys = list(index_set)
graph = ComposableGraph.from_indices(
    KeywordTableIndex,
    [index_set[key] for key in graph_keys],
    [index_summaries[key] for key in graph_keys],
    service_context=service_context,
    # NOTE(review): confirm that from_indices actually applies node_postprocessors;
    # this cannot be verified from the notebook alone.
    node_postprocessors=[SimilarityPostprocessor(similarity_cutoff=0.90)]
)

# NOTE(review): confirm that as_query_engine honours the `k` keyword in the
# installed llama_index version.
query_engine = graph.as_query_engine(service_context=service_context, k=2)

# **QUESTIONS ANSWERS STARTS FROM HERE**

# **QUESTION 1: What is an Acceptable Bank?**

In [178]:
# Ask the graph query engine and render the stringified answer as large HTML text.
response = query_engine.query("what is an Acceptable Banks in inputDocumentOne")
print_large(str(response))

# **QUESTION 2: What is the Margin definition?**

In [179]:
# The prompt names the source document and also asks for the table to be described.
response = query_engine.query("Margin definition according to inputDocumentOne. describe the table as well")
print_large(str(response))

# **QUESTION: What is the governing law?**

In [180]:
# The prompt quotes the section heading in capitals to point the query at that section.
response = query_engine.query("governing law in inputDocumentOne under GOVERNING LAW AND ENFORCEMENT")
print_large(str(response))

# **QUESTION: Fetch all definitions from section 1.1 of inputDocumentTwo**

In [181]:
response = query_engine.query("fetch all the definition terms mentioned in section 1.1 of inputDocumentTwo.")
# Stringify the answer once and reuse it for both the display and the count.
answer_text = str(response)
print_large(answer_text)
# The count is simply the number of comma-separated pieces in the answer text.
term_count = len(answer_text.split(','))
print("Total Terms Fetched =", term_count)

Total Terms Fetched = 48


Note that the above answer can be improved further. I know it is not the best answer, but each experiment costs me money because I am using the OpenAI API. I tried using open-source LLMs, but my system is not powerful enough to load and run those models, and my Google Colab and Kaggle GPU usage quotas have already run out.

# **QUESTION: What does Total Commitments mean in inputDocumentTwo?**

In [182]:
# Ask the graph query engine and render the stringified answer as large HTML text.
response = query_engine.query("what is Total Commitments means amount in inputDocumentTwo.")
print_large(str(response))