# To locate FIT and LOT in large documents

In [8]:
from azure.core.credentials import AzureKeyCredential
from azure.ai.formrecognizer import DocumentAnalysisClient
import json
import openai
import os

with open('../local_settings.json') as f:
    data = json.load(f)

# Set form recogniser client
credential = AzureKeyCredential(data["FORM_KEY"])
document_analysis_client = DocumentAnalysisClient(data["FORM_ENDPOINT"], credential)

# This example also requires an OpenAI API key
os.environ['OPENAI_API_KEY'] = data['OPENAI_API_KEY']
openai.api_key = os.environ['OPENAI_API_KEY']

In [9]:
field_names = ["Mission", "Colour Palette"]
field_descriptions = ["", ""]

json_template = json.dumps({"Results": [dict(zip(field_names, field_descriptions))]})


system = f"""
You are an assistant that given a text extracted using OCR from a document will extract user provided data fields.
Fields can have multiple formats.
Write your output as a JSON with an entry with the format {json_template} per each test you find.
If there is a field that you can not find, set it a null.
If there is any additional information of feedback from the infromation extraction, add a {{"notes": "<additional-information>"}}
"""   # noqa E501

In [11]:
import pdfplumber

path = "../data/example/Elastacloud-Brand-Book.pdf"

relevant_text = ""
with pdfplumber.open(path) as pdf:
    for page in pdf.pages:
        relevant_text += pdf.pages[page.page_number-1].extract_text()

In [13]:
from llama_index import VectorStoreIndex, ServiceContext
from llama_index.node_parser import SimpleNodeParser
from llama_index import Document
from langchain.chat_models import ChatOpenAI
from llama_index import LLMPredictor


documents = [Document(text=relevant_text)]

node_parser = SimpleNodeParser.from_defaults(chunk_size=4096,
                                                chunk_overlap=200)
                                                
llm = ChatOpenAI(temperature=0, max_tokens=512)
llm_predictor = LLMPredictor(llm=llm)
service_context = ServiceContext.from_defaults(node_parser=node_parser,
                                                llm_predictor=llm_predictor)

index = VectorStoreIndex.from_documents(documents,
                                        service_context=service_context)
query_engine = index.as_query_engine()

In [14]:
response = query_engine.query("What is EC color palette?")

In [15]:
response.get_formatted_sources

<bound method Response.get_formatted_sources of Response(response="The EC color palette consists of a selection of colors that are approved for use in our branding. These colors are specifically Elastacloud navy, Elastacloud blue, Elastacloud sky, Elastacloud yellow, and Elastacloud green. Each color has its own CMYK, RGB, and Hex values that should be used for different purposes like print, screen, and web. It is important to use the correct color references for each application to maintain consistency in our brand's visual identity.", source_nodes=[NodeWithScore(node=TextNode(id_='5a773191-7330-45c5-b33b-861abf518e76', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='6739222c-6c0a-4eef-9095-b8a4d9076524', node_type=<ObjectType.DOCUMENT: '4'>, metadata={}, hash='c5ac0d8386bb12226c1165daffb9772cb7b9fe65d96348023933eba1df32bbff'), <NodeRelationship.PREVIOUS: '2'>: RelatedN

In [16]:
response.source_nodes[1]

NodeWithScore(node=TextNode(id_='044e2f22-f3db-4698-bb31-5a4ea2d21256', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='6739222c-6c0a-4eef-9095-b8a4d9076524', node_type=<ObjectType.DOCUMENT: '4'>, metadata={}, hash='c5ac0d8386bb12226c1165daffb9772cb7b9fe65d96348023933eba1df32bbff'), <NodeRelationship.NEXT: '3'>: RelatedNodeInfo(node_id='5a773191-7330-45c5-b33b-861abf518e76', node_type=<ObjectType.TEXT: '1'>, metadata={}, hash='c9dd24bd532a4a9acf53d696ce101ef1060b7e4f69e7d5c7c6b6497d5278c995')}, hash='45201f4ed06273264bcd39c3634129ae93e07943c8c6677d30f31b1259521fbc', text='BRAND BOOK\nV4\nMarch 2023WHO WE ARE\nOUR VISION\nGlobal industry transformation through data and next generation AI\nWe believe every business has immense power in its data. The power to unlock growth, to unleash intelligence, and to accelerate\noutcomes – responsibly, and with the help of AI.\nWe’re p

## PDF retrieval tool

In [19]:
# read a pdf and answers question on it
from langchain.document_loaders import PyPDFLoader

loader = PyPDFLoader("../data/example/Elastacloud-Brand-Book.pdf")
pages = loader.load_and_split()
# check the metadata
pg_no=6
print(f"Content:\n {pages[pg_no].page_content} \n metadata:\n {pages[pg_no].metadata}")

Content:
 A culture of giving back
Elastacloud firmly believes that data can break boundaries 
and benefit humanity, which is why we are heavily 
invested in projects and actions that improve outcomes 
for individuals and the planet. One way we do this is by 
empowering our Elastaclouders to contribute to the wider 
community through sharing their digital skills.For well over a decade, Elastacloud has established and 
maintained various industry user groups: joining together 
more than 12,000 people around the world for community-
based learning and innovation. This includes one of the 
largest data science communities in Europe.
We are also proud to sponsor computer labs at Shanti 
Bhavan, a residential school in India for children born into 
the lowest socioeconomic class, which aims to uplift from 
poverty through education and opportunity. Our team in 
India run coding labs for the students there, sometimes 
held in the Elastacloud offices. 
Elastacloud has a deeply embedded commit

In [23]:
import os
from langchain.chat_models import ChatOpenAI
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain.prompts import PromptTemplate
from langchain.chains import create_qa_with_sources_chain
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import Chroma


# create the open-source embedding function
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

# load it into Chroma
db = Chroma.from_documents(pages, embedding_function)

In [55]:
import os
from langchain.chat_models import ChatOpenAI
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain.prompts import PromptTemplate
from langchain.chains import create_qa_with_sources_chain, RetrievalQA
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import Chroma


# create the open-source embedding function
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

# load it into Chroma
db = Chroma.from_documents(pages, embedding_function)


llm_src = ChatOpenAI(temperature=0, model="gpt-3.5-turbo-0613")

qa_chain = create_qa_with_sources_chain(llm_src)

doc_prompt = PromptTemplate(
    template="Content: {page_content}\n Source: {source} - page {page}", # look at the prompt does have page#
    input_variables=["page_content", "source", "page"],
)

final_qa_chain = StuffDocumentsChain(
    llm_chain=qa_chain, 
    document_variable_name='context',
    document_prompt=doc_prompt,
)
retrieval_qa = RetrievalQA(
    retriever=db.as_retriever(),
    combine_documents_chain=final_qa_chain
)

In [56]:

query = "What is EC Mission?"

answer_1 = retrieval_qa.run(query)
print(answer_1)

{
  "answer": "The mission of Elastacloud is to be data pioneers, unleashing the power of data, accelerating transformational outcomes, serving as trusted advisors, and fostering a global community.",
  "sources": ["../data/example/Elastacloud-Brand-Book.pdf - page 3"]
}


## LLM call

In [None]:
input = relevant_text

json_template = json.dumps({ 
    "Test Type": "<FIT or LOT>",
    "Casing Shoe": "<Casing shoe size>",
    "TVD (m)": "TVD in meters",
    "Surface pressure (psi)": "<MW value in sg>",
    "MW (sg)": "<MW value in sg>",
    "EMW (sg)": "<EMW value in sg. Calculated as: EMW = MW + (P-FIT / (TVD-shoe * 1.421))>"
})

system = f"""
You are an API that given a text extracted using OCR from an End of Well Report will extract Formation Integrity Test (FIT) and Leak Off Test (LOT) results.
Your response will be a JSON with as many entries as needed in the format {json_template}
If there is a field that you can not find, set it a null.
If the document has any kind of errors or is corrupted, add a field {{"errors": "<error description>"}}
If there is any additional information of feedback from the infromation extraction, add a {{"notes": "<additional-information>"}}

The format of your input will be the text of the relevant page.
"""

config = {
    "temperature": 0.2,
    "max_tokens": 512,
    "top_p": 1,
}

response = openai.ChatCompletion.create(
    model="gpt-3.5-turbo",
    messages=[
      {"role": "system", "content": system},
      {"role": "user", "content": input},
    ],
    temperature=config["temperature"],
    max_tokens=config["max_tokens"],
    top_p=config["top_p"],
  )

print(response.get("choices")[0]["message"]["content"])

In [241]:
input = relevant_text

json_template = json.dumps({ 
    "Test Type": "<FIT or LOT>",
    "Casing Shoe": "<Casing shoe size>",
    "TVD (m)": "TVD in meters",
    "Surface pressure (psi)": "<MW value in sg>",
    "MW (sg)": "<MW value in sg>",
    "EMW (sg)": "<EMW value in sg. Calculated as: EMW = MW + (P-FIT / (TVD-shoe * 1.421))>"
})

system = f"""
You are an API that given a text extracted using OCR from an End of Well Report will extract Formation Integrity Test (FIT) and Leak Off Test (LOT) results.
Your response will be a JSON with as many entries as needed in the format {json_template}
If there is a field that you can not find, set it a null.
If the document has any kind of errors or is corrupted, add a field {{"errors": "<error description>"}}
If there is any additional information of feedback from the infromation extraction, add a {{"notes": "<additional-information>"}}

The format of your input will be the text of the relevant page.
"""

config = {
    "temperature": 0.2,
    "max_tokens": 512,
    "top_p": 1,
}

response = openai.ChatCompletion.create(
    model="gpt-3.5-turbo-16k",
    messages=[
      {"role": "system", "content": system},
      {"role": "user", "content": input},
    ],
    temperature=config["temperature"],
    max_tokens=config["max_tokens"],
    top_p=config["top_p"],
  )

print(response.get("choices")[0]["message"]["content"])

{"text": "Formation Integrity Test (FIT)\n\nCasing Shoe: 10000 ft\nTVD (m): 3000\nSurface pressure (psi): 5000\nMW (sg): 1.2\n\nLeak Off Test (LOT)\n\nCasing Shoe: 10000 ft\nTVD (m): 3000\nSurface pressure (psi): 5000\nMW (sg): 1.2\nEMW (sg): 1.3"}


Good enough with 16k, let's check with form recogniser and see 

In [None]:
len(encoder.encode(extracted_text["content"]))

In [None]:
input = extracted_text["content"]

json_template = json.dumps({ 
    "Test Type": "<FIT or LOT>",
    "Casing Shoe": "<Casing shoe size>",
    "TVD (m)": "TVD in meters",
    "Surface pressure (psi)": "<MW value in sg>",
    "EMW (sg)": "<EMW value in sg>"
})

system = f"""
You are an API that given a text extracted using OCR from an End of Well Report will extract Formation Integrity Test (FIT) and Leak Off Test (LOT) results.
There can be multiple tests per shoe size and depth, please report them all.
Your response will be a JSON with one entry per test in the format {json_template}.
If there is a field that you can not find, set it a null.
If the document has any kind of errors or is corrupted, add a field {{"errors": "<error description>"}}
If there is any additional information of feedback from the infromation extraction, add a {{"notes": "<additional-information>"}}

The format of your input will be the text of the relevant page.
"""

config = {
    "temperature": 0.2,
    "max_tokens": 512,
    "top_p": 1,
}

response = openai.ChatCompletion.create(
    model="gpt-3.5-turbo-16k",
    messages=[
      {"role": "system", "content": system},
      {"role": "user", "content": input},
    ],
    temperature=config["temperature"],
    max_tokens=config["max_tokens"],
    top_p=config["top_p"],
  )

print(response.get("choices")[0]["message"]["content"])

# Implementing Llama Index 

In [244]:
from llama_index import VectorStoreIndex, SimpleDirectoryReader

In [245]:
documents = SimpleDirectoryReader("../llama_dir/").load_data()

In [246]:
index = VectorStoreIndex.from_documents(documents)

In [247]:
query_engine = index.as_query_engine()
response = query_engine.query("what's the EMW of the FIT performed on the well 206/12a-3?")
print(response)

The EMW of the FIT performed on the well 206/12a-3 is 1.75sg.


In [248]:
query_engine = index.as_query_engine()
response = query_engine.query("How many test where performend on the well 206/12a-3?")
print(response)

Two tests were performed on the well 206/12a-3.


In [249]:
query_engine = index.as_query_engine()
response = query_engine.query("In which page on the end of well 206/12a-3 report are the LOT and FIT reported?")
print(response)

The LOT and FIT are reported on page 53 of the end of well 206/12a-3 report.


# LLama Index Approach

In [None]:
documents = SimpleDirectoryReader(
    input_files=[path]
).load_data()
index = VectorStoreIndex.from_documents(documents)

In [None]:
query_engine = index.as_query_engine()
response = query_engine.query("In which page are the LOT and FIT reported?")
print(response)

In [242]:
json_template = json.dumps({
    "Test Type": "<FIT or LOT>",
    "Casing Shoe": "<Casing shoe size>",
    "TVD (m)": "TVD in meters",
    "Surface pressure (psi)": "<Surface pressure value>",
    "MW (sg)": "<MW value in sg>",
    "EMW (sg)": "<EMW value in sg>"
})

system = f"""
You are an assistant that given a text extracted using OCR from an End of Well Report will extract 'Formation Integrity Test' (FIT) and 'Leak Off Test' (LOT) results.
There can be multiple tests, report all of them.
Write your output as a list with an entry with the format {json_template} per each test you find, separated by commas.
If there is a field that you can not find, set it a null.
If the document has any kind of errors or is corrupted, add a field {{"errors": "<error description>"}}
If there is any additional information of feedback from the infromation extraction, add a {{"notes": "<additional-information>"}}
"""

In [243]:
response = query_engine.query(system)
print(response)

{"Test Type": "FIT", "Casing Shoe": "20", "TVD (m)": "536", "Surface pressure (psi)": "101.0", "MW (sg)": "1.15", "EMW (sg)": "1.26"},
{"Test Type": "LOT", "Casing Shoe": "20", "TVD (m)": "536", "Surface pressure (psi)": "84.0", "MW (sg)": "1.15", "EMW (sg)": "1.26"},
{"Test Type": "FIT", "Casing Shoe": "13 3/8", "TVD (m)": null, "Surface pressure (psi)": "328", "MW (sg)": "1.55", "EMW (sg)": null},
{"Test Type": "LOT", "Casing Shoe": "13 3/8", "TVD (m)": null, "Surface pressure (psi)": "360", "MW (sg)": "1.55", "EMW (sg)": null}


# Loading data from Form Recogniser

In [228]:
import io

# OCR from base form recogniser
def base_form_recogniser(pdf_bytes: io.BytesIO) -> dict:
    document = pdf_bytes.getvalue()

    # Start the document analysis
    poller = document_analysis_client.begin_analyze_document("prebuilt-document", document, polling_interval=5)

    # Get the result
    result = poller.result()
    data = result.to_dict()
    return data

In [229]:
from PyPDF4 import PdfFileWriter, PdfFileReader

inputpdf = PdfFileReader(open(path, "rb"))

output = PdfFileWriter()

relevant_text = ""

with pdfplumber.open(path) as pdf:
    for page in pdf.pages:
        if page.search("FIT") and page.search("LOT"):
            output.addPage(inputpdf.pages[page.page_number])
            print(page.page_number)

output_bytesio = io.BytesIO()

output.write(output_bytesio)

4
5
8
13
15
52
53
54


In [230]:
extracted_text = base_form_recogniser(output_bytesio)

In [231]:
from llama_index import SimpleDirectoryReader, VectorStoreIndex, ServiceContext
from llama_index.node_parser import SimpleNodeParser
from llama_index import Document

text_list = [extracted_text["content"]]
documents = [Document(text=t) for t in text_list]

node_parser = SimpleNodeParser.from_defaults(chunk_size=4096, chunk_overlap=200)
service_context = ServiceContext.from_defaults(node_parser=node_parser)

index = VectorStoreIndex.from_documents(documents, service_context=service_context)
query_engine = index.as_query_engine()

In [239]:
json_template = json.dumps({
    "Test Type": "<FIT or LOT>",
    "Casing Shoe": "<Casing shoe size>",
    "TVD (m)": "TVD in meters",
    "Surface pressure (psi)": "<Surface pressure value>",
    "MW (sg)": "<MW value in sg>",
    "EMW (sg)": "<EMW value in sg>"
})

system = f"""
You are an assistant that given a text extracted using OCR from an End of Well Report will extract 'Formation Integrity Test' (FIT) and 'Leak Off Test' (LOT) results.
You must prioritize reporting the EMW value, meassured in sg.
There can be multiple tests, report all of them.
Write an entry with the format {json_template} per each test you find.
If there is a field that you can not find, set it a null.
If the document has any kind of errors or is corrupted, add a field {{"errors": "<error description>"}}
If there is any additional information of feedback from the infromation extraction, add a {{"notes": "<additional-information>"}}
"""

In [240]:
response = query_engine.query(system)
print(response)

{"Test Type": "FIT", "Casing Shoe": "9 5/8", "TVD (m)": "2081.4", "Surface pressure (psi)": "null", "MW (sg)": "null", "EMW (sg)": "1.69"}
{"Test Type": "LOT", "Casing Shoe": "null", "TVD (m)": "536", "Surface pressure (psi)": "84.0", "MW (sg)": "null", "EMW (sg)": "1.26"}
{"Test Type": "LOT", "Casing Shoe": "null", "TVD (m)": "null", "Surface pressure (psi)": "null", "MW (sg)": "null", "EMW (sg)": "null"}
{"Test Type": "FIT", "Casing Shoe": "13 3/8", "TVD (m)": "null", "Surface pressure (psi)": "null", "MW (sg)": "null", "EMW (sg)": "1.55"}
{"Test Type": "FIT", "Casing Shoe": "null", "TVD (m)": "null", "Surface pressure (psi)": "null", "MW (sg)": "null", "EMW (sg)": "null"}
