# To locate FIT and LOT in large documents

In [225]:
from azure.core.credentials import AzureKeyCredential
from azure.ai.formrecognizer import DocumentAnalysisClient
import json
import tiktoken
import openai
import os

with open('../settings.json') as f:
    data = json.load(f)

# Set form recogniser client
credential = AzureKeyCredential(data["FORM_KEY"])
document_analysis_client = DocumentAnalysisClient(data["FORM_ENDPOINT"], credential)

# This example also requires an OpenAI API key
os.environ['OPENAI_API_KEY'] = data['OPENAI_API_KEY']
openai.api_key = os.environ['OPENAI_API_KEY']

# Set up Token Enconder for meassurement
encoder = tiktoken.encoding_for_model("gpt-3.5-turbo")

In [226]:
import pdfplumber

path = "../data/206_12a-3 (SW Clair F1) Geological EOWR_Signed.pdf"

relevant_text = ""
with pdfplumber.open(path) as pdf:
    for page in pdf.pages:
        if page.search("FIT") or page.search("LOT"):
            relevant_text += pdf.pages[page.page_number].extract_text()
            print(page.page_number)

4
5
8
13
14
15
16
52
53
54
55


In [227]:
len(encoder.encode(relevant_text))

8018

While considerably less content, it is still prone to go over the model's token limit

## LLM call

In [None]:
input = relevant_text

json_template = json.dumps({ 
    "Test Type": "<FIT or LOT>",
    "Casing Shoe": "<Casing shoe size>",
    "TVD (m)": "TVD in meters",
    "Surface pressure (psi)": "<MW value in sg>",
    "MW (sg)": "<MW value in sg>",
    "EMW (sg)": "<EMW value in sg. Calculated as: EMW = MW + (P-FIT / (TVD-shoe * 1.421))>"
})

system = f"""
You are an API that given a text extracted using OCR from an End of Well Report will extract Formation Integrity Test (FIT) and Leak Off Test (LOT) results.
Your response will be a JSON with as many entries as needed in the format {json_template}
If there is a field that you can not find, set it a null.
If the document has any kind of errors or is corrupted, add a field {{"errors": "<error description>"}}
If there is any additional information of feedback from the infromation extraction, add a {{"notes": "<additional-information>"}}

The format of your input will be the text of the relevant page.
"""

config = {
    "temperature": 0.2,
    "max_tokens": 512,
    "top_p": 1,
}

response = openai.ChatCompletion.create(
    model="gpt-3.5-turbo",
    messages=[
      {"role": "system", "content": system},
      {"role": "user", "content": input},
    ],
    temperature=config["temperature"],
    max_tokens=config["max_tokens"],
    top_p=config["top_p"],
  )

print(response.get("choices")[0]["message"]["content"])

In [241]:
input = relevant_text

json_template = json.dumps({ 
    "Test Type": "<FIT or LOT>",
    "Casing Shoe": "<Casing shoe size>",
    "TVD (m)": "TVD in meters",
    "Surface pressure (psi)": "<MW value in sg>",
    "MW (sg)": "<MW value in sg>",
    "EMW (sg)": "<EMW value in sg. Calculated as: EMW = MW + (P-FIT / (TVD-shoe * 1.421))>"
})

system = f"""
You are an API that given a text extracted using OCR from an End of Well Report will extract Formation Integrity Test (FIT) and Leak Off Test (LOT) results.
Your response will be a JSON with as many entries as needed in the format {json_template}
If there is a field that you can not find, set it a null.
If the document has any kind of errors or is corrupted, add a field {{"errors": "<error description>"}}
If there is any additional information of feedback from the infromation extraction, add a {{"notes": "<additional-information>"}}

The format of your input will be the text of the relevant page.
"""

config = {
    "temperature": 0.2,
    "max_tokens": 512,
    "top_p": 1,
}

response = openai.ChatCompletion.create(
    model="gpt-3.5-turbo-16k",
    messages=[
      {"role": "system", "content": system},
      {"role": "user", "content": input},
    ],
    temperature=config["temperature"],
    max_tokens=config["max_tokens"],
    top_p=config["top_p"],
  )

print(response.get("choices")[0]["message"]["content"])

{"text": "Formation Integrity Test (FIT)\n\nCasing Shoe: 10000 ft\nTVD (m): 3000\nSurface pressure (psi): 5000\nMW (sg): 1.2\n\nLeak Off Test (LOT)\n\nCasing Shoe: 10000 ft\nTVD (m): 3000\nSurface pressure (psi): 5000\nMW (sg): 1.2\nEMW (sg): 1.3"}


Good enough with 16k, let's check with form recogniser and see 

In [None]:
len(encoder.encode(extracted_text["content"]))

In [None]:
input = extracted_text["content"]

json_template = json.dumps({ 
    "Test Type": "<FIT or LOT>",
    "Casing Shoe": "<Casing shoe size>",
    "TVD (m)": "TVD in meters",
    "Surface pressure (psi)": "<MW value in sg>",
    "EMW (sg)": "<EMW value in sg>"
})

system = f"""
You are an API that given a text extracted using OCR from an End of Well Report will extract Formation Integrity Test (FIT) and Leak Off Test (LOT) results.
There can be multiple tests per shoe size and depth, please report them all.
Your response will be a JSON with one entry per test in the format {json_template}.
If there is a field that you can not find, set it a null.
If the document has any kind of errors or is corrupted, add a field {{"errors": "<error description>"}}
If there is any additional information of feedback from the infromation extraction, add a {{"notes": "<additional-information>"}}

The format of your input will be the text of the relevant page.
"""

config = {
    "temperature": 0.2,
    "max_tokens": 512,
    "top_p": 1,
}

response = openai.ChatCompletion.create(
    model="gpt-3.5-turbo-16k",
    messages=[
      {"role": "system", "content": system},
      {"role": "user", "content": input},
    ],
    temperature=config["temperature"],
    max_tokens=config["max_tokens"],
    top_p=config["top_p"],
  )

print(response.get("choices")[0]["message"]["content"])

# Implementing Llama Index 

In [244]:
from llama_index import VectorStoreIndex, SimpleDirectoryReader

In [245]:
documents = SimpleDirectoryReader("../llama_dir/").load_data()

In [246]:
index = VectorStoreIndex.from_documents(documents)

In [247]:
query_engine = index.as_query_engine()
response = query_engine.query("what's the EMW of the FIT performed on the well 206/12a-3?")
print(response)

The EMW of the FIT performed on the well 206/12a-3 is 1.75sg.


In [248]:
query_engine = index.as_query_engine()
response = query_engine.query("How many test where performend on the well 206/12a-3?")
print(response)

Two tests were performed on the well 206/12a-3.


In [249]:
query_engine = index.as_query_engine()
response = query_engine.query("In which page on the end of well 206/12a-3 report are the LOT and FIT reported?")
print(response)

The LOT and FIT are reported on page 53 of the end of well 206/12a-3 report.


# LLama Index Approach

In [None]:
documents = SimpleDirectoryReader(
    input_files=[path]
).load_data()
index = VectorStoreIndex.from_documents(documents)

In [None]:
query_engine = index.as_query_engine()
response = query_engine.query("In which page are the LOT and FIT reported?")
print(response)

In [242]:
json_template = json.dumps({
    "Test Type": "<FIT or LOT>",
    "Casing Shoe": "<Casing shoe size>",
    "TVD (m)": "TVD in meters",
    "Surface pressure (psi)": "<Surface pressure value>",
    "MW (sg)": "<MW value in sg>",
    "EMW (sg)": "<EMW value in sg>"
})

system = f"""
You are an assistant that given a text extracted using OCR from an End of Well Report will extract 'Formation Integrity Test' (FIT) and 'Leak Off Test' (LOT) results.
There can be multiple tests, report all of them.
Write your output as a list with an entry with the format {json_template} per each test you find, separated by commas.
If there is a field that you can not find, set it a null.
If the document has any kind of errors or is corrupted, add a field {{"errors": "<error description>"}}
If there is any additional information of feedback from the infromation extraction, add a {{"notes": "<additional-information>"}}
"""

In [243]:
response = query_engine.query(system)
print(response)

{"Test Type": "FIT", "Casing Shoe": "20", "TVD (m)": "536", "Surface pressure (psi)": "101.0", "MW (sg)": "1.15", "EMW (sg)": "1.26"},
{"Test Type": "LOT", "Casing Shoe": "20", "TVD (m)": "536", "Surface pressure (psi)": "84.0", "MW (sg)": "1.15", "EMW (sg)": "1.26"},
{"Test Type": "FIT", "Casing Shoe": "13 3/8", "TVD (m)": null, "Surface pressure (psi)": "328", "MW (sg)": "1.55", "EMW (sg)": null},
{"Test Type": "LOT", "Casing Shoe": "13 3/8", "TVD (m)": null, "Surface pressure (psi)": "360", "MW (sg)": "1.55", "EMW (sg)": null}


# Loading data from Form Recogniser

In [228]:
import io

# OCR from base form recogniser
def base_form_recogniser(pdf_bytes: io.BytesIO) -> dict:
    document = pdf_bytes.getvalue()

    # Start the document analysis
    poller = document_analysis_client.begin_analyze_document("prebuilt-document", document, polling_interval=5)

    # Get the result
    result = poller.result()
    data = result.to_dict()
    return data

In [229]:
from PyPDF4 import PdfFileWriter, PdfFileReader

inputpdf = PdfFileReader(open(path, "rb"))

output = PdfFileWriter()

relevant_text = ""

with pdfplumber.open(path) as pdf:
    for page in pdf.pages:
        if page.search("FIT") and page.search("LOT"):
            output.addPage(inputpdf.pages[page.page_number])
            print(page.page_number)

output_bytesio = io.BytesIO()

output.write(output_bytesio)

4
5
8
13
15
52
53
54


In [230]:
extracted_text = base_form_recogniser(output_bytesio)

In [231]:
from llama_index import SimpleDirectoryReader, VectorStoreIndex, ServiceContext
from llama_index.node_parser import SimpleNodeParser
from llama_index import Document

text_list = [extracted_text["content"]]
documents = [Document(text=t) for t in text_list]

node_parser = SimpleNodeParser.from_defaults(chunk_size=4096, chunk_overlap=200)
service_context = ServiceContext.from_defaults(node_parser=node_parser)

index = VectorStoreIndex.from_documents(documents, service_context=service_context)
query_engine = index.as_query_engine()

In [239]:
json_template = json.dumps({
    "Test Type": "<FIT or LOT>",
    "Casing Shoe": "<Casing shoe size>",
    "TVD (m)": "TVD in meters",
    "Surface pressure (psi)": "<Surface pressure value>",
    "MW (sg)": "<MW value in sg>",
    "EMW (sg)": "<EMW value in sg>"
})

system = f"""
You are an assistant that given a text extracted using OCR from an End of Well Report will extract 'Formation Integrity Test' (FIT) and 'Leak Off Test' (LOT) results.
You must prioritize reporting the EMW value, meassured in sg.
There can be multiple tests, report all of them.
Write an entry with the format {json_template} per each test you find.
If there is a field that you can not find, set it a null.
If the document has any kind of errors or is corrupted, add a field {{"errors": "<error description>"}}
If there is any additional information of feedback from the infromation extraction, add a {{"notes": "<additional-information>"}}
"""

In [240]:
response = query_engine.query(system)
print(response)

{"Test Type": "FIT", "Casing Shoe": "9 5/8", "TVD (m)": "2081.4", "Surface pressure (psi)": "null", "MW (sg)": "null", "EMW (sg)": "1.69"}
{"Test Type": "LOT", "Casing Shoe": "null", "TVD (m)": "536", "Surface pressure (psi)": "84.0", "MW (sg)": "null", "EMW (sg)": "1.26"}
{"Test Type": "LOT", "Casing Shoe": "null", "TVD (m)": "null", "Surface pressure (psi)": "null", "MW (sg)": "null", "EMW (sg)": "null"}
{"Test Type": "FIT", "Casing Shoe": "13 3/8", "TVD (m)": "null", "Surface pressure (psi)": "null", "MW (sg)": "null", "EMW (sg)": "1.55"}
{"Test Type": "FIT", "Casing Shoe": "null", "TVD (m)": "null", "Surface pressure (psi)": "null", "MW (sg)": "null", "EMW (sg)": "null"}
