### This little tutorial focuses on the task of entity extraction from a PDF file using OCR + GenAI. The use of LangChain is illustrated as a convenient tool library.

In [89]:
import os
import boto3
from my_package.config import global_config as glob
# from my_package.config import config as cfg
from my_package.utils.utils import s3_health_check 

#cfg.model_config

In [2]:
# Check S3 access up front so that credential problems surface here rather than
# in the middle of the download / Textract steps further down.
s3_health_check()

S3 credentials are valid 😀


(Optional) Work with the PDF locally by downloading it from S3:

In [3]:
# S3 location of the PDF: bucket, key prefix ("folder") and file name.
# These three names are reused by the cells further down.
bucket_name = 'sagemaker-foundry-610968375774-eu-central-1'
object_key = 'firmware-migration-data/raw_documents_from_foundry/'
local_filename = '005fed92-d96d-11eb-bd22-030439b8049b.pdf'

# S3 keys always use '/' as separator, independent of the local operating system,
# so the key is assembled explicitly instead of with os.path.join (a filesystem API).
s3_key = object_key.rstrip('/') + '/' + local_filename

# Local destination inside the package's data directory.
local_path = os.path.join(glob.UC_DATA_PKG_DIR, local_filename)

# download_file does not create missing directories, so make sure the target exists.
os.makedirs(glob.UC_DATA_PKG_DIR, exist_ok=True)

# Initialize a boto3 client and download the PDF file from S3
s3 = boto3.client('s3')
s3.download_file(Bucket=bucket_name, Key=s3_key, Filename=local_path)
print("Download successful.")

Download successful.


Alternatively, use the convenience wrapper class:

In [4]:
from my_package.services import s3_client

# Same download as above, this time through the project's S3Client wrapper.
my_client = s3_client.S3Client()

# Source key in the bucket and destination path on the local disk
wrapper_source_key = os.path.join(object_key, local_filename)
wrapper_target_path = os.path.join(glob.UC_DATA_PKG_DIR, local_filename)

my_client.download_file(key=wrapper_source_key, file_path=wrapper_target_path)


S3 client available.
Bucket name: sagemaker-foundry-610968375774-eu-central-1
File firmware-migration-data/raw_documents_from_foundry/005fed92-d96d-11eb-bd22-030439b8049b.pdf successfully downloaded from sagemaker-foundry-610968375774-eu-central-1 to /Users/mn9lu5b/Library/CloudStorage/OneDrive-Allianz/Github/Team AZ Account/genai-quickstart/src/data/005fed92-d96d-11eb-bd22-030439b8049b.pdf


### 1.) Use Textract to retrieve raw text from single PDF (stored in S3)

Tip: check out the textractor package, which can be installed with `pip install amazon-textract-textractor`.

In [5]:
import os
import boto3
import time
import pandas as pd

# Initialize a boto3 client for Textract
textract = boto3.client('textract')

# Start an asynchronous text-detection job on the PDF stored in S3.
# S3 keys always use '/' as separator, so the key is assembled explicitly.
response = textract.start_document_text_detection(
    DocumentLocation={
        'S3Object': {
            'Bucket': bucket_name,
            'Name': object_key.rstrip('/') + '/' + local_filename
        }
    }
)

# The job ID is needed to get the result
job_id = response['JobId']
print(f"Started job with id: {job_id}")

# Poll the job until it reaches a terminal state.
# Besides SUCCEEDED and FAILED the job can end as PARTIAL_SUCCESS; waiting only
# for SUCCEEDED would loop forever in that case.
status = ''
while True:
    time.sleep(5)  # Wait between checks (and before the first one)
    result = textract.get_document_text_detection(JobId=job_id)
    status = result['JobStatus']
    if status == 'SUCCEEDED':
        break
    if status == 'PARTIAL_SUCCESS':
        print(f"Warning: Textract job only partially succeeded: {result.get('StatusMessage', '')}")
        break
    if status == 'FAILED':
        raise Exception(f"Textract Job Failed: {result.get('StatusMessage', 'no status message')}")

# Results are paginated: follow NextToken until all pages have been fetched,
# otherwise the text of longer documents is silently truncated.
blocks = list(result.get('Blocks', []))
next_token = result.get('NextToken')
while next_token:
    page = textract.get_document_text_detection(JobId=job_id, NextToken=next_token)
    blocks.extend(page.get('Blocks', []))
    next_token = page.get('NextToken')

# Collect lines of raw text in a list (only LINE blocks carry full text lines)
lines = [block['Text'] for block in blocks if block['BlockType'] == 'LINE']

df = pd.DataFrame(lines, columns=['Extracted Text:'])

Started job with id: 04496cb1c0636f9b68fcd2b33fc490a8339997133509b75dae5cc6920f9d89f4


In [6]:
# Merge all extracted lines into one space-separated string
# (a deliberate simplification to keep the tutorial short).
extracted_lines = df['Extracted Text:']
document = extracted_lines.str.cat(sep=' ')

# Character count, as a rough check against the model's context window size
len(document)

6673

In [7]:
# Show the complete OCR text for a visual sanity check.
# NOTE(review): the rendered output contains personal data (names, e-mail
# addresses, phone numbers) - clear or redact it before sharing the notebook.
print(document)

Von: Rokosch, Kerstin (Allianz Deutschland) (kerstin.rokosch@allianz.de) Gesendet: Dienstag, 29. Juni 2021 15:24:50 An: Firmen_Sachbetrieb_BG@allianz.de Sachsubstanz-2021-06-28-ISP-02 Swis Life REF German High Street Anlagen: (gültig 2021-01-01).pdf Sachsubstanz-2021-06-28-ISP-02 Swiss Life REF European Retail Fund Germany (gültig 2021-01-01).pdf Betreff: GSV 10/0055/8520220%%Policen unterzeichnet an Makler%% Von: Rokosch, Kerstin (Allianz Deutschland) Gesendet: Dienstag, 29. Juni 2021 15:24 An: 'd.hoff@funk-gruppe.de' <d.hoff@funk-gruppe.de> Betreff: Swiss Life REF European Retail Fund Germany und Swiss Life REF German High Street / Policen zur Unterschrift Liebe Frau Hoff, anbei erhalten Sie die unterzeichneten Policen. Viele Grüße, Kerstin Rokosch Allianz Versicherungs-AG Firmen Sach Fachberatung Individual Gruppe 1 10900 Berlin Mail kerstin.rokosch@allianz.de Tel: 030/53893-33065 Fax: 030/53893-833065 Allianz Versicherungs-Aktiengesellschaft USt-IdNr: DE 811 150 709; für Versicheru

### 2.) Extract some basic entities from the Textract results using Bedrock

Define a custom callback handler:
https://python.langchain.com/v0.1/docs/modules/callbacks/

In [85]:
from typing import Any
from uuid import UUID
from langchain.callbacks.base import BaseCallbackHandler
# from langchain_core.callbacks import StdOutCallbackHandler
from langchain_core.runnables import RunnableConfig
from langchain_core.outputs import LLMResult

# Implement a callback handler that logs the generated text and the number of tokens used
class BedrockHandler(BaseCallbackHandler):
    """Callback handler that records generated text, token usage and stop reason.

    Attributes:
        text: Initial text plus every token received via ``on_llm_new_token``.
        input_token_count: Prompt tokens reported at the end of the last LLM
            run (0 if nothing was reported).
        output_token_count: Completion tokens reported at the end of the last
            LLM run (0 if nothing was reported).
        stop_reason: Stop reason reported by the model, or ``None``.
    """

    def __init__(self, initial_text=""):
        """Start with ``initial_text`` as collected text and zeroed counters."""
        self.text = initial_text
        self.input_token_count = 0
        self.output_token_count = 0
        self.stop_reason = None

    def on_llm_new_token(self, token: str, **kwargs):
        """Append a newly generated token to the collected text."""
        # NOTE(review): presumably only invoked when the model streams its
        # output - confirm against the LangChain callback documentation.
        self.text += token

    def on_llm_end(
        self,
        response: LLMResult,
        *,
        run_id: UUID,
        parent_run_id: UUID | None = None,
        **kwargs: Any,
    ) -> Any:
        """Store token usage and stop reason from the finished LLM run.

        Nothing is updated when ``response.llm_output`` is ``None``.
        """
        llm_output = response.llm_output
        if llm_output is None:
            return

        # "usage" may be absent or explicitly None; fall back to an empty dict
        # so the lookups below cannot fail with AttributeError.
        usage = llm_output.get("usage") or {}

        # Keep the counters as ints (0 = nothing reported) so they stay
        # consistent with __init__ and remain usable in arithmetic.
        self.input_token_count = usage.get("prompt_tokens") or 0
        self.output_token_count = usage.get("completion_tokens") or 0
        self.stop_reason = llm_output.get("stop_reason", None)


# class MyCustomHandler(BaseCallbackHandler):
#     def on_llm_new_token(self, token: str, **kwargs) -> None:
#         print(f"My custom handler, token: {token}")
        

Output format: JSON

In [86]:
from pydantic import BaseModel, Field
from langchain_core.output_parsers import JsonOutputParser

# Target schema for the extraction: the three fields the model should return.
# NOTE(review): the class docstring and the Field descriptions below are presumably
# embedded in the JSON schema that the parser turns into format instructions for
# the prompt, so rewording them may change model behaviour - confirm before editing.
class OutputStructure(BaseModel):
    """
    Represents the output structure for customer information.

    Attributes:
        first_name (str): First name of the customer.
        last_name (str): Last name of the customer.
        email (str): Email address of the customer.
    """
    first_name: str = Field(description="First name of the customer")
    last_name: str = Field(description="Last name of the customer")
    email: str = Field(description="Email address of the customer")
 
# Parser built from the schema; used below for the prompt's format instructions
# and as the last step of the chain.
parser = JsonOutputParser(pydantic_object=OutputStructure)

In [87]:
from langchain_aws import ChatBedrock
from langchain_core.prompts import PromptTemplate
# from langchain.chains import LLMChain
from pprint import PrettyPrinter
 
# Callback handler that collects token usage and stop reason of the LLM call
handler2 = BedrockHandler()

# Inference parameters passed through to the Bedrock model
inference_modifier = {
    "max_tokens": 2000,
    "temperature": 0.1,
}

bedrock_runtime = boto3.client(service_name='bedrock-runtime')

# Alternative model: "anthropic.claude-3-haiku-20240307-v1:0"
model = ChatBedrock(
    model_id="anthropic.claude-3-sonnet-20240229-v1:0",
    client=bedrock_runtime,
    model_kwargs=inference_modifier,
)

# Prompt: {context} receives the OCR text, {format_instructions} the JSON schema
# description generated by the parser. The last sentence previously lacked its
# verb ("Please the following ..."), which made the instruction ungrammatical.
template = """
Following is an insurance contract. Please extract the following information from it:
First name of the customer, the last name of the customer and its Email address: {context}.\n
Please use the following json output format:\n{format_instructions}
"""

prompt = PromptTemplate(
    template=template,
    input_variables=["context"],
    partial_variables={"format_instructions": parser.get_format_instructions()},
)

# Compose prompt -> model -> JSON parser into a single runnable chain
chain = prompt | model | parser

# Run the chain on the extracted document text; the callback handler is attached
# through the run configuration so it receives the LLM events of this invocation.
response = chain.invoke({"context": document}, RunnableConfig(callbacks=[handler2]))

PrettyPrinter().pprint(response)

{'email': 'D.Hoff@funk-gruppe.de', 'first_name': 'Diana', 'last_name': 'Hoff'}


In [88]:
# Report the token usage that the callback handler captured during the chain run
for usage_label, usage_value in (
    ("Input token count:", handler2.input_token_count),
    ("Output token count:", handler2.output_token_count),
):
    print(usage_label, usage_value)

Input token count: 3035
Output token count: 46
