In [9]:
import json
import logging
import os

import boto3
from botocore.exceptions import ClientError
from botocore.exceptions import NoCredentialsError


In [11]:


def upload_file_to_s3(file_name, bucket_name, object_name=None, s3_client=None):
    """
    Upload a local file's contents to an S3 bucket with ``put_object``.

    :param file_name: Path to the file to upload
    :param bucket_name: Name of the S3 bucket
    :param object_name: S3 object key. If not specified, file_name is used
    :param s3_client: Optional pre-built S3 client (anything exposing
        ``put_object``). When omitted, ``boto3.client('s3')`` is created.
    :return: the ``put_object`` response dict on success, or ``None`` when the
        upload failed (a message describing the failure is printed)
    """
    # If S3 object_name was not specified, use file_name
    if object_name is None:
        object_name = file_name

    # Build a default client only when the caller did not supply one
    if s3_client is None:
        s3_client = boto3.client('s3')

    try:
        # Pass the open file object so the file CONTENTS are uploaded, not the path
        with open(file_name, "rb") as file_data:
            response = s3_client.put_object(
                Body=file_data,
                Bucket=bucket_name,
                Key=object_name,  # the name the object will have in S3
            )
    except FileNotFoundError:
        print(f"The file {file_name} was not found.")
        return None
    except NoCredentialsError:
        print("AWS credentials not available.")
        return None
    except ClientError as error:
        # The service rejected the request; report it like the other failures
        print(f"Upload of {file_name} to {bucket_name}/{object_name} failed: {error}")
        return None

    print(response)
    return response



In [13]:
# Use your own bucket and file name
bucket_name = "test-bucket-cnevares-2024"
file_name = "data/Supplies1.jpg"
# The S3 key is the file's base name. basename works at any directory depth,
# whereas split('/')[1] is only right with exactly one parent folder.
object_name = os.path.basename(file_name)


In [56]:
# Keep the S3 response under its own name so re-running this cell cannot
# overwrite `s`, which the Textract cells read as the expense analysis result
upload_response = upload_file_to_s3(file_name, bucket_name, object_name=object_name)

{'ResponseMetadata': {'RequestId': 'M8B731H3N9WJV9N5', 'HostId': '/3wpVWxjwlxMIFCb3oYEE/lTPkbo2CeodY+rlD9bTiG9/YHoISw4NvAQDZygKwQ5nL3KQnUqiC1+Q7S3fs1MSKzQVRc5J01e', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amz-id-2': '/3wpVWxjwlxMIFCb3oYEE/lTPkbo2CeodY+rlD9bTiG9/YHoISw4NvAQDZygKwQ5nL3KQnUqiC1+Q7S3fs1MSKzQVRc5J01e', 'x-amz-request-id': 'M8B731H3N9WJV9N5', 'date': 'Tue, 24 Dec 2024 21:00:24 GMT', 'x-amz-server-side-encryption': 'AES256', 'etag': '"276a1a0737b038e64d9324f56ae0fdcc"', 'content-length': '0', 'server': 'AmazonS3'}, 'RetryAttempts': 0}, 'ETag': '"276a1a0737b038e64d9324f56ae0fdcc"', 'ServerSideEncryption': 'AES256'}


In [14]:
def analyze_receipt(bucket_name, object_name, textract_client=None):
    """
    Run Textract expense analysis on a receipt stored in S3.

    :param bucket_name: Name of the S3 bucket
    :param object_name: S3 object name
    :param textract_client: Optional pre-built Textract client (anything
        exposing ``analyze_expense``). When omitted,
        ``boto3.client('textract')`` is created.
    :return: the ``analyze_expense`` response dict on success, or ``None``
        when the request failed (a message describing the failure is printed)
    """
    # Build a default client only when the caller did not supply one
    if textract_client is None:
        textract_client = boto3.client('textract')

    try:
        response = textract_client.analyze_expense(
            Document={
                "S3Object": {
                    "Bucket": bucket_name,
                    "Name": object_name,
                }
            }
        )
    except NoCredentialsError:
        print("AWS credentials not available.")
        return None
    except ClientError as error:
        # No local file is opened here, so service-side failures are reported
        # against the bucket/object that was requested
        print(f"Textract could not analyze {bucket_name}/{object_name}: {error}")
        return None

    print("Success!")
    return response

In [15]:
# Analyze the uploaded receipt; the cells below read the result from `s`
s = analyze_receipt(bucket_name=bucket_name, object_name=object_name)

Success!


In [16]:
# Show the type label and detected text of the first field of the first line item
first_field = s['ExpenseDocuments'][0]['LineItemGroups'][0]['LineItems'][0]['LineItemExpenseFields'][0]
for part in ('Type', 'ValueDetection'):
    print(first_field[part]['Text'])

ITEM
1 of: Elgato Wave: 3 Premium Studio
Free Mixer Quality USB Condenser Microphone for Streaming, Podcast, Gaming and Home Office,
Sold by: Amazon.com Software, Sound Effect Plugins, Anti-Distortion, Plug 'n Play, for Mac, PC
Services, Inc
Supplied by: Other


In [99]:
## Every summary field type with its detected value, one per paragraph
summary_fields = s['ExpenseDocuments'][0]['SummaryFields']
for field in summary_fields:
    label = field['Type']['Text']
    detected = field['ValueDetection']['Text']
    print(label + ": " + detected)
    print()
print()

ADDRESS: Christopher S, Nevares
17035 W VALLEY HWY
SEATTLE, WA 98188-5519
United States

STREET: 17035 W VALLEY HWY

CITY: SEATTLE,

STATE: WA

ZIP_CODE: 98188-5519

COUNTRY: United States

NAME: Christopher S, Nevares

ADDRESS_BLOCK: 17035 W VALLEY HWY
SEATTLE, WA 98188-5519
United States

ADDRESS: Christopher S Nevares
10812 9TH ST E
EDGEWOOD, WA 98372-6607
United States

STREET: 10812 9TH ST E

CITY: EDGEWOOD,

STATE: WA

ZIP_CODE: 98372-6607

COUNTRY: United States

NAME: Christopher S Nevares

ADDRESS_BLOCK: 10812 9TH ST E
EDGEWOOD, WA 98372-6607
United States

NAME: Amazon.com Inc.

INVOICE_RECEIPT_DATE: 8/13/24.

INVOICE_RECEIPT_ID: 112-9546U10-U268240

INVOICE_RECEIPT_ID: #112-9546010-0268240

INVOICE_RECEIPT_ID: 112-9546010-0268240

ORDER_DATE: August 13, 2024

RECEIVER_ADDRESS: Christopher S, Nevares
17035 W VALLEY HWY
SEATTLE, WA 98188-5519
United States

RECEIVER_ADDRESS: Christopher S Nevares
10812 9TH ST E
EDGEWOOD, WA 98372-6607
United States

RECEIVER_NAME: Christopher 

In [470]:
# Condensing response into only useful(ish) values

D = {}
D['prompt'] = 'You are an expert in receipt categorization. Categorize the following receipt into one of these categories: Meals, Supplies, Safety, Travel, Lodging, or Other. \
\nRespond in the format: "Category: <category>." \
\nDo not include explanations, steps, or any additional text.\nReceipt:'
exclude = ["ADDRESS", "ADDRESS_BLOCK", "RECEIVER_ADDRESS"]
expense_document = s['ExpenseDocuments'][0]

# Keep the first value seen for each summary field type, skipping the
# multi-line address fields listed in `exclude`
for summary_field in expense_document['SummaryFields']:
    key = summary_field['Type']['Text']
    value = summary_field['ValueDetection']['Text']
    if key not in D and key not in exclude:
        D[key] = value

# Collect the detected text of every field of every line item in every group
# (not only the first line item), numbered in document order. For a receipt
# with a single line item this yields the same item0, item1, ... keys as before.
D['items'] = {}
for line_item_group in expense_document['LineItemGroups']:
    for line_item in line_item_group['LineItems']:
        for expense_field in line_item['LineItemExpenseFields']:
            value = expense_field['ValueDetection']['Text']
            D['items']['item' + str(len(D['items']))] = value

In [471]:
# Show the type label and detected text of the first field of the first line item
leading_field = s['ExpenseDocuments'][0]['LineItemGroups'][0]['LineItems'][0]['LineItemExpenseFields'][0]
field_type = leading_field['Type']['Text']
print(field_type)
field_text = leading_field['ValueDetection']['Text']
print(field_text)

ITEM
1 of: Elgato Wave: 3 Premium Studio
Free Mixer Quality USB Condenser Microphone for Streaming, Podcast, Gaming and Home Office,
Sold by: Amazon.com Software, Sound Effect Plugins, Anti-Distortion, Plug 'n Play, for Mac, PC
Services, Inc
Supplied by: Other


In [472]:
# Flatten D into "key:value" lines and end with the "Category:" cue

prompt_pieces = []
for field_name, field_value in D.items():
    if field_name != 'items':
        prompt_pieces.append(field_name + ":" + field_value + "\n")
        continue
    # Line-item texts are put on a single line each by replacing newlines
    prompt_pieces.append(field_name + ":\n")
    for item_name, item_text in D['items'].items():
        prompt_pieces.append(item_name + ":" + item_text.replace('\n', ' ') + '\n')
prompt_pieces.append("Category:")
S = "".join(prompt_pieces)

In [473]:
# Display the assembled prompt text
S

'prompt:You are an expert in receipt categorization. Categorize the following receipt into one of these categories: Meals, Supplies, Safety, Travel, Lodging, or Other. \nRespond in the format: "Answer: <category>." \nDo not include explanations, steps, or any additional text.\nReceipt:\nSTREET:17035 W VALLEY HWY\nCITY:SEATTLE,\nSTATE:WA\nZIP_CODE:98188-5519\nCOUNTRY:United States\nNAME:Christopher S, Nevares\nINVOICE_RECEIPT_DATE:8/13/24.\nINVOICE_RECEIPT_ID:112-9546U10-U268240\nORDER_DATE:August 13, 2024\nRECEIVER_NAME:Christopher S Nevares\nSHIPPING_HANDLING_CHARGE:$0.00\nSUBTOTAL:$149.99\nTAX:$15.30\nTOTAL:$165.29\nVENDOR_NAME:Amazon.com Inc.\nOTHER:Amazon.com\nServices, Inc\nitems:\nitem0:1 of: Elgato Wave: 3 Premium Studio Free Mixer Quality USB Condenser Microphone for Streaming, Podcast, Gaming and Home Office, Sold by: Amazon.com Software, Sound Effect Plugins, Anti-Distortion, Plug \'n Play, for Mac, PC Services, Inc Supplied by: Other\nitem1:$149.99\nitem2:1 Free of: Mixer El

In [479]:
# Prompting the model

def prompt_model(
    json_derulo,
    model_id='arn:aws:bedrock:us-east-1:418295723137:inference-profile/us.meta.llama3-1-8b-instruct-v1:0',
    max_gen_len=5,
    temperature=1,
    top_p=1,
    bedrock_client=None,
):
    """
    Send a text prompt to a Bedrock model and return the raw response body.

    :param json_derulo: Prompt text placed under the "prompt" key of the request
    :param model_id: Value passed as ``modelId``. The default is the
        inference-profile ARN this notebook was written against; it embeds a
        specific AWS account ID and region, so pass your own when running
        elsewhere.
    :param max_gen_len: Value sent as "max_gen_len" in the request body
    :param temperature: Value sent as "temperature" in the request body
    :param top_p: Value sent as "top_p" in the request body
    :param bedrock_client: Optional pre-built client exposing ``invoke_model``.
        When omitted, ``boto3.client('bedrock-runtime')`` is created.
    :return: ``response['body']`` (the object callers ``.read()`` to obtain the
        JSON text) on success, or ``None`` when the request failed (a message
        describing the failure is printed)
    """
    # Build a default client only when the caller did not supply one
    if bedrock_client is None:
        bedrock_client = boto3.client('bedrock-runtime')

    request_body = json.dumps({
        "prompt": json_derulo,
        'top_p': top_p,
        'temperature': temperature,
        "max_gen_len": max_gen_len,
    })

    try:
        response = bedrock_client.invoke_model(
            modelId=model_id,
            body=request_body,
            contentType='application/json',
            accept="application/json",
        )
    except NoCredentialsError:
        print("AWS credentials not available.")
        return None
    except ClientError as error:
        # No local file is involved; report service-side failures directly
        print(f"Bedrock model invocation failed: {error}")
        return None

    print(response)
    return response['body']

In [480]:
# running API: send the assembled prompt S to the model.
# `out` receives whatever prompt_model returns (response['body'] on success).
out = prompt_model(S)

{'ResponseMetadata': {'RequestId': '99339dee-d1fa-4a0e-9bfd-bc7d015473fb', 'HTTPStatusCode': 200, 'HTTPHeaders': {'date': 'Wed, 25 Dec 2024 03:34:46 GMT', 'content-type': 'application/json', 'content-length': '114', 'connection': 'keep-alive', 'x-amzn-requestid': '99339dee-d1fa-4a0e-9bfd-bc7d015473fb', 'x-amzn-bedrock-invocation-latency': '454', 'x-amzn-bedrock-output-token-count': '5', 'x-amzn-bedrock-input-token-count': '357'}, 'RetryAttempts': 0}, 'contentType': 'application/json', 'body': <botocore.response.StreamingBody object at 0x0000023BD43CE650>}


In [481]:
# Read the response body and decode its bytes as UTF-8 text
raw_body = out.read()
body_content = raw_body.decode('utf-8')

# Turn the JSON text into Python objects
parsed_body = json.loads(body_content)

# Show the parsed result
print(parsed_body)

{'generation': 'Other.\nCategory:Other', 'prompt_token_count': 357, 'generation_token_count': 5, 'stop_reason': 'length'}


In [343]:
# Inspect the condensed receipt dictionary
D

{'prompt': 'You are an expert at receipt categorizationand analysis. I am sending you a receipt to categorize. The categories you can choose are one of the following six: Meals, Supplies, Safety, Travel, Lodging, Other.  Choose only one and format it like "Category: <your response here> ". Keep the response short and concise. Do not provide an explanation or analysis of the steps. Only return one category and then end.\nHere is the receipt\nReceipt:',
 'ADDRESS': 'Christopher S, Nevares\n17035 W VALLEY HWY\nSEATTLE, WA 98188-5519\nUnited States',
 'STREET': '17035 W VALLEY HWY',
 'CITY': 'SEATTLE,',
 'STATE': 'WA',
 'ZIP_CODE': '98188-5519',
 'COUNTRY': 'United States',
 'NAME': 'Christopher S, Nevares',
 'ADDRESS_BLOCK': '17035 W VALLEY HWY\nSEATTLE, WA 98188-5519\nUnited States',
 'INVOICE_RECEIPT_DATE': '8/13/24.',
 'INVOICE_RECEIPT_ID': '112-9546U10-U268240',
 'ORDER_DATE': 'August 13, 2024',
 'RECEIVER_ADDRESS': 'Christopher S, Nevares\n17035 W VALLEY HWY\nSEATTLE, WA 98188-5519\n

In [202]:
# Analyze a local file

def analyze_receipt_local(file_name, textract_client=None):
    """
    Run Textract expense analysis on a document read from local disk.

    The file's bytes are sent inline in the request (``Document={'Bytes': ...}``)
    rather than referencing an S3 object.

    :param file_name: Path to the local document to analyze
    :param textract_client: Optional pre-built Textract client (anything
        exposing ``analyze_expense``). When omitted,
        ``boto3.client('textract')`` is created.
    :return: the ``analyze_expense`` response dict on success, or ``None``
        when the request failed (a message describing the failure is printed)
    """
    # Build a default client only when the caller did not supply one
    if textract_client is None:
        textract_client = boto3.client('textract')

    try:
        # Read everything first so the file is closed before the network call
        with open(file_name, "rb") as document:
            document_bytes = document.read()
        response = textract_client.analyze_expense(
            Document={
                'Bytes': document_bytes
            }
        )
    except FileNotFoundError:
        print(f"The file {file_name} was not found.")
        return None
    except NoCredentialsError:
        print("AWS credentials not available.")
        return None
    except ClientError as error:
        # The service rejected the request; report it like the other failures
        print(f"Textract could not analyze {file_name}: {error}")
        return None

    print("Success!")
    return response

In [None]:
# Analyze the receipt straight from disk; the cell displays the returned value
file_name = 'data/Supplies1.jpg'
analyze_receipt_local(file_name)

In [144]:
# Read the response body and decode its bytes as UTF-8 text
response_bytes = out.read()
body_content = response_bytes.decode('utf-8')

# Turn the JSON text into Python objects
parsed_body = json.loads(body_content)

# Show the parsed result
print(parsed_body)

{'generation': 'PAYMENT_METHOD:Credit Card\nPAYMENT_STATUS:Paid\nPAYMENT_DATE:August 13, 2024\nPAYMENT_AMOUNT:$165.29\nPAYMENT_TYPE:Credit Card\nPAYMENT_METHOD:Credit Card\nPAYMENT_STATUS:Paid\nPAYMENT_DATE:August 13, 2024\nPAYMENT_AMOUNT:$165.29\nPAYMENT_TYPE:Credit Card\nPAYMENT_METHOD:Credit Card\nPAYMENT_STATUS:Paid\nPAYMENT_DATE:August 13, 2024\nPAYMENT_AMOUNT:$165.29\nPAYMENT_TYPE:Credit Card\nPAYMENT_METHOD:Credit Card\nPAYMENT_STATUS:Paid\nPAYMENT_DATE:August 13, 2024\nPAYMENT_AMOUNT:$165.29\nPAYMENT_TYPE:Credit Card\nPAYMENT_METHOD:Credit Card\nPAYMENT_STATUS:Paid\nPAYMENT_DATE:August 13, 2024\nPAYMENT_AMOUNT:$165.29\nPAYMENT_TYPE:Credit Card\nPAYMENT_METHOD:Credit Card\nPAYMENT_STATUS:Paid\nPAYMENT_DATE:August 13, 2024\nPAYMENT_AMOUNT:$165.29\nPAYMENT_TYPE:Credit Card\nPAYMENT_METHOD:Credit Card\nPAYMENT_STATUS:Paid\nPAYMENT_DATE:August 13, 2024\nPAYMENT_AMOUNT:$165.29\nPAYMENT_TYPE:Credit Card\nPAYMENT_METHOD:Credit Card\nPAYMENT_STATUS:Paid\nPAYMENT_DATE:August 13, 2024\nPA

In [146]:
# Display the parsed model response
parsed_body

{'generation': 'PAYMENT_METHOD:Credit Card\nPAYMENT_STATUS:Paid\nPAYMENT_DATE:August 13, 2024\nPAYMENT_AMOUNT:$165.29\nPAYMENT_TYPE:Credit Card\nPAYMENT_METHOD:Credit Card\nPAYMENT_STATUS:Paid\nPAYMENT_DATE:August 13, 2024\nPAYMENT_AMOUNT:$165.29\nPAYMENT_TYPE:Credit Card\nPAYMENT_METHOD:Credit Card\nPAYMENT_STATUS:Paid\nPAYMENT_DATE:August 13, 2024\nPAYMENT_AMOUNT:$165.29\nPAYMENT_TYPE:Credit Card\nPAYMENT_METHOD:Credit Card\nPAYMENT_STATUS:Paid\nPAYMENT_DATE:August 13, 2024\nPAYMENT_AMOUNT:$165.29\nPAYMENT_TYPE:Credit Card\nPAYMENT_METHOD:Credit Card\nPAYMENT_STATUS:Paid\nPAYMENT_DATE:August 13, 2024\nPAYMENT_AMOUNT:$165.29\nPAYMENT_TYPE:Credit Card\nPAYMENT_METHOD:Credit Card\nPAYMENT_STATUS:Paid\nPAYMENT_DATE:August 13, 2024\nPAYMENT_AMOUNT:$165.29\nPAYMENT_TYPE:Credit Card\nPAYMENT_METHOD:Credit Card\nPAYMENT_STATUS:Paid\nPAYMENT_DATE:August 13, 2024\nPAYMENT_AMOUNT:$165.29\nPAYMENT_TYPE:Credit Card\nPAYMENT_METHOD:Credit Card\nPAYMENT_STATUS:Paid\nPAYMENT_DATE:August 13, 2024\nPA

In [108]:
# Display the current prompt text
S

'prompt: Classify this receipt between these categories Meals, Supplies, Safety, Travel, Lodging, Other. \nReceipt:ADDRESS:Christopher S, Nevares\n17035 W VALLEY HWY\nSEATTLE, WA 98188-5519\nUnited States\nSTREET:17035 W VALLEY HWY\nCITY:SEATTLE,\nSTATE:WA\nZIP_CODE:98188-5519\nCOUNTRY:United States\nNAME:Christopher S, Nevares\nADDRESS_BLOCK:17035 W VALLEY HWY\nSEATTLE, WA 98188-5519\nUnited States\nINVOICE_RECEIPT_DATE:8/13/24.\nINVOICE_RECEIPT_ID:112-9546U10-U268240\nORDER_DATE:August 13, 2024\nRECEIVER_ADDRESS:Christopher S, Nevares\n17035 W VALLEY HWY\nSEATTLE, WA 98188-5519\nUnited States\nRECEIVER_NAME:Christopher S Nevares\nSHIPPING_HANDLING_CHARGE:$0.00\nSUBTOTAL:$149.99\nTAX:$15.30\nTOTAL:$165.29\nVENDOR_NAME:Amazon.com Inc.\nOTHER:Amazon.com\nServices, Inc\nCategory:'

In [63]:
# Check what kind of object S currently is
type(S)

str

In [64]:
# NOTE(review): {S} is a set literal holding the prompt string, not a dict.
# prompt_model passes its argument to json.dumps, which cannot serialize a
# set, so S must be a plain string again before calling it. Confirm whether
# this experiment cell is still needed.
S = {S}

In [71]:
# Display S as it currently stands
S

{'prompt:Classify this receipt between these categories Meals, Supplies, Safety, Travel, Lodging, Other\nADDRESS:Christopher S, Nevares\n17035 W VALLEY HWY\nSEATTLE, WA 98188-5519\nUnited States\nSTREET:17035 W VALLEY HWY\nCITY:SEATTLE,\nSTATE:WA\nZIP_CODE:98188-5519\nCOUNTRY:United States\nNAME:Christopher S, Nevares\nADDRESS_BLOCK:17035 W VALLEY HWY\nSEATTLE, WA 98188-5519\nUnited States\nINVOICE_RECEIPT_DATE:8/13/24.\nINVOICE_RECEIPT_ID:112-9546U10-U268240\nORDER_DATE:August 13, 2024\nRECEIVER_ADDRESS:Christopher S, Nevares\n17035 W VALLEY HWY\nSEATTLE, WA 98188-5519\nUnited States\nRECEIVER_NAME:Christopher S Nevares\nSHIPPING_HANDLING_CHARGE:$0.00\nSUBTOTAL:$149.99\nTAX:$15.30\nTOTAL:$165.29\nVENDOR_NAME:Amazon.com Inc.\nOTHER:Amazon.com\nServices, Inc\n'}

In [78]:
# Inspect the object currently bound to `out`
out

{'ResponseMetadata': {'RequestId': '081bc70a-0ffb-4f93-aa10-2dfc56679301',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'date': 'Wed, 25 Dec 2024 01:59:19 GMT',
   'content-type': 'application/json',
   'content-length': '2130',
   'connection': 'keep-alive',
   'x-amzn-requestid': '081bc70a-0ffb-4f93-aa10-2dfc56679301',
   'x-amzn-bedrock-invocation-latency': '6157',
   'x-amzn-bedrock-output-token-count': '512',
   'x-amzn-bedrock-input-token-count': '255'},
  'RetryAttempts': 0},
 'contentType': 'application/json',
 'body': <botocore.response.StreamingBody at 0x23bd3a20880>}