In [155]:
import logging
import boto3
from botocore.exceptions import ClientError
from botocore.exceptions import NoCredentialsError
import os
import json
from datetime import datetime

In [4]:
# Uploading a new file to S3

def upload_file_to_s3(file_name, bucket_name, object_name=None):
    """
    Uploads a file to an S3 bucket.
    
    :param file_name: Path to the file to upload
    :param bucket_name: Name of the S3 bucket
    :param object_name: S3 object name. If not specified, file_name is used
    :return: a string of the response
    """
    # If S3 object_name was not specified, use file_name
    if object_name is None:
        object_name = file_name

    # Initialize the S3 client
    s3 = boto3.client('s3')

    
    try:
        with open(file_name, "rb") as file_data: # Uploading the FILE CONTENTS not the filepath
            response = s3.put_object(
                Body=file_data,
                Bucket=bucket_name,
                Key=object_name,                # This is the what the file will be called in S3
            )
        s = response
        print(response)
        return s
    except FileNotFoundError:
        print(f"The file {file_name} was not found.")
    except NoCredentialsError:
        print("AWS credentials not available.")



In [27]:
# Use your own bucket and file name
bucket_name = "test-bucket-cnevares-2024"
file_name = "data/Supplies1.jpg"
object_name = file_name.split('/')[1]


In [41]:
s = upload_file_to_s3(file_name, bucket_name, object_name=object_name)

{'ResponseMetadata': {'RequestId': 'Y4DCWGNNTRMR584E', 'HostId': 'w4nT/+4EvmEQAyzFqzTpOZUg2llkdgxxM+dynHVC0zMJPmZURiNNm8eQbnrrYqK3rjlaj1YPA6Rw4JCSmQgyjKSUKSwY/P3jeT7Wo0vyoBc=', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amz-id-2': 'w4nT/+4EvmEQAyzFqzTpOZUg2llkdgxxM+dynHVC0zMJPmZURiNNm8eQbnrrYqK3rjlaj1YPA6Rw4JCSmQgyjKSUKSwY/P3jeT7Wo0vyoBc=', 'x-amz-request-id': 'Y4DCWGNNTRMR584E', 'date': 'Fri, 27 Dec 2024 03:41:57 GMT', 'x-amz-server-side-encryption': 'AES256', 'etag': '"39600dbbc987bf035ed63c46502db278"', 'content-length': '0', 'server': 'AmazonS3'}, 'RetryAttempts': 0}, 'ETag': '"39600dbbc987bf035ed63c46502db278"', 'ServerSideEncryption': 'AES256'}


In [121]:
object_name = 'Supplies1.jpg'

In [122]:
# Analyze a receipt in an S3 bucket

def analyze_receipt(bucket_name, object_name):
    """
    :param bucket_name: Name of the S3 bucket
    :param object_name: S3 object name
    :return: string of the response
    """
    
    client = boto3.client('textract')

    try:
        response = client.analyze_expense(
            
            Document = {
                "S3Object": {
                    "Bucket": bucket_name,
                    "Name": object_name
                }
            }
        )
        s = response
        print("Success!")
        return s
        
    except FileNotFoundError:
        print(f"The file {file_name} was not found.")
    except NoCredentialsError:
        print("AWS credentials not available.")

In [123]:
text_extract = analyze_receipt(bucket_name, object_name)

Success!


In [124]:
## The Fields detected and their values
for i in range(len(text_extract['ExpenseDocuments'][0]['SummaryFields'])):
    print(text_extract['ExpenseDocuments'][0]['SummaryFields'][i]['Type']['Text'] +": " + text_extract['ExpenseDocuments'][0]['SummaryFields'][i]['ValueDetection']['Text'] )
    print()
print()

INVOICE_RECEIPT_DATE: 8/13/24,

INVOICE_RECEIPT_ID: 112-9546U10-U268240

INVOICE_RECEIPT_ID: #112-9546010-0268240

ORDER_DATE: August 13, 2024

SHIPPING_HANDLING_CHARGE: $0.00

SUBTOTAL: $149.99

SUBTOTAL: $149.99

TAX: $15.30

TOTAL: $165.29

TOTAL: $165.29

OTHER: 112-9546010-0268240

OTHER: Amazon.com
Services, Inc

OTHER: Other

OTHER: New

OTHER: FREE Prime Delivery

OTHER: Payment information
Payment Method:
Item(s) Subtotal: $149.99
Shipping & Handling: $0.00
Total before tax: $149.99
Estimated tax to be collected: $15.30
Grand Total: $165.29
To view the status of your order, return to Order Summary.
Conditions of Use Privacy Notice © 1996-2024, Amazon.com Inc. or its affiliates

OTHER: Item(s) Subtotal: $149.99
Shipping & Handling: $0.00
Total before tax: $149.99
Estimated tax to be collected: $15.30




In [125]:
# Condensing response into only useful(ish) values

condensed_extract = {}
exclude = []
for i in range(len(text_extract['ExpenseDocuments'][0]['SummaryFields'])):
    
    key = text_extract['ExpenseDocuments'][0]['SummaryFields'][i]['Type']['Text']
    value = text_extract['ExpenseDocuments'][0]['SummaryFields'][i]['ValueDetection']['Text']
    if key not in condensed_extract.keys() and key not in exclude:
        condensed_extract[key] = value



if len(text_extract['ExpenseDocuments'][0]['LineItemGroups'][0]['LineItems'])> 0:
    condensed_extract['items'] = {}
    for j in range(len(text_extract['ExpenseDocuments'][0]['LineItemGroups'][0]['LineItems'][0]['LineItemExpenseFields'])):
        value = text_extract['ExpenseDocuments'][0]['LineItemGroups'][0]['LineItems'][0]['LineItemExpenseFields'][j]['ValueDetection']['Text']
        condensed_extract['items']['item'+str(j)] = value

print(condensed_extract)

{'INVOICE_RECEIPT_DATE': '8/13/24,', 'INVOICE_RECEIPT_ID': '112-9546U10-U268240', 'ORDER_DATE': 'August 13, 2024', 'SHIPPING_HANDLING_CHARGE': '$0.00', 'SUBTOTAL': '$149.99', 'TAX': '$15.30', 'TOTAL': '$165.29', 'OTHER': '112-9546010-0268240', 'items': {'item0': "1 Sold Free of: by: Mixer Elgato Amazon.com Wave: Software, 3 Services, - Sound Inc Premium Effect Studio Plugins, Quality USB Anti-Distortion, Condenser Plug 'n Microphone Play, for for Mac, PC Streaming, Podcast, Gaming and Home Office,", 'item1': '$149.99', 'item2': "1 Sold Free of: by: Mixer Elgato Amazon.com Wave: Software, 3 Services, - Sound Inc Premium Effect Studio Plugins, Quality USB Anti-Distortion, Condenser Plug 'n Microphone Play, for for Mac, PC Streaming, Podcast, Gaming and Home Office, $149.99\nSupplied by: Other"}}


In [126]:
len(text_extract['ExpenseDocuments'][0]['LineItemGroups'][0]['LineItems'])> 0

True

In [127]:
# formatting for prompt

prompt = '''
You are an expert in receipt categorization. Categorize the following receipt into one of these categories: Meals, Supplies, Safety, Travel, Lodging, or Other. 
Respond in the format: "Category:<category>."
Do not include explanations, steps, or any additional text.
Receipt:

'''
for key in condensed_extract.keys():
    if key == 'items':
        prompt+=key +":\n"
        for k in condensed_extract['items'].keys():
            prompt+= k + ":" + condensed_extract['items'][k].replace('\n',' ') +'\n'
    else:
        prompt += key +":"+condensed_extract[key]+"\n"
prompt+="Category:"

print(prompt)


You are an expert in receipt categorization. Categorize the following receipt into one of these categories: Meals, Supplies, Safety, Travel, Lodging, or Other. 
Respond in the format: "Category:<category>."
Do not include explanations, steps, or any additional text.
Receipt:

INVOICE_RECEIPT_DATE:8/13/24,
INVOICE_RECEIPT_ID:112-9546U10-U268240
ORDER_DATE:August 13, 2024
SHIPPING_HANDLING_CHARGE:$0.00
SUBTOTAL:$149.99
TAX:$15.30
TOTAL:$165.29
OTHER:112-9546010-0268240
items:
item0:1 Sold Free of: by: Mixer Elgato Amazon.com Wave: Software, 3 Services, - Sound Inc Premium Effect Studio Plugins, Quality USB Anti-Distortion, Condenser Plug 'n Microphone Play, for for Mac, PC Streaming, Podcast, Gaming and Home Office,
item1:$149.99
item2:1 Sold Free of: by: Mixer Elgato Amazon.com Wave: Software, 3 Services, - Sound Inc Premium Effect Studio Plugins, Quality USB Anti-Distortion, Condenser Plug 'n Microphone Play, for for Mac, PC Streaming, Podcast, Gaming and Home Office, $149.99 Supplied

In [187]:
# Prompting the model

def prompt_model_titan_express(json_derulo):
    client = boto3.client('bedrock-runtime')
    try:
        response = client.invoke_model(
            modelId = 'amazon.titan-text-lite-v1',
            contentType = 'application/json',
            accept = "application/json",
            body = json.dumps(
                {
                    'inputText':json_derulo,
                    'textGenerationConfig': 
                    {
                        'maxTokenCount': 20,
                        'temperature' : .5,
                        'topP':.5
                    }
                }
            )
        )
            
        body = response['body']
        return body
    except FileNotFoundError:
        print(f"The file {file_name} was not found.")
    except NoCredentialsError:
        print("AWS credentials not available.")

out = prompt_model_titan_express(prompt)

# Printing the response
body_content = out.read().decode('utf-8')

# Parse the JSON content
parsed_body = json.loads(body_content)

# Print the parsed body
print(parsed_body['results'][0]['outputText'])

 Supplies


In [188]:
# This is to reformat dates extracted into a consistent format. uccrently 
def reformat_date(date_string):
    # List of potential input formats
    input_formats = ["%m/%d/%y", "%m/%d/%Y", "%m/%-d/%y", "%m/%-d/%Y", "%-m/%d/%y", "%-m/%d/%Y", "%B %d %Y", "%b %d %Y",
    ]
    
    # Try parsing with each format
    for fmt in input_formats:
        try:
            date_object = datetime.strptime(date_string, fmt)
            break
        except ValueError:
            continue
    else:
        raise ValueError(f"Date format not recognized: {date_string}")
    
    # Format to "mm/dd/yyyy"
    date_object = date_object.strftime("%m/%d/%Y")
    return datetime.strptime(date_object, "%m/%d/%Y").date() #return date portion of datetime object

In [179]:
reformatted_date

datetime.date(2024, 8, 13)

In [180]:
date = condensed_extract['INVOICE_RECEIPT_DATE']
date = date.translate(str.maketrans('', '','.,'))
reformatted_date = reformat_date(date)


month = reformatted_date.month  # 12
day = reformatted_date.day      # 25
year = reformatted_date.year    # 2024

print(f"Month: {month}, Day: {day}, Year: {year}")

Month: 8, Day: 13, Year: 2024


In [182]:
total = condensed_extract['TOTAL']
category = parsed_body['results'][0]['outputText']
print("Extracted details: ")
print(f'Amount: {total}\nDate:\n\tMonth: {month}\n\tDay: {day}\n\tYear: {year}\nCategory: {category}')

Extracted details: 
Amount: $165.29
Date:
	Month: 8
	Day: 13
	Year: 2024
Category:  Other


In [154]:
# Prompting the a llama model

def prompt_model_llama(json_derulo):
    client = boto3.client('bedrock-runtime')
    try:
        response = client.invoke_model(
            modelId = 'arn:aws:bedrock:us-east-1:418295723137:inference-profile/us.meta.llama3-1-8b-instruct-v1:0',
            body = json.dumps({"prompt":json_derulo, 'top_p': 1, 'temperature': 1, "max_gen_len":20}),
            
            contentType = 'application/json',
            accept = "application/json",
            
        )
        s = response
        print(response)
        return response['body']
    except FileNotFoundError:
        print(f"The file {file_name} was not found.")
    except NoCredentialsError:
        print("AWS credentials not available.")

out = prompt_model_llama(S)

# Printing the response
body_content = out.read().decode('utf-8')

# Parse the JSON content
parsed_body = json.loads(body_content)

# Print the parsed body
print(parsed_body)

NameError: name 'S' is not defined

In [43]:
print("{\"inputText\":\"this is where you place your input text\",\"textGenerationConfig\":{\"maxTokenCount\":8192,\"stopSequences\":[],\"temperature\":0,\"topP\":1}}")

{"inputText":"this is where you place your input text","textGenerationConfig":{"maxTokenCount":8192,"stopSequences":[],"temperature":0,"topP":1}}


In [202]:
# Analyze a local file

def analyze_receipt_local(file_name): 
    client = boto3.client('textract')
    

    try:
        with open(file_name, "rb") as document:
            response = client.analyze_expense(
                Document={
                    'Bytes': document.read()
                }
        )
        s = response
        print("Success!")
        return s
        
    except FileNotFoundError:
        print(f"The file {file_name} was not found.")
    except NoCredentialsError:
        print("AWS credentials not available.")

In [None]:
file_name = 'data/Supplies1.jpg'
analyze_receipt_local("data/Supplies1.jpg")

In [144]:
body_content = out.read().decode('utf-8')

# Parse the JSON content
parsed_body = json.loads(body_content)

# Print the parsed body
print(parsed_body)

{'generation': 'PAYMENT_METHOD:Credit Card\nPAYMENT_STATUS:Paid\nPAYMENT_DATE:August 13, 2024\nPAYMENT_AMOUNT:$165.29\nPAYMENT_TYPE:Credit Card\nPAYMENT_METHOD:Credit Card\nPAYMENT_STATUS:Paid\nPAYMENT_DATE:August 13, 2024\nPAYMENT_AMOUNT:$165.29\nPAYMENT_TYPE:Credit Card\nPAYMENT_METHOD:Credit Card\nPAYMENT_STATUS:Paid\nPAYMENT_DATE:August 13, 2024\nPAYMENT_AMOUNT:$165.29\nPAYMENT_TYPE:Credit Card\nPAYMENT_METHOD:Credit Card\nPAYMENT_STATUS:Paid\nPAYMENT_DATE:August 13, 2024\nPAYMENT_AMOUNT:$165.29\nPAYMENT_TYPE:Credit Card\nPAYMENT_METHOD:Credit Card\nPAYMENT_STATUS:Paid\nPAYMENT_DATE:August 13, 2024\nPAYMENT_AMOUNT:$165.29\nPAYMENT_TYPE:Credit Card\nPAYMENT_METHOD:Credit Card\nPAYMENT_STATUS:Paid\nPAYMENT_DATE:August 13, 2024\nPAYMENT_AMOUNT:$165.29\nPAYMENT_TYPE:Credit Card\nPAYMENT_METHOD:Credit Card\nPAYMENT_STATUS:Paid\nPAYMENT_DATE:August 13, 2024\nPAYMENT_AMOUNT:$165.29\nPAYMENT_TYPE:Credit Card\nPAYMENT_METHOD:Credit Card\nPAYMENT_STATUS:Paid\nPAYMENT_DATE:August 13, 2024\nPA