In [22]:
import logging
import boto3
from botocore.exceptions import ClientError
from botocore.exceptions import NoCredentialsError
import os
import json
from datetime import datetime
import numpy as np
import pandas as pd

In [2]:
# Uploading a new file to S3

def upload_file_to_s3(file_name, bucket_name, object_name=None):
    """
    Uploads a file to an S3 bucket.
    
    :param file_name: Path to the file to upload
    :param bucket_name: Name of the S3 bucket
    :param object_name: S3 object name. If not specified, file_name is used
    :return: a string of the response
    """
    # If S3 object_name was not specified, use file_name
    if object_name is None:
        object_name = file_name

    # Initialize the S3 client
    s3 = boto3.client('s3')

    
    try:
        with open(file_name, "rb") as file_data: # Uploading the FILE CONTENTS not the filepath
            response = s3.put_object(
                Body=file_data,
                Bucket=bucket_name,
                Key=object_name,                # This is the what the file will be called in S3
            )
        s = response
        print(response)
        return s
    except FileNotFoundError:
        print(f"The file {file_name} was not found.")
    except NoCredentialsError:
        print("AWS credentials not available.")



In [3]:
# Use your own bucket and file name
bucket_name = "test-bucket-cnevares-2024"
file_name = "data/Hotel1.jpg"
object_name = file_name.split('/')[1]


In [5]:
s = upload_file_to_s3(file_name, bucket_name, object_name=object_name)

{'ResponseMetadata': {'RequestId': 'SEJE5A8ZXCK81A76', 'HostId': 'je+us/i+xtewohEJNUIldAH4ZpVIxVG2keqEG1MgZ7cvpLflbSI4hEP6vuBSiHbF7rUksQ+88Os=', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amz-id-2': 'je+us/i+xtewohEJNUIldAH4ZpVIxVG2keqEG1MgZ7cvpLflbSI4hEP6vuBSiHbF7rUksQ+88Os=', 'x-amz-request-id': 'SEJE5A8ZXCK81A76', 'date': 'Sun, 29 Dec 2024 02:25:09 GMT', 'x-amz-server-side-encryption': 'AES256', 'etag': '"7bc5adc4783acdc0f4727423a96fd3d2"', 'content-length': '0', 'server': 'AmazonS3'}, 'RetryAttempts': 0}, 'ETag': '"7bc5adc4783acdc0f4727423a96fd3d2"', 'ServerSideEncryption': 'AES256'}


In [6]:
# Analyze a receipt in an S3 bucket

def analyze_receipt(bucket_name, object_name):
    """
    :param bucket_name: Name of the S3 bucket
    :param object_name: S3 object name
    :return: string of the response
    """
    
    client = boto3.client('textract')

    try:
        response = client.analyze_expense(
            
            Document = {
                "S3Object": {
                    "Bucket": bucket_name,
                    "Name": object_name
                }
            }
        )
        s = response
        print("Success!")
        return s
        
    except FileNotFoundError:
        print(f"The file {file_name} was not found.")
    except NoCredentialsError:
        print("AWS credentials not available.")

In [7]:
text_extract = analyze_receipt(bucket_name, object_name)

Success!


In [8]:
## The Fields detected and their values
for i in range(len(text_extract['ExpenseDocuments'][0]['SummaryFields'])):
    print(text_extract['ExpenseDocuments'][0]['SummaryFields'][i]['Type']['Text'] +": " + text_extract['ExpenseDocuments'][0]['SummaryFields'][i]['ValueDetection']['Text'] )
    print()
print()

ADDRESS: Super 8 by Wyndham ewistown
102 Wendell Ave
Lewistown MT 59457
United States

STREET: 102 Wendell Ave

CITY: Lewistown

STATE: MT

ZIP_CODE: 59457

COUNTRY: United States

ADDRESS_BLOCK: 102 Wendell Ave
Lewistown MT 59457
United States

INVOICE_RECEIPT_DATE: 12/20/2024

INVOICE_RECEIPT_ID: H9970857

RECEIVER_NAME: 

TAX: $47.68 USD

TOTAL: $514.01 USD

VENDOR_ADDRESS: Super 8 by Wyndham ewistown
102 Wendell Ave
Lewistown MT 59457
United States

VENDOR_URL: reserve1 hatelguides.com/Res

OTHER: H9970857

OTHER: Confirmed

OTHER: 1 room for 3 nights

OTHER: December 17 2024 (Tue)

OTHER: December 20. 2024 (Fri)

OTHER: Super

OTHER: 86241EE026909

OTHER: Confirmed

OTHER: 1 adult 0 children

OTHER: 1 King Bed (Room Only)

OTHER: $123.66 USD




In [17]:
# Condensing response into only useful(ish) values

condensed_extract = {}
exclude = []
for i in range(len(text_extract['ExpenseDocuments'][0]['SummaryFields'])):
    
    key = text_extract['ExpenseDocuments'][0]['SummaryFields'][i]['Type']['Text']
    value = text_extract['ExpenseDocuments'][0]['SummaryFields'][i]['ValueDetection']['Text']
    if key not in condensed_extract.keys():
        condensed_extract[key] = value

    else:
        temp = " " + value
        condensed_extract[key] +
if len(text_extract['ExpenseDocuments'][0]['LineItemGroups'][0]['LineItems'])> 0:
    condensed_extract['items'] = {}
    for j in range(len(text_extract['ExpenseDocuments'][0]['LineItemGroups'][0]['LineItems'][0]['LineItemExpenseFields'])):
        value = text_extract['ExpenseDocuments'][0]['LineItemGroups'][0]['LineItems'][0]['LineItemExpenseFields'][j]['ValueDetection']['Text']
        condensed_extract['items']['item'+str(j)] = value

print(condensed_extract)

{'ADDRESS': 'Super 8 by Wyndham ewistown\n102 Wendell Ave\nLewistown MT 59457\nUnited States', 'STREET': '102 Wendell Ave', 'CITY': 'Lewistown', 'STATE': 'MT', 'ZIP_CODE': '59457', 'COUNTRY': 'United States', 'ADDRESS_BLOCK': '102 Wendell Ave\nLewistown MT 59457\nUnited States', 'INVOICE_RECEIPT_DATE': '12/20/2024', 'INVOICE_RECEIPT_ID': 'H9970857', 'RECEIVER_NAME': '', 'TAX': '$47.68 USD', 'TOTAL': '$514.01 USD', 'VENDOR_ADDRESS': 'Super 8 by Wyndham ewistown\n102 Wendell Ave\nLewistown MT 59457\nUnited States', 'VENDOR_URL': 'reserve1 hatelguides.com/Res', 'OTHER': 'H9970857 Confirmed 1 room for 3 nights December 17 2024 (Tue) December 20. 2024 (Fri) Super 86241EE026909 Confirmed 1 adult 0 children 1 King Bed (Room Only) $123.66 USD'}


In [18]:
condensed_extract['OTHER']

'H9970857 Confirmed 1 room for 3 nights December 17 2024 (Tue) December 20. 2024 (Fri) Super 86241EE026909 Confirmed 1 adult 0 children 1 King Bed (Room Only) $123.66 USD'

In [19]:
len(text_extract['ExpenseDocuments'][0]['LineItemGroups'][0]['LineItems'])> 0

False

In [20]:
# formatting for prompt

prompt = '''
You are an expert in receipt categorization. Categorize the following receipt into one of these categories: Meals, Supplies, Safety, Travel, Lodging, or Other. 
Respond in the format: "Category:<category>."
Do not include explanations, steps, or any additional text.
Receipt:

'''
for key in condensed_extract.keys():
    if key == 'items':
        prompt+=key +":\n"
        for k in condensed_extract['items'].keys():
            prompt+= k + ":" + condensed_extract['items'][k].replace('\n',' ') +'\n'
    else:
        prompt += key +":"+condensed_extract[key]+"\n"
prompt+="Category:"

print(prompt)


You are an expert in receipt categorization. Categorize the following receipt into one of these categories: Meals, Supplies, Safety, Travel, Lodging, or Other. 
Respond in the format: "Category:<category>."
Do not include explanations, steps, or any additional text.
Receipt:

ADDRESS:Super 8 by Wyndham ewistown
102 Wendell Ave
Lewistown MT 59457
United States
STREET:102 Wendell Ave
CITY:Lewistown
STATE:MT
ZIP_CODE:59457
COUNTRY:United States
ADDRESS_BLOCK:102 Wendell Ave
Lewistown MT 59457
United States
INVOICE_RECEIPT_DATE:12/20/2024
INVOICE_RECEIPT_ID:H9970857
RECEIVER_NAME:
TAX:$47.68 USD
TOTAL:$514.01 USD
VENDOR_ADDRESS:Super 8 by Wyndham ewistown
102 Wendell Ave
Lewistown MT 59457
United States
VENDOR_URL:reserve1 hatelguides.com/Res
OTHER:H9970857 Confirmed 1 room for 3 nights December 17 2024 (Tue) December 20. 2024 (Fri) Super 86241EE026909 Confirmed 1 adult 0 children 1 King Bed (Room Only) $123.66 USD
Category:


In [21]:
# Prompting the model

def prompt_model_titan_express(json_derulo):
    client = boto3.client('bedrock-runtime')
    try:
        response = client.invoke_model(
            modelId = 'amazon.titan-text-lite-v1',
            contentType = 'application/json',
            accept = "application/json",
            body = json.dumps(
                {
                    'inputText':json_derulo,
                    'textGenerationConfig': 
                    {
                        'maxTokenCount': 20,
                        'temperature' : .5,
                        'topP':.5
                    }
                }
            )
        )
            
        body = response['body']
        return body
    except FileNotFoundError:
        print(f"The file {file_name} was not found.")
    except NoCredentialsError:
        print("AWS credentials not available.")

out = prompt_model_titan_express(prompt)

# Printing the response
body_content = out.read().decode('utf-8')

# Parse the JSON content
parsed_body = json.loads(body_content)

# Print the parsed body
print(parsed_body['results'][0]['outputText'])

 Lodging


In [188]:
# This is to reformat dates extracted into a consistent format. uccrently 
def reformat_date(date_string):
    # List of potential input formats
    input_formats = ["%m/%d/%y", "%m/%d/%Y", "%m/%-d/%y", "%m/%-d/%Y", "%-m/%d/%y", "%-m/%d/%Y", "%B %d %Y", "%b %d %Y",
    ]
    
    # Try parsing with each format
    for fmt in input_formats:
        try:
            date_object = datetime.strptime(date_string, fmt)
            break
        except ValueError:
            continue
    else:
        raise ValueError(f"Date format not recognized: {date_string}")
    
    # Format to "mm/dd/yyyy"
    date_object = date_object.strftime("%m/%d/%Y")
    return datetime.strptime(date_object, "%m/%d/%Y").date() #return date portion of datetime object

In [179]:
reformatted_date

datetime.date(2024, 8, 13)

In [180]:
date = condensed_extract['INVOICE_RECEIPT_DATE']
date = date.translate(str.maketrans('', '','.,'))
reformatted_date = reformat_date(date)


month = reformatted_date.month  # 12
day = reformatted_date.day      # 25
year = reformatted_date.year    # 2024

print(f"Month: {month}, Day: {day}, Year: {year}")

Month: 8, Day: 13, Year: 2024


In [182]:
total = condensed_extract['TOTAL']
category = parsed_body['results'][0]['outputText']
print("Extracted details: ")
print(f'Amount: {total}\nDate:\n\tMonth: {month}\n\tDay: {day}\n\tYear: {year}\nCategory: {category}')

Extracted details: 
Amount: $165.29
Date:
	Month: 8
	Day: 13
	Year: 2024
Category:  Other


In [154]:
# Prompting the a llama model

def prompt_model_llama(json_derulo):
    client = boto3.client('bedrock-runtime')
    try:
        response = client.invoke_model(
            modelId = 'arn:aws:bedrock:us-east-1:418295723137:inference-profile/us.meta.llama3-1-8b-instruct-v1:0',
            body = json.dumps({"prompt":json_derulo, 'top_p': 1, 'temperature': 1, "max_gen_len":20}),
            
            contentType = 'application/json',
            accept = "application/json",
            
        )
        s = response
        print(response)
        return response['body']
    except FileNotFoundError:
        print(f"The file {file_name} was not found.")
    except NoCredentialsError:
        print("AWS credentials not available.")

out = prompt_model_llama(S)

# Printing the response
body_content = out.read().decode('utf-8')

# Parse the JSON content
parsed_body = json.loads(body_content)

# Print the parsed body
print(parsed_body)

NameError: name 'S' is not defined

In [43]:
print("{\"inputText\":\"this is where you place your input text\",\"textGenerationConfig\":{\"maxTokenCount\":8192,\"stopSequences\":[],\"temperature\":0,\"topP\":1}}")

{"inputText":"this is where you place your input text","textGenerationConfig":{"maxTokenCount":8192,"stopSequences":[],"temperature":0,"topP":1}}


In [202]:
# Analyze a local file

def analyze_receipt_local(file_name): 
    client = boto3.client('textract')
    

    try:
        with open(file_name, "rb") as document:
            response = client.analyze_expense(
                Document={
                    'Bytes': document.read()
                }
        )
        s = response
        print("Success!")
        return s
        
    except FileNotFoundError:
        print(f"The file {file_name} was not found.")
    except NoCredentialsError:
        print("AWS credentials not available.")

In [None]:
file_name = 'data/Supplies1.jpg'
analyze_receipt_local("data/Supplies1.jpg")

In [144]:
body_content = out.read().decode('utf-8')

# Parse the JSON content
parsed_body = json.loads(body_content)

# Print the parsed body
print(parsed_body)

{'generation': 'PAYMENT_METHOD:Credit Card\nPAYMENT_STATUS:Paid\nPAYMENT_DATE:August 13, 2024\nPAYMENT_AMOUNT:$165.29\nPAYMENT_TYPE:Credit Card\nPAYMENT_METHOD:Credit Card\nPAYMENT_STATUS:Paid\nPAYMENT_DATE:August 13, 2024\nPAYMENT_AMOUNT:$165.29\nPAYMENT_TYPE:Credit Card\nPAYMENT_METHOD:Credit Card\nPAYMENT_STATUS:Paid\nPAYMENT_DATE:August 13, 2024\nPAYMENT_AMOUNT:$165.29\nPAYMENT_TYPE:Credit Card\nPAYMENT_METHOD:Credit Card\nPAYMENT_STATUS:Paid\nPAYMENT_DATE:August 13, 2024\nPAYMENT_AMOUNT:$165.29\nPAYMENT_TYPE:Credit Card\nPAYMENT_METHOD:Credit Card\nPAYMENT_STATUS:Paid\nPAYMENT_DATE:August 13, 2024\nPAYMENT_AMOUNT:$165.29\nPAYMENT_TYPE:Credit Card\nPAYMENT_METHOD:Credit Card\nPAYMENT_STATUS:Paid\nPAYMENT_DATE:August 13, 2024\nPAYMENT_AMOUNT:$165.29\nPAYMENT_TYPE:Credit Card\nPAYMENT_METHOD:Credit Card\nPAYMENT_STATUS:Paid\nPAYMENT_DATE:August 13, 2024\nPAYMENT_AMOUNT:$165.29\nPAYMENT_TYPE:Credit Card\nPAYMENT_METHOD:Credit Card\nPAYMENT_STATUS:Paid\nPAYMENT_DATE:August 13, 2024\nPA

In [3]:
import os
from pdf2image import convert_from_path
from PIL import Image

def convert_to_jpg_standardize_dpi(input_folder, output_folder, dpi=(300, 300)):
    os.makedirs(output_folder, exist_ok=True)
    j = 0

    for filename in os.listdir(input_folder):
        path = os.path.join(input_folder, filename)
        
        if filename.lower().endswith(".pdf"):
            # Convert PDF to a list of images (one image per page)
            pages = convert_from_path(path, dpi=dpi[0])  # DPI is passed to pdf2image

            # Combine all pages into a single image
            widths, heights = zip(*(page.size for page in pages))
            total_width = max(widths)
            total_height = sum(heights)
            
            combined_image = Image.new("RGB", (total_width, total_height))
            y_offset = 0

            for page in pages:
                combined_image.paste(page, (0, y_offset))
                y_offset += page.size[1]

            # Save the combined image
            output_filename = os.path.join(output_folder, f"{os.path.splitext(filename)[0]}.jpg")
            combined_image.save(output_filename, "JPEG", dpi=dpi)
            j += 1

        else:
            # For non-PDF images, save them directly with standardized DPI
            image = Image.open(path)
            output_path = os.path.join(output_folder, filename)
            image.save(output_path, "JPEG", dpi=dpi)
            j += 1

    print(f"Saved {j} images in {output_folder}")

In [2]:
pip install pdf2image

Collecting pdf2image
  Using cached pdf2image-1.17.0-py3-none-any.whl.metadata (6.2 kB)
Using cached pdf2image-1.17.0-py3-none-any.whl (11 kB)
Installing collected packages: pdf2image
Successfully installed pdf2image-1.17.0
Note: you may need to restart the kernel to use updated packages.


In [8]:
convert_to_jpg_standardize_dpi('data/pdf/', 'data')

Saved 3 images in data
