In [108]:
import logging
import boto3
from botocore.exceptions import ClientError
from botocore.exceptions import NoCredentialsError
import os
import json
from datetime import datetime
import numpy as np
import pandas as pd

In [112]:
data = pd.read_excel("data/Spreadsheets/receipt_ground_truth.xlsx")
data.head()

Unnamed: 0,receipt_extract,object_name,date,subtotal,total,category
0,,Airfare1.jpg,2024-09-22,45.0,45.0,Travel
1,,Airfare2.jpg,2024-11-13,545.11,614.69,Travel
2,,CarRental1.jpg,2024-12-15,355.0,505.63,Travel
3,,CarRental2.jpg,NaT,173.14,173.14,Travel
4,,CarRental3.jpg,2024-12-20,272.83,319.18,Travel


In [113]:
# Getting rid of null values in date column
data['date']=data['date'].fillna(pd.Timestamp('1900-01-01'))

In [115]:
data.head()

Unnamed: 0,receipt_extract,object_name,date,subtotal,total,category
0,,Airfare1.jpg,2024-09-22,45.0,45.0,Travel
1,,Airfare2.jpg,2024-11-13,545.11,614.69,Travel
2,,CarRental1.jpg,2024-12-15,355.0,505.63,Travel
3,,CarRental2.jpg,1900-01-01,173.14,173.14,Travel
4,,CarRental3.jpg,2024-12-20,272.83,319.18,Travel


In [117]:
# Uploading a new file to S3

def upload_file_to_s3(file_name, bucket_name, object_name=None):
    """
    Uploads a file to an S3 bucket.
    
    :param file_name: Path to the file to upload
    :param bucket_name: Name of the S3 bucket
    :param object_name: S3 object name. If not specified, file_name is used
    :return: a string of the response
    """
    # If S3 object_name was not specified, use file_name
    if object_name is None:
        object_name = file_name

    # Initialize the S3 client
    s3 = boto3.client('s3')

    
    try:
        with open(file_name, "rb") as file_data: # Uploading the FILE CONTENTS not the filepath
            response = s3.put_object(
                Body=file_data,
                Bucket=bucket_name,
                Key=object_name,                # This is the what the file will be called in S3
            )
        s = response
        return s
    except FileNotFoundError:
        print(f"The file {file_name} was not found.")
    except NoCredentialsError:
        print("AWS credentials not available.")

In [118]:
# Analyze a receipt in an S3 bucket

def analyze_receipt(bucket_name, object_name):
    """
    :param bucket_name: Name of the S3 bucket
    :param object_name: S3 object name
    :return: string of the response
    """
    
    client = boto3.client('textract')

    try:
        response = client.analyze_expense(
            
            Document = {
                "S3Object": {
                    "Bucket": bucket_name,
                    "Name": object_name
                }
            }
        )
        s = response
        return s
        
    except FileNotFoundError:
        print(f"The file {file_name} was not found.")
    except NoCredentialsError:
        print("AWS credentials not available.")

In [119]:
def condense_textract(text_extract, exclude = []):
    """
    Converts the large json response from Textract into a smaller dictionary.
    
    :param text_extract: json return of AWS Textract operation
    :param exclude: list of keys to exclude
    :return: the new dictionary
    """
    condensed_extract = {}

    # grabbing information from the text_extract json
    for i in range(len(text_extract['ExpenseDocuments'][0]['SummaryFields'])):
        key = text_extract['ExpenseDocuments'][0]['SummaryFields'][i]['Type']['Text']
        value = text_extract['ExpenseDocuments'][0]['SummaryFields'][i]['ValueDetection']['Text']
        if key not in exclude:
            if key not in condensed_extract.keys():
                condensed_extract[key] = value

            else:
                temp = " " + value
                condensed_extract[key] + temp
        
        if len(text_extract['ExpenseDocuments'][0]['LineItemGroups'][0]['LineItems'])> 0:
            condensed_extract['items'] = {}
            for j in range(len(text_extract['ExpenseDocuments'][0]['LineItemGroups'][0]['LineItems'][0]['LineItemExpenseFields'])):
                value = text_extract['ExpenseDocuments'][0]['LineItemGroups'][0]['LineItems'][0]['LineItemExpenseFields'][j]['ValueDetection']['Text']
                condensed_extract['items']['item'+str(j)] = value
    
    return condensed_extract

    

In [120]:
def post_to_s3_analyze_receipt(dataframe, bucket_name):
    
    for i in range(len(dataframe)):
        
        object_name = dataframe.loc[i, 'object_name']
        file_name = 'data/' + str(object_name)

        # Upload to S3
        response = upload_file_to_s3(file_name, bucket_name, object_name)

        # Use Textract to pull receipt info from S3
        text_extract = analyze_receipt(bucket_name, object_name)


        # condensing the text_extract into usable information
        condensed_extract = condense_textract(text_extract)
        
        dataframe.loc[i, 'receipt_extract'] = json.dumps(condensed_extract) # convert to string format for storage
        print(f"Index {i}, {object_name} finished")


In [121]:

dataframe = data.copy(deep = True)
bucket_name = bucket_name = "test-bucket-cnevares-2024"
post_to_s3_analyze_receipt(dataframe, bucket_name)

dataframe.to_excel('receipts_with_extracts.xlsx', index=False)  # Set index=False to exclude the index

print("DataFrame written to 'receipts_with_extracts.xlsx'")

  dataframe.loc[i, 'receipt_extract'] = json.dumps(condensed_extract) # convert to string format for storage


Index 0, Airfare1.jpg finished
Index 1, Airfare2.jpg finished
Index 2, CarRental1.jpg finished
Index 3, CarRental2.jpg finished
Index 4, CarRental3.jpg finished
Index 5, CarRental4.jpg finished
Index 6, CarRental5.jpg finished
Index 7, CarRental6.jpg finished
Index 8, CarWash1.jpg finished
Index 9, Gas1.jpg finished
Index 10, Gas2.jpg finished
Index 11, Gas3.jpg finished
Index 12, Gas4.jpg finished
Index 13, Groceries1.jpg finished
Index 14, Groceries2.jpg finished
Index 15, Groceries3.jpg finished
Index 16, Groceries4.jpg finished
Index 17, Groceries5.jpg finished
Index 18, Hotel1.jpg finished
Index 19, Hotel2.jpg finished
Index 20, Hotel3.jpg finished
Index 21, Hotel4.jpg finished
Index 22, Hotel5.jpg finished
Index 23, Hotel6.jpg finished
Index 24, Hotel7.jpg finished
Index 25, Hotel8.jpg finished
Index 26, Hotel9.jpg finished
Index 27, Meals1.jpg finished
Index 28, Meals2.jpg finished
Index 29, Meals3.jpg finished
Index 30, Meals4.jpg finished
Index 31, Meals5.jpg finished
Index 32