In [12]:
import os
from pdf2image import convert_from_path
from PIL import Image, ImageFilter, ImageEnhance
import numpy as np
import pytesseract
import re

In [74]:
class PreprocessingPipeline:

    def __init__(self):
        self.layers = []
        
    def add_layer(self, layer):
        self.layers.append(layer)

    def run(self, input_folder, output_folder, dpi = (300,300), **kwargs):
        os.makedirs(output_folder, exist_ok=True)
        j = 0
        for filename in os.listdir(input_folder):
            if not filename.lower().endswith('.ipynb_checkpoints'):
                path = os.path.join(input_folder, filename)
                image = Image.open(path)
                print(f'****{filename}****')
                for i in range(len(self.layers)):
                    image = self.layers[i](image, **kwargs)
                    print(self.layers[i])
                    #display(image)

                    text = pytesseract.image_to_string(image)
                        
                    # Extract the date using a regex pattern
                    date_pattern = r'\b(\d{1,2}[/-]\d{1,2}[/-]\d{2,4})\b'  # Matches dates like 01/01/2023 or 1-1-23
                    dates = re.findall(date_pattern, text)

                    # Extract the total amount using a regex pattern
                    amount_pattern = r'\$?(\d+\.\d{2})'  # Matches "Total: $12.34" or "Total 12.34"
                    amounts = re.findall(amount_pattern, text, re.IGNORECASE)

                    if dates:
                        print("Most Likely Date:", dates[0])  # Assume the first match is the transaction date
                    if amounts:
                        print(max(amounts))
                    #print(text)
                print()
                output_path = os.path.join(output_folder, filename)
                image.save(output_path, "JPEG", dpi = dpi)
                j+=1

In [55]:
def greyscale(img, **kwargs):
    
    greyscale_img = img.convert("L")
    
    return greyscale_img

In [56]:
def median_filter(img, **kwargs):
    size = kwargs.get("filter_size", 1)
    return img.filter(ImageFilter.MedianFilter(size=size))


In [57]:
def contrast_stretch(img, **kwargs):
    img_array = np.array(img)

    # Perform contrast stretching
    p2, p98 = np.percentile(img_array, (2, 98))  # Percentiles for stretching
    img_stretched = np.clip((img_array - p2) * (255 / (p98 - p2)), 0, 255)

    # Convert back to PIL image and save
    return Image.fromarray(img_stretched.astype("uint8"))

In [58]:
def binarization(img, **kwargs):
    # Define a threshold value
    threshold = kwargs.get('threshold', 128)

    # Apply thresholding
    return img.point(lambda x: 255 if x > threshold else 0, mode="1")

In [59]:
def contrast(img, **kwargs):
  
    contrast = kwargs.get("contrast_factor",3)
    
    
    enhancer = ImageEnhance.Contrast(img).enhance(contrast)  # Increase contrast
    
    return enhancer

In [81]:
def sharpen(img, **kwargs):
  
    sharpness = kwargs.get("sharpness_factor",3)
    
    
    enhancer = ImageEnhance.Sharpness(img).enhance(sharpness)  # Increase contrast
    
    return enhancer

In [61]:
def mode_filter(image, **kwargs):
    return image.filter(ImageFilter.ModeFilter(size=1))

In [62]:
def gaussian_filter(img, **kwargs):
    radius = kwargs.get('filter_size', 1)
    return img.filter(ImageFilter.GaussianBlur(radius = radius))

In [92]:
pipeline = PreprocessingPipeline()
pipeline.add_layer(greyscale)

pipeline.add_layer(contrast)
pipeline.add_layer(sharpen)
pipeline.add_layer(median_filter)
#pipeline.add_layer(gaussian_filter)
pipeline.add_layer(contrast_stretch)
pipeline.add_layer(binarization)

In [93]:
pipeline.run(input_folder="assets/test", output_folder="assets/greyscale_test", dpi=(900, 900), sharpness_factor = 2, contrast_factor = 3, filter_size = 1)

****1000-receipt-Copy1.jpg****
<function greyscale at 0x00000213AF5FC820>
Most Likely Date: 5/26/2016
96.58
<function contrast at 0x00000213AF61B040>
Most Likely Date: 5/26/2016
6.59
<function sharpen at 0x00000213AF5FC8B0>
Most Likely Date: 5/26/2016
51.90
<function median_filter at 0x00000213AF5FCF70>
Most Likely Date: 5/26/2016
51.90
<function contrast_stretch at 0x00000213AF595160>
Most Likely Date: 5/26/2016
51.90
<function binarization at 0x00000213AF5FC940>
Most Likely Date: 5/26/2016
91.90

****1001-receipt-Copy1.jpg****
<function greyscale at 0x00000213AF5FC820>
<function contrast at 0x00000213AF61B040>
<function sharpen at 0x00000213AF5FC8B0>
<function median_filter at 0x00000213AF5FCF70>
<function contrast_stretch at 0x00000213AF595160>
<function binarization at 0x00000213AF5FC940>
10.38

****1002-receipt-Copy1.jpg****
<function greyscale at 0x00000213AF5FC820>
Most Likely Date: 9/1/2016
<function contrast at 0x00000213AF61B040>
Most Likely Date: 9/1/2016
9.00
<function shar

In [146]:
from PIL import ImageEnhance, ImageFilter

In [150]:



# Load and preprocess the image
image = Image.open("receipt.jpg")  # Replace with the path to your receipt image
gray_image = image.convert("L")  # Convert to grayscale for better OCR accuracy

# Perform OCR on the preprocessed image
extracted_text = pytesseract.image_to_string(gray_image)

# Extract the date using a regex pattern
date_pattern = r'\b(\d{1,2}[/-]\d{1,2}[/-]\d{2,4})\b'  # Matches dates like 01/01/2023 or 1-1-23
dates = re.findall(date_pattern, extracted_text)

# Extract the total amount using a regex pattern
amount_pattern = r'\bTotal\b[:\s]*\$?(\d+\.\d{2})'  # Matches "Total: $12.34" or "Total 12.34"
amounts = re.findall(amount_pattern, extracted_text, re.IGNORECASE)

# Output the results
print("Extracted Dates:", dates)
print("Extracted Total Amounts:", amounts)

In [None]:
def convert_to_jpg_standardize_dpi(input_folder, output_folder, dpi = (300,300)):
    os.makedirs(output_folder, exist_ok=True)
    j = 0
    for filename in os.listdir(input_folder):
        path = os.path.join(input_folder, filename)
        
        if filename.lower().endswith(".pdf"):
            
            # Convert PDF to a list of images (one image per page) at default DPI
            pages = convert_from_path(path)

            # Save each page as a separate JPG file
            for i, page in enumerate(pages):
                output_filename = os.path.join(
                    output_folder, f"{os.path.splitext(filename)[0]}_page_{i + 1}.jpg"
                )
                
                page.save(output_filename, "JPEG", dpi=dpi)
            j += i
            
        else:
            image = Image.open(path)
            
            output_path = os.path.join(output_folder, filename)
            image.save(output_path, "JPEG", dpi = dpi)
            j+=1

    
    print(f"Saved {j} images in {output_folder}")