In [None]:
import PyPDF2
import re

def extract_invoice_data(pdf_path):
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        page = reader.pages[0]
        text = page.extract_text()

        # Regular expressions to match each piece of data
        date_re = r'DATE\n(\d{2}/\d{2}/\d{4})'
        invoice_number_re = r'INVOICE #\n(\d+)'
        bill_to_re = r'BILL TO\n\n(.+?)(?:\n\d{3}-\d{3}-\d{4})'
        activity_re = r'(\d{3}-\d{3}-\d{4})\n(.+?)Amount'
        total_re = r'TOTAL\n\n\$(\d+\.\d{2})'
        balance_due_re = r'BALANCE DUE \n\n\$(\d+\.\d{2})'

        # Extracting data
        date = re.search(date_re, text).group(1)
        invoice_number = re.search(invoice_number_re, text).group(1)
        bill_to = re.search(bill_to_re, text, re.DOTALL).group(1).strip()
        activity = re.search(activity_re, text, re.DOTALL).group(2).strip()
        total = re.search(total_re, text).group(1)
        balance_due = re.search(balance_due_re, text).group(1)

        return {
            'Date': date,
            'Invoice Number': invoice_number,
            'Bill To': bill_to,
            'Activity': activity,
            'Total': total,
            'Balance Due': balance_due
        }

# Example usage
pdf_path ='C:\Programming\CogswellProject\Test\Cogswell BALANCE Invoice_1248.pdf'  # Replace with your PDF file path
data = extract_invoice_data(pdf_path)
print(data)

# 'C:\Programming\CogswellProject\Test\Cogswell BALANCE Invoice_1248.pdf

In [8]:
import PyPDF2
import pytesseract
from PIL import Image
import io
import re

# Function to extract text using PyPDF2
def extract_text_pdf2(reader):
    text = ''
    for page in reader.pages:
        text += page.extract_text() or ''
    return text

# Function to perform OCR using pytesseract
def perform_ocr(reader):
    text = ''
    for page_num in range(len(reader.pages)):
        page = reader.pages[page_num]
        if '/XObject' in page['/Resources']:
            xObject = page['/Resources']['/XObject'].get_object()

            for obj in xObject:
                if xObject[obj]['/Subtype'] == '/Image':
                    size = (xObject[obj]['/Width'], xObject[obj]['/Height'])
                    data = xObject[obj].get_data()
                    image = Image.open(io.BytesIO(data))
                    text += pytesseract.image_to_string(image, lang='eng')
    return text

# Main function to extract data from PDF
def extract_invoice_data(pdf_path):
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        text = extract_text_pdf2(reader)

        # If text extraction fails, try OCR
        if not text.strip():
            text = perform_ocr(reader)

        # Define regular expressions for each field
        date_re = r'DATE\s+(\d{2}/\d{2}/\d{4})'
        invoice_number_re = r'INVOICE #\s+(\d+)'
        bill_to_re = r'BILL TO\n\s+(.+?)\n\s+\d{3}-\d{3}-\d{4}'
        activity_re = r'(\d{3}-\d{3}-\d{4})\s+([\s\S]+?)\s+Amount'
        total_re = r'TOTAL\s+\$\s+(\d+\.\d{2})'
        balance_due_re = r'BALANCE DUE\s+\$\s+(\d+\.\d{2})'

        # Extract data using regular expressions
        extracted_data = {
            'Date': re.search(date_re, text).group(1) if re.search(date_re, text) else None,
            'Invoice Number': re.search(invoice_number_re, text).group(1) if re.search(invoice_number_re, text) else None,
            'Bill To': re.search(bill_to_re, text).group(1).strip() if re.search(bill_to_re, text) else None,
            'Activity': re.search(activity_re, text).group(2).strip() if re.search(activity_re, text) else None,
            'Total': re.search(total_re, text).group(1) if re.search(total_re, text) else None,
            'Balance Due': re.search(balance_due_re, text).group(1) if re.search(balance_due_re, text) else None
        }

    return extracted_data

# Example usage
pdf_path = 'C:\Programming\CogswellProject\Test\Cogswell BALANCE Invoice_1248.pdf'  # Replace with your PDF file path
data = extract_invoice_data(pdf_path)
print(data)


{'Date': '06/17/2009', 'Invoice Number': '1248', 'Bill To': None, 'Activity': 'clintdavis68@gmail.com\nENCLOSED AMOUNT DUE\n$2,913.75\n"Please detach top portion and return with your payment."\nService Activity Quantity Rate', 'Total': None, 'Balance Due': None}


In [13]:
import PyPDF2

def extract_pdf_metadata(pdf_path):
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        metadata = reader.metadata
        return metadata


pdf_path = 'C:\Programming\CogswellProject\Test\Cogswell BALANCE Invoice_1248.pdf' # Replace with your PDF file path
metadata = extract_pdf_metadata(pdf_path)

if metadata:
    print("PDF Metadata:")
    for key, value in metadata.items():
        print(f"{key}: {value}")
else:
    print("No metadata found in the PDF.")


No metadata found in the PDF.


In [None]:
import PyPDF2
import pytesseract
from PIL import Image
import io
import re

# Function to extract text using PyPDF2
def extract_text_pdf2(reader):
    text = ''
    for page in reader.pages:
        text += page.extract_text() or ''
    return text

# Function to perform OCR using pytesseract
def perform_ocr(reader):
    text = ''
    for page_num in range(len(reader.pages)):
        page = reader.pages[page_num]
        if '/XObject' in page['/Resources']:
            xObject = page['/Resources']['/XObject'].get_object()

            for obj in xObject:
                if xObject[obj]['/Subtype'] == '/Image':
                    size = (xObject[obj]['/Width'], xObject[obj]['/Height'])
                    data = xObject[obj].get_data()
                    image = Image.open(io.BytesIO(data))
                    text += pytesseract.image_to_string(image, lang='eng')
    return text

# Function to extract date from a single PDF
def extract_date(pdf_path):
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        text = extract_text_pdf2(reader)

        # If text extraction fails, try OCR
        if not text.strip():
            text = perform_ocr(reader)

        # Define regular expression for the date
        date_re = r'DATE\s+(\d{2}/\d{2}/\d{4})'

        # Extract date using regular expression
        date = re.search(date_re, text).group(1) if re.search(date_re, text) else None

    return date

        # Define regular expressions for each field
        # date_re = r'DATE\s+(\d{2}/\d{2}/\d{4})'
        # invoice_number_re = r'INVOICE #\s+(\d+)'
        # bill_to_re = r'BILL TO\n\s+(.+?)\n\s+\d{3}-\d{3}-\d{4}'
        # activity_re = r'(\d{3}-\d{3}-\d{4})\s+([\s\S]+?)\s+Amount'
        # total_re = r'TOTAL\s+\$\s+(\d+\.\d{2})'
        # balance_due_re = r'BALANCE DUE\s+\$\s+(\d+\.\d{2})'

        # Extract data using regular expressions
    #     extracted_data = {
    #         'Date': re.search(date_re, text).group(1) if re.search(date_re, text) else None,
    #         'Invoice Number': re.search(invoice_number_re, text).group(1) if re.search(invoice_number_re, text) else None,
    #         'Bill To': re.search(bill_to_re, text).group(1).strip() if re.search(bill_to_re, text) else None,
    #         'Activity': re.search(activity_re, text).group(2).strip() if re.search(activity_re, text) else None,
    #         'Total': re.search(total_re, text).group(1) if re.search(total_re, text) else None,
    #         'Balance Due': re.search(balance_due_re, text).group(1) if re.search(balance_due_re, text) else None
    #     }

    # return extracted_data

def extract_dates_from_folder(folder_path):
    dates = []
    for filename in os.listdir(folder_path):
        if filename.lower().endswith('.pdf'):
            pdf_path = os.path.join(folder_path, filename)
            date = extract_date(pdf_path)
            if date:
                dates.append((filename, date))

    return dates


# SOME SHIT
