In [1]:
import pytesseract
from pdf2image import convert_from_path
import re
import cv2
import numpy as np

# Path to the Tesseract executable (change this according to your setup).
# Assigning it here tells pytesseract which binary to launch for OCR.
pytesseract.pytesseract.tesseract_cmd = r'C:\Users\asus\tesseract.exe'

# Path to the PDF file to process (machine-specific; change this as well)
pdf_file_path = "C:\\Users\\asus\\Downloads\\Sales_Service.pdf"

# Convert PDF pages to images using pdf2image; `images` is then iterated
# page by page for text extraction.
images = convert_from_path(pdf_file_path)

# Map each label searched for on the page to the name it is reported under.
keywords = {
    "Invoice Date": "Invoice Date",
}

# Invoice-number patterns, tried in order; group 1 captures the number.
# The dot after "No" and the whitespace around the colon are all optional, so
# "Invoice No.: X", "Invoice No : X", "Invoice No. : X" and "Invoice No:X"
# are accepted by the same expression.
invoice_number_patterns = [
    r'Invoice\s*No\.?\s*:\s*([A-Za-z0-9]+)',
]

# "State Code : 33" -> group 1 is the code.
state_code_pattern = r'State\s*Code\s*:\s*([A-Za-z0-9]+)'

# Regular expression patterns for GSTIN and GSTIN/UIN; group 1 is the number.
gst_number_pattern = r'GSTIN\s*:\s*([A-Za-z0-9]+)'
# Whitespace around the slash is optional because OCR output does not space
# "GSTIN/UIN" consistently.
gstin_uin_pattern = r'GSTIN\s*/\s*UIN\s*:\s*([A-Za-z0-9]+)'

# Possible date formats, tried in order; the first one that matches wins.
date_formats = [
    r'\d{1,2}/\d{1,2}/\d{2,4}',  # e.g., 08/23/2023
    r'\d{1,2}-\d{1,2}-\d{2,4}',  # e.g., 08-23-2023
    r'\d{1,2}\s[a-zA-Z]{3,9}\s\d{2,4}',  # e.g., 23 Aug 2023 or 23 Aug 23
    r'\d{1,2}\s[a-zA-Z]{3,9},\s\d{2,4}',  # e.g., 23 Aug, 2023
    r'[a-zA-Z]{3,9}\s\d{1,2},\s\d{2,4}',  # e.g., Aug 23, 2023
    r'[a-zA-Z]{3,9}\s\d{1,2}\s\d{2,4}',  # e.g., Aug 23 2023
    r'\d{1,2}\s[a-zA-Z]{3,9}\s\'\d{2}',  # e.g., 23 Aug '23
    r'\d{1,2}-[a-zA-Z]{3}-\d{2,4}',  # e.g., 23-Jul-2023
    # Add more date formats as needed
]

def _first_capture(patterns, text):
    """Return group 1 of the first pattern matching *text*, else "Not found".

    Patterns are tried in the order given and matched case-insensitively.
    """
    for pattern in patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            return match.group(1)
    return "Not found"


def _rest_of_line(page_text, keyword):
    """Return the stripped text after *keyword* on its line, or None if absent."""
    start_idx = page_text.find(keyword)
    if start_idx == -1:
        return None
    start_idx += len(keyword) + 1  # Move past the colon
    end_idx = page_text.find("\n", start_idx)
    if end_idx == -1:
        # No newline after the keyword (it is on the last line): slicing with
        # -1 would silently drop the final character, so read to the end.
        end_idx = len(page_text)
    return page_text[start_idx:end_idx].strip()


def _first_date(value):
    """Return the first date found in *value*, else "Not found".

    The formats in `date_formats` are tried in order.
    """
    for date_format in date_formats:
        match = re.search(date_format, value)
        if match:
            return match.group()
    return "Not found"


# Loop through the page images, OCR each one and print the extracted fields
for page_num, image in enumerate(images, start=1):
    # Convert the page image to text using Tesseract
    text = pytesseract.image_to_string(image)

    # Pull the fixed fields out of the OCR text
    invoice_number = _first_capture(invoice_number_patterns, text)
    state_code = _first_capture([state_code_pattern], text)
    gst_number = _first_capture([gst_number_pattern], text)
    gstin_uin = _first_capture([gstin_uin_pattern], text)

    # Keyword extraction runs on a grayscale copy of the page. pdf2image
    # yields PIL images, whose arrays are in RGB channel order, so the
    # RGB (not BGR) conversion code is the correct one here.
    # NOTE(review): this is a second OCR pass per page; kept so the keyword
    # lookup sees the same grayscale-derived text as before.
    gray = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2GRAY)
    page_text = pytesseract.image_to_string(gray)

    # Print the extracted information for the current page
    print(f"Page {page_num} Information:")
    print(f"Invoice Number: {invoice_number}")
    print(f"State Code: {state_code}")
    print(f"GSTIN: {gst_number}")
    print(f"GSTIN/UIN: {gstin_uin}")

    # Print the extracted keyword information for the current page
    for keyword, info_key in keywords.items():
        value = _rest_of_line(page_text, keyword)
        # A missing keyword and a value without a recognisable date are both
        # reported as "Not found".
        date_value = "Not found" if value is None else _first_date(value)
        print(f"{info_key}: {date_value}")

    print("---------------------------------------------------")


Page 1 Information:
Invoice Number: 23331A0700000615
State Code: 33
GSTIN: 33AABCI8842G1ZM
GSTIN/UIN: 33ACUPV0251A1ZS
Invoice Date: 31-Jul-2023
---------------------------------------------------
Page 2 Information:
Invoice Number: 23331A0700000615
State Code: 33
GSTIN: 33AABCI8842G1ZM
GSTIN/UIN: 33ACUPV0251A1ZS
Invoice Date: 31-Jul-2023
---------------------------------------------------
