In [None]:
!pip install pytesseract




In [None]:
!pip install opencv-python




In [None]:
!pip install Pillow




In [None]:
!pip install pymupdf



In [None]:
# Updated invoice extraction logic to store output in a dictionary
def _line_after(lines, index):
    """Return the stripped line that follows ``index``, or None when there is none."""
    following = index + 1
    return lines[following].strip() if following < len(lines) else None


def _is_amount(text):
    """Tell whether ``text`` parses as a number once '₹', commas and spaces are removed."""
    cleaned = text.replace('₹', '').replace(',', '').strip()
    try:
        float(cleaned)
    except ValueError:
        return False
    return True


def _split_rate(rate_line):
    """Split a rate cell into ``(rate, discount)``; discount is 'No Discount' without a '('."""
    if '(' not in rate_line:
        return rate_line, 'No Discount'
    parts = rate_line.split()
    if len(parts) == 2:
        # Common layout: "<rate> (<discount>)".
        return parts[0], parts[1]
    # Anything else (extra spaces inside the parentheses, or no space at all):
    # everything before the first '(' is the rate, the remainder is the discount.
    rate, paren, rest = rate_line.partition('(')
    return rate.strip(), (paren + rest).strip()


def extract_invoice_info(text):
    """
    Pull invoice fields out of plain text, scanning it line by line.

    Args:
        text: Invoice text with one value or label per line (split on '\\n').

    Returns:
        A dictionary with the keys "Invoice Number", "Invoice Date",
        "Customer Details", "Place of Supply", "Items", "Total Amount",
        "Total Items / Qty", "Bank Name", "Account Number", "IFSC Code" and
        "Branch". Fields that are not found stay None; "Items" is a list of
        dictionaries (possibly empty) with the keys "Description", "Rate",
        "Discount", "Quantity", "Taxable Value", "Tax Amount" and "Amount".

    Notes:
        * Labels whose value sits on the following line yield None when the
          label is the last line of the text (no IndexError).
        * An all-digit line only starts an item when the lines at offsets +5
          and +7 (taxable value and amount) look numeric. Without that check
          other numeric lines, such as a bank account number, were reported
          as items.
        * NOTE(review): the sample run stored in this notebook shows
          "Total Amount" as None, so the real text may not put 'Total',
          'Amount' and '₹' on a single line -- confirm against the raw text.
    """
    # Split text into lines for easier processing
    lines = text.split('\n')

    # Initialize dictionary to store extracted information
    extracted_data = {
        "Invoice Number": None,
        "Invoice Date": None,
        "Customer Details": None,
        "Place of Supply": None,
        "Items": [],
        "Total Amount": None,
        "Total Items / Qty": None,
        "Bank Name": None,
        "Account Number": None,
        "IFSC Code": None,
        "Branch": None
    }

    # Loop through each line and search for relevant details
    for i, line in enumerate(lines):
        # Fields whose value is on the same line, after the last ':'
        if 'Invoice #:' in line:
            extracted_data["Invoice Number"] = line.split(':')[-1].strip()

        if 'Invoice Date:' in line:
            extracted_data["Invoice Date"] = line.split(':')[-1].strip()

        # Fields whose value is on the line below the label
        if 'Customer Details:' in line:
            extracted_data["Customer Details"] = _line_after(lines, i)

        if 'Place of Supply:' in line:
            extracted_data["Place of Supply"] = _line_after(lines, i)

        if 'Total' in line and '₹' in line and 'Amount' in line:
            extracted_data["Total Amount"] = line.split('₹')[-1].strip()

        if 'Total Items / Qty :' in line:
            extracted_data["Total Items / Qty"] = line.split(':')[-1].strip()

        # Detecting the items section: a serial-number line followed by seven
        # more lines (the line at offset +3 is not used).
        if line.strip().isdigit() and i + 7 < len(lines):
            taxable_value = lines[i + 5].strip()             # Taxable Value
            amount = lines[i + 7].strip()                    # Total Amount

            # Only accept the row when its money columns are numeric.
            if _is_amount(taxable_value) and _is_amount(amount):
                rate, discount = _split_rate(lines[i + 2].strip())

                # Append each item to the 'Items' list in the dictionary
                extracted_data["Items"].append({
                    "Description": lines[i + 1].strip(),
                    "Rate": rate,
                    "Discount": discount,
                    "Quantity": lines[i + 4].strip(),
                    "Taxable Value": taxable_value,
                    "Tax Amount": lines[i + 6].strip(),
                    "Amount": amount
                })

        # Extract Bank Details (values are on the line below each label)
        if 'Bank:' in line:
            extracted_data["Bank Name"] = _line_after(lines, i)

        if 'Account #' in line or 'Account No' in line:
            extracted_data["Account Number"] = _line_after(lines, i)

        if 'IFSC Code:' in line:
            extracted_data["IFSC Code"] = _line_after(lines, i)

        if 'Branch:' in line:
            extracted_data["Branch"] = _line_after(lines, i)

    return extracted_data

# Main function to combine PDF text extraction and invoice info extraction
def process_invoice(file_path):
    """
    Determines if the input is a PDF or an image and processes it accordingly.
    Extracts text from the invoice and returns extracted details in a dictionary.

    The extension check is case-insensitive for every supported type, so
    'scan.PDF' is handled the same way as 'scan.pdf'.

    Args:
        file_path: Path to the invoice; must end in .pdf, .png, .jpg or .jpeg.

    Returns:
        The dictionary produced by extract_invoice_info() for the extracted text.

    Raises:
        ValueError: If the file extension is not one of the supported types.
    """
    # Normalise only for the extension test; the helpers get the original path.
    normalized_path = file_path.lower()

    if normalized_path.endswith('.pdf'):
        # Extract text from the PDF (helper is defined elsewhere in the notebook)
        extracted_text = extract_text_from_pdf(file_path)
    elif normalized_path.endswith(('.png', '.jpg', '.jpeg')):
        # Extract text from an image (helper is defined elsewhere in the notebook)
        extracted_text = extract_text_from_image(file_path)
    else:
        raise ValueError("Unsupported file type. Please provide a PDF or an image file.")

    # Call the function to process and extract invoice details
    return extract_invoice_info(extracted_text)

# Demo run: process one sample invoice and list every extracted field
file_path = '/content/INV-135_Mohith Saragur.pdf'  # Replace with your file path
invoice_data = process_invoice(file_path)

# One "field: value" line per dictionary entry
for key in invoice_data:
    value = invoice_data[key]
    print(f"{key}: {value}")

Invoice Number: INV-135
Invoice Date: 01 Mar 2024
Customer Details: Mohith Saragur
Place of Supply: 23-MADHYA PRADESH
Items: [{'Description': 'Tab flucon 400mg', 'Rate': '22.58', 'Discount': 'No Discount', 'Quantity': '3 STRP', 'Taxable Value': '67.75', 'Tax Amount': '12.19 (18%)', 'Amount': '79.94'}, {'Description': 'Lupizol ZS Shampoo 100 ml', 'Rate': '369.91', 'Discount': 'No Discount', 'Quantity': '1 BTL', 'Taxable Value': '369.91', 'Tax Amount': '44.39 (12%)', 'Amount': '414.30'}, {'Description': 'Anaboom AD Lotion - 50 ml', 'Rate': '253.56', 'Discount': 'No Discount', 'Quantity': '1 BTL', 'Taxable Value': '253.56', 'Tax Amount': '45.64 (18%)', 'Amount': '299.20'}, {'Description': 'IFSC Code:', 'Rate': 'kkbk0000725', 'Discount': 'No Discount', 'Quantity': 'PUNE - CHINCHWAD', 'Taxable Value': 'UnCue Dermacare Pvt Ltd', 'Tax Amount': 'For UNCUE DERMACARE PRIVATE LIMITED', 'Amount': 'Authorized Signatory'}]
Total Amount: None
Total Items / Qty: 3 / 5.000
Bank Name: Kotak Mahindra Ban

In [None]:
import difflib

# Similarity score between two strings
def compute_similarity(str1, str2):
    """Return difflib's SequenceMatcher ratio for ``str1`` compared with ``str2``."""
    matcher = difflib.SequenceMatcher(a=str1, b=str2)
    return matcher.ratio()

# Map a similarity score onto a coarse trust label
def assess_trust(similarity):
    """
    Classify ``similarity`` as "High Trust" (>= 0.95), "Medium Trust"
    (>= 0.90 but below 0.95) or "Low Trust" (anything else).
    """
    # Tiers are checked from the strictest cutoff downwards; first match wins.
    tiers = (
        (0.95, "High Trust"),
        (0.90, "Medium Trust"),
    )
    for cutoff, label in tiers:
        if similarity >= cutoff:
            return label
    return "Low Trust"

# Function to compare the extracted data with ground truth and compute similarity
def _compare_field(extracted_value, ground_truth_value):
    """Build the comparison record (values, similarity percentage, trust level) for one field."""
    similarity = compute_similarity(str(extracted_value), str(ground_truth_value))
    return {
        "Extracted Value": extracted_value,
        "Ground Truth Value": ground_truth_value,
        "Similarity": f"{similarity * 100:.2f}%",
        "Trust Level": assess_trust(similarity)
    }


def compare_data(extracted_data, ground_truth_data):
    """
    Compare extracted invoice fields with reference values, field by field.

    Args:
        extracted_data: Dictionary of extracted fields; "Items", if present,
            is a list of per-item dictionaries.
        ground_truth_data: Dictionary of reference values with the same layout.
            Fields missing here are compared against an empty string instead
            of raising KeyError.

    Returns:
        A dictionary with one comparison record per non-"Items" key of
        ``extracted_data`` (keys "Extracted Value", "Ground Truth Value",
        "Similarity", "Trust Level"), followed by an "Items" list holding, for
        each item pair, a dictionary of such records keyed by item field.

    Side effects:
        Prints a warning when the two item lists differ in length; only the
        first ``min(len(...))`` pairs are compared.
    """
    comparison_results = {}

    # Compare top-level keys (non-items fields); "Items" is handled separately below
    for key, value in extracted_data.items():
        if key == "Items":
            continue
        # Falsy values (None, "") on either side are compared as an empty string
        extracted_value = value if value else ""
        ground_truth_value = ground_truth_data.get(key) or ""
        comparison_results[key] = _compare_field(extracted_value, ground_truth_value)

    # Compare items (list of dictionaries)
    comparison_results["Items"] = []

    # `or []` also covers an "Items" entry that is present but None
    extracted_items = extracted_data.get("Items") or []
    ground_truth_items = ground_truth_data.get("Items") or []

    if len(extracted_items) != len(ground_truth_items):
        print("Warning: Mismatch in the number of extracted items and ground truth items.")

    # Items are paired by position; zip stops at the shorter list
    for extracted_item, ground_truth_item in zip(extracted_items, ground_truth_items):
        item_comparison = {
            field: _compare_field(extracted_item.get(field, ""), ground_truth_item.get(field, ""))
            for field in extracted_item
        }
        comparison_results["Items"].append(item_comparison)

    return comparison_results

# Ground truth (reference) values for the sample invoice; these are the
# "expected" side passed to compare_data() below.
# Per the original note, the values were extracted from the invoice using
# open source LLMs.
# The layout mirrors the dictionary built by extract_invoice_info(): the same
# top-level keys, with "Items" holding one dictionary per invoice line item.
ground_truth_data = {
    "Invoice Number": "INV-135",
    "Invoice Date": "01 Mar 2024",
    "Customer Details": "Mohith Saragur",
    "Place of Supply": "23-MADHYA PRADESH",
    "Items": [
        {
            "Description": "Tab flucon 400mg",
            "Rate": "22.58",
            "Discount": "No Discount",
            "Quantity": "3 STRP",
            "Taxable Value": "67.75",
            "Tax Amount": "12.19 (18%)",
            "Amount": "79.94"
        },
        {
            "Description": "Lupizol ZS Shampoo 100 ml",
            "Rate": "369.91",
            "Discount": "No Discount",
            "Quantity": "1 BTL",
            "Taxable Value": "369.91",
            "Tax Amount": "44.39 (12%)",
            "Amount": "414.30"
        },
        {
            "Description": "Anaboom AD Lotion - 50 ml",
            "Rate": "253.56",
            "Discount": "No Discount",
            "Quantity": "1 BTL",
            "Taxable Value": "253.56",
            "Tax Amount": "45.64 (18%)",
            "Amount": "299.20"
        }
    ],
    "Total Amount": "₹793.44",
    "Total Items / Qty": "3 / 5.000",
    "Bank Name": "Kotak Mahindra Bank",
    "Account Number": "1146860541",
    "IFSC Code": "kkbk0000725",
    "Branch": "PUNE - CHINCHWAD"
}

# Perform comparison between extracted data and ground truth.
# `invoice_data` is the dictionary returned by process_invoice() in the
# extraction cell. The previous name used here, `extracted_data`, only exists
# as a local variable inside functions, so a fresh-kernel run raised NameError.
comparison_results = compare_data(invoice_data, ground_truth_data)

# Display comparison results
for key, value in comparison_results.items():
    if key == "Items":
        # "Items" holds a list with one {field: comparison record} dict per item
        print("\nItems Comparison:")
        for i, item in enumerate(value):
            print(f"Item {i+1}:")
            for field, comparison in item.items():
                print(f"  {field}:")
                print(f"    Extracted Value: {comparison['Extracted Value']}")
                print(f"    Ground Truth Value: {comparison['Ground Truth Value']}")
                print(f"    Similarity: {comparison['Similarity']}")
                print(f"    Trust Level: {comparison['Trust Level']}")
    else:
        # Every other key maps directly to a single comparison record
        print(f"{key}:")
        print(f"  Extracted Value: {value['Extracted Value']}")
        print(f"  Ground Truth Value: {value['Ground Truth Value']}")
        print(f"  Similarity: {value['Similarity']}")
        print(f"  Trust Level: {value['Trust Level']}")


Invoice Number:
  Extracted Value: INV-135
  Ground Truth Value: INV-135
  Similarity: 100.00%
  Trust Level: High Trust
Invoice Date:
  Extracted Value: 01 Mar 2024
  Ground Truth Value: 01 Mar 2024
  Similarity: 100.00%
  Trust Level: High Trust
Customer Details:
  Extracted Value: Mohith Saragur
  Ground Truth Value: Mohith Saragur
  Similarity: 100.00%
  Trust Level: High Trust
Place of Supply:
  Extracted Value: 23-MADHYA PRADESH
  Ground Truth Value: 23-MADHYA PRADESH
  Similarity: 100.00%
  Trust Level: High Trust
Total Amount:
  Extracted Value: None
  Ground Truth Value: ₹793.44
  Similarity: 0.00%
  Trust Level: Low Trust
Total Items / Qty:
  Extracted Value: 3 / 5.000
  Ground Truth Value: 3 / 5.000
  Similarity: 100.00%
  Trust Level: High Trust
Bank Name:
  Extracted Value: Kotak Mahindra Bank
  Ground Truth Value: Kotak Mahindra Bank
  Similarity: 100.00%
  Trust Level: High Trust
Account Number:
  Extracted Value: 1146860541
  Ground Truth Value: 1146860541
  Similarity: