In [3]:
import os
import json
import re
import pytesseract
from pdf2image import convert_from_path
import pandas as pd
import requests
from PIL import Image

# === CONFIG ===
API_KEY = "APIkey"  # Replace with your Google Gemini API key
INPUT_PDF = "input/1WhatsApp Image.pdf"  # Path to input PDF
OUTPUT_DIR = "output"
TESSERACT_CONFIG = "--psm 6"  # You can tune this based on your input

# Make sure Tesseract executable path is set if needed, e.g.:
# pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'

# === PDF to Images ===
def pdf_to_images(pdf_path):
    print("Converting PDF to images...")
    return convert_from_path(pdf_path)

# === OCR to extract raw text ===
def ocr_image_to_text(image):
    text = pytesseract.image_to_string(image, config=TESSERACT_CONFIG)
    return text

# === Call Gemini API ===
def call_gemini_api(prompt_text, api_key):
    url = f"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent?key={api_key}"
    headers = {"Content-Type": "application/json"}
    data = {
        "contents": [
            {"parts": [{"text": prompt_text}]}
        ]
    }
    response = requests.post(url, json=data, headers=headers)
    response.raise_for_status()
    resp_json = response.json()
    return resp_json["candidates"][0]["content"]["parts"][0]["text"]

# === Clean Gemini JSON output ===
def clean_response_text(text):
    text = re.sub(r"^```json\s*", "", text.strip())
    text = re.sub(r"\s*```$", "", text.strip())
    return text.strip()

# === Extract general info via LLM ===
def extract_general_info(text):
    prompt = f"""
Extract invoice_number, invoice_date, supplier_gst_number, bill_to_gst_number, po_number, shipping_address from this text:

{text}

Return JSON with keys and values only.
"""
    response = call_gemini_api(prompt, API_KEY)
    cleaned = clean_response_text(response)
    try:
        return json.loads(cleaned)
    except json.JSONDecodeError:
        print("Failed to parse JSON from general info response:")
        print(cleaned)
        return {}

# === Extract table line items via LLM and fix missing HSN and QTY ===
def extract_table_info(text):
    prompt = f"""
Extract line items from the following invoice table text and return a JSON list.
Each item should have these keys: serial_number, description, hsn_sac, quantity, unit_price, total_amount.

Table Text:
{text}

Return only the JSON array.
"""
    response = call_gemini_api(prompt, API_KEY)
    cleaned = clean_response_text(response)
    try:
        if "```" in cleaned:
            cleaned = re.sub(r"```(?:json)?\s*", "", cleaned)
            cleaned = cleaned.replace("```", "")
        items = json.loads(cleaned)
    except json.JSONDecodeError:
        print("❌ JSON parsing failed from Gemini response, returning empty list.")
        return []

    # Regex to extract potential HSN codes (4-8 digits)
    hsn_candidates = re.findall(r'\bHSN\s*NO\.?\s*[:\-]?\s*(\d{4,8})', text, re.IGNORECASE)
    if not hsn_candidates:
        # Fallback: find any 4-8 digit numbers that might be HSNs
        hsn_candidates = re.findall(r'\b(\d{4,8})\b', text)

    # Regex to extract quantities (look for Qty, Quantity labels with numbers)
    qty_candidates = re.findall(r'\bQTY\.?\s*[:\-]?\s*(\d+)', text, re.IGNORECASE)
    if not qty_candidates:
        # Fallback: find numeric quantities near Qty or Quantity keyword
        qty_candidates = re.findall(r'\bQuantity[:\-]?\s*(\d+)', text, re.IGNORECASE)

    # Assign serial_number and fill missing hsn_sac and quantity if possible
    for i, item in enumerate(items):
        item['serial_number'] = i + 1
        item['description'] = item.get('description', f"Item {i + 1}")

        # Assign HSN as int if valid
        hsn_val = None
        if i < len(hsn_candidates):
            candidate = hsn_candidates[i]
            if candidate.isdigit() and 4 <= len(candidate) <= 8:
                hsn_val = int(candidate)
        item['hsn_sac'] = hsn_val

        # Assign quantity as int if valid
        qty_val = None
        if i < len(qty_candidates):
            candidate = qty_candidates[i]
            if candidate.isdigit():
                qty_val = int(candidate)
        item['quantity'] = qty_val

        # Clean unit_price and total_amount, remove Rs., commas, convert to float
        def clean_num(x):
            if not x:
                return None
            x = str(x).replace("Rs.", "").replace("Rs", "").replace(",", "").strip()
            try:
                return float(x)
            except:
                return None

        item['unit_price'] = clean_num(item.get('unit_price'))
        item['total_amount'] = clean_num(item.get('total_amount'))

    return items

import pytesseract

def extract_hsn_qty_from_ocr(image):
    # Use Tesseract to get OCR data including word positions
    ocr_data = pytesseract.image_to_data(image, output_type=pytesseract.Output.DICT)
    
    hsn_positions = []
    qty_positions = []

    # Find positions of "HSN", "HSN NO.", "QTY", "Quantity" words
    for i, word in enumerate(ocr_data['text']):
        w = word.strip().lower()
        if 'hsn' in w:
            hsn_positions.append(i)
        if 'qty' in w or 'quantity' in w:
            qty_positions.append(i)

    hsn_values = []
    qty_values = []

    # For each HSN keyword found, look for numeric words nearby (same line or just below)
    for pos in hsn_positions:
        # Assume HSN number is right or below the keyword in OCR data
        line_num = ocr_data['line_num'][pos]
        # Collect words in same line after pos
        line_indices = [i for i, lnum in enumerate(ocr_data['line_num']) if lnum == line_num and i > pos]
        for idx in line_indices:
            text = ocr_data['text'][idx].strip()
            if text.isdigit() and 4 <= len(text) <= 8:
                hsn_values.append(int(text))
                break
        # If not found in same line, check next line
        if not hsn_values:
            next_line_indices = [i for i, lnum in enumerate(ocr_data['line_num']) if lnum == line_num + 1]
            for idx in next_line_indices:
                text = ocr_data['text'][idx].strip()
                if text.isdigit() and 4 <= len(text) <= 8:
                    hsn_values.append(int(text))
                    break

    # Similarly for Quantity
    for pos in qty_positions:
        line_num = ocr_data['line_num'][pos]
        line_indices = [i for i, lnum in enumerate(ocr_data['line_num']) if lnum == line_num and i > pos]
        for idx in line_indices:
            text = ocr_data['text'][idx].strip()
            if text.isdigit():
                qty_values.append(int(text))
                break
        if not qty_values:
            next_line_indices = [i for i, lnum in enumerate(ocr_data['line_num']) if lnum == line_num + 1]
            for idx in next_line_indices:
                text = ocr_data['text'][idx].strip()
                if text.isdigit():
                    qty_values.append(int(text))
                    break

    return hsn_values, qty_values

# === Detect seal or signature presence and save image ===
def detect_seal_signature(images):
    # Simple heuristic: if image has any large dark blobs or unusual patterns in corners, mark True
    # Here, we just save first image and mark False as placeholder (implement advanced if needed)
    seal_present = False
    seal_image_path = None
    # For demo, no advanced detection - just return False and no image
    return seal_present, seal_image_path

# === Save outputs ===
def save_outputs(general_info, table_items):
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # Save general info JSON and Excel
    with open(os.path.join(OUTPUT_DIR, "general_info.json"), "w") as f:
        json.dump(general_info, f, indent=2)
    pd.DataFrame([general_info]).to_excel(os.path.join(OUTPUT_DIR, "general_info.xlsx"), index=False)

    # Save table contents JSON and Excel
    with open(os.path.join(OUTPUT_DIR, "table_contents.json"), "w") as f:
        json.dump(table_items, f, indent=2)
    if table_items:
        pd.DataFrame(table_items).to_excel(os.path.join(OUTPUT_DIR, "table_contents.xlsx"), index=False)
        print(f"Saved {len(table_items)} line items to 'table_contents.xlsx'")
    else:
        print("No table items found to save.")

    # Save combined JSON
    combined = {
        "general_info": general_info,
        "seal_and_sign_present": False,  # Update if seal/sign detection is implemented
        "seal_image": None,
        "line_items": table_items
    }
    with open(os.path.join(OUTPUT_DIR, "combined.json"), "w") as f:
        json.dump(combined, f, indent=2)

def main():
    if not os.path.exists(INPUT_PDF):
        print(f"Input PDF not found: {INPUT_PDF}")
        return

    # Convert PDF to images
    images = pdf_to_images(INPUT_PDF)

    # OCR all pages into single text
    full_text = ""
    print("Performing OCR on PDF pages...")
    for img in images:
        full_text += ocr_image_to_text(img) + "\n"

    # Extract general info from full text using Gemini
    print("Extracting general invoice information using LLM...")
    general_info = extract_general_info(full_text)

    # Extract table items using Gemini and regex fix
    print("Extracting table line items using LLM and regex...")
    table_items = extract_table_info(full_text)

    # Detect seal or signature (placeholder, returns False)
    seal_present, seal_image_path = detect_seal_signature(images)

    # Save outputs
    save_outputs(general_info, table_items)

    print("Extraction complete. Check the 'output' folder for results.")
    images = pdf_to_images(input_pdf)
full_text = ""
print("Performing OCR on images...")
for img in images:
    full_text += ocr_image_to_text(img) + "\n"

# Extract HSN and quantity from OCR positional data
hsn_values, qty_values = extract_hsn_qty_from_ocr(images[0])  # assuming 1 page invoice

# Use LLM to get line items JSON (without HSN and quantity)
table_items = extract_table_info(full_text)

# Assign HSN and quantity from OCR positional extraction
for idx, item in enumerate(table_items):
    item['hsn_sac'] = hsn_values[idx] if idx < len(hsn_values) else None
    item['quantity'] = qty_values[idx] if idx < len(qty_values) else None


if __name__ == "__main__":
    main()


Converting PDF to images...
Performing OCR on PDF pages...
Extracting general invoice information using LLM...
Extracting table line items using LLM and regex...
Saved 4 line items to 'table_contents.xlsx'
Extraction complete. Check the 'output' folder for results.


In [51]:
import os
import json
import re
import pytesseract
from pdf2image import convert_from_path
import pandas as pd
import requests
from PIL import Image
from word2number import w2n
import cv2
import numpy as np

# === CONFIG ===
API_KEY = "AIzaSyBYDxv5LCDXiWSU0uoANX1UmlNpF8WGKBs"  # Replace with your Google Gemini API key
INPUT_PDF = "input/1yavar_sample.pdf"  # Path to input PDF
OUTPUT_DIR = "output"
TESSERACT_CONFIG = "--psm 6"  # Tune based on input

# Uncomment and set if needed for your system:
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# === PDF to Images ===
def pdf_to_images(pdf_path):
    print("Converting PDF to images...")
    return convert_from_path(pdf_path)

# === OCR to extract raw text ===
def ocr_image_to_text(image):
    return pytesseract.image_to_string(image, config=TESSERACT_CONFIG)

# === Call Gemini API ===
def call_gemini_api(prompt_text, api_key):
    url = f"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent?key={api_key}"
    headers = {"Content-Type": "application/json"}
    data = {
        "contents": [
            {"parts": [{"text": prompt_text}]}
        ]
    }
    response = requests.post(url, json=data, headers=headers)
    response.raise_for_status()
    resp_json = response.json()
    return resp_json["candidates"][0]["content"]["parts"][0]["text"]

# === Clean Gemini JSON output ===
def clean_response_text(text):
    text = re.sub(r"^```json\s*", "", text.strip())
    text = re.sub(r"\s*```$", "", text.strip())
    return text.strip()

# === Extract general info via LLM ===
def extract_general_info(text):
    prompt = f"""
Extract invoice_number, invoice_date, supplier_gst_number, bill_to_gst_number, po_number, shipping_address from this text:

{text}

Return JSON with keys and values only.
"""
    response = call_gemini_api(prompt, API_KEY)
    cleaned = clean_response_text(response)
    try:
        return json.loads(cleaned)
    except json.JSONDecodeError:
        print("Failed to parse JSON from general info response:")
        print(cleaned)
        return {}

# === Extract table line items via LLM ===
def extract_table_info(text):
    prompt = f"""
Extract line items from the following invoice table text and return a JSON list.
Each item should have these keys: serial_number, description, hsn_sac, quantity, unit_price, total_amount.

Table Text:
{text}

Return only the JSON array.
"""
    response = call_gemini_api(prompt, API_KEY)
    cleaned = clean_response_text(response)
    try:
        # Clean up any markdown or code fences
        cleaned = re.sub(r"```(?:json)?\s*", "", cleaned)
        cleaned = cleaned.replace("```", "")
        items = json.loads(cleaned)
    except json.JSONDecodeError:
        print("❌ JSON parsing failed from Gemini response, returning empty list.")
        return []
    return items

# === Extract HSN and Quantity using OCR positional data ===
def extract_hsn_qty_from_ocr(image):
    ocr_data = pytesseract.image_to_data(image, output_type=pytesseract.Output.DICT)

    hsn_positions = []
    qty_positions = []

    # Find positions of HSN and QTY keywords
    for i, word in enumerate(ocr_data['text']):
        w = word.strip().lower()
        if 'hsn' in w:
            hsn_positions.append(i)
        if 'qty' in w or 'quantity' in w:
            qty_positions.append(i)

    hsn_values = []
    qty_values = []

    # Extract HSN numeric values near keyword positions
    for pos in hsn_positions:
        line_num = ocr_data['line_num'][pos]
        line_indices = [i for i, lnum in enumerate(ocr_data['line_num']) if lnum == line_num and i > pos]
        for idx in line_indices:
            text = ocr_data['text'][idx].strip()
            if text.isdigit() and 4 <= len(text) <= 8:
                hsn_values.append(int(text))
                break
        if len(hsn_values) == 0:
            # Check next line if none found
            next_line_indices = [i for i, lnum in enumerate(ocr_data['line_num']) if lnum == line_num + 1]
            for idx in next_line_indices:
                text = ocr_data['text'][idx].strip()
                if text.isdigit() and 4 <= len(text) <= 8:
                    hsn_values.append(int(text))
                    break

    # Extract Quantity numeric values near keywords
    for pos in qty_positions:
        line_num = ocr_data['line_num'][pos]
        line_indices = [i for i, lnum in enumerate(ocr_data['line_num']) if lnum == line_num and i > pos]
        for idx in line_indices:
            text = ocr_data['text'][idx].strip()
            if text.isdigit():
                qty_values.append(int(text))
                break
        if len(qty_values) == 0:
            next_line_indices = [i for i, lnum in enumerate(ocr_data['line_num']) if lnum == line_num + 1]
            for idx in next_line_indices:
                text = ocr_data['text'][idx].strip()
                if text.isdigit():
                    qty_values.append(int(text))
                    break

    return hsn_values, qty_values

def extract_tax_discount_from_regex(text):
    def extract_percent(label):
        pattern = rf"{label}\s*[@:]*\s*(\d+(?:\.\d+)?)\s*%"
        match = re.search(pattern, text, re.IGNORECASE)
        return float(match.group(1)) if match else None

    return {
        "discount_percent": extract_percent("discount"),
        "sgst_percent": extract_percent("sgst"),
        "cgst_percent": extract_percent("cgst"),
    }

def extract_tax_and_discount(text):
    # --- Call Gemini LLM ---
    prompt = f"""
Extract only the percentage values from this invoice text:

Example:
"SGST @ 6%" → sgst_percent: 6
"CGST RATE @ 9%" → cgst_percent: 9
"DISCOUNT @ 1%" → discount_percent: 1

Respond in this exact JSON format:
{{
  "discount_percent": <number or null>,
  "sgst_percent": <number or null>,
  "cgst_percent": <number or null>
}}

Only return the JSON. No explanation.

Text:
{text}
"""
    try:
        response = call_gemini_api(prompt, API_KEY)
        cleaned = clean_response_text(response)
        llm_data = json.loads(cleaned)
    except Exception as e:
        print("❌ LLM failed:", e)
        llm_data = {"discount_percent": None, "sgst_percent": None, "cgst_percent": None}

    # --- Regex fallback ---
    regex_data = extract_tax_discount_from_regex(text)

    # --- Combine with fallback ---
    final = {
        "discount_percent": llm_data.get("discount_percent") if llm_data.get("discount_percent") is not None else regex_data.get("discount_percent"),
        "sgst_percent": llm_data.get("sgst_percent") if llm_data.get("sgst_percent") is not None else regex_data.get("sgst_percent"),
        "cgst_percent": llm_data.get("cgst_percent") if llm_data.get("cgst_percent") is not None else regex_data.get("cgst_percent"),
    }

    print("✅ Final tax/discount info:", final)
    return final



def get_field_char_confidences(ocr_data, target_field):
    """
    Tries to locate the word corresponding to the field in the OCR data and returns character-level confidence.
    This is a naive approach. You may improve it using regex patterns for specific fields.
    """
    field_aliases = {
        "invoice_number": ["invoice number", "inv no", "invoice no"],
        "invoice_date": ["invoice date", "date"],
        "supplier_gst_number": ["supplier gst", "supplier gstin", "gstin"],
        "bill_to_gst_number": ["bill to gst", "bill to gstin", "gstin"],
        "po_number": ["po number", "purchase order", "po no"],
        "shipping_address": ["shipping address", "ship to", "delivery address"]
    }

    confidences = []

    text = ocr_data["text"]
    num_words = len(text)

    aliases = field_aliases.get(target_field, [target_field])

    for idx in range(num_words):
        current_text = text[idx].lower().strip()
        for alias in aliases:
            alias_words = alias.split()
            match = True
            for offset, word in enumerate(alias_words):
                if idx + offset >= num_words or text[idx + offset].lower().strip() != word:
                    match = False
                    break
            if match:
                # Try getting next meaningful word as value
                value_idx = idx + len(alias_words)
                while value_idx < num_words and not text[value_idx].strip():
                    value_idx += 1
                if value_idx < num_words:
                    value_text = text[value_idx]
                    value_conf = ocr_data['conf'][value_idx]
                    for char in value_text:
                        if char.strip():
                            confidences.append((char, float(value_conf)))
                return confidences  # Only return first match
    return confidences


def get_weighted_confidence(word_confidences):
    """
    Calculate weighted confidence for a word.
    For numerical fields, apply higher weight to initial digits.
    """
    if not word_confidences:
        return 1.0  # Default confidence for empty input

    # Check if the word represents a number
    is_number = all(char.isdigit() or char == '.' for char, _ in word_confidences)

    if is_number:
        weight = 1.0
        total_weight = 0.0
        weighted_sum = 0.0
        for char, conf in word_confidences:
            weighted_sum += conf * weight
            total_weight += weight
            weight /= 10  # Decay weight for subsequent digits
        return weighted_sum / total_weight if total_weight else 0.0
    else:
        # For non-numeric fields, return average confidence
        total_confidence = sum(conf for _, conf in word_confidences)
        return total_confidence / len(word_confidences)


def extract_general_info_with_confidence(image):
    """
    Extract general information fields along with their confidence scores.
    """
    ocr_data = pytesseract.image_to_data(image, output_type=pytesseract.Output.DICT)
    fields = ['invoice_number', 'invoice_date', 'supplier_gst_number', 'bill_to_gst_number', 'po_number', 'shipping_address']
    field_verification = {}

    for field in fields:
        # Implement logic to locate the field in ocr_data and extract its characters and confidences
        # For demonstration, let's assume we have a function get_field_char_confidences that does this
        word_confidences = get_field_char_confidences(ocr_data, field)
        confidence = get_weighted_confidence(word_confidences)
        field_verification[field] = {
            "confidence": round(confidence, 2),
            "present": bool(word_confidences)
        }

    return field_verification

def save_field_verification(field_verification):
    output = {
        "field_verification": field_verification,
        "line_items_verification": [],
        "total_calculations_verification": {
            "subtotal_check": {
                "check_passed": True
            },
            "grand_total_check": {
                "check_passed": True
            }
        },
        "summary": {
            "all_fields_confident": all(f["confidence"] >= 0.8 for f in field_verification.values()),
            "all_line_items_verified": True,
            "totals_verified": True,
            "issues": []
        }
    }

    with open(os.path.join(OUTPUT_DIR, "field_verification.json"), "w") as f:
        json.dump(output, f, indent=2)
    print(f"Field verification saved to {os.path.join(OUTPUT_DIR, 'field_verification.json')}")



def parse_quantity(qty):
    if qty is None:
        return 0
    if isinstance(qty, (int, float)):
        return qty
    if isinstance(qty, str):
        qty = qty.strip().lower()
        try:
            # Try converting direct digits string first
            return float(qty)
        except:
            try:
                # Try converting word number to int
                return w2n.word_to_num(qty)
            except:
                # If fails, just return 0
                return 0
    return 0

def calculate_summary(general_info, table_items):
    total_quantity = 0
    total_invoice_amount = 0.0
    unit_prices = []

    for item in table_items:
        qty = item.get('quantity')
        qty_num = parse_quantity(qty)
        total_quantity += qty_num

        amount = item.get('total_amount')
        if amount is not None and isinstance(amount, (int, float)):
            total_invoice_amount += amount

        price = item.get('unit_price')
        if price is not None and isinstance(price, (int, float)):
            unit_prices.append(price)

    average_unit_price = sum(unit_prices) / len(unit_prices) if unit_prices else None

    summary = {
        "total_quantity": total_quantity,
        "total_invoice_amount": round(total_invoice_amount, 2),
        "average_unit_price": round(average_unit_price, 2) if average_unit_price is not None else None,
        "invoice_number": general_info.get("invoice_number", ""),
        "invoice_date": general_info.get("invoice_date", "")
    }
    return summary


# === Save calculated summary separately ===
def save_calculated_summary(summary):
    summary_path = os.path.join(OUTPUT_DIR, "calculated_summary.json")
    with open(summary_path, "w") as f:
        json.dump(summary, f, indent=2)
    print(f"Calculated summary saved to {summary_path}")

def save_outputs(general_info, table_items, seal_present=False, seal_image_path=None, tax_info=None):
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    with open(os.path.join(OUTPUT_DIR, "general_info.json"), "w") as f:
        json.dump(general_info, f, indent=2)

    pd.DataFrame([general_info]).to_excel(os.path.join(OUTPUT_DIR, "general_info.xlsx"), index=False)

    with open(os.path.join(OUTPUT_DIR, "table_contents.json"), "w") as f:
        json.dump(table_items, f, indent=2)

    if table_items:
        pd.DataFrame(table_items).to_excel(os.path.join(OUTPUT_DIR, "table_contents.xlsx"), index=False)
        print(f"Saved {len(table_items)} line items to 'table_contents.xlsx'")
    else:
        print("No line items found to save.")

    combined = {
        "general_info": general_info,
        "seal_and_sign_present": seal_present,
        "seal_image": seal_image_path,
        "line_items": table_items,
        "tax_info": tax_info
    }

    with open(os.path.join(OUTPUT_DIR, "combined.json"), "w") as f:
        json.dump(combined, f, indent=2)


def main():
    if not os.path.exists(INPUT_PDF):
        print(f"Input PDF not found: {INPUT_PDF}")
        return

    # Convert PDF pages to images
    images = pdf_to_images(INPUT_PDF)

    # OCR all pages to single text
    print("Performing OCR on PDF pages...")
    full_text = ""
    for img in images:
        full_text += ocr_image_to_text(img) + "\n"

    # Extract general info using LLM
    print("Extracting general invoice information via LLM...")
    general_info = extract_general_info(full_text)

    # Extract raw table items using LLM
    print("Extracting table line items via LLM...")
    table_items = extract_table_info(full_text)

    # Extract HSN and Quantity using OCR positional data on first page image
    print("Extracting HSN and Quantity values via OCR positional data...")
    hsn_values, qty_values = extract_hsn_qty_from_ocr(images[0])  # Assuming first page

    # Overwrite HSN and Quantity in table items with OCR extracted values if present
    for idx, item in enumerate(table_items):
        if idx < len(hsn_values):
            item['hsn_sac'] = hsn_values[idx]
        else:
            item['hsn_sac'] = item.get('hsn_sac', None)

        if idx < len(qty_values):
            item['quantity'] = qty_values[idx]
        else:
            item['quantity'] = item.get('quantity', None)

        # Clean unit_price and total_amount fields
        def clean_num(x):
            if not x:
                return None
            x = str(x).replace("Rs.", "").replace("Rs", "").replace(",", "").strip()
            try:
                return float(x)
            except:
                return None

        item['unit_price'] = clean_num(item.get('unit_price'))
        item['total_amount'] = clean_num(item.get('total_amount'))

        # Ensure serial_number and description are present
        item['serial_number'] = idx + 1
        item['description'] = item.get('description', f"Item {idx + 1}")


    # Extract confidence verification of fields using OCR data on first page
    field_verification = extract_general_info_with_confidence(images[0])
    save_field_verification(field_verification)

    # Calculate summary data from extracted info
    summary = calculate_summary(general_info, table_items)
    save_calculated_summary(summary)

    # Extract tax and discount info
    print("Extracting tax and discount information via LLM...")
    tax_info = extract_tax_info(full_text)
    
    # Save all outputs
    save_outputs(general_info, table_items, tax_info=tax_info)

    with open(os.path.join(OUTPUT_DIR, "tax_info.json"), "w") as f:
        json.dump(tax_info, f, indent=2)



    print("Tax and discount info:", tax_info)

    print("Processing complete.")


    print("Extraction and calculation complete. Check the 'output' folder for results.")

    print("Extraction complete. Check the 'output' folder for results.")

if __name__ == "__main__":
    main()


Converting PDF to images...
Performing OCR on PDF pages...
Extracting general invoice information via LLM...
Extracting table line items via LLM...
Extracting HSN and Quantity values via OCR positional data...
Field verification saved to output\field_verification.json
Calculated summary saved to output\calculated_summary.json
Extracting tax and discount information via LLM...
Saved 4 line items to 'table_contents.xlsx'
Tax and discount info: {'discount_percent': None, 'sgst_percent': 9, 'cgst_percent': 9}
Processing complete.
Extraction and calculation complete. Check the 'output' folder for results.
Extraction complete. Check the 'output' folder for results.


In [31]:
import os
from pdf2image import convert_from_path
import pytesseract
import re
import pandas as pd
from PIL import Image

pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

OUTPUT_DIR = r"output"
SEAL_DIR = os.path.join(OUTPUT_DIR, "seals")
os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(SEAL_DIR, exist_ok=True)

def extract_images_from_pdf(pdf_path):
    print("Converting PDF pages to images...")
    images = convert_from_path(pdf_path, dpi=300)
    print(f"Converted {len(images)} pages.")
    return images

def detect_seal_signature(image, invoice_index):
    # Heuristic for seal/sign presence:
    width, height = image.size
    crop_area = (
    int(width * 0.65),      # x1 (start x)
    int(height * 0.65),     # y1 (start y)
    int(width * 0.95),                 # x2 (end x)
    int(height * 0.9)     # y2 (end y, now 95% instead of 100%)
    )  # bottom right quarter
    cropped = image.crop(crop_area)
    gray = cropped.convert('L')
    hist = gray.histogram()
    dark_pixels = sum(hist[:30])  # count dark pixels in dark range
    total_pixels = sum(hist)
    ratio_dark = dark_pixels / total_pixels if total_pixels > 0 else 0
    if ratio_dark > 0.02:  # threshold
        path = os.path.join(SEAL_DIR, f"seal_invoice_{invoice_index}.png")
        cropped.save(path)
        print(f"Seal/signature detected and saved to {path}")
        return True
    return False

def main(pdf_path):
    # Step 1: Convert PDF to images
    images = extract_images_from_pdf(pdf_path)
    
    # Step 2: Set the page and invoice index (assuming only 1 invoice for now)
    page_index = 0
    idx = 1  # You can use a loop later if processing multiple invoices
    
    # Step 3: Run seal detection
    seal_present = detect_seal_signature(images[page_index], idx)
    
    # Step 4: Example general_info dict
    general_info = {}  # Normally populated by OCR/LLM pipeline
    general_info['seal_and_sign_present'] = seal_present
    
    print("Seal detection result:", seal_present)


if __name__ == "__main__":
    pdf_path = r"C:\Users\csmuk\yavar\input\1WhatsApp Image.pdf"
    main(pdf_path)

Converting PDF pages to images...
Converted 1 pages.
Seal/signature detected and saved to output\seals\seal_invoice_1.png
Seal detection result: True


In [35]:
import os
from pdf2image import convert_from_path
import pytesseract
import re
import pandas as pd

pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

OUTPUT_DIR = r"output"
os.makedirs(OUTPUT_DIR, exist_ok=True)

def extract_text_from_pdf(pdf_path):
    print("Converting PDF to image...")
    images = convert_from_path(pdf_path, dpi=300)
    print(f"PDF has {len(images)} page(s).")

    # Since only 1 page, take first image
    image = images[0]

    print("Performing OCR on full page image...")
    text = pytesseract.image_to_string(image, config='--psm 6')  # Assume a single uniform block of text
    return text

def extract_discount_gst_final_total(ocr_text):
    # Adjust regex to your invoice text patterns
    discount_match = re.search(r'Discount\s*[:\-]?\s*₹?\s*([\d,\.]+)', ocr_text, re.IGNORECASE)
    gst_match = re.search(r'GST\s*[:\-]?\s*₹?\s*([\d,\.]+)', ocr_text, re.IGNORECASE)
    final_match = re.search(r'(Grand Total|Final Total|Total Amount)\s*[:\-]?\s*₹?\s*([\d,\.]+)', ocr_text, re.IGNORECASE)

    discount = float(discount_match.group(1).replace(',', '')) if discount_match else 0.0
    gst = float(gst_match.group(1).replace(',', '')) if gst_match else 0.0
    final_total = float(final_match.group(2).replace(',', '')) if final_match else 0.0

    print(f"Extracted => Discount: ₹{discount}, GST: ₹{gst}, Final Total: ₹{final_total}")
    return discount, gst, final_total

def validate_invoice(table_path, general_path, discount, gst, final_total):
    # Load extracted data from Excel
    table_df = pd.read_excel(table_path)
    general_df = pd.read_excel(general_path)

    # Validate line items
    line_errors = []
    for idx, row in table_df.iterrows():
        qty = row['quantity']
        unit = row['unit_price']
        total = row['total_amount']
        if round(qty * unit, 2) != round(total, 2):
            line_errors.append((idx+1, qty, unit, total))

    # Calculate subtotal
    subtotal = round(table_df['total_amount'].sum(), 2)

    # Validate totals
    expected_final = round(subtotal - discount + gst, 2)
    final_matches = round(final_total, 2) == expected_final

    # Flags
    general_info_flags = {
        'subtotal_verified': True,
        'final_total_verified': final_matches,
        'discount': discount,
        'gst': gst,
        'final_total': final_total
    }

    # Prepare validated dataframes to save
    general_df['verified'] = final_matches
    table_df['line_item_verified'] = True
    for idx, _, _, _ in line_errors:
        table_df.loc[idx-1, 'line_item_verified'] = False

    # Save validated excel files
    table_df.to_excel(os.path.join(OUTPUT_DIR, 'table_contents_validated.xlsx'), index=False)
    general_df.to_excel(os.path.join(OUTPUT_DIR, 'general_info_validated.xlsx'), index=False)

    # Write log file
    with open(os.path.join(OUTPUT_DIR, 'validation_log.txt'), 'w', encoding='utf-8') as f:
        if line_errors:
            f.write("Line Item Errors:\n")
            for line_no, qty, unit, total in line_errors:
                f.write(f"Row {line_no}: quantity({qty}) * unit_price({unit}) != total_amount({total})\n")
        else:
            f.write("✅ All line items verified.\n")

        f.write(f"\nSubtotal: ₹{subtotal}\n")
        f.write(f"Discount: ₹{discount}\n")
        f.write(f"GST: ₹{gst}\n")
        f.write(f"Expected Final Total: ₹{expected_final}\n")
        f.write(f"Actual Final Total: ₹{final_total}\n")
        f.write(f"Final total match: {'Yes' if final_matches else 'No'}\n")

    print("✅ Validation complete. Results saved to:")
    print(f"  - {os.path.join(OUTPUT_DIR, 'table_contents_validated.xlsx')}")
    print(f"  - {os.path.join(OUTPUT_DIR, 'general_info_validated.xlsx')}")
    print(f"  - {os.path.join(OUTPUT_DIR, 'validation_log.txt')}")

if __name__ == "__main__":
    pdf_path = r"C:\Users\csmuk\yavar\input\1WhatsApp Image.pdf"
    table_path = r"output/table_contents.xlsx"
    general_path = r"output/general_info.xlsx"

    ocr_text = extract_text_from_pdf(pdf_path)
    discount, gst, final_total = extract_discount_gst_final_total(ocr_text)
    validate_invoice(table_path, general_path, discount, gst, final_total)


Converting PDF to image...
PDF has 1 page(s).
Performing OCR on full page image...
Extracted => Discount: ₹0.0, GST: ₹0.0, Final Total: ₹0.0
✅ Validation complete. Results saved to:
  - output\table_contents_validated.xlsx
  - output\general_info_validated.xlsx
  - output\validation_log.txt


In [39]:
from pdf2image import convert_from_path
import pytesseract
import re

def extract_percentages_from_pdf(pdf_path):
    # Convert PDF page(s) to image(s)
    images = convert_from_path(pdf_path)
    # Assuming 1-page PDF
    image = images[0]

    # OCR full page
    ocr_text = pytesseract.image_to_string(image)

    # Regex to find Discount % - common patterns like "DISCOUNT @ 1%" or "Discount: 1%"
    discount_match = re.search(r'DISCOUNT\s*@?\s*(\d+\.?\d*)\s*%', ocr_text, re.IGNORECASE)
    discount_percent = float(discount_match.group(1)) if discount_match else 0.0

    # Regex for SGST % and CGST % (or just GST % if CGST/SGST not split)
    sgst_match = re.search(r'SGST\s*RATE\s*@?\s*(\d+\.?\d*)\s*%', ocr_text, re.IGNORECASE)
    cgst_match = re.search(r'CGST\s*RATE\s*@?\s*(\d+\.?\d*)\s*%', ocr_text, re.IGNORECASE)

    sgst_percent = float(sgst_match.group(1)) if sgst_match else 0.0
    cgst_percent = float(cgst_match.group(1)) if cgst_match else 0.0

    return discount_percent, sgst_percent, cgst_percent

# Example usage
pdf_path = r"C:\Users\csmuk\yavar\input\1WhatsApp Image.pdf"
discount, sgst, cgst = extract_percentages_from_pdf(pdf_path)
print(f"Extracted Discount: {discount}%, SGST: {sgst}%, CGST: {cgst}%")


Extracted Discount: 0.0%, SGST: 6.0%, CGST: 0.0%


In [40]:
import pytesseract
from pdf2image import convert_from_path
import requests

API_KEY = "AIzaSyBYDxv5LCDXiWSU0uoANX1UmlNpF8WGKBs"
INPUT_PDF = "input/1WhatsApp Image.pdf"
TESSERACT_CONFIG = "--psm 6"
OUTPUT_DIR = "output"

def extract_text_from_pdf(pdf_path):
    images = convert_from_path(pdf_path)
    text = ""
    for img in images:
        text += pytesseract.image_to_string(img, config=TESSERACT_CONFIG)
    return text

def get_discount_from_llm(text, api_key):
    # Example Google LLM API endpoint (fictional placeholder)
    url = "https://generativelanguage.googleapis.com/v1beta2/models/text-bison-001:generateText?key=" + api_key

    prompt = f"Given this invoice/product info, calculate the discount:\n\n{text}\n\nDiscount:"
    payload = {
        "prompt": {
            "text": prompt
        },
        "temperature": 0.7,
        "maxOutputTokens": 100
    }

    response = requests.post(url, json=payload)
    if response.status_code == 200:
        data = response.json()
        # Assuming response has a 'candidates' list with 'output' text
        discount_info = data['candidates'][0]['output']
        return discount_info
    else:
        return f"Error: {response.status_code} - {response.text}"

def main():
    text = extract_text_from_pdf(INPUT_PDF)
    discount = get_discount_from_llm(text, API_KEY)
    print("Discount info from LLM:\n", discount)

if __name__ == "__main__":
    main()


Discount info from LLM:
 Error: 404 - {
  "error": {
    "code": 404,
    "message": "Requested entity was not found.",
    "status": "NOT_FOUND"
  }
}

