In [1]:
!pip install pytesseract opencv-python pillow pandas openpyxl


Collecting pytesseract
  Using cached pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Collecting opencv-python
  Using cached opencv_python-4.11.0.86-cp37-abi3-win_amd64.whl.metadata (20 kB)
Using cached pytesseract-0.3.13-py3-none-any.whl (14 kB)
Using cached opencv_python-4.11.0.86-cp37-abi3-win_amd64.whl (39.5 MB)
Installing collected packages: pytesseract, opencv-python
Successfully installed opencv-python-4.11.0.86 pytesseract-0.3.13


In [9]:
def extract_general_fields(text):
    """Pull header-level invoice fields out of OCR text with regular expressions.

    Args:
        text: OCR output for one page. ``None`` or an empty string is treated
            as "nothing recognised" and yields empty values for every field.

    Returns:
        A ``(fields, conf)`` tuple of dicts sharing the same keys
        (``invoice_number``, ``invoice_date``, ``supplier_gst_number``,
        ``bill_to_gst_number``, ``po_number``, ``shipping_address``).
        ``fields`` maps each key to the stripped captured string, or ``""``
        when its pattern did not match. ``conf`` maps each key to ``0.9`` on a
        match and ``0.0`` otherwise; the score is a fixed constant, not a value
        obtained from the OCR engine.
    """
    # Treat missing OCR output as empty text instead of letting re.search raise.
    text = text or ""

    fields = {}
    conf = {}

    patterns = {
        'invoice_number': r'\b(?:Invoice No|Inv No|Invoice Number)\b[\s:\-]*([A-Z0-9\-]{3,})',
        'invoice_date': r'\b(?:Date|Invoice Date)\b[\s:\-]*([0-9]{2}[\/\-][0-9]{2}[\/\-][0-9]{4})',
        # The first GSTIN found anywhere in the text is taken as the supplier's.
        'supplier_gst_number': r'GSTIN[\s:\-]*([0-9A-Z]{15})',
        # Relies on DOTALL so that "Bill ..." and the following "GSTIN" may sit
        # on different lines.
        'bill_to_gst_number': r'Bill.*?GSTIN[\s:\-]*([0-9A-Z]{15})',
        'po_number': r'\b(?:PO No|Purchase Order)\b[\s:\-]*([A-Z0-9\-]+)',
        # Because every pattern is searched with DOTALL, an unbounded "(.+)"
        # would run to the end of the document. The capture is therefore lazy
        # and stops at the first blank line (or end of text), i.e. it covers
        # only the address paragraph that follows the label.
        'shipping_address': r'Shipping Address[\s:\-]*(.+?)(?=\n\s*\n|\Z)',
    }

    for key, pattern in patterns.items():
        match = re.search(pattern, text, re.IGNORECASE | re.DOTALL)
        if match:
            fields[key] = match.group(1).strip()
            conf[key] = 0.9
        else:
            fields[key] = ""
            conf[key] = 0.0

    return fields, conf

def extract_line_items(text):
    """Parse invoice table rows out of OCR text, one row per text line.

    A line is treated as a line item when, after stripping, it starts with an
    integer serial number and ends with three whitespace-separated decimal
    numbers (quantity, unit price, total). Those numbers may carry thousands
    separators (``1,250.50``). The token just before them is taken as the
    HSN/SAC code and everything between the serial number and that token is
    the description.

    Args:
        text: OCR output for one page; ``None`` or ``""`` yields no rows.

    Returns:
        A list of dicts with keys ``serial_number``, ``description``,
        ``hsn_sac``, ``quantity``, ``unit_price``, ``total_amount``,
        ``description_confidence`` (fixed at ``0.9``) and ``line_total_check``.
        The latter holds ``calculated_value`` (quantity * unit price, rounded
        to 2 places), ``extracted_value`` and ``check_passed``, which is true
        when the two differ by less than 1. Lines whose numeric tokens cannot
        be converted are reported on stdout and skipped.
    """
    # A decimal number, optionally with thousands separators such as 1,250.50.
    number = r'\d[\d,]*\.\d+'
    row_pattern = re.compile(rf'^\d+\s+.*{number}\s+{number}\s+{number}$')

    def to_float(token):
        # float() rejects "1,250.50", so drop the grouping commas first.
        return float(token.replace(",", ""))

    rows = []
    for line in (text or "").split('\n'):
        if not row_pattern.search(line.strip()):
            continue

        tokens = line.split()
        try:
            # The pattern guarantees a serial number plus three numeric tokens.
            *leading, qty_token, price_token, total_token = tokens
            quantity = to_float(qty_token)
            unit_price = to_float(price_token)
            total_amount = to_float(total_token)
        except (ValueError, IndexError) as e:
            print(f"⚠️ Line parsing failed: {line} with error {e}")
            continue

        # With only the serial number in front of the amounts there is no
        # HSN/SAC column; leave it empty instead of repeating the serial number.
        hsn_sac = leading[-1] if len(leading) > 1 else ""
        expected_total = unit_price * quantity

        rows.append({
            "serial_number": leading[0],
            "description": ' '.join(leading[1:-1]),
            "hsn_sac": hsn_sac,
            "quantity": quantity,
            "unit_price": unit_price,
            "total_amount": total_amount,
            "description_confidence": 0.9,
            "line_total_check": {
                "calculated_value": round(expected_total, 2),
                "extracted_value": total_amount,
                "check_passed": abs(expected_total - total_amount) < 1,
            },
        })
    return rows

def main():
    """Run the invoice pipeline over every PDF in ``INPUT_DIR``.

    For each PDF the pages returned by ``pdf_to_images`` are preprocessed,
    saved under ``PREPROCESSED_DIR``, OCR'd, and parsed with
    ``extract_general_fields`` / ``extract_line_items``. Pages that yield no
    field value and no line item are skipped. Retained pages are checked for a
    seal/signature and passed to ``verify_data``.

    Outputs written to ``OUTPUT_DIR``:
        * ``extracted_data.xlsx`` - the line items of all retained pages
          (written only when at least one line item was found).
        * ``extracted_data.json`` - the record of the FIRST retained page only.
        * ``verifiability_report.json`` - the verification report of the LAST
          retained page (each page overwrites the previous report).

    NOTE(review): the JSON file and the report describe different pages when
    more than one page is retained; this is kept as-is because changing it
    would alter the output schema. Confirm the intended behaviour.

    Returns:
        None. Progress and failures are reported on stdout.
    """
    all_data = []
    all_line_items = []
    verif_report = {
        "field_verification": {},
        "line_items_verification": [],
        "total_calculations_verification": {},
        "summary": {}
    }

    # Case-insensitive match so "INVOICE.PDF" is picked up too; sorted so the
    # processing order (and therefore all_data[0]) is deterministic.
    files = sorted(name for name in os.listdir(INPUT_DIR) if name.lower().endswith(".pdf"))
    if not files:
        print("❌ No PDF files found in the input folder.")
        return

    # Make sure the target folders exist before anything is saved into them.
    os.makedirs(PREPROCESSED_DIR, exist_ok=True)
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    for file in files:
        pdf_path = os.path.join(INPUT_DIR, file)
        # pdf_to_images is expected to yield (page index, image) pairs, as
        # unpacked by the loop below - defined outside this block.
        images = pdf_to_images(pdf_path)

        for idx, img in images:
            processed = preprocess_image(img)
            # NOTE(review): the file name depends only on the page index, so
            # pages of different PDFs overwrite each other on disk. The image
            # is consumed right after saving, so results are unaffected.
            processed_path = os.path.join(PREPROCESSED_DIR, f"page_{idx}.png")
            processed.save(processed_path)

            _, text = ocr_with_confidence(processed)

            print("🔤 OCR Text Output:\n", text[:1000])

            general_fields, conf_scores = extract_general_fields(text)
            line_items = extract_line_items(text)

            # general_fields always contains every key (empty string when a
            # field is missing), so test the values rather than the dict - and
            # do it before the seal flag is added - otherwise this branch can
            # never trigger.
            if not any(general_fields.values()) and not line_items:
                print("⚠️ No fields or items detected. Skipping this page.")
                continue

            seal_detected = detect_seal_or_sign(processed_path)
            general_fields["seal_and_sign_present"] = seal_detected

            verification = verify_data(general_fields, line_items)

            combined = general_fields.copy()
            combined["line_items"] = line_items
            all_data.append(combined)

            # Collected here and written once after the loop; writing the
            # workbook per page kept only the last page's rows.
            all_line_items.extend(line_items)

            verif_report["field_verification"] = {
                k: {"confidence": v, "present": bool(general_fields[k])} for k, v in conf_scores.items()
            }
            verif_report["line_items_verification"] = verification["line_items_verification"]
            verif_report["total_calculations_verification"] = {
                k: v for k, v in verification.items() if k.endswith("_check")
            }
            verif_report["summary"] = {
                "all_fields_confident": all(v > 0.8 for v in conf_scores.values()),
                "all_line_items_verified": all(x["line_total_check"]["check_passed"] for x in verification["line_items_verification"]),
                "totals_verified": all(x["check_passed"] for x in verification.values() if isinstance(x, dict)),
                "issues": []
            }

    if not all_data:
        print("❌ No invoice data could be extracted.")
        return

    if all_line_items:
        df = pd.DataFrame(all_line_items)
        df.to_excel(os.path.join(OUTPUT_DIR, "extracted_data.xlsx"), index=False)

    with open(os.path.join(OUTPUT_DIR, "extracted_data.json"), "w", encoding="utf-8") as f:
        json.dump(all_data[0], f, indent=2)

    with open(os.path.join(OUTPUT_DIR, "verifiability_report.json"), "w", encoding="utf-8") as f:
        json.dump(verif_report, f, indent=2)

    print("✅ Extraction and Verification Completed.")