<a href="https://colab.research.google.com/github/Ankush-kadu/zolvit_oe22s027/blob/main/Ankush_kadu_oe22s027.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Best model so far.

In [None]:

!pip install PyPDF2

!apt-get install -y poppler-utils tesseract-ocr
!pip install pdf2image pytesseract pdfplumber transformers torch easyocr

import pytesseract
from pdf2image import convert_from_path
import pdfplumber
from transformers import LayoutLMv3Processor, LayoutLMv3ForTokenClassification
import torch
from collections import defaultdict
import random
import os
import easyocr
import numpy as np

# Initialize Huggingface LayoutLM Model and Processor
# apply_ocr=True asks the processor to run OCR on the page image itself; every
# call in this file passes only `images=` (no words or bounding boxes).
processor = LayoutLMv3Processor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=True)
# NOTE(review): the base checkpoint ships without a trained token-classification
# head -- loading it reports `classifier.weight` / `classifier.bias` as newly
# initialized -- so its label predictions are not meaningful until fine-tuned.
model = LayoutLMv3ForTokenClassification.from_pretrained("microsoft/layoutlmv3-base")

# English-only EasyOCR reader, created once at import time and shared by
# extract_with_easyocr.
reader = easyocr.Reader(['en'])

def split_pages(images, max_length=512):
    """Return the page images, halving any page that encodes to too many tokens.

    Each image is run through the module-level ``processor``.  When the
    resulting ``input_ids`` sequence is longer than ``max_length`` the page is
    replaced by two crops: its left half followed by its right half (the cut
    is at ``width // 2``).  Pages within the limit are passed through as-is.

    Args:
        images: Iterable of PIL-style images (must expose ``.size`` and
            ``.crop``).
        max_length: Largest token count a page may have before it is split.

    Returns:
        A list of images in page order, with oversized pages expanded into
        their two halves.

    Note:
        The halves are not measured again, so a half can still exceed
        ``max_length``.
    """
    pieces = []
    for page in images:
        token_count = processor(images=page, return_tensors="pt")['input_ids'].size(1)
        if token_count <= max_length:
            pieces.append(page)
            continue

        # Oversized page: cut it down the vertical centre line.
        page_width, page_height = page.size
        mid_x = page_width // 2
        left_half = page.crop((0, 0, mid_x, page_height))
        right_half = page.crop((mid_x, 0, page_width, page_height))
        pieces.extend((left_half, right_half))
    return pieces


def extract_with_tesseract(pdf_path):
    """Rasterise the PDF and return the Tesseract OCR text of all its pages.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        One string holding the OCR output of every page, concatenated in page
        order with no extra separator.
    """
    page_images = convert_from_path(pdf_path)
    return "".join(pytesseract.image_to_string(page) for page in page_images)


def extract_with_pdfplumber(pdf_path):
    """Collect the embedded text and tables of a PDF with pdfplumber.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A dict with two keys: ``"text"``, the text of every page that yielded
        any, each followed by a newline; and ``"tables"``, a flat list of all
        tables found across the pages, in page order.
    """
    text_parts = []
    all_tables = []
    with pdfplumber.open(pdf_path) as document:
        for page in document.pages:
            page_tables = page.extract_tables()
            if page_tables:
                all_tables.extend(page_tables)

            page_text = page.extract_text()
            if page_text:
                text_parts.append(page_text + "\n")

    return {"text": "".join(text_parts), "tables": all_tables}


def extract_with_layoutlm(pdf_path):
    """Label every token of the PDF with the LayoutLM token classifier.

    The PDF is rasterised, oversized pages are split with ``split_pages``, and
    each resulting image is encoded by the module-level ``processor`` and
    classified by the module-level ``model``.

    The model's logits range over *label classes*, not over the tokenizer
    vocabulary, so the arg-max indices are translated with
    ``model.config.id2label``.  (Looking them up in the tokenizer vocabulary,
    as was done previously, returns unrelated vocabulary entries instead of
    anything read from the document.)  Each predicted label is paired with the
    input token it belongs to; the tokenizer's special tokens are skipped.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A single space-separated string of ``"<token>/<label>"`` items in
        reading order of the encoded chunks; an empty string if nothing was
        encoded.
    """
    images = convert_from_path(pdf_path)
    tokenizer = processor.tokenizer
    special_ids = set(tokenizer.all_special_ids)
    id2label = model.config.id2label

    labelled_tokens = []
    for image in split_pages(images):
        # Truncation keeps the sequence within the model's position limit even
        # when a chunk is still too long after splitting.
        encoding = processor(images=image, return_tensors="pt", truncation=True, max_length=512)
        with torch.no_grad():
            outputs = model(**encoding)

        # Index the single batch element explicitly; unlike squeeze() this
        # still yields a list when the sequence has length one.
        input_ids = encoding["input_ids"][0].tolist()
        label_ids = torch.argmax(outputs.logits, dim=-1)[0].tolist()

        for token_id, label_id in zip(input_ids, label_ids):
            if token_id in special_ids:
                continue
            token = tokenizer.convert_ids_to_tokens(token_id)
            labelled_tokens.append(f"{token}/{id2label[label_id]}")

    return " ".join(labelled_tokens)


def extract_with_easyocr(pdf_path):
    """Rasterise the PDF and return the text EasyOCR detects on its pages.

    Each page image is converted to a NumPy array and passed to the
    module-level ``reader``.  Every detection is a three-element result; only
    its middle element (the recognised text) is kept.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        One string with every detected text fragment, each terminated by a
        newline, in page order.
    """
    lines = []
    for page in convert_from_path(pdf_path):
        detections = reader.readtext(np.array(page))
        lines.extend(found_text + "\n" for _first, found_text, _last in detections)
    return "".join(lines)


def _iter_cells(node):
    """Yield every non-list leaf nested anywhere inside ``node``."""
    if isinstance(node, (list, tuple)):
        for child in node:
            yield from _iter_cells(child)
    else:
        yield node


def compute_confidence_score(extracted_data):
    """Score extracted data with a deterministic, content-based heuristic.

    The score is a rough plausibility measure, not a calibrated probability:

    * ``str``: the share of non-whitespace characters that are alphanumeric.
      OCR noise tends to show up as stray symbols, which lowers the score.
    * ``list`` (tables, arbitrarily nested): the share of leaf cells that are
      neither ``None`` nor blank.

    The same input always yields the same score, so results are reproducible
    and comparable between extractors.

    Args:
        extracted_data: A string, a (nested) list of table cells, or anything
            else.

    Returns:
        A float in ``[0.0, 1.0]`` rounded to two decimals.  Empty or
        whitespace-only strings, empty lists, lists without any cells, falsy
        values and unsupported types all score ``0.0``.
    """
    if not extracted_data:
        return 0.0

    if isinstance(extracted_data, str):
        visible = [ch for ch in extracted_data if not ch.isspace()]
        if not visible:
            return 0.0
        alnum_count = sum(ch.isalnum() for ch in visible)
        return round(alnum_count / len(visible), 2)

    if isinstance(extracted_data, list):
        cells = list(_iter_cells(extracted_data))
        if not cells:
            return 0.0
        filled = sum(1 for cell in cells if cell is not None and str(cell).strip())
        return round(filled / len(cells), 2)

    return 0.0


def ensemble_extraction(pdf_path):
    """Run every extractor on a PDF and keep the best-scoring value per field.

    Two fields are produced:

    * ``"text"``: chosen among Tesseract, pdfplumber, LayoutLM and EasyOCR.
    * ``"tables"``: only pdfplumber produces tables, so it is the sole
      candidate.

    Only extractors that can actually supply a field compete for it.  This
    matters when no tables are found: the ``"tables"`` value is then an empty
    list with confidence ``0.0``, instead of falling through to the OCR text
    of the first extractor in the list.

    Args:
        pdf_path: Path of the PDF file to process.

    Returns:
        A ``defaultdict`` mapping each field name to
        ``{"value": <best candidate>, "confidence": <its score>}``.  On equal
        scores the earliest candidate in the order listed above wins.
    """
    tesseract_data = extract_with_tesseract(pdf_path)
    pdfplumber_data = extract_with_pdfplumber(pdf_path)
    layoutlm_data = extract_with_layoutlm(pdf_path)
    easyocr_data = extract_with_easyocr(pdf_path)

    candidates_by_field = {
        "text": [
            ("tesseract", tesseract_data),
            ("pdfplumber", pdfplumber_data.get("text", "")),
            ("layoutlm", layoutlm_data),
            ("easyocr", easyocr_data),
        ],
        "tables": [
            ("pdfplumber", pdfplumber_data.get("tables", [])),
        ],
    }

    results = defaultdict(dict)
    for field, candidates in candidates_by_field.items():
        scored = [
            (compute_confidence_score(value), value)
            for _source, value in candidates
        ]
        # max() returns the first maximal item, so ties keep candidate order.
        best_confidence, best_value = max(scored, key=lambda pair: pair[0])
        results[field] = {"value": best_value, "confidence": best_confidence}

    return results


def process_invoices(directory_path):
    """Run the ensemble extraction on every PDF in a directory.

    Files are selected by a case-insensitive ``.pdf`` suffix and processed in
    sorted name order, so the output is reproducible regardless of the order
    in which the file system lists them.  A file whose processing raises is
    reported on stdout and skipped; it contributes neither to the returned
    results nor to the average.

    Args:
        directory_path: Directory containing the invoice PDFs.

    Returns:
        A tuple ``(overall_score, invoice_results)`` where ``overall_score``
        is the mean of the per-invoice confidences (``0`` when no invoice was
        processed successfully) and ``invoice_results`` is a list of
        ``(filename, result)`` pairs for the successful invoices.
    """
    total_confidence = 0
    invoice_count = 0
    invoice_results = []

    pdf_names = sorted(
        name for name in os.listdir(directory_path)
        if name.lower().endswith(".pdf")
    )

    for filename in pdf_names:
        pdf_path = os.path.join(directory_path, filename)
        print(f"Processing Invoice: {filename}")

        try:
            result = ensemble_extraction(pdf_path)
            # Per-invoice confidence is the mean over its extracted fields.
            invoice_confidence = sum(
                field_data["confidence"] for field_data in result.values()
            ) / len(result)
        except Exception as e:
            # Best effort: one unreadable invoice must not abort the batch.
            print(f"Error processing {filename}: {e}")
            continue

        # Record the invoice only once everything about it succeeded, so the
        # results list and the running average always describe the same set.
        invoice_results.append((filename, result))
        print(f"Invoice: {filename}, Confidence Score: {invoice_confidence}\n")
        total_confidence += invoice_confidence
        invoice_count += 1

    overall_score = total_confidence / invoice_count if invoice_count else 0
    return overall_score, invoice_results


def evaluate_performance(results):
    """Print and return the average confidence over all extracted fields.

    The figures are averages of the stored ``confidence`` values; they are
    printed under the label "Accuracy" but are not measured against any
    ground truth.

    Args:
        results: Iterable of ``(filename, result)`` pairs, where ``result``
            maps a field name to a dict containing a ``"confidence"`` number.

    Returns:
        The mean confidence over every field of every invoice, or ``0.0``
        when ``results`` contains no fields at all (instead of dividing by
        zero).
    """
    accuracy_scores = defaultdict(list)

    for _filename, result in results:
        for field, data in result.items():
            accuracy_scores[field].append(data['confidence'])

    score_count = sum(len(scores) for scores in accuracy_scores.values())
    if score_count:
        overall_accuracy = sum(sum(scores) for scores in accuracy_scores.values()) / score_count
    else:
        # Nothing was processed (empty directory or every invoice failed).
        overall_accuracy = 0.0
    print(f"Overall Accuracy Rate: {overall_accuracy * 100:.2f}%")

    for field, scores in accuracy_scores.items():
        field_accuracy = sum(scores) / len(scores)
        print(f"Accuracy for {field}: {field_accuracy * 100:.2f}%")

    return overall_accuracy


# Folder holding the invoice PDFs.
# NOTE(review): this looks like a Google Drive path inside Colab; no mount
# call is visible in this cell, so Drive is presumably mounted elsewhere --
# confirm before running.
directory_path = "/content/drive/My Drive/Jan to Mar/Jan to Mar/"


# Run the full extraction pipeline over every PDF in the folder.
overall_score, results = process_invoices(directory_path)


print(f"Overall Confidence Score for All Invoices: {overall_score:.2f}")

# Dump the selected value and its confidence for each field of each invoice.
for filename, result in results:
    print(f"\nInvoice: {filename}")
    for field, data in result.items():
        print(f"Field: {field}, Value: {data['value']}, Confidence: {data['confidence']}")


# Print the per-field and overall averages of the confidences gathered above.
evaluate_performance(results)


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
poppler-utils is already the newest version (22.02.0-2ubuntu0.5).
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.


Some weights of LayoutLMv3ForTokenClassification were not initialized from the model checkpoint at microsoft/layoutlmv3-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Processing Invoice: INV-129_Divya Suhane.pdf




Invoice: INV-129_Divya Suhane.pdf, Confidence Score: 0.88

Processing Invoice: INV-118_Rashu.pdf




Invoice: INV-118_Rashu.pdf, Confidence Score: 0.835

Processing Invoice: INV-123_Asit.pdf




Invoice: INV-123_Asit.pdf, Confidence Score: 0.835

Processing Invoice: INV-124_Ankita Sattva.pdf




Invoice: INV-124_Ankita Sattva.pdf, Confidence Score: 0.85

Processing Invoice: INV-135_Mohith Saragur.pdf




Invoice: INV-135_Mohith Saragur.pdf, Confidence Score: 0.965

Processing Invoice: INV-127_Avik Mallick.pdf




Invoice: INV-127_Avik Mallick.pdf, Confidence Score: 0.95

Processing Invoice: INV-128_Atia Latif.pdf




Invoice: INV-128_Atia Latif.pdf, Confidence Score: 0.885

Processing Invoice: INV-134_Sheetal Kapur.pdf




Invoice: INV-134_Sheetal Kapur.pdf, Confidence Score: 0.88

Processing Invoice: INV-121_Jitesh Soni.pdf




Invoice: INV-121_Jitesh Soni.pdf, Confidence Score: 0.9199999999999999

Processing Invoice: INV-133_Sheetal Kapur.pdf


Token indices sequence length is longer than the specified maximum sequence length for this model (564 > 512). Running this sequence through the model will result in indexing errors


Invoice: INV-133_Sheetal Kapur.pdf, Confidence Score: 0.985

Processing Invoice: INV-117_Naman.pdf




Invoice: INV-117_Naman.pdf, Confidence Score: 0.895

Processing Invoice: INV-149_Karishma Bande.pdf




Invoice: INV-149_Karishma Bande.pdf, Confidence Score: 0.895

Processing Invoice: INV-144_Atia Latif.pdf




Invoice: INV-144_Atia Latif.pdf, Confidence Score: 0.7949999999999999

Processing Invoice: INV-141_Kasturi Kalwar.pdf




Invoice: INV-141_Kasturi Kalwar.pdf, Confidence Score: 0.9450000000000001

Processing Invoice: INV-150_Bhusan Naresh.pdf




Invoice: INV-150_Bhusan Naresh.pdf, Confidence Score: 0.9550000000000001

Processing Invoice: INV-142_Urmila Jangam.pdf




Invoice: INV-142_Urmila Jangam.pdf, Confidence Score: 0.8200000000000001

Processing Invoice: INV-138_Agrani Kandele.pdf




Invoice: INV-138_Agrani Kandele.pdf, Confidence Score: 0.9299999999999999

Processing Invoice: INV-143_Prashant.pdf




Invoice: INV-143_Prashant.pdf, Confidence Score: 0.9

Processing Invoice: INV-145_Indraja Mohite.pdf




Invoice: INV-145_Indraja Mohite.pdf, Confidence Score: 0.9299999999999999

Processing Invoice: INV-136_Rishabh Ramola.pdf




Invoice: INV-136_Rishabh Ramola.pdf, Confidence Score: 0.915

Processing Invoice: INV-140_Ankit.pdf




Invoice: INV-140_Ankit.pdf, Confidence Score: 0.885

Processing Invoice: INV-148_harshit rathore.pdf




Invoice: INV-148_harshit rathore.pdf, Confidence Score: 0.895

Processing Invoice: INV-147_Divya Suhane.pdf




Invoice: INV-147_Divya Suhane.pdf, Confidence Score: 0.88

Processing Invoice: INV-146_Abhikaran Jalonha.pdf




Invoice: INV-146_Abhikaran Jalonha.pdf, Confidence Score: 0.965

Overall Confidence Score for All Invoices: 0.90

Invoice: INV-129_Divya Suhane.pdf
Field: text, Value: TAX INVOICE ORIGINAL FOR RECIPIENT
UNCUE DERMACARE PRIVATE LIMITED

GSTIN 23AADCU2395N1ZY

C/o KARUNA GUPTA KURELE, 1st Floor

S.P Bungalow Ke Pichhe, Shoagpur Shahdol, Shahdol
Shahdol, MADHYA PRADESH, 484001

Mobile +91 8585960963 Email ruhi@dermagq.in

Invoice #: INV-129 Invoice Date: 23 Feb 2024 Due Date: 23 Feb 2024

Customer Details:
Divya Suhane
Ph: 6261616609

Place of Supply:
23-MADHYA PRADESH

# Item Rate / Item Qty Taxable Value Tax Amount Amount

1 sotret nf 16 mg - 10 capsules 282.86 3 STRP 848.57 101.83 (12%) 950.40
9 P 321.43 (-12%) " ‘ ° :

2 Ekran Aqua Sunscreen Spf 30 268.47 1 PAC 268.47 48.33 (18%) 316.80
4 P 305.08 (-12%) , . ° "

Taxable Amount %1,117.05

CGST 6.0% %50.91

SGST 6.0% %50.91

CGST 9.0% F24.16

SGST 9.0% F24.16

Round Off -0.20

Total =1,267.00

Total Discount 172.80

Total Items / Qty :

0.8995833333333335