<a href="https://colab.research.google.com/github/AkashW45/DOCPOC/blob/main/DocAI_DOCPOC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [20]:
!mkdir /content/images

mkdir: cannot create directory ‘/content/images’: File exists


In [21]:
!pip install easyocr




In [22]:
!pip install groq




In [23]:
from google.colab import userdata
import os

os.environ["GROQ_API_KEY"] = userdata.get("GROQ_API_KEY")

assert os.environ["GROQ_API_KEY"] is not None
print("✅ Groq API key loaded securely")


✅ Groq API key loaded securely


In [24]:
# ============================================
# Financial Table Image → Clean Markdown Table
# EasyOCR + Groq (Structure Repair)
# ============================================

!pip install easyocr groq

from pathlib import Path
import easyocr
from groq import Groq
from google.colab import userdata
import os

# -----------------------
# CONFIG
# -----------------------
IMAGE_FOLDER = Path("/content/images")
OUTPUT_MD = Path("/content/output.md")

os.environ["GROQ_API_KEY"] = userdata.get("GROQ_API_KEY")
client = Groq()

reader = easyocr.Reader(["en"], gpu=True)

md = ["# OCR Output\n"]

# -----------------------
# OCR → raw rows
# -----------------------
def extract_rows(image_path):
    results = reader.readtext(str(image_path))
    rows = {}

    for (bbox, text, conf) in results:
        y = int(sum(p[1] for p in bbox) / 4)
        rows.setdefault(y, []).append(text)

    return [" ".join(rows[y]) for y in sorted(rows)]

# -----------------------
# LLM → table formatter
# -----------------------
def rows_to_markdown(rows_text):
    prompt = f"""
You are a document-structure expert.

Convert the following OCR-extracted financial table rows
into a CLEAN Markdown table with proper | separators.

Rules:
- Detect headers
- Align columns
- Merge wrapped text
- Do NOT invent data
- Output ONLY markdown table

ROWS:
{chr(10).join(rows_text)}
"""

    response = client.chat.completions.create(
        model="openai/gpt-oss-120b",
        messages=[{"role": "user", "content": prompt}],
        temperature=0
    )

    return response.choices[0].message.content.strip()

# -----------------------
# PROCESS IMAGES
# -----------------------
for img in sorted(IMAGE_FOLDER.glob("*")):
    if img.suffix.lower() not in {".jpg", ".png", ".jpeg"}:
        continue

    md.append(f"## {img.name}\n")
    md.append(f"![{img.name}](./images/{img.name})\n")

    rows = extract_rows(img)
    table_md = rows_to_markdown(rows)

    md.append(table_md)
    md.append("\n")

# -----------------------
# SAVE OUTPUT
# -----------------------
OUTPUT_MD.write_text("\n".join(md), encoding="utf-8")
print("✅ Clean Markdown table generated:", OUTPUT_MD)






✅ Clean Markdown table generated: /content/output.md


In [25]:
from pathlib import Path
from groq import Groq

DOC_PATH = Path("/content/output.md")
document_text = DOC_PATH.read_text(encoding="utf-8")

client = Groq()


In [33]:
SYSTEM_PROMPT = """
You are a JSON-only table extraction engine.

You receive:
- A document containing pipe-separated markdown tables.
- A user question.

STRICT RULES:
- Output ONLY valid JSON.
- Do NOT use markdown.
- Do NOT explain.
- Do NOT add extra keys.
- Do NOT infer missing values.
- If data is missing, use null.

Table rules:
- Each line starting with | is one row.
- Column order is positional.
- Row index starts at 1 (excluding header).

Allowed operations:
- get_row
- filter
- count
- list

"""

In [34]:
import json

def ask_table(question: str):
    prompt = f"""
DOCUMENT:
{document_text}

QUESTION:
{question}

Return ONLY the extracted result as JSON.
"""

    response = client.chat.completions.create(
        model="openai/gpt-oss-120b",
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": prompt}
        ],
        temperature=0
    )

    raw = response.choices[0].message.content.strip()

    try:
        return json.loads(raw)
    except json.JSONDecodeError:
        raise ValueError("Model did not return valid JSON")


In [35]:
print(ask_table("give me details of 4th row"))

{'S.No.': 4, 'AMC': 'UTI', 'Scheme Name': 'Treasury Advantage', 'Plan': None, 'Weightage': None, 'Category': None, 'Risk': 'Medium/Long Term', 'Amount (Rs)': None, 'Type': None, 'Holder': 'Mr. X', 'Rating': None}
