## Income Statement validation

In [55]:
import os
import re
import json
from docling.document_converter import DocumentConverter

# Folder where PDFs are located
pdf_folder_path = "C:\\Users\\ramum\\pdfs"

# 🔹 Extract value from a column by index
def extract_number_from_column(line, column_index):
    try:
        parts = [cell.strip() for cell in line.split("|") if cell.strip()]
        value = parts[column_index].replace(",", "")
        return float(value)
    except (IndexError, ValueError):
        return 0.0

# Extract numeric values from loose text
def extract_number(line):
    match = re.search(r"(\d[\d,]*)", line.replace(",", ""))
    return float(match.group(1)) if match else 0.0

# Extract all markdown tables from full markdown content
def extract_income_tables_from_markdown(markdown_text):
    tables = []
    current_table = []
    inside_table = False

    for line in markdown_text.splitlines():
        line = line.strip()
        if line.startswith("|") and line.endswith("|"):
            inside_table = True
            current_table.append(line)
        elif inside_table and not line:
            inside_table = False
            if current_table:
                # Filter only income statement table by checking relevant keywords
                table_str = " ".join(current_table).lower()
                if any(keyword in table_str for keyword in ["profit after tax", "total income", "total expenses"]):
                    tables.append(current_table)
                current_table = []

    if current_table:
        table_str = " ".join(current_table).lower()
        if any(keyword in table_str for keyword in ["profit after tax", "total income", "total expenses"]):
            tables.append(current_table)

    return tables



# Parse tables to extract income-related values
def parse_income_statement_tables(tables, submitted_net_income=None):
    parsed = []

    for table in tables:
        headers = table[0].split("|")
        headers = [h.strip() for h in headers if h.strip()]
        if not headers:
            continue

        latest_quarter_index = 0
        for i, h in enumerate(headers):
            if re.match(r"Q\d\s*FY\d+", h) or re.match(r"FY\d+Q\d", h):
                latest_quarter_index = i
                break

        entry = {
            "quarter": headers[latest_quarter_index] if latest_quarter_index < len(headers) else "LatestQuarter",
            "revenues": 0.0,
            "expenses": 0.0,
            "netIncome": 0.0,
            "grossProfit": 0.0,
            "profitMarginPercent": 0.0,
            "submittedNetIncome": submitted_net_income,
            "calculatedNetIncome": 0.0,
            "isValid": None
        }

        for line in table:
            if "Total Income" in line and "operations" not in line:
                entry["revenues"] = extract_number_from_column(line, latest_quarter_index)
            elif "Total Expenses" in line:
                entry["expenses"] = extract_number_from_column(line, latest_quarter_index)
            elif "Profit After Tax" in line and "Margin" not in line:
                entry["netIncome"] = extract_number_from_column(line, latest_quarter_index)

        entry["grossProfit"] = entry["revenues"] - entry["expenses"]
        if entry["revenues"] > 0:
            entry["profitMarginPercent"] = round((entry["netIncome"] / entry["revenues"]) * 100, 2)

        if submitted_net_income is not None:
            entry["calculatedNetIncome"] = entry["netIncome"]
            entry["isValid"] = submitted_net_income == entry["netIncome"]

        # ❗ Only append if valid income data is present
        if entry["netIncome"] > 0:
            parsed.append(entry)

    return parsed


# Extract + validate all income statements from a single PDF file
def extract_and_validate_income_statements(pdf_path, submitted_net_income=None):
    converter = DocumentConverter()
    result = converter.convert(pdf_path)
    markdown = result.document.export_to_markdown()
    tables = extract_income_tables_from_markdown(markdown)
    parsed_tables = parse_income_statement_tables(tables, submitted_net_income=submitted_net_income)

    if not parsed_tables or all(p["netIncome"] == 0 for p in parsed_tables):
        parsed_tables = [extract_financials_by_line_level(markdown, submitted_net_income)]

    for table in parsed_tables:
        table["fileNameFromMetadata"] = result.document.origin.filename

    return parsed_tables

#  Process a list of files with expected net incomes
def validate_uploaded_pdfs(validation_requests):
    results = []
    for request in validation_requests:
        filename = request["fileName"]
        submitted_income = request["submittedNetIncome"]
        full_path = os.path.join(pdf_folder_path, filename)
        print(f"Processing: {filename}")
        parsed = extract_and_validate_income_statements(full_path, submitted_net_income=submitted_income)
        results.extend(parsed)
    return results

# Input
validation_requests = [
    {"fileName": "Q3FY25 Earnings Presentation V16.pdf", "submittedNetIncome": 3834},
    {"fileName": "INVESTOR_PRESENTATION_MAR25.pdf", "submittedNetIncome": 2291}
]

# Run
if __name__ == "__main__":
    validation_results = validate_uploaded_pdfs(validation_requests)
    print(json.dumps(validation_results, indent=2))


Processing: Q3FY25 Earnings Presentation V16.pdf
Processing: INVESTOR_PRESENTATION_MAR25.pdf
[
  {
    "quarter": "Q3 FY25",
    "revenues": 4807.0,
    "expenses": 1084.0,
    "netIncome": 3834.0,
    "grossProfit": 3723.0,
    "profitMarginPercent": 79.76,
    "submittedNetIncome": 3834,
    "calculatedNetIncome": 3834.0,
    "isValid": true,
    "fileNameFromMetadata": "Q3FY25 Earnings Presentation V16.pdf"
  },
  {
    "quarter": "Q3 FY25",
    "revenues": 4289.0,
    "expenses": 1241.0,
    "netIncome": 2291.0,
    "grossProfit": 3048.0,
    "profitMarginPercent": 53.42,
    "submittedNetIncome": 3834,
    "calculatedNetIncome": 2291.0,
    "isValid": false,
    "fileNameFromMetadata": "Q3FY25 Earnings Presentation V16.pdf"
  },
  {
    "quarter": "Q4 FY25",
    "revenues": 4397.0,
    "expenses": 1124.0,
    "netIncome": 2650.0,
    "grossProfit": 3273.0,
    "profitMarginPercent": 60.27,
    "submittedNetIncome": 2291,
    "calculatedNetIncome": 2650.0,
    "isValid": false,
   

## Invoice Statement validation

In [4]:
import os
import re
import json
from docling.document_converter import DocumentConverter

# Folder where invoice PDFs are stored
pdf_folder_path = "C:\\Users\\ramum\\invoice_pdf"

# Extract float value from a line containing "Rs."
def extract_number(line):
    match = re.search(r"Rs\.?\s*([\d,]+\.\d+)", line)
    if match:
        try:
            return float(match.group(1).replace(",", ""))
        except ValueError:
            return 0.0
    return 0.0

# Extract all invoice totals + derived calculations from markdown
def extract_invoice_totals_from_markdown(markdown):
    totals = {
        "taxableAmount": 0.0,
        "taxAmount": 0.0,
        "totalAmount": 0.0,
        "effectiveTaxRate": 0.0,
        "lineItemSum": 0.0,
        "lineItemDiscrepancy": 0.0
    }

    for line in markdown.splitlines():
        line = line.strip()
        if "Taxable Amount" in line:
            totals["taxableAmount"] = extract_number(line)
        elif "Tax Amount" in line:
            totals["taxAmount"] = extract_number(line)
        elif "Total Amount" in line:
            totals["totalAmount"] = extract_number(line)

        # Detect line-level total value (typically 4 columns: desc, qty, price, tax, total)
        matches = re.findall(r"\s(\d{1,3}(?:\.\d{2}))", line)
        if len(matches) >= 4:
            try:
                line_total = float(matches[-1])
                totals["lineItemSum"] += line_total
            except ValueError:
                pass

    #  Effective tax rate (%)
    if totals["taxableAmount"] > 0:
        totals["effectiveTaxRate"] = round((totals["taxAmount"] / totals["taxableAmount"]) * 100, 2)

    # Check difference between item sum and declared total
    totals["lineItemDiscrepancy"] = round(totals["lineItemSum"] - totals["totalAmount"], 2)

    return totals

# Validate extracted totals against submitted value
def validate_invoice_totals(calculated, submitted_amount):
    return {
        "submittedAmount": submitted_amount,
        "calculatedTaxable": calculated["taxableAmount"],
        "calculatedTax": calculated["taxAmount"],
        "calculatedTotal": calculated["totalAmount"],
        "effectiveTaxRatePercent": calculated["effectiveTaxRate"],
        "lineItemSum": calculated["lineItemSum"],
        "lineItemDiscrepancy": calculated["lineItemDiscrepancy"],
        "matchWithSubmission": submitted_amount <= calculated["totalAmount"]
    }

# List of invoices and user-submitted total amounts
invoice_files = [
    {"fileName": "invoice_1.pdf", "submittedAmount": 179.00},
    {"fileName": "invoice_2.pdf", "submittedAmount": 175.50},
    {"fileName": "invoice_3.pdf", "submittedAmount": 623.00}
]

# Validate each invoice and collect results
results = []
for invoice in invoice_files:
    file_name = invoice["fileName"]
    submitted_amount = invoice["submittedAmount"]
    full_path = os.path.join(pdf_folder_path, file_name)
    
    print(f" Validating: {file_name}")
    
    try:
        converter = DocumentConverter()
        result = converter.convert(full_path)
        markdown = result.document.export_to_markdown()

        extracted_totals = extract_invoice_totals_from_markdown(markdown)
        validation = validate_invoice_totals(extracted_totals, submitted_amount)
        validation["fileName"] = file_name
        results.append(validation)

    except Exception as e:
        results.append({
            "fileName": file_name,
            "error": str(e)
        })

#  Print results as formatted JSON
print(json.dumps(results, indent=2))


 Validating: invoice_1.pdf
 Validating: invoice_2.pdf
 Validating: invoice_3.pdf
[
  {
    "submittedAmount": 179.0,
    "calculatedTaxable": 150.0,
    "calculatedTax": 30.0,
    "calculatedTotal": 180.0,
    "effectiveTaxRatePercent": 20.0,
    "lineItemSum": 0.0,
    "lineItemDiscrepancy": -180.0,
    "matchWithSubmission": true,
    "fileName": "invoice_1.pdf"
  },
  {
    "submittedAmount": 175.5,
    "calculatedTaxable": 160.0,
    "calculatedTax": 0.0,
    "calculatedTotal": 159.5,
    "effectiveTaxRatePercent": 0.0,
    "lineItemSum": 0.0,
    "lineItemDiscrepancy": -159.5,
    "matchWithSubmission": false,
    "fileName": "invoice_2.pdf"
  },
  {
    "submittedAmount": 623.0,
    "calculatedTaxable": 500.0,
    "calculatedTax": 123.0,
    "calculatedTotal": 623.0,
    "effectiveTaxRatePercent": 24.6,
    "lineItemSum": 0.0,
    "lineItemDiscrepancy": -623.0,
    "matchWithSubmission": true,
    "fileName": "invoice_3.pdf"
  }
]
