In [2]:
!apt-get install poppler-utils tesseract-ocr -y
!pip install pdfplumber pdf2image pytesseract pillow regex


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
The following NEW packages will be installed:
  poppler-utils
0 upgraded, 1 newly installed, 0 to remove and 38 not upgraded.
Need to get 186 kB of archives.
After this operation, 697 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 poppler-utils amd64 22.02.0-2ubuntu0.11 [186 kB]
Fetched 186 kB in 1s (311 kB/s)
Selecting previously unselected package poppler-utils.
(Reading database ... 126675 files and directories currently installed.)
Preparing to unpack .../poppler-utils_22.02.0-2ubuntu0.11_amd64.deb ...
Unpacking poppler-utils (22.02.0-2ubuntu0.11) ...
Setting up poppler-utils (22.02.0-2ubuntu0.11) ...
Processing triggers for man-db (2.10.2-1) ...
Collecting pdfplumber
  Downloading pdfplumber-0.11.7-py3-none-any.whl.metadata (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━

In [3]:
# --- Upload PDF ---
from google.colab import files
uploaded = files.upload()
pdf_path = list(uploaded.keys())[0]
print("Uploaded:", pdf_path)

# --- Imports ---
import re, json, pandas as pd
import pdfplumber, pytesseract
from pdf2image import convert_from_path
from datetime import datetime

# --- Helper functions ---
def pdf_to_text(path):
    """Extract text from PDF with pdfplumber, fallback to OCR if scanned."""
    text = ""
    with pdfplumber.open(path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text() or ""
            text += page_text + "\n"
    if text.strip():
        return text
    # OCR fallback for scanned PDFs
    images = convert_from_path(path, dpi=300)
    ocr_text = [pytesseract.image_to_string(img) for img in images]
    return "\n".join(ocr_text)


def try_parse_date(s):
    """Attempt to normalize various date formats."""
    for fmt in ["%d/%m/%Y","%d-%m-%Y","%d %b %Y","%b %d %Y","%Y-%m-%d"]:
        try:
            return datetime.strptime(s.strip(), fmt).strftime("%Y-%m-%d")
        except:
            pass
    return s


# Parser for each Bank

# --- AXIS BANK PARSER ---
def parse_axis(text, clean_lines):
    fields = {}
    m = re.search(r"(?:\*{4,}|xxxx|ending\s*in)\s*(\d{4})", text, re.I)
    if not m:
        m = re.search(r"\d{4}\*{4,}\d{4}|\d{4}\s*\d{4}\s*\d{4}\s*(\d{4})", text)
    if m:
        fields["card_last4"] = m.group(1)

    summary_text = ""
    for i, line in enumerate(clean_lines):
        if re.search(r"Payment\s*Summary", line, re.I):
            summary_text = " ".join(clean_lines[i:i+5])
            break

    billing_cycle = re.search(r"(\d{2}/\d{2}/\d{4})\s*-\s*(\d{2}/\d{2}/\d{4})", summary_text)
    if billing_cycle:
        fields["billing_cycle"] = f"{billing_cycle.group(1)} to {billing_cycle.group(2)}"

    due_date = re.search(r"\b(\d{2}/\d{2}/\d{4})\b", summary_text)
    if due_date:
        fields["payment_due_date"] = try_parse_date(due_date.group(1))

    amounts = re.findall(r"[\d,]+\.\d{2}", summary_text)
    if len(amounts) >= 2:
        fields["total_due"] = amounts[-2]
        fields["min_due"] = amounts[-1]
    elif len(amounts) == 1:
        fields["total_due"] = amounts[0]
    else:
        fields["total_due"] = "Not Found"

    credit_limit = "Not Found"
    available_credit = "Not Found"
    for i, line in enumerate(clean_lines):
        if re.search(r"Credit\s*Limit", line, re.I):
            nearby_text = " ".join(clean_lines[i:i+4])
            numbers = re.findall(r"\d[\d,]*\.\d{2}|\d[\d,]*", nearby_text)
            if len(numbers) >= 2:
                credit_limit = f"{float(numbers[0].replace(',', '')):,.2f}"
                available_credit = f"{float(numbers[1].replace(',', '')):,.2f}"
            elif len(numbers) == 1:
                credit_limit = f"{float(numbers[0].replace(',', '')):,.2f}"
            break
    fields["credit_limit"] = credit_limit
    fields["available_credit"] = available_credit
    return fields


# --- HDFC BANK PARSER ---
def parse_hdfc(text, clean_lines):
    fields = {}
    m = re.search(r"(?:\*{4,}|xxxx|ending\s*in)\s*(\d{4})", text, re.I)
    if m:
        fields["card_last4"] = m.group(1)
    m = re.search(r"(?:payment\s*due\s*date|due\s*date)[:\s]*(.+)", text, re.I)
    if m:
        fields["payment_due_date"] = try_parse_date(m.group(1))
    for i, line in enumerate(clean_lines):
        if re.search(r"Total\s*Amount\s*Due", line, re.I):
            nearby_text = " ".join(clean_lines[i:i+3])
            nums = re.findall(r"\d[\d,]*\.\d{2}", nearby_text)
            if nums:
                fields["total_due"] = f"{max(float(n.replace(',', '')) for n in nums):,.2f}"
                break
    m = re.search(r"(?:billing\s*(?:cycle|period)|statement\s*period)[:\s]*(.+)", text, re.I)
    if m:
        fields["billing_cycle"] = m.group(1)
    credit_limit = "Not Found"
    available_credit = "Not Found"
    for i, line in enumerate(clean_lines):
        if re.search(r"Credit\s*Limit", line, re.I):
            nearby_text = " ".join(clean_lines[i:i+4])
            numbers = re.findall(r"\d[\d,]*\.\d{2}|\d[\d,]*", nearby_text)
            if len(numbers) >= 2:
                available_credit = f"{float(numbers[0].replace(',', '')):,.2f}"
                credit_limit = f"{float(numbers[1].replace(',', '')):,.2f}"
            elif len(numbers) == 1:
                credit_limit = f"{float(numbers[0].replace(',', '')):,.2f}"
            break
    fields["credit_limit"] = credit_limit
    fields["available_credit"] = available_credit
    return fields


# --- ICICI / IDFC BANK PARSER ---
def parse_icici_idfc(text, clean_lines):
    fields = {}
    m = re.search(r"(?:\*{4,}|xxxx|ending\s*in)\s*(\d{4})", text, re.I)
    if m:
        fields["card_last4"] = m.group(1)
    m = re.search(r"(?:payment\s*due\s*date|due\s*date)[:\s]*(.+)", text, re.I)
    if m:
        fields["payment_due_date"] = try_parse_date(m.group(1))
    for i, line in enumerate(clean_lines):
        if re.search(r"Total\s*Amount\s*Due", line, re.I):
            nearby_text = " ".join(clean_lines[i:i+3])
            nums = re.findall(r"\d[\d,]*\.\d{2}", nearby_text)
            if nums:
                fields["total_due"] = f"{max(float(n.replace(',', '')) for n in nums):,.2f}"
                break
    m = re.search(r"(?:billing\s*(?:cycle|period)|statement\s*period)[:\s]*(.+)", text, re.I)
    if m:
        fields["billing_cycle"] = m.group(1)

    def _normalize_amount(token):
        t = token.replace('₹', '').replace('r', '').replace('R', '').replace('Rs.', '')
        t = re.sub(r'[^\d\.,\-]', '', t).strip().replace(',', '')
        try:
            return float(t)
        except:
            return None

    credit_limit = "Not Found"
    available_credit = "Not Found"
    for i, line in enumerate(clean_lines):
        if re.search(r"Credit\s*Limit", line, re.I):
            block = " ".join(clean_lines[i:i+4])
            tokens = re.findall(r"(?:₹|r|Rs\.)?\s*[0-9][0-9\.,]*\d(?:\.\d{1,2})?", block, re.I)
            amounts = [_normalize_amount(tok) for tok in tokens if _normalize_amount(tok)]
            if len(amounts) >= 2:
                credit_limit = f"{amounts[0]:,.2f}"
                available_credit = f"{amounts[1]:,.2f}"
            elif len(amounts) == 1:
                credit_limit = f"{amounts[0]:,.2f}"
            break
    fields["credit_limit"] = credit_limit
    fields["available_credit"] = available_credit
    return fields


#detect Bank

def detect_bank(text):
    if re.search(r"axis\s*bank", text, re.I):
        return "AXIS"
    elif re.search(r"hdfc", text, re.I):
        return "HDFC"
    elif re.search(r"icici", text, re.I):
        return "ICICI"
    elif re.search(r"idfc", text, re.I):
        return "IDFC"
    else:
        return "UNKNOWN"


def parse_statement(path):
    text = pdf_to_text(path)
    clean_lines = [line.strip() for line in text.splitlines() if line.strip()]
    bank = detect_bank(text)
    print(f"\n🧾 Detected Bank: {bank}\n")

    if bank == "AXIS":
        fields = parse_axis(text, clean_lines)
    elif bank == "HDFC":
        fields = parse_hdfc(text, clean_lines)
    elif bank in ["ICICI", "IDFC"]:
        fields = parse_icici_idfc(text, clean_lines)
    else:
        fields = {"error": "Bank not recognized"}

    fields["bank"] = bank
    return fields


# --- RUN PARSER ---
parsed_fields = parse_statement(pdf_path)
print(json.dumps(parsed_fields, indent=2, ensure_ascii=False))

# --- EXPORT TO EXCEL ---
df = pd.DataFrame([parsed_fields])
excel_path = "statement_fields.xlsx"
df.to_excel(excel_path, index=False)

print(f"\nExcel saved as: {excel_path}")
files.download(excel_path)


Saving idfc.pdf to idfc.pdf
Uploaded: idfc.pdf

🧾 Detected Bank: IDFC

{
  "card_last4": "9058",
  "payment_due_date": "11/06/2022(cid:9)",
  "total_due": "80,393.10",
  "credit_limit": "181,000.00",
  "available_credit": "100,606.90",
  "bank": "IDFC"
}

Excel saved as: statement_fields.xlsx


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>