**Extract text from pages and split the bulk PDFs into per-tenant files**

In [2]:
import pdfplumber
from pypdf import PdfReader, PdfWriter
from collections import defaultdict
import os
import re
import shutil
from datetime import datetime

In [13]:
# -----------------------------
# Reset output
# -----------------------------
# Root directory that receives the generated per-tenant PDFs.
OUTPUT_DIR = "output"
# Delete whatever a previous run left behind, then recreate the directory
# empty, so every run starts from a clean output tree.
if os.path.exists(OUTPUT_DIR):
    shutil.rmtree(OUTPUT_DIR)
os.makedirs(OUTPUT_DIR)

# -----------------------------
# Utilities
# -----------------------------

def normalize(name):
    """Return *name* upper-cased with each whitespace run collapsed to one space.

    Leading and trailing whitespace is removed from the result.
    """
    collapsed = re.sub(r"\s+", " ", name.upper())
    return collapsed.strip()

def first_two_words(name):
    """Return the first two words of the normalized *name*, joined by a space.

    Names with fewer than two words are returned in full (normalized).
    """
    words = normalize(name).split()
    leading = words[:2]
    return " ".join(leading)

def extract_park(text):
    """Return the park name mentioned in *text*, or ``"UNKNOWN"``.

    Matching is case-insensitive and substring-based. Parks are checked in a
    fixed order, so when several are mentioned the earliest entry in the
    table wins. The misspelling "SAPHIRE" is accepted for SAPPHIRE.
    """
    haystack = text.upper()

    # (accepted spellings, canonical park name), in priority order.
    park_markers = (
        (("SAPHIRE", "SAPPHIRE"), "SAPPHIRE"),
        (("GRAPHITE",), "GRAPHITE"),
        (("EMERALD",), "EMERALD"),
        (("SCARLET",), "SCARLET"),
    )

    for spellings, park in park_markers:
        if any(spelling in haystack for spelling in spellings):
            return park

    return "UNKNOWN"

# A line that is nothing but a dd/mm/yyyy date.
_DATE_ONLY_RE = re.compile(r"\d{2}/\d{2}/\d{4}")
# "PO BOX", "P O BOX", "P.O BOX", "P.O. BOX", "P. O. BOX" ... (upper-cased input).
_PO_BOX_RE = re.compile(r"P\.?\s*O\.?\s*BOX")


def extract_godowns(text, tenant_name):
    """Return the godown identifier printed just below the tenant name.

    The first non-empty line containing *tenant_name* (case-insensitive) is
    located, and the three non-empty lines after it are inspected. Lines that
    are only a ``dd/mm/yyyy`` date and lines mentioning a P.O. Box are
    skipped; the first remaining line that contains a digit supplies the
    result, which is that line upper-cased, from its first digit to its end.

    Args:
        text: Full text of a page.
        tenant_name: Tenant name as it appears on the page.

    Returns:
        The godown string, or ``"(OP)"`` when the tenant name is not found or
        none of the three following lines qualifies.
    """
    lines = [stripped for stripped in (raw.strip() for raw in text.split("\n")) if stripped]
    needle = tenant_name.upper()

    for index, line in enumerate(lines):
        if needle not in line.upper():
            continue

        # Only the three lines right after the tenant name are candidates.
        for candidate in lines[index + 1:index + 4]:
            upper = candidate.upper()

            if _DATE_ONLY_RE.fullmatch(upper):
                continue

            # The regex also covers "P.O. BOX", which plain substring checks
            # for "P O BOX" / "P.O BOX" / "PO BOX" do not match.
            if _PO_BOX_RE.search(upper):
                continue

            # Valid godown = any other line with a digit; keep digit onward.
            found = re.search(r"\d.*", upper)
            if found:
                return found.group().strip()

        # Only the first line naming the tenant is considered.
        return "(OP)"

    return "(OP)"



def extract_month_year(text):
    """Return the month and two-digit year of the first valid date in *text*.

    Candidates are ``dd/mm/yyyy`` tokens, tried in order of appearance. A
    token that has the right shape but is not a real calendar date (for
    example ``31/02/2026``) is skipped rather than raising ``ValueError``.

    Returns:
        An upper-cased ``"%b %y"`` string such as ``"FEB 26"`` (the month
        abbreviation follows the active locale), or ``"UNKNOWN_DATE"`` when
        no valid date is found.
    """
    for match in re.finditer(r"\b\d{2}/\d{2}/\d{4}\b", text):
        try:
            parsed = datetime.strptime(match.group(), "%d/%m/%Y")
        except ValueError:
            # Date-shaped but invalid; try the next candidate.
            continue
        return parsed.strftime("%b %y").upper()   # FEB 26

    return "UNKNOWN_DATE"

# -----------------------------
# Tenant name
# -----------------------------

def extract_tenant_name(text):
    """Return the tenant name found on a page, or ``None``.

    Lines are stripped and lines of two characters or fewer are ignored. The
    remaining lines are scanned top to bottom, and the first one that matches
    a rule decides the result:

    * a line equal to "invoice to" or "to:" (any case), or a line containing
      "received from": the name is the next line;
    * a line ending in " statement" (any case): the name is that line with
      the trailing word removed.

    A marker line with nothing after it yields no name instead of raising
    ``IndexError``.
    """
    lines = [stripped for stripped in (raw.strip() for raw in text.split("\n")) if len(stripped) > 2]
    trailing_word = "statement"

    for index, line in enumerate(lines):
        low = line.lower()

        if low in ("invoice to", "to:") or "received from" in low:
            if index + 1 < len(lines):
                return lines[index + 1]
            # Marker is the last usable line: there is no name to return.
            continue

        if low.endswith(" " + trailing_word):
            # Slice off the suffix instead of str.replace("Statement", ""):
            # that was case-sensitive (missed "STATEMENT") and also removed
            # the word when it occurred inside the tenant name.
            return line[:-len(trailing_word)].strip()

    return None


# -----------------------------
# Core logic
# -----------------------------

def process_bulk_pdf(file_path, doc_type, tenants):
    """Distribute the pages of one bulk PDF among the tenants they belong to.

    Text is read from each page with pdfplumber, while the page objects that
    get stored come from a pypdf ``PdfReader`` opened on the same file. A
    page on which a tenant name is detected starts a new run; pages without
    a name are attached to the most recently detected tenant, and pages that
    come before any name has been detected are dropped.

    Args:
        file_path: Path of the bulk PDF.
        doc_type: Key under which pages are stored for each tenant. When it
            is ``"statement"``, the page also sets the tenant's "park",
            "godowns" and "date" entries.
        tenants: Mapping of tenant key to per-tenant record; updated in
            place. Missing keys are indexed directly, so it is expected to
            create entries on demand.
    """
    with pdfplumber.open(file_path) as plumber_doc:
        page_source = PdfReader(file_path)

        # Key of the tenant the current run of pages belongs to.
        active_tenant = None

        for page_no, plumber_page in enumerate(plumber_doc.pages):
            page_text = plumber_page.extract_text() or ""
            detected_name = extract_tenant_name(page_text)

            if detected_name:
                active_tenant = first_two_words(detected_name)

                # Only statements define metadata
                if doc_type == "statement":
                    record = tenants[active_tenant]
                    record["park"] = extract_park(page_text)
                    record["godowns"] = extract_godowns(page_text, detected_name)
                    record["date"] = extract_month_year(page_text)

                print(f"[{doc_type.upper()}]", active_tenant)

            if not active_tenant:
                continue

            tenants[active_tenant][doc_type].append(page_source.pages[page_no])


# -----------------------------
# Build outputs
# -----------------------------

def _filename_part(value):
    """Return *value* with path separators replaced by "_" so it stays one filename component."""
    for separator in ("/", os.sep):
        value = value.replace(separator, "_")
    return value


def build_tenant_pdfs(tenants):
    """Write one combined PDF per tenant under ``OUTPUT_DIR/<park>/``.

    For each tenant the stored pages are concatenated in the order
    statement, invoice, receipt, water. The file is named
    ``"<key> <godowns> - <date>.pdf"``. Missing metadata falls back to park
    ``"UNKNOWN"``, an empty godowns string and date ``"UNKNOWN_DATE"``.

    Args:
        tenants: Mapping of tenant key to a record holding page lists per
            document type plus the optional "park", "godowns" and "date"
            entries.
    """
    for key, docs in tenants.items():
        writer = PdfWriter()

        for doc_type in ("statement", "invoice", "receipt", "water"):
            for page in docs.get(doc_type, []):
                writer.add_page(page)

        park = docs.get("park", "UNKNOWN")
        # Both the key and the godowns come from PDF text; a path separator
        # in either would point the file at a sub-directory that does not
        # exist, so both are sanitised the same way.
        godowns = _filename_part(docs.get("godowns", ""))
        safe_key = _filename_part(key)
        date = docs.get("date", "UNKNOWN_DATE")

        park_dir = os.path.join(OUTPUT_DIR, park)
        os.makedirs(park_dir, exist_ok=True)

        filename = f"{safe_key} {godowns} - {date}.pdf"
        path = os.path.join(park_dir, filename)

        with open(path, "wb") as f:
            writer.write(f)

        print("CREATED:", path)


# -----------------------------
# RUN
# -----------------------------

# tenant key -> record; any missing entry in a record starts as an empty list.
tenants = defaultdict(lambda: defaultdict(list))

# (bulk PDF path, document type) pairs, processed in the order listed.
bulk_inputs = (
    ("Statements.pdf", "statement"),
    ("Invoices.pdf", "invoice"),
    ("Receipts.pdf", "receipt"),
    # ("Water.pdf", "water"),  # disabled
)
for bulk_path, bulk_type in bulk_inputs:
    process_bulk_pdf(bulk_path, bulk_type, tenants)

build_tenant_pdfs(tenants)

[STATEMENT] AFRICA SMART
[STATEMENT] AFRISTONEX IMPORTERS
[STATEMENT] ALFABLENDS (E.A)
[STATEMENT] AQUABABY COLLECTION
[STATEMENT] ATESOILS EAST
[STATEMENT] AUTOMAXX IMPEX
[STATEMENT] AXIOM MANUFACTURERS
[STATEMENT] BENARD NGURE
[STATEMENT] BHANDERI DIESEL
[STATEMENT] BLU OSPREY
[STATEMENT] C.C.L.E RUBBER
[STATEMENT] CARTON MANUFACTURERS
[STATEMENT] DEW MANUFACTURERS
[STATEMENT] DILIGENT SUPPLIES
[STATEMENT] DIVYA LUBRIC
[STATEMENT] ECOBODAA COMPANY
[STATEMENT] ECOCHICKS (K)
[STATEMENT] ELONG WATERPROOF
[STATEMENT] FASHIN WOOD
[STATEMENT] FDC CONSTRUCTION
[STATEMENT] FRM E.A
[STATEMENT] FU RUILIN
[STATEMENT] G&S SELECTIONS
[STATEMENT] IPHIKER INDUSTRY
[STATEMENT] JAVANS COFFEE
[STATEMENT] JHAM ENGINEERING
[STATEMENT] JUMBO FOAM
[STATEMENT] JUMBO QUALITY
[STATEMENT] KEN ISATIS
[STATEMENT] LINK-X SUPPLY
[STATEMENT] LONGI EAST
[STATEMENT] MEECWELL COMPANY
[STATEMENT] MONOLITH BUSINESS
[STATEMENT] MURLI PARTS
[STATEMENT] NOVIXA INTERNATIONAL
[STATEMENT] OMEGA PRIME
[STATEMENT] ONE AFRICA
[