In [1]:
# ---- Run once per notebook to avoid truncated outputs ----
import numpy as np
import pandas as pd
from IPython.core.interactiveshell import InteractiveShell

# Show outputs from every expression in a cell
InteractiveShell.ast_node_interactivity = "all"

# Pandas: show all rows/cols and full cell text
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
pd.set_option("display.max_colwidth", None)   # no ellipsis in long text cells
pd.set_option("display.expand_frame_repr", False)  # single-line wide frames
pd.set_option("display.width", 0)             # auto-detect terminal/area width

# NumPy: never summarize arrays
np.set_printoptions(threshold=np.inf, linewidth=10_000)

# Pretty-print large dict/list without abbreviations
import pprint
pp = pprint.PrettyPrinter(width=10_000, compact=False, sort_dicts=False)
def ppx(x):  # use ppx(obj) to print whole thing
    print(pp.pformat(x))


In [2]:
import warnings
warnings.filterwarnings("ignore", category=UserWarning)


import os, re, glob, json
import ollama
from pypdf import PdfReader
import numpy as np
from typing import List

from rich.console import Console
from rich.panel import Panel
from rich.text import Text

console = Console()

# ---------- Config ----------
# OLLAMA_MODEL = os.getenv("OLLAMA_MODEL", "phi4-mini:3.8b-fp16")
OLLAMA_MODEL = os.getenv("OLLAMA_MODEL", "phi4-mini:latest")
EMBED_MODEL  = os.getenv("EMBED_MODEL",  "nomic-embed-text")


In [3]:
def read_pdf(path:str):

    '''
    Read input PDF file, pagewise, and return text :str
    '''
    
    out = []
    r = PdfReader(path)
    
    for page in r.pages:

        t = page.extract_text() or ""
        out.append(t)

    return "\n".join(out)


In [4]:

SYSTEM = '''
You are a precise information extractor for Duke Energy utility bills. 
Return ONLY one JSON object that matches the provided schema. 
Use bill content only; do not guess. 
Dates must be ISO-8601 (YYYY-MM-DD). 
Money values must be numbers (e.g., 142.58). 
For missing data, use null or empty lists. 
No extra keys. No commentary.

User:
Extract the following fields and return ONLY JSON.

SCHEMA (example, not data):
{
  "schema_version": "duke.v1",
  "source_file": "<filename>",
  "account": { "number": "", "service_address": "" },
  "bill": {
    "bill_date": "YYYY-MM-DD",
    "period_start": "YYYY-MM-DD",
    "period_end": "YYYY-MM-DD",
    "due_date": "YYYY-MM-DD",
    "previous_amount_due": 0.00,
    "payment_received_date": "YYYY-MM-DD",
    "payment_received_amt": 0.00,
    "current_elec_charges": 0.00,
    "taxes_amount_usd": 0.00,
    "total_amount_due_usd": 0.00,
    "after_due_amount_usd": 0.00,
    "rate_plan": ""
  },
  "charges": [
    { "item": "", "qty": null, "unit": "", "rate_usd": null, "amount_usd": 0.00,
      "rider_code": null, "item_type": "CONNECTION|ENERGY|RIDER" }
  ],
  "usage": [
    { "meter": "", "prev_read_date": "YYYY-MM-DD", "prev_read": null,
      "curr_read_date": "YYYY-MM-DD", "curr_read": null,
      "kwh": null, "billed_kwh": null, "days": null }
  ],
  "taxes": [
    { "jurisdiction": "", "tax_name": "", "amount_usd": 0.00 }
  ],
  "notes": ""
}

TEXT PAGES:
--- PAGE 1 ---
<insert page 1 text>
--- PAGE 2 ---
<insert page 2 text>
...

TABLES (optional TSV):
--- TABLE 1 (TSV) ---
<tsv>
--- TABLE 2 (TSV) ---
<tsv>
'''


# def read_pdf_text(pdf_path):
#     pages_txt = []
#     with pdfplumber.open(pdf_path) as pdf:
#         for i, p in enumerate(pdf.pages, 1):
#             t = p.extract_text() or ""
#             t = re.sub(r"[ \t]+", " ", t).strip()
#             pages_txt.append(f"--- PAGE {i} ---\n{t}")
#     return "\n\n".join(pages_txt)

def call_ollama(model, system_msg, user_msg):
    resp = ollama.chat(
        model=model,
        messages=[{"role":"system","content":system_msg},{"role":"user","content":user_msg}],
        options={"temperature":0},
        format="json"  # force JSON-only output
    )
    return resp["message"]["content"]

def main():
    pdf = "./sample_data/June2025.pdf"
    out = "./sample_output/"

    text = read_pdf(pdf)

    # print(text)




    messages =[
        {"role": "system", "content": SYSTEM},
        {"role": "user", "content": text}
    ]

    response = ollama.chat(
        model = OLLAMA_MODEL,
        format="json",
        options={"temperature": 0},
        messages = messages
    )

    print("Exracted record ............... ")
    print(response)

if __name__ == "__main__":
    main()


Exracted record ............... 
model='phi4-mini:latest' created_at='2025-08-23T21:26:45.880275589Z' done=True done_reason='stop' total_duration=182667837577 load_duration=11555262434 prompt_eval_count=2255 prompt_eval_duration=107955601139 eval_count=379 eval_duration=63105839706 message=Message(role='assistant', content='{\n  "schema_version": "duke.v1",\n  "source_file": "",\n  "account": {\n    "number": null,\n    "service_address": "418 S GRANT ST APT 12, BLOOMINGTON IN 47401-4772"\n  },\n  "bill": {\n    "bill_date": "2025-07-07",\n    "period_start": "2024-06-04",\n    "period_end": "2025-07-02",\n    "due_date": "2025-08-28",\n    "previous_amount_due": 86.62,\n    "payment_received_date": null,\n    "payment_received_amt": null,\n    "current_elec_charges": 144.56,\n    "taxes_amount_usd": 10.12,\n    "total_amount_due_usd": 154.68,\n    "after_due_amount_usd": null,\n    "rate_plan": "Residential Electric Service (RS)"\n  },\n  "charges": [\n    {\n      "item": "",\n      