In [1]:
#!/usr/bin/env python3
# -- coding: utf-8 --
"""
Agentic RAG Pipeline for Completion/Workover Reports (Offline)
--------------------------------------------------------------
- PDF -> text extraction (PyPDF2 -> pdfminer fallback)
- Chunking + TF-IDF retrieval
- Rule-based field extraction (well data, depths, HSE, tests)
- Validation & sanity checks
- Nodal analysis interface (stub) with JSON inputs
- Word-bounded summary generation
- Structured JSON + Markdown outputs
- (NEW) Optional Markdown -> PDF export via --export-pdf

Usage:
  python agentic_rag_pipeline.py --pdf "/path/to/report.pdf" \
      --outdir "/path/to/out" \
      --word-limit 250 \
      --nodal-json "/path/to/nodal_inputs.json" \
      --export-pdf

Nodal JSON schema example:
{
  "wellhead_pressure_bar": 18.0,
  "flow_rate_m3_h": 135.0,
  "tubing_inner_diameter_in": 6.2,
  "fluid_density_kg_m3": 1015.0,
  "fluid_viscosity_cP": 0.78,
  "reservoir_temperature_c": 90.0
}
"""


'\nAgentic RAG Pipeline for Completion/Workover Reports (Offline)\n--------------------------------------------------------------\n- PDF -> text extraction (PyPDF2 -> pdfminer fallback)\n- Chunking + TF-IDF retrieval\n- Rule-based field extraction (well data, depths, HSE, tests)\n- Validation & sanity checks\n- Nodal analysis interface (stub) with JSON inputs\n- Word-bounded summary generation\n- Structured JSON + Markdown outputs\n- (NEW) Optional Markdown -> PDF export via --export-pdf\n\nUsage:\n  python agentic_rag_pipeline.py --pdf "/path/to/report.pdf"       --outdir "/path/to/out"       --word-limit 250       --nodal-json "/path/to/nodal_inputs.json"       --export-pdf\n\nNodal JSON schema example:\n{\n  "wellhead_pressure_bar": 18.0,\n  "flow_rate_m3_h": 135.0,\n  "tubing_inner_diameter_in": 6.2,\n  "fluid_density_kg_m3": 1015.0,\n  "fluid_viscosity_cP": 0.78,\n  "reservoir_temperature_c": 90.0\n}\n'

In [2]:
# ------------------ Imports ------------------
import os, re, json, math, argparse, sys
from datetime import datetime
from typing import List, Dict, Any, Tuple

In [3]:
# ------------------ Helpers ------------------
def clean_spaces(s: str) -> str:
    s = re.sub(r"[ \t]+", " ", s or "")
    s = re.sub(r"\n{3,}", "\n\n", s)
    return s.strip()

def enforce_word_limit(s: str, limit: int) -> str:
    if not s:  # Handle None or empty string input
        return ""
    words = re.findall(r"\S+", s or "")
    return " ".join(words[:max(0, int(limit))])

In [4]:
# ------------------ PDF Extraction ------------------
def extract_pdf_text(pdf_path):
    """
    Robust PDF text extraction:
    1) Try pypdf (fast)
    2) Try pdfminer.six (more thorough)
    3) Fallback to OCR (pytesseract + pdf2image) if still empty
    Returns a UTF-8 string (may be long).
    """
    text = ""

    # 1) pypdf
    try:
        import pypdf
        reader = pypdf.PdfReader(pdf_path)
        pages = []
        for i, p in enumerate(reader.pages):
            pages.append(p.extract_text() or "")
        text = "\n".join(pages)
        if text and text.strip():
            return text
    except Exception as e:
        print(f"[extract_pdf_text] pypdf failed: {e}", file=sys.stderr)

    # 2) pdfminer.six
    try:
        from pdfminer.high_level import extract_text as pdfminer_extract_text
        t2 = pdfminer_extract_text(pdf_path) or ""
        if len(t2.strip()) > len(text.strip()):
            text = t2
        if text and text.strip():
            return text
    except Exception as e:
        print(f"[extract_pdf_text] pdfminer failed: {e}", file=sys.stderr)

    # 3) OCR fallback
    print("[extract_pdf_text] Falling back to OCR (this may take a while).", file=sys.stderr)
    try:
        # Dependencies: poppler, tesseract-ocr, python libs
        # In Colab, run once if needed:
        # !apt-get -y install poppler-utils tesseract-ocr
        # !pip install pdf2image pytesseract Pillow
        from pdf2image import convert_from_path
        import pytesseract
        from PIL import Image

        # Render images from the PDF (adjust dpi for quality/speed)
        images = convert_from_path(pdf_path, dpi=300)
        ocr_texts = []
        for idx, img in enumerate(images):
            # Optional: small preprocessing can help OCR
            if img.mode != "L":
                img = img.convert("L")
            ocr_text = pytesseract.image_to_string(img, lang="eng")
            if ocr_text:
                ocr_texts.append(ocr_text)
        text = "\n\n".join(ocr_texts)
        if text and text.strip():
            print(f"[extract_pdf_text] OCR recovered ~{len(text)} characters.")
            return text
    except Exception as e:
        print(f"[extract_pdf_text] OCR failed: {e}", file=sys.stderr)

    # If all failed, return empty string (caller will handle it)
    return text


In [5]:
# ------------------ Chunking ------------------
def chunk_text(t: str, chunk_size=1500, overlap=300) -> List[str]:
    if not t:
        return []
    res = []
    i = 0
    L = len(t)
    while i < L:
        j = min(L, i + chunk_size)
        res.append(t[i:j])
        i += (chunk_size - overlap)
        if i <= 0: break
    return res

In [6]:
# ------------------ Retrieval ------------------
def build_retriever(chunks: List[str]):
    try:
        from sklearn.feature_extraction.text import TfidfVectorizer
        from sklearn.metrics.pairwise import cosine_similarity
    except Exception as e:
        raise RuntimeError("scikit-learn is required for TF-IDF retrieval.")
    vectorizer = TfidfVectorizer(max_features=20000, ngram_range=(1,2))
    tfidf = vectorizer.fit_transform(chunks) if chunks else None
    def retrieve(query: str, k=5) -> List[Tuple[str, float]]:
        if tfidf is None or not chunks:
            return []
        qv = vectorizer.transform([query])
        sims = cosine_similarity(qv, tfidf)[0]
        idxs = sims.argsort()[::-1][:k]
        return [(chunks[i], float(sims[i])) for i in idxs]
    return retrieve

In [7]:
# ------------------ Field Extraction ------------------
def first_group(pattern: str, text: str, flags=re.IGNORECASE) -> str:
    m = re.search(pattern, text, flags)
    return m.group(1).strip() if m else ""

def parse_report_fields(text: str) -> Dict[str, Any]:
    d: Dict[str, Any] = {}
    d["well_name"] = first_group(r"Well Name\s+([^\n]+)", text)
    d["operation"] = first_group(r"Operation\s+([^\n]+)", text)
    d["start_of_operation"] = first_group(r"Start of Operation\s+([^\n]+)", text)
    d["duration"] = first_group(r"Duration\s+([^\n]+)", text)
    d["total_depth"] = first_group(r"Well Total Depth\s+([^\n]+)", text)

    # Key events/depths
    d["packer_set_depth_m"] = first_group(r"Set\s+9\s*5/8[â\"]?\s+.*?at\s+([0-9\.]+\s*m\s*AHGL)", text)
    if not d["packer_set_depth_m"]:
        d["packer_set_depth_m"] = first_group(r"Set\s+9\s*5/8[â\"]?\s+NOV liner hanger.*?at\s+([0-9\.]+\s*m\s*AHGL)", text)

    d["pbr_bottom_m"] = first_group(r"mule shoe at\s+([0-9\.]+\s*m)\s*AHB?GL", text) \
                        or first_group(r"bottom of (?:the )?PBR.*?([0-9\.]+\s*m\s*AHGL)", text)

    d["hand_over"] = first_group(r"handed.*?to Operations on\s+([^\n]+)", text)

    # HSE and equipment
    d["hse_incidents"] = "None" if re.search(r"No incidents", text, re.IGNORECASE) else ""
    d["esp_installed"] = bool(re.search(r"\bESP\b", text))
    d["gre_string"] = bool(re.search(r"\bGRE\b", text))

    # Logging/testing
    d["mti_logged"] = bool(re.search(r"\bMTI\b", text))
    d["press_test_annulus"] = "10 bar" if re.search(r"Pressure tested annulus to 10 bar", text, re.IGNORECASE) else ""

    # Reservoir
    d["reservoir_fluid"] = "Brine" if re.search(r"Well Bore Fluids:\s*o\s*Brine", text, re.IGNORECASE) else ""
    d["reservoir_bottomhole_temp_c"] = first_group(r"Bottom Hole temperature[:\s]*([0-9]+)\s*Â°C", text)
    return d

In [8]:
# ------------------ Validation ------------------
def parse_depth_m(val: str) -> float:
    if not val:
        return math.nan
    m = re.search(r"([0-9]+(?:\.[0-9]+)?)\s*m", val.replace(",", "."))
    return float(m.group(1)) if m else math.nan

def validate_fields(data: Dict[str, Any]) -> List[str]:
    issues: List[str] = []
    depth_fields = { "packer_set_depth_m": data.get("packer_set_depth_m", ""),
                     "pbr_bottom_m": data.get("pbr_bottom_m", "") }
    for k, v in depth_fields.items():
        d = parse_depth_m(v)
        if math.isnan(d):
            issues.append(f"Missing or unparsable depth for {k}.")
        elif not (0 < d < 5000):
            issues.append(f"Unusual depth for {k}: {v}")

    if not data.get("start_of_operation"):
        issues.append("Start of Operation date not found.")
    if not data.get("hand_over"):
        issues.append("Hand-over to Operations date not found.")
    return issues


In [9]:
# ------------------ Nodal Analysis Stub ------------------
def nodal_default_inputs(data: Dict[str, Any]) -> Dict[str, Any]:
    return {
        "wellhead_pressure_bar": 10,
        "flow_rate_m3_h": 50,
        "tubing_inner_diameter_in": 2,
        "fluid_density_kg_m3": 900,
        "fluid_viscosity_cP": 0.01,
        "reservoir_temperature_c": (data.get("reservoir_bottomhole_temp_c") or None)
    }

def run_nodal(inputs: Dict[str, Any]) -> Dict[str, Any]:
    missing = [k for k, v in inputs.items() if v in (None, "", float('nan'))]
    if missing:
        return {
            "status": "pending_inputs",
            "missing_inputs": missing,
            "message": "Provide missing nodal inputs to compute system curve and operating point.",
            "results": None
        }
    # Placeholder: echo operating point
    return {
        "status": "ok",
        "missing_inputs": [],
        "message": "Computed operating point (placeholder).",
        "results": {
            "q_m3_h": inputs["flow_rate_m3_h"],
            "whp_bar": inputs["wellhead_pressure_bar"],
            "tubing_id_in": inputs["tubing_inner_diameter_in"]
        }
    }



In [10]:
# ------------------ Summary Generation ------------------
def generate_summary(data: Dict[str, Any], retrieve_func, word_limit: int) -> str:
    highlights = []
    if data.get("well_name"): highlights.append(f"Well: {data['well_name']}.")
    if data.get("operation"): highlights.append(f"Operation: {data['operation']}.")
    if data.get("start_of_operation"): highlights.append(f"Start: {data['start_of_operation']}.")
    if data.get("duration"): highlights.append(f"Duration: {data['duration']}.")
    if data.get("hand_over"): highlights.append(f"Handover: {data['hand_over']}.")
    if data.get("packer_set_depth_m"): highlights.append(f"Liner hanger/packer set at {data['packer_set_depth_m']}.")
    if data.get("pbr_bottom_m"): highlights.append(f"PBR reference near {data['pbr_bottom_m']}.")
    if data.get("esp_installed"): highlights.append("ESP installed.")
    if data.get("mti_logged"): highlights.append("MTI logging completed; annulus pressure test to 10 bar passed.")
    if data.get("hse_incidents") == "None": highlights.append("HSE: No incidents reported; drills/toolboxes conducted.")
    if data.get("reservoir_fluid"): highlights.append(f"Reservoir fluid: {data['reservoir_fluid']}.")
    if data.get("reservoir_bottomhole_temp_c"): highlights.append(f"Bottomhole temperature: {data['reservoir_bottomhole_temp_c']} Â°C.")

    support = []
    if retrieve_func:
        for q in ["Executive summary objectives outcomes",
                  "Daily operations key events",
                  "HSE performance incidents drills",
                  "Logging MTI annulus pressure test",
                  "Well data casing GRE PBR ESP depths"]:
            for chunk, score in retrieve_func(q, k=1):
                # pick first sentence
                sents = re.split(r'(?<=[\.\?\!])\s+', chunk)
                if sents:
                    support.append(sents[0].strip())

    text = " ".join(highlights + support)
    return enforce_word_limit(text, word_limit)

In [11]:
!apt-get -y install poppler-utils tesseract-ocr
!pip install pdf2image pytesseract Pillow


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
The following NEW packages will be installed:
  poppler-utils
0 upgraded, 1 newly installed, 0 to remove and 41 not upgraded.
Need to get 186 kB of archives.
After this operation, 697 kB of additional disk space will be used.
Get:1 http://security.ubuntu.com/ubuntu jammy-security/main amd64 poppler-utils amd64 22.02.0-2ubuntu0.12 [186 kB]
Fetched 186 kB in 1s (251 kB/s)
Selecting previously unselected package poppler-utils.
(Reading database ... 125082 files and directories currently installed.)
Preparing to unpack .../poppler-utils_22.02.0-2ubuntu0.12_amd64.deb ...
Unpacking poppler-utils (22.02.0-2ubuntu0.12) ...
Setting up poppler-utils (22.02.0-2ubuntu0.12) ...
Processing triggers for man-db (2.10.2-1) ...
Collecting pdf2image
  Downloading pdf2image-1.17.0-py3-none-any.whl.metadata (6.2 kB)
Collecting pytesseract
  Download

In [12]:
# ------------------ Markdown -> PDF Export ------------------
def export_md_to_pdf(md_path, pdf_path):
    """
    Robust Markdown -> PDF using reportlab.
    If HTML parsing yields no content, falls back to raw Markdown text.
    Returns True on success, False otherwise.
    """
    try:
        from reportlab.lib.pagesizes import A4
        from reportlab.lib.units import mm
        from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
        from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, ListFlowable, ListItem
        from reportlab.lib.enums import TA_LEFT
        from reportlab.lib import utils
        import markdown
        from bs4 import BeautifulSoup
    except Exception:
        print("PDF export requires: reportlab, markdown, beautifulsoup4.", file=sys.stderr)
        print("In Colab, run:  !pip install reportlab markdown beautifulsoup4", file=sys.stderr)
        return False

    # Read MD
    with open(md_path, "r", encoding="utf-8") as f:
        md_text = f.read()
    if not md_text.strip():
        print("Markdown file is empty; writing placeholder page.", file=sys.stderr)
        md_text = "# Completion Report Summary\n\n(No content generated.)"

    # Convert to HTML
    html = markdown.markdown(md_text, extensions=["tables", "fenced_code", "sane_lists"])
    soup = BeautifulSoup(html, "html.parser")

    # Styles
    styles = getSampleStyleSheet()
    if "Heading1L" not in styles:
        styles.add(ParagraphStyle(name="Heading1L", parent=styles["Heading1"], alignment=TA_LEFT, spaceAfter=8))
    if "Heading2L" not in styles:
        styles.add(ParagraphStyle(name="Heading2L", parent=styles["Heading2"], alignment=TA_LEFT, spaceAfter=6))
    if "BodyL" not in styles:
        styles.add(ParagraphStyle(name="BodyL", parent=styles["BodyText"], spaceAfter=6, leading=14))

    story = []

    def para(text, style="BodyL", space=4):
        story.append(Paragraph(text, styles[style]))
        story.append(Spacer(1, space))

    def handle_list(tag):
        items = []
        for li in tag.find_all("li", recursive=False):
            items.append(ListItem(Paragraph(li.decode_contents() or "", styles["BodyL"])))
        if items:
            story.append(ListFlowable(items, bulletType="bullet" if tag.name == "ul" else "1"))
            story.append(Spacer(1, 4))

    # Build flowables from top-level nodes
    recognized = 0
    for node in soup.children:
        name = getattr(node, "name", None)
        if name == "h1":
            para(node.decode_contents(), "Heading1L", 8); recognized += 1
        elif name == "h2":
            para(node.decode_contents(), "Heading2L", 6); recognized += 1
        elif name in {"p", "pre", "code", "h3"}:
            para(node.decode_contents(), "BodyL", 4); recognized += 1
        elif name in {"ul", "ol"}:
            handle_list(node); recognized += 1
        elif name == "table":
            # Very simple table handling: flatten to text
            text = node.get_text(separator="  |  ").strip()
            if text:
                para(text, "BodyL", 4); recognized += 1
        else:
            # Raw text nodes
            raw = str(node).strip()
            if raw:
                para(raw, "BodyL", 4); recognized += 1

    # Fallback if nothing was recognized: write raw MD as a paragraph
    if not story:
        para("## Completion Report Summary (raw markdown fallback)", "Heading2L", 8)
        # Escape angle brackets for Paragraph
        safe_md = md_text.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")
        # Replace newlines with <br/> to preserve structure
        safe_md = safe_md.replace("\n", "<br/>")
        para(safe_md, "BodyL", 4)

    # Build PDF
    doc = SimpleDocTemplate(
        pdf_path, pagesize=A4,
        leftMargin=18*mm, rightMargin=18*mm,
        topMargin=18*mm, bottomMargin=18*mm
    )
    try:
        doc.build(story)
    except Exception as e:
        print(f"PDF build failed: {e}", file=sys.stderr)
        return False

    # Quick debug
    print(f"[export_md_to_pdf] nodes_parsed={recognized} flowables={len(story)}")
    return os.path.exists(pdf_path)



In [13]:
!pip install reportlab markdown beautifulsoup4


Collecting reportlab
  Downloading reportlab-4.4.4-py3-none-any.whl.metadata (1.7 kB)
Downloading reportlab-4.4.4-py3-none-any.whl (2.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m16.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: reportlab
Successfully installed reportlab-4.4.4


In [17]:
import argparse, json, os, re, sys
from datetime import datetime

# Detect notebook so we don't auto-run main() in Colab/Jupyter
IN_NOTEBOOK = any(m in sys.modules for m in ("ipykernel", "google.colab"))

def run(pdf, outdir=".", word_limit=250, nodal_json=None, export_pdf=False):
    if not os.path.exists(pdf):
        raise FileNotFoundError(f"PDF not found: {pdf}")
    if word_limit < 1:
        word_limit = 250

    os.makedirs(outdir, exist_ok=True)

    # ---- Extraction & preprocessing ----
    raw = extract_pdf_text(pdf)
    text = clean_spaces(raw)
    chunks = chunk_text(text, chunk_size=1500, overlap=300)

    retrieve_func = None
    if chunks:
        try:
            retrieve_func = build_retriever(chunks)
        except Exception:
            retrieve_func = None

    # ---- Parsing & validation ----
    extracted = parse_report_fields(text)
    issues = validate_fields(extracted)

    # ---- Nodal inputs ----
    nodal_inputs = nodal_default_inputs(extracted)
    if nodal_json and os.path.exists(nodal_json):
        with open(nodal_json, "r", encoding="utf-8") as f:
            nodal_inputs.update(json.load(f))

    # ---- Questions & summary ----
    questions = []
    if run_nodal(nodal_inputs).get("status") != "ok":
        for m in run_nodal(nodal_inputs).get("missing_inputs", []):
            questions.append(f"Please provide *{m.replace('_',' ').capitalize()}*.")

    base_summary = generate_summary(extracted, retrieve_func, word_limit)

    if run_nodal(nodal_inputs).get("status") == "ok":
        r = run_nodal(nodal_inputs)["results"]
        try:
            nodal_line = (
                f" Nodal operating point (stub): q ≈ {r['q_m3_h']} m³/h at "
                f"WHP ≈ {r['whp_bar']} bar (Tubing ID {r['tubing_id_in']} in)."
            )
        except Exception:
            nodal_line = " Nodal operating point (stub): results available but incomplete."
        base_summary = enforce_word_limit(base_summary + nodal_line, word_limit)

    out_json = {
        "timestamp": datetime.now().isoformat(),
        "inputs": {"pdf": os.path.basename(pdf), "word_limit": word_limit},
        "data_extracted": extracted,
        "validation_issues": issues,
        "nodal_inputs_used": nodal_inputs,
        "nodal_status": run_nodal(nodal_inputs),
        "questions_for_user": questions,
        "summary_words": len(re.findall(r"\S+", base_summary)),
        "summary": base_summary,
    }

    json_path = os.path.join(outdir, "rag_agentic_outputs.json")
    md_path = os.path.join(outdir, "rag_agentic_summary.md")

    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(out_json, f, indent=2, ensure_ascii=False)

    with open(md_path, "w", encoding="utf-8") as f:
        f.write(f"# Completion Report Summary (≤{word_limit} words)\n\n")
        f.write(base_summary + "\n")
        if questions:
            f.write("\n## Missing Inputs for Nodal Analysis\n")
            for q in questions:
                f.write(f"- {q}\n")

        if export_pdf:
            pdf_path = os.path.join(outdir, "rag_agentic_summary.pdf")
            ok = export_md_to_pdf(md_path, pdf_path)
        if ok:
            print("Exported PDF:", pdf_path)
            # Auto-offer download in Colab
            if IN_NOTEBOOK and "google.colab" in sys.modules:
                try:
                    from google.colab import files
                    files.download(pdf_path)
                except Exception as e:
                    print(f"(Could not auto-download PDF: {e})", file=sys.stderr)
        else:
            print("PDF export failed or missing deps. In Colab run:", file=sys.stderr)
            print("!pip install reportlab markdown beautifulsoup4", file=sys.stderr)

# Do NOT auto-run main() inside notebooks
if __name__ == "__main__" and not IN_NOTEBOOK:
    main()

run(
    pdf= "/content/TNO-Report-2015-R10065-final-public2020.pdf",
    outdir="/content/out",
    word_limit=250,
    nodal_json="/content/nodal.json",  # or None
    export_pdf=True
)


[extract_pdf_text] pypdf failed: No module named 'pypdf'
[extract_pdf_text] pdfminer failed: No module named 'pdfminer'
[extract_pdf_text] Falling back to OCR (this may take a while).


[extract_pdf_text] OCR recovered ~190421 characters.
[export_md_to_pdf] nodes_parsed=2 flowables=0
Exported PDF: /content/out/rag_agentic_summary.pdf


Markdown file is empty; writing placeholder page.


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# New Section