In [1]:
import json
import math
import os
import re
import shutil
import time
from typing import TypedDict, List

import fitz
import numpy as np
import pandas as pd
import pymupdf4llm
from chromadb import PersistentClient
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langgraph.graph import StateGraph


  from .autonotebook import tqdm as notebook_tqdm


Consider using the pymupdf_layout package for a greatly improved page layout analysis.


In [2]:
# Execute data_preprocessing.ipynb in this kernel; the log printed below shows it
# reading the `ecat_filtered_data` table and assembling the report dataset.
# NOTE(review): later cells use main() and pipeline_results, which presumably come
# from that notebook — confirm.
%run "data_preprocessing.ipynb"

Reading table: ecat_filtered_data
Found 4 existing reports — will SKIP these samples.
Found 489 non-buying units — will KEEP these units.
Loaded chunk 1 (11919 rows)
Final dataset contains 11657 new rows for report generation.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  recent["RESULT_NUM"] = pd.to_numeric(recent["RESULT"], errors="coerce")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  recent["TEST_KEY"] = recent["TEST_NAME"].apply(normalize_name)


In [3]:
# Run the preprocessing entry point and keep its results (indexed per sample below).
# NOTE(review): main() is not defined in this notebook — presumably it is provided by
# the %run of data_preprocessing.ipynb. The log printed here is identical to the log
# printed by that %run, so the pipeline appears to execute twice — confirm this is needed.
pipeline_results = main()

Reading table: ecat_filtered_data
Found 4 existing reports — will SKIP these samples.
Found 489 non-buying units — will KEEP these units.
Loaded chunk 1 (11919 rows)
Final dataset contains 11657 new rows for report generation.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  recent["RESULT_NUM"] = pd.to_numeric(recent["RESULT"], errors="coerce")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  recent["TEST_KEY"] = recent["TEST_NAME"].apply(normalize_name)


In [4]:
# Display the fused summary text of the pipeline result at index 1 (sanity check).
pipeline_results[1]["fused_summary"]

" SAMPLE INFORMATION\nSample 41003698 Sample Type Fines was taken on 2024-09-30 00:00:00 and received on 2024-10-11 00:00:00. It belongs to unit CITGO - CORPUS CHRISTI (NEW), TX, US (Unit ID: 313) and was analyzed under lab code 'C'.\n\n PARAMETER DETAILS\n- Parameter Al₂O₃ (wt%): captured value 49.82718, deviation 0.14407999999999532, isWithinLimit False. Y_Min: 32.76446, Y_Max: 44.50726. Last 5 sample values: 53.32167, 49.6831, 53.8245, 50.54328, 54.04769.\n- Parameter CaO (wt%): captured value 0.1923199999999999, deviation -0.0798400000000001, isWithinLimit False. Y_Min: 0.03, Y_Max: 0.17. Last 5 sample values: 0.1000999999999999, 0.27216, 0.12063, 0.1885699999999999, 0.1283499999999999.\n- Parameter Co (ppm): captured value 25.80823, deviation 2.6572899999999997, isWithinLimit False. Y_Min: 27.76159, Y_Max: 50.82113. Last 5 sample values: 25.90799, 23.15094, 25.16176, 24.91641, 22.50408.\n- Parameter Cu (ppm): captured value 26.72808, deviation 0.8529499999999999, isWithinLimit Tru

In [5]:
# Path of the reference PDF that is loaded and chunked in the cells below.
PDF_PATH = "GRACE Ecat Analysis Guide.pdf"
# NOTE(review): CHROMA_DIR is not referenced in the visible cells; the ChromaDB cell
# uses its own persist_dir ("CHROMA_DB_ECATS") — confirm which directory is intended.
CHROMA_DIR = "CHROMA_DB"

EXTRACTING TEXT FROM PDF

In [6]:
# PyMuPDFLoader is already imported in the first cell; this re-import is redundant but harmless.
from langchain_community.document_loaders import PyMuPDFLoader

# Load the PDF into a list of documents (presumably one per page — TODO confirm).
loader = PyMuPDFLoader(PDF_PATH)
pages = loader.load()

# Concatenate the text of every loaded document, separated by newlines.
raw_text = "\n".join(p.page_content for p in pages)

print("PDF text extracted")

PDF text extracted


In [7]:
def normalize_and_clean_pdf_text(raw_text: str) -> list[str]:
    """
    Convert raw PDF text into clean paragraph-level blocks.

    Runs of blank lines are collapsed to a single paragraph break and runs of
    spaces/tabs to a single space. Within each paragraph every line is stripped,
    date tokens such as 1/1/14 or 12/09/2017 are removed, and the line is
    dropped when it:
      * contains no ASCII letter (pure numeric / axis lines),
      * consists only of one repeated chart label (DATE, MAT ACTIVITY, ...),
      * is more than 60% digits.

    Returns the surviving lines of each paragraph joined with single spaces;
    paragraphs that end up empty are omitted.
    """
    date_token = re.compile(r'\b\d{1,2}/\d{1,2}/\d{2,4}\b')
    chart_label = re.compile(
        r'(DATE|MAT ACTIVITY|SURFACE AREA|UMB/UMF|SA/K)'
        r'(\s+\1)*',
        flags=re.IGNORECASE,
    )

    def _is_content(candidate: str) -> bool:
        # No letters at all -> numeric axis noise.
        if re.search(r'[A-Za-z]', candidate) is None:
            return False
        # A chart label repeated across the line.
        if chart_label.fullmatch(candidate):
            return False
        # Number-dominated lines (more than 60% digits) are chart residue.
        digit_total = sum(ch.isdigit() for ch in candidate)
        return digit_total / max(len(candidate), 1) <= 0.6

    # Blank-line runs first, then horizontal whitespace.
    normalized = re.sub(r'[ \t]+', ' ', re.sub(r'\n\s*\n', '\n\n', raw_text))

    paragraphs = []
    for paragraph in normalized.split("\n\n"):
        kept = []
        for raw_line in paragraph.splitlines():
            stripped = raw_line.strip()
            if stripped == "":
                continue
            without_dates = date_token.sub('', stripped)
            if _is_content(without_dates):
                kept.append(without_dates)

        joined = " ".join(kept).strip()
        if joined:
            paragraphs.append(joined)

    return paragraphs

# Clean the extracted PDF text into paragraph-level blocks used by the chunking cells.
text_blocks = normalize_and_clean_pdf_text(raw_text)

In [8]:
def split_into_sentences(text: str):
    """
    Split text into sentences at whitespace that follows '.', '!' or '?'.

    Each piece is stripped; empty pieces are discarded. The terminating
    punctuation stays attached to its sentence.
    """
    sentences = []
    for piece in re.split(r'(?<=[.!?])\s+', text):
        piece = piece.strip()
        if piece:
            sentences.append(piece)
    return sentences


In [9]:
def _sentence_pack(text, chunk_size, chunk_overlap):
    sentences = split_into_sentences(text)

    chunks = []
    current = ""
    
    for s in sentences:
        if len(current) + len(s) > chunk_size:
            chunks.append(current.strip())
            current = current[-chunk_overlap:] + " " + s
        else:
            current += " " + s

    if current.strip():
        chunks.append(current.strip())

    return chunks


In [10]:
def hybrid_sentence_heading_chunking(
    text: str,
    chunk_size: int = 800,
    chunk_overlap: int = 150
):
    """
    Chunk `text` by headings first, then by sentences.

    The text is processed line by line. A line counts as a heading when it is
    made only of capitals, digits, whitespace and '-', '_', '/' (at least 4
    characters) or when it ends with ':'. When a heading is met, the body
    collected so far is sentence-packed into chunks and a new section starts.
    Every body line is prefixed with the heading of its section before packing.

    A heading that is never followed by body text produces no chunk.

    Args:
        text: Text to chunk; lines are separated by a newline character.
        chunk_size: Soft maximum chunk length, forwarded to _sentence_pack.
        chunk_overlap: Overlap in characters, forwarded to _sentence_pack.

    Returns:
        List of chunk strings.
    """
    # Detect simple headings (ALL CAPS or ending with :).
    # The previous pattern `^[A-Z0-9\s\-_/]{4,}$|:$` was used with .match(), which
    # anchors at position 0, so the `:$` branch could only match a bare ":" line.
    heading_pattern = re.compile(r'^(?:[A-Z0-9\s\-_/]{4,}|.*:)$')

    chunks = []
    buffer = ""
    current_heading = ""

    for block in text.split("\n"):
        block = block.strip()
        if not block:
            continue

        if heading_pattern.match(block):
            # Close the section collected under the previous heading.
            if buffer:
                chunks.extend(_sentence_pack(buffer, chunk_size, chunk_overlap))
                buffer = ""
            current_heading = block
        else:
            buffer += f"{current_heading} {block} "

    if buffer:
        chunks.extend(_sentence_pack(buffer, chunk_size, chunk_overlap))

    return chunks


In [11]:
# Chunking parameters, in characters (they are compared against len() of the text).
CHUNK_SIZE = 800
CHUNK_OVERLAP = 150

# One Document per chunk, across all cleaned text blocks.
pdf_text_docs = []

# NOTE(review): `block`, `chunks` and `chunk` stay defined as globals after this loop;
# the RAG2 helper cells later reuse the name `chunks` — confirm they do not pick up
# these PDF chunks by accident.
for block in text_blocks:
    chunks = hybrid_sentence_heading_chunking(
        block,
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP
    )

    for chunk in chunks:
        pdf_text_docs.append(Document(page_content=chunk))

print(f"Total hybrid chunks created: {len(pdf_text_docs)}")


Total hybrid chunks created: 20


In [12]:
# Dump every chunk with its index and whitespace-delimited word count for manual inspection.
for i, doc in enumerate(pdf_text_docs):
    print("\n" )
    print(f"CHUNK {i} | WORDS: {len(doc.page_content.split())}")
    print(doc.page_content)



CHUNK 0 | WORDS: 113
W. R. Grace & Co. Valero – Wilmington: October 25, 2017 Focused. Innovative. Performance Driven. Interpreting ECAT Analyses W. R. Grace & Co. • Important tool for stewarding FCC operation • Frequent sampling is recommended • Fines analysis can be as important as unit ECAT when trouble shooting Interpreting ECAT Analyses W. R. Grace & Co. Interpreting ECAT Analyses • MAT The Micro Activity Test is a measure of catalyst activity in wt % conversion. MAT activity is affected by the addition rate of fresh catalyst, unit turnover rate, metals contamination of equilibrium catalyst, the fresh catalyst activity and quality. The MAT test is performed on a fluidized ACE (Advanced Cracking Evaluation) machine.


CHUNK 1 | WORDS: 123
of equilibrium catalyst, the fresh catalyst activity and quality. The MAT test is performed on a fluidized ACE (Advanced Cracking Evaluation) machine. MAT, wt% The surface area measurement (m2/gm) is the total of the zeolite and matrix surface ar

SETTING UP EMBEDDING MODEL

In [13]:
import torch
from transformers import AutoTokenizer, AutoModel

class BGE_Embedding:
    """
    Sentence-embedding helper around a Hugging Face encoder (default: BAAI/bge-large-en).

    The tokenizer and model are loaded once in the constructor and moved to the
    GPU when CUDA is available, otherwise they stay on the CPU.
    """

    def __init__(self, model_name="BAAI/bge-large-en"):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model.to(self.device)

    def embed(self, texts):
        """
        Embed a list of strings.

        Texts are tokenized with padding and truncated to 512 tokens. Token
        states are mean-pooled and L2-normalised.

        The pooling is weighted by the attention mask so padding tokens do not
        contribute. Without the mask, the vector of a text depends on how much
        padding its batch needs, so a chunk embedded in a batch and a query
        embedded on its own would not be comparable.

        Args:
            texts: List of strings to embed.

        Returns:
            numpy array of shape (len(texts), hidden_size) with unit-length rows.
        """
        enc = self.tokenizer(texts, padding=True, truncation=True, max_length=512, return_tensors="pt").to(self.device)
        with torch.no_grad():
            out = self.model(**enc)

        hidden = out.last_hidden_state
        # (batch, seq, 1) mask: 1 for real tokens, 0 for padding.
        mask = enc["attention_mask"].unsqueeze(-1).to(hidden.dtype)
        token_counts = mask.sum(dim=1).clamp(min=1e-9)  # guards against division by zero
        emb = (hidden * mask).sum(dim=1) / token_counts

        emb = torch.nn.functional.normalize(emb, p=2, dim=1)
        return emb.cpu().numpy()

# Instantiate the encoder with its default model name ("BAAI/bge-large-en").
embedder = BGE_Embedding()

# Langchain Embedding Wrapper.
from langchain_core.embeddings import Embeddings
class BGEVectorEmbedder(Embeddings):
    """
    Adapter exposing an object with an `embed(list_of_texts)` method through the
    LangChain `Embeddings` interface.
    """

    def __init__(self, embedder):
        # The wrapped embedder does all the work; this class only adapts the call shape.
        self.embedder = embedder

    def embed_documents(self, texts):
        """Embed a list of texts; returns whatever the wrapped embedder returns."""
        vectors = self.embedder.embed(texts)
        return vectors

    def embed_query(self, text):
        """Embed a single text by wrapping it in a one-element batch."""
        single_batch = [text]
        vectors = self.embedder.embed(single_batch)
        return vectors[0]

# Embedding function used by the indexing and retrieval cells.
embedding_fn = BGEVectorEmbedder(embedder)


In [14]:
# Embed the PDF chunks in batches of `batch_size` texts.
batch_size = 32
pdf_embeddings = []

for i in range(0, len(pdf_text_docs), batch_size):

    # Slice the next batch of Documents and pull out their raw text.
    batch_docs = pdf_text_docs[i : i + batch_size]
    batch_texts = [doc.page_content for doc in batch_docs]
    batch_emb = embedding_fn.embed_documents(batch_texts)
    # extend() adds one embedding per chunk, keeping the same order as pdf_text_docs.
    pdf_embeddings.extend(batch_emb)

print("Total PDF embeddings:", len(pdf_embeddings))

Total PDF embeddings: 20


CREATING CHROMADB FOR VECTOR STORAGE

In [15]:

# Directory holding the persistent ChromaDB store for this notebook.
persist_dir = "CHROMA_DB_ECATS"

# Start from an empty store on every run: delete the directory (errors ignored) and recreate it.
shutil.rmtree(persist_dir, ignore_errors=True)
# NOTE(review): short pause, presumably to let the OS finish releasing the deleted files — confirm it is needed.
time.sleep(0.1)
os.makedirs(persist_dir, exist_ok=True)

client = PersistentClient(path=persist_dir)

pdf_collection = client.get_or_create_collection(
    name="ECAT_PDF"
)


# IDs (must match chunk count)
pdf_all_ids = [f"pdf_chunk_{i}" for i in range(len(pdf_text_docs))]

# Extract text from Document objects
pdf_all_docs = [doc.page_content for doc in pdf_text_docs]

# Ensure embeddings are plain lists
pdf_embeddings_list = [
    emb.tolist() if hasattr(emb, "tolist") else list(emb)
    for emb in pdf_embeddings
]


# Store ids, chunk texts and the precomputed embeddings together.
pdf_collection.add(
    ids=pdf_all_ids,
    documents=pdf_all_docs,
    embeddings=pdf_embeddings_list
)


print("PDF chunks stored:", len(pdf_all_ids))
print("ChromaDB insertion completed successfully.")


PDF chunks stored: 20
ChromaDB insertion completed successfully.


CHUNK VERIFICATIONS

In [16]:
# VERIFY PDF CHUNKS
# Read the collection back and compare the stored document count with the number of chunks created.
pdf_stored = pdf_collection.get()
pdf_stored_docs = pdf_stored.get("documents", [])
print(f"PDF chunks originally created: {len(pdf_all_docs)}")
print(f"PDF chunks stored in ChromaDB: {len(pdf_stored_docs)}")

# This only reports the outcome; a mismatch does not stop the notebook.
if len(pdf_all_docs) == len(pdf_stored_docs):
    print("Count match!")
else:
    print("Count mismatch!")


PDF chunks originally created: 20
PDF chunks stored in ChromaDB: 20
Count match!


GPTOSS-20B API WRAPPER FUNCTION

In [17]:
import requests
import json
from dotenv import load_dotenv

class GPT_OSS:
    """
    Minimal callable client for an OpenRouter chat-completions model.

    Calling the instance with a prompt sends it as a single user message and
    returns the generated text.
    """

    def __init__(self, api_key, model="openai/gpt-oss-20b", timeout=120):
        """
        Args:
            api_key: OpenRouter API key, sent as a Bearer token.
            model: Model identifier passed in the request body.
            timeout: Seconds to wait for the HTTP request before `requests`
                raises a timeout error (previously the call could block forever).
        """
        self.api_key = api_key
        self.model = model
        self.timeout = timeout
        self.url = "https://openrouter.ai/api/v1/chat/completions"
        self.headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json",
        }

    def __call__(self, prompt):
        """
        Send `prompt` and return the model's text.

        Raises:
            RuntimeError: when the API reports an error, returns a body that is
                not JSON, or returns JSON without any known text field.
        """
        payload = {
            "model": self.model,
            "messages": [{"role": "user", "content": prompt}],
            # `extra_body` is an OpenAI-SDK argument whose contents the SDK merges
            # into the request body. In a raw HTTP payload the option has to be a
            # top-level field, otherwise it is sent as an unknown "extra_body" key.
            "reasoning": {"enabled": True},
        }

        http_response = requests.post(
            url=self.url,
            headers=self.headers,
            data=json.dumps(payload),
            timeout=self.timeout,
        )

        try:
            response = http_response.json()
        except ValueError as exc:
            # e.g. an HTML error page from a gateway in front of the API.
            status = getattr(http_response, "status_code", "unknown")
            body = str(getattr(http_response, "text", ""))[:500]
            raise RuntimeError(
                f"OSS API returned a non-JSON response (HTTP {status}): {body}"
            ) from exc

        if not isinstance(response, dict):
            raise RuntimeError(f"Unexpected OSS response format: {response}")

        # Standard OpenAI format
        if "choices" in response:
            try:
                return response["choices"][0]["message"]["content"]
            except (KeyError, IndexError, TypeError):
                # Malformed `choices`: fall through to the other known shapes and
                # finally to the error reporting below.
                pass

        # openrouter text
        if "output_text" in response:
            return response["output_text"]

        # vLLM-style text
        if "text" in response:
            return response["text"]

        # Some OSS models nest text
        if "response" in response:
            return response["response"]

        # Error handling
        if "error" in response:
            raise RuntimeError(f"OSS API Error: {response['error']}")

        # If no known field exists
        raise RuntimeError(f"Unexpected OSS response format: {response}")
# Load variables from a local .env file into the process environment.
load_dotenv()
# OpenRouter API key; os.getenv returns None when the variable is not set.
OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")

In [18]:
import google.generativeai as genai


class GeminiFlash:
    """
    Callable wrapper around a Gemini model from the `google.generativeai` package.

    NOTE(review): the import of this package prints an end-of-support notice that
    recommends migrating to `google.genai`.
    """

    def __init__(
        self,
        api_key: str,
        model: str = "gemini-2.5-flash"
    ):
        """
        Args:
            api_key: Gemini API key; must be a non-blank string.
            model: Model name passed to `genai.GenerativeModel`.

        Raises:
            ValueError: if `api_key` is missing or blank.
        """
        if not api_key or not api_key.strip():
            raise ValueError("Gemini API key must be provided")

        self.api_key = api_key
        self.model_name = model

        # Configure Gemini with notebook-provided key
        genai.configure(api_key=self.api_key)

        self.model = genai.GenerativeModel(self.model_name)

    def __call__(self, prompt: str) -> str:
        """
        Generate text for `prompt`.

        Raises:
            ValueError: if `prompt` is empty or blank.
            RuntimeError: if the model returns no usable text.
        """
        if not prompt or not prompt.strip():
            raise ValueError("Prompt is empty")

        response = self.model.generate_content(prompt)

        # The `.text` accessor can raise ValueError when the response holds no
        # text part (for example a blocked prompt). hasattr() only swallows
        # AttributeError, so that case used to escape as a raw ValueError
        # instead of the RuntimeError this method documents.
        try:
            text = getattr(response, "text", None)
        except ValueError as exc:
            raise RuntimeError(f"Gemini returned an empty response: {exc}") from exc

        if text:
            return text

        raise RuntimeError("Gemini returned an empty response")
# Load .env again (already loaded in the GPT_OSS cell) so this cell also works on its own.
load_dotenv()    
# Gemini API key; None when the environment variable is not set.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")



All support for the `google.generativeai` package has ended. It will no longer be receiving 
updates or bug fixes. Please switch to the `google.genai` package as soon as possible.
See README for more details:

https://github.com/google-gemini/deprecated-generative-ai-python/blob/main/README.md

  import google.generativeai as genai


In [19]:
class LLMWithFallback:
    """
    Callable that tries a primary LLM and switches to a fallback LLM when the
    primary fails with what looks like an availability problem.
    """

    def __init__(self, primary_llm, fallback_llm):
        self.primary, self.fallback = primary_llm, fallback_llm

    def __call__(self, prompt: str) -> str:
        """
        Return the primary model's answer, or the fallback's answer prefixed with
        a marker line when the primary raised an availability-type error.

        Any other exception from the primary is re-raised unchanged.
        """
        try:
            answer = self.primary(prompt)
        except Exception as e:
            message = str(e).lower()

            # Substrings of the lower-cased error text that count as availability issues.
            markers = ("429", "rate", "not found", "provider", "temporarily")
            availability_issue = False
            for marker in markers:
                if marker in message:
                    availability_issue = True
                    break

            if not availability_issue:
                # Unknown error: let it surface so it gets noticed.
                raise

            return "[FALLBACK MODEL USED: Gemini 2.5 Flash]\n\n" + self.fallback(prompt)

        return answer



In [30]:

# Global LLM handle used by generate_issue1_analysis / generate_issue2_analysis.
# NOTE(review): GeminiFlash and LLMWithFallback are defined above but not used in the
# visible cells — confirm whether `llm` was meant to be wrapped with the fallback.
llm = GPT_OSS(api_key=OPENROUTER_API_KEY)

RAG1 -> HELPER FUNCTION

In [31]:
#HELPER FUNCTION FOR EXTRACTING THE KEYWORD FROM FUSED SUMMARY
def extract_parameters(fused_summary):
    """
    Return the parameter names mentioned in a fused summary.

    A name is the text between the word "Parameter" and the next ':' on the same
    line, e.g. "Parameter Al₂O₃ (wt%): captured value ..." gives "Al₂O₃ (wt%)".

    The previous pattern only accepted ASCII letters, digits, '/', '-', '+' and
    spaces, so names containing subscripts, parentheses, '%', 'µ' or an en dash
    (the common case in the fused summaries) were silently skipped and no
    retrieval keywords were produced for them.

    Args:
        fused_summary: Fused summary text; None or an empty string gives [].

    Returns:
        List of stripped parameter names in order of appearance.
    """
    if not fused_summary:
        return []

    # Lazy match up to the first colon; never crosses a line break.
    pattern = r"Parameter\s+([^:\n]+?)[ \t]*:"
    return [p.strip() for p in re.findall(pattern, fused_summary)]
 
def extract_keywords(parameter_name):
    """
    Normalise a parameter name into a retrieval keyword.

    The name is stripped and lower-cased, whitespace runs become a single space,
    and any spaces around a hyphen are removed.
    """
    keyword = parameter_name.strip().lower()
    for regex, replacement in ((r"\s+", " "), (r"\s*-\s*", "-")):
        keyword = re.sub(regex, replacement, keyword)
    return keyword
def extract_all_keywords(fused_summary):
    """
    Return (parameter names, normalised keywords) for a fused summary.

    Both lists have the same length and order; keyword i is derived from
    parameter i.
    """
    params = extract_parameters(fused_summary)
    keywords = []
    for name in params:
        keywords.append(extract_keywords(name))
    return params, keywords
 
 

PROMPT FOR RAG1 ISSUE SUMMARY GENERATION

In [32]:


def generate_issue1_analysis(fused_summary, rag_context, parameter_correlation):
    """
    Build the RAG1 issue-identification prompt and return the LLM's reply.

    The prompt embeds, in this order: the parameter relationship map
    (`parameter_correlation` serialised with json.dumps), the fused summary and
    the retrieved context. It instructs the model to emit a one-line sample
    overview followed by one plain-text block per parameter of the fused
    summary (all parameters, not only the out-of-limit ones, and not JSON).

    Args:
        fused_summary: Sample metadata and per-parameter results as text.
        rag_context: Reference text retrieved for the parameters.
        parameter_correlation: JSON-serialisable relationship map.

    Returns:
        Whatever the module-level `llm` callable returns for the prompt; no
        parsing or filtering is applied here.
    """


    prompt = f"""
      You are an expert FCC laboratory test analyst.

      You are provided with:
      1. A FUSED SUMMARY containing sample metadata and parameter results
      2. RETRIEVED CONTEXT from reference documents
      3. A PARAMETER RELATIONSHIP MAP that defines:
         - the Test Group each parameter belongs to
         - the correlated parameters that MUST be considered together

      =====================
      PARAMETER RELATIONSHIP MAP (AUTHORITATIVE)
      Use this map to determine correlated parameters and test groups.
      You are FORBIDDEN from analyzing any parameter in isolation.
      {json.dumps(parameter_correlation, indent=2)}
      =====================

      Analyze every test parameter from the fused summary using
      ITS CORRELATED PARAMETERS as defined in the relationship map.

      ────────────────────────────────────────
      FIRST LINE OUTPUT (MANDATORY)
      ────────────────────────────────────────
      - Begin the output with a single-line Sample Overview header
      - Extract the following strictly from the FUSED SUMMARY:
      • Sample Number
      • Unit Name
      • Unit ID
      • Sample Type
      • Date Taken
      • Date Received
      • Lab Code

      Format EXACTLY like this (single line, no bullets):

      Sample <Sample_Number> | Unit: <Unit_Name> (Unit ID <Unit_Id>) | Type: <Sample_Type> | Taken: <Date_Taken> | Received: <Date_Received> | Lab: <Lab_Code>

      This line must appear ONCE, at the very top, before any parameter analysis.

      =====================
      FUSED SUMMARY
      {fused_summary}
      =====================

      =====================
      RETRIEVED CONTEXT
      {rag_context}
      =====================

      ────────────────────────────────────────
      PER-PARAMETER ANALYSIS REQUIREMENTS
      ────────────────────────────────────────

      For EACH parameter in the fused summary, output the following fields:

      1. Parameter: <parameter_name>
      2. Sample Number: <sample_number>
      3. Captured Value: <value from fused summary>
      4. Deviation: <deviation from fused summary>
      5. Is Within Limit: <true/false from fused summary>
      6. Last 5 Sample Values: <values from fused summary>
      7. Correlated Parameters:
         - List ALL correlated parameters exactly as defined in the relationship map
      8. Test_Group:
         - Output the Test Group name exactly as defined in the relationship map

      ────────────────────────────────────────
      MULTI-PARAMETER DIAGNOSTIC LOGIC (MANDATORY)
      ────────────────────────────────────────

      For the following sections, you MUST reason using:
      - the current parameter
      - AND its correlated parameters
      - AND any observable pattern or consistency between them

      You are NOT allowed to:
      - explain a parameter independently
      - ignore correlated parameters
      - restate values without interpretation

      9. issue_summary:

         - If Is Within Limit = true →
            Write a concise, technical diagnostic interpretation that explains:
            • what the parameter indicates IN CONTEXT of its correlated parameters,
            • what the combined within-limit condition implies for catalyst health,
            • whether the correlated parameters reinforce stability or mask early risk.

         - If Is Within Limit = false →
            Explain clearly:
            • what the deviation means in context of correlated parameters,
            • how the combined pattern impacts FCC performance,
            • why this deviation is diagnostically significant.

      10. possible_causes:

         - ONLY generate this section if:
               Is Within Limit = false.

         - When generated, you MUST:
            • derive causes using relationships between the parameter
               and its correlated parameters,
            • avoid single-parameter explanations,
            • provide concise root-cause reasoning grounded in FCC behavior.

      ────────────────────────────────────────
      STRICT OUTPUT FORMAT (NORMAL TEXT, NOT JSON)
      ────────────────────────────────────────

      Parameter: <parameter_name>,
      Sample Number: <sample_number>,
      Captured Value: <value>, Deviation: <value>,
      Is Within Limit: <true/false>,
      Last 5 Sample Values: <values>
      Correlated Parameters: <comma-separated list>
      Test_Group: <group name>
      issue_summary: "<text>"        <-- include ONLY when Is Within Limit = false
      Summary: "<text>"              <-- include ONLY when Is Within Limit = true
      possible_causes: "<text>"      <-- include ONLY when allowed

      ────────────────────────────────────────
      GLOBAL RULES (NON-NEGOTIABLE)
      ────────────────────────────────────────

      - Always list ALL parameters from the fused summary.
      - Parameter names must match the fused summary EXACTLY.
      - If Deviation is NaN:
            • DO NOT generate issue_summary
            • DO NOT generate possible_causes
      - Never invent limits, numerical ranges, or product capabilities.
      - Use RETRIEVED CONTEXT only to enrich explanations, not override data.
      - Maintain refinery-grade, concise, technical language.
      - Any violation of correlation rules invalidates the analysis.
      """

    # `llm` is the module-level LLM callable defined in an earlier cell.
    resp = llm(prompt)
    return resp
  
 

LANGGRAPH STATE FOR RAG1

In [33]:
from typing import TypedDict, Dict, Any

class RAG1State(TypedDict):
    """State dictionary passed between the nodes of the RAG1 LangGraph pipeline."""
    # Input: fused summary text of one sample.
    fused_summary: str
    # Set by ecat_retrieval_node: retrieved PDF chunks joined into one string.
    rag_context: str
    # Input: parameter relationship map, forwarded to the analysis prompt.
    parameter_correlation: Dict[str, Any] 
    # Set by issue_analysis_node: raw LLM analysis text.
    issue_analysis: str
    # Set by final_node: copy of issue_analysis exposed as the graph's result.
    final_output: str

NODE 1: Load JSON + Fused Summary

In [34]:
def load_fused_summary_node(state: RAG1State):
    """Entry node: pass the caller-supplied fused summary through as a state update."""
    summary = state["fused_summary"]
    return dict(fused_summary=summary)

NODE 2: ECAT Retrieval (ChromaDB)

In [35]:
def ecat_retrieval_node(state: RAG1State):
    """
    Retrieve reference chunks for every parameter named in the fused summary.

    Each parameter keyword is embedded and used to query the PDF collection for
    its 3 nearest chunks. The chunks from all keywords are de-duplicated and
    joined with blank lines.

    De-duplication keeps first-seen order. The previous `list(set(...))` ordered
    the chunks by string hash, which differs between kernel sessions, so the same
    sample could produce a differently ordered prompt on every run.

    Returns:
        State update with `rag_context` (an empty string when no keyword is found).
    """
    fused_summary = state["fused_summary"]
    _, keywords = extract_all_keywords(fused_summary)

    collected_docs = []

    # A keyword repeated in the summary would return the same chunks again.
    for kw in dict.fromkeys(keywords):
        emb = embedding_fn.embed_query(kw)    # one vector query per parameter keyword
        res = pdf_collection.query(           # top-3 chunks for this keyword
            query_embeddings=[emb],
            n_results=3
        )
        collected_docs.extend(res["documents"][0])

    unique_docs = list(dict.fromkeys(collected_docs))
    rag_context = "\n\n".join(unique_docs)

    return {
        "rag_context": rag_context
    }
 
 

NODE 3: Issue Analysis Node

In [36]:
def issue_analysis_node(state: RAG1State):
    """Run the RAG1 issue analysis on the current state and store the LLM reply."""
    inputs = {
        key: state[key]
        for key in ("fused_summary", "rag_context", "parameter_correlation")
    }
    return {"issue_analysis": generate_issue1_analysis(**inputs)}


NODE 4: Final Node

In [37]:
#This node copies the analyzed issue output into the final, canonical output field of the graph state.
def final_node(state: RAG1State):
    """Expose the issue analysis under the `final_output` key of the state."""
    analysis_text = state["issue_analysis"]
    return {"final_output": analysis_text}


In [None]:
# Node-by-node pipeline: load summary -> retrieve context -> analyze -> final.
# NOTE(review): this cell has no execution count in the saved notebook, yet later cells
# use `app` — re-run it after a kernel restart.
graph = StateGraph(RAG1State)
 
graph.add_node("load_fused_summary", load_fused_summary_node)
graph.add_node("retrieve_ecat", ecat_retrieval_node)
graph.add_node("analyze", issue_analysis_node)
graph.add_node("final", final_node)

graph.set_entry_point("load_fused_summary")
 
# Linear flow; each edge runs the next node after the previous one finishes.
graph.add_edge("load_fused_summary", "retrieve_ecat")
graph.add_edge("retrieve_ecat", "analyze")
graph.add_edge("analyze", "final")
 
# Compiled graph object; invoked with an initial state dict.
app = graph.compile()

In [53]:
def run_rag1_graph(app, fused_summary, parameter_correlation):
    """
    Run the compiled RAG1 graph once for a single sample.

    Args:
        app: Object with an `invoke(state)` method (the compiled graph).
        fused_summary: Fused summary text of the sample.
        parameter_correlation: Parameter relationship map.

    Returns:
        Whatever `app.invoke` returns for the initial state (the final graph state).
    """
    return app.invoke(
        dict(
            fused_summary=fused_summary,
            parameter_correlation=parameter_correlation,
        )
    )


In [57]:
# ===== SINGLE-CELL: RUN RAG1 GRAPH FOR ONE SAMPLE AND PRINT MARKDOWN =====
# NOTE(review): this repeats what run_rag1_graph(app, fused_summary, parameter_correlation)
# does; calling that helper would avoid the duplication.

import json

# 1. Load parameter correlation JSON
with open("Parameter_Correlation.json", "r") as f:
    PARAMETER_CORRELATION = json.load(f)

# 2. Select ONE sample (change index if needed)
sample = pipeline_results[0]   # e.g., 0, 1, 2, ...
single_summary = sample["fused_summary"]

# 3. Trigger the LangGraph (RAG1 only)
result = app.invoke({
    "fused_summary": single_summary,
    "parameter_correlation": PARAMETER_CORRELATION
})

# 4. Print the generated analysis text (the RAG2 helper cells read `result` as well)
print(result["final_output"])

# (Optional sanity check)
# print(result.keys())


Sample 41004748 | Unit: CITGO - CORPUS CHRISTI (NEW), TX, US (Unit ID 313) | Type: Ecat | Taken: 2024-10-09 | Received: 2024-10-14 | Lab: C

Parameter: Parameter 0–40 (µ),
Sample Number: 41004748,
Captured Value: 2.229, Deviation: -0.23899999999999988,
Is Within Limit: true,
Last 5 Sample Values: 2.468, 2.578
Correlated Parameters: 
Test_Group: 
Summary: "Within limits; no correlated parameters available."

Parameter: Parameter 0–60 (µ),
Sample Number: 41004748,
Captured Value: 19.79, Deviation: -1.0519999999999996,
Is Within Limit: true,
Last 5 Sample Values: 20.842, 21.327
Correlated Parameters: 
Test_Group: 
Summary: "Within limits; no correlated parameters available."

Parameter: Parameter 0–80 (µ),
Sample Number: 41004748,
Captured Value: 43.751, Deviation: -1.6099999999999994,
Is Within Limit: true,
Last 5 Sample Values: 45.361, 46.011
Correlated Parameters: 
Test_Group: 
Summary: "Within limits; no correlated parameters available."

Parameter: Gas Factor (nan),
Sample Number: 41

RAG2 -> HELPER FUNCTION

In [39]:
def split_issue_blocks(issue_text):
    """
    Split the RAG1 analysis text into one block per parameter.

    Blocks are separated by a blank line followed by "Parameter:". Every
    parameter block is returned with its "Parameter:" label restored.

    Text that precedes the first parameter block (the one-line sample overview
    the RAG1 prompt asks for) is returned unchanged as the first element. It
    used to be prefixed with "Parameter: " and therefore looked like a
    parameter block to later steps.

    Args:
        issue_text: Analysis text; None or blank text gives [].

    Returns:
        List of stripped, non-empty blocks in their original order.
    """
    if not issue_text or not issue_text.strip():
        return []

    # Tolerate blank lines that contain spaces or "\r" and runs of blank lines.
    blocks = re.split(r"\n\s*\nParameter:", issue_text.strip())
    clean = []

    for position, b in enumerate(blocks):
        b = b.strip()
        if not b:
            continue
        # Only pieces after a separator lost their label in the split.
        if position > 0 and not b.startswith("Parameter:"):
            b = "Parameter: " + b
        clean.append(b)

    return clean
 
# Split the RAG1 output into per-parameter blocks and print them for inspection.
# NOTE(review): the saved output of this cell is "NameError: name 'result' is not
# defined" — it ran before the single-sample cell that creates `result`. The name
# `chunks` is also used by the PDF chunking cell, so after that failure `chunks`
# still held PDF text — consider a distinct name such as `issue_chunks`.
chunks = split_issue_blocks(result["final_output"])
 
 
print("\n===== TOTAL CHUNKS:", len(chunks), "=====\n")
 
for i, c in enumerate(chunks):
    print(f"\n========= CHUNK {i} =========\n")
    print(c)
 
 

NameError: name 'result' is not defined

In [40]:
def extract_issue_summary(chunk: str):
    """
    Return the text following "issue_summary:" in a block, or None when absent.

    The label is matched case-insensitively and the value ends at the first
    line break after it; the value is returned stripped.
    """
    found = re.compile(r"issue_summary:\s*(.*?)(?:\n|$)", re.IGNORECASE).search(chunk)
    return found.group(1).strip() if found else None

# Print every block together with the issue_summary extracted from it.
# NOTE(review): the saved output below shows PDF guide text, i.e. `chunks` came from the
# PDF chunking cell rather than from split_issue_blocks — re-run in order.
for i, chunk in enumerate(chunks, 1):
    summary = extract_issue_summary(chunk)

    print(f"\n--- CHUNK {i} ---")
    print(chunk)

    print("\nEXTRACTED issue_summary:")
    if summary:
        print(summary)
    else:
        print("No issue_summary found")




--- CHUNK 1 ---
W. R. Grace & Co. Valero – Wilmington: October 25, 2017 Focused. Innovative. Performance Driven. Interpreting ECAT Analyses W. R. Grace & Co. • Important tool for stewarding FCC operation • Frequent sampling is recommended • Fines analysis can be as important as unit ECAT when trouble shooting Interpreting ECAT Analyses W. R. Grace & Co. Interpreting ECAT Analyses • MAT The Micro Activity Test is a measure of catalyst activity in wt % conversion. MAT activity is affected by the addition rate of fresh catalyst, unit turnover rate, metals contamination of equilibrium catalyst, the fresh catalyst activity and quality. The MAT test is performed on a fluidized ACE (Advanced Cracking Evaluation) machine.

EXTRACTED issue_summary:
No issue_summary found

--- CHUNK 2 ---
of equilibrium catalyst, the fresh catalyst activity and quality. The MAT test is performed on a fluidized ACE (Advanced Cracking Evaluation) machine. MAT, wt% The surface area measurement (m2/gm) is the total

In [41]:
from rank_bm25 import BM25Okapi
from sentence_transformers import CrossEncoder
# Load the stored chunk texts from the PDF collection in Chroma
stored_data = pdf_collection.get(include=["documents"])
all_docs = stored_data["documents"]

# BM25 setup: lower-cased, whitespace-tokenised corpus (queries are tokenised the same way)
tokenized_corpus = [doc.lower().split() for doc in all_docs]
bm25 = BM25Okapi(tokenized_corpus)

# Cross-Encoder reranker
reranker = CrossEncoder("BAAI/bge-reranker-base")

print("BM25 and Cross-Encoder ready")


BM25 and Cross-Encoder ready


In [42]:
def hybrid_retrieve(query, top_k=20, bm25_k=20, vector_k=20):
    """
    Retrieve chunk texts with a weighted mix of vector search and BM25.

    Every chunk returned by the vector search scores 0.6. The `bm25_k` best
    BM25 matches with a positive score add up to 0.4, scaled by their min-max
    normalised BM25 score. Chunks are returned by descending combined score.

    When all selected BM25 hits share one score (for example a single hit),
    min-max normalisation has no spread; those hits now receive the full BM25
    weight instead of 0, which used to cancel their keyword match entirely.

    Depends on the module-level `embedding_fn`, `pdf_collection`, `bm25`,
    `all_docs` and on numpy imported as `np`.

    Args:
        query: Query text.
        top_k: Maximum number of chunk texts to return.
        bm25_k: Number of top BM25 candidates considered.
        vector_k: Number of nearest neighbours requested from the collection.

    Returns:
        List of chunk texts, best first.
    """
    query_embedding = embedding_fn.embed_query(query)

    # Vector search
    vector_results = pdf_collection.query(
        query_embeddings=[query_embedding],
        n_results=vector_k
    )
    vector_docs = vector_results["documents"][0]

    # Membership only: the similarity ranking itself is not used here.
    vector_scores = {doc: 1.0 for doc in vector_docs}

    # BM25
    bm25_scores_raw = bm25.get_scores(query.lower().split())
    top_bm25_idx = np.argsort(bm25_scores_raw)[::-1][:bm25_k]

    bm25_scores = {}
    for idx in top_bm25_idx:
        if bm25_scores_raw[idx] > 0:
            bm25_scores[all_docs[idx]] = bm25_scores_raw[idx]

    if bm25_scores:
        max_bm, min_bm = max(bm25_scores.values()), min(bm25_scores.values())
        spread = max_bm - min_bm
        if spread > 0:
            bm25_scores = {
                k: (v - min_bm) / (spread + 1e-6)
                for k, v in bm25_scores.items()
            }
        else:
            # No spread to normalise over: every hit matched equally well.
            bm25_scores = {k: 1.0 for k in bm25_scores}

    combined = {}
    for t, s in vector_scores.items():
        combined[t] = combined.get(t, 0) + 0.6 * s
    for t, s in bm25_scores.items():
        combined[t] = combined.get(t, 0) + 0.4 * s

    ranked = sorted(combined.items(), key=lambda x: x[1], reverse=True)
    return [t for t, _ in ranked[:top_k]]


In [43]:
def rerank(query, docs, top_k=10):
    """
    Order `docs` by the cross-encoder score of each (query, doc) pair.

    Uses the module-level `reranker`. Returns at most `top_k` documents, highest
    score first; an empty input gives an empty list.
    """
    if not docs:
        return []

    scores = reranker.predict([[query, candidate] for candidate in docs])

    # Sort (score, doc) tuples in descending order.
    scored = list(zip(scores, docs))
    scored.sort(reverse=True)

    return [candidate for _score, candidate in scored][:top_k]


In [44]:
def rag_retrieve(query, top_k=10):
    """
    Two-stage retrieval: hybrid candidate search followed by cross-encoder reranking.

    Up to 50 candidates are requested from hybrid_retrieve; the `top_k` best
    after reranking are returned.
    """
    candidates = hybrid_retrieve(query, top_k=50)
    best = rerank(query, candidates, top_k)
    return best


In [45]:
def generate_issue2_analysis(parameter_context):
    """
    Build the RAG2 report prompt and return the LLM's reply.

    The prompt fixes the report structure (sample overview table, key
    observations, parameter-by-parameter table, consolidated issue summary,
    corrective actions, final remarks) and appends `parameter_context` as the
    only diagnostic input.

    Args:
        parameter_context: Diagnostic context text inserted at the end of the
            prompt. NOTE(review): presumably the RAG1 per-parameter analysis
            plus retrieved context — confirm against the caller.

    Returns:
        Whatever the module-level `llm` callable returns for the prompt.
    """
 
    prompt = f"""
You are an expert FCC catalyst laboratory analyst with advanced expertise in:
ECAT diagnostics, and FCC catalyst product-selection engineering.
 
You MUST strictly follow the structure provided.
Do NOT add, delete, or rename any section.
 
The core objective of this analysis is to identify issues and recommend strategies
that improve ECAT performance while **minimizing fresh catalyst addition rate**
in the FCC unit as well as  to provide a **technical diagnostic assessment**
of the ECAT sample based ONLY on parameter-wise analysis.
 
 
# FCC Catalyst Technical Report- Sample:<Sample Number>
# Sample Overview
 
Generate a structured **Sample Overview table** EXACTLY in the format below.
All values MUST be extracted from the diagnostic context.
 
| Item | Detail |
|-----|--------|
| Unit ID | <Unit ID> |
| Sample ID | <Sample Number> |
| Sampling Date | <Sampling / Date Taken> |
| Received Date | <Date Received> |
| Unit Name | <Unit Name> |
| Lab Code | <Lab Code> |
| Anomalous Parameters | <Comma-separated list of out-of-limit parameters> |
 
### Key Observations
- Provide **2–4 each observation as a separate bullet point on its own line** summarizing:
  • major deviations,
  • abnormal ratios,
  • contamination indicators,
  • data-quality anomalies (e.g., missing ratios).
- Observations MUST be technically precise and evidence-based.
 
# Parameter-by-Parameter Interpretation
Insert THIS TABLE EXACTLY:
Parameter | Status | Diagnostic Summary
--------- | ------ | ------------------
MANDATORY RULES:
- Populate **ONE row for EACH parameter** present in the diagnostic context.
- **Status column MUST be exactly one of the following**:
  - `Within Limit`
  - `Exceeds Limit`
- Status must be derived strictly from the parameter limit condition.
 
Diagnostic Summary Requirements:
- Write a **short, technical 2-4 line diagnostic statement** for EACH parameter.
- For within-limit parameters, summarize normal operational meaning.
- For out-of-limit parameters, summarize deviation impact and risk.
- Do NOT include labels like “Summary” or “issue_summary” inside the table.
- Maintain refinery-grade, concise, engineering language.
 
# Consolidated Issue Summary
Insert THIS TABLE EXACTLY:
Issues | Evidence | Impact
------ | -------- | -------
- Include ONLY out-of-limit parameters.
- Evidence must come from parameter_context trends, deviations, or issue blocks.
- Impact must reflect operational and catalyst health consequences.
 
# Corrective Actions & Optimization Strategies
Insert  THIS TABLE EXACTLY:
| Issue | Corrective Action
|-------|------------------
- Corrective actions in bullet points and **operational or monitoring-oriented**.
 
# Final Remarks
Write 3–4 Bullet Points summarizing:
- root-cause understanding,
- severity of observed deviations,
- Corrective action: key evidence supporting conclusions,
- expected operational trajectory if uncorrected.
 
RULES (MANDATORY):
- Use ONLY the diagnostic context provided below.
- Do NOT invent limits, products, or corrective chemicals.
- Maintain refinery-grade, precise, engineering language.
- All tables must be complete and aligned exactly as specified.
 
FULL RAG CONTEXT ( ALL Paramerter Analysis ):
{parameter_context}
 
Generate the final product-centric ECAT technical report now.
"""
 
    # `llm` is the module-level LLM callable defined in an earlier cell.
    resp = llm(prompt)
    return resp
 
 

In [46]:
import re
import markdown
from bs4 import BeautifulSoup
from reportlab.platypus import (
    SimpleDocTemplate,
    Paragraph,
    Table,
    TableStyle,
    Spacer,
    ListFlowable,
    ListItem,
    PageBreak,
)
from reportlab.lib.pagesizes import A4
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.lib import colors


# Unicode subscript digits and the ASCII digit each one stands for.
SUBSCRIPT_MAP = dict(zip("₀₁₂₃₄₅₆₇₈₉", "0123456789"))


def convert_subscripts(text: str) -> str:
    """
    Rewrite Unicode subscript digits inside formula-like tokens as
    ``<sub>N</sub>`` markup.

    A token qualifies when 1-3 ASCII letters are immediately followed by one
    or more subscript digits, optionally repeated with 0-2 letters in between
    (e.g. ``Al₂O₃``). Subscript digits that are not preceded by a letter are
    left untouched, as is all other text.
    """
    formula_re = r"[A-Za-z]{1,3}[₀₁₂₃₄₅₆₇₈₉]+(?:[A-Za-z]{0,2}[₀₁₂₃₄₅₆₇₈₉]+)*"

    def tag_digits(found):
        # Each subscript digit gets its own <sub> element; letters pass through.
        return "".join(
            f"<sub>{SUBSCRIPT_MAP[ch]}</sub>" if ch in SUBSCRIPT_MAP else ch
            for ch in found.group(0)
        )

    return re.sub(formula_re, tag_digits, text)


def format_table_bullets(text: str) -> str:
    """
    Convert markdown-style '- ' bullets inside table cells,
    but ONLY if the cell truly contains a bullet list.
    """
    # Count bullet markers
    bullet_count = text.count("- ")

    if bullet_count <= 1:
        return text

    parts = []
    tokens = text.split("- ")

    for i, tok in enumerate(tokens):
        tok = tok.strip()
        if not tok:
            continue
        if i == 0:
            parts.append(tok)
        else:
            parts.append(f"• {tok}")

    return "<br/>".join(parts)

def md_to_pdf(markdown_text: str, output_file: str = "report.pdf"):
    """
    Render a Markdown report to an A4 PDF using ReportLab.

    The Markdown is converted to HTML (``tables`` extension enabled), parsed
    with BeautifulSoup, and every top-level element is mapped to a flowable:

    * ``h1`` / ``h2`` / ``h3`` -> heading paragraphs (an ``h2`` whose title is
      listed in ``PAGE_BREAK_SECTIONS`` starts on a new page),
    * ``p``                    -> justified body paragraph,
    * ``ul`` / ``ol``          -> bulleted / numbered list,
    * ``table``                -> grid table with a repeated header row.

    Any other top-level element is ignored, as is any element whose text
    starts with "[fallback" (case-insensitive).

    Args:
        markdown_text: Report body in Markdown.
        output_file: Destination path of the generated PDF.

    Side effects:
        Writes ``output_file`` and prints ``PDF saved: <output_file>``.
    """
    # Function-scope import so this cell does not depend on edits to the
    # notebook's import cell.
    from xml.sax.saxutils import escape

    PAGE_MARGIN = 36
    usable_width = A4[0] - 2 * PAGE_MARGIN

    def plain_text(node) -> str:
        # get_text(strip=True) strips every text fragment and concatenates
        # them with no separator, which glues words around inline tags
        # ("a <strong>b</strong> c" -> "abc"). Take the raw text instead and
        # collapse whitespace runs.
        return " ".join(node.get_text().split())

    def to_markup(text: str) -> str:
        # Paragraph parses its input as XML-like markup, so literal &, < and >
        # must be escaped *before* convert_subscripts adds real <sub> tags.
        return convert_subscripts(escape(text))

    html = markdown.markdown(markdown_text, extensions=["tables"])
    soup = BeautifulSoup(html, "html.parser")

    doc = SimpleDocTemplate(
        output_file,
        pagesize=A4,
        leftMargin=PAGE_MARGIN,
        rightMargin=PAGE_MARGIN,
        topMargin=PAGE_MARGIN,
        bottomMargin=PAGE_MARGIN,
    )

    styles = getSampleStyleSheet()

    # Heading styles.
    styles.add(ParagraphStyle(
        name="H1",
        fontSize=20,
        leading=24,
        fontName="Helvetica-Bold",
        spaceAfter=16
    ))
    styles.add(ParagraphStyle(
        name="H2",
        fontSize=16,
        leading=20,
        fontName="Helvetica-Bold",
        spaceAfter=14,
        keepWithNext=True
    ))
    styles.add(ParagraphStyle(
        name="H3",
        fontSize=13.5,
        leading=17,
        fontName="Helvetica-Bold",
        spaceAfter=12,
        keepWithNext=True
    ))

    # Running-text styles. alignment=4 is ReportLab's TA_JUSTIFY.
    styles.add(ParagraphStyle(
        name="Body",
        fontSize=11,
        leading=15,
        alignment=4,
        spaceAfter=6,
        wordWrap="CJK"
    ))
    styles.add(ParagraphStyle(
        name="CustomBullet",
        fontSize=11,
        leading=15,
        leftIndent=12,
        wordWrap="CJK"
    ))

    # Table styles. alignment=1 is ReportLab's TA_CENTER.
    styles.add(ParagraphStyle(
        name="TableCell",
        fontSize=9.5,
        leading=13,
        alignment=4,
        wordWrap="CJK"
    ))
    styles.add(ParagraphStyle(
        name="TableHeader",
        fontSize=9.5,
        leading=13,
        fontName="Helvetica-Bold",
        alignment=1
    ))

    elements = []

    # h2 titles that must start on a fresh page.
    PAGE_BREAK_SECTIONS = {
        "Parameter-by-Parameter Interpretation",
        "Consolidated Issue Summary",
        "Corrective Actions & Optimization Strategies",
        "Final Remarks",
    }

    for tag in soup.children:
        # Bare strings between elements have no tag name; skip them.
        if not getattr(tag, "name", None):
            continue

        raw_text = plain_text(tag)

        if raw_text.lower().startswith("[fallback"):
            continue

        text = to_markup(raw_text)

        if tag.name == "h1":
            elements.append(Paragraph(text, styles["H1"]))

        elif tag.name == "h2":
            # Compare the unescaped title: the set contains a literal "&".
            if raw_text in PAGE_BREAK_SECTIONS:
                elements.append(PageBreak())
            elements.append(Spacer(1, 10))
            elements.append(Paragraph(text, styles["H2"]))

        elif tag.name == "h3":
            elements.append(Paragraph(text, styles["H3"]))

        elif tag.name == "p":
            elements.append(Paragraph(text, styles["Body"]))

        elif tag.name in ("ul", "ol"):
            items = [
                ListItem(Paragraph(to_markup(plain_text(li)), styles["CustomBullet"]))
                for li in tag.find_all("li", recursive=False)
            ]

            elements.append(
                ListFlowable(
                    items,
                    # Numbered lists were previously dropped entirely.
                    bulletType="bullet" if tag.name == "ul" else "1",
                    leftIndent=18,
                    spaceAfter=8,
                    keepTogether=True
                )
            )

        elif tag.name == "table":
            rows = tag.find_all("tr")
            if not rows:
                continue

            col_count = len(rows[0].find_all(["th", "td"]))
            if col_count == 0:
                continue

            if col_count == 2:
                col_widths = [120, usable_width - 120]
            elif col_count == 3:
                col_widths = [90, 80, usable_width - 170]
            else:
                # colWidths needs one entry per column; the former bare "*"
                # string has length 1 and cannot describe a 4+ column table.
                # Share the printable width equally instead.
                col_widths = [usable_width / col_count] * col_count

            table_data = []
            for r_idx, row in enumerate(rows):
                cell_style = styles["TableHeader"] if r_idx == 0 else styles["TableCell"]
                row_cells = []
                for cell in row.find_all(["th", "td"]):
                    # Escape first: the next two steps inject real markup
                    # (<br/>, <sub>) that must survive un-escaped.
                    cell_text = escape(plain_text(cell))
                    cell_text = format_table_bullets(cell_text)
                    cell_text = convert_subscripts(cell_text)
                    row_cells.append(Paragraph(cell_text, cell_style))

                table_data.append(row_cells)

            table = Table(
                table_data,
                colWidths=col_widths,
                repeatRows=1,
                splitByRow=1
            )

            # NOTE(review): KEEPWITHNEXT is kept from the original; confirm it
            # is a command TableStyle actually honours.
            table.setStyle(TableStyle([
                ("GRID", (0, 0), (-1, -1), 0.5, colors.grey),
                ("BACKGROUND", (0, 0), (-1, 0), colors.lightgrey),
                ("VALIGN", (0, 0), (-1, -1), "TOP"),
                ("LEFTPADDING", (0, 0), (-1, -1), 6),
                ("RIGHTPADDING", (0, 0), (-1, -1), 6),
                ("TOPPADDING", (0, 0), (-1, -1), 6),
                ("BOTTOMPADDING", (0, 0), (-1, -1), 6),
                ("KEEPWITHNEXT", (0, 0), (-1, 1), True),
            ]))

            elements.append(Spacer(1, 8))
            elements.append(table)
            elements.append(Spacer(1, 18))

    doc.build(elements)
    print("PDF saved:", output_file)


LANGGRAPH STATE FOR RAG2

In [47]:
from typing import TypedDict, List, Dict
 
class RAG2State(TypedDict):
    """State shared by the RAG-2 graph nodes (extract -> report -> PDF)."""
    issue_analysis_text: str                 # output from RAG-1
    # Stripped text blocks that start with "Parameter:". They are strings,
    # not dicts: the report node joins them with "\n\n".
    issue_blocks: List[str]
    final_report: str                        # final LLM report
    pdf_path: str                            # path of the PDF written for the report

NODE 5: Issue Extraction

In [48]:
def issue_extraction_node(state: RAG2State):
    """
    Split the RAG-1 analysis text into blocks and keep the per-parameter ones.

    Reads ``state["issue_analysis_text"]``, splits it with
    ``split_issue_blocks`` and returns a partial state update
    ``{"issue_blocks": [...]}`` containing, in order, the whitespace-stripped
    blocks that start with "Parameter:". Every other block is discarded.
    """
    # The text was previously split twice, with the first result and an empty
    # list left unused; one pass is enough.
    blocks = split_issue_blocks(state["issue_analysis_text"])

    parameter_blocks = []
    for block in blocks:
        stripped = block.strip()
        # Keep only valid parameter blocks
        if stripped.startswith("Parameter:"):
            parameter_blocks.append(stripped)

    return {"issue_blocks": parameter_blocks}

NODE 6: Final Report Generator


In [49]:
def final_report_node(state: RAG2State):
    """
    Assemble the diagnostic context from the per-parameter blocks and ask
    ``generate_issue2_analysis`` for the final report.

    Returns a partial state update ``{"final_report": <report>}``.
    """
    joined_blocks = "\n\n".join(state["issue_blocks"])

    # Same text the original triple-quoted template produced, spelled out
    # line by line so the whitespace-only separator line is explicit.
    context_header = (
        "\n"
        "PARAMETER-WISE DIAGNOSTIC BLOCKS\n"
        "(Each block contains either Summary or issue_summary)\n"
        " \n"
    )
    final_context = context_header + joined_blocks + "\n"

    return {"final_report": generate_issue2_analysis(final_context)}
 
 

NODE 8: PDF GENERATION

In [50]:
import os
import re

def pdf_generation_node(state: RAG2State):
    """
    Write the final report to ``REPORT_PDF/ECAT_Report_<sample>.pdf``.

    The sample number is the first run of digits following ``Sample ID |`` in
    the report text; when no such pattern is found the literal "unknown" is
    used in the file name. The output directory is created if missing.

    Returns a partial state update ``{"pdf_path": <written file>}``.
    """
    report_md = state["final_report"]

    # Sample number as it appears in the report's "Sample ID | <digits>" row.
    id_match = re.search(r"Sample ID\s*\|\s*(\d+)", report_md)
    if id_match:
        sample_no = id_match.group(1)
    else:
        sample_no = "unknown"

    output_dir = r"REPORT_PDF"
    os.makedirs(output_dir, exist_ok=True)

    output_file = os.path.join(output_dir, f"ECAT_Report_{sample_no}.pdf")
    md_to_pdf(report_md, output_file)

    return {"pdf_path": output_file}


In [51]:
# Build the RAG-2 graph over RAG2State. The flow is strictly linear:
#   extract_issues -> generate_report -> save_pdf
rag2_graph = StateGraph(RAG2State)

# Register one node per stage.
rag2_graph.add_node("extract_issues", issue_extraction_node)
rag2_graph.add_node("generate_report", final_report_node)
rag2_graph.add_node("save_pdf", pdf_generation_node)

# Execution starts at the issue-extraction stage.
rag2_graph.set_entry_point("extract_issues")

# Chain the stages; no explicit edge leaves "save_pdf".
rag2_graph.add_edge("extract_issues", "generate_report")
rag2_graph.add_edge("generate_report", "save_pdf")

# Compiled graph; invoked once per sample by the pipeline driver.
rag2_app = rag2_graph.compile()


In [52]:
import json

# Parameter correlation data passed to RAG-1 on every invocation.
# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default text encoding.
with open("Parameter_Correlation.json", "r", encoding="utf-8") as f:
    PARAMETER_CORRELATION = json.load(f)

def run_full_pipeline_and_generate_pdfs(
    pipeline_results,
    rag1_app,
    rag2_app,
    parameter_correlation=None,
):
    """
    Runs RAG1 followed by RAG2 for each fused summary
    and generates PDFs end-to-end.

    Args:
        pipeline_results: Sized iterable of mappings, each providing a
            "fused_summary" entry.
        rag1_app: Object whose ``invoke(dict)`` returns a mapping containing
            "final_output" on success.
        rag2_app: Object whose ``invoke(dict)`` returns a mapping containing
            "pdf_path" on success.
        parameter_correlation: Correlation data forwarded to RAG1. Defaults
            to the module-level ``PARAMETER_CORRELATION`` when omitted.

    Returns:
        A list with one ``{"rag1_output": ..., "rag2_output": ...}`` dict per
        sample for which both stages produced their expected key. Samples
        where either stage returned nothing usable are reported and skipped.
    """
    if parameter_correlation is None:
        parameter_correlation = PARAMETER_CORRELATION

    all_outputs = []
    total = len(pipeline_results)

    # Count from 1 so progress reads "1 / N" .. "N / N" rather than
    # "0 / N" .. "N-1 / N".
    for idx, sample in enumerate(pipeline_results, start=1):
        print(f"\n Processing Sample {idx} / {total}")

        single_summary = sample["fused_summary"]

        rag1_result = rag1_app.invoke({
            "fused_summary": single_summary,
            "parameter_correlation": parameter_correlation
        })

        if not rag1_result or "final_output" not in rag1_result:
            print("RAG1 failed — skipping this sample.")
            continue

        rag2_result = rag2_app.invoke({
            "issue_analysis_text": rag1_result["final_output"]
        })

        if not rag2_result or "pdf_path" not in rag2_result:
            print("RAG2 failed — skipping PDF generation.")
            continue

        print("PDF generated at:", rag2_result["pdf_path"])

        all_outputs.append({
            "rag1_output": rag1_result,
            "rag2_output": rag2_result
        })

    print(f"\nCompleted PDF generation for {len(all_outputs)} samples.")
    return all_outputs

# Run the end-to-end batch over every entry of `pipeline_results`
# (the value returned by `main()` earlier in the notebook).
final_outputs = run_full_pipeline_and_generate_pdfs(
    pipeline_results=pipeline_results,
    rag1_app=app,        # NOTE(review): presumably the compiled RAG-1 graph from an earlier cell — confirm
    rag2_app=rag2_app    # compiled RAG-2 graph
)



 Processing Sample 0 / 170
PDF saved: REPORT_PDF\ECAT_Report_41004748.pdf
PDF generated at: REPORT_PDF\ECAT_Report_41004748.pdf

 Processing Sample 1 / 170
PDF saved: REPORT_PDF\ECAT_Report_41003698.pdf
PDF generated at: REPORT_PDF\ECAT_Report_41003698.pdf

 Processing Sample 2 / 170


KeyboardInterrupt: 