In [None]:
pip install pymupdf langchain faiss-cpu openai tiktoken langchain-community


Collecting pymupdf
  Downloading pymupdf-1.26.3-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Collecting langchain-community
  Downloading langchain_community-0.3.27-py3-none-any.whl.metadata (2.9 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.10.1-py3-none-any.whl.metadata (3.4 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.1-py3-none-any.whl.metadata (9.4 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.26.1-py3-none-any.whl.metadata (7.3 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading typi

In [None]:
import os
from getpass import getpass
from io import BytesIO

import fitz  # PyMuPDF
import numpy as np
import torch
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.embeddings import OpenAIEmbeddings as OE
from langchain.vectorstores import FAISS
from PIL import Image
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
from transformers import BlipProcessor, BlipForConditionalGeneration

# Load BLIP for image captioning (done once at module level; .eval() switches the
# model to evaluation mode)
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").eval()

# Run captioning on the GPU when one is available
device = "cuda" if torch.cuda.is_available() else "cpu"
blip_model.to(device)

# API key
# SECURITY: never hard-code a secret in a notebook. The key must come from the
# environment; if it is missing, prompt for it without echoing it to the output.
# NOTE(review): a real-looking key used to be committed on this line -- treat it as
# compromised and revoke/rotate it.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("Enter your OpenAI API key: ")


# Embedding models
text_emb = OE(model="text-embedding-3-small")
# NOTE(review): image_emb is not referenced by any code visible in this notebook
# (images are indexed through their BLIP captions). Kept so existing references keep
# working -- confirm whether it can be dropped.
image_emb = SentenceTransformer("clip-ViT-B-32")

def describe_image(image_path):
    """Generate a caption for an image file with the module-level BLIP model.

    Args:
        image_path: Path of the image file to open with PIL.

    Returns:
        The decoded caption string, with special tokens stripped.
    """
    # Open inside a context manager so the file handle is released deterministically;
    # convert() returns a new in-memory image that outlives the `with` block.
    with Image.open(image_path) as img:
        raw_image = img.convert("RGB")

    inputs = processor(raw_image, return_tensors="pt").to(device)
    out = blip_model.generate(**inputs)
    # generate() returns a batch; only one image was passed, so decode the first entry
    return processor.decode(out[0], skip_special_tokens=True)

def extract_pdf_pages(pdf_path, image_dir="imgs", verbose=False):
    """Extract the text and the embedded images of every page of a PDF.

    Each embedded image is written to ``image_dir`` as ``pg<page>_img<xref>.png``.
    Extraction is best-effort: an image that cannot be converted is skipped, and a
    one-line summary of how many were skipped is printed at the end.

    Args:
        pdf_path: Path of the PDF, opened with PyMuPDF (``fitz.open``).
        image_dir: Directory that receives the PNG files; created if missing.
        verbose: When True, also print the reason for every skipped image.

    Returns:
        A list with one dict per page, in document order:
        ``{"page": <1-based page number>, "text": <page text>, "images": [<png paths>]}``.
    """
    os.makedirs(image_dir, exist_ok=True)
    pages = []
    skipped = 0

    # The context manager closes the document even if extraction raises.
    with fitz.open(pdf_path) as doc:
        for i, page in enumerate(doc, 1):
            txt = page.get_text()
            imgs = []

            for img in page.get_images(full=True):
                xref = img[0]  # first field of the image entry is its xref number
                try:
                    pix = fitz.Pixmap(doc, xref)
                    if pix.colorspace is None:
                        # Image/stencil masks have no colorspace; the old code crashed
                        # here with "'NoneType' object has no attribute 'n'".
                        raise ValueError("image mask without a colorspace")
                    if pix.colorspace.n not in (1, 3):  # Not grayscale or RGB (e.g. CMYK)
                        pix = fitz.Pixmap(fitz.csRGB, pix)
                    path = f"{image_dir}/pg{i}_img{xref}.png"
                    # Write the PNG directly instead of round-tripping through PIL.
                    pix.save(path)
                    imgs.append(path)
                except Exception as e:
                    # Deliberately broad: one bad image must not abort a long extraction.
                    skipped += 1
                    if verbose:
                        print(f"Skipping image {xref} on page {i}: {e}")

            pages.append({"page": i, "text": txt, "images": imgs})

    if skipped:
        print(f"Skipped {skipped} image(s) that could not be converted to PNG "
              f"(pass verbose=True for details).")
    return pages

def build_embeddings(pages, text_batch_size=50):
    """Turn extracted pages into embedding vectors plus aligned texts and metadata.

    Every page with non-blank text contributes one entry (the whole page text). Every
    image listed for a page contributes one entry whose text is the caption returned
    by ``describe_image``. All entries are embedded with ``text_emb`` in batches.

    Args:
        pages: List of dicts with the keys ``"page"``, ``"text"`` and ``"images"``
            (the structure returned by ``extract_pdf_pages``).
        text_batch_size: Number of texts sent to ``text_emb.embed_documents`` per call.

    Returns:
        A tuple ``(text_vecs, texts, metadatas)`` where ``text_vecs`` is a NumPy array
        built from the embedding vectors and the three sequences are index-aligned.

    Raises:
        ValueError: If ``text_batch_size`` is smaller than 1.
    """
    # A negative step would make range() empty and silently return zero vectors for a
    # non-empty list of texts, so reject bad values up front.
    if text_batch_size < 1:
        raise ValueError(f"text_batch_size must be >= 1, got {text_batch_size}")

    texts, metadatas = [], []

    print("Processing pages...")
    # Captioning is the slow part of this function, so show progress for it.
    for p in tqdm(pages, desc="Captioning page images"):
        if p["text"].strip():
            texts.append(p["text"])
            metadatas.append({"type": "text", "page": p["page"]})

        for im in p["images"]:
            caption = describe_image(im)
            texts.append(caption)
            metadatas.append({
                "type": "image_caption",
                "page": p["page"],
                "path": im,
                "caption": caption
            })

    text_vecs = []
    for i in tqdm(range(0, len(texts), text_batch_size), desc="Embedding text and captions"):
        batch = texts[i : i + text_batch_size]
        text_vecs.extend(text_emb.embed_documents(batch))
    text_vecs = np.array(text_vecs)

    return text_vecs, texts, metadatas

def create_vectorstore(pages, index_path="mm_faiss"):
    """Embed the extracted pages, build a FAISS vector store and save it to disk.

    Args:
        pages: Page dicts as accepted by ``build_embeddings``.
        index_path: Directory passed to ``save_local`` for the saved index.

    Returns:
        The newly created FAISS vector store.

    Raises:
        ValueError: If the pages yield no text and no image captions, because an
            index cannot be built from zero vectors.
    """
    vectors, docs, metas = build_embeddings(pages)
    if not docs:
        raise ValueError("Nothing to index: no page text or image captions were extracted.")

    # from_embeddings expects (text, vector) pairs so nothing is embedded twice
    text_embedding_pairs = list(zip(docs, vectors))

    vs = FAISS.from_embeddings(
        text_embedding_pairs,
        embedding=text_emb,
        metadatas=metas
    )
    vs.save_local(index_path)
    return vs

def load_or_create(pdf_path, index_path="mm_faiss"):
    """Load the saved FAISS index if it exists, otherwise build it from the PDF.

    NOTE: the cache is keyed only by ``index_path``, not by ``pdf_path``. If an index
    already exists there it is returned as-is, even when ``pdf_path`` points to a
    different document. Use a distinct ``index_path`` per PDF, or delete the directory
    to force a rebuild.

    Args:
        pdf_path: PDF to extract and index when no saved index is found.
        index_path: Directory holding (or receiving) the saved index.

    Returns:
        The loaded or newly created FAISS vector store.
    """
    if os.path.exists(os.path.join(index_path, "index.faiss")):
        # SECURITY: allow_dangerous_deserialization=True permits unpickling of the
        # stored data. Only load index directories that this notebook created itself.
        vs = FAISS.load_local(index_path, text_emb, allow_dangerous_deserialization=True)
    else:
        pages = extract_pdf_pages(pdf_path)
        vs = create_vectorstore(pages, index_path=index_path)
    return vs

def hybrid_rag_chain(vs, k=5, model_name="gpt-4.1-mini", temperature=0.2):
    """Build a RetrievalQA chain on top of a vector store.

    Args:
        vs: Vector store exposing ``as_retriever`` (here: the FAISS store).
        k: Number of documents the retriever is asked to return per query.
        model_name: Chat model name passed to ``ChatOpenAI``.
        temperature: Sampling temperature passed to ``ChatOpenAI``.

    Returns:
        A ``RetrievalQA`` chain configured to also return its source documents.
    """
    retriever = vs.as_retriever(search_kwargs={"k": k})
    llm = ChatOpenAI(model_name=model_name, temperature=temperature)
    return RetrievalQA.from_chain_type(llm=llm, retriever=retriever, return_source_documents=True)

def query_rag(rag, user_q):
    """Run one question through the QA chain.

    Args:
        rag: The QA chain. Its ``invoke`` method is used when present; otherwise the
            object itself is called, so plain callables keep working.
        user_q: The question text, sent under the ``"query"`` key.

    Returns:
        A tuple ``(answer, source_documents)`` taken from the ``"result"`` and
        ``"source_documents"`` keys of the chain output.
    """
    # Calling the chain object directly is deprecated in LangChain; prefer .invoke().
    run = getattr(rag, "invoke", rag)
    result = run({"query": user_q})
    return result["result"], result["source_documents"]

# MAIN
if __name__ == "__main__":
    # The PDF location can be overridden with the RAG_PDF_PATH environment variable so
    # the notebook does not have to be edited per machine; the default is unchanged.
    path = os.environ.get(
        "RAG_PDF_PATH",
        "/content/drive/MyDrive/AI Summary Trial Run/3-8-24 combined missing 2  or more maybe-sm.pdf",
    )
    query = "What are the key takeaways from the document?"

    # `vs` and `rag` stay module-level so later cells can reuse them.
    vs = load_or_create(path)
    rag = hybrid_rag_chain(vs, k=5)

    answer, docs = query_rag(rag, query)

    print("\n🧠 Answer:\n", answer)
    print("\n📚 Retrieved Sources:")
    for d in docs:
        # Preview at most the first 100 characters of each retrieved chunk
        print(d.metadata, d.page_content[:100], "..." if len(d.page_content) > 100 else "")


❌ Skipping image 3458 on page 505: code=4: pixmap must be grayscale or rgb to write as png
❌ Skipping image 1364 on page 623: code=4: pixmap must be grayscale or rgb to write as png
❌ Skipping image 1518 on page 686: 'NoneType' object has no attribute 'n'
❌ Skipping image 1519 on page 686: 'NoneType' object has no attribute 'n'
❌ Skipping image 3458 on page 705: code=4: pixmap must be grayscale or rgb to write as png
❌ Skipping image 1659 on page 745: 'NoneType' object has no attribute 'n'
❌ Skipping image 1660 on page 745: 'NoneType' object has no attribute 'n'
❌ Skipping image 1665 on page 746: 'NoneType' object has no attribute 'n'
❌ Skipping image 1666 on page 746: 'NoneType' object has no attribute 'n'
❌ Skipping image 1667 on page 746: 'NoneType' object has no attribute 'n'
❌ Skipping image 1672 on page 747: 'NoneType' object has no attribute 'n'
❌ Skipping image 1673 on page 747: 'NoneType' object has no attribute 'n'
❌ Skipping image 1674 on page 747: 'NoneType' object has no a

Embedding text and captions: 100%|██████████| 30/30 [00:42<00:00,  1.43s/it]
  llm = ChatOpenAI(model_name="gpt-4.1-mini", temperature=0.2)
  result = rag({"query": user_q})



🧠 Answer:
 I’m happy to help! Could you please specify which document you are referring to?

📚 Retrieved Sources:
{'type': 'image_caption', 'page': 105, 'path': 'imgs/pg105_img3368.png', 'caption': 'a document with the text and description of the document'} a document with the text and description of the document 
{'type': 'image_caption', 'page': 173, 'path': 'imgs/pg173_img3349.png', 'caption': 'a document with the text and description of the document'} a document with the text and description of the document 
{'type': 'image_caption', 'page': 175, 'path': 'imgs/pg175_img3351.png', 'caption': 'a document with the text and description of the document'} a document with the text and description of the document 
{'type': 'image_caption', 'page': 192, 'path': 'imgs/pg192_img3368.png', 'caption': 'a document with the text and description of the document'} a document with the text and description of the document 
{'type': 'image_caption', 'page': 86, 'path': 'imgs/pg86_img3349.png', 'capti

In [None]:
# Follow-up question. Reuses the `rag` chain that the setup cell already built, instead
# of reloading the FAISS index from disk and rebuilding the chain for every query.
if __name__ == "__main__":
    query = "What enforcement actions can the Control Authority take if an industrial user continues to violate wastewater discharge requirements?"

    answer, docs = query_rag(rag, query)

    print("\n🧠 Answer:\n", answer)
    print("\n📚 Retrieved Sources:")
    for d in docs:
        # Preview at most the first 100 characters of each retrieved chunk
        print(d.metadata, d.page_content[:100], "..." if len(d.page_content) > 100 else "")


🧠 Answer:
 If an industrial user continues to violate wastewater discharge requirements, the Control Authority can take several enforcement actions based on the Enforcement Response Plan (ERP) it has developed. These actions include:

(A) Notification of Violation: Serving a notice of the violation to the user, requiring a plan for correction and prevention.

(B) Consent Orders: Entering into agreements with the user to correct the noncompliance within a specified time.

(C) Show Cause Hearing: Ordering the user to show cause why enforcement action should not be taken, with notice given at least 10 days prior.

(D) Compliance Orders: Issuing orders that may direct disconnection of sewer service after a specified time unless adequate treatment is installed and operated; may also require pretreatment technology, additional monitoring, and management practices.

(E) Cease and Desist Orders: Ordering the user to immediately comply and take remedial or preventative actions, including halti

In [None]:
# Follow-up question. Reuses the `rag` chain that the setup cell already built, instead
# of reloading the FAISS index from disk and rebuilding the chain for every query.
if __name__ == "__main__":
    query = "What type of encroachment was applied for by McCurdy Development, LLC in the permit submitted on 11-22-17?"

    answer, docs = query_rag(rag, query)

    print("\n🧠 Answer:\n", answer)
    print("\n📚 Retrieved Sources:")
    for d in docs:
        # Preview at most the first 100 characters of each retrieved chunk
        print(d.metadata, d.page_content[:100], "..." if len(d.page_content) > 100 else "")


🧠 Answer:
 The encroachment permit application submitted by McCurdy Development, LLC on 11-22-17 was for encroachments in the public right-of-way or drainage easements, specifically at the location South Bound Riverside Drive with coordinates approximately 37.968154 -87.574564. The application indicates work involving crossing perpendicular to the street centerline and encroaching on the right-of-way line.

📚 Retrieved Sources:
{'type': 'text', 'page': 521} City of Evansville
APPLICATION FOR ENCROACHMENT PERMIT
(Permit for the purpose of Encroachments in P ...
{'type': 'text', 'page': 720} City of Evansville
APPLICATION FOR ENCROACHMENT PERMIT
(Permit for the purpose of Encroachments in P ...
{'type': 'text', 'page': 535}  
 
 
McCurdy 100 Development, LLC 
October 2, 2017 
Page 3 
 
 
 
 
 
Enclosure 
 
cc: 
Evansville  ...
{'type': 'text', 'page': 861} “fabricated”, “reckless” and “actionable”. EWSU would direct Development’s
attention to EMC 13.05.14 ...
{'type': 'text', 'page': 56

In [None]:
# Follow-up question. Reuses the `rag` chain that the setup cell already built, instead
# of reloading the FAISS index from disk and rebuilding the chain for every query.
if __name__ == "__main__":
    query = "Who signed the affidavit for the Evansville Sewage Works Department on July 10, 2018, and what was the purpose of the document?"

    answer, docs = query_rag(rag, query)

    print("\n🧠 Answer:\n", answer)
    print("\n📚 Retrieved Sources:")
    for d in docs:
        # Preview at most the first 100 characters of each retrieved chunk
        print(d.metadata, d.page_content[:100], "..." if len(d.page_content) > 100 else "")


🧠 Answer:
 The affidavit for the Evansville Sewage Works Department on July 10, 2018, was signed by Sarah Burlison, Accounts Receivable Clerk. The purpose of the document was to affirm, under penalties of perjury, that reasonable care was taken to redact each Social Security number in the document unless required by law. The affidavit was notarized by Ashley Mosby, a Notary Public in Vanderburgh County, Indiana.

📚 Retrieved Sources:
{'type': 'text', 'page': 1326} 82C01-1807-PL-004109
Vanderburgh Circuit Court
EXHIBIT B
Filed: 7/25/2018 12:38 PM
Clerk
Vanderburgh ...
{'type': 'text', 'page': 1403} 82C01-1807-PL-004109
Vanderburgh Circuit Court
EXHIBIT B
Filed: 7/25/2018 12:38 PM
Clerk
Vanderburgh ...
{'type': 'text', 'page': 1430} 82C01-1807-PL-004109
Vanderburgh Circuit Court
EXHIBIT B
Filed: 7/25/2018 12:38 PM
Clerk
Vanderburgh ...
{'type': 'text', 'page': 1407} 82C01-1807-PL-004109
Vanderburgh Circuit Court
EXHIBIT C
Filed: 7/25/2018 12:38 PM
Clerk
Vanderburgh ...
{'type': 'text', 