In [None]:
import os
import fitz  # PyMuPDF
from PIL import Image
import pytesseract
import re

# --- Input PDF file ---
pdf_path = "127Riverside-Drive-DHCR.pdf"

# Fail early, before any output folder is created, if the input is missing.
if not os.path.isfile(pdf_path):
    raise FileNotFoundError(f"PDF not found: {pdf_path}")

# --- Derive folder name from PDF ---
# The output folder sits next to the PDF and is named after it (extension stripped).
base_name = os.path.splitext(os.path.basename(pdf_path))[0]
output_dir = os.path.join(os.path.dirname(pdf_path), base_name)
os.makedirs(output_dir, exist_ok=True)

# --- Convert PDF pages to PNGs ---
# The context manager closes the document even if rendering a page fails.
with fitz.open(pdf_path) as doc:
    for page_num, page in enumerate(doc):
        pix = page.get_pixmap(dpi=300)
        # Files are numbered from 1 so names line up with human page numbers.
        output_path = os.path.join(output_dir, f"page_{page_num + 1}.png")
        pix.save(output_path)
        print(f"Saved: {output_path}")

print(f"\n✅ All pages saved inside: {output_dir}")

# --- OCR part with natural sorting ---
def natural_sort_key(s):
    """Build a sort key that orders embedded numbers by value ("human" order).

    The string is split into alternating text and digit runs; digit runs
    become ints and text runs are lower-cased, so "page_2" sorts before
    "page_10" and comparison is case-insensitive.

    Args:
        s: The string (typically a filename) to build a key for.

    Returns:
        A list alternating str and int parts, starting with a (possibly
        empty) str, suitable as a ``key=`` for ``sorted``/``list.sort``.
    """
    # With one capturing group, re.split returns the digit runs at the odd
    # indices. Deciding by position (rather than str.isdigit) avoids calling
    # int() on characters such as "²" that isdigit accepts but int rejects.
    parts = re.split(r"(\d+)", s)
    return [int(part) if idx % 2 else part.lower() for idx, part in enumerate(parts)]

# Gather all PNGs
# NOTE: this picks up every .png in the folder, including leftovers from an
# earlier run; clear the folder first if the PDF has changed.
png_files = [f for f in os.listdir(output_dir) if f.endswith(".png")]
png_files = sorted(png_files, key=natural_sort_key)  # <- apply natural sort

# Combined OCR text
txt_output_path = os.path.join(output_dir, f"{base_name}.txt")

# Collect per-page chunks and join once instead of growing a string in the loop.
page_texts = []
for filename in png_files:
    image_path = os.path.join(output_dir, filename)
    # Close each image file as soon as OCR is done with it.
    with Image.open(image_path) as page_image:
        text = pytesseract.image_to_string(page_image, lang="eng")
    # Each page is preceded by a header naming the image it came from.
    page_texts.append(f"\n\n--- {filename} ---\n\n{text}")
all_text = "".join(page_texts)

# Save combined text
with open(txt_output_path, "w", encoding="utf-8") as f:
    f.write(all_text)

print(f"\n✅ OCR complete. Combined text saved at: {txt_output_path}")


In [None]:
# For reading text files from a folder
from llama_index.core import SimpleDirectoryReader

# Core LlamaIndex utilities
from llama_index.core import StorageContext
from llama_index.core import VectorStoreIndex
from llama_index.core.settings import Settings

# Milvus vector store
from llama_index.vector_stores.milvus import MilvusVectorStore

# Ollama LLaMA 3.1 local model
from llama_index.llms.ollama import Ollama
from llama_index.embeddings.ollama import OllamaEmbedding

# -------------------------------
# 1️⃣ Paths
# -------------------------------

# Folder where your OCR text files are
data_dir = "127Riverside-Drive-DHCR"

# -------------------------------
# 2️⃣ Load documents
# -------------------------------

# The folder also holds the rendered page PNGs; restrict loading to the OCR
# text so the image files are not handed to the reader.
documents = SimpleDirectoryReader(data_dir, required_exts=[".txt"]).load_data()
print(f"Loaded {len(documents)} documents for indexing.")

# -------------------------------
# 3️⃣ Connect Milvus
# -------------------------------

# The Milvus integration connects through `uri`; separate host/port keyword
# arguments are not part of its documented constructor.
# NOTE: re-running this cell inserts the documents into the collection again;
# pass overwrite=True to rebuild the collection from scratch.
vector_store = MilvusVectorStore(
    uri="http://localhost:19530",  # adjust if your Milvus host/port is different
    collection_name="nyc_rent_docs",
    dim=4096,          # match your embedding model dimension (llama3.1:8b)
    embedding_field="embedding",  # explicitly specify the embedding field name
)

# -------------------------------
# 4️⃣ Initialize Ollama LLaMA3.1
# -------------------------------

# Configure global settings for LLM and embedding model
Settings.llm = Ollama(model="llama3.1:8b")  # use the correct model name
Settings.embed_model = OllamaEmbedding(model_name="llama3.1:8b")  # use the correct model name

# -------------------------------
# 5️⃣ Build LlamaIndex
# -------------------------------

# The vector store must be wrapped in a StorageContext; a bare `vector_store=`
# keyword is not how from_documents receives a store, and the embeddings would
# not be written to Milvus.
storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = VectorStoreIndex.from_documents(
    documents,
    storage_context=storage_context,
)

query_engine = index.as_query_engine()

# -------------------------------
# 6️⃣ Example queries
# -------------------------------

questions = [
    "What is the legal regulated rent for apartment 22?",
    "Who is the tenant of apartment 3?",
    "List all tenants with a legal regulated rent above $1000."
]

for q in questions:
    response = query_engine.query(q)
    print(f"\n❓ {q}\n💬 {response}")