In [3]:
from pathlib import Path

from helper import cut_pages, merge_pdfs

from docling.document_converter import DocumentConverter  # 

from llama_index.readers.docling import DoclingReader      # 
from llama_index.node_parser.docling import DoclingNodeParser  # 
from llama_index.core import VectorStoreIndex
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import Settings

# Anchor every path at the directory the notebook is started from.
BASE_DIR = Path(".").resolve()

# Folder holding the notebook's data files; created if it does not exist yet
# (exist_ok=True keeps re-runs of this cell from failing).
DATA_DIR = BASE_DIR.joinpath("data")
DATA_DIR.mkdir(exist_ok=True)

# big_book = DATA_DIR / "green-line-1-g9.pdf"  # put your 120-page PDF here


In [None]:
# snippets = []

# # Example ranges – adjust for your book
# snippets.append(
#     cut_pages(big_book, pages=list(range(13,35)), output_pdf=DATA_DIR / "toy_intro_1-3.pdf")
# )
# snippets.append(
#     cut_pages(big_book, pages=[10, 11, 12, 13], output_pdf=DATA_DIR / "toy_unit1_10-13.pdf")
# )
# snippets.append(
#     cut_pages(big_book, pages=[40, 41], output_pdf=DATA_DIR / "toy_exercises_40-41.pdf")
# )

# toy_pdf = merge_pdfs(snippets, DATA_DIR / "toy_green_line_1.pdf")
# toy_pdf


In [None]:
# Load environment variables from a local .env file (python-dotenv) so that
# the os.getenv(...) lookups used later in the notebook can see them.
from dotenv import load_dotenv
load_dotenv()

# Assume the database is running locally (e.g. via docker-compose up -d db)
# NOTE(review): the ingest.* imports deliberately sit after load_dotenv();
# presumably those modules read DB/API settings from the environment, so
# confirm before hoisting them into the top-of-notebook import cell.
from ingest.pipeline import run_ingestion
from ingest.db import get_db_connection
import uuid
import os


In [None]:
# Input for the ingestion run: the JSON file to ingest (presumably a Docling
# export, judging by its name) and a freshly generated random book ID.
# uuid4() yields a new ID on every execution of this cell, so each re-run
# ingests under a different book_id.
json_file = "data/toy_green_line_1_docling.json"
book_id = uuid.uuid4()
print(f"Ingesting {json_file} as book {book_id}...")

# Run the full pipeline with should_mock_embedding=False. Any failure is
# reported as a one-line message instead of a traceback so the notebook
# continues; later cells will then find no rows for this book_id.
try:
    run_ingestion(json_file, book_id=book_id, should_mock_embedding=False)
except Exception as exc:
    print(f"Ingestion failed: {exc}")
else:
    print("Ingestion successful!")


In [None]:
# Verify Data in DB
#
# Sanity-check the ingestion for `book_id`: count the rows stored in
# structure_nodes and content_atoms, then inspect one stored embedding.
# Any failure is reported as a message rather than raised, so the notebook
# keeps running when the database is unavailable.
#
# NOTE(review): book_id is passed as a uuid.UUID query parameter; this relies
# on the DB driver adapting UUID objects (psycopg2 needs register_uuid()) —
# confirm against ingest.db.
try:
    conn = get_db_connection()
    try:
        cur = conn.cursor()
        try:
            # Check structure nodes
            cur.execute("SELECT count(*) FROM structure_nodes WHERE book_id = %s", (book_id,))
            node_count = cur.fetchone()[0]
            print(f"Structure Nodes: {node_count}")

            # Check content atoms
            cur.execute("SELECT count(*) FROM content_atoms WHERE book_id = %s", (book_id,))
            atom_count = cur.fetchone()[0]
            print(f"Content Atoms: {atom_count}")

            # Check one embedding
            cur.execute("SELECT embedding FROM content_atoms WHERE book_id = %s LIMIT 1", (book_id,))
            row = cur.fetchone()
            if row:
                emb = row[0]
                # pgvector returns a string or list depending on adapter. We just print length or type.
                print(f"Sample embedding type: {type(emb)}")
                if hasattr(emb, '__len__'):
                    print(f"Sample embedding length: {len(emb)}")
        finally:
            # Release the cursor even when one of the queries above fails.
            cur.close()
    finally:
        # Previously the connection was closed only on the success path, so a
        # failing query leaked it. Close it unconditionally; the outer except
        # still reports whatever went wrong.
        conn.close()
except Exception as e:
    print(f"Verification failed (DB might not be running): {e}")


In [None]:
from llama_index.vector_stores.postgres import PGVectorStore
from llama_index.core import VectorStoreIndex
from llama_index.embeddings.openai import OpenAIEmbedding

# RAG smoke test: connect to the Postgres vector store, wrap it in a
# LlamaIndex index, and run one sample query through the default query engine.
# Any failure is reported as a single message instead of a traceback.
try:
    print("Connecting to PGVector Store...")
    # Connection settings come from the environment, falling back to the
    # defaults given here.
    # NOTE(review): PGVectorStore appears to manage its own table naming
    # (a "data_" prefix) and column layout — confirm it really reads the
    # content_atoms rows written by run_ingestion.
    # NOTE(review): embed_dim=1536 presumably matches the vector size of
    # text-embedding-3-small used below; it must agree with the stored vectors.
    pg_params = dict(
        database=os.getenv("POSTGRES_DB", "rag"),
        host=os.getenv("POSTGRES_HOST", "localhost"),
        password=os.getenv("POSTGRES_PASSWORD", "rag"),
        port=int(os.getenv("POSTGRES_PORT", 5432)),
        user=os.getenv("POSTGRES_USER", "rag"),
        table_name="content_atoms",
        embed_dim=1536,
    )
    vector_store = PGVectorStore.from_params(**pg_params)

    print("Initializing Index from Vector Store...")
    embed_model = OpenAIEmbedding(
        model="text-embedding-3-small",
        api_key=os.getenv("OPENAI_API_KEY"),
    )
    index = VectorStoreIndex.from_vector_store(
        vector_store=vector_store,
        embed_model=embed_model,
    )

    print("Creating Query Engine...")
    # No metadata filter is applied, so the query is not restricted to one book.
    # We can add filters here if needed, e.g. MetadataFilters(filters=[ExactMatchFilter(key="book_id", value=str(book_id))])
    query_engine = index.as_query_engine()

    query_text = "write a small quizz with text and blanks based on vocab and content of unit 1"
    print(f"Executing Query: {query_text}")

    response = query_engine.query(query_text)
    print("\n--- Response ---\n")
    print(response)

except Exception as exc:
    print(f"RAG Query failed: {exc}")