In [1]:
# Install required packages
# NOTE(review): no versions are pinned, so a fresh run may resolve newer releases
# than the ones this notebook was last executed with - consider pinning them.
%pip install chromadb
%pip install langchain-google-genai
%pip install langchain-chroma
%pip install langchain-groq
%pip install python-dotenv

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


## 1. Setup and Imports

In [2]:
import json
import re
from datetime import datetime
from typing import List, Dict

import chromadb
from chromadb.config import Settings
from dotenv import load_dotenv
from langchain_chroma import Chroma
from langchain_core.messages import HumanMessage
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_groq import ChatGroq

# Load environment variables
# NOTE(review): presumably this supplies the API keys for the Gemini embedding and
# Groq chat clients created later in the notebook - confirm the expected variable names.
load_dotenv()

print("‚úÖ All imports loaded successfully")

‚úÖ All imports loaded successfully


## 2. Initialize ChromaDB Client

In [3]:
# Open (or create) the on-disk ChromaDB store used by every later cell
DB_PATH = "./unified_chroma_db"

_client_settings = Settings(anonymized_telemetry=False, allow_reset=True)
chroma_client = chromadb.PersistentClient(path=DB_PATH, settings=_client_settings)

# Report where the store lives and which collections it currently holds
_collection_names = [col.name for col in chroma_client.list_collections()]
print(f"‚úÖ ChromaDB initialized at: {DB_PATH}")
print(f"üìä Available collections: {_collection_names}")

‚úÖ ChromaDB initialized at: ./unified_chroma_db
üìä Available collections: ['project_attention_transformer_project']


## 3. Initialize Embedding Model

In [4]:
# Google Gemini embedding client (same model as the ingestion pipelines).
# The task type is "retrieval_query" because this notebook embeds search queries.
_embedding_options = {
    "model": "models/text-embedding-004",
    "task_type": "retrieval_query",
}
embedding_model = GoogleGenerativeAIEmbeddings(**_embedding_options)

for _status_line in (
    "‚úÖ Embedding model loaded: Google Gemini text-embedding-004",
    "üìê Task type: retrieval_query",
):
    print(_status_line)

‚úÖ Embedding model loaded: Google Gemini text-embedding-004
üìê Task type: retrieval_query


## 4. Load Vector Database

In [5]:
def load_unified_database(project_name: str = "default_project"):
    """
    Load the unified ChromaDB collection for a project and print chunk statistics.

    The collection name is derived from ``project_name`` by lower-casing it,
    replacing spaces with underscores and prefixing ``project_``.

    Args:
        project_name: Name of the project

    Returns:
        Chroma vector store bound to the project's collection, or None when the
        collection does not exist or cannot be read (the error is printed).
    """
    collection_name = f"project_{project_name.lower().replace(' ', '_')}"

    try:
        # Look the collection up BEFORE building the LangChain wrapper: the wrapper
        # creates a missing collection by default, which would turn a mistyped
        # project name into a new, empty collection instead of a reported error.
        collection = chroma_client.get_collection(name=collection_name)

        # Load existing collection using LangChain wrapper
        vectorstore = Chroma(
            client=chroma_client,
            collection_name=collection_name,
            embedding_function=embedding_model
        )

        total_count = collection.count()

        # Get stats (only metadata is requested from the collection)
        all_data = collection.get(include=['metadatas'])
        # Entries stored without metadata may come back as None; treat them as empty
        metadatas = [m or {} for m in (all_data['metadatas'] or [])]

        source_types = [m.get('source_type') for m in metadatas]
        transcript_count = source_types.count('meeting_transcript')
        document_count = source_types.count('document')

        print(f"‚úÖ Loaded collection: {collection_name}")
        print(f"üìä Total chunks: {total_count}")
        print(f"   üìÑ Document chunks: {document_count}")
        print(f"   üéôÔ∏è  Transcript chunks: {transcript_count}")

        return vectorstore

    except Exception as e:
        # Best-effort loader: report the problem and let callers handle None
        print(f"‚ùå Error loading collection '{collection_name}': {e}")
        return None


# Example: Load your project
# The collection opened is "project_" + the lower-cased name with spaces replaced by "_".
# NOTE(review): `db` is not referenced by the later cells visible in this notebook.
PROJECT_NAME = "attention_transformer_project"  # Update to your project name
db = load_unified_database(PROJECT_NAME)

‚úÖ Loaded collection: project_attention_transformer_project
üìä Total chunks: 3
   üìÑ Document chunks: 0
   üéôÔ∏è  Transcript chunks: 3


## 5. Query and Retrieval Functions

In [9]:
def query_unified_collection(
    query: str,
    project_name: str = "default_project",
    k: int = 5,
    filter_source_type: str = None
):
    """
    Run a similarity search against a project's unified collection.

    Loads the collection via load_unified_database(), retrieves the top ``k``
    chunks for ``query`` and prints one summary line per retrieved chunk.

    Args:
        query: The search query
        project_name: Name of the project
        k: Number of results to retrieve
        filter_source_type: Optional filter ("document" or "meeting_transcript")

    Returns:
        List of retrieved chunks; an empty list when the collection could not be loaded
    """
    store = load_unified_database(project_name)
    if not store:
        return []

    print(f"\nüîç Query: {query}")
    print(f"üìä Retrieving top {k} chunks...")

    # Retriever options; the metadata filter is added only on request
    retriever_options = {"k": k}
    if filter_source_type:
        retriever_options["filter"] = {"source_type": filter_source_type}
        print(f"üîé Filtering by source_type: {filter_source_type}")

    hits = store.as_retriever(search_kwargs=retriever_options).invoke(query)

    print(f"‚úÖ Retrieved {len(hits)} chunks")

    # One line per hit: transcripts show the meeting, everything else is shown as a document
    for rank, hit in enumerate(hits, start=1):
        meta = hit.metadata
        if meta.get('source_type', 'unknown') == 'meeting_transcript':
            meeting_label = meta.get('meeting_name', 'Unknown')
            print(f"   {rank}. üéôÔ∏è  Transcript: {meeting_label}")
        else:
            doc_label = meta.get('document_name', 'Unknown')
            page_label = meta.get('page_number', 'N/A')
            print(f"   {rank}. üìÑ Document: {doc_label} (p.{page_label})")

    return hits


# Test query
# Smoke test: retrieve only the top-ranked chunk (k=1) for a sample question.
query = "Who Completed the english to german experiments"
retrieved_chunks = query_unified_collection(query, project_name=PROJECT_NAME, k=1)
# Bare last expression so the notebook displays the retrieved chunks
retrieved_chunks

‚úÖ Loaded collection: project_attention_transformer_project
üìä Total chunks: 3
   üìÑ Document chunks: 0
   üéôÔ∏è  Transcript chunks: 3

üîç Query: Who Completed the english to german experiments
üìä Retrieving top 1 chunks...
‚úÖ Retrieved 1 chunks
   1. üéôÔ∏è  Transcript: transformer_meeting


[Document(id='transcript_transformer_meeting_1', metadata={'end_time': '00:02:32.100', 'project_name': 'attention_transformer_project', 'speakers_in_chunk': '["Illia", "Ashish", "Niki", "Noam", "Jakob"]', 'source_type': 'meeting_transcript', 'chunk_index': 1, 'meeting_date': '2026-01-11', 'start_time': '00:01:02.500', 'turn_count': 8, 'meeting_name': 'transformer_meeting'}, page_content="Meeting: transformer_meeting\n        Project: attention_transformer_project\n        Date: 2026-01-11\n        Time Range: 00:01:02.500 - 00:02:32.100\n        Speakers: Illia, Ashish, Niki, Noam, Jakob\n\n        Transcript:\n        Illia: Let's discuss the encoder-decoder structure. The encoder maps input $x$ to continuous representations $z$, then the decoder generates output $y$ auto-regressively.\nJakob: I think we should emphasize that each encoder layer has two sub-layers: multi-head self-attention and a position-wise feed-forward network.\nAshish: Plus the residual connections around each sub

## 6. Enhanced Citation Functions

Support citations for both documents and transcripts.

In [10]:
def extract_citations_with_metadata(answer_text, chunks_metadata):
    """
    Extract citations and map to metadata for both documents and transcripts.

    Every [CITE:...] marker in the answer is scanned for chunk numbers. Malformed
    markers that the pattern still accepts (trailing commas, whitespace only,
    space-separated numbers) are tolerated instead of raising ValueError.

    Args:
        answer_text: The generated answer with [CITE:X] markers
        chunks_metadata: Dictionary mapping chunk IDs to metadata

    Returns:
        List of citation dictionaries with source-specific metadata, one per
        distinct cited chunk that has metadata, ordered by chunk ID.
        Empty list when the answer is empty or contains no usable citation.
    """
    if not answer_text:
        return []

    citation_pattern = r'\[CITE:([0-9,\s]+)\]'

    # Collect each cited chunk ID once, whatever marker it appeared in.
    # Pulling out digit runs (rather than int() on comma-split pieces) skips
    # empty segments such as the one in "[CITE:1,]".
    unique_chunks = set()
    for cite_group in re.findall(citation_pattern, answer_text):
        unique_chunks.update(int(token) for token in re.findall(r'\d+', cite_group))

    citations = []
    for chunk_id in sorted(unique_chunks):
        metadata = chunks_metadata.get(chunk_id)
        if not metadata:
            # The model cited a chunk number that was never supplied; ignore it
            continue

        source_type = metadata.get("source_type", "document")

        if source_type == "meeting_transcript":
            # Transcript citation
            citations.append({
                "chunk_id": str(chunk_id),
                "source_type": "transcript",
                "meeting_name": metadata.get("meeting_name"),
                "meeting_date": metadata.get("meeting_date"),
                "start_time": metadata.get("start_time"),
                "end_time": metadata.get("end_time"),
                "speakers": metadata.get("speakers", [])
            })
        else:
            # Document citation
            citations.append({
                "chunk_id": str(chunk_id),
                "source_type": "document",
                "document": metadata.get("document"),
                "page": metadata.get("page"),
                "positions": metadata.get("positions", [])
            })

    return citations


def format_answer_with_citations(answer_text, chunks_metadata):
    """
    Replace [CITE:X] with appropriate format for documents and transcripts.

    Documents: [doc_name(p.X, p.Y)] - one entry per cited document, ".pdf" removed
    Transcripts: [meeting_name@start_time]

    A marker whose chunk IDs resolve to no usable metadata is left unchanged.

    Args:
        answer_text: The generated answer with [CITE:X] markers
        chunks_metadata: Dictionary mapping chunk IDs to metadata

    Returns:
        The answer text with every resolvable marker rewritten
    """
    if not answer_text:
        return answer_text

    def page_sort_key(page):
        # Numbers sort numerically and before any string page labels, so a mix
        # of int and str pages never triggers a TypeError during sorting.
        if isinstance(page, (int, float)):
            return (0, page, "")
        return (1, 0, str(page))

    def replace_citation(match):
        # Digit runs only: tolerates "[CITE:1,]" and similar malformed markers
        chunk_ids = [int(token) for token in re.findall(r'\d+', match.group(1))]

        # Group by source type; documents are further grouped by name so pages
        # of different documents are never reported under one document.
        pages_by_document = {}
        transcript_chunks = []

        for chunk_id in chunk_ids:
            metadata = chunks_metadata.get(chunk_id)
            if not metadata:
                continue
            if metadata.get("source_type", "document") == "meeting_transcript":
                transcript_chunks.append(metadata)
            else:
                doc_name = (metadata.get("document") or "").replace(".pdf", "")
                pages_by_document.setdefault(doc_name, []).append(metadata.get("page"))

        citations = []

        # Format document citations (documents keep first-cited order)
        for doc_name, raw_pages in pages_by_document.items():
            # Drop missing pages and the "N/A" placeholder BEFORE sorting
            pages = {p for p in raw_pages if p and p != "N/A"}
            pages_str = ", ".join(f"p.{p}" for p in sorted(pages, key=page_sort_key))
            if doc_name and pages_str:
                citations.append(f"{doc_name}({pages_str})")

        # Format transcript citations
        for transcript_meta in transcript_chunks:
            meeting_name = transcript_meta.get("meeting_name", "Meeting")
            start_time = transcript_meta.get("start_time", "")
            if meeting_name:
                time_suffix = f"@{start_time}" if start_time else ""
                label = f"{meeting_name}{time_suffix}"
                # The same chunk cited twice in one marker is shown once
                if label not in citations:
                    citations.append(label)

        if citations:
            return f"[{', '.join(citations)}]"

        # Nothing resolvable: keep the original marker text
        return match.group(0)

    citation_pattern = r'\[CITE:([0-9,\s]+)\]'
    formatted_answer = re.sub(citation_pattern, replace_citation, answer_text)
    return formatted_answer


# Confirmation that the citation helper definitions in this cell were executed
print("‚úÖ Citation functions loaded")

‚úÖ Citation functions loaded


## 7. Answer Generation with Unified Context

Generate answers using both document and transcript chunks.

In [11]:
def _transcript_chunk_context(chunk, chunk_id):
    """Return (header, body, citation_metadata) for a meeting-transcript chunk."""
    meta = chunk.metadata
    meeting_name = meta.get("meeting_name", "Meeting")
    meeting_date = meta.get("meeting_date", "")
    start_time = meta.get("start_time", "")

    # The speaker list is decoded from JSON when it is stored as a string
    speakers_json = meta.get("speakers_in_chunk", "[]")
    speakers = json.loads(speakers_json) if isinstance(speakers_json, str) else speakers_json

    header = (
        f"### [CHUNK {chunk_id}] - TRANSCRIPT ###\n"
        f"Meeting: {meeting_name}\n"
        f"Date: {meeting_date}\n"
        f"Time: {start_time}\n"
        # str() keeps the join from failing on non-string speaker entries
        f"Speakers: {', '.join(str(s) for s in speakers)}\n\n"
    )

    # Metadata kept for citation formatting
    citation_metadata = {
        "source_type": "meeting_transcript",
        "meeting_name": meeting_name,
        "meeting_date": meeting_date,
        "start_time": start_time,
        "end_time": meta.get("end_time", ""),
        "speakers": speakers
    }

    return header, chunk.page_content, citation_metadata


def _document_chunk_context(chunk, chunk_id):
    """Return (header, body, citation_metadata, images) for a document chunk."""
    meta = chunk.metadata
    header = f"### [CHUNK {chunk_id}] - DOCUMENT ###\n"

    positions_raw = meta.get("positions", "[]")
    positions = json.loads(positions_raw) if isinstance(positions_raw, str) else positions_raw

    # Metadata kept for citation formatting
    citation_metadata = {
        "source_type": "document",
        "page": meta.get("page_number", "N/A"),
        "document": meta.get("document_name", "document.pdf"),
        "positions": positions
    }

    # Without the structured payload, fall back to the chunk's plain text
    if "original_content" not in meta:
        return header, chunk.page_content, citation_metadata, []

    orig_data = json.loads(meta["original_content"])
    text = orig_data.get("raw_text", "")
    tables = orig_data.get("tables_html", [])

    body = f"TEXT CONTENT:\n{text}\n"
    if tables:
        body += "\nTABULAR DATA:\n" + "\n".join(tables) + "\n"

    return header, body, citation_metadata, orig_data.get("images_base64", [])


def generate_unified_answer(
    chunks,
    query,
    model_name="meta-llama/llama-4-scout-17b-16e-instruct",
    max_images=5,
):
    """
    Generate answer with citations supporting both documents and transcripts.

    Chunks are numbered from 1 in the order given; those numbers are what the
    model is asked to cite as [CITE:X] and what ``chunks_metadata`` is keyed by.

    Args:
        chunks: List of retrieved chunks (mix of documents and transcripts)
        query: The user query
        model_name: Groq chat model used to generate the answer
        max_images: Maximum number of document images attached to the request
            (each is sent as a base64 data URL labelled image/jpeg)

    Returns:
        Dictionary with answer, raw_answer, chunks_metadata and error.
        On failure the error is printed, ``answer`` is a fixed fallback message,
        ``raw_answer`` is empty, ``chunks_metadata`` is empty and ``error``
        holds the exception text; on success ``error`` is None.
    """
    try:
        llm = ChatGroq(
            model_name=model_name,
            temperature=0,
            max_tokens=4096
        )

        context_parts = []
        all_images = []
        chunks_metadata = {}

        for chunk_id, chunk in enumerate(chunks, start=1):
            # Chunks without a source_type are handled as documents
            if chunk.metadata.get("source_type", "document") == "meeting_transcript":
                header, body, metadata = _transcript_chunk_context(chunk, chunk_id)
            else:
                header, body, metadata, images = _document_chunk_context(chunk, chunk_id)
                all_images.extend(images)

            chunks_metadata[chunk_id] = metadata
            context_parts.append(header + body)

        final_context = "\n\n".join(context_parts)

        instruction_prompt = f"""You are a precise research assistant. Answer the user query using ONLY the provided context.

The context includes both DOCUMENT chunks (from PDFs) and TRANSCRIPT chunks (from meeting recordings).

CITATION RULES:
1. Add [CITE:X] citations ONLY after complete sentences or paragraphs
2. NEVER add citations inside tables, lists, or mid-sentence
3. For tables: Add a single citation AFTER the entire table
4. Example: "The results are shown below.\n\n[table here]\n\n[CITE:3]"
5. For information from multiple chunks, use [CITE:X, Y, Z] format
6. You can cite both documents and transcripts - they are equally valid sources
7. If information is not in the context, say "I don't have information about that"

USER QUERY: {query}

RESEARCH CONTEXT:
{final_context}

ANSWER (with [CITE:X] citations AFTER sentences/tables):"""

        message_content = [{"type": "text", "text": instruction_prompt}]

        # Add images from document chunks (transcripts don't have images)
        for img_b64 in all_images[:max_images]:
            message_content.append({
                "type": "image_url",
                "image_url": {"url": f"data:image/jpeg;base64,{img_b64}"}
            })

        response = llm.invoke([HumanMessage(content=message_content)])

        formatted_answer = format_answer_with_citations(response.content, chunks_metadata)

        return {
            "answer": formatted_answer,
            "raw_answer": response.content,
            "chunks_metadata": chunks_metadata,
            "error": None
        }

    except Exception as e:
        # Best-effort: report the failure and hand back a well-formed result
        print(f"‚ùå Generation failed: {e}")
        return {
            "answer": "Error generating response",
            "raw_answer": "",
            "chunks_metadata": {},
            "error": str(e)
        }


# Confirmation that the answer-generation definition in this cell was executed
print("‚úÖ Answer generation function loaded")

‚úÖ Answer generation function loaded


## 8. Run Complete Query Pipeline

In [12]:
# === CONFIGURE YOUR QUERY HERE ===

QUERY = "Who Completed the english to german experiments"  # Your question
PROJECT_NAME = "attention_transformer_project"  # Should match your ingestion pipelines
K_RESULTS = 1  # Number of chunks to retrieve
FILTER_SOURCE = None  # None, "document", or "meeting_transcript"

_RULE = "=" * 80

print(_RULE)
print("üöÄ UNIFIED QUERY PIPELINE")
print(_RULE)

# Step 1: retrieval
retrieved_chunks = query_unified_collection(
    QUERY,
    project_name=PROJECT_NAME,
    k=K_RESULTS,
    filter_source_type=FILTER_SOURCE,
)

if retrieved_chunks:
    # Step 2: answer generation
    print(f"\nü§ñ Generating answer with Llama 4 Scout...")
    result = generate_unified_answer(retrieved_chunks, QUERY)

    # Step 3: citations are read from the raw answer, which still has the [CITE:X] markers
    citations = extract_citations_with_metadata(result["raw_answer"], result["chunks_metadata"])

    # Answer section
    print(f"\n{_RULE}")
    print("üìù ANSWER:")
    print(_RULE)
    print(result["answer"])

    # Citation section
    print(f"\n{_RULE}")
    print("üìö CITATIONS:")
    print(_RULE)

    for cite in citations:
        if cite['source_type'] != 'document':
            print(f"\nüéôÔ∏è  Chunk {cite['chunk_id']} - Transcript Citation:")
            print(f"   Meeting: {cite['meeting_name']}")
            print(f"   Date: {cite['meeting_date']}")
            print(f"   Time: {cite['start_time']} - {cite['end_time']}")
            print(f"   Speakers: {', '.join(cite['speakers'])}")
        else:
            print(f"\nüìÑ Chunk {cite['chunk_id']} - Document Citation:")
            print(f"   Document: {cite['document']}")
            print(f"   Page: {cite['page']}")
            print(f"   Positions: {len(cite['positions'])} elements")

    print(f"\n{_RULE}")
    print("‚úÖ QUERY PIPELINE COMPLETE!")
    print(_RULE)
else:
    print("‚ùå No chunks retrieved")

üöÄ UNIFIED QUERY PIPELINE
‚úÖ Loaded collection: project_attention_transformer_project
üìä Total chunks: 3
   üìÑ Document chunks: 0
   üéôÔ∏è  Transcript chunks: 3

üîç Query: Who Completed the english to german experiments
üìä Retrieving top 1 chunks...
‚úÖ Retrieved 1 chunks
   1. üéôÔ∏è  Transcript: transformer_meeting

ü§ñ Generating answer with Llama 4 Scout...

üìù ANSWER:
Niki completed the English-to-German experiments. [transformer_meeting@00:01:02.500]

üìö CITATIONS:

üéôÔ∏è  Chunk 1 - Transcript Citation:
   Meeting: transformer_meeting
   Date: 2026-01-11
   Time: 00:01:02.500 - 00:02:32.100
   Speakers: Illia, Ashish, Niki, Noam, Jakob

‚úÖ QUERY PIPELINE COMPLETE!


## 9. Export Results to JSON

In [13]:
def export_query_results(chunks, answer_result, filename="unified_query_results.json", query=None):
    """
    Export query results including chunks, answer, and citations to JSON.

    Args:
        chunks: Retrieved chunks
        answer_result: Result from generate_unified_answer()
        filename: Output filename
        query: The question that produced the results. When omitted, the
            notebook-level QUERY constant is used if it exists (the behaviour
            earlier callers relied on); otherwise the query is exported as null.
    """
    if query is None:
        query = globals().get("QUERY")

    export_data = {
        "query": query,
        # Local wall-clock time of the export, "YYYY-MM-DD HH:MM:SS.ffffff"
        "timestamp": str(datetime.now()),
        "answer": answer_result["answer"],
        "raw_answer": answer_result["raw_answer"],
        "chunks": [],
        "citations": []
    }

    # Export chunks, numbered from 1 to match the [CITE:X] numbering
    for chunk_id, chunk in enumerate(chunks, start=1):
        source_type = chunk.metadata.get("source_type", "document")

        chunk_data = {
            "chunk_id": chunk_id,
            "source_type": source_type,
            "content": chunk.page_content,
            "metadata": {}
        }

        if source_type == "meeting_transcript":
            # Same decoding rule as generate_unified_answer(): only strings are parsed
            speakers_raw = chunk.metadata.get("speakers_in_chunk", "[]")
            speakers = json.loads(speakers_raw) if isinstance(speakers_raw, str) else speakers_raw

            chunk_data["metadata"] = {
                "meeting_name": chunk.metadata.get("meeting_name"),
                "meeting_date": chunk.metadata.get("meeting_date"),
                "start_time": chunk.metadata.get("start_time"),
                "end_time": chunk.metadata.get("end_time"),
                "speakers": speakers
            }
        else:
            # Parse the structured payload once; absent payload means no tables/images
            original_raw = chunk.metadata.get("original_content")
            if isinstance(original_raw, str):
                orig_data = json.loads(original_raw)
            else:
                orig_data = original_raw or {}

            chunk_data["metadata"] = {
                "document_name": chunk.metadata.get("document_name"),
                "page_number": chunk.metadata.get("page_number"),
                # True only when the list is present AND non-empty
                "has_tables": bool(orig_data.get("tables_html")),
                "has_images": bool(orig_data.get("images_base64"))
            }

        export_data["chunks"].append(chunk_data)

    # Export citations (taken from the raw answer, which still has [CITE:X] markers)
    citations = extract_citations_with_metadata(answer_result["raw_answer"], answer_result["chunks_metadata"])
    export_data["citations"] = citations

    # Save to file
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(export_data, f, indent=2, ensure_ascii=False)

    print(f"‚úÖ Exported results to: {filename}")


# Export results
# `and` short-circuits, so `result` (assigned by the pipeline cell only when chunks
# were retrieved) is not evaluated when `retrieved_chunks` is empty.
if retrieved_chunks and result:
    export_query_results(retrieved_chunks, result, "unified_query_results.json")

‚úÖ Exported results to: unified_query_results.json


## 10. Advanced Query Options

In [None]:
# Source-filtered examples: the same retrieve-then-answer flow, restricted to one source type.

# --- Documents only ---
print("\nüìÑ DOCUMENT-ONLY QUERY:")
print("=" * 80)

_doc_question = "transformer architecture details"
doc_chunks = query_unified_collection(
    _doc_question,
    project_name=PROJECT_NAME,
    k=3,
    filter_source_type="document",
)

if doc_chunks:
    doc_result = generate_unified_answer(doc_chunks, _doc_question)
    print(doc_result["answer"])

print("\n\n")

# --- Transcripts only ---
print("üéôÔ∏è  TRANSCRIPT-ONLY QUERY:")
print("=" * 80)

_transcript_question = "what was discussed in the meeting"
transcript_chunks = query_unified_collection(
    _transcript_question,
    project_name=PROJECT_NAME,
    k=3,
    filter_source_type="meeting_transcript",
)

if transcript_chunks:
    transcript_result = generate_unified_answer(transcript_chunks, _transcript_question)
    print(transcript_result["answer"])

## Summary

This unified query pipeline provides:

✅ **Unified Querying**: Query both documents and transcripts together  
✅ **Source-Aware Citations**: Different citation formats for documents vs transcripts  
✅ **Document Citations**: `[doc_name(p.X)]` with page numbers and positions  
✅ **Transcript Citations**: `[meeting_name@timestamp]` with meeting metadata  
✅ **Mixed Results**: AI can combine information from both sources  
✅ **Filtering Options**: Query all sources or filter by type  
✅ **Vision Support**: Process images from document chunks  
✅ **Export Results**: Save answers and citations to JSON  

**Citation Formats:**
- **Documents**: `[attention-is-all-you-need(p.3, p.5)]`
- **Transcripts**: `[Project Kickoff Meeting@00:15:30]`
- **Mixed**: `[attention-is-all-you-need(p.3), Project Kickoff Meeting@00:15:30]`

**Next Steps:**
1. Update `PROJECT_NAME` to match your ingestion pipelines
2. Modify `QUERY` to ask your questions
3. Run the pipeline to get answers with mixed sources
4. Export results for further analysis