In [1]:
# Query construction is the process of parsing natural language questions and extracting structured filters that can be used in the query to a vector store, narrowing down the documents to search before similarity search even happens.

In [2]:
from dotenv import load_dotenv
load_dotenv() # load environment variables

True

### Problem Statement

Vector search retrieves results only based on semantic similarity, ignoring structured metadata that might dramatically improve precision.

**Example**:

Query: "What did Steve Jobs say about innovation in 2007?"

A naive vector search may return general quotes from Jobs, or innovation-related paragraphs — from any year, any speaker.

You could extract:

```json
{
  "speaker": "Steve Jobs",
  "topic": "innovation",
  "year": 2007
}
```

Then run the vector search only over chunks with metadata matching that filter, improving both relevance and precision.

### Query Construction

![Image ](rsc/jupyter/query_construction.png)

**Idea**: Many vectorstores contain metadata fields. This makes it possible to filter for specific chunks based on metadata. We want to convert natural language into structured search queries that leverage these metadata fields.

In [3]:
#Let's look at some example metadata we might see in a database of YouTube transcripts.

import requests

def get_youtube_metadata(video_url: str) -> dict:
    """Fetch YouTube metadata using oEmbed."""
    oembed_endpoint = "https://www.youtube.com/oembed"
    params = {
        "url": video_url,
        "format": "json"
    }

    try:
        response = requests.get(oembed_endpoint, params=params)
        response.raise_for_status()  # Raise HTTPError for bad responses
        return response.json()
    except requests.HTTPError as e:
        print(f"HTTP Error: {e}")
        return {"error": str(e)}
    except Exception as e:
        print(f"Unexpected error: {e}")
        return {"error": str(e)}

# Example usage
video_url = "https://www.youtube.com/watch?v=pbAd8O1Lvm4"
metadata = get_youtube_metadata(video_url)

print(metadata)

{'title': 'Self-reflective RAG with LangGraph: Self-RAG and CRAG', 'author_name': 'LangChain', 'author_url': 'https://www.youtube.com/@LangChain', 'type': 'video', 'height': 150, 'width': 200, 'version': '1.0', 'provider_name': 'YouTube', 'provider_url': 'https://www.youtube.com/', 'thumbnail_height': 360, 'thumbnail_width': 480, 'thumbnail_url': 'https://i.ytimg.com/vi/pbAd8O1Lvm4/hqdefault.jpg', 'html': '<iframe width="200" height="150" src="https://www.youtube.com/embed/pbAd8O1Lvm4?feature=oembed" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share" referrerpolicy="strict-origin-when-cross-origin" allowfullscreen title="Self-reflective RAG with LangGraph: Self-RAG and CRAG"></iframe>'}


Let's say we have a database of transcripts of tutorial videos on YouTube. Each document in the database has a:
- text transcript (content)
- metadata about the video (title, length, view count, publish date)

Let’s also say we’ve built an index over this database such that we can:
1. Perform unstructured search over the contents and title of each document.
2. Use range filtering on view count, publication date, and length.

**Our goal is to convert natural language into structured search queries.**

In [4]:
# Let's define a schema for structured search queries (TutorialSearch schema).

import datetime
from pydantic import BaseModel, Field
from typing import Optional

class TutorialSearch(BaseModel):
    """Search over a database of tutorial videos about a software library."""

    content_search: str = Field(
        ...,
        description="Similarity search query applied to video transcripts.",
    )
    title_search: str = Field(
        ...,
        description=(
            "Alternate version of the content search query to apply to video titles. "
            "Should be succinct and only include key words that could be in a video "
            "title."
        ),
    )
    min_view_count: Optional[int] = Field(
        None,
        description="Minimum view count filter, inclusive. Only use if explicitly specified.",
    )
    max_view_count: Optional[int] = Field(
        None,
        description="Maximum view count filter, exclusive. Only use if explicitly specified.",
    )
    earliest_publish_date: Optional[datetime.date] = Field(
        None,
        description="Earliest publish date filter, inclusive. Only use if explicitly specified.",
    )
    latest_publish_date: Optional[datetime.date] = Field(
        None,
        description="Latest publish date filter, exclusive. Only use if explicitly specified.",
    )
    min_length_sec: Optional[int] = Field(
        None,
        description="Minimum video length in seconds, inclusive. Only use if explicitly specified.",
    )
    max_length_sec: Optional[int] = Field(
        None,
        description="Maximum video length in seconds, exclusive. Only use if explicitly specified.",
    )

    def pretty_print(self) -> None:
        for field_name, field_info in self.__class__.model_fields.items():
            value = getattr(self, field_name)
            if value is not None and value != getattr(field_info, "default", None):
                print(f"{field_name}: {value}")

In [156]:
# Let's set up a chain to generate TutorialSearch queries from plain text queries.
#   - I modified this part with some few-shot prompts and saw a significant performance boost.

from langchain_ollama import ChatOllama
from langchain_core.prompts import ChatPromptTemplate, FewShotChatMessagePromptTemplate

examples = [
    {"input": "videos on chat langchain longer than 10 minutes", "output": '{"content_search":"chat langchain","title_search":"langchain","min_length_sec":"600"}'},
    {"input": "videos on chat langchain published in 2021", "output": '{"content_search":"chat langchain","title_search":"langchain","earliest_publish_date":"2021-01-01","latest_publish_date":"2021-12-31"}'},
    {"input": "videos on chat langchain", "output": '{"content_search":"chat langchain","title_search":"langchain"}'},
]
example_prompt = ChatPromptTemplate.from_messages([
    ('human', '{input}'),
    ('ai', '{output}')
])
few_shot_prompt = FewShotChatMessagePromptTemplate(
    examples=examples,
    example_prompt=example_prompt, # formats each individual example
)

system = """You are an expert at converting user questions into TutorialSearch queries to a vector database. \
You have access to a database of tutorial videos about a software library for building LLM-powered applications. \
Given a question, return a query optimized to retrieve the most relevant results.

Rules:
- If there are acronyms or words you are not familiar with, do not try to rephrase them.
- Only set fields if they are mentioned in the user questions.
- Don't use SQL. Use the fields of the TutorialSearch model provided."""
final_prompt = ChatPromptTemplate.from_messages([
    ("system", system),
    few_shot_prompt,
    ("human", "{question}"),
])

llm = ChatOllama(model="gemma3n")
structured_llm = llm.with_structured_output(TutorialSearch)
query_analyzer = final_prompt | structured_llm

final_prompt.invoke({"question": "videos on chat langchain published in 2023"})

ChatPromptValue(messages=[SystemMessage(content="You are an expert at converting user questions into TutorialSearch queries to a vector database. You have access to a database of tutorial videos about a software library for building LLM-powered applications. Given a question, return a query optimized to retrieve the most relevant results.\n\nRules:\n- If there are acronyms or words you are not familiar with, do not try to rephrase them.\n- Only set fields if they are mentioned in the user questions.\n- Don't use SQL. Use the fields of the TutorialSearch model provided.", additional_kwargs={}, response_metadata={}), HumanMessage(content='videos on chat langchain longer than 10 minutes', additional_kwargs={}, response_metadata={}), AIMessage(content='{"content_search":"chat langchain","title_search":"langchain","min_length_sec":"600"}', additional_kwargs={}, response_metadata={}), HumanMessage(content='videos on chat langchain published in 2021', additional_kwargs={}, response_metadata={

In [154]:
# general search
query_analyzer.invoke(
    {"question": "rag from scratch"}
).pretty_print()

content_search: rag from scratch
title_search: rag


In [155]:
# search based on specific publish date
query_analyzer.invoke(
    {"question": "videos on chat langchain published in 2023"}
).pretty_print()

content_search: chat langchain
title_search: langchain
earliest_publish_date: 2023-01-01
latest_publish_date: 2023-12-31


In [157]:
# search based on publish date window
query_analyzer.invoke(
    {"question": "videos that are focused on the topic of chat langchain that are published before 2024"}
).pretty_print()

content_search: chat langchain
title_search: langchain
latest_publish_date: 2024-01-01


In [158]:
# search based on max-length
query_analyzer.invoke(
    {"question": "how to use multi-modal models in an agent, only videos under 5 minutes"}
).pretty_print()

content_search: multi-modal models
title_search: agent
max_length_sec: 300


In [160]:
# search based on min-length
query_analyzer.invoke(
    {"question": "videos on multi-modal models in an agent, only videos over 5 minutes"}
).pretty_print()

content_search: multi-modal models
title_search: agent
min_length_sec: 300


In [176]:
# search based on multiple criteria
query_analyzer.invoke(
    {"question": "videos on llm agents that have at least 100 views, under 8 minutes"}
).pretty_print()

content_search: llm agents
title_search: llm agents
min_view_count: 100
max_length_sec: 480
