# Airline Flight Insights - Full Pipeline

This notebook provides a complete Graph-RAG pipeline including:
1. **Neo4j Database Connection**
2. **LLM Setup** (Gemini)
3. **Embeddings** - Vector embeddings for semantic search
4. **Hybrid Retrieval** - Cypher + Semantic search
5. **Question Answering**

## 1. Imports and Setup

In [5]:
import json
import os
import re
from typing import List, Dict, Any, Optional, Callable, Tuple

import numpy as np
from dotenv import load_dotenv, find_dotenv
from langchain_google_genai import ChatGoogleGenerativeAI
from neo4j import GraphDatabase, Driver
from sentence_transformers import SentenceTransformer

In [14]:
# Load environment variables from the .env file located by find_dotenv()
load_dotenv(find_dotenv())

# Neo4j settings: each one prefers the NEO4J_-prefixed variable and falls back to
# the unprefixed name when the prefixed one is unset or empty.
# NOTE(review): USERNAME is commonly predefined by the OS/shell (e.g. on Windows),
# so that fallback may resolve to the login name instead of a .env value — confirm.
NEO4J_URI = os.getenv('NEO4J_URI') or os.getenv('URI')
NEO4J_USERNAME = os.getenv('NEO4J_USERNAME') or os.getenv('USERNAME')
NEO4J_PASSWORD = os.getenv('NEO4J_PASSWORD') or os.getenv('PASSWORD')
google_api_key = os.getenv('GOOGLE_API_KEY')

# Sanity check of what was loaded. The API key itself is never printed, but the
# database URI is — review the saved output before sharing the notebook.
print(f"URI: {NEO4J_URI}")
print(f"Google API key loaded: {'Yes' if google_api_key else 'No'}")

URI: neo4j+s://d9ac65c9.databases.neo4j.io
Google API key loaded: Yes


In [None]:
# Create Neo4j driver
# `driver` is a notebook-level global: run_query and ask use it directly, and the
# embedding/search helpers receive it as an argument.
driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USERNAME, NEO4J_PASSWORD))
# Check the connection right away so problems show up in this cell.
driver.verify_connectivity()
print("Connected to Neo4j!")

NameError: name 'TRUST_ALL_CERTIFICATES' is not defined

In [None]:
# Setup Gemini LLM
# `llm` is a notebook-level global used by get_context (query selection) and by the
# answer-generation functions.
# NOTE(review): temperature=0 is presumably chosen to keep replies as repeatable as
# possible — confirm.
llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash",
    google_api_key=google_api_key,
    temperature=0
)
print("Gemini LLM loaded!")

## 2. Cypher Queries

In [None]:
# Parameterised Cypher templates, grouped by intent. A query is addressed by its
# position in this list; query_descriptions lists its descriptions in the same order.
# Values for the $name placeholders are supplied as keyword arguments by run_query.
queries = [
    # Intent 1: Operational Delay Diagnostics
    "MATCH (j:Journey)-[:ON]->(f:Flight)-[:ARRIVES_AT]->(a:Airport) RETURN a.station_code AS destination, SUM(j.arrival_delay_minutes) AS total_delay ORDER BY total_delay DESC LIMIT $x",
    "MATCH (j:Journey)-[:ON]->(f:Flight)-[:ARRIVES_AT]->(a:Airport) RETURN a.station_code AS destination, SUM(j.arrival_delay_minutes) AS total_delay ORDER BY total_delay ASC LIMIT $x",
    "MATCH (j:Journey)-[:ON]->(f:Flight)-[:DEPARTS_FROM]->(a:Airport) RETURN a.station_code AS origin, SUM(j.arrival_delay_minutes) AS total_delay ORDER BY total_delay DESC LIMIT $x",
    "MATCH (j:Journey)-[:ON]->(f:Flight)-[:DEPARTS_FROM]->(a:Airport) RETURN a.station_code AS origin, SUM(j.arrival_delay_minutes) AS total_delay ORDER BY total_delay ASC LIMIT $x",
    "MATCH (o:Airport {station_code: $origin_station_code})<-[:DEPARTS_FROM]-(f:Flight)-[:ARRIVES_AT]->(d:Airport), (j:Journey)-[:ON]->(f) WITH o, d, AVG(j.arrival_delay_minutes) AS avg_delay WHERE avg_delay > $x RETURN o.station_code AS origin, d.station_code AS destination, avg_delay",
    "MATCH (j:Journey {number_of_legs: $x}) RETURN AVG(j.arrival_delay_minutes) AS avg_delay",
    # Intent 2: Service Quality
    "MATCH (o:Airport)<-[:DEPARTS_FROM]-(f:Flight)-[:ARRIVES_AT]->(d:Airport), (j:Journey {passenger_class: $class_name})-[:ON]->(f) WITH o, d, AVG(j.food_satisfaction_score) AS avg_food_score WHERE avg_food_score < $threshold RETURN o.station_code AS origin, d.station_code AS destination, avg_food_score",
    "MATCH (j:Journey {food_satisfaction_score: 1})-[:ON]->(f:Flight) WHERE j.actual_flown_miles > $x RETURN DISTINCT f.flight_number",
    # Intent 3: Fleet Performance
    "MATCH (j:Journey)-[:ON]->(f:Flight) WHERE j.arrival_delay_minutes > $x RETURN f.fleet_type_description AS aircraft_type, COUNT(j) AS delay_frequency ORDER BY delay_frequency DESC LIMIT 1",
    "MATCH (j:Journey)-[:ON]->(f:Flight {fleet_type_description: $x}) RETURN AVG(j.food_satisfaction_score) AS avg_food_score",
    "MATCH (j:Journey)-[:ON]->(f:Flight {fleet_type_description: $x}) RETURN AVG(j.actual_flown_miles) AS avg_miles",
    "MATCH (j:Journey)-[:ON]->(f:Flight {fleet_type_description: $x}) WITH COUNT(j) AS total_flights, COUNT(CASE WHEN j.arrival_delay_minutes < 0 THEN 1 END) AS early_flights RETURN (TOFLOAT(early_flights) / total_flights) * 100 AS early_arrival_percentage",
    # Intent 4: Loyalty
    "MATCH (p:Passenger {loyalty_program_level: $loyalty_program_level})-[:TOOK]->(j:Journey) RETURN AVG(j.arrival_delay_minutes) AS avg_delay",
    "MATCH (p:Passenger {loyalty_program_level: $loyalty_program_level})-[:TOOK]->(j:Journey) WHERE j.arrival_delay_minutes > $x RETURN p.record_locator AS passenger_id, j.arrival_delay_minutes AS delay",
    # Intent 5: Demographics
    "MATCH (p:Passenger {generation: $generation})-[:TOOK]->(j:Journey)-[:ON]->(f:Flight) WHERE j.actual_flown_miles > $threshold RETURN f.fleet_type_description AS aircraft_type, COUNT(f) AS usage_count ORDER BY usage_count DESC LIMIT 1",
    "MATCH (p:Passenger {generation: $generation})-[:TOOK]->(j:Journey)-[:ON]->(f:Flight) RETURN f.fleet_type_description AS fleet_type, COUNT(f) AS usage_count ORDER BY usage_count DESC LIMIT 1",
    "MATCH (p:Passenger {generation: $generation})-[:TOOK]->(j:Journey)-[:ON]->(f:Flight)-[:ARRIVES_AT]->(a:Airport) RETURN a.station_code AS destination, COUNT(p) AS passenger_volume ORDER BY passenger_volume DESC LIMIT $x"
]

# Natural-language description of each entry in `queries`, in the same order.
# The ${name} markers name the query parameters: format_query_result replaces them
# with the actual values, and get_context rewrites them as <name> before showing the
# descriptions to the LLM.
query_descriptions = [
    "Identify the top ${x} destination stations with the highest accumulated arrival delay minutes.",
    "Identify the top ${x} destination stations with the lowest accumulated arrival delay minutes.",
    "Identify the top ${x} origin stations with the highest accumulated arrival delay minutes.",
    "Identify the top ${x} origin stations with the lowest accumulated arrival delay minutes.",
    "Find routes from the origin station ${origin_station_code} where the average arrival delay exceeds ${x} minutes.",
    "Calculate the average arrival delay for flights consisting of exactly ${x} legs.",
    "Identify routes for the passenger class ${class_name} where the average food satisfaction score is below ${threshold}.",
    "List the flight numbers for journeys longer than ${x} miles where the food satisfaction score was 1.",
    "Identify the aircraft type that has the highest frequency of arrival delays greater than ${x} minutes.",
    "Calculate the average food satisfaction score for passengers flying on the ${x} fleet.",
    "Calculate the average actual flown miles for the ${x} fleet.",
    "Calculate the percentage of early arrivals for the ${x} fleet.",
    "Calculate the average arrival delay experienced by passengers with the loyalty level ${loyalty_program_level}.",
    "Find the record locators for passengers with loyalty level ${loyalty_program_level} who experienced a delay greater than ${x} minutes.",
    "Identify the most common aircraft type used by the ${generation} generation for journeys exceeding ${threshold} miles.",
    "Identify the most frequently used fleet type for the ${generation} generation.",
    "Identify the top ${x} destination stations for the ${generation} generation based on passenger volume."
]

# Quick confirmation that the cell ran; only the length of `queries` is reported.
print(f"Loaded {len(queries)} queries")

## 3. Query Execution Functions

In [None]:
def run_query(query_index: int, **params) -> list:
    """Execute one of the predefined Cypher queries.

    Args:
        query_index: Position of the query in the module-level ``queries`` list.
        **params: Values for the query's ``$name`` placeholders, forwarded to
            ``session.run`` as keyword arguments.

    Returns:
        A list with one ``record.data()`` dict per result record.

    Raises:
        ValueError: If ``query_index`` does not address an entry of ``queries``.
    """
    if query_index < 0 or query_index >= len(queries):
        raise ValueError(f"Query index {query_index} out of range (0-{len(queries)-1})")

    cypher = queries[query_index]
    rows = []
    with driver.session() as session:
        # Materialise the records while the session is still open.
        for record in session.run(cypher, **params):
            rows.append(record.data())
    return rows

In [None]:
def _parse_query_selection(response_text: str) -> list:
    """Extract the validated query selections from a raw LLM reply.

    Markdown code fences are stripped, the outermost ``[...]`` span is parsed as
    JSON, and every entry is normalised to a dict holding an integer
    ``query_index`` and a dict ``params``. Entries that are not dicts or have no
    usable ``query_index`` are skipped. Returns ``[]`` when nothing can be parsed.
    """
    cleaned = response_text.strip().replace('```json', '').replace('```', '').strip()

    json_match = re.search(r'\[.*\]', cleaned, re.DOTALL)
    if not json_match:
        return []

    try:
        parsed = json.loads(json_match.group())
    except json.JSONDecodeError as e:
        print(f"JSON parse error: {e}")
        return []
    if not isinstance(parsed, list):
        return []

    selections = []
    for item in parsed:
        if not isinstance(item, dict):
            continue
        try:
            query_index = int(item["query_index"])
        except (KeyError, TypeError, ValueError):
            # Callers index by 'query_index'; an entry without a usable one is useless.
            continue
        params = item.get("params")
        # Callers expand 'params' with **, so it must always be a dict.
        selections.append({
            **item,
            "query_index": query_index,
            "params": params if isinstance(params, dict) else {},
        })
    return selections


def get_context(prompt: str) -> list:
    """Use Gemini to identify relevant queries and extract parameters.

    The query descriptions are listed (with ``${name}`` shown as ``<name>``) in a
    prompt that asks the model for a JSON array of query selections.

    Args:
        prompt: The user's question.

    Returns:
        A list of dicts, each with an integer ``"query_index"`` and a dict
        ``"params"``. The list is empty when the reply contains no parseable
        JSON array.
    """
    safe_descriptions = [desc.replace('${', '<').replace('}', '>') for desc in query_descriptions]
    query_list = "\n".join([f"{i}: {desc}" for i, desc in enumerate(safe_descriptions)])
    
    full_prompt = (
        "You are an expert at analyzing user questions about airline flight data.\n\n"
        "Available queries:\n" + query_list + "\n\n"
        "Your task:\n"
        "1. Identify which query indices (0-" + str(len(queries)-1) + ") provide useful context\n"
        "2. Extract ALL required parameters\n\n"
        "Parameters:\n"
        "- x: number (default: 5 for counts, 30 for delays)\n"
        "- origin_station_code: airport code like 'LAX'\n"
        "- class_name: 'Economy', 'Business', 'First'\n"
        "- threshold: numeric (default: 1000 for miles)\n"
        "- loyalty_program_level: 'Gold', 'Silver', 'Platinum'\n"
        "- generation: 'Millennial', 'Gen X', 'Baby Boomer', 'Gen Z'\n\n"
        'Return ONLY a valid JSON array: [{"query_index": 0, "params": {"x": 3}}]\n\n'
        "User question: " + prompt + "\n\nJSON:"
    )
    
    response = llm.invoke(full_prompt)
    return _parse_query_selection(response.content)

In [None]:
def format_query_result(query_index: int, **params) -> str:
    """Run a predefined query and render its rows as a text block for the LLM.

    The query's description has its ``${name}`` markers replaced by the supplied
    parameter values and is used as the heading. Each result row becomes one
    bullet line; float values are shown with two decimals.

    Returns:
        The formatted context, or a message describing an out-of-range index,
        a query failure, or an empty result.
    """
    if query_index < 0 or query_index >= len(queries):
        return f"Error: Query index {query_index} out of range."

    description = query_descriptions[query_index]
    for name, value in params.items():
        description = description.replace(f"${{{name}}}", str(value))

    try:
        rows = run_query(query_index, **params)
    except Exception as e:
        return f'Error for "{description}": {e}'

    if not rows:
        return f'"{description}": No data found.'

    bullet_lines = []
    for row in rows:
        fields = []
        for key, value in row.items():
            if isinstance(value, float):
                fields.append(f"{key}: {value:.2f}")
            else:
                fields.append(f"{key}: {value}")
        bullet_lines.append("  - " + ", ".join(fields))

    return f'"{description}":\n' + "\n".join(bullet_lines)

## 4. Embeddings Module

Vector embeddings for semantic search. Models:
- `minilm`: all-MiniLM-L6-v2 (384 dims, fast)
- `mpnet`: all-mpnet-base-v2 (768 dims, higher quality)

In [None]:
# Embedding model configurations
# Each short model key maps to:
#   name          - model name passed to SentenceTransformer
#   dimensions    - vector size used when creating the Neo4j vector index
#   property_name - node property that stores the embedding
EMBEDDING_MODELS = {
    "minilm": {"name": "all-MiniLM-L6-v2", "dimensions": 384, "property_name": "embedding_minilm"},
    "mpnet": {"name": "all-mpnet-base-v2", "dimensions": 768, "property_name": "embedding_mpnet"}
}

# Already-loaded models keyed by model key; filled on demand by get_model.
_model_cache: Dict[str, SentenceTransformer] = {}

def get_model(model_key: str) -> SentenceTransformer:
    """Return the embedding model for ``model_key``, loading it on first use.

    Loaded models are kept in ``_model_cache`` so each one is constructed once.

    Raises:
        ValueError: If ``model_key`` is not a key of ``EMBEDDING_MODELS``.
    """
    if model_key not in EMBEDDING_MODELS:
        raise ValueError(f"Unknown model: {model_key}. Use 'minilm' or 'mpnet'")

    model = _model_cache.get(model_key)
    if model is None:
        model_name = EMBEDDING_MODELS[model_key]["name"]
        print(f"Loading {model_name}...")
        model = SentenceTransformer(model_name)
        _model_cache[model_key] = model
        print("Model loaded!")
    return model

In [None]:
def create_journey_text(props: Dict[str, Any]) -> str:
    """Create text representation of a Journey node.

    Args:
        props: Journey properties. Recognised keys are ``passenger_class``,
            ``food_satisfaction_score``, ``arrival_delay_minutes``,
            ``actual_flown_miles`` and ``number_of_legs``; other keys are ignored.
            A key that is missing or whose value is ``None`` falls back to its
            default ("Unknown", "N/A", 0, 0 and 1 respectively).

    Returns:
        A sentence describing class, distance, segments, punctuality and food
        satisfaction.
    """
    def _value_or(key: str, default: Any) -> Any:
        # dict.get's default only covers absent keys; records read from the graph
        # carry every key, with None for properties that are not set.
        value = props.get(key)
        return default if value is None else value

    passenger_class = _value_or("passenger_class", "Unknown")
    food_score = _value_or("food_satisfaction_score", "N/A")
    delay = _value_or("arrival_delay_minutes", 0)
    miles = _value_or("actual_flown_miles", 0)
    legs = _value_or("number_of_legs", 1)

    # A negative delay means the flight arrived ahead of schedule.
    if delay < 0:
        delay_text = f"arrived {abs(delay)} minutes early"
    elif delay == 0:
        delay_text = "on time"
    else:
        delay_text = f"delayed {delay} minutes"

    food_labels = {1: "very poor", 2: "poor", 3: "average", 4: "good", 5: "excellent"}
    satisfaction = food_labels.get(food_score, "unknown")
    plural = "s" if legs > 1 else ""

    return f"A {passenger_class} class journey covering {miles:.0f} miles over {legs} segment{plural}. Flight {delay_text}, {satisfaction} food satisfaction (score: {food_score}/5)."


def create_flight_text(props: Dict[str, Any], origin: str = None, destination: str = None) -> str:
    """Describe a Flight node in one sentence.

    Args:
        props: Flight properties; ``flight_number`` and ``fleet_type_description``
            are read, with "Unknown" / "Unknown aircraft" used for absent keys.
        origin: Optional origin label; omitted from the text when falsy.
        destination: Optional destination label; omitted from the text when falsy.

    Returns:
        The flight description, including whichever route endpoints were given.
    """
    flight_num = props.get("flight_number", "Unknown")
    fleet = props.get("fleet_type_description", "Unknown aircraft")

    if origin and destination:
        route = f" from {origin} to {destination}"
    elif origin:
        route = f" from {origin}"
    elif destination:
        route = f" to {destination}"
    else:
        route = ""

    return f"Flight {flight_num} operated by {fleet}{route}."


def create_passenger_text(props: Dict[str, Any]) -> str:
    """Describe a Passenger node by generation and loyalty level.

    Absent ``generation`` or ``loyalty_program_level`` keys are rendered as
    "unknown".
    """
    generation = props.get('generation', 'unknown')
    loyalty_level = props.get('loyalty_program_level', 'unknown')
    return f"A {generation} passenger with {loyalty_level} loyalty status."

In [None]:
def generate_embeddings(texts: List[str], model_key: str = "minilm") -> np.ndarray:
    """Encode a batch of texts with the selected embedding model.

    Args:
        texts: Texts to encode.
        model_key: Key of the model to use (see ``EMBEDDING_MODELS``).

    Returns:
        The result of ``model.encode`` requested as a numpy array; a progress
        bar is shown while encoding.
    """
    encoder = get_model(model_key)
    vectors = encoder.encode(texts, show_progress_bar=True, convert_to_numpy=True)
    return vectors


def generate_single_embedding(text: str, model_key: str = "minilm") -> List[float]:
    """Encode one text and return its embedding as a plain Python list.

    Args:
        text: Text to encode.
        model_key: Key of the model to use (see ``EMBEDDING_MODELS``).
    """
    encoder = get_model(model_key)
    vector = encoder.encode(text, convert_to_numpy=True)
    return vector.tolist()

In [None]:
def fetch_journey_nodes(driver: Driver) -> List[Dict[str, Any]]:
    """Read every Journey node and the properties used to describe it.

    Returns:
        One dict per Journey with ``"feedback_ID"`` and ``"properties"``; the
        latter holds all returned columns (including ``feedback_ID``).
    """
    query = """
    MATCH (j:Journey)
    RETURN j.feedback_ID AS feedback_ID, j.passenger_class AS passenger_class,
           j.food_satisfaction_score AS food_satisfaction_score,
           j.arrival_delay_minutes AS arrival_delay_minutes,
           j.actual_flown_miles AS actual_flown_miles, j.number_of_legs AS number_of_legs
    """
    journeys = []
    with driver.session() as session:
        for record in session.run(query):
            journeys.append({
                "feedback_ID": record["feedback_ID"],
                "properties": dict(record),
            })
    return journeys


def fetch_flight_nodes(driver: Driver) -> List[Dict[str, Any]]:
    """Read every Flight node together with its origin and destination codes.

    The airport relationships are matched optionally, so ``origin`` and
    ``destination`` may be ``None``.

    Returns:
        One dict per result row with ``flight_number``,
        ``fleet_type_description``, a ``properties`` dict repeating those two
        values, ``origin`` and ``destination``.
    """
    query = """
    MATCH (f:Flight)
    OPTIONAL MATCH (f)-[:DEPARTS_FROM]->(origin:Airport)
    OPTIONAL MATCH (f)-[:ARRIVES_AT]->(dest:Airport)
    RETURN f.flight_number AS flight_number, f.fleet_type_description AS fleet_type_description,
           origin.station_code AS origin, dest.station_code AS destination
    """
    flights = []
    with driver.session() as session:
        for record in session.run(query):
            flight_number = record["flight_number"]
            fleet_type = record["fleet_type_description"]
            flights.append({
                "flight_number": flight_number,
                "fleet_type_description": fleet_type,
                "properties": {
                    "flight_number": flight_number,
                    "fleet_type_description": fleet_type,
                },
                "origin": record["origin"],
                "destination": record["destination"],
            })
    return flights

In [None]:
def create_vector_index(driver: Driver, model_key: str, node_label: str = "Journey"):
    """Create a vector index in Neo4j.

    The index is named ``<label in lower case>_<embedding property>`` and covers
    the embedding property of ``node_label`` nodes, using the model's dimensions
    and cosine similarity. An existing index of that name is dropped first so
    the index is rebuilt.

    Args:
        driver: Neo4j driver used to open the session.
        model_key: Key into ``EMBEDDING_MODELS``.
        node_label: Label of the nodes to index.

    Raises:
        KeyError: If ``model_key`` is not in ``EMBEDDING_MODELS``.
    """
    config = EMBEDDING_MODELS[model_key]
    index_name = f"{node_label.lower()}_{config['property_name']}"
    
    create_query = f"""
    CREATE VECTOR INDEX {index_name} IF NOT EXISTS
    FOR (n:{node_label}) ON n.{config['property_name']}
    OPTIONS {{indexConfig: {{
        `vector.dimensions`: {config['dimensions']},
        `vector.similarity_function`: 'cosine'
    }}}}
    """
    with driver.session() as session:
        try:
            session.run(f"DROP INDEX {index_name} IF EXISTS")
        except Exception as exc:
            # Dropping is best-effort, but report the failure instead of hiding it;
            # a bare except would also swallow KeyboardInterrupt.
            print(f"Warning: could not drop index {index_name}: {exc}")
        session.run(create_query)
        print(f"Created index: {index_name}")

In [None]:
def store_journey_embeddings(driver: Driver, feedback_ids: List[str], embeddings: np.ndarray, 
                              model_key: str = "minilm", batch_size: int = 100):
    """Write embeddings onto Journey nodes in batches.

    ``embeddings[i]`` is stored on the Journey whose ``feedback_ID`` equals
    ``feedback_ids[i]``, in the property configured for ``model_key``.
    Progress is printed after every batch.
    """
    property_name = EMBEDDING_MODELS[model_key]["property_name"]
    query = f"UNWIND $batch AS item MATCH (j:Journey {{feedback_ID: item.feedback_ID}}) SET j.{property_name} = item.embedding"

    with driver.session() as session:
        for start in range(0, len(feedback_ids), batch_size):
            stop = min(start + batch_size, len(feedback_ids))
            batch = []
            for idx in range(start, stop):
                batch.append({
                    "feedback_ID": feedback_ids[idx],
                    "embedding": embeddings[idx].tolist(),
                })
            session.run(query, batch=batch)
            print(f"Stored {stop}/{len(feedback_ids)}...")


def store_flight_embeddings(driver: Driver, flights: List[Dict], embeddings: np.ndarray,
                            model_key: str = "minilm", batch_size: int = 100):
    """Write embeddings onto Flight nodes in batches.

    ``embeddings[i]`` is stored on the Flight nodes matching the
    ``flight_number`` and ``fleet_type_description`` of ``flights[i]``, in the
    property configured for ``model_key``. Progress is printed after every batch.
    """
    property_name = EMBEDDING_MODELS[model_key]["property_name"]
    query = f"UNWIND $batch AS item MATCH (f:Flight {{flight_number: item.flight_number, fleet_type_description: item.fleet_type_description}}) SET f.{property_name} = item.embedding"

    with driver.session() as session:
        for start in range(0, len(flights), batch_size):
            stop = min(start + batch_size, len(flights))
            batch = []
            for idx in range(start, stop):
                batch.append({
                    "flight_number": flights[idx]["flight_number"],
                    "fleet_type_description": flights[idx]["fleet_type_description"],
                    "embedding": embeddings[idx].tolist(),
                })
            session.run(query, batch=batch)
            print(f"Stored {stop}/{len(flights)}...")

In [None]:
def semantic_search_journeys(driver: Driver, query_text: str, model_key: str = "minilm", top_k: int = 5) -> List[Dict]:
    """Find the Journey nodes most similar to ``query_text``.

    The text is embedded with the selected model and looked up in the Journey
    vector index for that model.

    Returns:
        One dict per hit containing the returned Journey columns, ``score`` and
        a ``similarity_score`` copy of the score.
    """
    query_embedding = generate_single_embedding(query_text, model_key)
    index_name = f"journey_{EMBEDDING_MODELS[model_key]['property_name']}"

    search_query = f"""
    CALL db.index.vector.queryNodes('{index_name}', $top_k, $query_embedding)
    YIELD node, score
    RETURN node.feedback_ID AS feedback_ID, node.passenger_class AS passenger_class,
           node.food_satisfaction_score AS food_satisfaction_score,
           node.arrival_delay_minutes AS arrival_delay_minutes,
           node.actual_flown_miles AS actual_flown_miles, node.number_of_legs AS number_of_legs, score
    ORDER BY score DESC
    """
    hits = []
    with driver.session() as session:
        for record in session.run(search_query, top_k=top_k, query_embedding=query_embedding):
            hit = dict(record)
            hit["similarity_score"] = record["score"]
            hits.append(hit)
    return hits


def semantic_search_flights(driver: Driver, query_text: str, model_key: str = "minilm", top_k: int = 5) -> List[Dict]:
    """Semantic search on Flight nodes.

    The text is embedded with the selected model and looked up in the Flight
    vector index for that model. Airports are matched optionally, mirroring
    fetch_flight_nodes, so a flight that lacks an origin or destination
    relationship is still returned (with ``None`` for the missing code) rather
    than being silently dropped from the hits.

    Returns:
        One dict per result row with ``flight_number``,
        ``fleet_type_description``, ``origin``, ``destination``, ``score`` and a
        ``similarity_score`` copy of the score.
    """
    query_embedding = generate_single_embedding(query_text, model_key)
    index_name = f"flight_{EMBEDDING_MODELS[model_key]['property_name']}"
    
    search_query = f"""
    CALL db.index.vector.queryNodes('{index_name}', $top_k, $query_embedding)
    YIELD node, score
    OPTIONAL MATCH (node)-[:DEPARTS_FROM]->(origin:Airport)
    OPTIONAL MATCH (node)-[:ARRIVES_AT]->(dest:Airport)
    RETURN node.flight_number AS flight_number, node.fleet_type_description AS fleet_type_description,
           origin.station_code AS origin, dest.station_code AS destination, score
    ORDER BY score DESC
    """
    with driver.session() as session:
        result = session.run(search_query, top_k=top_k, query_embedding=query_embedding)
        return [{**dict(r), "similarity_score": r["score"]} for r in result]

In [None]:
def format_embedding_results(results: List[Dict], node_type: str = "Journey") -> str:
    """Render semantic-search hits as a numbered text block.

    Args:
        results: Hits as returned by the semantic search functions; each must
            carry ``similarity_score``.
        node_type: ``"Journey"`` renders hits with create_journey_text; any
            other value renders them with create_flight_text.

    Returns:
        A header line followed by one numbered line per hit, or a
        "No similar ... nodes found." message when ``results`` is empty.
    """
    if not results:
        return f"No similar {node_type} nodes found."

    lines = [f"Found {len(results)} similar {node_type} records:"]
    for rank, hit in enumerate(results, 1):
        if node_type == "Journey":
            text = create_journey_text(hit)
        else:
            flight_props = {
                "flight_number": hit.get("flight_number"),
                "fleet_type_description": hit.get("fleet_type_description"),
            }
            text = create_flight_text(flight_props, hit.get("origin"), hit.get("destination"))
        lines.append(f"  {rank}. (score: {hit['similarity_score']:.3f}) {text}")
    return "\n".join(lines)


def get_embedding_context(driver: Driver, query: str, model_key: str = "minilm", top_k: int = 5) -> str:
    """Build the semantic-search context for a question.

    Journeys and flights are searched independently; a failure in one search is
    reported as an error line without preventing the other.

    Returns:
        The Journey section and the Flight section separated by a blank line.
    """
    sections = []

    try:
        journey_hits = semantic_search_journeys(driver, query, model_key, top_k)
        sections.append(format_embedding_results(journey_hits, "Journey"))
    except Exception as e:
        sections.append(f"Journey search error: {e}")

    try:
        flight_hits = semantic_search_flights(driver, query, model_key, top_k)
        sections.append(format_embedding_results(flight_hits, "Flight"))
    except Exception as e:
        sections.append(f"Flight search error: {e}")

    return "\n\n".join(sections)

In [None]:
def generate_and_store_all_embeddings(driver: Driver, model_key: str = "minilm"):
    """Embed every Journey and Flight node and persist the vectors in Neo4j.

    For each node type the nodes are fetched, turned into text, encoded with the
    selected model, the vector index is (re)created, and the embeddings are
    written back. A node type with no nodes is skipped. Progress is printed
    throughout.
    """
    separator = '=' * 60
    print(f"\n{separator}")
    print(f"Generating embeddings with {EMBEDDING_MODELS[model_key]['name']}")
    print(f"{separator}\n")

    # Journeys
    print("Fetching Journey nodes...")
    journeys = fetch_journey_nodes(driver)
    print(f"Found {len(journeys)} journeys")

    if journeys:
        journey_texts = [create_journey_text(journey["properties"]) for journey in journeys]
        journey_vectors = generate_embeddings(journey_texts, model_key)
        create_vector_index(driver, model_key, "Journey")
        journey_ids = [journey["feedback_ID"] for journey in journeys]
        store_journey_embeddings(driver, journey_ids, journey_vectors, model_key)

    # Flights
    print("\nFetching Flight nodes...")
    flights = fetch_flight_nodes(driver)
    print(f"Found {len(flights)} flights")

    if flights:
        flight_texts = [
            create_flight_text(flight["properties"], flight["origin"], flight["destination"])
            for flight in flights
        ]
        flight_vectors = generate_embeddings(flight_texts, model_key)
        create_vector_index(driver, model_key, "Flight")
        store_flight_embeddings(driver, flights, flight_vectors, model_key)

    print("\nEmbedding generation complete!")

## 5. Hybrid Retrieval

Combines Cypher queries with embedding-based semantic search.

In [None]:
def get_hybrid_context(driver: Driver, prompt: str, model_key: str = "minilm", top_k: int = 3) -> Dict[str, Any]:
    """Collect context from the predefined Cypher queries and from semantic search.

    Failures on either side are captured as error text rather than raised.

    Returns:
        A dict with ``cypher_context`` (list of formatted query results),
        ``embedding_context`` (semantic-search text) and ``combined_context``
        (both, under labelled headings).
    """
    # Structured side: let the LLM choose queries, then run and format each one.
    cypher_sections = []
    try:
        for selection in get_context(prompt):
            cypher_sections.append(
                format_query_result(selection['query_index'], **selection['params'])
            )
    except Exception as e:
        cypher_sections.append(f"Cypher error: {e}")

    # Semantic side.
    try:
        embedding_text = get_embedding_context(driver, prompt, model_key, top_k)
    except Exception as e:
        embedding_text = f"Embedding error: {e}"

    cypher_text = '\n\n'.join(cypher_sections)
    combined = f"=== STRUCTURED QUERY RESULTS ===\n{cypher_text}\n\n=== SEMANTIC SEARCH RESULTS ===\n{embedding_text}"

    return {
        'cypher_context': cypher_sections,
        'embedding_context': embedding_text,
        'combined_context': combined,
    }

In [None]:
def answer_with_hybrid_context(driver: Driver, question: str, model_key: str = "minilm") -> str:
    """Answer ``question`` with the LLM, grounded in hybrid retrieval context.

    The combined Cypher + semantic-search context (five semantic hits per node
    type) is placed in a prompt that tells the model to rely on it alone.

    Returns:
        The content of the LLM response.
    """
    retrieval = get_hybrid_context(driver, question, model_key, top_k=5)
    context = retrieval['combined_context']

    prompt = f"""You are an AI assistant for an airline company analyzing flight data.

Based on this context from our knowledge graph, answer the user's question.
Only use information from the context. If insufficient, say so.

CONTEXT:
{context}

USER QUESTION: {question}

ANSWER:"""

    response = llm.invoke(prompt)
    return response.content

In [None]:
def compare_retrieval_methods(driver: Driver, question: str) -> Dict[str, Any]:
    """Gather the context each retrieval method produces for one question.

    Returns:
        A dict with ``cypher_only`` (list of formatted query results, or a
        single error entry), ``embedding_minilm`` / ``embedding_mpnet``
        (semantic-search text or an error message) and ``hybrid_minilm`` /
        ``hybrid_mpnet`` (the dicts returned by get_hybrid_context).
    """
    # Cypher only
    cypher_only = []
    try:
        for selection in get_context(question):
            cypher_only.append(
                format_query_result(selection['query_index'], **selection['params'])
            )
    except Exception as e:
        # Any partial results are replaced by the error entry.
        cypher_only = [f"Error: {e}"]

    # Embeddings, one pass per model
    embedding_contexts = {}
    for key in ('minilm', 'mpnet'):
        try:
            embedding_contexts[key] = get_embedding_context(driver, question, key, 5)
        except Exception as e:
            embedding_contexts[key] = f"Error: {e}"

    # Hybrid contexts are evaluated in order: MiniLM first, then MPNet.
    return {
        'cypher_only': cypher_only,
        'embedding_minilm': embedding_contexts['minilm'],
        'embedding_mpnet': embedding_contexts['mpnet'],
        'hybrid_minilm': get_hybrid_context(driver, question, "minilm", 5),
        'hybrid_mpnet': get_hybrid_context(driver, question, "mpnet", 5),
    }


def print_comparison(results: Dict[str, Any]):
    """Print the output of compare_retrieval_methods section by section.

    The Cypher-only and embedding sections are always printed; a hybrid section
    is printed only when its entry is truthy.
    """
    rule = "=" * 80
    print(rule + "\nRETRIEVAL METHOD COMPARISON\n" + rule)

    print("\n--- CYPHER ONLY ---")
    for ctx in results['cypher_only']:
        print(ctx + "\n")

    print("\n--- EMBEDDING (MiniLM) ---\n" + results['embedding_minilm'])
    print("\n--- EMBEDDING (MPNet) ---\n" + results['embedding_mpnet'])

    hybrid_minilm = results['hybrid_minilm']
    if hybrid_minilm:
        print("\n--- HYBRID (MiniLM) ---\n" + hybrid_minilm['combined_context'])

    hybrid_mpnet = results['hybrid_mpnet']
    if hybrid_mpnet:
        print("\n--- HYBRID (MPNet) ---\n" + hybrid_mpnet['combined_context'])

## 6. Interactive Q&A

In [None]:
def ask(question: str, use_hybrid: bool = True, model_key: str = "minilm") -> str:
    """Answer a question end to end, printing the question, mode and answer.

    Args:
        question: The user's question.
        use_hybrid: When true, answer from Cypher plus semantic-search context;
            otherwise use only the results of the predefined Cypher queries.
        model_key: Embedding model used in hybrid mode.

    Returns:
        The LLM's answer text.
    """
    print(f"\n{'='*60}\nQ: {question}\nMode: {'Hybrid' if use_hybrid else 'Cypher Only'}\n{'='*60}\n")

    if not use_hybrid:
        # Cypher-only path: run the selected queries and prompt the LLM directly.
        context_parts = []
        for selection in get_context(question):
            context_parts.append(
                format_query_result(selection['query_index'], **selection['params'])
            )
        prompt = f"""You are an AI assistant for an airline analyzing flight data.
Answer using only this context:

{chr(10).join(context_parts)}

Question: {question}

Answer:"""
        answer = llm.invoke(prompt).content
    else:
        answer = answer_with_hybrid_context(driver, question, model_key)

    print(f"ANSWER:\n{'-'*40}\n{answer}\n")
    return answer

## 7. Usage Examples

In [None]:
# Generate embeddings (run once)
# One-time setup: embeds every Journey and Flight node with the chosen model and
# (re)creates the vector indexes that the semantic search functions query. It is
# left commented out so that re-running the notebook does not repeat the work.
# generate_and_store_all_embeddings(driver, "minilm")

print("Uncomment the line above to generate embeddings.")

In [None]:
# Example questions, each answered with the default hybrid retrieval mode.
# ask() prints the question, the mode and the answer itself.
ask("What are the top 5 airports with the most delays?")
ask("How do Millennials travel compared to Baby Boomers?")
ask("Which aircraft type has the best on-time performance?")

# The examples above run as-is, so the closing message no longer asks the reader
# to uncomment anything.
print("Example questions complete.")

In [None]:
# Close driver when done
# driver.close()
# print("Closed.")