# Airline Flight Insights - Full Pipeline

This notebook provides a complete Graph-RAG pipeline including:
1. **Neo4j Database Connection**
2. **LLM Setup** (Groq and Gemini)
3. **Embeddings** - Vector embeddings for semantic search
4. **Hybrid Retrieval** - Cypher + Semantic search
5. **Question Answering**

## 1. Imports and Setup

In [1]:
from neo4j import GraphDatabase, Driver
from dotenv import load_dotenv, find_dotenv
from langchain_groq import ChatGroq
from langchain_google_genai import ChatGoogleGenerativeAI
from sentence_transformers import SentenceTransformer
from typing import List, Dict, Any, Optional, Callable, Tuple
import numpy as np
import os
import json
import re

In [2]:
# Load environment variables
# find_dotenv() is asked to locate a .env file and load_dotenv() loads it, so the
# os.getenv() calls below can see values defined there.
load_dotenv(find_dotenv())

# Each Neo4j/Groq setting is read from its long variable name first and falls back
# to a shorter alias when the long name is unset or empty.
NEO4J_URI = os.getenv('NEO4J_URI') or os.getenv('URI')
# NOTE(review): USERNAME is often already defined by the operating system (e.g. the
# Windows login name), so this fallback may not return the .env value — confirm.
NEO4J_USERNAME = os.getenv('NEO4J_USERNAME') or os.getenv('USERNAME')
NEO4J_PASSWORD = os.getenv('NEO4J_PASSWORD') or os.getenv('PASSWORD')
groq_api_key = os.getenv('GROQ_API_KEY') or os.getenv('GROQ')
# The Gemini key has no alias and its presence is not reported below.
gemini_api_key = os.getenv('GOOGLE_API_KEY')

# Only whether a key was found is printed, never the key itself; the URI is printed in full.
print(f"URI: {NEO4J_URI}")
print(f"Groq API key loaded: {'Yes' if groq_api_key else 'No'}")

URI: neo4j+s://d9ac65c9.databases.neo4j.io
Groq API key loaded: Yes


In [3]:
# Create Neo4j driver
# `driver` is a module-level object: run_query() reads it as a global, while the
# embedding helpers receive it as an explicit argument.
driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USERNAME, NEO4J_PASSWORD))
# Check the connection immediately so a bad URI or bad credentials fail in this cell.
driver.verify_connectivity()
print("Connected to Neo4j!")

Connected to Neo4j!


In [4]:
# Groq: FREE, 30+ requests/minute, very fast!
# NOTE(review): `llm` is not referenced by name inside the helpers visible in this
# notebook; presumably it is the model passed to answer_with_hybrid_context — confirm.
llm = ChatGroq(
    model="llama-3.3-70b-versatile",
    api_key=groq_api_key,
    temperature=0
)
print("Groq LLM loaded! (llama-3.3-70b-versatile)")

# `gemini` is the model get_context() invokes to choose queries and extract parameters.
gemini = ChatGoogleGenerativeAI(
    api_key=gemini_api_key,
    model="gemini-2.0-flash-exp",
    temperature=0
)
print("Gemini LLM loaded! (gemini-2.0-flash-exp)")

Groq LLM loaded! (llama-3.3-70b-versatile)
Gemini LLM loaded! (gemini-2.0-flash-exp)


## 2. Cypher Queries

In [5]:
# Parameterised Cypher templates, grouped by intent. A query is selected by its
# position in this list (run_query / format_query_result index into it), and
# query_descriptions is indexed with the same position, so both lists must keep
# the same order and length. `$name` tokens are Cypher parameters supplied at run time.
queries = [
    # Intent 1: Operational Delay Diagnostics
    "MATCH (j:Journey)-[:ON]->(f:Flight)-[:ARRIVES_AT]->(a:Airport) RETURN a.station_code AS destination, SUM(j.arrival_delay_minutes) AS total_delay ORDER BY total_delay DESC LIMIT $x",
    "MATCH (j:Journey)-[:ON]->(f:Flight)-[:ARRIVES_AT]->(a:Airport) RETURN a.station_code AS destination, SUM(j.arrival_delay_minutes) AS total_delay ORDER BY total_delay ASC LIMIT $x",
    "MATCH (j:Journey)-[:ON]->(f:Flight)-[:DEPARTS_FROM]->(a:Airport) RETURN a.station_code AS origin, SUM(j.arrival_delay_minutes) AS total_delay ORDER BY total_delay DESC LIMIT $x",
    "MATCH (j:Journey)-[:ON]->(f:Flight)-[:DEPARTS_FROM]->(a:Airport) RETURN a.station_code AS origin, SUM(j.arrival_delay_minutes) AS total_delay ORDER BY total_delay ASC LIMIT $x",
    "MATCH (o:Airport {station_code: $origin_station_code})<-[:DEPARTS_FROM]-(f:Flight)-[:ARRIVES_AT]->(d:Airport), (j:Journey)-[:ON]->(f) WITH o, d, AVG(j.arrival_delay_minutes) AS avg_delay WHERE avg_delay > $x RETURN o.station_code AS origin, d.station_code AS destination, avg_delay",
    "MATCH (j:Journey {number_of_legs: $x}) RETURN AVG(j.arrival_delay_minutes) AS avg_delay",
    # Intent 2: Service Quality
    "MATCH (o:Airport)<-[:DEPARTS_FROM]-(f:Flight)-[:ARRIVES_AT]->(d:Airport), (j:Journey {passenger_class: $class_name})-[:ON]->(f) WITH o, d, AVG(j.food_satisfaction_score) AS avg_food_score WHERE avg_food_score < $threshold RETURN o.station_code AS origin, d.station_code AS destination, avg_food_score",
    "MATCH (j:Journey {food_satisfaction_score: 1})-[:ON]->(f:Flight) WHERE j.actual_flown_miles > $x RETURN DISTINCT f.flight_number",
    # Intent 3: Fleet Performance
    "MATCH (j:Journey)-[:ON]->(f:Flight) WHERE j.arrival_delay_minutes > $x RETURN f.fleet_type_description AS aircraft_type, COUNT(j) AS delay_frequency ORDER BY delay_frequency DESC LIMIT 1",
    "MATCH (j:Journey)-[:ON]->(f:Flight {fleet_type_description: $x}) RETURN AVG(j.food_satisfaction_score) AS avg_food_score",
    "MATCH (j:Journey)-[:ON]->(f:Flight {fleet_type_description: $x}) RETURN AVG(j.actual_flown_miles) AS avg_miles",
    "MATCH (j:Journey)-[:ON]->(f:Flight {fleet_type_description: $x}) WITH COUNT(j) AS total_flights, COUNT(CASE WHEN j.arrival_delay_minutes < 0 THEN 1 END) AS early_flights RETURN (TOFLOAT(early_flights) / total_flights) * 100 AS early_arrival_percentage",
    # Intent 3b: Aircraft Performance Aggregation (NEW)
    "MATCH (j:Journey)-[:ON]->(f:Flight) RETURN f.fleet_type_description AS aircraft_type, AVG(j.arrival_delay_minutes) AS avg_delay, COUNT(j) AS flight_count ORDER BY avg_delay ASC LIMIT $x",
    "MATCH (j:Journey)-[:ON]->(f:Flight) RETURN f.fleet_type_description AS aircraft_type, AVG(j.arrival_delay_minutes) AS avg_delay, COUNT(j) AS flight_count ORDER BY avg_delay DESC LIMIT $x",
    "MATCH (j:Journey)-[:ON]->(f:Flight) WITH f.fleet_type_description AS aircraft_type, COUNT(j) AS total, COUNT(CASE WHEN j.arrival_delay_minutes <= 0 THEN 1 END) AS on_time RETURN aircraft_type, (toFloat(on_time) / total) * 100 AS on_time_pct, total AS flight_count ORDER BY on_time_pct DESC LIMIT $x",
    # Intent 4: Loyalty
    "MATCH (p:Passenger {loyalty_program_level: $loyalty_program_level})-[:TOOK]->(j:Journey) RETURN AVG(j.arrival_delay_minutes) AS avg_delay",
    "MATCH (p:Passenger {loyalty_program_level: $loyalty_program_level})-[:TOOK]->(j:Journey) WHERE j.arrival_delay_minutes > $x RETURN p.record_locator AS passenger_id, j.arrival_delay_minutes AS delay",
    # Intent 5: Demographics
    "MATCH (p:Passenger {generation: $generation})-[:TOOK]->(j:Journey)-[:ON]->(f:Flight) WHERE j.actual_flown_miles > $threshold RETURN f.fleet_type_description AS aircraft_type, COUNT(f) AS usage_count ORDER BY usage_count DESC LIMIT 1",
    "MATCH (p:Passenger {generation: $generation})-[:TOOK]->(j:Journey)-[:ON]->(f:Flight) RETURN f.fleet_type_description AS fleet_type, COUNT(f) AS usage_count ORDER BY usage_count DESC LIMIT 1",
    "MATCH (p:Passenger {generation: $generation})-[:TOOK]->(j:Journey)-[:ON]->(f:Flight)-[:ARRIVES_AT]->(a:Airport) RETURN a.station_code AS destination, COUNT(p) AS passenger_volume ORDER BY passenger_volume DESC LIMIT $x"
]

# Natural-language description of each entry in `queries`, at the same position.
# `${name}` placeholders mirror the Cypher parameter names: get_context() shows them
# to the LLM as `<name>`, and format_query_result() substitutes the actual values.
query_descriptions = [
    "Identify the top ${x} destination stations with the highest accumulated arrival delay minutes.",
    "Identify the top ${x} destination stations with the lowest accumulated arrival delay minutes.",
    "Identify the top ${x} origin stations with the highest accumulated arrival delay minutes.",
    "Identify the top ${x} origin stations with the lowest accumulated arrival delay minutes.",
    "Find routes from the origin station ${origin_station_code} where the average arrival delay exceeds ${x} minutes.",
    "Calculate the average arrival delay for flights consisting of exactly ${x} legs.",
    "Identify routes for the passenger class ${class_name} where the average food satisfaction score is below ${threshold}.",
    "List the flight numbers for journeys longer than ${x} miles where the food satisfaction score was 1.",
    "Identify the aircraft type that has the highest frequency of arrival delays greater than ${x} minutes.",
    "Calculate the average food satisfaction score for passengers flying on the ${x} fleet.",
    "Calculate the average actual flown miles for the ${x} fleet.",
    "Calculate the percentage of early arrivals for the ${x} fleet.",
    # NEW: Aircraft performance aggregation
    "List the top ${x} aircraft types with the LOWEST average arrival delay (best on-time performance).",
    "List the top ${x} aircraft types with the HIGHEST average arrival delay (worst on-time performance).",
    "List the top ${x} aircraft types by on-time arrival percentage (arrivals with delay <= 0 minutes).",
    # Loyalty
    "Calculate the average arrival delay experienced by passengers with the loyalty level ${loyalty_program_level}.",
    "Find the record locators for passengers with loyalty level ${loyalty_program_level} who experienced a delay greater than ${x} minutes.",
    # Demographics
    "Identify the most common aircraft type used by the ${generation} generation for journeys exceeding ${threshold} miles.",
    "Identify the most frequently used fleet type for the ${generation} generation.",
    "Identify the top ${x} destination stations for the ${generation} generation based on passenger volume."
]

# Reports only the number of Cypher templates; the description count is not checked here.
print(f"Loaded {len(queries)} queries")

Loaded 20 queries


## 3. Query Execution Functions

In [6]:
def run_query(query_index: int, **params) -> list:
    """Execute the Cypher template at ``query_index`` and return its rows.

    Args:
        query_index: Position of the template in the module-level ``queries`` list.
        **params: Cypher parameters forwarded to ``session.run``.

    Returns:
        One dict per result record (``record.data()``).

    Raises:
        ValueError: If ``query_index`` is outside the bounds of ``queries``.
    """
    if not 0 <= query_index < len(queries):
        raise ValueError(f"Query index {query_index} out of range (0-{len(queries)-1})")

    cypher = queries[query_index]
    with driver.session() as session:
        rows = []
        # Materialise the records while the session is still open.
        for record in session.run(cypher, **params):
            rows.append(record.data())
        return rows

In [7]:
# Load KG schema values for better parameter matching
def load_kg_schema(driver) -> Dict[str, Any]:
    """Query the KG to get valid values for each parameter field.

    Args:
        driver: Object whose ``session()`` returns a context manager with a
            ``run(cypher)`` method yielding records indexable by column alias
            (the Neo4j driver in this notebook).

    Returns:
        Dict with the keys ``airport_codes``, ``passenger_classes``,
        ``generations``, ``loyalty_levels``, ``fleet_types`` and
        ``number_of_legs``, each mapping to the list of distinct values in the
        order returned by the query.
    """
    def keep_all(value: Any) -> bool:
        return True

    def non_empty(value: Any) -> bool:
        # Drops null and empty-string labels.
        return bool(value)

    def not_null(value: Any) -> bool:
        # Numeric fields: only null is invalid; a truthiness test would also drop 0.
        return value is not None

    # (schema key, Cypher query, column alias, value filter)
    lookups = [
        ('airport_codes',
         'MATCH (a:Airport) RETURN DISTINCT a.station_code AS code ORDER BY code',
         'code', keep_all),
        ('passenger_classes',
         'MATCH (j:Journey) RETURN DISTINCT j.passenger_class AS class ORDER BY class',
         'class', non_empty),
        ('generations',
         'MATCH (p:Passenger) RETURN DISTINCT p.generation AS gen ORDER BY gen',
         'gen', non_empty),
        ('loyalty_levels',
         'MATCH (p:Passenger) RETURN DISTINCT p.loyalty_program_level AS level ORDER BY level',
         'level', non_empty),
        ('fleet_types',
         'MATCH (f:Flight) RETURN DISTINCT f.fleet_type_description AS fleet ORDER BY fleet',
         'fleet', non_empty),
        ('number_of_legs',
         'MATCH (j:Journey) RETURN DISTINCT j.number_of_legs AS legs ORDER BY legs',
         'legs', not_null),
    ]

    schema: Dict[str, Any] = {}
    with driver.session() as session:
        for schema_key, cypher, alias, keep in lookups:
            schema[schema_key] = [
                record[alias] for record in session.run(cypher) if keep(record[alias])
            ]

    return schema

# Load schema on startup
kg_schema = load_kg_schema(driver)
# Summarise what was loaded; the long lists (airports, fleet types) are reported as counts only.
print(
    "Loaded KG schema:",
    f"  - {len(kg_schema['airport_codes'])} airport codes",
    f"  - Generations: {kg_schema['generations']}",
    f"  - Loyalty levels: {kg_schema['loyalty_levels']}",
    f"  - Fleet types: {len(kg_schema['fleet_types'])} types",
    f"  - Passenger classes: {kg_schema['passenger_classes']}",
    sep="\n",
)

Loaded KG schema:
  - 158 airport codes
  - Generations: ['Boomer', 'Gen X', 'Gen Z', 'Millennial', 'NBK', 'Silent']
  - Loyalty levels: ['NBK', 'global services', 'non-elite', 'premier 1k', 'premier gold', 'premier platinum', 'premier silver']
  - Fleet types: 20 types
  - Passenger classes: ['Economy']


In [8]:
def _extract_json_array(text: str) -> Optional[list]:
    """Return the first JSON array that can be decoded from ``text``, or None."""
    decoder = json.JSONDecoder()
    start = text.find('[')
    while start != -1:
        try:
            # raw_decode tolerates trailing text after the array and handles
            # multi-line and nested arrays, unlike a line- or regex-based scan.
            candidate, _ = decoder.raw_decode(text, start)
        except json.JSONDecodeError:
            candidate = None
        if isinstance(candidate, list):
            return candidate
        start = text.find('[', start + 1)
    return None


def _normalise_selection(items: list) -> list:
    """Keep only well-formed selections and guarantee a dict under ``params``."""
    selected = []
    for item in items:
        if not isinstance(item, dict):
            continue
        index = item.get('query_index')
        # bool is a subclass of int but is never a valid index.
        if isinstance(index, bool) or not isinstance(index, int):
            continue
        params = item.get('params')
        selected.append({**item, 'query_index': index,
                         'params': params if isinstance(params, dict) else {}})
    return selected


def get_context(prompt: str) -> list:
    """Use Gemini LLM to identify ALL relevant queries and extract parameters with KG schema awareness.

    Args:
        prompt: The user's natural-language question.

    Returns:
        A list of dicts, each with an integer ``query_index`` and a dict
        ``params``. The list is empty when the model selects nothing or its
        reply contains no decodable JSON array. Indices are not range-checked
        here.
    """
    # Show placeholders as <name> so '${...}' cannot be mistaken for template syntax.
    safe_descriptions = [desc.replace('${', '<').replace('}', '>') for desc in query_descriptions]
    query_list = "\n".join([f"{i}: {desc}" for i, desc in enumerate(safe_descriptions)])
    
    # Build comprehensive schema reference
    schema_info = (
        "=== DATABASE SCHEMA ===\n"
        "The knowledge graph contains these entities and relationships:\n"
        "- Airport: station_code (e.g., LAX, JFK, ORD)\n"
        "- Flight: flight_number, fleet_type_description\n"
        "- Journey: passenger_class, food_satisfaction_score (1-5), arrival_delay_minutes, actual_flown_miles, number_of_legs\n"
        "- Passenger: generation, loyalty_program_level\n\n"
        "=== VALID VALUES ===\n"
        f"- airport codes: {kg_schema['airport_codes'][:20]}... ({len(kg_schema['airport_codes'])} total)\n"
        f"- generation: {kg_schema['generations']}\n"
        f"- loyalty_program_level: {kg_schema['loyalty_levels']}\n"
        f"- passenger_class: {kg_schema['passenger_classes']}\n"
        f"- fleet_type_description: {kg_schema['fleet_types']}\n"
        f"- number_of_legs: {kg_schema['number_of_legs']}\n"
    )
    
    full_prompt = f"""You are an expert at analyzing user questions about airline flight data and mapping them to database queries.

=== AVAILABLE QUERIES ===
{query_list}

{schema_info}

=== YOUR TASK ===
1. Carefully read and understand the user's question
2. Review ALL available queries above and understand what each one retrieves
3. Identify EVERY query that could help answer the user's question (even partially)
4. Extract the correct parameters for each selected query

=== PARAMETER RULES ===
- x: numeric value for counts/limits (default: 5), delay thresholds (default: 30 minutes), or miles
- origin_station_code: must be an exact airport code from the list
- generation: must match exactly (e.g., 'Baby Boomer' → 'Boomer', 'millennials' → 'Millennial')
- loyalty_program_level: must match exactly (e.g., 'gold member' → 'premier gold')
- class_name: must match exactly from passenger_class list
- For fleet-related queries (indices 9-11), x must be an EXACT fleet type string

=== OUTPUT FORMAT ===
Return a JSON array with ALL relevant queries. Each object must have:
- query_index: the index number of the query (0-{len(queries)-1})
- params: object with parameter names and values

Example: [{{"query_index": 0, "params": {{"x": 5}}}}, {{"query_index": 15, "params": {{"loyalty_program_level": "premier gold"}}}}]

IMPORTANT: 
- Return ALL queries that are relevant, not just one
- Return ONLY the JSON array, no explanation or markdown
- If multiple queries can answer different aspects of the question, include them all
- If no queries are relevant, return an empty array

=== USER QUESTION ===
{prompt}

JSON:"""
    
    response = gemini.invoke(full_prompt)
    response_text = response.content.strip()
    
    # Clean up response - remove markdown and extra whitespace
    response_text = response_text.replace('```json', '').replace('```', '').strip()
    
    print(response_text)

    selection = _extract_json_array(response_text)
    if selection is None:
        print("JSON parse error: no JSON array found in LLM response")
        print(f"Response was: {response_text[:200]}")
        return []

    return _normalise_selection(selection)

In [9]:
def format_query_result(query_index: int, **params) -> str:
    """Execute one catalogue query and render its rows as LLM context text.

    Args:
        query_index: Position of the query in ``queries`` / ``query_descriptions``.
        **params: Values used both as Cypher parameters and to fill the
            ``${name}`` placeholders of the description.

    Returns:
        The quoted description followed by one bullet line per row, or a
        message describing an out-of-range index, a query error, or an empty
        result.
    """
    if not 0 <= query_index < len(queries):
        return f"Error: Query index {query_index} out of range."

    description = query_descriptions[query_index]
    for name, value in params.items():
        placeholder = "${" + name + "}"
        description = description.replace(placeholder, str(value))

    try:
        rows = run_query(query_index, **params)
    except Exception as e:
        return f'Error for "{description}": {e}'

    if not rows:
        return f'"{description}": No data found.'

    def render(key, value):
        # Floats are shown with two decimals; everything else uses its default text form.
        if isinstance(value, float):
            return f"{key}: {value:.2f}"
        return f"{key}: {value}"

    lines = [f'"{description}":']
    for row in rows:
        lines.append("  - " + ", ".join(render(k, v) for k, v in row.items()))
    return "\n".join(lines)

## 4. Embeddings Module

Vector embeddings for semantic search. Models:
- `minilm`: all-MiniLM-L6-v2 (384 dims, fast)
- `mpnet`: all-mpnet-base-v2 (768 dims, higher quality)

In [10]:
# Embedding model configurations
# Keyed by the short model key used throughout the notebook. Per model:
#   name          - identifier passed to SentenceTransformer() in get_model()
#   dimensions    - vector size written into the Neo4j vector index definition
#   property_name - node property that stores the embedding; also used to build index names
EMBEDDING_MODELS = {
    "minilm": {"name": "all-MiniLM-L6-v2", "dimensions": 384, "property_name": "embedding_minilm"},
    "mpnet": {"name": "all-mpnet-base-v2", "dimensions": 768, "property_name": "embedding_mpnet"}
}

# Loaded models keyed by model key; filled lazily by get_model().
_model_cache: Dict[str, SentenceTransformer] = {}

def get_model(model_key: str) -> SentenceTransformer:
    """Return the embedding model for ``model_key``, loading it on first use.

    Args:
        model_key: A key of ``EMBEDDING_MODELS``.

    Returns:
        The cached ``SentenceTransformer`` instance for that key.

    Raises:
        ValueError: If ``model_key`` is not a configured model.
    """
    if model_key not in EMBEDDING_MODELS:
        raise ValueError(f"Unknown model: {model_key}. Use 'minilm' or 'mpnet'")

    try:
        return _model_cache[model_key]
    except KeyError:
        pass

    # First request for this key: load the model and remember it.
    print(f"Loading {EMBEDDING_MODELS[model_key]['name']}...")
    model = SentenceTransformer(EMBEDDING_MODELS[model_key]["name"])
    _model_cache[model_key] = model
    print("Model loaded!")
    return model

In [11]:
def create_journey_sentences(props: Dict[str, Any]) -> Dict[str, str]:
    """Create individual focused sentences for a Journey.

    Args:
        props: Journey fields. Keys read: ``flight_number``, ``fleet_type``,
            ``origin``, ``destination``, ``generation``,
            ``loyalty_program_level``, ``passenger_class``,
            ``actual_flown_miles``, ``arrival_delay_minutes``,
            ``number_of_legs`` and ``food_satisfaction_score``. A key that is
            missing or holds ``None`` falls back to its default.

    Returns:
        A dict with up to three entries, in this order: ``route`` (only when
        flight number, origin and destination are all present), ``passenger``
        (only when generation or loyalty level is present) and ``experience``
        (always).
    """
    def field(name: str, default: Any) -> Any:
        # The Neo4j records passed in carry every column, with None for absent
        # properties, so props.get(name, default) alone would never apply the default.
        value = props.get(name)
        return default if value is None else value

    sentences: Dict[str, str] = {}
    
    # Sentence 1: Route & Flight info
    flight_number = field('flight_number', '')
    fleet_type = field('fleet_type', '')
    origin = field('origin', '')
    destination = field('destination', '')
    if flight_number and origin and destination:
        route_text = f"Flight {flight_number} from {origin} to {destination}"
        if fleet_type:
            route_text += f" operated by {fleet_type}"
        sentences['route'] = route_text + "."
    
    # Sentence 2: Passenger demographics & loyalty
    generation = field('generation', '')
    loyalty = field('loyalty_program_level', '')
    if generation or loyalty:
        passenger_text = f"Passenger is a {generation}" if generation else "Passenger"
        if loyalty:
            passenger_text += f" with {loyalty} loyalty level"
        sentences['passenger'] = passenger_text + "."
    
    # Sentence 3: Journey experience
    passenger_class = field('passenger_class', 'Economy')
    miles = field('actual_flown_miles', 0)
    delay = field('arrival_delay_minutes', 0)
    legs = field('number_of_legs', 1)
    food_score = field('food_satisfaction_score', 3)
    if delay < 0:
        delay_text = f"arrived {abs(delay)} minutes early"
    elif delay == 0:
        delay_text = "on time"
    else:
        delay_text = f"delayed {delay} minutes"
    food_labels = {1: "very poor", 2: "poor", 3: "average", 4: "good", 5: "excellent"}
    sentences['experience'] = f"{passenger_class} class, {miles:.0f} miles, {legs} leg(s), {delay_text}, {food_labels.get(food_score, 'average')} food."
    
    return sentences


def create_journey_pairwise_texts(props: Dict[str, Any]) -> List[str]:
    """Build every two-sentence combination of a journey's sentences, plus the full text.

    Args:
        props: Journey fields, as accepted by ``create_journey_sentences``.

    Returns:
        The pairwise texts in sentence order, followed by one text joining
        all available sentences.
    """
    ordered = list(create_journey_sentences(props).values())

    # Each sentence is paired with every sentence that comes after it.
    combos = [
        f"{first} {second}"
        for position, first in enumerate(ordered)
        for second in ordered[position + 1:]
    ]

    # The complete description always goes last.
    combos.append(" ".join(ordered))
    return combos


def create_journey_text(props: Dict[str, Any]) -> str:
    """Return all sentences describing a journey as one space-separated string.

    Args:
        props: Journey fields, as accepted by ``create_journey_sentences``.
    """
    parts = create_journey_sentences(props).values()
    return " ".join(parts)


def create_flight_text(props: Dict[str, Any], origin: Optional[str] = None, destination: Optional[str] = None) -> str:
    """Create text representation of a Flight node.

    Args:
        props: Flight fields; ``flight_number`` and ``fleet_type_description``
            are read. A missing or ``None`` value is rendered as ``Unknown``.
        origin: Origin station code, if known.
        destination: Destination station code, if known.

    Returns:
        A sentence naming the flight and its fleet type. The route is included
        only when both origin and destination are given.
    """
    # Rows built from Neo4j records always carry both keys, possibly as None,
    # so the None case has to be mapped to the placeholder explicitly.
    flight_num = props.get('flight_number')
    if flight_num is None:
        flight_num = 'Unknown'
    fleet = props.get('fleet_type_description')
    if fleet is None:
        fleet = 'Unknown'
    route = f" from {origin} to {destination}" if origin and destination else ""
    return f"Flight {flight_num} operated by {fleet}{route}."


def create_passenger_text(props: Dict[str, Any]) -> str:
    """Describe a Passenger node in one sentence.

    Args:
        props: Passenger fields; ``generation`` and ``loyalty_program_level``
            are read, each defaulting to ``'unknown'`` when the key is absent.
    """
    generation = props.get('generation', 'unknown')
    loyalty = props.get('loyalty_program_level', 'unknown')
    return f"A {generation} passenger with {loyalty} loyalty."

In [12]:
def generate_embeddings(texts: List[str], model_key: str = "minilm") -> np.ndarray:
    """Encode a batch of texts with the selected embedding model.

    Args:
        texts: Texts to embed.
        model_key: Key of the model to use (see ``get_model``).

    Returns:
        Whatever ``model.encode`` returns when numpy output is requested;
        a progress bar is shown while encoding.
    """
    encoder = get_model(model_key)
    vectors = encoder.encode(texts, show_progress_bar=True, convert_to_numpy=True)
    return vectors


def generate_single_embedding(text: str, model_key: str = "minilm") -> List[float]:
    """Encode one text and return the embedding as a plain Python list.

    Args:
        text: Text to embed.
        model_key: Key of the model to use (see ``get_model``).
    """
    encoder = get_model(model_key)
    vector = encoder.encode(text, convert_to_numpy=True)
    # A list (rather than an ndarray) can be passed directly as a query parameter.
    return vector.tolist()

In [13]:
def fetch_journey_nodes(driver: Driver) -> List[Dict[str, Any]]:
    """Load every Journey together with its passenger, flight and airport fields.

    Args:
        driver: Neo4j driver used to open the session.

    Returns:
        One dict per result row with ``feedback_ID`` and ``properties`` (all
        returned columns as a dict). Airport columns come from OPTIONAL MATCH
        clauses and are therefore ``None`` when no airport is linked.
    """
    query = """
    MATCH (p:Passenger)-[:TOOK]->(j:Journey)-[:ON]->(f:Flight)
    OPTIONAL MATCH (f)-[:DEPARTS_FROM]->(o:Airport)
    OPTIONAL MATCH (f)-[:ARRIVES_AT]->(d:Airport)
    RETURN j.feedback_ID AS feedback_ID,
           j.passenger_class AS passenger_class,
           j.food_satisfaction_score AS food_satisfaction_score,
           j.arrival_delay_minutes AS arrival_delay_minutes,
           j.actual_flown_miles AS actual_flown_miles,
           j.number_of_legs AS number_of_legs,
           p.generation AS generation,
           p.loyalty_program_level AS loyalty_program_level,
           f.flight_number AS flight_number,
           f.fleet_type_description AS fleet_type,
           o.station_code AS origin,
           d.station_code AS destination
    """
    journeys = []
    with driver.session() as session:
        for record in session.run(query):
            journeys.append({
                "feedback_ID": record["feedback_ID"],
                "properties": dict(record),
            })
    return journeys


def fetch_flight_nodes(driver: Driver) -> List[Dict[str, Any]]:
    """Load every Flight node along with its origin and destination codes.

    Args:
        driver: Neo4j driver used to open the session.

    Returns:
        One dict per result row with ``flight_number``,
        ``fleet_type_description``, a ``properties`` dict repeating those two
        fields, and ``origin`` / ``destination`` (``None`` when no airport is
        linked, because the airports are matched optionally).
    """
    query = """
    MATCH (f:Flight)
    OPTIONAL MATCH (f)-[:DEPARTS_FROM]->(origin:Airport)
    OPTIONAL MATCH (f)-[:ARRIVES_AT]->(dest:Airport)
    RETURN f.flight_number AS flight_number, f.fleet_type_description AS fleet_type_description,
           origin.station_code AS origin, dest.station_code AS destination
    """
    flights = []
    with driver.session() as session:
        for record in session.run(query):
            number = record["flight_number"]
            fleet = record["fleet_type_description"]
            flights.append({
                "flight_number": number,
                "fleet_type_description": fleet,
                "properties": {"flight_number": number, "fleet_type_description": fleet},
                "origin": record["origin"],
                "destination": record["destination"],
            })
    return flights

In [14]:
def create_vector_index(driver: Driver, model_key: str, node_label: str = "Journey"):
    """Create a vector index in Neo4j.

    Any existing index with the same name is dropped first, so the index is
    always rebuilt with the dimensions configured for ``model_key``.

    Args:
        driver: Neo4j driver used to open the session.
        model_key: Key of ``EMBEDDING_MODELS``; selects the embedding property
            and vector dimensions.
        node_label: Label of the nodes to index. It is interpolated into the
            Cypher text, so it must be a trusted identifier.

    Raises:
        KeyError: If ``model_key`` is not in ``EMBEDDING_MODELS``.
    """
    config = EMBEDDING_MODELS[model_key]
    index_name = f"{node_label.lower()}_{config['property_name']}"
    
    create_query = f"""
    CREATE VECTOR INDEX {index_name} IF NOT EXISTS
    FOR (n:{node_label}) ON n.{config['property_name']}
    OPTIONS {{indexConfig: {{
        `vector.dimensions`: {config['dimensions']},
        `vector.similarity_function`: 'cosine'
    }}}}
    """
    with driver.session() as session:
        try:
            session.run(f"DROP INDEX {index_name} IF EXISTS")
        except Exception as exc:
            # Dropping is best-effort, but report the failure instead of hiding it.
            # Catching Exception (not a bare except) lets KeyboardInterrupt and
            # SystemExit propagate.
            print(f"Warning: could not drop index {index_name}: {exc}")
        session.run(create_query)
        print(f"Created index: {index_name}")

In [15]:
def store_journey_embeddings(driver: Driver, feedback_ids: List[str], embeddings: np.ndarray, 
                              model_key: str = "minilm", batch_size: int = 100):
    """Write one embedding per Journey node, matched by ``feedback_ID``.

    Args:
        driver: Neo4j driver used to open the session.
        feedback_ids: Journey identifiers; position ``i`` pairs with ``embeddings[i]``.
        embeddings: Embedding vectors, indexed like ``feedback_ids``.
        model_key: Key of ``EMBEDDING_MODELS``; selects the target property.
        batch_size: Number of nodes updated per query.
    """
    target_property = EMBEDDING_MODELS[model_key]["property_name"]
    cypher = f"UNWIND $batch AS item MATCH (j:Journey {{feedback_ID: item.feedback_ID}}) SET j.{target_property} = item.embedding"
    total = len(feedback_ids)

    with driver.session() as session:
        for start in range(0, total, batch_size):
            stop = min(start + batch_size, total)
            rows = []
            for idx in range(start, stop):
                rows.append({"feedback_ID": feedback_ids[idx], "embedding": embeddings[idx].tolist()})
            session.run(cypher, batch=rows)
            print(f"Stored {stop}/{total}...")


def store_journey_multi_embeddings(driver: Driver, journeys: List[Dict], all_embeddings: Dict[str, np.ndarray],
                                    model_key: str = "minilm", batch_size: int = 100):
    """Write per-aspect embeddings (route, passenger, experience) onto Journey nodes.

    Args:
        driver: Neo4j driver used to open sessions.
        journeys: Journey dicts carrying ``feedback_ID``; position ``i`` pairs
            with row ``i`` of each embedding array.
        all_embeddings: Maps an aspect name to its embedding array. Aspects not
            present in the dict are skipped.
        model_key: Key of ``EMBEDDING_MODELS``; its property name is suffixed
            with ``_<aspect>`` to form the target property.
        batch_size: Number of nodes updated per query.
    """
    base_prop = EMBEDDING_MODELS[model_key]["property_name"]
    total = len(journeys)

    for emb_type in ('route', 'passenger', 'experience'):
        if emb_type in all_embeddings:
            vectors = all_embeddings[emb_type]
            target_property = f"{base_prop}_{emb_type}"
            cypher = f"UNWIND $batch AS item MATCH (j:Journey {{feedback_ID: item.feedback_ID}}) SET j.{target_property} = item.embedding"

            # One session per aspect, as each aspect targets a different property.
            with driver.session() as session:
                for start in range(0, total, batch_size):
                    stop = min(start + batch_size, total)
                    rows = []
                    for idx in range(start, stop):
                        rows.append({"feedback_ID": journeys[idx]["feedback_ID"],
                                     "embedding": vectors[idx].tolist()})
                    session.run(cypher, batch=rows)
            print(f"Stored {emb_type} embeddings")


def store_flight_embeddings(driver: Driver, flights: List[Dict], embeddings: np.ndarray,
                            model_key: str = "minilm", batch_size: int = 100):
    """Write one embedding per Flight node.

    Flights are matched on the pair (``flight_number``, ``fleet_type_description``).

    Args:
        driver: Neo4j driver used to open the session.
        flights: Flight dicts carrying both match fields; position ``i`` pairs
            with ``embeddings[i]``.
        embeddings: Embedding vectors, indexed like ``flights``.
        model_key: Key of ``EMBEDDING_MODELS``; selects the target property.
        batch_size: Number of nodes updated per query.
    """
    target_property = EMBEDDING_MODELS[model_key]["property_name"]
    cypher = f"UNWIND $batch AS item MATCH (f:Flight {{flight_number: item.flight_number, fleet_type_description: item.fleet_type_description}}) SET f.{target_property} = item.embedding"
    total = len(flights)

    with driver.session() as session:
        for start in range(0, total, batch_size):
            stop = min(start + batch_size, total)
            rows = []
            for idx in range(start, stop):
                rows.append({
                    "flight_number": flights[idx]["flight_number"],
                    "fleet_type_description": flights[idx]["fleet_type_description"],
                    "embedding": embeddings[idx].tolist(),
                })
            session.run(cypher, batch=rows)
            print(f"Stored {stop}/{total}...")

In [16]:
def semantic_search_journeys(driver: Driver, query_text: str, model_key: str = "minilm", top_k: int = 5) -> List[Dict]:
    """Run a vector search over Journey nodes and return enriched rows.

    Args:
        driver: Neo4j driver used to open the session.
        query_text: Free text to embed and search with.
        model_key: Key of ``EMBEDDING_MODELS``; selects the model and the index.
        top_k: Number of nearest nodes requested from the vector index.

    Returns:
        Result rows as dicts, ordered by descending score. Each row holds the
        journey, passenger, flight and airport columns plus ``score`` and a
        duplicate of it under ``similarity_score``.
    """
    query_embedding = generate_single_embedding(query_text, model_key)
    index_name = f"journey_{EMBEDDING_MODELS[model_key]['property_name']}"
    
    # The index hit is joined to its passenger and flight; airports are optional.
    search_query = f"""
    CALL db.index.vector.queryNodes('{index_name}', $top_k, $query_embedding)
    YIELD node, score
    MATCH (p:Passenger)-[:TOOK]->(node)-[:ON]->(f:Flight)
    OPTIONAL MATCH (f)-[:DEPARTS_FROM]->(o:Airport)
    OPTIONAL MATCH (f)-[:ARRIVES_AT]->(d:Airport)
    RETURN node.feedback_ID AS feedback_ID, 
           node.passenger_class AS passenger_class,
           node.food_satisfaction_score AS food_satisfaction_score,
           node.arrival_delay_minutes AS arrival_delay_minutes,
           node.actual_flown_miles AS actual_flown_miles,
           node.number_of_legs AS number_of_legs,
           p.generation AS generation,
           p.loyalty_program_level AS loyalty_program_level,
           f.flight_number AS flight_number,
           f.fleet_type_description AS fleet_type,
           o.station_code AS origin,
           d.station_code AS destination,
           score
    ORDER BY score DESC
    """
    hits = []
    with driver.session() as session:
        for record in session.run(search_query, top_k=top_k, query_embedding=query_embedding):
            row = dict(record)
            row["similarity_score"] = record["score"]
            hits.append(row)
    return hits


def semantic_search_flights(driver: Driver, query_text: str, model_key: str = "minilm", top_k: int = 5) -> List[Dict]:
    """Semantic search on Flight nodes.

    Args:
        driver: Neo4j driver used to open the session.
        query_text: Free text to embed and search with.
        model_key: Key of ``EMBEDDING_MODELS``; selects the model and the index.
        top_k: Number of nearest nodes requested from the vector index.

    Returns:
        Result rows as dicts, ordered by descending score, with
        ``flight_number``, ``fleet_type_description``, ``origin``,
        ``destination``, ``score`` and a duplicate of the score under
        ``similarity_score``. ``origin`` / ``destination`` are ``None`` for
        flights without a linked airport.
    """
    query_embedding = generate_single_embedding(query_text, model_key)
    index_name = f"flight_{EMBEDDING_MODELS[model_key]['property_name']}"
    
    # Airports are matched optionally, as in fetch_flight_nodes: flights are
    # embedded even without route data, so a mandatory MATCH here would
    # silently drop those hits from the results.
    search_query = f"""
    CALL db.index.vector.queryNodes('{index_name}', $top_k, $query_embedding)
    YIELD node, score
    OPTIONAL MATCH (node)-[:DEPARTS_FROM]->(origin:Airport)
    OPTIONAL MATCH (node)-[:ARRIVES_AT]->(dest:Airport)
    RETURN node.flight_number AS flight_number, node.fleet_type_description AS fleet_type_description,
           origin.station_code AS origin, dest.station_code AS destination, score
    ORDER BY score DESC
    """
    with driver.session() as session:
        result = session.run(search_query, top_k=top_k, query_embedding=query_embedding)
        return [{**dict(r), "similarity_score": r["score"]} for r in result]

In [17]:
def format_embedding_results(results: List[Dict], node_type: str = "Journey") -> str:
    """Render semantic-search hits as numbered context lines.

    Args:
        results: Search rows; each must contain ``similarity_score``.
        node_type: ``"Journey"`` renders rows with ``create_journey_text``;
            any other value renders them as flights.

    Returns:
        A header line followed by one line per hit, or a "no similar nodes"
        message when ``results`` is empty.
    """
    if not results:
        return f"No similar {node_type} nodes found."

    def describe(record: Dict) -> str:
        if node_type == "Journey":
            # Journey rows already carry all the fields the sentence builder reads.
            return create_journey_text(record)
        flight_props = {"flight_number": record.get("flight_number"), 
                        "fleet_type_description": record.get("fleet_type_description")}
        return create_flight_text(flight_props, record.get("origin"), record.get("destination"))

    lines = [f"Found {len(results)} relevant {node_type} records:"]
    rank = 0
    for record in results:
        rank += 1
        text = describe(record)
        lines.append(f"  {rank}. (score: {record['similarity_score']:.3f}) {text}")
    return "\n".join(lines)


def get_embedding_context(driver: Driver, query: str, model_key: str = "minilm", top_k: int = 20) -> str:
    """Collect semantic-search context for Journeys and Flights.

    Each search runs independently: a failure in one is reported as a text
    line and does not prevent the other from running.

    Args:
        driver: Neo4j driver passed to the search helpers.
        query: Free-text query.
        model_key: Key of ``EMBEDDING_MODELS``.
        top_k: Number of hits requested per node type.

    Returns:
        The Journey section and the Flight section separated by a blank line.
    """
    searches = (
        ("Journey", semantic_search_journeys),
        ("Flight", semantic_search_flights),
    )
    sections = []
    for label, search in searches:
        try:
            hits = search(driver, query, model_key, top_k)
            sections.append(format_embedding_results(hits, label))
        except Exception as e:
            sections.append(f"{label} search error: {e}")
    return "\n\n".join(sections)

In [18]:
def generate_and_store_all_embeddings(driver: Driver, model_key: str = "minilm"):
    """Embed every Journey and Flight node and store the vectors in Neo4j.

    For each node type: fetch the nodes, build their text, encode it, rebuild
    the vector index, then write the embeddings. A node type with no nodes is
    skipped.

    Args:
        driver: Neo4j driver passed to the fetch/index/store helpers.
        model_key: Key of ``EMBEDDING_MODELS``.
    """
    banner = '=' * 60
    print(f"\n{banner}")
    print(f"Generating embeddings with {EMBEDDING_MODELS[model_key]['name']}")
    print(f"{banner}\n")
    
    # Journeys
    print("Fetching Journey nodes...")
    journeys = fetch_journey_nodes(driver)
    print(f"Found {len(journeys)} journeys")
    
    if journeys:
        journey_texts = []
        for journey in journeys:
            journey_texts.append(create_journey_text(journey["properties"]))
        journey_vectors = generate_embeddings(journey_texts, model_key)
        create_vector_index(driver, model_key, "Journey")
        journey_ids = [journey["feedback_ID"] for journey in journeys]
        store_journey_embeddings(driver, journey_ids, journey_vectors, model_key)
    
    # Flights
    print("\nFetching Flight nodes...")
    flights = fetch_flight_nodes(driver)
    print(f"Found {len(flights)} flights")
    
    if flights:
        flight_texts = []
        for flight in flights:
            flight_texts.append(create_flight_text(flight["properties"], flight["origin"], flight["destination"]))
        flight_vectors = generate_embeddings(flight_texts, model_key)
        create_vector_index(driver, model_key, "Flight")
        store_flight_embeddings(driver, flights, flight_vectors, model_key)
    
    print(f"\nEmbedding generation complete!")

## 5. Hybrid Retrieval

Combines Cypher queries with embedding-based semantic search.

In [19]:
def get_hybrid_context(driver: Driver, prompt: str, model_key: str = "minilm", top_k: int = 20) -> Dict[str, Any]:
    """Get context from both Cypher queries and embedding search.

    Args:
        driver: Neo4j driver passed to the embedding search.
        prompt: The user's question.
        model_key: Key of ``EMBEDDING_MODELS``.
        top_k: Number of semantic-search hits requested per node type.

    Returns:
        Dict with ``cypher_context`` (list of formatted query results or error
        lines), ``embedding_context`` (text) and ``combined_context`` (both
        joined under section headers). Errors are reported as text, never raised.
    """
    results = {'cypher_context': [], 'embedding_context': '', 'combined_context': ''}
    
    # Cypher context
    try:
        selected = get_context(prompt)
    except Exception as e:
        results['cypher_context'].append(f"Cypher error: {e}")
        selected = []

    # Each selection is handled on its own so one malformed entry from the LLM
    # (e.g. no 'params' key) does not discard the remaining queries.
    for cq in selected:
        try:
            params = cq.get('params') or {}
            results['cypher_context'].append(format_query_result(cq['query_index'], **params))
        except Exception as e:
            results['cypher_context'].append(f"Cypher error: {e}")
    
    # Embedding context
    try:
        results['embedding_context'] = get_embedding_context(driver, prompt, model_key, top_k)
    except Exception as e:
        results['embedding_context'] = f"Embedding error: {e}"
    
    # Combine
    cypher_text = '\n\n'.join(results['cypher_context'])
    results['combined_context'] = f"=== STRUCTURED QUERY RESULTS ===\n{cypher_text}\n\n=== SEMANTIC SEARCH RESULTS ===\n{results['embedding_context']}"
    
    return results

In [20]:
def answer_with_hybrid_context(driver: Driver, llm: Any, question: str, model_key: str = "minilm", debug = False) -> str:
    """Answer ``question`` with an LLM grounded in hybrid retrieval context.

    Args:
        driver: Neo4j driver passed through to ``get_hybrid_context``.
        llm: Object exposing ``invoke(prompt)`` whose result has a ``content`` attribute.
        question: The user's question.
        model_key: Embedding model key passed through to ``get_hybrid_context``.
        debug: When truthy, print the full retrieval result before prompting.

    Returns:
        The ``content`` of the LLM's reply.
    """
    retrieval = get_hybrid_context(driver, question, model_key)

    if debug:
        print(retrieval)

    # Only the merged text is shown to the model.
    context = retrieval['combined_context']
    prompt = f"""You are an AI assistant for an airline company analyzing flight data.

Based on this context from our knowledge graph, answer the user's question.
Only use information from the context. If insufficient, say so.

CONTEXT:
{context}

USER QUESTION: {question}

ANSWER:"""

    reply = llm.invoke(prompt)
    return reply.content

In [21]:
def compare_retrieval_methods(driver: Driver, question: str) -> Dict[str, Any]:
    """Run ``question`` through every retrieval strategy and collect the outputs.

    Returns:
        A dict with the keys ``cypher_only`` (list of formatted query results),
        ``embedding_minilm`` / ``embedding_mpnet`` (embedding search output per
        model) and ``hybrid_minilm`` / ``hybrid_mpnet`` (the dicts returned by
        ``get_hybrid_context``). Errors on the Cypher and embedding paths are
        reported as ``"Error: ..."`` strings instead of being raised.
    """
    # Cypher only. On failure the whole list is replaced by a single error
    # entry, so partially collected results are dropped.
    cypher_blocks = []
    try:
        for planned in get_context(question):
            cypher_blocks.append(format_query_result(planned['query_index'], **planned['params']))
    except Exception as err:
        cypher_blocks = [f"Error: {err}"]

    # Embedding search, once per model.
    embedding_output = {}
    for embed_key in ('minilm', 'mpnet'):
        try:
            embedding_output[embed_key] = get_embedding_context(driver, question, embed_key)
        except Exception as err:
            embedding_output[embed_key] = f"Error: {err}"

    # The hybrid calls are evaluated in order while the result dict is built.
    return {
        'cypher_only': cypher_blocks,
        'embedding_minilm': embedding_output['minilm'],
        'embedding_mpnet': embedding_output['mpnet'],
        'hybrid_minilm': get_hybrid_context(driver, question, "minilm"),
        'hybrid_mpnet': get_hybrid_context(driver, question, "mpnet"),
    }


def print_comparison(results: Dict[str, Any]):
    """Print the dict produced by ``compare_retrieval_methods`` section by section.

    Sections appear in a fixed order: Cypher-only results, the two embedding
    outputs, then each hybrid result that is truthy.
    """
    rule = "=" * 80
    print(rule + "\nRETRIEVAL METHOD COMPARISON\n" + rule)

    print("\n--- CYPHER ONLY ---")
    for cypher_block in results['cypher_only']:
        print(cypher_block + "\n")

    embedding_sections = (
        ("\n--- EMBEDDING (MiniLM) ---\n", 'embedding_minilm'),
        ("\n--- EMBEDDING (MPNet) ---\n", 'embedding_mpnet'),
    )
    for header, key in embedding_sections:
        print(header + results[key])

    hybrid_sections = (
        ("\n--- HYBRID (MiniLM) ---\n", 'hybrid_minilm'),
        ("\n--- HYBRID (MPNet) ---\n", 'hybrid_mpnet'),
    )
    for header, key in hybrid_sections:
        hybrid = results[key]
        # A hybrid entry that is None or empty is skipped.
        if hybrid:
            print(header + hybrid['combined_context'])

## 6. Interactive Q&A

In [22]:
def ask(question: str, llm: Any, use_hybrid: bool = True, model_key: str = "minilm", debug = False) -> str:
    """Ask a question using the full pipeline and print the answer.

    Args:
        question: The user's question.
        llm: Object exposing ``invoke(prompt)`` whose result has a ``content`` attribute.
        use_hybrid: When True, answer through ``answer_with_hybrid_context``
            (Cypher plus embedding search). When False, use only the Cypher
            query results as context.
        model_key: Embedding model key; only used in hybrid mode.
        debug: Forwarded to ``answer_with_hybrid_context``; only used in hybrid mode.

    Returns:
        The answer text, which is also printed.

    Note:
        Hybrid mode uses the notebook-level ``driver``.
    """
    print(f"\n{'='*60}\nQ: {question}\nMode: {'Hybrid' if use_hybrid else 'Cypher Only'}\n{'='*60}\n")

    if use_hybrid:
        answer = answer_with_hybrid_context(driver, llm, question, model_key, debug)
    else:
        context_parts = [format_query_result(cq['query_index'], **cq['params']) for cq in get_context(question)]
        # Join once and reuse. Print the text itself: wrapping it in braces
        # builds a set and prints its escaped repr instead of readable lines.
        context_text = "\n".join(context_parts)
        print(context_text)
        prompt = f"""You are an AI assistant for an airline analyzing flight data.
Answer using only this context:

{context_text}

Question: {question}

Answer:"""
        answer = llm.invoke(prompt).content

    print(f"ANSWER:\n{'-'*40}\n{answer}\n")
    return answer

## 7. Usage Examples

In [23]:
# Generate embeddings (run once): uncomment the call below to build and store
# the embeddings with the "minilm" model. It is left commented out so a plain
# re-run of the notebook does not trigger it.
# generate_and_store_all_embeddings(driver, "minilm")

print("Uncomment the line above to generate embeddings.")

Uncomment the line above to generate embeddings.


In [None]:
# Example questions (`ask` takes the LLM as its required second argument):
# ask("What are the top 5 airports with the most delays?", llm, debug=False)
# ask("How do Millennials travel compared to Baby Boomers?", llm, debug=False)
# ask("Which aircraft type has the best on-time performance?", llm)
# ask("What is the flight number of the journey that departs from LAX and arrives at IAX and has generation 'Millennials'?", llm)
# ask("What are the different loyalty program levels for a journey that has flight number 2, mention all of them", llm)

print("Uncomment an example to try the pipeline!")


Q: How do Millennials travel compared to Baby Boomers?
Mode: Hybrid

[
  {"query_index": 17, "params": {"generation": "Millennial", "threshold": 1000}},
  {"query_index": 18, "params": {"generation": "Millennial"}},
  {"query_index": 19, "params": {"x": 5, "generation": "Millennial"}},
  {"query_index": 17, "params": {"generation": "Boomer", "threshold": 1000}},
  {"query_index": 18, "params": {"generation": "Boomer"}},
  {"query_index": 19, "params": {"x": 5, "generation": "Boomer"}}
]
ANSWER:
----------------------------------------
Based on the provided context, here's a comparison of how Millennials travel compared to Baby Boomers:

1. **Aircraft Type**: Both Millennials and Baby Boomers most commonly use the B737-900 aircraft type for journeys exceeding 1000 miles, with usage counts of 50 and 154, respectively.

2. **Fleet Type**: The most frequently used fleet type for both generations is also the B737-900, with usage counts of 72 for Millennials and 212 for Baby Boomers.

3. **

## 8. LLM Comparison

In [25]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint
import time
import pandas as pd

In [26]:
# Hugging Face API token read from the environment; os.getenv returns None
# when HUGGINGFACEHUB_API_TOKEN is not set.
hf_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")

In [27]:
def setup_llm_models():
    """Build the chat models that take part in the comparison.

    Each entry is a ``HuggingFaceEndpoint`` for one repository, authenticated
    with the notebook-level ``hf_token`` and wrapped in ``ChatHuggingFace``.

    Returns:
        Dict mapping a short model label ("Mistral", "Llama3", "Gemma") to its
        ``ChatHuggingFace`` instance.
    """
    repo_ids = {
        "Mistral": "mistralai/Mistral-7B-Instruct-v0.2",
        "Llama3": "meta-llama/Meta-Llama-3-8B-Instruct",
        "Gemma": "google/gemma-2-2b-it",
    }

    # Create every endpoint first, then wrap them, in label order.
    endpoints = {}
    for label, repo_id in repo_ids.items():
        endpoints[label] = HuggingFaceEndpoint(
            repo_id=repo_id,
            huggingfacehub_api_token=hf_token,
        )

    chat_models = {}
    for label, endpoint in endpoints.items():
        chat_models[label] = ChatHuggingFace(llm=endpoint)

    return chat_models

In [28]:
"""
LLM Evaluator Module - Extended with Qualitative Comparison

This module provides the LLMEvaluator class for comparing multiple LLM models
with both quantitative and qualitative metrics.

Usage:
    from llm_evaluator import LLMEvaluator
    
    evaluator = LLMEvaluator(models, ask_function=ask)
    evaluator.evaluate_all_models(questions)
    evaluator.evaluate_qualitatively_from_results(groq_llm)
    print(evaluator.get_qualitative_summary())
"""

class LLMEvaluator:
    """Compare several LLMs on the same questions, quantitatively and qualitatively.

    Typical flow:
        1. ``evaluate_all_models`` collects one timed answer per (question, model).
        2. ``get_quantitative_metrics`` aggregates timing, success and length figures.
        3. ``evaluate_qualitatively_from_results`` asks a judge LLM to score the answers.
        4. ``get_qualitative_summary`` / ``print_qualitative_results`` report the scores.
        5. ``export_results`` writes everything to a JSON file.
    """

    # Score fields requested from the judge LLM for every model.
    _SCORE_FIELDS = ('relevance', 'completeness', 'naturalness', 'total')

    # Column labels of the frame returned by get_quantitative_metrics().
    _QUANT_COLUMNS = ['Avg Time (s)', 'Std Time', 'Min Time', 'Max Time',
                      'Success Rate', 'Avg Tokens']

    def __init__(self, models: Dict, ask_function=None):
        """
        Initialize the evaluator.

        Args:
            models: Dict mapping model names to LLM instances
            ask_function: Function to use for asking questions (signature: ask(question, llm) -> str)
        """
        self.models = models
        self.results = []
        self.qualitative_results = []
        self.ask = ask_function

    def set_ask_function(self, ask_function):
        """Set the ask function to use for querying models."""
        self.ask = ask_function

    def run_single_test(self, model_name: str, model, question: str) -> Dict:
        """Run one question against one model and record the outcome.

        Args:
            model_name: Label stored in the result.
            model: LLM instance handed to the ask function.
            question: Question text.

        Returns:
            Dict with the keys ``model``, ``question``, ``response``, ``time``
            (elapsed seconds), ``success``, ``error`` and ``tokens_estimate``
            (whitespace-separated word count of the response).

        Raises:
            ValueError: If no ask function has been configured.
        """
        if self.ask is None:
            raise ValueError("ask_function not set. Call set_ask_function() first.")

        # perf_counter is monotonic, so the measured duration cannot be skewed
        # by system clock adjustments.
        start_time = time.perf_counter()
        try:
            response = self.ask(question, model)
            elapsed_time = time.perf_counter() - start_time

            return {
                "model": model_name,
                "question": question,
                "response": response,
                "time": elapsed_time,
                "success": True,
                "error": None,
                "tokens_estimate": len(response.split())  # Rough estimate
            }
        except Exception as e:
            # Any failure (including a non-string response) is recorded as an
            # unsuccessful test instead of aborting the whole evaluation.
            elapsed_time = time.perf_counter() - start_time
            return {
                "model": model_name,
                "question": question,
                "response": None,
                "time": elapsed_time,
                "success": False,
                "error": str(e),
                "tokens_estimate": 0
            }

    def evaluate_all_models(self, questions: List[str]):
        """Run every question against every model, replacing earlier results.

        Args:
            questions: Questions to ask each model.

        Returns:
            The list of per-test result dicts (also stored in ``self.results``).
        """
        print("Starting LLM Evaluation...\n")

        self.results = []

        for i, question in enumerate(questions, 1):
            print(f"Test Case {i}/{len(questions)}: {question}")

            for model_name, model in self.models.items():
                print(f"  Testing {model_name}...", end=" ")
                result = self.run_single_test(model_name, model, question)
                self.results.append(result)

                if result["success"]:
                    print(f"✓ ({result['time']:.2f}s)")
                else:
                    print(f"✗ Error: {result['error'][:50]}")

            print()

        return self.results

    def get_quantitative_metrics(self) -> pd.DataFrame:
        """Aggregate timing, success rate and response length per model.

        Returns:
            DataFrame indexed by model with the columns listed in
            ``_QUANT_COLUMNS``, rounded to 3 decimals. When no results have
            been collected yet, a message is printed and an empty frame with
            those columns is returned.
        """
        if not self.results:
            # Without rows there is no 'model' column to group by.
            print("No results available. Run evaluate_all_models() first.")
            return pd.DataFrame(columns=list(self._QUANT_COLUMNS))

        df = pd.DataFrame(self.results)

        metrics = df.groupby('model').agg({
            'time': ['mean', 'std', 'min', 'max'],
            'success': 'mean',
            'tokens_estimate': 'mean'
        }).round(3)

        metrics.columns = list(self._QUANT_COLUMNS)

        return metrics

    @staticmethod
    def _build_eval_prompt(question: str, responses: Dict[str, str]) -> str:
        """Render the judge prompt for one question and its model responses."""
        responses_text = "\n\n".join(
            f"### MODEL: {name}\n{response}"
            for name, response in responses.items()
        )

        # Evaluation prompt (without correctness per user request)
        return f"""You are an expert evaluator comparing AI model responses for an airline data assistant.

QUESTION: {question}

RESPONSES TO EVALUATE:
{responses_text}

Evaluate each response on these criteria (1-5 scale):
1. **Relevance**: Does it directly answer the question?
2. **Completeness**: Is it thorough without being verbose?
3. **Naturalness**: Is it well-written and easy to understand?

Provide your evaluation as JSON:
{{
    "evaluations": {{
        "<model_name>": {{
            "relevance": <1-5>,
            "completeness": <1-5>,
            "naturalness": <1-5>,
            "total": <sum out of 15>,
            "reasoning": "<brief explanation>"
        }}
    }},
    "winner": "<model_name>",
    "summary": "<one sentence comparison>"
}}

Return ONLY valid JSON, no other text."""

    @staticmethod
    def _parse_evaluation(raw_response: Any) -> Dict[str, Any]:
        """Extract the JSON object from the judge's reply.

        Returns the parsed dict on success. Otherwise returns a dict flagged
        with ``parse_error`` that keeps the raw reply (and the parser error,
        when there is one) for later inspection.
        """
        failure = {"raw_response": raw_response, "parse_error": True}
        try:
            # Greedy match from the first '{' to the last '}' tolerates text
            # surrounding the JSON object.
            json_match = re.search(r'\{[\s\S]*\}', raw_response)
            if not json_match:
                return failure
            parsed = json.loads(json_match.group())
        except (TypeError, ValueError) as e:
            # TypeError: the reply is not text; ValueError: invalid JSON.
            failure["error"] = str(e)
            return failure

        if not isinstance(parsed, dict):
            return failure
        return parsed

    def evaluate_qualitatively_from_results(self, evaluator_llm: Any) -> List[Dict[str, Any]]:
        """
        Qualitatively evaluate all stored results using a judge LLM.
        Uses self.results from evaluate_all_models() to get questions and responses.

        Evaluates on: Relevance, Completeness, Naturalness (1-5 scale each)

        Questions with fewer than two successful responses are skipped.

        Args:
            evaluator_llm: LLM used as the judge; must expose ``invoke(prompt)``
                returning an object with a ``content`` attribute.

        Returns:
            List of dicts with the keys ``question``, ``responses`` and
            ``evaluation`` (also stored in ``self.qualitative_results``). A
            failed evaluation carries ``parse_error`` plus ``error`` and/or
            ``raw_response``.
        """
        if not self.results:
            print("No results available. Run evaluate_all_models() first.")
            return []

        # Group successful, non-empty responses by question.
        questions_map = {}
        for r in self.results:
            per_question = questions_map.setdefault(r['question'], {})
            if r['success'] and r['response']:
                per_question[r['model']] = r['response']

        self.qualitative_results = []

        for i, (question, responses) in enumerate(questions_map.items(), 1):
            print(f"\nEvaluating question {i}/{len(questions_map)}: {question[:50]}...")

            if len(responses) < 2:
                print("  Skipping - need at least 2 successful responses")
                continue

            eval_prompt = self._build_eval_prompt(question, responses)

            try:
                evaluation_response = evaluator_llm.invoke(eval_prompt).content
            except Exception as e:
                # The judge call itself failed; there is no reply to keep.
                evaluation = {"error": str(e), "parse_error": True}
            else:
                evaluation = self._parse_evaluation(evaluation_response)

            self.qualitative_results.append({
                "question": question,
                "responses": responses,
                "evaluation": evaluation
            })

            # Print winner for this question
            if 'parse_error' not in evaluation:
                print(f"  Winner: {evaluation.get('winner', 'N/A')}")

        return self.qualitative_results

    def get_qualitative_summary(self) -> pd.DataFrame:
        """
        Get aggregated qualitative metrics across all evaluated questions.

        Failed evaluations and malformed score entries (anything that is not a
        dict of numeric scores) are ignored.

        Returns:
            DataFrame with average scores and win counts per model, sorted by
            'Avg Total' descending and rounded to 2 decimals. Empty when there
            is nothing to summarise.
        """
        if not self.qualitative_results:
            print("No qualitative results. Run evaluate_qualitatively_from_results() first.")
            return pd.DataFrame()

        # Collect scores per model
        model_scores = {}
        model_wins = {}

        for result in self.qualitative_results:
            eval_data = result.get('evaluation', {})
            if 'parse_error' in eval_data:
                continue

            # Track winner (only names can match a model label).
            winner = eval_data.get('winner')
            if winner and isinstance(winner, str):
                model_wins[winner] = model_wins.get(winner, 0) + 1

            # Collect scores; the judge's JSON is untrusted, so verify shapes.
            evaluations = eval_data.get('evaluations', {})
            if not isinstance(evaluations, dict):
                continue
            for model, scores in evaluations.items():
                if not isinstance(scores, dict):
                    continue
                per_model = model_scores.setdefault(
                    model, {field: [] for field in self._SCORE_FIELDS}
                )
                for metric in self._SCORE_FIELDS:
                    value = scores.get(metric)
                    # bool is a subclass of int; a JSON true/false is not a score.
                    if isinstance(value, (int, float)) and not isinstance(value, bool):
                        per_model[metric].append(value)

        # Calculate averages
        summary_data = []
        for model, scores in model_scores.items():
            summary_data.append({
                'Model': model,
                'Avg Relevance': np.mean(scores['relevance']) if scores['relevance'] else 0,
                'Avg Completeness': np.mean(scores['completeness']) if scores['completeness'] else 0,
                'Avg Naturalness': np.mean(scores['naturalness']) if scores['naturalness'] else 0,
                'Avg Total': np.mean(scores['total']) if scores['total'] else 0,
                'Wins': model_wins.get(model, 0)
            })

        df = pd.DataFrame(summary_data)
        if not df.empty:
            df = df.sort_values('Avg Total', ascending=False).reset_index(drop=True)
            df = df.round(2)
        return df

    def print_qualitative_results(self):
        """Pretty print all qualitative evaluation results."""
        if not self.qualitative_results:
            print("No qualitative results. Run evaluate_qualitatively_from_results() first.")
            return

        for result in self.qualitative_results:
            print(f"\n{'='*70}")
            print(f"QUESTION: {result['question']}")
            print('='*70)

            print("\n📝 RESPONSES:")
            for model, response in result['responses'].items():
                print(f"\n--- {model} ---")
                # Long responses are cut to 500 characters.
                preview = (response[:500] + "...") if len(response) > 500 else response
                print(preview)

            print(f"\n{'='*70}")
            print("📊 EVALUATION:")
            print('='*70)

            eval_data = result['evaluation']
            if 'parse_error' not in eval_data:
                evaluations = eval_data.get('evaluations', {})
                if not isinstance(evaluations, dict):
                    evaluations = {}
                for model, scores in evaluations.items():
                    print(f"\n{model}:")
                    if not isinstance(scores, dict):
                        # Judge returned something other than a score object.
                        print(f"  Unparseable scores: {scores!r}")
                        continue
                    print(f"  Relevance: {scores.get('relevance', 'N/A')}/5")
                    print(f"  Completeness: {scores.get('completeness', 'N/A')}/5")
                    print(f"  Naturalness: {scores.get('naturalness', 'N/A')}/5")
                    print(f"  TOTAL: {scores.get('total', 'N/A')}/15")
                    print(f"  Reasoning: {scores.get('reasoning', 'N/A')}")

                print(f"\n🏆 WINNER: {eval_data.get('winner', 'N/A')}")
                print(f"📋 Summary: {eval_data.get('summary', 'N/A')}")
            else:
                # Show whatever diagnostics were kept: the error, the raw
                # judge reply, or both.
                shown = False
                for key in ('error', 'raw_response'):
                    if key in eval_data:
                        print(eval_data[key])
                        shown = True
                if not shown:
                    print('No evaluation available')

    def export_results(self, filename: str = "llm_comparison_results.json"):
        """Export detailed results to JSON.

        Args:
            filename: Destination path; the file is overwritten.
        """
        export_data = {
            'quantitative_results': self.results,
            'qualitative_results': self.qualitative_results
        }
        # Explicit encoding keeps the output independent of the platform default.
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(export_data, f, indent=2)
        print(f"Results exported to {filename}")


In [29]:
# Build the Hugging Face chat models; all of them take part in the comparison.
all_models = setup_llm_models()

selected_models = all_models

print(f"Selected models: {list(selected_models.keys())}\n")

# Create evaluator
evaluator = LLMEvaluator(selected_models)

test_cases = [
    "What are the top 5 airports with the most delays?",
    "How do Millennials travel compared to Baby Boomers?",
    "Which aircraft type has the best on-time performance?",
    "What is the flight number of the journey that departs from LAX and arrives at IAX and has generation 'Millennials'?",
    "What are the different loyalty program levels for a journey that has flight number 2, mention all of them"
]

# Every model answers through the notebook's `ask` pipeline.
evaluator.set_ask_function(ask)

# Run evaluation
results = evaluator.evaluate_all_models(test_cases)

# Print quantitative metrics
print("\n" + "="*80)
print("QUANTITATIVE METRICS")
print("="*80)
print(evaluator.get_quantitative_metrics())

# Score the collected answers with the Groq model (`llm`) as judge. Without
# this step there are no qualitative results to print or export.
evaluator.evaluate_qualitatively_from_results(llm)
evaluator.print_qualitative_results()
print(evaluator.get_qualitative_summary())

# Export results
evaluator.export_results()

Selected models: ['Mistral', 'Llama3', 'Gemma']

Starting LLM Evaluation...

Test Case 1/5: What are the top 5 airports with the most delays?
  Testing Mistral... 
Q: What are the top 5 airports with the most delays?
Mode: Hybrid

[
  {"query_index": 0, "params": {"x": 5}},
  {"query_index": 2, "params": {"x": 5}}
]
ANSWER:
----------------------------------------
 Based on the context provided, the top 5 destinations with the highest accumulated arrival delay minutes are CDX, JAX, SIX, FRX, and MUX. Similarly, the top 5 origin stations with the highest accumulated arrival delay minutes are DEX, EWX, MUX, SFX, and RSX. However, the user question asked for the top 5 airports with the most delays, which is not directly answered in the context.

‚úì (6.80s)
  Testing Llama3... 
Q: What are the top 5 airports with the most delays?
Mode: Hybrid

[
  {"query_index": 0, "params": {"x": 5}},
  {"query_index": 2, "params": {"x": 5}}
]
ANSWER:
----------------------------------------
I am unable

In [30]:
# Close driver when done
# driver.close()
# print("Closed.")