# Airline Flight Insights - Neo4j Database Connection

This notebook connects to the Neo4j database and provides utilities to run predefined queries.

In [None]:
from neo4j import GraphDatabase
from dotenv import load_dotenv
import os

In [None]:
# Load environment variables from the .env file one directory above the
# notebook's working directory.
load_dotenv('../.env')

# Get Neo4j connection details from environment.
# Each setting prefers the NEO4J_-prefixed name and falls back to a bare name.
# NOTE(review): load_dotenv() does not override variables that already exist
# in the process environment by default, and some platforms (e.g. Windows)
# predefine USERNAME. The bare USERNAME fallback may therefore resolve to the
# OS login name instead of the .env value; prefer NEO4J_USERNAME in .env.
NEO4J_URI = os.getenv('NEO4J_URI') or os.getenv('URI')
NEO4J_USERNAME = os.getenv('NEO4J_USERNAME') or os.getenv('USERNAME')
NEO4J_PASSWORD = os.getenv('NEO4J_PASSWORD') or os.getenv('PASSWORD')

# Fail fast: without a URI the driver cannot be created, so stop here with an
# actionable message instead of an obscure error in a later cell.
if not NEO4J_URI:
    raise RuntimeError(
        "Neo4j URI is not configured: set NEO4J_URI (or URI) in ../.env "
        "or in the environment."
    )

# Missing credentials are only reported, not fatal, because a server may be
# configured without authentication.
_missing_credentials = [
    name
    for name, value in (
        ('NEO4J_USERNAME', NEO4J_USERNAME),
        ('NEO4J_PASSWORD', NEO4J_PASSWORD),
    )
    if not value
]
if _missing_credentials:
    print(f"Warning: no value found for {', '.join(_missing_credentials)}; authentication may fail.")

print(f"Loaded config for URI: {NEO4J_URI}")

In [None]:
# Create the Neo4j driver used by the rest of the notebook
driver = GraphDatabase.driver(
    NEO4J_URI,
    auth=(NEO4J_USERNAME, NEO4J_PASSWORD)
)

# Verify connection. If the check fails, close the driver that was just
# created before re-raising, so a failed attempt does not leave an open
# driver behind when the cell is re-run after fixing the configuration.
try:
    driver.verify_connectivity()
except Exception:
    driver.close()
    raise
print("Successfully connected to Neo4j database!")

In [None]:
# Cypher query templates, addressed by 0-based position via run_query().
# The bracketed number on each comment is that position. Inserting or
# reordering entries shifts the indices used by callers, so new queries
# should be appended. "params" lists the $placeholders each query expects;
# "returns" lists the column aliases in its RETURN clause.
queries = [
    # Intent 1: Operational Delay Diagnostics
    # [0] Destination airports ranked by summed arrival_delay_minutes, highest first.
    #     params: x (row limit) | returns: destination, total_delay
    "MATCH (j:Journey)-[:ON]->(f:Flight)-[:ARRIVES_AT]->(a:Airport) RETURN a.station_code AS destination, SUM(j.arrival_delay_minutes) AS total_delay ORDER BY total_delay DESC LIMIT $x",
    # [1] Same as [0] but lowest total first (ASC).
    #     params: x (row limit) | returns: destination, total_delay
    "MATCH (j:Journey)-[:ON]->(f:Flight)-[:ARRIVES_AT]->(a:Airport) RETURN a.station_code AS destination, SUM(j.arrival_delay_minutes) AS total_delay ORDER BY total_delay ASC LIMIT $x",
    # [2] Origin airports ranked by summed arrival_delay_minutes, highest first.
    #     Note: the metric is still the arrival delay, grouped by departure airport.
    #     params: x (row limit) | returns: origin, total_delay
    "MATCH (j:Journey)-[:ON]->(f:Flight)-[:DEPARTS_FROM]->(a:Airport) RETURN a.station_code AS origin, SUM(j.arrival_delay_minutes) AS total_delay ORDER BY total_delay DESC LIMIT $x",
    # [3] Same as [2] but lowest total first (ASC).
    #     params: x (row limit) | returns: origin, total_delay
    "MATCH (j:Journey)-[:ON]->(f:Flight)-[:DEPARTS_FROM]->(a:Airport) RETURN a.station_code AS origin, SUM(j.arrival_delay_minutes) AS total_delay ORDER BY total_delay ASC LIMIT $x",
    # [4] Routes out of one origin whose average arrival_delay_minutes exceeds $x.
    #     params: origin_station_code, x (delay threshold) | returns: origin, destination, avg_delay
    "MATCH (o:Airport {station_code: $origin_station_code})<-[:DEPARTS_FROM]-(f:Flight)-[:ARRIVES_AT]->(d:Airport), (j:Journey)-[:ON]->(f) WITH o, d, AVG(j.arrival_delay_minutes) AS avg_delay WHERE avg_delay > $x RETURN o.station_code AS origin, d.station_code AS destination, avg_delay",
    # [5] Average arrival_delay_minutes over journeys whose number_of_legs equals $x.
    #     params: x (number of legs) | returns: avg_delay
    "MATCH (j:Journey {number_of_legs: $x}) RETURN AVG(j.arrival_delay_minutes) AS avg_delay",

    # Intent 2: Service Quality & Product Optimization
    # [6] Routes whose average food_satisfaction_score for one passenger class is below $threshold.
    #     params: class_name, threshold | returns: origin, destination, avg_food_score
    "MATCH (o:Airport)<-[:DEPARTS_FROM]-(f:Flight)-[:ARRIVES_AT]->(d:Airport), (j:Journey {passenger_class: $class_name})-[:ON]->(f) WITH o, d, AVG(j.food_satisfaction_score) AS avg_food_score WHERE avg_food_score < $threshold RETURN o.station_code AS origin, d.station_code AS destination, avg_food_score",
    # [7] Distinct flight numbers having a journey with food_satisfaction_score 1
    #     (hard-coded) and actual_flown_miles above $x.
    #     params: x (miles threshold) | returns: f.flight_number
    "MATCH (j:Journey {food_satisfaction_score: 1})-[:ON]->(f:Flight) WHERE j.actual_flown_miles > $x RETURN DISTINCT f.flight_number",

    # Intent 3: Fleet Performance Monitoring
    # [8] The single fleet type with the most journeys whose arrival_delay_minutes exceeds $x.
    #     params: x (delay threshold) | returns: aircraft_type, delay_frequency
    "MATCH (j:Journey)-[:ON]->(f:Flight) WHERE j.arrival_delay_minutes > $x RETURN f.fleet_type_description AS aircraft_type, COUNT(j) AS delay_frequency ORDER BY delay_frequency DESC LIMIT 1",
    # [9] Average food_satisfaction_score for journeys on fleet type $x.
    #     params: x (fleet_type_description) | returns: avg_food_score
    "MATCH (j:Journey)-[:ON]->(f:Flight {fleet_type_description: $x}) RETURN AVG(j.food_satisfaction_score) AS avg_food_score",
    # [10] Average actual_flown_miles for journeys on fleet type $x.
    #      params: x (fleet_type_description) | returns: avg_miles
    "MATCH (j:Journey)-[:ON]->(f:Flight {fleet_type_description: $x}) RETURN AVG(j.actual_flown_miles) AS avg_miles",
    # [11] Percentage of journeys on fleet type $x with a negative arrival_delay_minutes.
    #      params: x (fleet_type_description) | returns: early_arrival_percentage
    "MATCH (j:Journey)-[:ON]->(f:Flight {fleet_type_description: $x}) WITH COUNT(j) AS total_flights, COUNT(CASE WHEN j.arrival_delay_minutes < 0 THEN 1 END) AS early_flights RETURN (TOFLOAT(early_flights) / total_flights) * 100 AS early_arrival_percentage",

    # Intent 4: High-Value Customer (Loyalty) Retention
    # [12] Average arrival_delay_minutes for journeys taken by passengers at one loyalty level.
    #      params: loyalty_program_level | returns: avg_delay
    "MATCH (p:Passenger {loyalty_program_level: $loyalty_program_level})-[:TOOK]->(j:Journey) RETURN AVG(j.arrival_delay_minutes) AS avg_delay",
    # [13] Passengers at one loyalty level with a journey whose arrival_delay_minutes exceeds $x.
    #      params: loyalty_program_level, x (delay threshold) | returns: passenger_id, delay
    "MATCH (p:Passenger {loyalty_program_level: $loyalty_program_level})-[:TOOK]->(j:Journey) WHERE j.arrival_delay_minutes > $x RETURN p.record_locator AS passenger_id, j.arrival_delay_minutes AS delay",

    # Intent 5: Demographic Market Analysis
    # [14] The single most-used fleet type for one generation, restricted to journeys
    #      with actual_flown_miles above $threshold.
    #      params: generation, threshold | returns: aircraft_type, usage_count
    "MATCH (p:Passenger {generation: $generation})-[:TOOK]->(j:Journey)-[:ON]->(f:Flight) WHERE j.actual_flown_miles > $threshold RETURN f.fleet_type_description AS aircraft_type, COUNT(f) AS usage_count ORDER BY usage_count DESC LIMIT 1",
    # [15] The single most-used fleet type for one generation (no distance filter).
    #      params: generation | returns: fleet_type, usage_count
    "MATCH (p:Passenger {generation: $generation})-[:TOOK]->(j:Journey)-[:ON]->(f:Flight) RETURN f.fleet_type_description AS fleet_type, COUNT(f) AS usage_count ORDER BY usage_count DESC LIMIT 1",
    # [16] Destination airports ranked by passenger count for one generation, highest first.
    #      params: generation, x (row limit) | returns: destination, passenger_volume
    "MATCH (p:Passenger {generation: $generation})-[:TOOK]->(j:Journey)-[:ON]->(f:Flight)-[:ARRIVES_AT]->(a:Airport) RETURN a.station_code AS destination, COUNT(p) AS passenger_volume ORDER BY passenger_volume DESC LIMIT $x"
]

# Report how many templates were registered (valid indices are 0..len-1).
print(f"Loaded {len(queries)} queries")

In [None]:
def run_query(query_index: int, **params) -> list:
    """
    Run one of the predefined Cypher queries and return its rows.

    The query text is looked up in the module-level ``queries`` list and
    executed in a new session on the module-level ``driver``.

    Args:
        query_index: The index of the query in the queries list (0-based)
        **params: Keyword arguments for query parameters
            (e.g., x=5, origin_station_code='LAX'). Each keyword is bound to
            the Cypher placeholder of the same name.

    Returns:
        List of records as dictionaries (one dict per returned row).

    Raises:
        ValueError: If query_index is negative or not less than len(queries).

    Example:
        # Run query 0 with x=5
        results = run_query(0, x=5)

        # Run query 4 with origin_station_code and x parameters
        results = run_query(4, origin_station_code='ORD', x=30)
    """
    if query_index < 0 or query_index >= len(queries):
        raise ValueError(f"Query index {query_index} is out of range. Valid range: 0-{len(queries)-1}")

    query = queries[query_index]

    with driver.session() as session:
        # Hand the parameters over as a single mapping rather than unpacking
        # them as keyword arguments: unpacked, a Cypher parameter named
        # ``query`` or ``parameters`` would collide with session.run()'s own
        # arguments instead of being bound as a query parameter.
        result = session.run(query, parameters=params)
        # Read every record while the session is still open.
        records = [record.data() for record in result]

    return records

## Example Queries

The following cells demonstrate how to use the `run_query` function with different query types.

In [None]:
# Example 1: Query 0 - destinations ranked by total delay, limited to 5 rows
results = run_query(0, x=5)
print("Top 5 destinations by total delay:")
for row in results:
    destination, total_delay = row['destination'], row['total_delay']
    print(f"  {destination}: {total_delay} minutes")

In [None]:
# Example 2: Query 5 - average delay over journeys with a given number of legs
results = run_query(5, x=1)
print("Average delay for 1-leg journeys:")
for row in results:
    mean_delay = row['avg_delay']
    # avg_delay may come back as None; show a placeholder in that case
    if mean_delay is None:
        print("  No data available")
        continue
    print(f"  {mean_delay:.2f} minutes")

In [None]:
# Example 3: Query 8 - fleet type with the most journeys delayed over 30 minutes
results = run_query(8, x=30)
print("Aircraft with most delays (>30 min):")
for row in results:
    aircraft_type, delay_frequency = row['aircraft_type'], row['delay_frequency']
    print(f"  {aircraft_type}: {delay_frequency} delayed flights")

In [None]:
# Example 4: Query 15 - most-used fleet type for the 'Millennial' generation
results = run_query(15, generation='Millennial')
print("Most used fleet type by Millennials:")
for row in results:
    fleet_type, usage_count = row['fleet_type'], row['usage_count']
    print(f"  {fleet_type}: {usage_count} flights")

In [None]:
# Example 5: Query 16 - destinations ranked by passenger volume for 'Gen X', limited to 5 rows
results = run_query(16, generation='Gen X', x=5)
print("Top 5 destinations for Gen X passengers:")
for row in results:
    destination, passenger_volume = row['destination'], row['passenger_volume']
    print(f"  {destination}: {passenger_volume} passengers")

In [None]:
# Close the driver when done: uncomment the two lines below and run this cell once you are finished
# driver.close()
# print("Driver closed.")