In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, TimestampType

# Initialize Spark
spark = SparkSession.builder.appName("SQLPractice").getOrCreate()

# 1. Create the `airports` DataFrame
airports_data = [
    ('JFK', 'New York'),
    ('LGA', 'New York'),
    ('EWR', 'New York'),
    ('LAX', 'Los Angeles'),
    ('ORD', 'Chicago'),
    ('SFO', 'San Francisco'),
    ('HND', 'Tokyo'),
    ('NRT', 'Tokyo'),
    ('KIX', 'Osaka'),
]

airports_schema = StructType([
    StructField("port_code", StringType(), True),
    StructField("city_name", StringType(), True),
])

airports_df = spark.createDataFrame(airports_data, schema=airports_schema)
airports_df.createOrReplaceTempView("airports")


# 2. Create the `flights` DataFrame
flights_data = [
    (1, 'JFK', 'HND', '2025-06-15 06:00', '2025-06-15 18:00'),
    (2, 'JFK', 'LAX', '2025-06-15 07:00', '2025-06-15 10:00'),
    (3, 'LAX', 'NRT', '2025-06-15 10:00', '2025-06-15 22:00'),
    (4, 'JFK', 'LAX', '2025-06-15 08:00', '2025-06-15 11:00'),
    (5, 'LAX', 'KIX', '2025-06-15 11:30', '2025-06-15 22:00'),
    (6, 'LGA', 'ORD', '2025-06-15 09:00', '2025-06-15 12:00'),
    (7, 'ORD', 'HND', '2025-06-15 11:30', '2025-06-15 23:30'),
    (8, 'EWR', 'SFO', '2025-06-15 09:00', '2025-06-15 12:00'),
    (9, 'LAX', 'HND', '2025-06-15 13:00', '2025-06-15 23:00'),
    (10, 'KIX', 'NRT', '2025-06-15 08:00', '2025-06-15 10:00'),
]

flights_schema = StructType([
    StructField("flight_id", StringType(), True),
    StructField("start_port", StringType(), True),
    StructField("end_port", StringType(), True),
    StructField("start_time", TimestampType(), True),
    StructField("end_time", TimestampType(), True),
])

from datetime import datetime

flights_data_typed = [
    (str(f[0]), f[1], f[2], datetime.strptime(f[3], "%Y-%m-%d %H:%M"), datetime.strptime(f[4], "%Y-%m-%d %H:%M"))
    for f in flights_data
]

flights_df = spark.createDataFrame(flights_data_typed, schema=flights_schema)
flights_df.createOrReplaceTempView("flights")


In [7]:
spark.sql("""
    with all_flights as (
    select f.* , a.city_name as start_city, e.city_name as end_city
    from flights as f inner join airports as a on f.start_port = a.port_code
    inner join airports as e on f.end_port = e.port_code ), 
    direct as (
        select start_city, null as middle_city, end_city, flight_id,
        datediff(minute, start_time, end_time) as time_taken
        from all_flights where start_city = 'New York' and end_city = 'Tokyo'
    )
    select a.start_city, a.end_city as middle_city, b.end_city, concat(a.flight_id,';', b.flight_id) as flight_id, a.start_time as trip_start_time, a.end_time as first_flight_end, b.start_time as second_flight_start, 
    b.end_time as second_flight_end from
    (select * from all_flights where start_city = 'New York') a inner join 
    (select * from all_flights where end_city = 'Tokyo') b on a.end_city = b.start_city
    where b.start_time > a.end_time
""").show()

+----------+-----------+--------+---------+-------------------+-------------------+-------------------+-------------------+
|start_city|middle_city|end_city|flight_id|    trip_start_time|   first_flight_end|second_flight_start|  second_flight_end|
+----------+-----------+--------+---------+-------------------+-------------------+-------------------+-------------------+
|  New York|Los Angeles|   Tokyo|      4;9|2025-06-15 08:00:00|2025-06-15 11:00:00|2025-06-15 13:00:00|2025-06-15 23:00:00|
|  New York|Los Angeles|   Tokyo|      2;9|2025-06-15 07:00:00|2025-06-15 10:00:00|2025-06-15 13:00:00|2025-06-15 23:00:00|
+----------+-----------+--------+---------+-------------------+-------------------+-------------------+-------------------+

