In [0]:
from pyspark.sql import SparkSession

# Obtain the notebook's Spark session: reuse the active one if it exists,
# otherwise start a new session named "FlightDataAnalysis".
spark = (
    SparkSession.builder
    .appName("FlightDataAnalysis")
    .getOrCreate()
)

In [0]:
# Load the flights CSV into a DataFrame. The first row supplies column names,
# and column types are inferred by Spark from the file contents.
data = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("/FileStore/tables/flights_sample_3m.csv")
)

In [0]:
# Expose the DataFrame to the %sql cells below under the view name "flights".
data.createOrReplaceTempView(name="flights")

In [0]:
%sql
/* Question 1: Top 10 airlines with the most cancelled flights.
   Counts the rows flagged CANCELLED = 1 per airline and keeps the ten
   largest counts. */
SELECT f.AIRLINE,
       COUNT(1) AS Cancelled_Flights
FROM flights AS f
WHERE f.CANCELLED = 1
GROUP BY f.AIRLINE
ORDER BY Cancelled_Flights DESC
LIMIT 10;


AIRLINE,Cancelled_Flights
Southwest Airlines Co.,19465
American Airlines Inc.,10907
SkyWest Airlines Inc.,7745
Delta Air Lines Inc.,5982
United Air Lines Inc.,5536
Republic Airline,4646
Envoy Air,3633
PSA Airlines Inc.,3301
JetBlue Airways,3039
Endeavor Air Inc.,2394


In [0]:
%sql
/* Question 2: Find the destination with the lowest average arrival delay
   for each origin.

   - AvgDelay: mean ARR_DELAY per (ORIGIN, DEST) route. Rows without an
     arrival delay are filtered out first, so a route with no recorded
     arrivals produces no row at all rather than a NULL average.
     This matters because Spark sorts NULLs FIRST under ORDER BY ... ASC:
     without the filter, a route with a NULL average would be ranked 1 and
     reported as the "lowest delay" destination.
   - Ranked: ranks each origin's routes by ascending average delay. RANK()
     is used, so routes tied for the lowest average are all returned.
   - The final ORDER BY makes the result order deterministic.
*/
WITH AvgDelay AS (
    SELECT ORIGIN, DEST, AVG(ARR_DELAY) AS Avg_Arrival_Delay
    FROM flights
    WHERE ARR_DELAY IS NOT NULL
    GROUP BY ORIGIN, DEST
),
Ranked AS (
    SELECT ORIGIN, DEST, Avg_Arrival_Delay,
           RANK() OVER (PARTITION BY ORIGIN ORDER BY Avg_Arrival_Delay ASC) AS rnk
    FROM AvgDelay
)
SELECT ORIGIN, DEST, Avg_Arrival_Delay
FROM Ranked
WHERE rnk = 1
ORDER BY ORIGIN, DEST;


ORIGIN,DEST,Avg_Arrival_Delay
ABE,CLT,-14.505330490405118
ABI,IAH,-5.844827586206897
ABQ,SFB,-46.75
ABR,MSP,7.625
ABY,ATL,-2.437209302325581
ACK,PHL,-0.6428571428571429
ACT,DFW,6.935672514619883
ACV,PHX,-4.571428571428571
ACY,SJU,-4.25
ADK,ANC,8.147058823529411


In [0]:
%sql
/* Question 3: Create a view for delayed flights & find the airline with
   the most delays.

   This cell defines the view: a flight counts as "delayed" when its
   departure delay plus its arrival delay exceeds 30. If either delay is
   NULL the sum is NULL, the comparison is not true, and the row is left out.
*/
CREATE OR REPLACE TEMPORARY VIEW DelayedFlights AS
SELECT f.*
FROM flights AS f
WHERE f.DEP_DELAY + f.ARR_DELAY > 30;



In [0]:
%sql
/* Airline with the largest number of rows in the DelayedFlights view. */
SELECT d.AIRLINE,
       COUNT(1) AS Delayed_Flight_Count
FROM DelayedFlights AS d
GROUP BY d.AIRLINE
ORDER BY Delayed_Flight_Count DESC
LIMIT 1;

AIRLINE,Delayed_Flight_Count
Southwest Airlines Co.,109488


In [0]:
%sql
/* Question 4: Calculate the percentage of flights between each origin and
   destination.

   The percentage is each route's share of all flights leaving the same
   ORIGIN (the window is partitioned by ORIGIN), so the values for one
   origin sum to roughly 100.

   - TotalFlights: flight count per (ORIGIN, DEST) route.
   - RouteShare: unrounded share of the origin's flights for each route.
   - Final ordering uses Total_Flights rather than the rounded percentage,
     with DEST as a tie-breaker, so routes that round to the same value
     always come back in the same order.
*/
WITH TotalFlights AS (
    SELECT ORIGIN, DEST, COUNT(*) AS Total_Flights
    FROM flights
    GROUP BY ORIGIN, DEST
),
RouteShare AS (
    SELECT ORIGIN, DEST, Total_Flights,
           (Total_Flights * 100.0 / SUM(Total_Flights) OVER (PARTITION BY ORIGIN)) AS Pct_Of_Origin
    FROM TotalFlights
)
SELECT ORIGIN, DEST, Total_Flights, ROUND(Pct_Of_Origin, 2) AS Percentage
FROM RouteShare
ORDER BY ORIGIN, Total_Flights DESC, DEST;


ORIGIN,DEST,Total_Flights,Percentage
ABE,CLT,483,23.88
ABE,ATL,398,19.67
ABE,DTW,292,14.43
ABE,ORD,271,13.4
ABE,SFB,216,10.68
ABE,PIE,108,5.34
ABE,PGD,75,3.71
ABE,BNA,48,2.37
ABE,MYR,48,2.37
ABE,FLL,35,1.73


In [0]:
%sql
/* Question 5: Top 3 busiest days of the week.
   FL_DATE is formatted with the 'EEEE' pattern to get the day name, flights
   are counted per day name, and the three largest counts are kept. */
SELECT DATE_FORMAT(f.FL_DATE, 'EEEE') AS DayOfWeek,
       COUNT(1) AS Flight_Count
FROM flights AS f
GROUP BY DATE_FORMAT(f.FL_DATE, 'EEEE')
ORDER BY Flight_Count DESC
LIMIT 3;


DayOfWeek,Flight_Count
Thursday,446925
Monday,446600
Friday,446292
