In [None]:
# Step 0: Import libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Steps 1-2: Load the flights CSV and map its lowercase column names onto the
# CamelCase names used by the rest of this script. Columns that are not listed
# in the mapping keep their original names.
df = pd.read_csv("../data/flights.csv").rename(
    columns={
        "flight": "FlightNum",
        "origin": "Origin",
        "dest": "Dest",
        "arr_delay": "ArrDelay",
        "carrier": "Carrier",
    }
)

# Step 3: Compute by_carrier metrics (one row per Carrier)
#   flights           - number of rows with a non-null FlightNum
#   avg_arrival_delay - mean of ArrDelay (pandas skips missing values)
#   pct_delayed       - percentage of flights whose ArrDelay is > 0
# Missing ArrDelay values are dropped before computing pct_delayed: NaN > 0
# evaluates to False, so keeping them would count flights with no recorded
# arrival delay as "not delayed" and give the percentage a different
# denominator than the one avg_arrival_delay uses. A carrier with no recorded
# ArrDelay at all gets NaN for both delay metrics.
by_carrier = df.groupby('Carrier').agg(
    flights=('FlightNum', 'count'),
    avg_arrival_delay=('ArrDelay', 'mean'),
    pct_delayed=('ArrDelay', lambda x: x.dropna().gt(0).mean() * 100),
).reset_index()

# Step 4: Compute route performance (one row per "Origin-Dest" pair)
# The route key is the origin and destination codes joined with a hyphen.
df['route'] = df['Origin'] + "-" + df['Dest']

#   flights           - number of rows with a non-null FlightNum
#   avg_arrival_delay - mean of ArrDelay (pandas skips missing values)
#   pct_delayed       - percentage of flights whose ArrDelay is > 0
# Missing ArrDelay values are dropped before computing pct_delayed: NaN > 0
# evaluates to False, so keeping them would count flights with no recorded
# arrival delay as "not delayed" and give the percentage a different
# denominator than the one avg_arrival_delay uses. A route with no recorded
# ArrDelay at all gets NaN for both delay metrics.
route_perf = df.groupby('route').agg(
    flights=('FlightNum', 'count'),
    avg_arrival_delay=('ArrDelay', 'mean'),
    pct_delayed=('ArrDelay', lambda x: x.dropna().gt(0).mean() * 100),
).reset_index()

# Step 5: Summary KPIs
# Dataset-wide figures held in a one-row DataFrame. total_flights is the raw
# row count, so it includes any rows whose ArrDelay is missing.
summary_kpis = pd.DataFrame([
    {
        "avg_arrival_delay": df["ArrDelay"].mean(),
        "median_arrival_delay": df["ArrDelay"].median(),
        "total_flights": len(df),
    }
])

# Step 6: Visualizations

# Avg Delay by Airline: one bar per carrier, height = mean arrival delay.
plt.figure(figsize=(10, 6))
# barplot draws on the current figure and returns its Axes, which is used
# here to set the title and y-axis label in one call.
sns.barplot(data=by_carrier, x="Carrier", y="avg_arrival_delay").set(
    title="Average Arrival Delay by Airline",
    ylabel="Avg Delay (minutes)",
)
plt.show()

# Top 10 delayed routes: sort routes by mean arrival delay, highest first,
# and keep the first ten rows.
top_routes = route_perf.sort_values(by="avg_arrival_delay", ascending=False).iloc[:10]

plt.figure(figsize=(12, 6))
# barplot returns the Axes it drew on; set the title and y-axis label there.
sns.barplot(data=top_routes, x="route", y="avg_arrival_delay").set(
    title="Top 10 Routes by Avg Arrival Delay",
    ylabel="Avg Delay (minutes)",
)
# Tilt the route labels on the x-axis.
plt.xticks(rotation=45)
plt.show()

# Summary KPIs: print a banner line followed by the one-row KPI table.
print("=== Summary KPIs ===", summary_kpis, sep="\n")
