In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from core.EDA.FlightPerformance import FlightPerformance
from avstats.core.EDA_utils import *
from avstats.core.general_utils import *

In [None]:
# Histogram of departure delays. The 300 bins are computed over the full
# range of dep_delay; the x-limits below only crop the view to 0-200 minutes.
sns.set_theme(style="whitegrid")
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(df['dep_delay'], bins=300, kde=True, ax=ax)
ax.set_title("Distribution of Departure Delays")
ax.set_xlabel("Delay (min.)")
ax.set_ylabel("Number of Flights")
ax.set_xlim(0, 200)
plt.show()

# Departure-delay boxplots for the 10 routes that occur most often in df
# (routes are ranked by number of flights, not by delay).
top_routes = df['route_iata_code'].value_counts().nlargest(10).index
filtered_df = df[df['route_iata_code'].isin(top_routes)]
fig, ax = plt.subplots(figsize=(10, 5))
sns.boxplot(data=filtered_df, x='route_iata_code', y='dep_delay', ax=ax)
ax.set_title("Delays by Top 10 Routes")
ax.grid(axis='y', linestyle='--', alpha=0.7)
plt.setp(ax.get_xticklabels(), rotation=45, fontsize=8)
ax.set_ylim(-200, 500)  # crop extreme outliers so the boxes stay readable
fig.tight_layout()
plt.show()

In [None]:
# Build the flight-status summary table from df. The pie-chart cell below reads
# its 'Status' and 'Proportions (%)' columns. The bare last expression renders
# the table as the cell output.
status_summary = get_status_summary(df) # Get the status summary DataFrame
status_summary

In [None]:
# Pie chart of the share of each flight status, using the 'Status' labels and
# 'Proportions (%)' values from status_summary.
sns.set_theme(style="whitegrid")
status_proportions = status_summary['Proportions (%)']
fig, ax = plt.subplots(figsize=(6, 6))
ax.pie(
    status_proportions,
    labels=status_summary['Status'],
    autopct='%1.1f%%',
    startangle=50,
)
ax.set_title('Proportion of Different Status Types')
plt.show()

#### **2.2. Flight Performance**

Visualize the **distribution** of **delayed flights** across the dataset.
- Histogram for delayed minutes
- Histogram for delayed/not delayed flights

In [None]:
# Overall performance percentages for the whole dataset, printed one per label.
flight_performance = FlightPerformance(df)
percentages = flight_performance.overall_performance()
print("Overall Flight Performance Percentage\n")
for label, percent in percentages.items():
    print(f"{label}: {percent:.2f}%")

# Delay buckets as (lower bound, upper bound, label), bounds in minutes.
# NOTE(review): an upper bound of None presumably means "no upper limit" —
# confirm against FlightPerformance.delayed_flight_percentages.
delay_ranges = [
    (0, 60, "0 - 60 minutes"),
    (60, 120, "60 - 120 minutes"),
    (120, 180, "120 - 180 minutes"),
    (180, None, "over 180 minutes")
]

# Share of delayed flights that falls into each bucket.
# NOTE(review): assumes the returned mapping is keyed by the labels above.
delay_percentages = flight_performance.delayed_flight_percentages(delay_ranges)
print("\nPercentage of Delayed Flights\n")
for label, percent in delay_percentages.items():
    # The open-ended bucket label already reads as a phrase ("over 180 minutes");
    # prefixing it with "Between" printed "Between over 180 minutes".
    prefix = "" if str(label).lower().startswith("over") else "Between "
    print(f"{prefix}{label}: {percent:.2f}%")

In [None]:
# Two side-by-side views of departure delay:
#   left  - histogram of delay minutes (dep_delay), view cropped to 0-200 min
#   right - counts of the binary delayed flag (dep_delay_15)
sns.set_theme(style="whitegrid")
fig, (ax_minutes, ax_flag) = plt.subplots(1, 2, figsize=(12, 6))

# kde=True overlays a smoothed density curve on the histogram.
sns.histplot(df['dep_delay'], bins=100, kde=True, ax=ax_minutes)
ax_minutes.set_title('Distribution of Departure Delays')
ax_minutes.set_xlabel('dep_delay (min.)')
ax_minutes.set_ylabel('Number of Flights')
ax_minutes.set_xlim(0, 200)

# discrete=True centres one bar on each integer value (0 and 1). With bins=2
# the bars were drawn at 0.25 and 0.75 and did not line up with the tick
# labels placed at 0 and 1.
sns.histplot(df['dep_delay_15'], discrete=True, kde=False, ax=ax_flag)
ax_flag.set_title('Distribution of Delayed or Not')
ax_flag.set_xlabel('dep_delay_15 (binary)')
ax_flag.set_ylabel('Number of Flights')
ax_flag.set_xticks([0, 1])
ax_flag.set_xticklabels(['Not Delayed (<15 min.)', 'Delayed (>15 min.)'])

fig.tight_layout()
plt.show()

#### **2.3. Time window**
Understand how flight schedules (departure time `sdt` and arrival time `sat`) influence **delays** across different **time windows** (morning, afternoon, evening).

In [None]:
# Per-time-window flight summaries (with delay summarisation enabled), built
# once for the departure window column and once for the arrival window column.
dep_flight_summary, arr_flight_summary = (
    flight_summary_by_time_window(df, window_col, summarize_delays=True)
    for window_col in ('dep_time_window', 'arr_time_window')
)

# Time-window percentages for df; shown as the cell output.
time_window_proportions = calculate_time_window_percentages(df)
time_window_proportions

In [None]:
# Display the summary built for the 'dep_time_window' column.
dep_flight_summary

In [None]:
# Display the summary built for the 'arr_time_window' column.
arr_flight_summary

In [None]:
# With two dodged hue levels seaborn draws the bars at x - 0.2 and x + 0.2;
# the delayed bar is the first hue level, i.e. the left one.
DELAYED_BAR_OFFSET = -0.2


def plot_flights_by_time_window(summary, window_col, title, xlabel):
    """Plot delayed vs. total flights per time window as grouped bars.

    Parameters
    ----------
    summary : pandas.DataFrame
        One row per time window. Must contain `window_col`,
        'delayed_flights', 'total_flights' and
        f'{window_col}_percentage_delayed'.
    window_col : str
        Name of the time-window column ('dep_time_window' or 'arr_time_window').
    title : str
        Figure title.
    xlabel : str
        Label for the x axis.

    Returns
    -------
    tuple
        (bar_data, ax): the long-format frame that was plotted and the
        matplotlib Axes it was drawn on.
    """
    # Long format: one row per (time window, flight_type) pair.
    bar_data = pd.melt(
        summary,
        id_vars=window_col,
        value_vars=['delayed_flights', 'total_flights'],
        var_name='flight_type',
        value_name='count',
    )

    fig, ax = plt.subplots(figsize=(10, 6))
    sns.barplot(
        data=bar_data, x=window_col, y='count', hue='flight_type',
        dodge=True, palette=['#35b779', '#31688e'], ax=ax,
    )

    # Annotate each delayed bar with its delayed percentage. The x coordinate
    # is the row's position rather than its index label, so the text still
    # lands correctly when the summary does not have a 0..n-1 index.
    # NOTE(review): assumes the categories on the x axis appear in the same
    # order as the rows of `summary`.
    pct_col = f'{window_col}_percentage_delayed'
    for pos, (_, row) in enumerate(summary.iterrows()):
        if row['delayed_flights'] > 0:  # nothing to annotate for empty bars
            ax.text(
                pos + DELAYED_BAR_OFFSET, row['delayed_flights'], f"{row[pct_col]}",
                color='black', ha='center', va='bottom', fontsize=12,
            )

    ax.set_title(title)
    ax.set_ylabel('Number of Flights')
    ax.set_xlabel(xlabel)
    plt.setp(ax.get_xticklabels(), rotation=45)

    # Reuse seaborn's own legend handles so the readable labels stay attached
    # to the right colours (handles follow hue order: delayed, then total).
    handles, _ = ax.get_legend_handles_labels()
    ax.legend(handles=handles, labels=['Delayed Flights', 'Total Flights'], title='Flight Type')
    plt.show()
    return bar_data, ax


# Total and delayed flights by departure time window
dep_bar_data, ax = plot_flights_by_time_window(
    dep_flight_summary, 'dep_time_window',
    title='Total and Delayed Flights by Departure Time Window',
    xlabel='Departure Time Window',
)

# Total and delayed flights by arrival time window
arr_bar_data, bx = plot_flights_by_time_window(
    arr_flight_summary, 'arr_time_window',
    title='Total and Delayed Flights by Arrival Time Window',
    xlabel='Arrival Time Window',
)

#### **2.4. Flight Category Comparison**
Comparison of the **on-time performance** of cargo, private, & commercial flights and the **distribution** of delay times within them.

In [None]:
# Per-category on-time performance and per-category flight percentages,
# each computed from df by a project helper.
on_time_performance = calculate_on_time_performance(df)
flight_percentages = calculate_flight_percentages(df)

# Left-join on 'Flight Category' so every row of the on-time table is kept.
combined_performance = on_time_performance.merge(
    flight_percentages,
    on='Flight Category',
    how='left',
)
combined_performance

In [None]:
# Departure-delay histograms, one colour per flight_cat value.
sns.set_theme(style="whitegrid")
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=df, x='dep_delay', hue='flight_cat', kde=True, bins=50, ax=ax)
ax.set_xlim(0, 300)  # crop the view to 0-300 minutes for readability
ax.set_title('Distribution of Departure Delays by Flight Category')
ax.set_xlabel('Departure Delay (minutes)')
ax.set_ylabel('Number of flights')
plt.show()

#### **2.5. Airline Delays**
The total volume of flights and the percentage of delayed flights by airlines.

In [None]:
# Flights per airline: count of non-null uuid values in each airline_name group.
total_flights_airline = (
    df.groupby('airline_name')['uuid']
    .count()
    .rename('total_flights')
    .reset_index()
)

# Same count, restricted to rows flagged as delayed (dep_delay_15 == 1).
delayed_flights_airline = (
    df.loc[df['dep_delay_15'] == 1]
    .groupby('airline_name')['uuid']
    .count()
    .rename('delayed_flights')
    .reset_index()
)

# Left join keeps airlines without any delayed flight; their missing count
# becomes 0 before the percentage is computed.
flights_summary = total_flights_airline.merge(delayed_flights_airline, on='airline_name', how='left')
flights_summary['delayed_flights'] = flights_summary['delayed_flights'].fillna(0).astype(int)
flights_summary['percent_delayed(%)'] = (
    flights_summary['delayed_flights'] / flights_summary['total_flights'] * 100
).round(2)

sorted_flights_summary = flights_summary.sort_values(by='percent_delayed(%)', ascending=False)

# One bar per airline, highest delayed percentage first.
sns.set_theme(style="whitegrid")
fig, ax = plt.subplots(figsize=(20, 7))
ax.bar(sorted_flights_summary['airline_name'], sorted_flights_summary['percent_delayed(%)'], color='skyblue')
plt.setp(ax.get_xticklabels(), rotation=90, fontsize=4)  # small font: one tick label per airline
ax.set_xlabel('Airline')
ax.set_ylabel('Percentage of Delayed Flights (%)')
ax.set_title('Percentage of Delayed Flights by Airline')
fig.tight_layout()
plt.show()

#### **2.6. Frequent Routes and Airports**
Routes and airports in Brussels with the highest percentage of delays.

In [None]:
# Average-delay table from the project helper; print its first five rows.
# NOTE(review): the name suggests the table is already sorted by average delay — confirm in calculate_average_delay.
average_delay_sorted = calculate_average_delay(df)
print(average_delay_sorted.head())

# Horizontal bars for the first 10 rows, coloured by airline.
top_delays = average_delay_sorted.head(10)
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    data=top_delays,
    x='average_dep_delay',
    y='route_iata_code',
    hue='airline_name',
    dodge=False,
    ax=ax,
)
ax.set_title('Top Average Delays by Route and Airline')
ax.set_xlabel('Average Delay (min.)')
ax.set_ylabel('Route IATA Code')
plt.show()

#### **2.8. EDA Conclusions**

1. Most of the flights in the dataset are delayed. **Delayed Flights: 61.97%**
2. Most of the delays happen within the first hour. **Between 0 - 60 minutes: 88.05%**
3. Top 10 airline & route delays
4. Most flights are scheduled to depart in the morning. **Departure (Morning): 50.35%**
5. Most flights are scheduled to arrive in the morning. **Arrival (Morning): 40.07%**
6. The delay within departure flights throughout the day is similar
    * **Afternoon: 63.31%** (of the flights are delayed)
    * Evening: 62.15% 
    * Morning: 61.07% 
7. The delay within arrival flights throughout the day is similar
    * **Afternoon: 65.06%** (of the flights are delayed)
    * Evening: 62.5%
    * Morning: 58.08%
8. Most of the flights are commercial. **Commercial Flights: 92.81%**
9. The share of delayed flights is similar across flight categories
    * Cargo: 31.7%  (of the flights are delayed)
    * Commercial: 34.62%
    * **Private: 38.47%**
10. Most of the flights in the dataset are active. **Active flights: 51.04%**