In [None]:
import yaml

# Load the project configuration from the YAML file one directory up.
# The encoding is pinned so decoding does not depend on the platform's
# locale default (which is what open() falls back to when none is given).
with open('../config.yaml', 'r', encoding='utf-8') as f:
    config = yaml.safe_load(f)

# Example usage:
# Raises KeyError if either key is missing from the loaded configuration.
viz_dir = config['visualizations']['delay_distributions']


In [None]:
# Cell 1: Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
# Suppress every warning for the rest of the session.
warnings.filterwarnings('ignore')

# Set style
# Order matters: the matplotlib style is applied first, then seaborn's
# "husl" palette is layered on top of it.
plt.style.use('default')
sns.set_palette("husl")

# Create visualization directories
import os

# One output folder per group of figures; already-existing folders are left
# untouched thanks to exist_ok=True.
for output_dir in (
    '../visualizations/delay_distributions',
    '../visualizations/temporal_patterns',
    '../visualizations/root_cause_analysis',
    '../visualizations/model_results',
):
    os.makedirs(output_dir, exist_ok=True)


In [None]:
# Cell 2: Load cleaned data
# Read the cleaned flight CSV into `df`, the frame every later cell works on,
# and report its (rows, columns) shape.
cleaned_csv_path = '../data/processed/cleaned_flight_data.csv'
df = pd.read_csv(cleaned_csv_path)
print(f"Data loaded. Shape: {df.shape}")


In [None]:
# Cell 3: Arrival delay distribution
# Three panels: a histogram of `arr_delay` over a fixed window, a histogram of
# the -60..300 subset, and the on-time vs delayed split of flights.
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

# Subplot 1: Overall delay distribution
# Bin only the window that is actually displayed. Without `range`, the 100
# bins are spread over the full data range, so a few extreme values leave the
# visible -100..500 window with only a handful of very wide bars.
overall_window = (-100, 500)
ax_overall = axes[0]
ax_overall.hist(df['arr_delay'].dropna(), bins=100, range=overall_window,
                alpha=0.7, color='skyblue', edgecolor='black')
ax_overall.set_title('Overall Arrival Delay Distribution')
ax_overall.set_xlabel('Delay (minutes)')
ax_overall.set_ylabel('Frequency')
ax_overall.set_xlim(*overall_window)

# Subplot 2: Delay distribution (zoomed in)
ax_zoom = axes[1]
delay_filtered = df[(df['arr_delay'] >= -60) & (df['arr_delay'] <= 300)]
ax_zoom.hist(delay_filtered['arr_delay'], bins=50, alpha=0.7,
             color='lightcoral', edgecolor='black')
ax_zoom.set_title('Delay Distribution (Filtered: -60 to 300 min)')
ax_zoom.set_xlabel('Delay (minutes)')
ax_zoom.set_ylabel('Frequency')

# Subplot 3: On-time vs Delayed
# Count flights, not rows: this notebook sums `arr_flights` to get the number
# of flights, so a single row can stand for many flights and a per-row
# threshold on `arr_delay` does not yield a share of flights.
# NOTE(review): assumes `arr_del15` is the per-row number of flights that
# arrived >=15 minutes late -- confirm against the dataset's data dictionary.
ax_share = axes[2]
delayed = df['arr_del15'].sum()
on_time = df['arr_flights'].sum() - delayed
ax_share.pie([on_time, delayed], labels=['On-time (<15 min)', 'Delayed (≥15 min)'],
             autopct='%1.1f%%', colors=['lightgreen', 'salmon'])
ax_share.set_title('On-time vs Delayed Flights')

fig.tight_layout()
fig.savefig('../visualizations/delay_distributions/arrival_delay_analysis.png', dpi=300, bbox_inches='tight')
plt.show()


In [None]:
# Cell 4: Delay causes analysis
# Four panels comparing the five delay causes: total minutes, average minutes
# per incident, number of incidents, and each cause's share of total minutes.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Display name -> (incident-count column, delay-minutes column).
cause_columns = {
    'Carrier': ('carrier_ct', 'carrier_delay'),
    'Weather': ('weather_ct', 'weather_delay'),
    'NAS': ('nas_ct', 'nas_delay'),
    'Security': ('security_ct', 'security_delay'),
    'Late Aircraft': ('late_aircraft_ct', 'late_aircraft_delay'),
}

# One fixed color per cause, so a cause keeps its color in every panel even
# when another cause is left out of a panel.
cause_colors = {
    'Carrier': '#FF9999',
    'Weather': '#66B2FF',
    'NAS': '#99FF99',
    'Security': '#FFCC99',
    'Late Aircraft': '#FF99CC',
}

# Calculate total delays by cause
delay_causes = {
    cause: df[delay_col].sum()
    for cause, (count_col, delay_col) in cause_columns.items()
}

# Total number of incidents by cause
cause_counts = {
    cause: df[count_col].sum()
    for cause, (count_col, delay_col) in cause_columns.items()
}

causes = list(delay_causes.keys())
minutes = list(delay_causes.values())
all_cause_colors = [cause_colors[cause] for cause in causes]

# Subplot 1: Total delay minutes by cause
ax_total = axes[0, 0]
bars = ax_total.bar(causes, minutes, color=all_cause_colors)
ax_total.set_title('Total Delay Minutes by Cause')
ax_total.set_ylabel('Total Delay Minutes')
ax_total.tick_params(axis='x', labelrotation=45)
for bar in bars:
    # Print the exact total above each bar.
    height = bar.get_height()
    ax_total.text(bar.get_x() + bar.get_width() / 2., height,
                  f'{int(height):,}', ha='center', va='bottom')

# Subplot 2: Average delay per incident
# Total minutes divided by total incidents. Averaging the delay column over
# rows would give minutes per row instead, which is not "per incident" when a
# row carries more than one incident. Causes with no incidents are omitted.
ax_avg = axes[0, 1]
avg_delays = {
    cause: delay_causes[cause] / cause_counts[cause]
    for cause in causes
    if cause_counts[cause] > 0
}
ax_avg.bar(list(avg_delays.keys()), list(avg_delays.values()),
           color=[cause_colors[cause] for cause in avg_delays])
ax_avg.set_title('Average Delay per Incident by Cause')
ax_avg.set_ylabel('Average Delay (minutes)')
ax_avg.tick_params(axis='x', labelrotation=45)

# Subplot 3: Frequency of delay causes
ax_freq = axes[1, 0]
ax_freq.bar(list(cause_counts.keys()), list(cause_counts.values()),
            color=[cause_colors[cause] for cause in cause_counts])
ax_freq.set_title('Frequency of Delay Causes')
ax_freq.set_ylabel('Number of Incidents')
ax_freq.tick_params(axis='x', labelrotation=45)

# Subplot 4: Pie chart of delay distribution
ax_pie = axes[1, 1]
ax_pie.pie(minutes, labels=causes, autopct='%1.1f%%', colors=all_cause_colors)
ax_pie.set_title('Distribution of Total Delay Minutes')

fig.tight_layout()
fig.savefig('../visualizations/root_cause_analysis/delay_causes_analysis.png', dpi=300, bbox_inches='tight')
plt.show()


In [None]:
# Cell 5: Temporal patterns analysis
# Six panels: three monthly views (mean arr_delay, flight volume, share of
# delayed flights), the ten airports and ten carriers with the highest mean
# arr_delay, and the completed / cancelled / diverted split.
fig, axes = plt.subplots(2, 3, figsize=(20, 12))

# Subplot 1: Delays by month
ax_month_delay = axes[0, 0]
monthly_delays = df.groupby('month')['arr_delay'].mean()
ax_month_delay.plot(monthly_delays.index, monthly_delays.values,
                    marker='o', linewidth=2, markersize=8)
ax_month_delay.set_title('Average Delay by Month')
ax_month_delay.set_xlabel('Month')
ax_month_delay.set_ylabel('Average Delay (minutes)')
ax_month_delay.set_xticks(range(1, 13))
ax_month_delay.grid(True, alpha=0.3)

# Subplot 2: Flight volume by month
ax_month_volume = axes[0, 1]
monthly_flights = df.groupby('month')['arr_flights'].sum()
ax_month_volume.bar(monthly_flights.index, monthly_flights.values,
                    color='lightblue', alpha=0.7)
ax_month_volume.set_title('Total Flights by Month')
ax_month_volume.set_xlabel('Month')
ax_month_volume.set_ylabel('Total Flights')
ax_month_volume.set_xticks(range(1, 13))

# Subplot 3: Delay percentage by month
# Share of flights, not of rows: delayed flights divided by all flights in
# each month. A per-row threshold on `arr_delay` would weight every row
# equally no matter how many flights it stands for.
# NOTE(review): assumes `arr_del15` is the per-row number of flights that
# arrived >=15 minutes late -- confirm against the dataset's data dictionary.
ax_month_pct = axes[0, 2]
monthly_totals = df.groupby('month')[['arr_del15', 'arr_flights']].sum()
monthly_delay_pct = monthly_totals['arr_del15'] / monthly_totals['arr_flights'] * 100
ax_month_pct.bar(monthly_delay_pct.index, monthly_delay_pct.values,
                 color='salmon', alpha=0.7)
ax_month_pct.set_title('Percentage of Delayed Flights by Month')
ax_month_pct.set_xlabel('Month')
ax_month_pct.set_ylabel('% Delayed Flights')
ax_month_pct.set_xticks(range(1, 13))

# Subplot 4: Top 10 airports by delay
ax_airports = axes[1, 0]
airport_delays = df.groupby('airport')['arr_delay'].mean().sort_values(ascending=False).head(10)
airport_positions = range(len(airport_delays))
ax_airports.barh(airport_positions, airport_delays.values, color='orange', alpha=0.7)
ax_airports.set_yticks(airport_positions)
ax_airports.set_yticklabels(airport_delays.index)
ax_airports.set_title('Top 10 Airports by Average Delay')
ax_airports.set_xlabel('Average Delay (minutes)')

# Subplot 5: Top 10 carriers by delay
ax_carriers = axes[1, 1]
carrier_delays = df.groupby('carrier')['arr_delay'].mean().sort_values(ascending=False).head(10)
carrier_positions = range(len(carrier_delays))
ax_carriers.barh(carrier_positions, carrier_delays.values, color='lightcoral', alpha=0.7)
ax_carriers.set_yticks(carrier_positions)
ax_carriers.set_yticklabels(carrier_delays.index)
ax_carriers.set_title('Top 10 Carriers by Average Delay')
ax_carriers.set_xlabel('Average Delay (minutes)')

# Subplot 6: Cancellation and diversion rates
# "Completed" is whatever remains of the flight total after removing
# cancelled and diverted flights.
ax_status = axes[1, 2]
total_flights = df['arr_flights'].sum()
cancelled = df['arr_cancelled'].sum()
diverted = df['arr_diverted'].sum()
completed = total_flights - cancelled - diverted

labels = ['Completed', 'Cancelled', 'Diverted']
sizes = [completed, cancelled, diverted]
colors = ['lightgreen', 'red', 'yellow']
ax_status.pie(sizes, labels=labels, autopct='%1.2f%%', colors=colors)
ax_status.set_title('Flight Completion Status')

fig.tight_layout()
fig.savefig('../visualizations/temporal_patterns/temporal_analysis.png', dpi=300, bbox_inches='tight')
plt.show()


In [None]:
# Cell 6: Correlation analysis
# Pairwise correlations between the flight-count, incident-count and
# delay-minute columns, drawn as a lower-triangle heatmap.

# Select numerical columns for correlation
numerical_cols = [
    'arr_flights', 'arr_del15',
    'carrier_ct', 'weather_ct', 'nas_ct', 'security_ct', 'late_aircraft_ct',
    'arr_delay',
    'carrier_delay', 'weather_delay', 'nas_delay', 'security_delay',
    'late_aircraft_delay',
]

# Calculate correlation matrix
corr_matrix = df[numerical_cols].corr()

# Hide the diagonal and everything above it: those cells only repeat what the
# lower triangle already shows.
mask = np.triu(np.ones(corr_matrix.shape, dtype=bool))

# Create heatmap
fig, ax = plt.subplots(figsize=(16, 12))
sns.heatmap(
    corr_matrix,
    mask=mask,
    annot=True,
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=0.5,
    cbar_kws={"shrink": .8},
    ax=ax,
)
ax.set_title('Correlation Matrix of Flight Delay Features')
fig.tight_layout()
fig.savefig('../visualizations/model_results/correlation_heatmap.png', dpi=300, bbox_inches='tight')
plt.show()


In [None]:
# Cell 7: Summary statistics
# Headline figures for the dataset. Record counts and flight counts are kept
# apart: `len(df)` is the number of rows, while flights are obtained by
# summing `arr_flights`. Every figure labelled "flights" is therefore
# computed from flight counts rather than from row counts.
# NOTE(review): assumes `arr_del15` is the per-row number of flights that
# arrived >=15 minutes late and `arr_delay` the per-row delay minutes --
# confirm against the dataset's data dictionary.


def _summary_ratio(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is zero."""
    return numerator / denominator if denominator else float('nan')


def _summary_top_group(group_col):
    """Return the `group_col` value with the highest mean arr_delay, or 'n/a'.

    'n/a' is returned when there is nothing to rank (no rows, or no group
    with a non-missing mean), where idxmax would otherwise fail.
    """
    group_means = df.groupby(group_col)['arr_delay'].mean().dropna()
    return group_means.idxmax() if not group_means.empty else 'n/a'


summary_total_flights = df['arr_flights'].sum()
summary_delayed_flights = df['arr_del15'].sum()
summary_delay_minutes = df['arr_delay'].sum()

print("=== EDA SUMMARY STATISTICS ===")
print(f"Total records analyzed: {len(df):,}")
print(f"Total flights: {summary_total_flights:,}")
print(f"Total delayed flights (≥15 min): {summary_delayed_flights:,}")
print(f"Delay rate: {_summary_ratio(summary_delayed_flights, summary_total_flights) * 100:.2f}%")
print(f"Average delay (all flights): {_summary_ratio(summary_delay_minutes, summary_total_flights):.2f} minutes")
print(f"Average delay (delayed flights only): {_summary_ratio(summary_delay_minutes, summary_delayed_flights):.2f} minutes")
# Ranked by mean arr_delay per row, the same measure as the top-10 charts.
print(f"Most delayed airport: {_summary_top_group('airport')}")
print(f"Most delayed carrier: {_summary_top_group('carrier')}")
