AIRLINES FLIGHT DELAY ANALYSIS

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# 1. Read the delayed-flights CSV (path is relative to the working directory)
df = pd.read_csv("DelayedFlights.csv")

# 2. Quick overview: number of rows, then the count of missing values
#    in every column
print("Total records:", df.shape[0])
print("\nMissing values per column:\n", df.isna().sum())


In [None]:
# 3. Replace missing values in the delay-cause columns with zero
#    (safe fallback) so they can be summed and divided below
delay_cols = [
    'CarrierDelay',
    'WeatherDelay',
    'NASDelay',
    'SecurityDelay',
    'LateAircraftDelay',
]
df[delay_cols] = df[delay_cols].fillna(0)

# 4. Share of each delay cause within a record, expressed as a percentage
#    of that record's summed causes
df['total_delay'] = df[delay_cols].sum(axis=1)
for col in delay_cols:
    # A record whose causes sum to zero yields 0/0 = NaN here; the trailing
    # fillna turns that into 0 for every cause.
    # NOTE(review): such all-zero records still count in any later average
    # of these columns, pulling the averages down — confirm this is intended.
    df[col + '_pct'] = df[col].div(df['total_delay']).mul(100).fillna(0)

In [None]:
# 5. Group the records by Month and take the mean of every
#    "<cause>_pct" column within each group
avg_pct_by_month = (
    df.groupby('Month')[[cause + '_pct' for cause in delay_cols]]
    .mean()
)
print("\nAverage Delay % by Month:\n", avg_pct_by_month)

In [None]:
# 6. Stacked bar chart: one bar per month, one segment per delay cause,
#    segment height = that cause's average percentage
avg_pct_by_month.plot.bar(
    stacked=True,
    figsize=(10, 5),
    colormap='tab20',
)
plt.xlabel("Month")
plt.ylabel("Average Delay Percentage (%)")
plt.title("Average Delay Percentage by Cause per Month")
# Anchor the legend just outside the right edge of the axes so it does not
# cover the bars; tight_layout then re-fits the figure contents.
plt.legend(
    title="Delay Cause",
    loc='upper left',
    bbox_to_anchor=(1.05, 1),
)
plt.tight_layout()
plt.show()



In [None]:
# 7. Heatmap of the pairwise correlation matrix of the delay-cause columns,
#    with each cell annotated to two decimal places
plt.figure(figsize=(6, 5))
sns.heatmap(
    df.loc[:, delay_cols].corr(),
    cmap="coolwarm",
    annot=True,
    fmt=".2f",
)
plt.title("Correlation: Delay Causes")
plt.show()