In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the cleaned CSV into the frame that every later cell works on.
# The path is relative, so it resolves against the kernel's working directory.
data = pd.read_csv(filepath_or_buffer=r"../data/cleaned_data.csv")

In [None]:
# Apply seaborn's "whitegrid" style to all figures drawn after this cell.
# `sns.set` is only an alias of `set_theme`, which is the documented entry point.
sns.set_theme(style="whitegrid")

In [None]:
# Summary statistics of the loaded frame. Leaving the result as the cell's last
# expression lets the notebook render it as a table instead of wrapped text.
data.describe()

In [None]:
# Columns treated as numerical features; the list is reused by the histogram
# grid and the correlation matrix further down.
numerical_features = [
    "DayOfWeekEncoded",
    "Month",
    "DepTimeMinutes",
    "DepHour",
    "CRSDepTimeMinutes",
    "CRSDepHour",
    "ArrTimeMinutes",
    "ArrHour",
    "CRSArrTimeMinutes",
    "CRSArrHour",
    "Distance",
    "ActualElapsedTime",
    "CRSElapsedTime",
    "AirTime",
    "TaxiOut",
    "TaxiIn",
    "Origin_Dep_Count",
    "Dest_Arr_Count",
]

# Last expression of the cell -> rendered as a rich table by the notebook.
data[numerical_features].describe()

In [None]:
# Delay-related columns; the list is reused by the delay histograms, the delay
# box plot and the correlation matrix further down.
delay_features = [
    "DepDelay",
    "ArrDelay",
    "CarrierDelay",
    "NonCarrierDelay",
    "WeatherDelay",
    "NASDelay",
    "SecurityDelay",
    "LateAircraftDelay",
]

# Last expression of the cell -> rendered as a rich table by the notebook.
data[delay_features].describe()

In [None]:
# Columns treated as categorical features.
categorical_features = ["DayOfWeek", "UniqueCarrier", "Origin", "Dest", "FlightID"]

# Cast to object before describing: on a frame with mixed dtypes, describe()
# summarises only the numeric columns, so a numerically stored column (e.g.
# FlightID, if it is read as an integer -- TODO confirm) would silently hide
# the other columns. With object dtype every column gets count/unique/top/freq.
data[categorical_features].astype("object").describe()

In [None]:
# Histogram with KDE overlay for every entry of `numerical_features`,
# three panels per row, saved to ../figures/.
# The number of rows is derived from the list length so that editing the
# feature list cannot overflow a fixed-size grid.
n_cols = 3
n_rows = max(1, -(-len(numerical_features) // n_cols))  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(25, 20), squeeze=False)
panels = axes.ravel()

for ax, feature in zip(panels, numerical_features):
    sns.histplot(data[feature], kde=True, ax=ax)
    ax.set_title(f"Distribution of {feature}")

# Hide panels left over when the list does not fill the last row.
for ax in panels[len(numerical_features):]:
    ax.set_visible(False)

fig.tight_layout()
fig.savefig(r"../figures/numerical_features_histogram.png")
plt.show()

In [None]:
# Histogram with KDE overlay for every entry of `delay_features`,
# four panels per row, saved to ../figures/.
# The number of rows is derived from the list length so that editing the
# feature list cannot overflow a fixed-size grid.
n_cols = 4
n_rows = max(1, -(-len(delay_features) // n_cols))  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 10), squeeze=False)
panels = axes.ravel()

for ax, feature in zip(panels, delay_features):
    sns.histplot(data[feature], kde=True, ax=ax)
    ax.set_title(f"Distribution of {feature}")

# Hide panels left over when the list does not fill the last row.
for ax in panels[len(delay_features):]:
    ax.set_visible(False)

fig.tight_layout()
fig.savefig(r"../figures/delay_features_histogram.png")
plt.show()

In [None]:
# Weekday labels in the order the bars should appear.
days_order = ["MON", "TUE", "WED", "THU", "FRI", "SAT", "SUN"]

# Store the column as an ordered categorical using exactly these categories.
data["DayOfWeek"] = pd.Categorical(
    data["DayOfWeek"],
    categories=days_order,
    ordered=True,
)

# One bar per weekday, counting rows of `data`.
fig, ax = plt.subplots(figsize=(12, 5))
sns.countplot(data=data, x="DayOfWeek", order=days_order, ax=ax)
ax.set_title("Flights by Day of Week")
ax.set_xlabel("Day of the Week")
ax.set_ylabel("Flight Count")
fig.savefig(r"../figures/flights_per_day_countplot.png")
plt.show()

In [None]:
# Row count per UniqueCarrier value, bars in sorted order of the carrier values.
# Missing values are dropped before sorting: sorted() cannot compare NaN
# (a float) with string values and would raise TypeError.
carrier_order = sorted(data["UniqueCarrier"].dropna().unique())

fig, ax = plt.subplots(figsize=(12, 5))
sns.countplot(data=data, x="UniqueCarrier", order=carrier_order, ax=ax)
ax.set_title("Flights by Carrier")
ax.set_xlabel("Carrier")
ax.set_ylabel("Flight Count")
ax.tick_params(axis="x", labelrotation=90)
fig.savefig(r"../figures/flights_per_carrier_countplot.png")
plt.show()

In [None]:
# Bar charts of the ten most frequent values of Origin and Dest.
categorical_columns = ["Origin", "Dest"]

# A single row with one panel per column: a 2x2 grid would only fill its top
# row and leave the lower half of the saved figure blank. The height is halved
# accordingly so each panel keeps the size it had in the 2x2 layout.
fig, axes = plt.subplots(1, len(categorical_columns), figsize=(15, 5), squeeze=False)

for ax, col in zip(axes.ravel(), categorical_columns):
    # value_counts() sorts by descending frequency, so head(10) is the top ten.
    top_10 = data[col].value_counts().head(10)
    sns.barplot(x=top_10.index, y=top_10.values, ax=ax)
    ax.set_title(f"Top 10 Most Common {col}")
    ax.set_xlabel(col)
    ax.set_ylabel("Count")
    ax.tick_params(axis="x", labelrotation=45)

fig.tight_layout()
fig.savefig(r"../figures/most_common_dest_origin_barplot.png")
plt.show()

In [None]:
# Bar charts of the ten least frequent values of Origin and Dest.
categorical_columns = ["Origin", "Dest"]

# A single row with one panel per column: a 2x2 grid would only fill its top
# row and leave the lower half of the saved figure blank. The height is halved
# accordingly so each panel keeps the size it had in the 2x2 layout.
fig, axes = plt.subplots(1, len(categorical_columns), figsize=(15, 5), squeeze=False)

for ax, col in zip(axes.ravel(), categorical_columns):
    # value_counts() sorts by descending frequency, so tail(10) is the bottom ten.
    bot_10 = data[col].value_counts().tail(10)
    sns.barplot(x=bot_10.index, y=bot_10.values, ax=ax)
    ax.set_title(f"Top 10 Least Common {col}")
    ax.set_xlabel(col)
    ax.set_ylabel("Count")
    ax.tick_params(axis="x", labelrotation=45)

fig.tight_layout()
fig.savefig(r"../figures/least_common_dest_origin_barplot.png")
plt.show()

In [None]:
# 30-bin histogram of ArrDelay with a KDE overlay, saved to ../figures/.
fig, ax = plt.subplots(figsize=(12, 5))
sns.histplot(data["ArrDelay"], kde=True, bins=30, ax=ax)
ax.set_title("Distribution of Arrival Delays")
ax.set_xlabel("Arrival Delay (minutes)")
ax.set_ylabel("Frequency")
fig.savefig(r"../figures/arr_delay_histogram.png")
plt.show()

In [None]:
# One box per column of `delay_features`, all drawn on a shared y-axis.
fig, ax = plt.subplots(figsize=(12, 5))
delay_frame = data[delay_features]
sns.boxplot(data=delay_frame, ax=ax)
ax.set_title("Box Plot of Delay Types")
ax.set_ylabel("Delay Time (minutes)")
# Tilt the column names on the x-axis.
plt.xticks(rotation=45)
fig.savefig(r"../figures/delay_types_boxplot.png")
plt.show()

In [None]:
# 50-bin histogram (with KDE overlay) of the DepTimeMinutes_DayOfWeekEncoded column.
# Title and x-label name the column that is actually plotted; they previously
# said "DepHour_DayOfWeek", which is not the column being read here.
# The output file name is left unchanged so existing references to it keep working.
interaction_col = "DepTimeMinutes_DayOfWeekEncoded"

fig, ax = plt.subplots(figsize=(12, 5))
sns.histplot(data[interaction_col], kde=True, bins=50, ax=ax)
ax.set_title(f"Distribution of {interaction_col} Interaction")
ax.set_xlabel(interaction_col)
ax.set_ylabel("Count")
fig.savefig(r"../figures/dep_hour_day_histogram.png")
plt.show()

In [None]:
# 50-bin histogram (with KDE overlay) of the CRSArrTimeMinutes_DayOfWeekEncoded column.
# Title and x-label name the column that is actually plotted; they previously
# said "CRSArrHour_DayOfWeek", which is not the column being read here.
# The output file name is left unchanged so existing references to it keep working.
interaction_col = "CRSArrTimeMinutes_DayOfWeekEncoded"

fig, ax = plt.subplots(figsize=(12, 5))
sns.histplot(data[interaction_col], kde=True, bins=50, ax=ax)
ax.set_title(f"Distribution of {interaction_col} Interaction")
ax.set_xlabel(interaction_col)
ax.set_ylabel("Count")
fig.savefig(r"../figures/crs_arr_hour_day_histogram.png")
plt.show()

In [None]:
# 50-bin histogram of Origin_Dep_Count with a KDE overlay, saved to ../figures/.
fig, ax = plt.subplots(figsize=(12, 5))
sns.histplot(data["Origin_Dep_Count"], kde=True, bins=50, ax=ax)
ax.set_title("Distribution of Origin Departure Counts")
ax.set_xlabel("Origin_Dep_Count")
ax.set_ylabel("Count")
fig.savefig(r"../figures/origin_dep_counts_histogram.png")
plt.show()

In [None]:
# 50-bin histogram of Dest_Arr_Count with a KDE overlay, saved to ../figures/.
fig, ax = plt.subplots(figsize=(12, 5))
sns.histplot(data["Dest_Arr_Count"], kde=True, bins=50, ax=ax)
ax.set_title("Distribution of Destination Arrival Counts")
ax.set_xlabel("Dest_Arr_Count")
ax.set_ylabel("Count")
fig.savefig(r"../figures/dest_arr_counts_histogram.png")
plt.show()

In [None]:
# Box plot of ArrDelay grouped by the Month column, saved to ../figures/.
fig, ax = plt.subplots(figsize=(12, 5))
sns.boxplot(data=data, x="Month", y="ArrDelay", ax=ax)
ax.set_title("Arrival Delays by Month")
ax.set_xlabel("Month")
ax.set_ylabel("Arrival Delay (minutes)")
fig.savefig(r"../figures/arr_delay_month_boxplot.png")
plt.show()

In [None]:
# Scatter of AirTime against Distance; alpha=0.5 keeps overlapping points visible.
fig, ax = plt.subplots(figsize=(12, 5))
sns.scatterplot(data=data, x="Distance", y="AirTime", alpha=0.5, ax=ax)
ax.set_title("Relation of Distance and Airtime")
ax.set_xlabel("Distance (miles)")
ax.set_ylabel("AirTime (minutes)")
ax.grid(True)
fig.savefig(r"../figures/distance_vs_airtime_scatter.png")
plt.show()

In [None]:
# Pairwise correlations between all numerical and delay columns,
# drawn as an annotated heatmap (two decimals per cell) and saved to ../figures/.
correlation_matrix = data[numerical_features + delay_features].corr()

fig, ax = plt.subplots(figsize=(30, 20))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    cbar=True,
    ax=ax,
)
ax.set_title("Correlation Matrix")
fig.savefig(r"../figures/correlation_matrix.png")
plt.show()