In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


# Load the dataset; the path is relative to the current working directory.
df = pd.read_csv("airline_delay.csv")


# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly: wrapping it in print() would emit a stray "None" line.
print("Dataset Info:")
df.info()
print("\nSummary Statistics:")
print(df.describe())


# Per-column count of missing entries before any imputation.
print("\nMissing Values:")
print(df.isnull().sum())

# Handling numeric missing values: each numeric column is filled with its own
# median. Non-numeric columns are not touched by this step.
numeric_columns = df.select_dtypes(include=[np.number]).columns
df[numeric_columns] = df[numeric_columns].fillna(df[numeric_columns].median())

# Missing values after handling (any remaining counts belong to columns the
# median fill could not cover, e.g. non-numeric ones).
print("\nMissing Values After Handling:")
print(df.isnull().sum())

# Columns inspected by every exploratory plot in this section.
numeric_features = [
    "DEPARTURE_DELAY",
    "ARRIVAL_DELAY",
    "SCHEDULED_ARRIVAL",
    "DISTANCE",
    "SCHEDULED_DEPARTURE",
    "WHEELS_ON",
]

# Distribution of each selected column (30 bins per histogram).
df[numeric_features].hist(bins=30, figsize=(15, 10))
plt.suptitle("Histograms of Numeric Features")
plt.show()

# Pairwise scatter plots of the selected columns; y=1.02 positions the title
# just above the top edge of the figure.
sns.pairplot(df[numeric_features])
plt.suptitle("Scatter Plots of Numeric Features", y=1.02)
plt.show()

# Correlation matrix of the selected columns, drawn as an annotated heatmap.
corr_matrix = df[numeric_features].corr()
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm")
plt.title("Correlation Matrix")
plt.show()

# One boxplot per selected column, laid out on a 2 x 3 grid (six features).
rows, cols = 2, 3
fig, axes = plt.subplots(rows, cols, figsize=(15, 10))
for ax, feature in zip(axes.flat, numeric_features):
    sns.boxplot(y=df[feature], ax=ax)
    ax.set_title(f"Boxplot of {feature}")
fig.tight_layout()
plt.show()

# outliers 
def detect_outliers(feature, data=None, whisker=1.5):
    """Return the rows whose ``feature`` value lies outside the IQR fences.

    The fences are ``Q1 - whisker * IQR`` and ``Q3 + whisker * IQR``, where
    Q1 and Q3 are the 25th and 75th percentiles of the column.

    Args:
        feature: Name of the column to inspect.
        data: DataFrame to analyse. When omitted, the module-level ``df`` is
            used, so existing ``detect_outliers(feature)`` calls behave as
            before.
        whisker: Multiplier applied to the IQR when building the fences.
            The default of 1.5 reproduces the original behaviour.

    Returns:
        A DataFrame holding only the rows of ``data`` flagged as outliers.
        Rows whose value is NaN are never flagged, because both comparisons
        evaluate to False for NaN.
    """
    if data is None:
        # Fall back to the script-wide dataset for backward compatibility.
        data = df

    values = data[feature]
    # A single quantile call yields both quartiles (in the requested order).
    q1, q3 = values.quantile([0.25, 0.75])
    iqr = q3 - q1
    lower_bound = q1 - whisker * iqr
    upper_bound = q3 + whisker * iqr
    return data[(values < lower_bound) | (values > upper_bound)]

# Report how many rows detect_outliers() flags for each selected feature.
for feature in numeric_features:
    outliers = detect_outliers(feature)
    print(f"Outliers in {feature}: {len(outliers)}")

# Rank every numeric column by its correlation with ARRIVAL_DELAY, strongest
# positive first. The target's own entry is part of the ranking as well.
numeric_corr = df[numeric_columns].corr()
target_corr = numeric_corr["ARRIVAL_DELAY"].sort_values(ascending=False)
print("Correlation with ARRIVAL_DELAY:\n", target_corr)

# Bar chart of the ranking; column names are rotated so they stay legible.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=target_corr.index, y=target_corr.values, ax=ax)
plt.xticks(rotation=90)
ax.set_title("Correlation with ARRIVAL_DELAY")
plt.show()