In [1]:
import polars as pl
import pandas as pd
import duckdb
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
import polars as pl

class DataLoader:
    def __init__(self, flights_path, airlines_path, airports_path):
        self.flights_path = flights_path
        self.airlines_path = airlines_path
        self.airports_path = airports_path
        self.flights_df = None
        self.airlines_df = None
        self.airports_df = None

    def load_data(self):
        self.flights_df = pl.read_csv(self.flights_path)
        self.airlines_df = pl.read_csv(self.airlines_path)
        self.airports_df = pl.read_csv(self.airports_path)
        return self.flights_df, self.airlines_df, self.airports_df

In [3]:
# Create an instance of DataLoader
loader = DataLoader('flights.csv', 'airlines.csv', 'airports.csv')

# Load the data
flights_df, airlines_df, airports_df = loader.load_data()

# Print some information about the loaded data
print("Flights DataFrame:")
print(flights_df.head())
print(f"Shape: {flights_df.shape}")

print("\nAirlines DataFrame:")
print(airlines_df.head())
print(f"Shape: {airlines_df.shape}")

print("\nAirports DataFrame:")
print(airports_df.head())
print(f"Shape: {airports_df.shape}")

Flights DataFrame:
shape: (5, 31)
┌──────┬───────┬─────┬─────────────┬───┬────────────────┬───────────────┬─────────────────────┬───────────────┐
│ YEAR ┆ MONTH ┆ DAY ┆ DAY_OF_WEEK ┆ … ┆ SECURITY_DELAY ┆ AIRLINE_DELAY ┆ LATE_AIRCRAFT_DELAY ┆ WEATHER_DELAY │
│ ---  ┆ ---   ┆ --- ┆ ---         ┆   ┆ ---            ┆ ---           ┆ ---                 ┆ ---           │
│ i64  ┆ i64   ┆ i64 ┆ i64         ┆   ┆ i64            ┆ i64           ┆ i64                 ┆ i64           │
╞══════╪═══════╪═════╪═════════════╪═══╪════════════════╪═══════════════╪═════════════════════╪═══════════════╡
│ 2015 ┆ 1     ┆ 1   ┆ 4           ┆ … ┆ null           ┆ null          ┆ null                ┆ null          │
│ 2015 ┆ 1     ┆ 1   ┆ 4           ┆ … ┆ null           ┆ null          ┆ null                ┆ null          │
│ 2015 ┆ 1     ┆ 1   ┆ 4           ┆ … ┆ null           ┆ null          ┆ null                ┆ null          │
│ 2015 ┆ 1     ┆ 1   ┆ 4           ┆ … ┆ null           ┆ null        

In [4]:
# Basic EDA for Airlines
print("\nAirlines EDA:")
print(airlines_df.describe())



Airlines EDA:
shape: (9, 3)
┌────────────┬───────────┬──────────────────────┐
│ statistic  ┆ IATA_CODE ┆ AIRLINE              │
│ ---        ┆ ---       ┆ ---                  │
│ str        ┆ str       ┆ str                  │
╞════════════╪═══════════╪══════════════════════╡
│ count      ┆ 14        ┆ 14                   │
│ null_count ┆ 0         ┆ 0                    │
│ mean       ┆ null      ┆ null                 │
│ std        ┆ null      ┆ null                 │
│ min        ┆ AA        ┆ Alaska Airlines Inc. │
│ 25%        ┆ null      ┆ null                 │
│ 50%        ┆ null      ┆ null                 │
│ 75%        ┆ null      ┆ null                 │
│ max        ┆ WN        ┆ Virgin America       │
└────────────┴───────────┴──────────────────────┘


In [5]:
# Count of airlines
print("\nNumber of airlines:", len(airlines_df))


Number of airlines: 14


In [6]:
# Display all airlines
print("\nList of all airlines:")
print(airlines_df.sort('AIRLINE'))


List of all airlines:
shape: (14, 2)
┌───────────┬──────────────────────────────┐
│ IATA_CODE ┆ AIRLINE                      │
│ ---       ┆ ---                          │
│ str       ┆ str                          │
╞═══════════╪══════════════════════════════╡
│ AS        ┆ Alaska Airlines Inc.         │
│ AA        ┆ American Airlines Inc.       │
│ MQ        ┆ American Eagle Airlines Inc. │
│ EV        ┆ Atlantic Southeast Airlines  │
│ DL        ┆ Delta Air Lines Inc.         │
│ …         ┆ …                            │
│ WN        ┆ Southwest Airlines Co.       │
│ NK        ┆ Spirit Air Lines             │
│ US        ┆ US Airways Inc.              │
│ UA        ┆ United Air Lines Inc.        │
│ VX        ┆ Virgin America               │
└───────────┴──────────────────────────────┘


In [7]:
# Basic EDA for Airports
print("\nAirports EDA:")
print(airports_df.describe())


Airports EDA:
shape: (9, 8)
┌────────────┬───────────┬────────────────────────────┬──────────┬───────┬─────────┬───────────┬────────────┐
│ statistic  ┆ IATA_CODE ┆ AIRPORT                    ┆ CITY     ┆ STATE ┆ COUNTRY ┆ LATITUDE  ┆ LONGITUDE  │
│ ---        ┆ ---       ┆ ---                        ┆ ---      ┆ ---   ┆ ---     ┆ ---       ┆ ---        │
│ str        ┆ str       ┆ str                        ┆ str      ┆ str   ┆ str     ┆ f64       ┆ f64        │
╞════════════╪═══════════╪════════════════════════════╪══════════╪═══════╪═════════╪═══════════╪════════════╡
│ count      ┆ 322       ┆ 322                        ┆ 322      ┆ 322   ┆ 322     ┆ 319.0     ┆ 319.0      │
│ null_count ┆ 0         ┆ 0                          ┆ 0        ┆ 0     ┆ 0       ┆ 3.0       ┆ 3.0        │
│ mean       ┆ null      ┆ null                       ┆ null     ┆ null  ┆ null    ┆ 38.981244 ┆ -98.378964 │
│ std        ┆ null      ┆ null                       ┆ null     ┆ null  ┆ null    ┆ 8.6167

In [8]:
# Display all airlines
print("\nList of all AIRPORTS:")
print(airports_df.sort('AIRPORT'))


List of all AIRPORTS:
shape: (322, 7)
┌───────────┬─────────────────────────────────┬─────────────┬───────┬─────────┬──────────┬────────────┐
│ IATA_CODE ┆ AIRPORT                         ┆ CITY        ┆ STATE ┆ COUNTRY ┆ LATITUDE ┆ LONGITUDE  │
│ ---       ┆ ---                             ┆ ---         ┆ ---   ┆ ---     ┆ ---      ┆ ---        │
│ str       ┆ str                             ┆ str         ┆ str   ┆ str     ┆ f64      ┆ f64        │
╞═══════════╪═════════════════════════════════╪═════════════╪═══════╪═════════╪══════════╪════════════╡
│ ABR       ┆ Aberdeen Regional Airport       ┆ Aberdeen    ┆ SD    ┆ USA     ┆ 45.44906 ┆ -98.42183  │
│ ABI       ┆ Abilene Regional Airport        ┆ Abilene     ┆ TX    ┆ USA     ┆ 32.41132 ┆ -99.6819   │
│ SPI       ┆ Abraham Lincoln Capital Airpor… ┆ Springfield ┆ IL    ┆ USA     ┆ 39.84395 ┆ -89.67762  │
│ ADK       ┆ Adak Airport                    ┆ Adak        ┆ AK    ┆ USA     ┆ 51.87796 ┆ -176.64603 │
│ CAK       ┆ Akron-Canto

In [9]:
# Basic EDA for fLIGHTS
print("\nAirlines EDA:")
print(flights_df.describe())


Airlines EDA:
shape: (9, 32)
┌────────────┬────────────┬────────────┬────────────┬───┬───────────────┬───────────────┬───────────────┬──────────────┐
│ statistic  ┆ YEAR       ┆ MONTH      ┆ DAY        ┆ … ┆ SECURITY_DELA ┆ AIRLINE_DELAY ┆ LATE_AIRCRAFT ┆ WEATHER_DELA │
│ ---        ┆ ---        ┆ ---        ┆ ---        ┆   ┆ Y             ┆ ---           ┆ _DELAY        ┆ Y            │
│ str        ┆ f64        ┆ f64        ┆ f64        ┆   ┆ ---           ┆ f64           ┆ ---           ┆ ---          │
│            ┆            ┆            ┆            ┆   ┆ f64           ┆               ┆ f64           ┆ f64          │
╞════════════╪════════════╪════════════╪════════════╪═══╪═══════════════╪═══════════════╪═══════════════╪══════════════╡
│ count      ┆ 5.819079e6 ┆ 5.819079e6 ┆ 5.819079e6 ┆ … ┆ 1.063439e6    ┆ 1.063439e6    ┆ 1.063439e6    ┆ 1.063439e6   │
│ null_count ┆ 0.0        ┆ 0.0        ┆ 0.0        ┆ … ┆ 4.75564e6     ┆ 4.75564e6     ┆ 4.75564e6     ┆ 4.75564e6    │
│ 

In [10]:
flights_df.null_count() #Check Distribution of Airlines

YEAR,MONTH,DAY,DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,TAIL_NUMBER,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,DEPARTURE_TIME,DEPARTURE_DELAY,TAXI_OUT,WHEELS_OFF,SCHEDULED_TIME,ELAPSED_TIME,AIR_TIME,DISTANCE,WHEELS_ON,TAXI_IN,SCHEDULED_ARRIVAL,ARRIVAL_TIME,ARRIVAL_DELAY,DIVERTED,CANCELLED,CANCELLATION_REASON,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
0,0,0,0,0,0,14721,0,0,0,86153,86153,89047,89047,6,105071,105071,0,92513,92513,0,92513,105071,0,0,5729195,4755640,4755640,4755640,4755640,4755640


In [11]:
flights_df["AIRLINE"].value_counts()


AIRLINE,count
str,u32
"""HA""",76272
"""US""",198715
"""B6""",267048
"""NK""",117379
"""EV""",571977
…,…
"""OO""",588353
"""UA""",515723
"""VX""",61903
"""AA""",725984


In [12]:
# Analyze Flight Delays

In [13]:
flights_df.select([
    pl.col("DEPARTURE_DELAY").mean().alias("Avg Departure Delay"),
    pl.col("ARRIVAL_DELAY").mean().alias("Avg Arrival Delay")
])


Avg Departure Delay,Avg Arrival Delay
f64,f64
9.370158,4.407057


In [14]:
#  Find the Busiest Airports

In [15]:
flights_df["ORIGIN_AIRPORT"].value_counts().sort("count", descending=True).head(10)

ORIGIN_AIRPORT,count
str,u32
"""ATL""",346836
"""ORD""",285884
"""DFW""",239551
"""DEN""",196055
"""LAX""",194673
"""SFO""",148008
"""PHX""",146815
"""IAH""",146622
"""LAS""",133181
"""MSP""",112117


In [16]:
#Check Delays by Airline

flights_df.group_by("AIRLINE").agg(
    pl.col("ARRIVAL_DELAY").mean().alias("Avg Arrival Delay")
).sort("Avg Arrival Delay", descending=True)



AIRLINE,Avg Arrival Delay
str,f64
"""NK""",14.4718
"""F9""",12.504706
"""B6""",6.677861
"""EV""",6.585379
"""MQ""",6.457873
…,…
"""US""",3.706209
"""AA""",3.451372
"""HA""",2.023093
"""DL""",0.186754


In [17]:
#Analyze Cancellations

flights_df["CANCELLATION_REASON"].value_counts()

CANCELLATION_REASON,count
str,u32
"""D""",22
"""A""",25262
"""C""",15749
"""B""",48851
,5729195


In [18]:
import polars as pl
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

# Load the data
flights_df = pl.read_csv("flights.csv")  # Replace with your actual file path

# Basic EDA
print("Dataset Shape:", flights_df.shape)
print("\nColumn Names:", flights_df.columns)
print("\nData Types:\n", flights_df.dtypes)

# Check for missing values
missing_values = flights_df.null_count()
print("\nMissing Values:\n", missing_values)

# Remove rows with missing values
flights_df = flights_df.drop_nulls()

# Convert date columns to datetime
flights_df = flights_df.with_columns([
    pl.date(pl.col('YEAR'), pl.col('MONTH'), pl.col('DAY')).alias('DATE')])

Dataset Shape: (5819079, 31)

Column Names: ['YEAR', 'MONTH', 'DAY', 'DAY_OF_WEEK', 'AIRLINE', 'FLIGHT_NUMBER', 'TAIL_NUMBER', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT', 'SCHEDULED_DEPARTURE', 'DEPARTURE_TIME', 'DEPARTURE_DELAY', 'TAXI_OUT', 'WHEELS_OFF', 'SCHEDULED_TIME', 'ELAPSED_TIME', 'AIR_TIME', 'DISTANCE', 'WHEELS_ON', 'TAXI_IN', 'SCHEDULED_ARRIVAL', 'ARRIVAL_TIME', 'ARRIVAL_DELAY', 'DIVERTED', 'CANCELLED', 'CANCELLATION_REASON', 'AIR_SYSTEM_DELAY', 'SECURITY_DELAY', 'AIRLINE_DELAY', 'LATE_AIRCRAFT_DELAY', 'WEATHER_DELAY']

Data Types:
 [Int64, Int64, Int64, Int64, String, Int64, String, String, String, Int64, Int64, Int64, Int64, Int64, Int64, Int64, Int64, Int64, Int64, Int64, Int64, Int64, Int64, Int64, Int64, String, Int64, Int64, Int64, Int64, Int64]

Missing Values:
 shape: (1, 31)
┌──────┬───────┬─────┬─────────────┬───┬────────────────┬───────────────┬─────────────────────┬───────────────┐
│ YEAR ┆ MONTH ┆ DAY ┆ DAY_OF_WEEK ┆ … ┆ SECURITY_DELAY ┆ AIRLINE_DELAY ┆ LATE_AIRCRA

In [19]:
# Basic EDA
print("Dataset Shape:", flights_df.shape)
print("\nColumn Names:", flights_df.columns)
print("\nData Types:\n", flights_df.dtypes)

Dataset Shape: (0, 32)

Column Names: ['YEAR', 'MONTH', 'DAY', 'DAY_OF_WEEK', 'AIRLINE', 'FLIGHT_NUMBER', 'TAIL_NUMBER', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT', 'SCHEDULED_DEPARTURE', 'DEPARTURE_TIME', 'DEPARTURE_DELAY', 'TAXI_OUT', 'WHEELS_OFF', 'SCHEDULED_TIME', 'ELAPSED_TIME', 'AIR_TIME', 'DISTANCE', 'WHEELS_ON', 'TAXI_IN', 'SCHEDULED_ARRIVAL', 'ARRIVAL_TIME', 'ARRIVAL_DELAY', 'DIVERTED', 'CANCELLED', 'CANCELLATION_REASON', 'AIR_SYSTEM_DELAY', 'SECURITY_DELAY', 'AIRLINE_DELAY', 'LATE_AIRCRAFT_DELAY', 'WEATHER_DELAY', 'DATE']

Data Types:
 [Int64, Int64, Int64, Int64, String, Int64, String, String, String, Int64, Int64, Int64, Int64, Int64, Int64, Int64, Int64, Int64, Int64, Int64, Int64, Int64, Int64, Int64, Int64, String, Int64, Int64, Int64, Int64, Int64, Date]


In [20]:
# Check for missing values
missing_values = flights_df.null_count()
print("\nMissing Values:\n", missing_values)


Missing Values:
 shape: (1, 32)
┌──────┬───────┬─────┬─────────────┬───┬───────────────┬─────────────────────┬───────────────┬──────┐
│ YEAR ┆ MONTH ┆ DAY ┆ DAY_OF_WEEK ┆ … ┆ AIRLINE_DELAY ┆ LATE_AIRCRAFT_DELAY ┆ WEATHER_DELAY ┆ DATE │
│ ---  ┆ ---   ┆ --- ┆ ---         ┆   ┆ ---           ┆ ---                 ┆ ---           ┆ ---  │
│ u32  ┆ u32   ┆ u32 ┆ u32         ┆   ┆ u32           ┆ u32                 ┆ u32           ┆ u32  │
╞══════╪═══════╪═════╪═════════════╪═══╪═══════════════╪═════════════════════╪═══════════════╪══════╡
│ 0    ┆ 0     ┆ 0   ┆ 0           ┆ … ┆ 0             ┆ 0                   ┆ 0             ┆ 0    │
└──────┴───────┴─────┴─────────────┴───┴───────────────┴─────────────────────┴───────────────┴──────┘


In [21]:
# Remove rows with missing values
flights_df = flights_df.drop_nulls()

In [22]:
# One-hot encoding for categorical columns
categorical_columns = ['AIRLINE', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT', 'CANCELLATION_REASON']

for col in categorical_columns:
    dummies = flights_df.select(pl.col(col)).to_dummies()
    flights_df = flights_df.hstack(dummies)

# Remove original categorical columns
flights_df = flights_df.drop(categorical_columns)

In [23]:
# Convert date columns to datetime
flights_df = flights_df.with_columns([
    pl.date(pl.col('YEAR'), pl.col('MONTH'), pl.col('DAY')).alias('DATE')
])

# Convert time columns to minutes since midnight
for col in ['SCHEDULED_DEPARTURE', 'DEPARTURE_TIME', 'SCHEDULED_ARRIVAL', 'ARRIVAL_TIME']:
    flights_df = flights_df.with_columns([
        pl.when(pl.col(col).is_not_null())
          .then(pl.col(col).cast(pl.Int32).mod(2400).truediv(100).floor() * 60 + pl.col(col).cast(pl.Int32).mod(100))
          .otherwise(None)
          .alias(col + '_MINUTES')
    ])

# Calculate actual elapsed time
flights_df = flights_df.with_columns([
    (pl.col('ARRIVAL_TIME_MINUTES') - pl.col('DEPARTURE_TIME_MINUTES')).alias('ACTUAL_ELAPSED_TIME')
])

In [25]:
# Function to remove outliers safely
def remove_outliers(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    if Q1 is not None and Q3 is not None:
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        return df.filter((pl.col(column) >= lower_bound) & (pl.col(column) <= upper_bound))
    else:
        print(f"Warning: Could not calculate quartiles for {column}. Skipping outlier removal.")
        return df
# Function to check if dtype is numeric
def is_numeric_dtype(dtype):
    return dtype in [pl.Float32, pl.Float64, pl.Int32, pl.Int64]

# Remove outliers
for col in ['DEPARTURE_DELAY', 'ARRIVAL_DELAY', 'ACTUAL_ELAPSED_TIME']:
    if is_numeric_dtype(flights_df[col].dtype):
        flights_df = remove_outliers(flights_df, col)
    else:
        print(f"Warning: {col} is not numeric. Skipping outlier removal.")


# Basic statistics after cleaning and encoding
print("\nBasic Statistics After Cleaning and Encoding:\n", flights_df.describe())


Basic Statistics After Cleaning and Encoding:
 shape: (9, 34)
┌────────────┬──────┬───────┬──────┬───┬───────────────────┬───────────────────┬───────────────────┬───────────────────┐
│ statistic  ┆ YEAR ┆ MONTH ┆ DAY  ┆ … ┆ DEPARTURE_TIME_MI ┆ SCHEDULED_ARRIVAL ┆ ARRIVAL_TIME_MINU ┆ ACTUAL_ELAPSED_TI │
│ ---        ┆ ---  ┆ ---   ┆ ---  ┆   ┆ NUTES             ┆ _MINUTES          ┆ TES               ┆ ME                │
│ str        ┆ f64  ┆ f64   ┆ f64  ┆   ┆ ---               ┆ ---               ┆ ---               ┆ ---               │
│            ┆      ┆       ┆      ┆   ┆ f64               ┆ f64               ┆ f64               ┆ f64               │
╞════════════╪══════╪═══════╪══════╪═══╪═══════════════════╪═══════════════════╪═══════════════════╪═══════════════════╡
│ count      ┆ 0.0  ┆ 0.0   ┆ 0.0  ┆ … ┆ 0.0               ┆ 0.0               ┆ 0.0               ┆ 0.0               │
│ null_count ┆ 0.0  ┆ 0.0   ┆ 0.0  ┆ … ┆ 0.0               ┆ 0.0               ┆ 0.0      