In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder, StandardScaler  
import seaborn as sns

In [2]:
# Keep the file path and the loaded frame under separate names so that
# `dropped_data` only ever refers to a DataFrame.
cleaned_data_path = os.path.join("dataset", "Cleaned_flight_data.csv")
dropped_data = pd.read_csv(cleaned_data_path)

# Optional sanity checks (uncomment while exploring):
#dropped_data.isna().sum()   # total missing values per column
#dropped_data.describe()     # summary statistics

In [3]:
# Adjust delay calculations for edge cases
def calculate_departure_delay(departure_time, scheduled_departure):
    """Return the departure delay in minutes, correcting for a midnight crossing.

    Both arguments are clock times expressed in minutes since midnight (the
    1440-minute day used for the correction implies this unit). A plain
    subtraction is wrong whenever the actual and the scheduled time fall on
    opposite sides of midnight, so both directions are handled:

    * actual late in the evening, scheduled shortly after midnight: the flight
      left early, on the previous day -> subtract a full day;
    * actual shortly after midnight, scheduled late in the evening: the flight
      left late, on the following day -> add a full day.

    Args:
        departure_time: actual departure, minutes since midnight.
        scheduled_departure: planned departure, minutes since midnight.

    Returns:
        The signed delay in minutes (negative means early). If either input
        is NaN no branch matches and NaN is propagated by the subtraction.
    """
    minutes_per_day = 1440
    late_evening = 1300    # 21:40 - "close to the end of the day"
    early_morning = 300    # 05:00 - "close to the start of the day"

    # Departed before midnight although scheduled just after it (early departure).
    if departure_time > late_evening and scheduled_departure < early_morning:
        return (departure_time - minutes_per_day) - scheduled_departure

    # Scheduled before midnight but actually departed after it (late departure).
    if departure_time < early_morning and scheduled_departure > late_evening:
        return departure_time + minutes_per_day - scheduled_departure

    return departure_time - scheduled_departure

def calculate_arrival_delay(arrival_time, scheduled_arrival):
    """Return the arrival delay in minutes, correcting for a midnight crossing.

    Both arguments are clock times expressed in minutes since midnight (the
    1440-minute day used for the correction implies this unit). A plain
    subtraction is wrong whenever the actual and the scheduled time fall on
    opposite sides of midnight, so both directions are handled:

    * actual shortly after midnight, scheduled late in the evening: the flight
      landed late, on the following day -> add a full day;
    * actual late in the evening, scheduled shortly after midnight: the flight
      landed early, on the previous day -> subtract a full day.

    Args:
        arrival_time: actual arrival, minutes since midnight.
        scheduled_arrival: planned arrival, minutes since midnight.

    Returns:
        The signed delay in minutes (negative means early). If either input
        is NaN no branch matches and NaN is propagated by the subtraction.
    """
    minutes_per_day = 1440
    late_evening = 1300    # 21:40 - "close to the end of the day"
    early_morning = 300    # 05:00 - "close to the start of the day"

    # Scheduled before midnight but actually landed after it (late arrival).
    if arrival_time < early_morning and scheduled_arrival > late_evening:
        return arrival_time + minutes_per_day - scheduled_arrival

    # Landed before midnight although scheduled just after it (early arrival).
    if arrival_time > late_evening and scheduled_arrival < early_morning:
        return (arrival_time - minutes_per_day) - scheduled_arrival

    return arrival_time - scheduled_arrival

In [4]:
# AIR_TIME = ((WHEELS_ON - WHEELS_OFF) + (ORIGIN_TZ - DESTINATION_TZ) * 60) mod 1440
#
# The time-zone term (offsets in hours, hence * 60) puts the two clock readings
# on a common clock; presumably WHEELS_OFF is origin-local and WHEELS_ON is
# destination-local minutes since midnight - TODO confirm against the cleaning step.
# The readings can fall on different calendar days (e.g. an overnight flight), which
# would make the raw difference negative or larger than a day, so the result is
# wrapped into [0, 1440). NaN inputs stay NaN.
wheels_clock_diff = dropped_data['WHEELS_ON'] - dropped_data['WHEELS_OFF']
tz_correction_minutes = (dropped_data['ORIGIN_AIRPORT_TZ'] - dropped_data['DESTINATION_AIRPORT_TZ']) * 60
dropped_data.loc[:, 'AIR_TIME'] = (wheels_clock_diff + tz_correction_minutes) % 1440

In [5]:
# Gate-to-gate duration: time in the air plus both taxi phases.
# ELAPSED_TIME = AIR_TIME + TAXI_OUT + TAXI_IN (a missing component yields NaN).
time_in_air = dropped_data['AIR_TIME']
taxi_total = time_in_air.add(dropped_data['TAXI_OUT']).add(dropped_data['TAXI_IN'])
dropped_data.loc[:, 'ELAPSED_TIME'] = taxi_total

In [6]:
# DEPARTURE_DELAY is recomputed row by row so that the edge-case handling in
# calculate_departure_delay is applied to every flight.
def _departure_delay_of(row):
    """Feed one row's actual and scheduled departure into the delay helper."""
    actual = row['DEPARTURE_TIME']
    planned = row['SCHEDULED_DEPARTURE']
    return calculate_departure_delay(actual, planned)


dropped_data.loc[:, 'DEPARTURE_DELAY'] = dropped_data.apply(_departure_delay_of, axis=1)

In [7]:
# ARRIVAL_DELAY is recomputed row by row so that the edge-case handling in
# calculate_arrival_delay is applied to every flight.
def _arrival_delay_of(row):
    """Feed one row's actual and scheduled arrival into the delay helper."""
    actual = row['ARRIVAL_TIME']
    planned = row['SCHEDULED_ARRIVAL']
    return calculate_arrival_delay(actual, planned)


dropped_data.loc[:, 'ARRIVAL_DELAY'] = dropped_data.apply(_arrival_delay_of, axis=1)

In [8]:
#dropped_data.loc[:, "ARRIVAL_DELAY"]

In [13]:
# Column roles: categorical features, columns kept out of the feature set,
# prediction targets, and (by elimination) numerical features.
categorical_cols = [
    "MONTH",
    "DAY_OF_WEEK",
    'AIRLINE',
    'ORIGIN_AIRPORT',
    'DESTINATION_AIRPORT',
    "ORIGIN_AIRPORT_TZ",
    "DESTINATION_AIRPORT_TZ",
]
Not_to_included_ft = [
    "AIR_SYSTEM_DELAY",
    "SECURITY_DELAY",
    "AIRLINE_DELAY",
    "LATE_AIRCRAFT_DELAY",
    "WEATHER_DELAY",
]
target = ["DEPARTURE_DELAY", "ARRIVAL_DELAY"]

# Every remaining column, in the frame's own order, is treated as numerical.
non_numerical = categorical_cols + target + Not_to_included_ft
numerical_cols = [name for name in dropped_data.columns if name not in non_numerical]

In [18]:
#print(numerical_cols)

In [15]:
# Step 1 - one-hot encode the categorical columns.
# sparse_output=False (the spelling used by newer scikit-learn releases) gives a
# dense array; drop='first' omits the first level of every feature.
encoder = OneHotEncoder(drop='first', sparse_output=False)
encoded_values = encoder.fit_transform(dropped_data[categorical_cols])
encoded_names = encoder.get_feature_names_out(categorical_cols)
categorical_encoded = pd.DataFrame(encoded_values, columns=encoded_names)


In [16]:
# Step 2 - standardise the numerical columns and wrap the resulting array in a
# DataFrame that carries the original column names.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(dropped_data[numerical_cols])
numerical_scaled = pd.DataFrame(scaled_values, columns=numerical_cols)

In [17]:
# Assemble the modelling table column-wise: scaled numerics, encoded categoricals,
# then the targets. The targets get a fresh 0..n-1 index so they line up with the
# two freshly built frames.
target_frame = dropped_data[target].reset_index(drop=True)
feature_and_target_parts = [numerical_scaled, categorical_encoded, target_frame]
final_data = pd.concat(feature_and_target_parts, axis=1)

In [19]:
# Preview the first rows of the working frame. The previous selection,
# .loc[:, "DEPARTURE"], names a column that no visible cell creates, and the
# recorded output is a five-row view of the whole frame rather than one column.
dropped_data.head()

Unnamed: 0,YEAR,MONTH,DAY,DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,ORIGIN_AIRPORT,ORIGIN_AIRPORT_TZ,DESTINATION_AIRPORT,DESTINATION_AIRPORT_TZ,...,WHEELS_ON,TAXI_IN,SCHEDULED_ARRIVAL,ARRIVAL_TIME,ARRIVAL_DELAY,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY
0,2015,1,1,4,AS,98,ANC,-8.0,SEA,-7.0,...,244,4.0,270,248,-22.0,,,,,
1,2015,1,1,4,AA,2336,LAX,-7.0,PBI,-4.0,...,457,4.0,470,461,-9.0,,,,,
2,2015,1,1,4,US,840,SFO,-7.0,CLT,-4.0,...,480,11.0,486,491,5.0,,,,,
3,2015,1,1,4,AA,258,LAX,-7.0,MIA,-4.0,...,468,8.0,485,476,-9.0,,,,,
4,2015,1,1,4,AS,135,SEA,-7.0,ANC,-8.0,...,174,5.0,200,179,-21.0,,,,,
