# ML4UVA

---

## Jupyter Setup

In [1]:
import tensorflow as tf
import tensorflow.keras as keras
import sklearn
import pandas as pd
import numpy as np
import matplotlib as mlp
import matplotlib.pyplot as plt
import seaborn as sns

import sys, os, math

from pandas.core.interchange import dataframe

%matplotlib inline

RANDOM_SEED = 42
sklearn.random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
tf.random.set_seed(RANDOM_SEED)
keras.utils.set_random_seed(RANDOM_SEED)

In [2]:
class HiddenPrints:
    def __enter__(self):
        self._original_stdout = sys.stdout
        sys.stdout = open(os.devnull, 'w')

    def __exit__(self, exc_type, exc_val, exc_tb):
        sys.stdout.close()
        sys.stdout = self._original_stdout

---

## Data Cleaning
### Import Flight Data
Imports and summarizes the dataset

In [3]:
def load_flight_data():
    flight_data = []
    for year in range(2009, 2010):
        flight_data.append(pd.read_csv('kaggle_flight_delay_' + str(year) + '.csv'))

    return pd.concat(flight_data)


data = load_flight_data()

# SIZE (MB)
print('The training set uses', '%.1f' % (data.memory_usage(deep=True).sum() / (1024 ** 3)), '\bGB of memory.')

# ENTRIES
print('The training set has', data.shape[0], 'entries.')

# FEATURES
print('The training set has', data.shape[1], 'features.')

# CATEGORICAL DATA
print('The training set', 'has' if not data.select_dtypes(include=['category', 'object']).empty else 'does not have',
      'categorical data.')

# MISSING DATA
print('The training set', 'has' if data.isnull().any(axis=None) else 'does not have', 'missing data.', end='\n\n')

# BASIC STATISTICS
data.info()
data.describe()

The training set uses 2.8 GB of memory.
The training set has 6429338 entries.
The training set has 28 features.
The training set has categorical data.
The training set has missing data.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6429338 entries, 0 to 6429337
Data columns (total 28 columns):
 #   Column               Dtype  
---  ------               -----  
 0   FL_DATE              object 
 1   OP_CARRIER           object 
 2   OP_CARRIER_FL_NUM    int64  
 3   ORIGIN               object 
 4   DEST                 object 
 5   CRS_DEP_TIME         int64  
 6   DEP_TIME             float64
 7   DEP_DELAY            float64
 8   TAXI_OUT             float64
 9   WHEELS_OFF           float64
 10  WHEELS_ON            float64
 11  TAXI_IN              float64
 12  CRS_ARR_TIME         int64  
 13  ARR_TIME             float64
 14  ARR_DELAY            float64
 15  CANCELLED            float64
 16  CANCELLATION_CODE    object 
 17  DIVERTED             float64
 18  CRS_ELAPSED_TI

Unnamed: 0,OP_CARRIER_FL_NUM,CRS_DEP_TIME,DEP_TIME,DEP_DELAY,TAXI_OUT,WHEELS_OFF,WHEELS_ON,TAXI_IN,CRS_ARR_TIME,ARR_TIME,...,CRS_ELAPSED_TIME,ACTUAL_ELAPSED_TIME,AIR_TIME,DISTANCE,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY,Unnamed: 27
count,6429338.0,6429338.0,6346471.0,6346471.0,6343551.0,6343551.0,6340016.0,6340017.0,6429338.0,6340016.0,...,6429338.0,6326976.0,6326977.0,6429338.0,1170501.0,1170501.0,1170501.0,1170501.0,1170501.0,0.0
mean,2293.712,1319.447,1326.026,7.598179,16.03613,1350.798,1478.506,6.894225,1496.722,1485.662,...,129.3446,126.2434,103.3257,724.97,15.46616,2.755585,16.44337,0.06408282,19.53755,
std,2046.497,457.7529,468.894,31.76494,10.56383,470.0515,487.8461,4.908826,473.3178,490.4066,...,69.66364,69.50774,67.06977,561.0229,39.985,17.90554,29.79917,1.745892,37.05886,
min,1.0,1.0,1.0,-96.0,1.0,1.0,1.0,0.0,1.0,1.0,...,1.0,14.0,0.0,11.0,0.0,0.0,0.0,0.0,0.0,
25%,618.0,926.0,928.0,-5.0,10.0,944.0,1107.0,4.0,1119.0,1112.0,...,80.0,76.0,55.0,325.0,0.0,0.0,0.0,0.0,0.0,
50%,1629.0,1315.0,1322.0,-2.0,13.0,1336.0,1510.0,6.0,1519.0,1515.0,...,111.0,109.0,85.0,581.0,0.0,0.0,5.0,0.0,0.0,
75%,3536.0,1710.0,1719.0,5.0,19.0,1734.0,1857.0,8.0,1903.0,1903.0,...,159.0,155.0,131.0,948.0,16.0,0.0,21.0,0.0,25.0,
max,7829.0,2359.0,2400.0,2445.0,458.0,2400.0,2400.0,197.0,2400.0,2400.0,...,660.0,799.0,699.0,4962.0,2439.0,1114.0,1305.0,345.0,1391.0,


### Data Pre-Processing
Sets up two data pre-processors: the first coverts datatypes and correct erroneous values, and the second drops duplicate and unused data columns.

In [4]:
from sklearn.preprocessing import FunctionTransformer

def convert_datatype(dataframe):
    dataframe = dataframe.astype({
        'CANCELLATION_CODE': str,

        # DURATION (NULLABLE)
        'DEP_DELAY': 'Int64',
        'ARR_DELAY': 'Int64',
        'TAXI_OUT': 'Int64',
        'TAXI_IN': 'Int64',
        'CRS_ELAPSED_TIME': 'Int64',
        'ACTUAL_ELAPSED_TIME': 'Int64',
        'AIR_TIME': 'Int64',
        'CARRIER_DELAY': 'Int64',
        'WEATHER_DELAY': 'Int64',
        'NAS_DELAY': 'Int64',
        'SECURITY_DELAY': 'Int64',
        'LATE_AIRCRAFT_DELAY': 'Int64',

        # TIME (NULLABLE)
        'DEP_TIME': 'Int64',
        'ARR_TIME': 'Int64',
        'WHEELS_OFF': 'Int64',
        'WHEELS_ON': 'Int64',

        # Changed it to Int64 to handle NaN and prevent errors
        'DISTANCE': 'Int64',

        'CANCELLED': bool,
        'DIVERTED': bool,
    })

    # CONVERT TO DAY, MONTH, YEAR
    # Add coerce which cverts any invalid dates formats to NAT and drop those dates.
    dataframe['FLIGHT_DATE'] = pd.to_datetime(dataframe['FL_DATE'], errors ='coerce')
    dataframe = dataframe.dropna(subset=["FLIGHT_DATE"])
    dataframe['DAY'] = dataframe['FLIGHT_DATE'].apply(lambda x: x.day)
    dataframe['MONTH'] = dataframe['FLIGHT_DATE'].apply(lambda x: x.month)
    dataframe['YEAR'] = dataframe['FLIGHT_DATE'].apply(lambda x: x.year)

    '''
    # Converts the time (HHMM) to the number of minutes since midnight, which improves model consistency.
    # Otherwise, 1159 (11:59AM) and 1200 (12:00PM) are treated as 41 minutes apart, which is incorrect.
    # '''
    for column in [
        'CRS_DEP_TIME',
        'DEP_TIME',
        'CRS_ARR_TIME',
        'ARR_TIME',
        'WHEELS_OFF',
        'WHEELS_ON',
    ]:
        dataframe[column] = dataframe[column].apply(lambda x: x % 100 + 60 * (x // 100))

    # CONVERTED FOR CLARITY
    cancellation_codes = {
        'nan': 'NONE',
        'A': 'AIRLINE',
        'B': 'WEATHER',
        'C': 'NAS',
        'D': 'SECURITY'
    }
    dataframe['CANCELLATION_CODE'] = dataframe['CANCELLATION_CODE'].apply(lambda x: cancellation_codes[x])

    airport_region_map = {
        'BOS': 'New England', 'BDL': 'New England', 'PWM': 'New England', 'PVD': 'New England',
        'ALB': 'New England', 'MHT': 'New England', 'BGR': 'New England',

        'JFK': 'Mid-Atlantic', 'LGA': 'Mid-Atlantic', 'EWR': 'Mid-Atlantic', 'PHL': 'Mid-Atlantic',
        'DCA': 'Mid-Atlantic', 'BWI': 'Mid-Atlantic', 'ACY': 'Mid-Atlantic',

        'ORD': 'Midwest', 'MDW': 'Midwest', 'DTW': 'Midwest', 'CLE': 'Midwest',
        'IND': 'Midwest', 'CMH': 'Midwest', 'MKE': 'Midwest', 'GRR': 'Midwest',

        'MSP': 'Great Plains', 'OMA': 'Great Plains', 'DSM': 'Great Plains', 'MCI': 'Great Plains',
        'STL': 'Great Plains', 'BIS': 'Great Plains', 'FSD': 'Great Plains',

        'ATL': 'Southeast', 'CLT': 'Southeast', 'MIA': 'Southeast', 'FLL': 'Southeast', 'TPA': 'Southeast',
        'BNA': 'Southeast', 'RDU': 'Southeast', 'CHS': 'Southeast', 'JAX': 'Southeast', 'MEM': 'Southeast',
        'SAV': 'Southeast', 'MSY': 'Southeast', 'BHM': 'Southeast',

        'DFW': 'Southwest', 'DAL': 'Southwest', 'HOU': 'Southwest', 'IAH': 'Southwest',
        'PHX': 'Southwest', 'AUS': 'Southwest', 'ABQ': 'Southwest', 'SAT': 'Southwest',
        'ELP': 'Southwest', 'LBB': 'Southwest', 'TUL': 'Southwest', 'OKC': 'Southwest',

        'DEN': 'Mountain', 'SLC': 'Mountain', 'COS': 'Mountain', 'BOI': 'Mountain',
        'MSO': 'Mountain', 'BZN': 'Mountain', 'GJT': 'Mountain', 'JAC': 'Mountain',

        'LAX': 'Pacific', 'SFO': 'Pacific', 'SEA': 'Pacific', 'PDX': 'Pacific',
        'SAN': 'Pacific', 'SJC': 'Pacific', 'SMF': 'Pacific', 'ONT': 'Pacific',
        'OGG': 'Pacific', 'KOA': 'Pacific', 'HNL': 'Pacific', 'ANC': 'Pacific',
        'FAI': 'Pacific', 'LIH': 'Pacific'
    }

    # Example application to the DataFrame
    dataframe['DEST'] = dataframe['DEST'].map(airport_region_map)
    dataframe['ORIGIN'] = dataframe['ORIGIN'].map(airport_region_map)


    return dataframe

datatype_converter = FunctionTransformer(convert_datatype)

In [5]:
def extract_features(dataframe):
    # DUPLICATE OR NULL DATA
    dataframe = dataframe.drop(columns=['Unnamed: 27', 'FL_DATE', 'FLIGHT_DATE'])

    # DATA MEASURED AFTER TAKEOFF
    dataframe = dataframe.drop(columns=['DEP_TIME', 'TAXI_OUT', 'TAXI_IN', 'WHEELS_OFF', 'WHEELS_ON', 'ARR_TIME', 'ACTUAL_ELAPSED_TIME', 'AIR_TIME'])

    '''
    This data measures the cause of delay, in minutes, of the plane's arrival. While potentially useful, this data DOES NOT explain the reason for the departure delay. Unfortunately, this data is only reported if the aircraft arrives less than fifteen minutes late, significantly reducing its utility. Each of these is described in more detail below (paraphrased from the Bureau of Transportation Statistics):

    - Air Carrier (CARRIER_DELAY): Circumstances within the airline's control (e.g. maintenance or crew problems, aircraft cleaning, baggage loading, fueling)
    - Extreme Weather (WEATHER_DELAY): Extreme meteorological conditions that delays or prevents the operation of a flight (e.g. tornado, blizzard or hurricane).
    - National Aviation System (NAS_DELAY): Delays and cancellations attributable to the national aviation system, such as non-extreme weather conditions (like thunderstorms), airport operations, heavy traffic volume, and air traffic control.
    - Late-arriving aircraft (LATE_AIRCRAFT_DELAY): The previous flight arrived late, causing the present flight to depart late.
    - Security (SECURITY_DELAY): Delays or cancellations caused by evacuation of a terminal or concourse, re-boarding of aircraft because of security breach, inoperative screening equipment and/or long lines in excess of 29 minutes at screening areas.

    In my opinion, this data is not very useful because it is impractical to determine whether weather was the cause of the delay.
    '''
    arrival_delay_data = dataframe[['CARRIER_DELAY', 'WEATHER_DELAY', 'NAS_DELAY', 'SECURITY_DELAY', 'LATE_AIRCRAFT_DELAY', 'ARR_DELAY']]
    dataframe = dataframe.drop(columns=['CARRIER_DELAY', 'WEATHER_DELAY', 'NAS_DELAY', 'SECURITY_DELAY', 'LATE_AIRCRAFT_DELAY', 'ARR_DELAY'])

    # DIVERSIONS (THESE USUALLY OCCUR AFTER TAKEOFF)
    dataframe = dataframe.drop(columns=['DIVERTED'])

    '''
    Since we have a lot of clean, usable data, I decided to drop any rows with missing flight numbers, origins, destinations, date information, and labels. The other columns can be imputed without losing much data integrity.
    '''
    dataframe = dataframe.dropna(subset=['OP_CARRIER_FL_NUM', 'ORIGIN', 'DEST', 'DEP_DELAY', 'CANCELLED', 'DAY', 'MONTH', 'YEAR'])

    return dataframe

feature_extractor = FunctionTransformer(extract_features)

### Data Pipeline
Uses two preprocessing steps, generates labels, then transforms numeric and categorical data.

In [6]:
from sklearn.compose import make_column_selector, ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy="most_frequent")),
    ('encoder', OneHotEncoder(sparse_output=False, handle_unknown='ignore'))
])

column_transformer = ColumnTransformer(
    transformers=[
        ('numerical', numerical_transformer, make_column_selector(dtype_include=[np.number])),
        ('categorical', categorical_transformer, make_column_selector(dtype_include=['object', 'category'])),
    ]
)

preprocessor = Pipeline(steps=[
    ('datatype_converter', datatype_converter),
    ('feature_extractor', feature_extractor),
])
transformer = Pipeline(steps=[
    ('transformer', column_transformer),
])

In [7]:
# PREPROCESS
preprocessed_data = preprocessor.fit_transform(data)

# LABELS
cancellation_data = preprocessed_data['CANCELLATION_CODE']
delay_data = preprocessed_data['DEP_DELAY']
preprocessed_data = preprocessed_data.drop(columns=['CANCELLATION_CODE', 'CANCELLED', 'DEP_DELAY'])

# TRANSFORMER
processed_data = transformer.fit_transform(preprocessed_data)

In [8]:
feature_names = column_transformer.get_feature_names_out()

processed_df = pd.DataFrame(processed_data, columns=feature_names)

print("One hot encoder new columns:")
print(processed_df.columns.tolist())

One hot encoder new columns:
['numerical__OP_CARRIER_FL_NUM', 'numerical__CRS_DEP_TIME', 'numerical__CRS_ARR_TIME', 'numerical__CRS_ELAPSED_TIME', 'numerical__DISTANCE', 'numerical__DAY', 'numerical__MONTH', 'numerical__YEAR', 'categorical__OP_CARRIER_9E', 'categorical__OP_CARRIER_AA', 'categorical__OP_CARRIER_AS', 'categorical__OP_CARRIER_B6', 'categorical__OP_CARRIER_CO', 'categorical__OP_CARRIER_DL', 'categorical__OP_CARRIER_EV', 'categorical__OP_CARRIER_F9', 'categorical__OP_CARRIER_FL', 'categorical__OP_CARRIER_HA', 'categorical__OP_CARRIER_MQ', 'categorical__OP_CARRIER_NW', 'categorical__OP_CARRIER_OH', 'categorical__OP_CARRIER_OO', 'categorical__OP_CARRIER_UA', 'categorical__OP_CARRIER_US', 'categorical__OP_CARRIER_WN', 'categorical__OP_CARRIER_XE', 'categorical__OP_CARRIER_YV', 'categorical__ORIGIN_Great Plains', 'categorical__ORIGIN_Mid-Atlantic', 'categorical__ORIGIN_Midwest', 'categorical__ORIGIN_Mountain', 'categorical__ORIGIN_New England', 'categorical__ORIGIN_Pacific', 'c

### Training, Testing, and Validation Datasets & Labels
Creates training labels for the amount of delay in minutes and a boolean delay status with a fifteen-minute threshold. The dataset is split into training, validation, and test datasets.

In [10]:
delay_labels = delay_data.combine(cancellation_data, lambda delay, code: delay if not pd.isna(code) else 120)
delay_statuses = delay_labels > 15

In [12]:
from sklearn.model_selection import train_test_split

full_training, test, full_training_delays, test_delays, full_training_statuses, test_statuses = train_test_split(
    processed_data, delay_labels, delay_statuses, test_size=0.2, random_state=RANDOM_SEED
)
training, validation, training_delays, validation_delays, training_statuses, validations_statuses = train_test_split(
    full_training, full_training_delays, full_training_statuses, test_size=0.2, random_state=RANDOM_SEED
)

---

## Models
### Linear Regression

In [13]:
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import root_mean_squared_error

linear_regressor = SGDRegressor(random_state=RANDOM_SEED)
linear_regressor.fit(training, training_delays)

linear_delay_predictions = linear_regressor.predict(validation)
linear_rmse = root_mean_squared_error(validation_delays, linear_delay_predictions)

In [14]:
print(linear_rmse)

31.333839070780027
