# ML4UVA

---

## Setup

In [26]:
import math
import os
import random
import sys

import tensorflow as tf
import tensorflow.keras as keras
import sklearn
import pandas as pd
import numpy as np
import matplotlib as mlp
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# Seed every random number generator the notebook relies on so that a
# Restart Kernel -> Run All produces the same results each time.
RANDOM_SEED = 42
# `sklearn.random` is not a scikit-learn API: it was just the standard-library
# `random` module visible through the package namespace. Seed that module
# directly. scikit-learn estimators fall back to NumPy's global RNG when
# `random_state` is left unset (per its docs), which the next line covers.
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
tf.random.set_seed(RANDOM_SEED)
# Kept last, as in the original cell; per the Keras docs this call seeds
# Python, NumPy and TensorFlow together.
keras.utils.set_random_seed(RANDOM_SEED)

In [27]:
class HiddenPrints:
    """Context manager that silences ``print`` output inside a ``with`` block.

    On entry ``sys.stdout`` is pointed at the null device (``os.devnull``); on
    exit the stream that was active at entry is put back.
    """

    def __enter__(self):
        """Redirect ``sys.stdout`` to the null device and return this manager."""
        self._original_stdout = sys.stdout
        # Keep our own reference to the handle so __exit__ closes exactly the
        # file opened here, even if the body reassigns sys.stdout.
        self._devnull = open(os.devnull, 'w')
        sys.stdout = self._devnull
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Restore the saved stream and close the null-device handle.

        Returns None, so an exception raised inside the block still propagates.
        """
        # Restore first so stdout is usable again even if close() were to fail.
        sys.stdout = self._original_stdout
        self._devnull.close()

In [28]:
def load_flight_data(first_year=2009, last_year=2009, data_dir='Datasets'):
    """Load the yearly Kaggle flight-delay CSV files into one DataFrame.

    Called without arguments it reads only the 2009 file from ``Datasets``.

    Args:
        first_year: First year to load (inclusive).
        last_year: Last year to load (inclusive).
        data_dir: Directory holding the ``kaggle_flight_delay_<year>.csv`` files.

    Returns:
        A DataFrame with the rows of every requested year, in year order, under
        a fresh 0..n-1 index.

    Raises:
        ValueError: If ``last_year`` is earlier than ``first_year``.
    """
    if last_year < first_year:
        raise ValueError(
            'last_year (%s) must not be earlier than first_year (%s)' % (last_year, first_year)
        )

    yearly_frames = [
        pd.read_csv(os.path.join(data_dir, 'kaggle_flight_delay_' + str(year) + '.csv'))
        for year in range(first_year, last_year + 1)
    ]

    # Each CSV is read with its own 0-based index; renumber so that loading
    # several years does not produce duplicate row labels.
    return pd.concat(yearly_frames, ignore_index=True)


# Load the raw flight table once; the rest of this cell only reports on it.
data = load_flight_data()

# SIZE (GB)
# deep=True also counts the Python strings held by object columns.
memory_gb = data.memory_usage(deep=True).sum() / (1024 ** 3)
# The unit is formatted into the string directly instead of printing a '\b'
# backspace to swallow print()'s separator, which shows up as a control
# character wherever backspaces are not interpreted.
print('The training set uses %.1fGB of memory.' % memory_gb)

# ENTRIES
print('The training set has', data.shape[0], 'entries.')

# FEATURES
print('The training set has', data.shape[1], 'features.')

# CATEGORICAL DATA
has_categorical = not data.select_dtypes(include=['category', 'object']).empty
print('The training set', 'has' if has_categorical else 'does not have', 'categorical data.')

# MISSING DATA
has_missing = data.isnull().any(axis=None)
print('The training set', 'has' if has_missing else 'does not have', 'missing data.', end='\n\n')

# BASIC STATISTICS
data.info()
# Last expression of the cell, so the summary table is rendered as its output.
data.describe()

The training set uses 2.6GB of memory.
The training set has 6429338 entries.
The training set has 28 features.
The training set has categorical data.
The training set has missing data.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6429338 entries, 0 to 6429337
Data columns (total 28 columns):
 #   Column               Dtype  
---  ------               -----  
 0   FL_DATE              object 
 1   OP_CARRIER           object 
 2   OP_CARRIER_FL_NUM    int64  
 3   ORIGIN               object 
 4   DEST                 object 
 5   CRS_DEP_TIME         int64  
 6   DEP_TIME             float64
 7   DEP_DELAY            float64
 8   TAXI_OUT             float64
 9   WHEELS_OFF           float64
 10  WHEELS_ON            float64
 11  TAXI_IN              float64
 12  CRS_ARR_TIME         int64  
 13  ARR_TIME             float64
 14  ARR_DELAY            float64
 15  CANCELLED            float64
 16  CANCELLATION_CODE    object 
 17  DIVERTED             float64
 18  CRS_ELAPSED_TIME

Unnamed: 0,OP_CARRIER_FL_NUM,CRS_DEP_TIME,DEP_TIME,DEP_DELAY,TAXI_OUT,WHEELS_OFF,WHEELS_ON,TAXI_IN,CRS_ARR_TIME,ARR_TIME,...,CRS_ELAPSED_TIME,ACTUAL_ELAPSED_TIME,AIR_TIME,DISTANCE,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY,Unnamed: 27
count,6429338.0,6429338.0,6346471.0,6346471.0,6343551.0,6343551.0,6340016.0,6340017.0,6429338.0,6340016.0,...,6429338.0,6326976.0,6326977.0,6429338.0,1170501.0,1170501.0,1170501.0,1170501.0,1170501.0,0.0
mean,2293.712,1319.447,1326.026,7.598179,16.03613,1350.798,1478.506,6.894225,1496.722,1485.662,...,129.3446,126.2434,103.3257,724.97,15.46616,2.755585,16.44337,0.06408282,19.53755,
std,2046.497,457.7529,468.894,31.76494,10.56383,470.0515,487.8461,4.908826,473.3178,490.4066,...,69.66364,69.50774,67.06977,561.0229,39.985,17.90554,29.79917,1.745892,37.05886,
min,1.0,1.0,1.0,-96.0,1.0,1.0,1.0,0.0,1.0,1.0,...,1.0,14.0,0.0,11.0,0.0,0.0,0.0,0.0,0.0,
25%,618.0,926.0,928.0,-5.0,10.0,944.0,1107.0,4.0,1119.0,1112.0,...,80.0,76.0,55.0,325.0,0.0,0.0,0.0,0.0,0.0,
50%,1629.0,1315.0,1322.0,-2.0,13.0,1336.0,1510.0,6.0,1519.0,1515.0,...,111.0,109.0,85.0,581.0,0.0,0.0,5.0,0.0,0.0,
75%,3536.0,1710.0,1719.0,5.0,19.0,1734.0,1857.0,8.0,1903.0,1903.0,...,159.0,155.0,131.0,948.0,16.0,0.0,21.0,0.0,25.0,
max,7829.0,2359.0,2400.0,2445.0,458.0,2400.0,2400.0,197.0,2400.0,2400.0,...,660.0,799.0,699.0,4962.0,2439.0,1114.0,1305.0,345.0,1391.0,


## Data Cleaning
This is currently here for testing. Once we are happy with the data cleaning, we can convert these steps into a pipeline.

In [39]:
# Build the working frame with explicit dtypes. astype() returns a new frame, so
# `data` is left untouched without first making a separate full copy of it.
test_data = data.astype({
    # Cast to str so the lookup table below can be keyed by plain strings;
    # that table expects flights without a code to show up as 'nan'.
    'CANCELLATION_CODE': str,

    # DURATION (NULLABLE)
    'DEP_DELAY': 'Int64',
    'ARR_DELAY': 'Int64',
    'TAXI_OUT': 'Int64',
    'TAXI_IN': 'Int64',
    'CRS_ELAPSED_TIME': 'Int64',
    'ACTUAL_ELAPSED_TIME': 'Int64',
    'AIR_TIME': 'Int64',
    'CARRIER_DELAY': 'Int64',
    'WEATHER_DELAY': 'Int64',
    'NAS_DELAY': 'Int64',
    'SECURITY_DELAY': 'Int64',
    'LATE_AIRCRAFT_DELAY': 'Int64',

    # TIME (NULLABLE)
    'DEP_TIME': 'Int64',
    'ARR_TIME': 'Int64',
    'WHEELS_OFF': 'Int64',
    'WHEELS_ON': 'Int64',

    'DISTANCE': int,

    'CANCELLED': bool,
    'DIVERTED': bool,
})

# CONVERT TO DATETIME OBJECT
test_data['FLIGHT_DATE'] = pd.to_datetime(test_data['FL_DATE'])

# Converts the time (HHMM) to the number of minutes since midnight, which
# improves model consistency. Otherwise, 1159 (11:59AM) and 1200 (12:00PM) are
# treated as 41 minutes apart, which is incorrect.
#
# The arithmetic is applied to whole columns at once: it avoids one Python call
# per row and missing values in the nullable Int64 columns simply stay missing.
for column in [
    'CRS_DEP_TIME',
    'DEP_TIME',
    'CRS_ARR_TIME',
    'ARR_TIME',
    'WHEELS_OFF',
    'WHEELS_ON',
]:
    hhmm = test_data[column]
    test_data[column] = hhmm % 100 + 60 * (hhmm // 100)

# CONVERTED FOR CLARITY
cancellation_codes = {
    'nan': None,
    'A': 'AIRLINE',
    'B': 'WEATHER',
    'C': 'NAS',
    'D': 'SECURITY'
}
# Direct indexing (rather than .get) is deliberate: a code missing from the
# table raises KeyError instead of being silently turned into a missing value.
test_data['CANCELLATION_CODE'] = test_data['CANCELLATION_CODE'].apply(lambda x: cancellation_codes[x])

### Feature Extraction

In [40]:
# Columns removed from the feature matrix, grouped by the reason for removal.

# DUPLICATE OR NULL DATA
# ('Unnamed: 27' holds no values; 'FL_DATE' was parsed into 'FLIGHT_DATE'.)
duplicate_or_null_columns = ['Unnamed: 27', 'FL_DATE']

# DATA MEASURED AFTER TAKEOFF
post_takeoff_columns = ['DEP_TIME', 'TAXI_OUT', 'TAXI_IN', 'WHEELS_OFF', 'WHEELS_ON', 'ARR_TIME', 'ACTUAL_ELAPSED_TIME', 'AIR_TIME']

# This data measures the cause of delay, in minutes, of the plane's arrival.
# While potentially useful, this data DOES NOT explain the reason for the
# departure delay. Unfortunately, this data is only reported if the aircraft
# arrives at least fifteen minutes late, significantly reducing its utility.
# Each of these is described in more detail below (paraphrased from the Bureau
# of Transportation Statistics):
#
# - Air Carrier (CARRIER_DELAY): Circumstances within the airline's control
#   (e.g. maintenance or crew problems, aircraft cleaning, baggage loading,
#   fueling)
# - Extreme Weather (WEATHER_DELAY): Extreme meteorological conditions that
#   delay or prevent the operation of a flight (e.g. tornado, blizzard or
#   hurricane).
# - National Aviation System (NAS_DELAY): Delays and cancellations attributable
#   to the national aviation system, such as non-extreme weather conditions
#   (like thunderstorms), airport operations, heavy traffic volume, and air
#   traffic control.
# - Late-arriving aircraft (LATE_AIRCRAFT_DELAY): The previous flight arrived
#   late, causing the present flight to depart late.
# - Security (SECURITY_DELAY): Delays or cancellations caused by evacuation of a
#   terminal or concourse, re-boarding of aircraft because of security breach,
#   inoperative screening equipment and/or long lines in excess of 29 minutes
#   at screening areas.
#
# In my opinion, this data is not very useful because it is impractical to
# determine whether weather was the cause of the delay.
arrival_delay_columns = ['CARRIER_DELAY', 'WEATHER_DELAY', 'NAS_DELAY', 'SECURITY_DELAY', 'LATE_AIRCRAFT_DELAY', 'ARR_DELAY']

# DIVERSIONS (THESE USUALLY OCCUR AFTER TAKEOFF)
diversion_columns = ['DIVERTED']

# LABELS
label_columns = ['CANCELLATION_CODE', 'CANCELLED', 'DEP_DELAY']

# Pull out the side tables and labels before their columns are dropped.
arrival_delay_data = test_data[arrival_delay_columns]
cancellation_labels = test_data['CANCELLATION_CODE']
delay_labels = test_data['DEP_DELAY']

# A single drop builds the reduced frame once instead of once per group.
test_data = test_data.drop(
    columns=duplicate_or_null_columns
    + post_takeoff_columns
    + arrival_delay_columns
    + diversion_columns
    + label_columns
)

test_data.head(100)

Unnamed: 0,OP_CARRIER,OP_CARRIER_FL_NUM,ORIGIN,DEST,CRS_DEP_TIME,CRS_ARR_TIME,CRS_ELAPSED_TIME,DISTANCE,FLIGHT_DATE
0,XE,1204,DCA,EWR,660,722,62,199,2009-01-01
1,XE,1206,EWR,IAD,910,992,82,213,2009-01-01
2,XE,1207,EWR,DCA,660,730,70,199,2009-01-01
3,XE,1208,DCA,EWR,760,837,77,199,2009-01-01
4,XE,1209,IAD,EWR,1035,1140,105,213,2009-01-01
...,...,...,...,...,...,...,...,...,...
995,YV,1011,HNL,OGG,575,610,35,100,2009-01-01
996,YV,1012,HNL,OGG,695,730,35,100,2009-01-01
997,YV,1017,HNL,OGG,1055,1090,35,100,2009-01-01
998,YV,1018,HNL,OGG,1180,1215,35,100,2009-01-01
