# ML4UVA

---

## Jupyter Setup

In [4]:
import tensorflow as tf
import tensorflow.keras as keras
import sklearn
import pandas as pd
import numpy as np
import matplotlib as mlp
import matplotlib.pyplot as plt
import seaborn as sns

import sys, os, math

%matplotlib inline

RANDOM_SEED = 42
sklearn.random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
tf.random.set_seed(RANDOM_SEED)
keras.utils.set_random_seed(RANDOM_SEED)

In [5]:
class HiddenPrints:
    def __enter__(self):
        self._original_stdout = sys.stdout
        sys.stdout = open(os.devnull, 'w')

    def __exit__(self, exc_type, exc_val, exc_tb):
        sys.stdout.close()
        sys.stdout = self._original_stdout
        
class StopExecution(Exception):
    def _render_traceback_(self):
        return []

---

## Data Cleaning
### Import Flight Data
Imports and summarizes the dataset

In [None]:
def load_flight_data():
    flight_data = []
    for year in range(2010, 2011):
        flight_data.append(pd.read_csv('Datasets/' + str(year) + '_with_weather.csv',
            dtype={
                # CANCELLATION CODE
                'CANCELLATION_CODE': str,
    
                # DURATION (NULL WHEN CANCELLED)
                'DEP_DELAY': 'Int64',
                'ARR_DELAY': 'Int64',
                'TAXI_OUT': 'Int64',
                'TAXI_IN': 'Int64',
                'ACTUAL_ELAPSED_TIME': 'Int64',
                'AIR_TIME': 'Int64',
                'CARRIER_DELAY': 'Int64',
                'WEATHER_DELAY': 'Int64',
                'NAS_DELAY': 'Int64',
                'SECURITY_DELAY': 'Int64',
                'LATE_AIRCRAFT_DELAY': 'Int64',
            
                # TIME (NULL WHEN CANCELLED)
                'DEP_TIME': 'Int64',
                'ARR_TIME': 'Int64',
                'WHEELS_OFF': 'Int64',
                'WHEELS_ON': 'Int64',
            
                'DISTANCE': int,
            
                'CANCELLED': bool,
                'DIVERTED': bool,
                
                # STRINGS / OBJECTS
                'origin_forecast_day_condition_text': str,
                'origin_forecast_astro_sunrise': str,
                'origin_forecast_astro_sunset': str,
                'origin_forecast_astro_moonrise': str,
                'origin_forecast_astro_moonset': str,
                'origin_forecast_astro_moon_phase': str,
            }
        ))

    return pd.concat(flight_data)

data = load_flight_data()
data = data.sample(frac=0.1)

# SIZE (MB)
print('The training set uses', '%.1f' % (data.memory_usage(deep=1).sum() / (1024 ** 3)), '\bGB of memory.')

# ENTRIES 
print('The training set has', data.shape[0], 'entries.')

# FEATURES
print('The training set has', data.shape[1], 'features.')

# CATEGORICAL DATA
print('The training set', 'has' if not data.select_dtypes(include=['category', 'object']).empty else 'does not have',
      'categorical data.')

# MISSING DATA
print('The training set', 'has' if data.isnull().any(axis=None) else 'does not have', 'missing data.', end='\n\n')

# BASIC STATISTICS
data.info()
data.describe()

### Data Pre-Processing
Sets up two data pre-processors: the first coverts datatypes and correct erroneous values, and the second drops duplicate and unused data columns.

In [None]:
from sklearn.preprocessing import FunctionTransformer

def convert_datatype(dataframe):
    # dataframe = dataframe.astype({
    #     'CANCELLATION_CODE': str,
    
    #     # DURATION (NULLABLE)
    #     'DEP_DELAY': 'Int64',
    #     'ARR_DELAY': 'Int64',
    #     'TAXI_OUT': 'Int64',
    #     'TAXI_IN': 'Int64',
    #     'ACTUAL_ELAPSED_TIME': 'Int64',
    #     'AIR_TIME': 'Int64',
    #     'CARRIER_DELAY': 'Int64',
    #     'WEATHER_DELAY': 'Int64',
    #     'NAS_DELAY': 'Int64',
    #     'SECURITY_DELAY': 'Int64',
    #     'LATE_AIRCRAFT_DELAY': 'Int64',
    
    #     # TIME (NULLABLE)
    #     'DEP_TIME': 'Int64',
    #     'ARR_TIME': 'Int64',
    #     'WHEELS_OFF': 'Int64',
    #     'WHEELS_ON': 'Int64',
    
    #     'DISTANCE': int,
    
    #     'CANCELLED': bool,
    #     'DIVERTED': bool,
        
    #     'origin_forecast_day_condition_text': str,
    #     'origin_forecast_astro_sunrise': str,
    #     'origin_forecast_astro_sunset': str,
    #     'origin_forecast_astro_moonrise': str,
    #     'origin_forecast_astro_moonset': str,
    #     'origin_forecast_astro_moon_phase': str,
    # })
    
    # CONVERT TO DAY, MONTH, YEAR
    dataframe['FLIGHT_DATE'] = pd.to_datetime(dataframe['FL_DATE'])
    dataframe['DAY'] = dataframe['FLIGHT_DATE'].apply(lambda x: x.day)
    dataframe['MONTH'] = dataframe['FLIGHT_DATE'].apply(lambda x: x.month)
    dataframe['YEAR'] = dataframe['FLIGHT_DATE'].apply(lambda x: x.year)
    
    '''
    # Converts the time (HHMM) to the number of minutes since midnight, which improves model consistency.
    # Otherwise, 1159 (11:59AM) and 1200 (12:00PM) are treated as 41 minutes apart, which is incorrect.
    # '''
    for column in [
        'CRS_DEP_TIME',
        'DEP_TIME',
        'CRS_ARR_TIME',
        'ARR_TIME',
        'WHEELS_OFF',
        'WHEELS_ON',
    ]:
        dataframe[column] = dataframe[column].apply(lambda x: x % 100 + 60 * (x // 100))
    
    # CONVERTED FOR CLARITY
    cancellation_codes = {
        np.nan: 'NONE',
        'nan': 'NONE',
        'A': 'AIRLINE',
        'B': 'WEATHER',
        'C': 'NAS',
        'D': 'SECURITY'
    }
    dataframe['CANCELLATION_CODE'] = dataframe['CANCELLATION_CODE'].apply(lambda x: cancellation_codes[x])
    
    return dataframe

datatype_converter = FunctionTransformer(convert_datatype)

In [None]:
def extract_features(dataframe):
    # DROP ID COLUMNS
    dataframe = dataframe.drop(columns=['origin_id', 'weather_id', 'OP_CARRIER_FL_NUM'])
    
    # DROP UNRELATED DATA
    dataframe = dataframe.drop(columns=['origin_location_localtime_epoch', 'origin_location_localtime'])
    
    # DROP DATA MEASURED AFTER TAKEOFF
    dataframe = dataframe.drop(columns=['DEP_TIME', 'TAXI_OUT', 'TAXI_IN', 'WHEELS_OFF', 'WHEELS_ON', 'ARR_TIME', 'ACTUAL_ELAPSED_TIME', 'AIR_TIME', ])
    
    '''
    These columns contain duplicate weather, location, and datetime data. The following data was kept:
    
    - Location: latitude, longitude, airport code
    - Datetime: departure time, day, month, year
    - Weather: Imperial data
    '''
    # LOCATION
    dataframe = dataframe.drop(columns=['origin_location_name', 'origin_location_region', 'origin_location_country', ])
    
    # DATETIME
    dataframe = dataframe.drop(columns=['FL_DATE', 'FLIGHT_DATE', 'origin_location_tz_id', 'origin_forecast_date', 'origin_forecast_date_epoch', 'origin_forecast_hour_time', ])
    
    # WEATHER
    dataframe = dataframe.drop(columns=['origin_forecast_day_maxtemp_c', 'origin_forecast_day_mintemp_c', 'origin_forecast_day_avgtemp_c', 'origin_forecast_day_maxwind_kph', 'origin_forecast_day_totalprecip_mm', 'origin_forecast_day_avgvis_km', 'origin_forecast_hour_temp_c',  'origin_forecast_hour_wind_kph', 'origin_forecast_hour_wind_dir', 'origin_forecast_hour_pressure_mb', 'origin_forecast_hour_precip_mm', 'origin_forecast_hour_feelslike_c', 'origin_forecast_hour_windchill_c', 'origin_forecast_hour_heatindex_c', 'origin_forecast_hour_dewpoint_c', 'origin_forecast_hour_vis_km', 'origin_forecast_hour_gust_kph'])
    dataframe = dataframe.drop(columns=['origin_forecast_day_condition_icon', 'origin_forecast_day_condition_code'])
    dataframe = dataframe.drop(columns=['origin_forecast_astro_sunrise', 'origin_forecast_astro_sunset', 'origin_forecast_astro_moonrise','origin_forecast_astro_moonset'])# DUPLICATE OR NULL DATA
     
    '''
    This data measures the cause of delay, in minutes, of the plane's arrival. While potentially useful, this data DOES NOT explain the reason for the departure delay. Unfortunately, this data is only reported if the aircraft arrives less than fifteen minutes late, significantly reducing its utility. Each of these is described in more detail below (paraphrased from the Bureau of Transportation Statistics):
    
    - Air Carrier (CARRIER_DELAY): Circumstances within the airline's control (e.g. maintenance or crew problems, aircraft cleaning, baggage loading, fueling)
    - Extreme Weather (WEATHER_DELAY): Extreme meteorological conditions that delays or prevents the operation of a flight (e.g. tornado, blizzard or hurricane).
    - National Aviation System (NAS_DELAY): Delays and cancellations attributable to the national aviation system, such as non-extreme weather conditions (like thunderstorms), airport operations, heavy traffic volume, and air traffic control.
    - Late-arriving aircraft (LATE_AIRCRAFT_DELAY): The previous flight arrived late, causing the present flight to depart late.
    - Security (SECURITY_DELAY): Delays or cancellations caused by evacuation of a terminal or concourse, re-boarding of aircraft because of security breach, inoperative screening equipment and/or long lines in excess of 29 minutes at screening areas.
    
    In my opinion, this data is not very useful because it is impractical to determine whether weather was the cause of the delay.
    '''
    # arrival_delay_data = dataframe[['CARRIER_DELAY', 'WEATHER_DELAY', 'NAS_DELAY', 'SECURITY_DELAY', 'LATE_AIRCRAFT_DELAY', 'ARR_DELAY']]
    dataframe = dataframe.drop(columns=['CARRIER_DELAY', 'WEATHER_DELAY', 'NAS_DELAY', 'SECURITY_DELAY', 'LATE_AIRCRAFT_DELAY', 'ARR_DELAY'])
    
    # DIVERSIONS (THESE USUALLY OCCUR AFTER TAKEOFF)
    dataframe = dataframe.drop(columns=['DIVERTED'])
    
    '''
    Since we have a lot of clean, usable data, I decided to drop any rows with missing flight numbers, origins, destinations, date information, and labels. The other columns can be imputed without losing much data integrity.
    '''
    dataframe = dataframe.dropna(subset=['ORIGIN', 'DEST', 'DEP_DELAY', 'CANCELLED', 'DAY', 'MONTH', 'YEAR'])
    
    return dataframe

feature_extractor = FunctionTransformer(extract_features)

### Data Pipeline
Uses two preprocessing steps, generates labels, then transforms numeric and categorical data.

In [None]:
from sklearn.compose import make_column_selector, ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy="most_frequent")),
    ('encoder', OneHotEncoder())
])

column_transformer = ColumnTransformer(
    transformers=[
        ('numerical', numerical_transformer, make_column_selector(dtype_include=[np.number])),
        ('categorical', categorical_transformer, make_column_selector(dtype_include=['object', 'category'])),
    ]
)

preprocessor = Pipeline(steps=[
    ('datatype_converter', datatype_converter),
    ('feature_extractor', feature_extractor),
])
transformer = Pipeline(steps=[
    ('transformer', column_transformer),
])

In [None]:
# PREPROCESS
data = preprocessor.fit_transform(data)

# LABELS
cancellation_data = data['CANCELLATION_CODE']
delay_data = data['DEP_DELAY']
data = data.drop(columns=['CANCELLATION_CODE', 'CANCELLED', 'DEP_DELAY'])

# TRANSFORMER
data = transformer.fit_transform(data)

MemoryError: Unable to allocate 48.4 MiB for an array with shape (6338168, 1) and data type float64

### Training, Testing, and Validation Datasets & Labels
Creates training labels for the amount of delay in minutes and a boolean delay status with a fifteen-minute threshold. The dataset is split into training, validation, and test datasets. 

In [None]:
'''
Treat all cancelled flights as 2-hour delays. 
Flights are only considered delayed if they depart at least fifteen minutes after their intended departure.
'''
delay_labels = delay_data.where(cancellation_data.notna(), 120)
delay_statuses = delay_labels > 15

In [None]:
from sklearn.model_selection import train_test_split

full_training, test, full_training_delays, test_delays, full_training_statuses, test_statuses = train_test_split(
    data, delay_labels, delay_statuses, test_size=0.2, random_state=RANDOM_SEED
)
training, validation, training_delays, validation_delays, training_statuses, validation_statuses = train_test_split(
    full_training, full_training_delays, full_training_statuses, test_size=0.2, random_state=RANDOM_SEED
)

In [None]:
# PRINTS THE NUMBER OF DELAYED FLIGHTS IN EACH DATASET
print("TRAINING:", np.nonzero(training_statuses.to_numpy())[0].shape[0])
print("VALIDATION:", np.nonzero(validation_statuses.to_numpy())[0].shape[0])
print("TEST:", np.nonzero(test_statuses.to_numpy())[0].shape[0])

TRAINING: 56
VALIDATION: 15
TEST: 17


---

## Models
### Linear Regressor

In [None]:
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import root_mean_squared_error

linear_regressor = SGDRegressor(random_state=RANDOM_SEED)
linear_regressor.fit(training, training_delays)

In [None]:
linear_delay_predictions = linear_regressor.predict(validation)
linear_rmse = root_mean_squared_error(validation_delays, linear_delay_predictions)
print(linear_rmse)

### Random Forest Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import root_mean_squared_error

random_forest_regressor = RandomForestRegressor(max_features=5, random_state=RANDOM_SEED)

grid_search = GridSearchCV(
    random_forest_regressor, 
    [{'n_estimators': [5, 10, 50, 100, 200], 'max_features': [5, 10, 20, 50]}], 
    scoring='neg_mean_squared_error',
    return_train_score=True,
    verbose=3
)

grid_search.fit(training, training_delays)
random_forest_regressor = grid_search.best_estimator_

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV 1/5] END max_features=5, n_estimators=5;, score=(train=-134.422, test=-404.150) total time=   0.0s
[CV 2/5] END max_features=5, n_estimators=5;, score=(train=-104.171, test=-376.160) total time=   0.0s
[CV 3/5] END max_features=5, n_estimators=5;, score=(train=-133.955, test=-398.767) total time=   0.0s
[CV 4/5] END max_features=5, n_estimators=5;, score=(train=-122.692, test=-690.588) total time=   0.0s
[CV 5/5] END max_features=5, n_estimators=5;, score=(train=-108.789, test=-1147.451) total time=   0.0s
[CV 1/5] END max_features=5, n_estimators=10;, score=(train=-97.144, test=-362.831) total time=   0.0s
[CV 2/5] END max_features=5, n_estimators=10;, score=(train=-99.025, test=-361.507) total time=   0.0s
[CV 3/5] END max_features=5, n_estimators=10;, score=(train=-104.047, test=-303.589) total time=   0.0s
[CV 4/5] END max_features=5, n_estimators=10;, score=(train=-92.275, test=-636.294) total time=   0.0s
[CV 5/5] 

  _data = np.array(data, dtype=dtype, copy=copy,


In [None]:
random_forest_delay_predictions = random_forest_regressor.predict(validation)
random_forest_rmse = root_mean_squared_error(validation_delays, random_forest_delay_predictions)
print(random_forest_rmse)

27.32933203880951


### KMeans-Linear Regressor

In [None]:
from sklearn.cluster import KMeans
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import root_mean_squared_error

# Train a KMeans clustering algorithm using KMeans
CLUSTER_COUNT = 5
kmeans = KMeans(CLUSTER_COUNT, random_state=RANDOM_SEED)
kmeans.fit(training)

# Train linear models based on each training cluster
kmeans_models = {
    cluster: SGDRegressor(random_state=RANDOM_SEED).fit(
        training[kmeans.labels_ == cluster],
        training_delays[kmeans.labels_ == cluster]
    )
    for cluster in range(CLUSTER_COUNT)
}



In [None]:
# Predict and analyze validation data
validation_clusters = kmeans.predict(validation)

kmeans_predictions = np.concatenate([
    kmeans_models[cluster].predict(validation[validation_clusters == cluster])
    for cluster in range(CLUSTER_COUNT)
])

kmeans_delays = np.concatenate([
    validation_delays[validation_clusters == cluster]
    for cluster in range(CLUSTER_COUNT)
])

kmeans_rmse = root_mean_squared_error(kmeans_delays, kmeans_predictions)
print(kmeans_rmse)

30.094872707799837


### SVM-Linear Regressor

In [None]:
from sklearn.svm import SVC
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import root_mean_squared_error

# Train a non-linear support vector machine
svm = SVC(random_state=RANDOM_SEED)

parameters = [
    {
        'kernel': ['rbf'],
        'C': [0.1, 1, 10, 100],
        'gamma': ['scale', 'auto', 0.01, 0.1]
    },
    {
        'kernel': ['poly'],
        'C': [0.1, 1, 10, 100],
        'gamma': ['scale', 'auto', 0.01, 0.1],
        'degree': [1, 3, 5]
    }
]
# parameters = [
#     {
#         'kernel': ['poly'],
#         'C': [10],
#         'gamma': [0.1],
#         'degree': [3]
#     }
# ]

grid_search = GridSearchCV(
    svm, 
    parameters, 
    scoring='accuracy',
    return_train_score=True,
    verbose=3
)

grid_search.fit(training, training_statuses)
svm = grid_search.best_estimator_

# Train a linear model based on the SVM's predicted delay statuses
svm_status_predictions = svm.predict(training)

if np.nonzero(svm_status_predictions)[0].shape[0] == 0:
    raise StopExecution("Error: SVM predicted zero delayed flights in the training data. Unable to train a linear model.")

svm_linear_regressor = SGDRegressor(random_state=RANDOM_SEED).fit(
    training[svm_status_predictions == 1],
    training_delays[svm_status_predictions == 1]
)

Fitting 5 folds for each of 64 candidates, totalling 320 fits
[CV 1/5] END C=0.1, gamma=scale, kernel=rbf;, score=(train=0.912, test=0.914) total time=   0.0s
[CV 2/5] END C=0.1, gamma=scale, kernel=rbf;, score=(train=0.912, test=0.914) total time=   0.0s
[CV 3/5] END C=0.1, gamma=scale, kernel=rbf;, score=(train=0.912, test=0.914) total time=   0.0s
[CV 4/5] END C=0.1, gamma=scale, kernel=rbf;, score=(train=0.912, test=0.914) total time=   0.0s
[CV 5/5] END C=0.1, gamma=scale, kernel=rbf;, score=(train=0.914, test=0.906) total time=   0.0s
[CV 1/5] END C=0.1, gamma=auto, kernel=rbf;, score=(train=0.912, test=0.914) total time=   0.0s
[CV 2/5] END C=0.1, gamma=auto, kernel=rbf;, score=(train=0.912, test=0.914) total time=   0.0s
[CV 3/5] END C=0.1, gamma=auto, kernel=rbf;, score=(train=0.912, test=0.914) total time=   0.0s
[CV 4/5] END C=0.1, gamma=auto, kernel=rbf;, score=(train=0.912, test=0.914) total time=   0.0s
[CV 5/5] END C=0.1, gamma=auto, kernel=rbf;, score=(train=0.914, test

StopExecution: Error: SVM predicted zero delayed flights in the training data. Unable to train a linear model.

In [None]:
# Predict and analyze validation data
svm_validation_status_predictions = svm.predict(validation)

svm_predictions = np.zeros(validation.shape[0])
svm_predictions[svm_validation_status_predictions == 1] = svm_linear_regressor.predict(validation[svm_validation_status_predictions == 1])

svm_rmse = root_mean_squared_error(validation_delays, svm_predictions)
print(svm_rmse)