# ML4VA

---

## Jupyter Setup

In [120]:
import tensorflow as tf
import tensorflow.keras as keras
import sklearn
import pandas as pd
import dask.dataframe as dd
import dask_ml as dml
import numpy as np
import matplotlib as mlp
import matplotlib.pyplot as plt
import seaborn as sns

import sys, os, math

%matplotlib inline

RANDOM_SEED = 42
sklearn.random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
tf.random.set_seed(RANDOM_SEED)
keras.utils.set_random_seed(RANDOM_SEED)

In [121]:
class HiddenPrints:
    def __enter__(self):
        self._original_stdout = sys.stdout
        sys.stdout = open(os.devnull, 'w')

    def __exit__(self, exc_type, exc_val, exc_tb):
        sys.stdout.close()
        sys.stdout = self._original_stdout


class StopExecution(Exception):
    def _render_traceback_(self):
        return []

---

## Data Processing
### Import Flight Data
Imports and summarizes the dataset.

In [122]:
def load_flight_data():
    flight_data = []

    # CAN REPLACE FOR LOOP WITH WILDCARD
    for year in range(2010, 2011):
        flight_data.append(dd.read_csv('Datasets/' + str(year) + '_with_weather.csv',
                                       assume_missing=True,
                                       dtype={
                                           # CANCELLATION CODE
                                           'CANCELLATION_CODE': str,

                                           # DURATION (NULL WHEN CANCELLED)
                                           'DEP_DELAY': 'Int64',
                                           'ARR_DELAY': 'Int64',
                                           'TAXI_OUT': 'Int64',
                                           'TAXI_IN': 'Int64',
                                           'ACTUAL_ELAPSED_TIME': 'Int64',
                                           'AIR_TIME': 'Int64',
                                           'CARRIER_DELAY': 'Int64',
                                           'WEATHER_DELAY': 'Int64',
                                           'NAS_DELAY': 'Int64',
                                           'SECURITY_DELAY': 'Int64',
                                           'LATE_AIRCRAFT_DELAY': 'Int64',

                                           # TIME (NULL WHEN CANCELLED)
                                           'DEP_TIME': 'Int64',
                                           'ARR_TIME': 'Int64',
                                           'WHEELS_OFF': 'Int64',
                                           'WHEELS_ON': 'Int64',

                                           'DISTANCE': int,

                                           'CANCELLED': bool,
                                           'DIVERTED': bool,

                                           # STRINGS / OBJECTS
                                           'origin_forecast_day_condition_text': str,
                                           'origin_forecast_astro_sunrise': str,
                                           'origin_forecast_astro_sunset': str,
                                           'origin_forecast_astro_moonrise': str,
                                           'origin_forecast_astro_moonset': str,
                                           'origin_forecast_astro_moon_phase': str,
                                       }
                                       ))

    return dd.concat(flight_data)


data = load_flight_data()
data = dd.from_pandas(data.head(10000))

# BASIC STATISTICS
data.info()
data.head()

<class 'dask_expr.DataFrame'>
Columns: 100 entries, origin_id to weather_id
dtypes: Int64(15), bool(2), float64(61), int64(1), string(21)

This may cause some slowdown.
Consider loading the data with Dask directly
 or using futures or delayed objects to embed the data into the graph without repetition.
See also https://docs.dask.org/en/stable/best-practices.html#load-data-with-dask for more information.


Unnamed: 0,origin_id,origin_location_name,origin_location_region,origin_location_country,origin_location_lat,origin_location_lon,origin_location_tz_id,origin_location_localtime_epoch,origin_location_localtime,origin_forecast_date,...,CRS_ELAPSED_TIME,ACTUAL_ELAPSED_TIME,AIR_TIME,DISTANCE,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY,weather_id
0,2010-1-1_19_LGA,New York,,United States of America,40.7761,-73.872704,America/New_York,1730877000.0,2024-11-06 02:08,2010-01-01,...,110.0,106,70,431,0.0,0.0,0.0,0.0,77.0,2010-1-1_19_LGA
1,2010-1-1_9_DCA,Washington,,United States of America,38.852299,-77.037201,America/New_York,1730877000.0,2024-11-06 02:08,2010-01-01,...,70.0,60,38,213,,,,,,2010-1-1_9_DCA
2,2010-1-1_9_LGA,New York,,United States of America,40.7761,-73.872704,America/New_York,1730877000.0,2024-11-06 02:08,2010-01-01,...,120.0,91,77,431,,,,,,2010-1-1_9_LGA
3,2010-1-1_19_RDU,Durham,,United States of America,35.879501,-78.787102,America/New_York,1730877000.0,2024-11-06 02:08,2010-01-01,...,100.0,80,64,431,,,,,,2010-1-1_19_RDU
4,2010-1-1_20_JFK,New York,,United States of America,40.639801,-73.778702,America/New_York,1730877000.0,2024-11-06 02:08,2010-01-01,...,85.0,66,47,213,0.0,0.0,0.0,0.0,102.0,2010-1-1_20_JFK


### Pre-Processing
Sets up two data pre-processors: the first coverts datatypes and correct erroneous values, and the second drops duplicate and unused data columns.

In [123]:
def convert_data(dataframe):
    # CONVERT TO DAY, MONTH, YEAR
    dataframe['FLIGHT_DATE'] = dd.to_datetime(dataframe['FL_DATE'])
    dataframe['DAY'] = dataframe['FLIGHT_DATE'].dt.day
    dataframe['MONTH'] = dataframe['FLIGHT_DATE'].dt.month
    dataframe['YEAR'] = dataframe['FLIGHT_DATE'].dt.year

    '''
    Converts the time (HHMM) to the number of minutes since midnight, which improves model consistency.
    Otherwise, 1159 (11:59AM) and 1200 (12:00PM) are treated as 41 minutes apart, which is incorrect.
    '''

    def convert_24H(time):
        return time % 100 + 60 * (time // 100) if not pd.isna(time) else None

    for column in [
        'CRS_DEP_TIME',
        'DEP_TIME',
        'CRS_ARR_TIME',
        'ARR_TIME',
        'WHEELS_OFF',
        'WHEELS_ON',
    ]:
        dataframe[column] = dataframe[column].map_partitions(
            lambda df: df.apply(convert_24H)
        )

    # CONVERTED FOR CLARITY
    cancellation_codes = {
        np.nan: 'NONE',
        'nan': 'NONE',
        'A': 'AIRLINE',
        'B': 'WEATHER',
        'C': 'NAS',
        'D': 'SECURITY'
    }
    dataframe['CANCELLATION_CODE'] = dataframe.map_partitions(
        lambda df: df['CANCELLATION_CODE'].replace(cancellation_codes)
    )

    return dataframe

In [124]:
def drop_features(dataframe):
    # DROP ID COLUMNS
    dataframe = dataframe.drop(columns=['origin_id', 'weather_id', 'OP_CARRIER_FL_NUM'])

    # DROP UNRELATED DATA
    dataframe = dataframe.drop(columns=['origin_location_localtime_epoch', 'origin_location_localtime'])

    # DROP DATA MEASURED AFTER TAKEOFF
    dataframe = dataframe.drop(
        columns=['DEP_TIME', 'TAXI_OUT', 'TAXI_IN', 'WHEELS_OFF', 'WHEELS_ON', 'ARR_TIME', 'ACTUAL_ELAPSED_TIME',
                 'AIR_TIME', ])

    '''
    These columns contain duplicate weather, location, and datetime data. The following data was kept:
    
    - Location: latitude, longitude, airport code
    - Datetime: departure time, day, month, year
    - Weather: Imperial data
    '''
    # LOCATION
    dataframe = dataframe.drop(columns=['origin_location_name', 'origin_location_region', 'origin_location_country', ])

    # DATETIME
    dataframe = dataframe.drop(columns=['FL_DATE', 'FLIGHT_DATE', 'origin_location_tz_id', 'origin_forecast_date',
                                        'origin_forecast_date_epoch', 'origin_forecast_hour_time', 'origin_forecast_hour_time_epoch', ])

    # WEATHER
    dataframe = dataframe.drop(
        columns=['origin_forecast_day_maxtemp_c', 'origin_forecast_day_mintemp_c', 'origin_forecast_day_avgtemp_c',
                 'origin_forecast_day_maxwind_kph', 'origin_forecast_day_totalprecip_mm',
                 'origin_forecast_day_avgvis_km', 'origin_forecast_hour_temp_c', 'origin_forecast_hour_wind_kph',
                 'origin_forecast_hour_wind_dir', 'origin_forecast_hour_pressure_mb', 'origin_forecast_hour_precip_mm',
                 'origin_forecast_hour_feelslike_c', 'origin_forecast_hour_windchill_c',
                 'origin_forecast_hour_heatindex_c', 'origin_forecast_hour_dewpoint_c', 'origin_forecast_hour_vis_km',
                 'origin_forecast_hour_gust_kph'])
    dataframe = dataframe.drop(columns=['origin_forecast_day_condition_icon', 'origin_forecast_day_condition_code'])
    dataframe = dataframe.drop(
        columns=['origin_forecast_astro_sunrise', 'origin_forecast_astro_sunset', 'origin_forecast_astro_moonrise',
                 'origin_forecast_astro_moonset'])

    '''
    This data measures the cause of delay, in minutes, of the plane's arrival. While potentially useful, this data DOES NOT explain the reason for the departure delay. Unfortunately, this data is only reported if the aircraft arrives less than fifteen minutes late, significantly reducing its utility. Each of these is described in more detail below (paraphrased from the Bureau of Transportation Statistics):
    
    - Air Carrier (CARRIER_DELAY): Circumstances within the airline's control (e.g. maintenance or crew problems, aircraft cleaning, baggage loading, fueling)
    - Extreme Weather (WEATHER_DELAY): Extreme meteorological conditions that delays or prevents the operation of a flight (e.g. tornado, blizzard or hurricane).
    - National Aviation System (NAS_DELAY): Delays and cancellations attributable to the national aviation system, such as non-extreme weather conditions (like thunderstorms), airport operations, heavy traffic volume, and air traffic control.
    - Late-arriving aircraft (LATE_AIRCRAFT_DELAY): The previous flight arrived late, causing the present flight to depart late.
    - Security (SECURITY_DELAY): Delays or cancellations caused by evacuation of a terminal or concourse, re-boarding of aircraft because of security breach, inoperative screening equipment and/or long lines in excess of 29 minutes at screening areas.
    
    In my opinion, this data is not very useful because it is impractical to determine whether weather was the cause of the delay.
    '''
    # arrival_delay_data = dataframe[['CARRIER_DELAY', 'WEATHER_DELAY', 'NAS_DELAY', 'SECURITY_DELAY', 'LATE_AIRCRAFT_DELAY', 'ARR_DELAY']]
    dataframe = dataframe.drop(
        columns=['CARRIER_DELAY', 'WEATHER_DELAY', 'NAS_DELAY', 'SECURITY_DELAY', 'LATE_AIRCRAFT_DELAY', 'ARR_DELAY'])

    # DIVERSIONS (THESE USUALLY OCCUR AFTER TAKEOFF)
    dataframe = dataframe.drop(columns=['DIVERTED'])

    '''
    Since we have a lot of clean, usable data, I decided to drop any rows with missing flight numbers, origins, destinations, date information, and labels. 
    The other columns can be imputed without losing much data integrity.
    '''
    dataframe = dataframe.dropna(subset=['ORIGIN', 'DEST', 'DEP_DELAY', 'CANCELLED', 'DAY', 'MONTH', 'YEAR'])

    return dataframe

### Transformer
Combines the preprocessing steps, generates labels, then transforms numeric and categorical data.

In [125]:
from dask_ml.impute import SimpleImputer
from dask_ml.preprocessing import StandardScaler, DummyEncoder, Categorizer


class DaskTransformer:
    def __init__(self):
        self.CATEGORIZER = Categorizer()
        self.CATEGORY_IMPUTER = SimpleImputer(strategy='most_frequent')
        self.CATEGORY_ENCODER = DummyEncoder()

        self.NUMERIC_IMPUTER = SimpleImputer(strategy='median')
        self.NUMERIC_ENCODER = StandardScaler()

    def preprocess(self, dataframe):
        return drop_features(convert_data(dataframe))

    def extract_labels(self, dataframe):
        return (
            dataframe['CANCELLATION_CODE'],
            dataframe['DEP_DELAY'],
            dataframe.drop(columns=['CANCELLATION_CODE', 'CANCELLED', 'DEP_DELAY'])
        )

    def fit_transform(self, dataframe):
        categorical = \
            self.CATEGORY_ENCODER.fit_transform(
            self.CATEGORY_IMPUTER.fit_transform(
            self.CATEGORIZER.fit_transform(
                dataframe[dataframe.select_dtypes(include=['string', 'object', 'category']).columns]
            )))

        numeric = \
            self.NUMERIC_ENCODER.fit_transform(
            self.NUMERIC_IMPUTER.fit_transform(
                dataframe[dataframe.select_dtypes(include=['number', 'Int64']).columns]
            ))

        for col in numeric.select_dtypes(include=['Int64']).columns:
            numeric[col] = numeric[col].astype(float)

        other = dataframe[dataframe.select_dtypes(exclude=['string', 'object', 'category', 'number', 'Int64']).columns]

        return dd.concat([categorical, numeric, other], axis=1)

    def transform(self, dataframe):
        categorical = \
            self.CATEGORY_ENCODER.transform(
            self.CATEGORY_IMPUTER.transform(
            self.CATEGORIZER.transform(
                dataframe[dataframe.select_dtypes(include=['string', 'object', 'category']).columns]
            )))

        numeric = \
            self.NUMERIC_ENCODER.transform(
            self.NUMERIC_IMPUTER.transform(
                dataframe[dataframe.select_dtypes(include=['number', 'Int64']).columns]
            ))

        for col in numeric.select_dtypes(include=['Int64']).columns:
            numeric[col] = numeric[col].astype(float)

        other = dataframe[dataframe.select_dtypes(exclude=['string', 'object', 'category', 'number', 'Int64']).columns]

        return dd.concat([categorical, numeric, other], axis=1)

In [126]:
transformer = DaskTransformer()

data = transformer.preprocess(data)
cancellation_data, delay_data, data = transformer.extract_labels(data)

Creates two types of labels: one for the amount of delay in minutes, and a boolean 'delay status.' Flights are considered delayed if they are cancelled or leave at least fifteen minutes after their scheduled departure.

In [127]:
from dask_ml.model_selection import train_test_split

'''
Treat all cancelled flights as 2-hour delays.
Flights are only considered delayed if they depart at least fifteen minutes after their intended departure.
'''
delay_labels = delay_data.where(cancellation_data.notnull(), 120).astype(int)
delay_statuses = delay_labels > 15

# SPLIT INTO TRAINING AND TESTING DATA
full_training, test, full_training_delays, test_delays, full_training_statuses, test_statuses = train_test_split(
    data, delay_labels, delay_statuses, test_size=0.2, random_state=RANDOM_SEED, shuffle=True
)
training, validation, training_delays, validation_delays, training_statuses, validation_statuses = train_test_split(
    full_training, full_training_delays, full_training_statuses, test_size=0.2, random_state=RANDOM_SEED, shuffle=True
)

training = transformer.fit_transform(training)
validation = transformer.transform(validation)
test = transformer.transform(test)

This may cause some slowdown.
Consider loading the data with Dask directly
 or using futures or delayed objects to embed the data into the graph without repetition.
See also https://docs.dask.org/en/stable/best-practices.html#load-data-with-dask for more information.


### Summarization

In [128]:
training.head(1000)

Unnamed: 0,origin_forecast_day_condition_text_Cloudy,origin_forecast_day_condition_text_Heavy rain at times,origin_forecast_day_condition_text_Light freezing rain,origin_forecast_day_condition_text_Moderate or heavy rain shower,origin_forecast_day_condition_text_Moderate or heavy snow showers,origin_forecast_day_condition_text_Moderate rain at times,origin_forecast_day_condition_text_Moderate snow,origin_forecast_day_condition_text_Overcast,origin_forecast_day_condition_text_Partly cloudy,origin_forecast_day_condition_text_Patchy moderate snow,...,origin_forecast_hour_vis_miles,origin_forecast_hour_gust_mph,origin_forecast_hour_uv,CRS_DEP_TIME,CRS_ARR_TIME,CRS_ELAPSED_TIME,DISTANCE,DAY,MONTH,YEAR
6061,False,False,False,False,False,False,False,False,False,False,...,0.563237,-0.927186,-1.044297,1.799737,1.534754,-1.154305,-0.925154,0.0,0.0,0.0
7913,False,False,False,False,False,False,False,False,False,False,...,0.563237,-0.009116,0.133133,-0.848926,-0.697428,0.215662,0.011809,0.0,0.0,0.0
5307,False,False,False,False,False,False,False,False,False,False,...,0.563237,-0.097962,0.721849,-0.695381,-0.383353,-0.020031,0.256155,0.0,0.0,0.0
73,False,False,False,False,False,False,False,False,True,False,...,0.563237,0.479534,0.133133,0.456212,0.383142,-0.535610,-0.541183,0.0,0.0,0.0
6137,False,False,False,False,False,False,False,False,False,False,...,0.563237,-1.475066,0.721849,-0.023618,-0.162752,-0.844957,-0.875550,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6498,False,False,False,False,False,False,False,False,False,False,...,0.563237,-0.379306,-1.044297,1.089588,0.962687,-0.682918,-0.693669,0.0,0.0,0.0
3136,False,False,False,True,False,False,False,False,False,False,...,-0.033088,0.138960,1.310564,0.053155,0.005503,-0.476687,-0.414417,0.0,0.0,0.0
4688,False,False,False,False,False,False,False,True,False,False,...,0.563237,-0.068346,-0.455582,0.437019,0.850517,0.495548,0.695241,0.0,0.0,0.0
8575,False,False,False,False,False,False,False,False,False,False,...,0.563237,-1.075262,-1.044297,1.051202,0.850517,-0.093685,-0.118631,0.0,0.0,0.0


In [129]:
validation.head(1000)

Unnamed: 0,origin_forecast_day_condition_text_Cloudy,origin_forecast_day_condition_text_Heavy rain at times,origin_forecast_day_condition_text_Light freezing rain,origin_forecast_day_condition_text_Moderate or heavy rain shower,origin_forecast_day_condition_text_Moderate or heavy snow showers,origin_forecast_day_condition_text_Moderate rain at times,origin_forecast_day_condition_text_Moderate snow,origin_forecast_day_condition_text_Overcast,origin_forecast_day_condition_text_Partly cloudy,origin_forecast_day_condition_text_Patchy moderate snow,...,origin_forecast_hour_vis_miles,origin_forecast_hour_gust_mph,origin_forecast_hour_uv,CRS_DEP_TIME,CRS_ARR_TIME,CRS_ELAPSED_TIME,DISTANCE,DAY,MONTH,YEAR
3299,False,False,False,False,False,False,False,True,False,False,...,-2.418388,-0.823533,0.133133,-0.848926,-0.869422,0.421894,-0.028609,0.0,0.0,0.0
6144,False,False,False,False,False,False,False,False,True,False,...,0.563237,0.375881,0.133133,-0.465062,-0.372136,0.024161,-0.067190,0.0,0.0,0.0
8885,False,False,False,False,False,False,False,False,True,False,...,0.563237,0.953376,0.133133,0.521469,0.442966,-0.550341,-0.528323,0.0,0.0,0.0
9918,False,False,False,False,False,False,False,False,False,False,...,0.563237,-0.275652,-1.044297,1.319906,1.299197,-0.240993,-0.037795,0.0,0.0,0.0
8477,False,False,False,False,False,False,False,False,False,False,...,0.563237,-0.897571,0.133133,-0.695381,-0.495522,-0.461956,-0.295000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9478,False,False,False,False,False,False,False,False,True,False,...,0.563237,-0.705072,2.487994,0.575210,0.300884,-1.316344,-1.090501,0.0,0.0,0.0
5801,False,False,True,False,False,False,False,False,False,False,...,-0.629413,0.272228,0.133133,0.264280,0.140107,-0.756572,-0.645902,0.0,0.0,0.0
7897,False,False,False,False,False,False,False,False,False,False,...,0.563237,0.153767,0.721849,-0.196357,0.289667,0.716510,0.702590,0.0,0.0,0.0
7019,False,False,False,False,False,False,False,False,True,False,...,0.563237,0.479534,0.721849,0.214377,0.069066,-0.844957,-0.803900,0.0,0.0,0.0


In [130]:
print("TRAINING: ", training_statuses.head(1000).value_counts())
print("\nVALIDATION: ", validation_statuses.head(1000).value_counts())
print("\nTEST: ", test_statuses.head(1000).value_counts())

This may cause some slowdown.
Consider loading the data with Dask directly
 or using futures or delayed objects to embed the data into the graph without repetition.
See also https://docs.dask.org/en/stable/best-practices.html#load-data-with-dask for more information.


TRAINING:  DEP_DELAY
False    827
True     173
Name: count, dtype: int64


This may cause some slowdown.
Consider loading the data with Dask directly
 or using futures or delayed objects to embed the data into the graph without repetition.
See also https://docs.dask.org/en/stable/best-practices.html#load-data-with-dask for more information.



VALIDATION:  DEP_DELAY
False    824
True     176
Name: count, dtype: int64

TEST:  DEP_DELAY
False    827
True     173
Name: count, dtype: int64


This may cause some slowdown.
Consider loading the data with Dask directly
 or using futures or delayed objects to embed the data into the graph without repetition.
See also https://docs.dask.org/en/stable/best-practices.html#load-data-with-dask for more information.


---

## Models
### Linear Regressor

In [131]:
from dask_ml.linear_model import LinearRegression

linear_regressor = LinearRegression()
linear_regressor.fit(training.to_dask_array(), training_delays.to_dask_array())

ValueError: Multiple constant columns detected!

In [None]:
from sklearn.metrics import root_mean_squared_error

linear_delay_predictions = linear_regressor.predict(validation)
linear_rmse = root_mean_squared_error(validation_delays, linear_delay_predictions)
print(linear_rmse)

In [32]:
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import root_mean_squared_error

linear_regressor = SGDRegressor(random_state=RANDOM_SEED)
linear_regressor.fit(training, training_delays)

In [None]:
linear_delay_predictions = linear_regressor.predict(validation)
linear_rmse = root_mean_squared_error(validation_delays, linear_delay_predictions)
print(linear_rmse)

### Random Forest Regressor

In [34]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import root_mean_squared_error

random_forest_regressor = RandomForestRegressor(max_features=5, random_state=RANDOM_SEED)

grid_search = GridSearchCV(
    random_forest_regressor,
    [{'n_estimators': [5, 10, 50, 100, 200], 'max_features': [5, 10, 20, 50]}],
    scoring='neg_mean_squared_error',
    return_train_score=True,
    verbose=3
)

grid_search.fit(training, training_delays)
random_forest_regressor = grid_search.best_estimator_

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV 1/5] END max_features=5, n_estimators=5;, score=(train=-134.422, test=-404.150) total time=   0.0s
[CV 2/5] END max_features=5, n_estimators=5;, score=(train=-104.171, test=-376.160) total time=   0.0s
[CV 3/5] END max_features=5, n_estimators=5;, score=(train=-133.955, test=-398.767) total time=   0.0s
[CV 4/5] END max_features=5, n_estimators=5;, score=(train=-122.692, test=-690.588) total time=   0.0s
[CV 5/5] END max_features=5, n_estimators=5;, score=(train=-108.789, test=-1147.451) total time=   0.0s
[CV 1/5] END max_features=5, n_estimators=10;, score=(train=-97.144, test=-362.831) total time=   0.0s
[CV 2/5] END max_features=5, n_estimators=10;, score=(train=-99.025, test=-361.507) total time=   0.0s
[CV 3/5] END max_features=5, n_estimators=10;, score=(train=-104.047, test=-303.589) total time=   0.0s
[CV 4/5] END max_features=5, n_estimators=10;, score=(train=-92.275, test=-636.294) total time=   0.0s
[CV 5/5] 

  _data = np.array(data, dtype=dtype, copy=copy,


In [35]:
random_forest_delay_predictions = random_forest_regressor.predict(validation)
random_forest_rmse = root_mean_squared_error(validation_delays, random_forest_delay_predictions)
print(random_forest_rmse)

27.32933203880951


### KMeans-Linear Regressor

In [None]:
from sklearn.cluster import KMeans
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import root_mean_squared_error

# Train a KMeans clustering algorithm using KMeans
CLUSTER_COUNT = 5
kmeans = KMeans(CLUSTER_COUNT, random_state=RANDOM_SEED)
kmeans.fit(training)

# Train linear models based on each training cluster
kmeans_models = {
    cluster: SGDRegressor(random_state=RANDOM_SEED).fit(
        training[kmeans.labels_ == cluster],
        training_delays[kmeans.labels_ == cluster]
    )
    for cluster in range(CLUSTER_COUNT)
}



In [41]:
# Predict and analyze validation data
validation_clusters = kmeans.predict(validation)

kmeans_predictions = np.concatenate([
    kmeans_models[cluster].predict(validation[validation_clusters == cluster])
    for cluster in range(CLUSTER_COUNT)
])

kmeans_delays = np.concatenate([
    validation_delays[validation_clusters == cluster]
    for cluster in range(CLUSTER_COUNT)
])

kmeans_rmse = root_mean_squared_error(kmeans_delays, kmeans_predictions)
print(kmeans_rmse)

30.094872707799837


### SVM-Linear Regressor

In [None]:
from sklearn.svm import SVC
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import root_mean_squared_error

# Train a non-linear support vector machine
svm = SVC(random_state=RANDOM_SEED)

parameters = [
    {
        'kernel': ['rbf'],
        'C': [0.1, 1, 10, 100],
        'gamma': ['scale', 'auto', 0.01, 0.1]
    },
    {
        'kernel': ['poly'],
        'C': [0.1, 1, 10, 100],
        'gamma': ['scale', 'auto', 0.01, 0.1],
        'degree': [1, 3, 5]
    }
]
# parameters = [
#     {
#         'kernel': ['poly'],
#         'C': [10],
#         'gamma': [0.1],
#         'degree': [3]
#     }
# ]

grid_search = GridSearchCV(
    svm,
    parameters,
    scoring='accuracy',
    return_train_score=True,
    verbose=3
)

grid_search.fit(training, training_statuses)
svm = grid_search.best_estimator_

# Train a linear model based on the SVM's predicted delay statuses
svm_status_predictions = svm.predict(training)

if np.nonzero(svm_status_predictions)[0].shape[0] == 0:
    raise StopExecution(
        "Error: SVM predicted zero delayed flights in the training data. Unable to train a linear model.")

svm_linear_regressor = SGDRegressor(random_state=RANDOM_SEED).fit(
    training[svm_status_predictions == 1],
    training_delays[svm_status_predictions == 1]
)

Fitting 5 folds for each of 64 candidates, totalling 320 fits
[CV 1/5] END C=0.1, gamma=scale, kernel=rbf;, score=(train=0.912, test=0.914) total time=   0.0s
[CV 2/5] END C=0.1, gamma=scale, kernel=rbf;, score=(train=0.912, test=0.914) total time=   0.0s
[CV 3/5] END C=0.1, gamma=scale, kernel=rbf;, score=(train=0.912, test=0.914) total time=   0.0s
[CV 4/5] END C=0.1, gamma=scale, kernel=rbf;, score=(train=0.912, test=0.914) total time=   0.0s
[CV 5/5] END C=0.1, gamma=scale, kernel=rbf;, score=(train=0.914, test=0.906) total time=   0.0s
[CV 1/5] END C=0.1, gamma=auto, kernel=rbf;, score=(train=0.912, test=0.914) total time=   0.0s
[CV 2/5] END C=0.1, gamma=auto, kernel=rbf;, score=(train=0.912, test=0.914) total time=   0.0s
[CV 3/5] END C=0.1, gamma=auto, kernel=rbf;, score=(train=0.912, test=0.914) total time=   0.0s
[CV 4/5] END C=0.1, gamma=auto, kernel=rbf;, score=(train=0.912, test=0.914) total time=   0.0s
[CV 5/5] END C=0.1, gamma=auto, kernel=rbf;, score=(train=0.914, test

StopExecution: Error: SVM predicted zero delayed flights in the training data. Unable to train a linear model.

In [None]:
# Predict and analyze validation data
svm_validation_status_predictions = svm.predict(validation)

svm_predictions = np.zeros(validation.shape[0])
svm_predictions[svm_validation_status_predictions == 1] = svm_linear_regressor.predict(
    validation[svm_validation_status_predictions == 1])

svm_rmse = root_mean_squared_error(validation_delays, svm_predictions)
print(svm_rmse)