# ML4VA

---

## Jupyter Setup

In [50]:
import tensorflow as tf
import tensorflow.keras as keras
import sklearn
import pandas as pd
import dask.dataframe as dd
import dask_ml as dml
import xgboost as xgb
import lightgbm
import numpy as np
import matplotlib as mlp
import matplotlib.pyplot as plt
import seaborn as sns

import sys, os, math

from lightgbm import DaskLGBMClassifier

%matplotlib inline

RANDOM_SEED = 42
sklearn.random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
tf.random.set_seed(RANDOM_SEED)
keras.utils.set_random_seed(RANDOM_SEED)

from dask.distributed import Client

client = Client()

Perhaps you already have a cluster running?
Hosting the HTTP server on port 58295 instead
  next(self.gen)


---

## Data Processing
### Import Flight Data
Imports and summarizes the dataset.

In [51]:
def load_flight_data():
    return dd.read_csv('Datasets/*_with_weather.csv',
                       blocksize='100MB',
                       assume_missing=True,
                       dtype={
                           # CANCELLATION CODE
                           'CANCELLATION_CODE': str,

                           # DURATION (NULL WHEN CANCELLED)
                           'DEP_DELAY': 'Int64',
                           'ARR_DELAY': 'Int64',
                           'TAXI_OUT': 'Int64',
                           'TAXI_IN': 'Int64',
                           'ACTUAL_ELAPSED_TIME': 'Int64',
                           'AIR_TIME': 'Int64',
                           'CARRIER_DELAY': 'Int64',
                           'WEATHER_DELAY': 'Int64',
                           'NAS_DELAY': 'Int64',
                           'SECURITY_DELAY': 'Int64',
                           'LATE_AIRCRAFT_DELAY': 'Int64',

                           # TIME (NULL WHEN CANCELLED)
                           'DEP_TIME': 'Int64',
                           'ARR_TIME': 'Int64',
                           'WHEELS_OFF': 'Int64',
                           'WHEELS_ON': 'Int64',

                           'DISTANCE': int,

                           'CANCELLED': bool,
                           'DIVERTED': bool,
                       }
                       )


data = load_flight_data()

# TODO: Remove sample for final analysis
data = dd.from_pandas(data.head(1000))

# BASIC STATISTICS
data.info()
data.head()

<class 'dask_expr.DataFrame'>
Columns: 100 entries, origin_id to weather_id
dtypes: Int64(15), bool(2), float64(61), int64(1), string(21)

Unnamed: 0,origin_id,origin_location_name,origin_location_region,origin_location_country,origin_location_lat,origin_location_lon,origin_location_tz_id,origin_location_localtime_epoch,origin_location_localtime,origin_forecast_date,...,CRS_ELAPSED_TIME,ACTUAL_ELAPSED_TIME,AIR_TIME,DISTANCE,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY,weather_id
0,2010-1-1_19_LGA,New York,,United States of America,40.7761,-73.872704,America/New_York,1730877000.0,2024-11-06 02:08,2010-01-01,...,110.0,106,70,431,0.0,0.0,0.0,0.0,77.0,2010-1-1_19_LGA
1,2010-1-1_9_DCA,Washington,,United States of America,38.852299,-77.037201,America/New_York,1730877000.0,2024-11-06 02:08,2010-01-01,...,70.0,60,38,213,,,,,,2010-1-1_9_DCA
2,2010-1-1_9_LGA,New York,,United States of America,40.7761,-73.872704,America/New_York,1730877000.0,2024-11-06 02:08,2010-01-01,...,120.0,91,77,431,,,,,,2010-1-1_9_LGA
3,2010-1-1_19_RDU,Durham,,United States of America,35.879501,-78.787102,America/New_York,1730877000.0,2024-11-06 02:08,2010-01-01,...,100.0,80,64,431,,,,,,2010-1-1_19_RDU
4,2010-1-1_20_JFK,New York,,United States of America,40.639801,-73.778702,America/New_York,1730877000.0,2024-11-06 02:08,2010-01-01,...,85.0,66,47,213,0.0,0.0,0.0,0.0,102.0,2010-1-1_20_JFK


### Pre-Processing
Sets up two data pre-processors: the first coverts datatypes and correct erroneous values, and the second drops duplicate and unused data columns.

In [52]:
def convert_data(dataframe):
    # CONVERT TO DAY, MONTH, YEAR
    dataframe['FLIGHT_DATE'] = dd.to_datetime(dataframe['FL_DATE'], errors='coerce')
    dataframe['DAY'] = dataframe['FLIGHT_DATE'].dt.day
    dataframe['MONTH'] = dataframe['FLIGHT_DATE'].dt.month
    dataframe['YEAR'] = dataframe['FLIGHT_DATE'].dt.year

    def convert_24H(time):
        """
        Converts the time (HHMM) to the number of minutes since midnight, which improves model consistency.
        Otherwise, 1159 (11:59AM) and 1200 (12:00PM) are treated as 41 minutes apart, which is incorrect.
        """
        return time % 100 + 60 * (time // 100) if pd.notna(time) else np.nan

    for column in [
        'CRS_DEP_TIME',
        'DEP_TIME',
        'CRS_ARR_TIME',
        'ARR_TIME',
        'WHEELS_OFF',
        'WHEELS_ON',
    ]:
        dataframe[column] = dataframe[column].map(convert_24H, meta=('x', 'f8'))

    # CONVERTED FOR CLARITY
    cancellation_codes = {
        np.nan: 'NONE',
        'nan': 'NONE',
        'A': 'AIRLINE',
        'B': 'WEATHER',
        'C': 'NAS',
        'D': 'SECURITY'
    }
    dataframe['CANCELLATION_CODE'] = dataframe['CANCELLATION_CODE'].replace(cancellation_codes)

    return dataframe

In [53]:
def drop_features(dataframe):
    dataframe = dataframe.drop(
        columns=[
            # ID COLUMNS
            'origin_id', 'weather_id', 'OP_CARRIER_FL_NUM',

            # UNRELATED DATA
            'origin_location_localtime_epoch', 'origin_location_localtime',

            # DATA MEASURED AFTER TAKEOFF
            'DEP_TIME', 'TAXI_OUT', 'TAXI_IN', 'WHEELS_OFF', 'WHEELS_ON', 'ARR_TIME', 'ACTUAL_ELAPSED_TIME', 'AIR_TIME',

            # DUPLICATE LOCATION DATA
            'origin_location_name', 'origin_location_region', 'origin_location_country',

            # DUPLICATE DATETIME DATA
            'FL_DATE', 'FLIGHT_DATE', 'origin_location_tz_id', 'origin_forecast_date',
            'origin_forecast_date_epoch', 'origin_forecast_hour_time', 'origin_forecast_hour_time_epoch',

            # DUPLICATE WEATHER DATA
            'origin_forecast_day_maxtemp_c', 'origin_forecast_day_mintemp_c', 'origin_forecast_day_avgtemp_c',
            'origin_forecast_day_maxwind_kph', 'origin_forecast_day_totalprecip_mm', 'origin_forecast_day_avgvis_km',
            'origin_forecast_hour_temp_c', 'origin_forecast_hour_wind_kph', 'origin_forecast_hour_wind_dir',
            'origin_forecast_hour_pressure_mb', 'origin_forecast_hour_precip_mm', 'origin_forecast_hour_feelslike_c',
            'origin_forecast_hour_windchill_c', 'origin_forecast_hour_heatindex_c', 'origin_forecast_hour_dewpoint_c',
            'origin_forecast_hour_vis_km', 'origin_forecast_hour_gust_kph',
            'origin_forecast_day_condition_icon', 'origin_forecast_day_condition_code',
            'origin_forecast_astro_sunrise', 'origin_forecast_astro_sunset', 'origin_forecast_astro_moonrise',
            'origin_forecast_astro_moonset',
        ])

    '''
    This data measures the cause of delay, in minutes, of the plane's arrival. While potentially useful, this data DOES NOT explain the reason for the departure delay. Unfortunately, this data is only reported if the aircraft arrives less than fifteen minutes late, significantly reducing its utility. Each of these is described in more detail below (paraphrased from the Bureau of Transportation Statistics):
    
    - Air Carrier (CARRIER_DELAY): Circumstances within the airline's control (e.g. maintenance or crew problems, aircraft cleaning, baggage loading, fueling)
    - Extreme Weather (WEATHER_DELAY): Extreme meteorological conditions that delays or prevents the operation of a flight (e.g. tornado, blizzard or hurricane).
    - National Aviation System (NAS_DELAY): Delays and cancellations attributable to the national aviation system, such as non-extreme weather conditions (like thunderstorms), airport operations, heavy traffic volume, and air traffic control.
    - Late-arriving aircraft (LATE_AIRCRAFT_DELAY): The previous flight arrived late, causing the present flight to depart late.
    - Security (SECURITY_DELAY): Delays or cancellations caused by evacuation of a terminal or concourse, re-boarding of aircraft because of security breach, inoperative screening equipment and/or long lines in excess of 29 minutes at screening areas.
    
    In my opinion, this data is not very useful because it is impractical to determine whether weather was the cause of the delay.

    Additionally, I have removed diversion data because those often occur after takeoff.
    '''
    # arrival_delay_data = dataframe[['CARRIER_DELAY', 'WEATHER_DELAY', 'NAS_DELAY', 'SECURITY_DELAY', 'LATE_AIRCRAFT_DELAY', 'ARR_DELAY']]
    dataframe = dataframe.drop(
        columns=['CARRIER_DELAY', 'WEATHER_DELAY', 'NAS_DELAY', 'SECURITY_DELAY', 'LATE_AIRCRAFT_DELAY', 'ARR_DELAY',
                 'DIVERTED'])

    # TODO: Can be removed for final version
    # DROP COLUMNS WITH IDENTICAL VALUES

    columns = (dataframe.min() != dataframe.max()).compute()
    columns['CANCELLATION_CODE'] = True
    columns['CANCELLED'] = True
    columns['DEP_DELAY'] = True

    dataframe = dataframe.loc[:, columns]

    return dataframe

### Transformer
Combines the preprocessing steps, generates labels, then transforms numeric and categorical data.

In [54]:
from dask_ml.impute import SimpleImputer
from dask_ml.preprocessing import StandardScaler, DummyEncoder, Categorizer


class DaskTransformer:
    def __init__(self):
        self.CATEGORIZER = Categorizer()
        self.CATEGORY_IMPUTER = SimpleImputer(strategy='most_frequent')
        self.CATEGORY_ENCODER = DummyEncoder()

        self.NUMERIC_IMPUTER = SimpleImputer(strategy='median')
        self.NUMERIC_ENCODER = StandardScaler()

    def preprocess(self, dataframe):
        return drop_features(convert_data(dataframe))

    def extract_labels(self, dataframe):
        return (
            dataframe['CANCELLATION_CODE'],
            dataframe['DEP_DELAY'],
            dataframe.drop(columns=['CANCELLATION_CODE', 'CANCELLED', 'DEP_DELAY'])
        )

    def fit_transform(self, dataframe):
        categorical = \
            self.CATEGORY_ENCODER.fit_transform(
                self.CATEGORY_IMPUTER.fit_transform(
                    self.CATEGORIZER.fit_transform(
                        dataframe[dataframe.select_dtypes(include=['string', 'object', 'category']).columns]
                    )))

        numeric = \
            self.NUMERIC_ENCODER.fit_transform(
                self.NUMERIC_IMPUTER.fit_transform(
                    dataframe[dataframe.select_dtypes(include=['number', 'Int64']).columns]
                )).astype(float)

        other = dataframe[dataframe.select_dtypes(exclude=['string', 'object', 'category', 'number', 'Int64']).columns]

        return dd.concat([categorical, numeric, other], axis=1)

    def transform(self, dataframe):
        categorical = \
            self.CATEGORY_ENCODER.transform(
                self.CATEGORY_IMPUTER.transform(
                    self.CATEGORIZER.transform(
                        dataframe[dataframe.select_dtypes(include=['string', 'object', 'category']).columns]
                    )))

        numeric = \
            self.NUMERIC_ENCODER.transform(
                self.NUMERIC_IMPUTER.transform(
                    dataframe[dataframe.select_dtypes(include=['number', 'Int64']).columns]
                )).astype(float)

        other = dataframe[dataframe.select_dtypes(exclude=['string', 'object', 'category', 'number', 'Int64']).columns]

        return dd.concat([categorical, numeric, other], axis=1)

In [55]:
transformer = DaskTransformer()

data = transformer.preprocess(data)
cancellation_data, delay_data, data = transformer.extract_labels(data);

Creates two types of labels: one for the amount of delay in minutes, and a boolean 'delay status.' Flights are considered delayed if they are cancelled or leave at least fifteen minutes after their scheduled departure.

In [56]:
from dask_ml.model_selection import train_test_split

'''
Treat all cancelled flights as 2-hour delays.
Flights are only considered delayed if they depart at least fifteen minutes after their intended departure.
'''
delay_labels = delay_data.where(cancellation_data.notnull(), 120).astype(int)
delay_statuses = delay_labels > 15

# SPLIT INTO TRAINING AND TESTING DATA
full_training, test, full_training_delays, test_delays, full_training_statuses, test_statuses = train_test_split(
    data, delay_labels, delay_statuses, test_size=0.2, random_state=RANDOM_SEED, shuffle=True
)
training, validation, training_delays, validation_delays, training_statuses, validation_statuses = train_test_split(
    full_training, full_training_delays, full_training_statuses, test_size=0.2, random_state=RANDOM_SEED, shuffle=True
)

training = transformer.fit_transform(training)
validation = transformer.transform(validation)
test = transformer.transform(test)

### Summarization

In [57]:
training.head(1000)

Unnamed: 0,origin_forecast_day_condition_text_Cloudy,origin_forecast_day_condition_text_Heavy rain at times,origin_forecast_day_condition_text_Light freezing rain,origin_forecast_day_condition_text_Moderate or heavy rain shower,origin_forecast_day_condition_text_Moderate or heavy snow showers,origin_forecast_day_condition_text_Moderate rain at times,origin_forecast_day_condition_text_Moderate snow,origin_forecast_day_condition_text_Overcast,origin_forecast_day_condition_text_Partly cloudy,origin_forecast_day_condition_text_Patchy moderate snow,...,origin_forecast_hour_chance_of_rain,origin_forecast_hour_will_it_snow,origin_forecast_hour_chance_of_snow,origin_forecast_hour_vis_miles,origin_forecast_hour_gust_mph,origin_forecast_hour_uv,CRS_DEP_TIME,CRS_ARR_TIME,CRS_ELAPSED_TIME,DISTANCE
265,False,False,False,False,False,False,False,True,False,False,...,-0.311693,-0.087706,-0.087706,-0.293095,0.186378,-0.987463,1.137843,1.402465,0.386002,0.490856
125,False,False,False,False,False,False,False,True,False,False,...,-0.311693,-0.087706,-0.087706,-0.293095,-0.326688,-0.987463,-1.797183,-1.679662,0.702767,0.251152
607,True,False,False,False,False,False,False,False,False,False,...,-0.311693,-0.087706,-0.087706,0.764631,0.715478,0.677372,0.539621,0.351656,1.864240,2.022098
487,False,False,False,True,False,False,False,False,False,False,...,3.208287,-0.087706,-0.087706,0.235768,2.623445,-0.987463,-1.441988,-1.550161,-0.881059,-0.782794
763,False,False,False,True,False,False,False,False,False,False,...,3.208287,-0.087706,-0.087706,0.235768,1.228545,0.677372,-0.167028,-0.273650,-0.775470,-0.929478
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
151,False,False,False,False,False,False,False,True,False,False,...,-0.311693,-0.087706,-0.087706,-0.821958,1.581278,-0.155046,0.352677,0.262855,0.966739,0.687628
950,False,False,True,False,False,False,False,False,False,False,...,-0.311693,-0.087706,-0.087706,0.764631,-0.182388,-0.987463,-1.628933,-1.442860,2.788139,2.494351
889,False,False,False,False,False,False,False,False,False,False,...,-0.311693,-0.087706,-0.087706,0.764631,-0.374788,2.342207,-0.638128,-0.643653,-0.089146,0.172444
746,False,False,False,False,False,False,False,False,False,False,...,-0.311693,-0.087706,-0.087706,0.764631,-0.919922,0.677372,-0.589522,-0.554852,0.201223,0.236842


In [58]:
validation.head(1000)

Unnamed: 0,origin_forecast_day_condition_text_Cloudy,origin_forecast_day_condition_text_Heavy rain at times,origin_forecast_day_condition_text_Light freezing rain,origin_forecast_day_condition_text_Moderate or heavy rain shower,origin_forecast_day_condition_text_Moderate or heavy snow showers,origin_forecast_day_condition_text_Moderate rain at times,origin_forecast_day_condition_text_Moderate snow,origin_forecast_day_condition_text_Overcast,origin_forecast_day_condition_text_Partly cloudy,origin_forecast_day_condition_text_Patchy moderate snow,...,origin_forecast_hour_chance_of_rain,origin_forecast_hour_will_it_snow,origin_forecast_hour_chance_of_snow,origin_forecast_hour_vis_miles,origin_forecast_hour_gust_mph,origin_forecast_hour_uv,CRS_DEP_TIME,CRS_ARR_TIME,CRS_ELAPSED_TIME,DISTANCE
480,False,False,False,False,False,False,False,False,False,False,...,-0.311693,-0.087706,-0.087706,0.764631,-0.358755,0.677372,-0.888633,-1.124657,-1.752163,-1.380265
0,True,False,False,False,False,False,False,False,False,False,...,-0.311693,-0.087706,-0.087706,-1.879685,-0.358755,-0.987463,1.287398,1.298864,0.174825,-0.099459
70,False,False,False,False,False,False,False,True,False,False,...,-0.311693,-0.087706,-0.087706,0.764631,1.517145,-0.155046,-0.039906,-0.033148,0.042840,0.068691
706,False,False,False,False,False,False,False,True,False,False,...,-0.311693,-0.087706,-0.087706,-2.408548,1.709545,-0.987463,1.623898,1.716967,-0.801867,-0.618221
908,False,False,False,False,False,False,False,True,False,False,...,-0.311693,-0.087706,-0.087706,-0.821958,0.202412,0.677372,-1.210177,-1.313359,0.755562,0.501589
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
136,False,False,False,False,False,False,False,True,False,False,...,-0.311693,-0.087706,-0.087706,-0.293095,1.613345,-0.155046,-0.189462,-0.288450,-0.722676,-0.582444
540,False,False,False,True,False,False,False,False,False,False,...,-0.311693,-0.087706,-0.087706,0.235768,0.362745,0.677372,0.277899,0.125954,-1.065839,-0.847192
220,True,False,False,False,False,False,False,False,False,False,...,-0.311693,-0.087706,-0.087706,0.764631,-0.615288,-0.987463,1.081760,1.136062,0.465194,-0.611066
278,False,False,False,False,False,False,False,True,False,False,...,-0.311693,-0.087706,-0.087706,0.764631,0.795645,0.677372,0.745260,1.084262,0.887547,1.295832


In [59]:
print("TRAINING: ", training_statuses.head(1000).value_counts())
print("\nVALIDATION: ", validation_statuses.head(1000).value_counts())
print("\nTEST: ", test_statuses.head(1000).value_counts())

TRAINING:  DEP_DELAY
False    509
True     146
Name: count, dtype: int64

VALIDATION:  DEP_DELAY
False    130
True      25
Name: count, dtype: int64

TEST:  DEP_DELAY
False    151
True      39
Name: count, dtype: int64


---

## Models
### Linear Regressor

In [60]:
from dask_ml.linear_model import LinearRegression

linear_regressor = LinearRegression()
linear_regressor.fit(training.to_dask_array(lengths=True), training_delays.to_dask_array(lengths=True))

In [61]:
from sklearn.metrics import root_mean_squared_error

linear_delay_predictions = linear_regressor.predict(validation.to_dask_array(lengths=True))
linear_rmse = root_mean_squared_error(validation_delays, linear_delay_predictions)
print(linear_rmse)

50.63918535575993


### Random Forest Regressor

In [62]:
from distributed import as_completed
from sklearn.model_selection import ParameterGrid


def train_forest(params):
    model = DaskLGBMClassifier(**params)
    model.fit(training.to_dask_array(lengths=True), training_delays.to_dask_array(lengths=True))
    score = model.score(validation.to_dask_array(lengths=True), validation_delays.to_dask_array(lengths=True))
    return params, model, score


def forest_grid_search():
    param_grid = {
        'n_estimators': [5, 10, 50, 100],
        'max_depth': [5, 10],
        'learning_rate': [0.1, 0.01],
        'subsample': [0.8, 1.0],
        'colsample_bytree': [0.8, 1.0]
    }
    param_combinations = list(ParameterGrid(param_grid))

    best_model = None
    best_score = -float('inf')

    futures = [client.submit(train_forest, params) for params in param_combinations]
    for future in as_completed(futures):
        model, score = future.result()
        print(f"Accuracy with params {model.get_params()}: {score}")

        if score > best_score:
            best_model = model
            best_score = score

    return best_model, best_score

In [None]:
forest_model, forest_score = forest_grid_search()
print(f"Best Model: {forest_model}")
print(f"Training Accuracy: {forest_score}")

forest_delay_predictions = forest_model.predict(validation.to_dask_array(lengths=True))
random_forest_rmse = root_mean_squared_error(validation_delays, forest_delay_predictions)
print(random_forest_rmse)

### KMeans-Linear Regressor

In [None]:
from sklearn.cluster import KMeans
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import root_mean_squared_error

# Train a KMeans clustering algorithm using KMeans
CLUSTER_COUNT = 5
kmeans = KMeans(CLUSTER_COUNT, random_state=RANDOM_SEED)
kmeans.fit(training)

# Train linear models based on each training cluster
kmeans_models = {
    cluster: SGDRegressor(random_state=RANDOM_SEED).fit(
        training[kmeans.labels_ == cluster],
        training_delays[kmeans.labels_ == cluster]
    )
    for cluster in range(CLUSTER_COUNT)
}



In [41]:
# Predict and analyze validation data
validation_clusters = kmeans.predict(validation)

kmeans_predictions = np.concatenate([
    kmeans_models[cluster].predict(validation[validation_clusters == cluster])
    for cluster in range(CLUSTER_COUNT)
])

kmeans_delays = np.concatenate([
    validation_delays[validation_clusters == cluster]
    for cluster in range(CLUSTER_COUNT)
])

kmeans_rmse = root_mean_squared_error(kmeans_delays, kmeans_predictions)
print(kmeans_rmse)

30.094872707799837


### SVM-Linear Regressor

In [None]:
from sklearn.svm import SVC
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import root_mean_squared_error

# Train a non-linear support vector machine
svm = SVC(random_state=RANDOM_SEED)

parameters = [
    {
        'kernel': ['rbf'],
        'C': [0.1, 1, 10, 100],
        'gamma': ['scale', 'auto', 0.01, 0.1]
    },
    {
        'kernel': ['poly'],
        'C': [0.1, 1, 10, 100],
        'gamma': ['scale', 'auto', 0.01, 0.1],
        'degree': [1, 3, 5]
    }
]
# parameters = [
#     {
#         'kernel': ['poly'],
#         'C': [10],
#         'gamma': [0.1],
#         'degree': [3]
#     }
# ]

grid_search = GridSearchCV(
    svm,
    parameters,
    scoring='accuracy',
    return_train_score=True,
    verbose=3
)

grid_search.fit(training, training_statuses)
svm = grid_search.best_estimator_

# Train a linear model based on the SVM's predicted delay statuses
svm_status_predictions = svm.predict(training)

if np.nonzero(svm_status_predictions)[0].shape[0] == 0:
    raise StopExecution(
        "Error: SVM predicted zero delayed flights in the training data. Unable to train a linear model.")

svm_linear_regressor = SGDRegressor(random_state=RANDOM_SEED).fit(
    training[svm_status_predictions == 1],
    training_delays[svm_status_predictions == 1]
)

Fitting 5 folds for each of 64 candidates, totalling 320 fits
[CV 1/5] END C=0.1, gamma=scale, kernel=rbf;, score=(train=0.912, test=0.914) total time=   0.0s
[CV 2/5] END C=0.1, gamma=scale, kernel=rbf;, score=(train=0.912, test=0.914) total time=   0.0s
[CV 3/5] END C=0.1, gamma=scale, kernel=rbf;, score=(train=0.912, test=0.914) total time=   0.0s
[CV 4/5] END C=0.1, gamma=scale, kernel=rbf;, score=(train=0.912, test=0.914) total time=   0.0s
[CV 5/5] END C=0.1, gamma=scale, kernel=rbf;, score=(train=0.914, test=0.906) total time=   0.0s
[CV 1/5] END C=0.1, gamma=auto, kernel=rbf;, score=(train=0.912, test=0.914) total time=   0.0s
[CV 2/5] END C=0.1, gamma=auto, kernel=rbf;, score=(train=0.912, test=0.914) total time=   0.0s
[CV 3/5] END C=0.1, gamma=auto, kernel=rbf;, score=(train=0.912, test=0.914) total time=   0.0s
[CV 4/5] END C=0.1, gamma=auto, kernel=rbf;, score=(train=0.912, test=0.914) total time=   0.0s
[CV 5/5] END C=0.1, gamma=auto, kernel=rbf;, score=(train=0.914, test

StopExecution: Error: SVM predicted zero delayed flights in the training data. Unable to train a linear model.

In [None]:
# Predict and analyze validation data
svm_validation_status_predictions = svm.predict(validation)

svm_predictions = np.zeros(validation.shape[0])
svm_predictions[svm_validation_status_predictions == 1] = svm_linear_regressor.predict(
    validation[svm_validation_status_predictions == 1])

svm_rmse = root_mean_squared_error(validation_delays, svm_predictions)
print(svm_rmse)