# ML4UVA

---

## Jupyter Setup

In [1]:
import tensorflow as tf
import tensorflow.keras as keras
import sklearn
import pandas as pd
import numpy as np

import sys, os

%matplotlib inline

RANDOM_SEED = 42
sklearn.random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
tf.random.set_seed(RANDOM_SEED)
keras.utils.set_random_seed(RANDOM_SEED)

In [2]:
class HiddenPrints:
    def __enter__(self):
        self._original_stdout = sys.stdout
        sys.stdout = open(os.devnull, 'w')

    def __exit__(self, exc_type, exc_val, exc_tb):
        sys.stdout.close()
        sys.stdout = self._original_stdout

class StopExecution(Exception):
    def _render_traceback_(self):
        return []

In [3]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


---

## Data Cleaning
### Import Flight Data
Imports and summarizes the dataset

In [4]:
def load_flight_data():
    flight_data = []
    for year in range(2010, 2011):
        flight_data.append(pd.read_csv('/content/drive/MyDrive/Datasets/' + str(year) + '_with_weather.csv',
            dtype={
                # CANCELLATION CODE
                'CANCELLATION_CODE': str,

                # DURATION (NULL WHEN CANCELLED)
                'DEP_DELAY': 'Int64',
                'ARR_DELAY': 'Int64',
                'TAXI_OUT': 'Int64',
                'TAXI_IN': 'Int64',
                'ACTUAL_ELAPSED_TIME': 'Int64',
                'AIR_TIME': 'Int64',
                'CARRIER_DELAY': 'Int64',
                'WEATHER_DELAY': 'Int64',
                'NAS_DELAY': 'Int64',
                'SECURITY_DELAY': 'Int64',
                'LATE_AIRCRAFT_DELAY': 'Int64',

                # TIME (NULL WHEN CANCELLED)
                'DEP_TIME': 'Int64',
                'ARR_TIME': 'Int64',
                'WHEELS_OFF': 'Int64',
                'WHEELS_ON': 'Int64',

                'DISTANCE': int,

                'CANCELLED': bool,
                'DIVERTED': bool,

                # STRINGS / OBJECTS
                'origin_forecast_day_condition_text': str,
                'origin_forecast_astro_sunrise': str,
                'origin_forecast_astro_sunset': str,
                'origin_forecast_astro_moonrise': str,
                'origin_forecast_astro_moonset': str,
                'origin_forecast_astro_moon_phase': str,
            }
        ))

    return pd.concat(flight_data)

data = load_flight_data()
data = data.sample(n=30000, random_state=RANDOM_SEED)

# SIZE (MB)
print('The training set uses', '%.1f' % (data.memory_usage(deep=1).sum() / (1024 ** 3)), '\bGB of memory.')

# ENTRIES
print('The training set has', data.shape[0], 'entries.')

# FEATURES
print('The training set has', data.shape[1], 'features.')

# CATEGORICAL DATA
print('The training set', 'has' if not data.select_dtypes(include=['category', 'object']).empty else 'does not have',
      'categorical data.')

# MISSING DATA
print('The training set', 'has' if data.isnull().any(axis=None) else 'does not have', 'missing data.', end='\n\n')

# BASIC STATISTICS
data.info()
data.describe()

The training set uses 0.1 GB of memory.
The training set has 30000 entries.
The training set has 100 features.
The training set has categorical data.
The training set has missing data.

<class 'pandas.core.frame.DataFrame'>
Index: 30000 entries, 1094344 to 2751035
Data columns (total 100 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   origin_id                                30000 non-null  object 
 1   origin_location_name                     29988 non-null  object 
 2   origin_location_region                   0 non-null      float64
 3   origin_location_country                  30000 non-null  object 
 4   origin_location_lat                      30000 non-null  float64
 5   origin_location_lon                      30000 non-null  float64
 6   origin_location_tz_id                    30000 non-null  object 
 7   origin_location_localtime_epoch          30000 non-null  int64  


Unnamed: 0,origin_location_region,origin_location_lat,origin_location_lon,origin_location_localtime_epoch,origin_forecast_date_epoch,origin_forecast_day_maxtemp_c,origin_forecast_day_maxtemp_f,origin_forecast_day_mintemp_c,origin_forecast_day_mintemp_f,origin_forecast_day_avgtemp_c,...,ARR_DELAY,CRS_ELAPSED_TIME,ACTUAL_ELAPSED_TIME,AIR_TIME,DISTANCE,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY
count,0.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,...,29930.0,30000.0,29930.0,29930.0,30000.0,5626.0,5626.0,5626.0,5626.0,5626.0
mean,,36.754538,-95.169062,1730877000.0,1278157000.0,19.87497,67.767613,10.550753,50.987267,14.939423,...,4.443602,130.747333,126.703375,104.178717,738.4821,16.471738,2.40064,13.90224,0.134021,21.677391
std,,5.846987,18.083233,210.7694,8974670.0,10.812993,19.462738,9.780819,17.600498,10.113888,...,35.31048,70.406052,69.369163,67.225619,566.620902,41.325513,17.893457,26.176048,2.641311,38.814777
min,,17.6994,-176.645996,1730877000.0,1262304000.0,-25.2,-13.4,-30.9,-23.5,-27.7,...,-87.0,25.0,15.0,8.0,31.0,0.0,0.0,0.0,0.0,0.0
25%,,33.435501,-110.942001,1730877000.0,1270512000.0,12.7,54.9,3.9,39.1,8.4,...,-12.0,80.0,77.0,56.0,334.0,0.0,0.0,0.0,0.0,0.0
50%,,37.3619,-89.981598,1730877000.0,1278288000.0,21.3,70.3,11.1,52.0,16.0,...,-4.0,113.0,109.0,86.0,589.0,0.0,0.0,3.0,0.0,1.0
75%,,40.7869,-81.685699,1730877000.0,1285805000.0,28.4,83.0,18.5,65.3,23.3,...,8.0,160.0,156.0,132.0,965.0,17.0,0.0,18.0,0.0,28.0
max,,71.285301,-64.795898,1730878000.0,1293754000.0,45.6,114.0,31.8,89.2,37.6,...,1047.0,650.0,665.0,631.0,4962.0,1047.0,768.0,279.0,114.0,381.0


### Data Pre-Processing
Sets up two data pre-processors: the first coverts datatypes and correct erroneous values, and the second drops duplicate and unused data columns.

In [5]:
from sklearn.preprocessing import FunctionTransformer

def convert_datatype(dataframe):
    # CONVERT TO DAY, MONTH, YEAR
    dataframe['FLIGHT_DATE'] = pd.to_datetime(dataframe['FL_DATE'])
    dataframe['DAY'] = dataframe['FLIGHT_DATE'].apply(lambda x: x.day)
    dataframe['MONTH'] = dataframe['FLIGHT_DATE'].apply(lambda x: x.month)
    dataframe['YEAR'] = dataframe['FLIGHT_DATE'].apply(lambda x: x.year)

    '''
    # Converts the time (HHMM) to the number of minutes since midnight, which improves model consistency.
    # Otherwise, 1159 (11:59AM) and 1200 (12:00PM) are treated as 41 minutes apart, which is incorrect.
    # '''
    for column in [
        'CRS_DEP_TIME',
        'DEP_TIME',
        'CRS_ARR_TIME',
        'ARR_TIME',
        'WHEELS_OFF',
        'WHEELS_ON',
    ]:
        dataframe[column] = dataframe[column].apply(lambda x: x % 100 + 60 * (x // 100))

    # CONVERTED FOR CLARITY
    cancellation_codes = {
        np.nan: 'NONE',
        'nan': 'NONE',
        'A': 'AIRLINE',
        'B': 'WEATHER',
        'C': 'NAS',
        'D': 'SECURITY'
    }
    dataframe['CANCELLATION_CODE'] = dataframe['CANCELLATION_CODE'].apply(lambda x: cancellation_codes[x])

    return dataframe

datatype_converter = FunctionTransformer(convert_datatype)

In [6]:
def extract_features(dataframe):
    # DROP ID COLUMNS
    dataframe = dataframe.drop(columns=['origin_id', 'weather_id', 'OP_CARRIER_FL_NUM'])

    # DROP UNRELATED DATA
    dataframe = dataframe.drop(columns=['origin_location_localtime_epoch', 'origin_location_localtime'])

    # DROP DATA MEASURED AFTER TAKEOFF
    dataframe = dataframe.drop(columns=['DEP_TIME', 'TAXI_OUT', 'TAXI_IN', 'WHEELS_OFF', 'WHEELS_ON', 'ARR_TIME', 'ACTUAL_ELAPSED_TIME', 'AIR_TIME', ])

    '''
    These columns contain duplicate weather, location, and datetime data. The following data was kept:

    - Location: latitude, longitude, airport code
    - Datetime: departure time, day, month, year
    - Weather: Imperial data
    '''
    # LOCATION
    dataframe = dataframe.drop(columns=['origin_location_name', 'origin_location_region', 'origin_location_country', ])

    # DATETIME
    dataframe = dataframe.drop(columns=['FL_DATE', 'FLIGHT_DATE', 'origin_location_tz_id', 'origin_forecast_date', 'origin_forecast_date_epoch', 'origin_forecast_hour_time', ])

    # WEATHER
    dataframe = dataframe.drop(columns=['origin_forecast_day_maxtemp_c', 'origin_forecast_day_mintemp_c', 'origin_forecast_day_avgtemp_c', 'origin_forecast_day_maxwind_kph', 'origin_forecast_day_totalprecip_mm', 'origin_forecast_day_avgvis_km', 'origin_forecast_hour_temp_c',  'origin_forecast_hour_wind_kph', 'origin_forecast_hour_wind_dir', 'origin_forecast_hour_pressure_mb', 'origin_forecast_hour_precip_mm', 'origin_forecast_hour_feelslike_c', 'origin_forecast_hour_windchill_c', 'origin_forecast_hour_heatindex_c', 'origin_forecast_hour_dewpoint_c', 'origin_forecast_hour_vis_km', 'origin_forecast_hour_gust_kph'])
    dataframe = dataframe.drop(columns=['origin_forecast_day_condition_icon', 'origin_forecast_day_condition_code'])
    dataframe = dataframe.drop(columns=['origin_forecast_astro_sunrise', 'origin_forecast_astro_sunset', 'origin_forecast_astro_moonrise','origin_forecast_astro_moonset'])# DUPLICATE OR NULL DATA

    '''
    This data measures the cause of delay, in minutes, of the plane's arrival. While potentially useful, this data DOES NOT explain the reason for the departure delay. Unfortunately, this data is only reported if the aircraft arrives less than fifteen minutes late, significantly reducing its utility. Each of these is described in more detail below (paraphrased from the Bureau of Transportation Statistics):

    - Air Carrier (CARRIER_DELAY): Circumstances within the airline's control (e.g. maintenance or crew problems, aircraft cleaning, baggage loading, fueling)
    - Extreme Weather (WEATHER_DELAY): Extreme meteorological conditions that delays or prevents the operation of a flight (e.g. tornado, blizzard or hurricane).
    - National Aviation System (NAS_DELAY): Delays and cancellations attributable to the national aviation system, such as non-extreme weather conditions (like thunderstorms), airport operations, heavy traffic volume, and air traffic control.
    - Late-arriving aircraft (LATE_AIRCRAFT_DELAY): The previous flight arrived late, causing the present flight to depart late.
    - Security (SECURITY_DELAY): Delays or cancellations caused by evacuation of a terminal or concourse, re-boarding of aircraft because of security breach, inoperative screening equipment and/or long lines in excess of 29 minutes at screening areas.

    In my opinion, this data is not very useful because it is impractical to determine whether weather was the cause of the delay.
    '''
    # arrival_delay_data = dataframe[['CARRIER_DELAY', 'WEATHER_DELAY', 'NAS_DELAY', 'SECURITY_DELAY', 'LATE_AIRCRAFT_DELAY', 'ARR_DELAY']]
    dataframe = dataframe.drop(columns=['CARRIER_DELAY', 'WEATHER_DELAY', 'NAS_DELAY', 'SECURITY_DELAY', 'LATE_AIRCRAFT_DELAY', 'ARR_DELAY'])

    # DIVERSIONS (THESE USUALLY OCCUR AFTER TAKEOFF)
    dataframe = dataframe.drop(columns=['DIVERTED'])

    '''
    Since we have a lot of clean, usable data, I decided to drop any rows with missing flight numbers, origins, destinations, date information, and labels. The other columns can be imputed without losing much data integrity.
    '''
    dataframe = dataframe.dropna(subset=['ORIGIN', 'DEST', 'DEP_DELAY', 'CANCELLED', 'DAY', 'MONTH', 'YEAR'])

    return dataframe

feature_extractor = FunctionTransformer(extract_features)

### Data Pipeline
Uses two preprocessing steps, generates labels, then transforms numeric and categorical data.

In [7]:
from sklearn.compose import make_column_selector, ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy="most_frequent")),
    ('encoder', OneHotEncoder())
])

column_transformer = ColumnTransformer(
    transformers=[
        ('numerical', numerical_transformer, make_column_selector(dtype_include=[np.number])),
        ('categorical', categorical_transformer, make_column_selector(dtype_include=['object', 'category'])),
    ]
)

preprocessor = Pipeline(steps=[
    ('datatype_converter', datatype_converter),
    ('feature_extractor', feature_extractor),
])
transformer = Pipeline(steps=[
    ('transformer', column_transformer),
])

In [8]:
# PREPROCESS
data = preprocessor.fit_transform(data)

# LABELS
cancellation_data = data['CANCELLATION_CODE']
delay_data = data['DEP_DELAY']
data = data.drop(columns=['CANCELLATION_CODE', 'CANCELLED', 'DEP_DELAY'])

# TRANSFORMER
data = transformer.fit_transform(data)

### Training, Testing, and Validation Datasets & Labels
Creates training labels for the amount of delay in minutes and a boolean delay status with a fifteen-minute threshold. The dataset is split into training, validation, and test datasets.

In [9]:
'''
Treat all cancelled flights as 2-hour delays.
Flights are only considered delayed if they depart at least fifteen minutes after their intended departure.
'''
delay_labels = delay_data.where(cancellation_data.notna(), 120)
delay_statuses = delay_labels > 15

In [10]:
from sklearn.model_selection import train_test_split

full_training, test, full_training_delays, test_delays, full_training_statuses, test_statuses = train_test_split(
    data, delay_labels, delay_statuses, test_size=0.2, random_state=RANDOM_SEED
)
training, validation, training_delays, validation_delays, training_statuses, validation_statuses = train_test_split(
    full_training, full_training_delays, full_training_statuses, test_size=0.2, random_state=RANDOM_SEED
)

In [11]:
# PRINTS THE NUMBER OF DELAYED FLIGHTS IN EACH DATASET
print("TRAINING:", np.nonzero(training_statuses.to_numpy())[0].shape[0])
print("VALIDATION:", np.nonzero(validation_statuses.to_numpy())[0].shape[0])
print("TEST:", np.nonzero(test_statuses.to_numpy())[0].shape[0])

TRAINING: 3325
VALIDATION: 803
TEST: 994


---

## Models
### Linear Regressor

In [29]:
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import root_mean_squared_error

linear_regressor = SGDRegressor(random_state=RANDOM_SEED)
linear_regressor.fit(training, training_delays)

In [30]:
linear_delay_predictions = linear_regressor.predict(test)
linear_rmse = root_mean_squared_error(test_delays, linear_delay_predictions)
print(linear_rmse)

30.98350526408498


### Random Forest Regressor

In [31]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import root_mean_squared_error

random_forest_regressor = RandomForestRegressor(random_state=RANDOM_SEED)

grid_search = GridSearchCV(
    random_forest_regressor,
    [{'n_estimators': [5, 10, 30], 'max_features': [5, 10, 20, 50]}],
    scoring='neg_mean_squared_error',
    return_train_score=True,
    verbose=3
)

grid_search.fit(training, training_delays)
random_forest_regressor = grid_search.best_estimator_

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV 1/5] END max_features=5, n_estimators=5;, score=(train=-279.109, test=-1117.285) total time=   0.9s
[CV 2/5] END max_features=5, n_estimators=5;, score=(train=-247.565, test=-1201.280) total time=   0.9s
[CV 3/5] END max_features=5, n_estimators=5;, score=(train=-256.843, test=-1329.958) total time=   0.9s
[CV 4/5] END max_features=5, n_estimators=5;, score=(train=-281.927, test=-1142.912) total time=   0.9s
[CV 5/5] END max_features=5, n_estimators=5;, score=(train=-285.915, test=-1227.915) total time=   0.9s
[CV 1/5] END max_features=5, n_estimators=10;, score=(train=-198.069, test=-1044.451) total time=   1.7s
[CV 2/5] END max_features=5, n_estimators=10;, score=(train=-185.494, test=-1102.898) total time=   1.7s
[CV 3/5] END max_features=5, n_estimators=10;, score=(train=-179.743, test=-1254.472) total time=   1.7s
[CV 4/5] END max_features=5, n_estimators=10;, score=(train=-201.700, test=-1051.243) total time=   1.7s

In [39]:
random_forest_delay_predictions = random_forest_regressor.predict(test)
random_forest_rmse = root_mean_squared_error(test_delays, random_forest_delay_predictions)
print(random_forest_rmse)

31.266197266080848


### XG Boost

In [52]:
# https://xgboosting.com/configure-xgboost-early-stopping-via-callback/

import xgboost as xgb

early_stop = xgb.callback.EarlyStopping(rounds=10, metric_name="rmse")

params = {
    "objective": "reg:squarederror",
    "eval_metric": "rmse",
    "callbacks":[early_stop]
}

model_xgb = xgb.XGBRegressor(**params)
model_xgb.fit(
    X=training,
    y=training_delays,
    eval_set=[(validation, validation_delays)],
    verbose=True
    )

print(f"BestIteration:{model_xgb.best_iteration}")
print(f"Best val score:{model_xgb.best_score}")

[0]	validation_0-rmse:31.27727
[1]	validation_0-rmse:31.07753
[2]	validation_0-rmse:31.01644
[3]	validation_0-rmse:30.99133
[4]	validation_0-rmse:30.99351
[5]	validation_0-rmse:30.97048
[6]	validation_0-rmse:30.89473
[7]	validation_0-rmse:30.96143
[8]	validation_0-rmse:30.95128
[9]	validation_0-rmse:31.02350
[10]	validation_0-rmse:31.03548
[11]	validation_0-rmse:31.00810
[12]	validation_0-rmse:31.06944
[13]	validation_0-rmse:31.06047
[14]	validation_0-rmse:31.06957
[15]	validation_0-rmse:31.11650
BestIteration:6
Best val score:30.894727615791176


### LightGBM

In [35]:
# https://pub.aimind.so/hyperparameter-optimization-with-gridsearchcv-method-for-a-lightgbm-classification-model-f14755328fb

from lightgbm import LGBMRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import root_mean_squared_error, accuracy_score

lightgbm_regressor = LGBMRegressor(random_state=RANDOM_SEED, metric="rmse")

param_lgbm_grid = {
    "num_leaves":[31, 50, 63, 127, 200],
    "max_depth":[-1,3,5,7],
    "n_estimators":[50,100],
    "learning_rate":[0.01, 0.001,0.1,0.2],
    "subsample":[0.8,1.0],
}

lgbm_grid = GridSearchCV(lightgbm_regressor, param_lgbm_grid, cv=5, scoring="neg_root_mean_squared_error", refit=True)

lgbm_grid.fit(training, training_delays)




[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004324 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6113
[LightGBM] [Info] Number of data points in the train set: 15360, number of used features: 295
[LightGBM] [Info] Start training from score 8.380339
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003205 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6112
[LightGBM] [Info] Number of data points in the train set: 15360, number of used features: 293
[LightGBM] [Info] Start training from score 8.779948
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004011 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6109
[LightGBM] [Info] Number of data points in the train 

  _data = np.array(data, dtype=dtype, copy=copy,


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005257 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6203
[LightGBM] [Info] Number of data points in the train set: 19200, number of used features: 317
[LightGBM] [Info] Start training from score 8.599896


In [38]:
lgbm = lgbm_grid.best_estimator_

y_pred = lgbm.predict(test)
lgbm_rmse = root_mean_squared_error(y_true=test_delays, y_pred=y_pred)
print(f"RMSE \n {lgbm_rmse}")

RMSE 
 30.703190383218082


### KMeans-Linear Regressor

In [32]:
from sklearn.cluster import KMeans
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import root_mean_squared_error

# Train a KMeans clustering algorithm using KMeans
CLUSTER_COUNT = 5
kmeans = KMeans(CLUSTER_COUNT, random_state=RANDOM_SEED)
kmeans.fit(training)

# Train linear models based on each training cluster
kmeans_models = {
    cluster: SGDRegressor(random_state=RANDOM_SEED).fit(
        training[kmeans.labels_ == cluster],
        training_delays[kmeans.labels_ == cluster]
    )
    for cluster in range(CLUSTER_COUNT)
}

In [None]:
# Predict and analyze test data
test_clusters = kmeans.predict(test)

kmeans_predictions = np.concatenate([
    kmeans_models[cluster].predict(test[test_clusters == cluster])
    for cluster in range(CLUSTER_COUNT)
])

kmeans_delays = np.concatenate([
    test_delays[test_clusters == cluster]
    for cluster in range(CLUSTER_COUNT)
])

kmeans_rmse = root_mean_squared_error(kmeans_delays, kmeans_predictions)
print(kmeans_rmse)

1868180885.6916697


### SVM-Linear Regressor

In [None]:
from sklearn.svm import SVC
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import root_mean_squared_error

# Train a non-linear support vector machine
svm = SVC(random_state=RANDOM_SEED)

parameters = [
    {
        'kernel': ['rbf'],
        'C': [0.1, 1, 10, 100],
        'gamma': ['scale', 'auto']
    },
    {
        'kernel': ['poly'],
        'C': [0.1, 1, 10, 100],
        'gamma': ['scale', 'auto'],
        'degree': [1, 3, 5]
    }
]

grid_search = GridSearchCV(
    svm,
    parameters,
    scoring='accuracy',
    return_train_score=True,
    verbose=3
)

grid_search.fit(training, training_statuses)
svm = grid_search.best_estimator_

# Train a linear model based on the SVM's predicted delay statuses
svm_status_predictions = svm.predict(training)

if np.nonzero(svm_status_predictions)[0].shape[0] < 10:
    raise StopExecution("Error: SVM predicted less than 10 delayed flights in the training data. Unable to train a linear model.")

svm_linear_regressor = SGDRegressor(random_state=RANDOM_SEED).fit(
    training[svm_status_predictions == 1],
    training_delays[svm_status_predictions == 1]
)

Fitting 5 folds for each of 32 candidates, totalling 160 fits
[CV 1/5] END C=0.1, gamma=scale, kernel=rbf;, score=(train=0.827, test=0.827) total time=  27.5s
[CV 2/5] END C=0.1, gamma=scale, kernel=rbf;, score=(train=0.827, test=0.827) total time=  25.9s
[CV 3/5] END C=0.1, gamma=scale, kernel=rbf;, score=(train=0.827, test=0.827) total time=  28.1s
[CV 4/5] END C=0.1, gamma=scale, kernel=rbf;, score=(train=0.827, test=0.827) total time=  24.6s
[CV 5/5] END C=0.1, gamma=scale, kernel=rbf;, score=(train=0.827, test=0.827) total time=  24.2s
[CV 1/5] END C=0.1, gamma=auto, kernel=rbf;, score=(train=0.827, test=0.827) total time=  12.5s
[CV 2/5] END C=0.1, gamma=auto, kernel=rbf;, score=(train=0.827, test=0.827) total time=  12.3s
[CV 3/5] END C=0.1, gamma=auto, kernel=rbf;, score=(train=0.827, test=0.827) total time=  12.1s
[CV 4/5] END C=0.1, gamma=auto, kernel=rbf;, score=(train=0.827, test=0.827) total time=  14.5s
[CV 5/5] END C=0.1, gamma=auto, kernel=rbf;, score=(train=0.827, test

In [None]:
# Predict and analyze test data
svm_test_status_predictions = svm.predict(test)

svm_predictions = np.zeros(test.shape[0])
svm_predictions[svm_test_status_predictions == 1] = svm_linear_regressor.predict(test[svm_test_status_predictions == 1])

svm_rmse = root_mean_squared_error(test_delays, svm_predictions)
print(svm_rmse)

9940603635.653193


In [None]:
print(svm_predictions[svm_test_status_predictions == 1].sum())

-2153394845686.1987


### Neural Network


### FNN (baseline)

In [42]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import GridSearchCV

clf = MLPClassifier(random_state=RANDOM_SEED, alpha=1e-5, learning_rate="adaptive", early_stopping=True, n_iter_no_change=10)

param_mlp_grid = {
    "hidden_layer_sizes":[(50,),(100,),(4,67),(100,50)],
    "activation":['relu'],
    "learning_rate_init":[0.01, 0.001,0.1,0.2],
    "max_iter":[500]
}

mlp_grid = GridSearchCV(clf, param_grid=param_mlp_grid, cv=5, scoring="neg_root_mean_squared_error", refit=True)

mlp_grid.fit(training, training_delays)





In [43]:
mlp = mlp_grid.best_estimator_
y_pred = mlp.predict(test)
mlp_rmse = root_mean_squared_error(y_true=test_delays, y_pred=y_pred)
print(mlp_rmse)

33.92607158710441


In [63]:
from sklearn.ensemble import VotingRegressor
from sklearn.metrics import root_mean_squared_error


voting = VotingRegressor([
    ('lr', linear_regressor),
    ('lgbm', lgbm),
])

voting.fit(training, training_delays)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008800 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6203
[LightGBM] [Info] Number of data points in the train set: 19200, number of used features: 317
[LightGBM] [Info] Start training from score 8.599896


In [64]:
y_pred = voting.predict(test)
voting_rmse = root_mean_squared_error(y_true=test_delays, y_pred=y_pred)

In [65]:
print(f"Ensemble Voting: \n{voting_rmse}")

Ensemble Voting: 
30.637887206766898
