# Random Forest Baseline Advanced Optimization 

Cloned from: [Feature_selection_LR_airlinejoin (baseline-3)](tobe update)
- Has the hyperparameter tuning for the baseline with all features for midterm.

- What is this notebook for? 
  - Notebook that runs the baseline Random Forest with advanced optimization
  - Machine Learning Algorithms : Random Forest
  - Metrics: F1, accuracy, precision
  
---
  
- Data:
  - Final, cleaned weather
  - Cleaned flights minus (PR departure delay, PR arrival delay, carrier ordinal)

RF in spark

https://adb-731998097721284.4.azuredatabricks.net/?o=731998097721284#notebook/1898361324232092/command/1898361324232093

Feature selection in RF

https://antonhaugen.medium.com/feature-selection-for-pyspark-tree-classifiers-3be54a8bc493

# 0. Preparation

In [0]:
import pandas as pd
import numpy as np
import os
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import ast
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
import geopandas as gpd
import folium
import time
import hyperopt

from pyspark.sql.functions import col, max, min
from pyspark.sql.functions import col,isnan,when,count
from pyspark.sql.functions import desc
from pyspark.sql.types import DateType
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler
from pyspark.ml.feature import MinMaxScaler
from pyspark.mllib.evaluation import MulticlassMetrics, BinaryClassificationMetrics
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.sql.functions import add_months, date_add
from pyspark.ml.classification import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

from hyperopt import tpe, hp, fmin, STATUS_OK
from hyperopt.pyll.base import scope

from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
from hyperopt import Trials



In [0]:
# Put at the top of any notebooks for storing in blob

from pyspark.sql.functions import col, max

blob_container = "team06" # The name of your container created in https://portal.azure.com
storage_account = "apatel" # The name of your Storage account created in https://portal.azure.com
secret_scope = "team06" # The name of the scope created in your local computer using the Databricks CLI
secret_key = "team06" # The name of the secret key created in your local computer using the Databricks CLI 
blob_url = f"wasbs://{blob_container}@{storage_account}.blob.core.windows.net"
mount_path = "/mnt/mids-w261"

#Matt RF Manual Simplified

This is a streamlined version of the original tuning notebook to more quickly tune a model for each fold for use in ensembling/voting.

We export both the validation and test set predictions, as the validation set will be used to tune the threshold.

In [0]:
def rebalanceDF(trainingDF, desired_neg_to_pos_ratio):
  original_dep_del15_count_df = trainingDF.groupby('DEP_DEL15').count()
  positiveTrainCount = original_dep_del15_count_df.filter(original_dep_del15_count_df.DEP_DEL15 ==  1).select(['count']).head()[0]
  negativeTrainCount = original_dep_del15_count_df.filter(original_dep_del15_count_df.DEP_DEL15 ==  0).select(['count']).head()[0]

  # undersampling the negative cases
  negativeDF = trainingDF.filter(trainingDF['DEP_DEL15']==0).sample(False, positiveTrainCount/negativeTrainCount*desired_neg_to_pos_ratio, seed=12345)
  positiveDF = trainingDF.filter(trainingDF['DEP_DEL15']==1)
  new_trainDF = positiveDF.union(negativeDF).cache()
  return new_trainDF

In [0]:
df_imported = spark.read.parquet(f'{blob_url}/full_join_mattsFeats_anandFeats_cleaned_extraFeatures_upToOD_good_v8_4-6-22')

In [0]:
feature_list_full = ['DISTANCE', 'ORI_elevation', 'DEST_elevation', 'prior_dep_delayed', 'previous_DEP_DELAY_NEW', 'prior_arr_delayed', 'previous_ARR_DELAY_NEW', 'plane_is_here', 'avg_carrier_delay_24hrs', 'flights_sch_Today_ORIGIN', 'flights_sch_Today_DEST', 'avg_ori_airport_delay_24hrs', 'daytime', 'evening', 'new_england', 'mid_atlantic', 'south', 'midwest', 'southwest', 'west', 'pacific_islands', 'spring', 'winter', 'autumn', 'weekend_or_holiday', 'prior_dep_delayed', 'previous_DEP_DELAY_NEW', 'plane_is_here', 'previous_ARR_DELAY_NEW', 'avg_carrier_delay_24hrs', 'prior_arr_delayed', 'avg_ori_airport_delay_24hrs', 'depDelayPageRank', 'arrDelayPageRank', 'OC1_0_wind_gust_spd_rate_imp', 'AA1_1_liquid_precip', 'AA3_1_liquid_precip', 'gd1_0_sky_coverage', 'au2_4_extreme_wind_weather', 'mv1_0_sand_dust_near', 'mv1_0_thunder_rain_near', 'aw1_mw1_0_smoke_haze_dust', 'aw1_mw1_0_fog', 'aw1_mw1_0_rain_drizzle', 'aw1_mw1_0_freezing_rain_drizzle', 'aw1_mw1_0_snow', 'aw1_mw1_0_hail_or_ice', 'aw1_mw1_0_thunderstorm', 'aw1_mw1_0_tornado', 'SLP_0_avg_station_press_imp', 'WND_3_wind_speed_imp', 'CIG_0_sky_ceiling_height_imp', 'VIS_0_visibility_dist_imp', 'TMP_0_air_temperature_imp', 'DEW_0_dew_pt_temp_imp', 'MA1_0_altimeter_set_rate_imp', 'MA1_2_station_pres_rate_imp', 'GD1_3_cloud_height_imp', 'avg_hourly_delay_24hr', 'avg_OD_dep_del15_24hr', 'avg_OD_delay_min_24hr', 'avg_OD_num_flights_24hr', 'depDelayPageRank_ordinal', 'arrDelayPageRank_ordinal', 'departed_for_current_aiport', 'time_inb_flight_min', 'avg_time_inb_flights_carrier_24hr', 'avg_time_inb_flights_origin_24hr', 'avg_time_inb_flights_dest_24hr', 'avg_ori_DEP_DELAY_NEW_24hr', 'airline_carrier_del', 'airline_carrier_del_min', 'avg_carrier_delay_over15_lastQ', 'airline_carrier_del_ordinal', 'airline_carrier_del_min_ordinal', 'origin_avg_DEP_DEL15', 'origin_avg_DEP_DELAY_NEW', 'avg_origin_delay_over15_lastQ', 'origin_del_ordinal', 'origin_del_min_ordinal', 'OD_avg_DEP_DEL15', 'OD_avg_DEP_DELAY_NEW', 'avg_OD_delay_over15_lastQ', 'OD_del_ordinal', 'OD_del_min_ordinal']

In [0]:
desired_neg_to_pos_ratio = 1.6

maxDepth=8
maxBins=64
minInstancesPerNode=10
numTrees=100
seed=12345

CV_name = 'fold_1'
train_data = df_imported.where((col('YEAR').cast('int') == 2015))
val_data = df_imported.where((col('YEAR').cast('int') == 2016))
holdout_data = df_imported.where((col('YEAR').cast('int') == 2019))
save_name = 'xgboost_test_preds_4_9_'+CV_name

new_trainDF = rebalanceDF(train_data, desired_neg_to_pos_ratio)
val = val_data
holdout = holdout_data

# Assemble features
assembler = VectorAssembler().setInputCols(feature_list_full).setOutputCol('features')
assmb_train = assembler.transform(new_trainDF)
assmb_val = assembler.transform(val)
assmb_holdout = assembler.transform(holdout)

# modeling
rf = RandomForestClassifier(featuresCol = 'features', labelCol = 'DEP_DEL15',
                            maxDepth=maxDepth, maxBins=maxBins, minInstancesPerNode=minInstancesPerNode, numTrees=numTrees, seed=seed)
rfModel = rf.fit(assmb_train)
val_fold_1 = rfModel.transform(assmb_val)
predictions_fold_1 = rfModel.transform(assmb_holdout)

In [0]:
display(predictions_fold_1)

YEAR,QUARTER,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,FL_DATE,OP_UNIQUE_CARRIER,OP_CARRIER_AIRLINE_ID,TAIL_NUM,OP_CARRIER_FL_NUM,ORIGIN_AIRPORT_ID,ORIGIN_AIRPORT_SEQ_ID,ORIGIN,ORIGIN_STATE_ABR,ORIGIN_STATE_FIPS,ORIGIN_STATE_NM,DEST_AIRPORT_ID,DEST_AIRPORT_SEQ_ID,DEST,DEST_STATE_ABR,DEST_STATE_FIPS,DEST_STATE_NM,CRS_DEP_TIME,DEP_TIME,DEP_DELAY,DEP_DELAY_NEW,DEP_DEL15,DEP_DELAY_GROUP,DEP_TIME_BLK,CRS_ARR_TIME,ARR_TIME,ARR_DELAY,ARR_DELAY_NEW,ARR_DEL15,CANCELLED,ACTUAL_ELAPSED_TIME,DISTANCE,DISTANCE_GROUP,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,LATE_AIRCRAFT_DELAY,ORI_IATA,ORI_station_id,ORI_station_lat,ORI_station_lon,ORI_airport_lat,ORI_airport_lon,ORI_elevation,ORI_dist_airp_sta,DEST_IATA,DEST_station_id,DEST_station_lat,DEST_station_lon,DEST_airport_lat,DEST_airport_lon,DEST_elevation,DEST_dist_airp_sta,CRS_DEP_HRS,CRS_DEP_MINS,CRS_DEP_TIME_STR,CRS_DEP_DT_STR,CRS_DEP_DATETIME,iata_code,ORI_timezone,CRS_DEP_DATETIME_UTC,CRS_DEP_DATETIME_UTC_END,CRS_DEP_DATETIME_UTC_START,STATION,DATE,LATITUDE,LONGITUDE,ELEVATION,REPORT_TYPE,DEST_timezone,DEP_HRS,DEP_MINS,DEP_TIME_STR,DEP_DT_STR,DEP_DATETIME,ARR_DATETIME_ACTUAL_UTC,late_night,daytime,evening,region_name,new_england,mid_atlantic,south,midwest,southwest,west,pacific_islands,atlantic_islands,spring,summer,autumn,winter,dep_date,weekend_or_holiday,flightID,ID,previous_flight_delay_status,previous_flight_dep_time,time_between_departures_min,valid_dep_delay,prior_dep_delayed,previous_DEP_DELAY_NEW_value,previous_DEP_DELAY_NEW,previous_flight_arrdelay_status,previous_flight_arr_time,time_between_arrival_and_end_min,valid_arr_delay,prior_arr_delayed,previous_ARR_DELAY_NEW_value,previous_ARR_DELAY_NEW,prev_arrival_airport,plane_is_here,avg_carrier_delay_24hrs,flights_sch_Today_ORIGIN,flights_sch_Today_DEST,avg_ori_airport_delay_24hrs,year_quarter,quarter_enum,quarter_enum_prev,month_seq_index,wnd_2_wind_obs_type,wnd_3_wind_sp_rate,cig_0_height,vis_0_distance,slp_0_day_avg,tmp_0_air_temp,dew_0_point_temp,ma1_0_altimeter_setting_rate,ma1_2_station_pressure_rate,gd1_3_sky_cover_height,WND_3_wind_speed,CIG_0_sky_ceiling_height,VIS_0_visibility_dist,SLP_0_avg_station_press,TMP_0_air_temperature,DEW_0_dew_pt_temp,MA1_0_altimeter_set_rate,MA1_2_station_pres_rate,OC1_0_wind_gust_spd_rate_imp,GD1_3_cloud_height,AA1_1_liquid_precip,AA3_1_liquid_precip,gd1_0_sky_coverage,au2_4_extreme_wind_weather,mv1_0_sand_dust_near,mv1_0_thunder_rain_near,aw1_mw1_0_smoke_haze_dust,aw1_mw1_0_fog,aw1_mw1_0_rain_drizzle,aw1_mw1_0_freezing_rain_drizzle,aw1_mw1_0_snow,aw1_mw1_0_hail_or_ice,aw1_mw1_0_thunderstorm,aw1_mw1_0_tornado,SLP_0_avg_station_press_imp,WND_3_wind_speed_imp,CIG_0_sky_ceiling_height_imp,VIS_0_visibility_dist_imp,TMP_0_air_temperature_imp,DEW_0_dew_pt_temp_imp,MA1_0_altimeter_set_rate_imp,MA1_2_station_pres_rate_imp,GD1_3_cloud_height_imp,depDelayPageRank,arrDelayPageRank,UniqueID,hour,hour_UTC,avg_hourly_delay_24hr,OD_Pair,avg_OD_dep_del15_24hr,avg_OD_delay_min_24hr,avg_OD_num_flights_24hr,depDelayPageRank_ordinal,arrDelayPageRank_ordinal,recent_dep_time,recent_dep_dest,time_between_currentDepTime_recentDepTime,dep_flight_after_prediction,recent_dep_flight_coming_here,departed_for_current_aiport,1st_most_recent_dep_UTC,2nd_most_recent_dep_UTC,1st_most_recent_arr_UTC,2nd_most_recent_arr_UTC,3rd_most_recent_arr_UTC,col1_time_inb_flight_min,col2_time_inb_flight_min,col3_time_inb_flight_min,time_inb_flight_min,avg_time_inb_flights_carrier_24hr,avg_time_inb_flights_origin_24hr,avg_time_inb_flights_dest_24hr,avg_ori_DEP_DELAY_NEW_24hr,airline_carrier_del,airline_carrier_del_min,avg_carrier_delay_over15_lastQ,airline_carrier_del_ordinal,airline_carrier_del_min_ordinal,origin_avg_DEP_DEL15,origin_avg_DEP_DELAY_NEW,avg_origin_delay_over15_lastQ,origin_del_ordinal,origin_del_min_ordinal,OD_avg_DEP_DEL15,OD_avg_DEP_DELAY_NEW,avg_OD_delay_over15_lastQ,OD_del_ordinal,OD_del_min_ordinal,features,rawPrediction,probability,prediction
2019,1,1,1,2,2019-01-01,MQ,20398,N259NN,3574,11624,1162402,EYW,FL,12,Florida,11298,1129806,DFW,TX,48,Texas,1715,1728,13.0,13.0,0.0,0,1700-1759,1935,1933.0,-2.0,0.0,0.0,0.0,185.0,1089.0,5,,,,,EYW,72201012836,24.557,-81.755,24.55610085,-81.75959778,3.0,0.4756356865333155,DFW,72259003927,32.898,-97.019,32.896801,-97.038002,607.0,1.7791117135539851,17,15,17:15,2019-01-01 17:15,2019-01-01T17:15:00.000+0000,EYW,America/New_York,2019-01-01T22:15:00.000+0000,2019-01-01T20:15:00.000+0000,2019-01-01T18:15:00.000+0000,72201012836,2019-01-01T19:53:00.000+0000,24.5571,-81.7554,0.3,FM-15,America/Chicago,17,28,17:28,2019-01-01 17:28,2019-01-01T17:28:00.000+0000,2019-01-02T01:33:00.000+0000,0,0,1,south,0,0,1,0,0,0,0,0,0,0,0,1,2019-01-01,1,25770743874,MQN259NNEYW2019-01-011715,0.0,2019-01-01T18:52:00.000+0000,203.0,yes,0,0.0,0.0,0.0,2019-01-01T21:47:00.000+0000,28.0,no,0,8.0,0.0,EYW,0,0.24,18,756,0.06,2019-1,16,15,48,N,62,22000,16093,10187,278,222,10190,10182,1463.0,62,22000.0,16093.0,10187.0,278.0,222.0,10190.0,10182.0,0,1463.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10187.0,62.0,22000.0,16093.0,278.0,222.0,10190.0,10182.0,1463.0,0.0018347068392541,0.0012491987102257,MQN259NNEYWDFW2019-01-012019-01-01 22:15:00,17,22,0.268,EYW-DFW,0.0,0.0,1,124,174,2019-01-01T18:52:00.000+0000,EYW,203.0,no,yes,1,2019-01-01T18:52:00.000+0000,2019-01-01T16:31:00.000+0000,2019-01-01T21:47:00.000+0000,2019-01-01T17:52:00.000+0000,2019-01-01T15:40:00.000+0000,28.0,60.0,51.0,60.0,315.93,433.13,478.33,7.8125,0.160446408186836,10.314097312687178,0,10,12,0.1594454072790294,12.46100519930676,0,147,165,0.5,135.875,1,25,1,"Map(vectorType -> sparse, length -> 85, indices -> List(0, 1, 2, 8, 9, 10, 11, 13, 16, 22, 24, 29, 31, 32, 33, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 73, 74, 75, 76, 78, 79, 80, 81, 82, 83, 84), values -> List(1089.0, 3.0, 607.0, 0.24, 18.0, 756.0, 0.06, 1.0, 1.0, 1.0, 1.0, 0.24, 0.06, 0.0018347068392541843, 0.0012491987102257365, 10187.0, 62.0, 22000.0, 16093.0, 278.0, 222.0, 10190.0, 10182.0, 1463.0, 0.268, 1.0, 124.0, 174.0, 1.0, 60.0, 315.93, 433.13, 478.33, 7.8125, 0.16044640818683606, 10.314097312687178, 10.0, 12.0, 0.15944540727902945, 12.46100519930676, 147.0, 165.0, 0.5, 135.875, 1.0, 25.0, 1.0))","Map(vectorType -> dense, length -> 2, values -> List(70.81626433353937, 29.18373566646064))","Map(vectorType -> dense, length -> 2, values -> List(0.7081626433353937, 0.2918373566646064))",0.0
2019,1,1,2,3,2019-01-02,MQ,20398,N241NN,3574,11624,1162402,EYW,FL,12,Florida,11298,1129806,DFW,TX,48,Texas,1715,1715,0.0,0.0,0.0,0,1700-1759,1935,1915.0,-20.0,0.0,0.0,0.0,180.0,1089.0,5,,,,,EYW,72201012836,24.557,-81.755,24.55610085,-81.75959778,3.0,0.4756356865333155,DFW,72259003927,32.898,-97.019,32.896801,-97.038002,607.0,1.7791117135539851,17,15,17:15,2019-01-02 17:15,2019-01-02T17:15:00.000+0000,EYW,America/New_York,2019-01-02T22:15:00.000+0000,2019-01-02T20:15:00.000+0000,2019-01-02T18:15:00.000+0000,72201012836,2019-01-02T20:03:00.000+0000,24.5571,-81.7554,0.3,FM-16,America/Chicago,17,15,17:15,2019-01-02 17:15,2019-01-02T17:15:00.000+0000,2019-01-03T01:15:00.000+0000,0,0,1,south,0,0,1,0,0,0,0,0,0,0,0,1,2019-01-02,1,25770744660,MQN241NNEYW2019-01-021715,0.0,2019-01-02T18:52:00.000+0000,203.0,yes,0,0.0,0.0,0.0,2019-01-02T21:41:00.000+0000,34.0,no,0,2.0,0.0,EYW,0,0.33,19,785,0.21,2019-1,16,15,48,N,62,22000,16093,99999,272,222,10183,10176,579.0,62,22000.0,16093.0,,272.0,222.0,10183.0,10176.0,118,579.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10186.980827447023,62.0,22000.0,16093.0,272.0,222.0,10183.0,10176.0,579.0,0.0018347068392541,0.0012491987102257,MQN241NNEYWDFW2019-01-022019-01-02 22:15:00,17,22,0.26,EYW-DFW,0.0,13.0,1,124,174,2019-01-02T18:52:00.000+0000,EYW,203.0,no,yes,1,2019-01-02T18:52:00.000+0000,2019-01-02T16:43:00.000+0000,2019-01-02T21:41:00.000+0000,2019-01-02T18:05:00.000+0000,2019-01-02T16:15:00.000+0000,34.0,47.0,28.0,47.0,255.4,532.79,348.61,10.105263157894736,0.160446408186836,10.314097312687178,0,10,12,0.1594454072790294,12.46100519930676,0,147,165,0.5,135.875,1,25,1,"Map(vectorType -> sparse, length -> 85, indices -> List(0, 1, 2, 8, 9, 10, 11, 13, 16, 22, 24, 29, 31, 32, 33, 34, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 73, 74, 75, 76, 78, 79, 80, 81, 82, 83, 84), values -> List(1089.0, 3.0, 607.0, 0.33, 19.0, 785.0, 0.21, 1.0, 1.0, 1.0, 1.0, 0.33, 0.21, 0.0018347068392541843, 0.0012491987102257365, 118.0, 10186.980827447023, 62.0, 22000.0, 16093.0, 272.0, 222.0, 10183.0, 10176.0, 579.0, 0.26, 13.0, 1.0, 124.0, 174.0, 1.0, 47.0, 255.4, 532.79, 348.61, 10.105263157894736, 0.16044640818683606, 10.314097312687178, 10.0, 12.0, 0.15944540727902945, 12.46100519930676, 147.0, 165.0, 0.5, 135.875, 1.0, 25.0, 1.0))","Map(vectorType -> dense, length -> 2, values -> List(62.69879176423711, 37.301208235762864))","Map(vectorType -> dense, length -> 2, values -> List(0.6269879176423713, 0.3730120823576287))",0.0
2019,1,1,3,4,2019-01-03,MQ,20398,N250NN,3574,11624,1162402,EYW,FL,12,Florida,11298,1129806,DFW,TX,48,Texas,1715,1725,10.0,10.0,0.0,0,1700-1759,1935,1944.0,9.0,9.0,0.0,0.0,199.0,1089.0,5,,,,,EYW,72201012836,24.557,-81.755,24.55610085,-81.75959778,3.0,0.4756356865333155,DFW,72259003927,32.898,-97.019,32.896801,-97.038002,607.0,1.7791117135539851,17,15,17:15,2019-01-03 17:15,2019-01-03T17:15:00.000+0000,EYW,America/New_York,2019-01-03T22:15:00.000+0000,2019-01-03T20:15:00.000+0000,2019-01-03T18:15:00.000+0000,72201012836,2019-01-03T19:53:00.000+0000,24.5571,-81.7554,0.3,FM-15,America/Chicago,17,25,17:25,2019-01-03 17:25,2019-01-03T17:25:00.000+0000,2019-01-04T01:44:00.000+0000,0,0,1,south,0,0,1,0,0,0,0,0,0,0,0,1,2019-01-03,0,25770745489,MQN250NNEYW2019-01-031715,0.0,2019-01-03T18:52:00.000+0000,203.0,yes,0,0.0,0.0,0.0,2019-01-03T21:44:00.000+0000,31.0,no,0,5.0,0.0,EYW,0,0.33,19,791,0.21,2019-1,16,15,48,N,51,22000,16093,10164,278,222,10166,10159,99999.0,51,22000.0,16093.0,10164.0,278.0,222.0,10166.0,10159.0,0,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10164.0,51.0,22000.0,16093.0,278.0,222.0,10166.0,10159.0,1089.0690978886755,0.0018347068392541,0.0012491987102257,MQN250NNEYWDFW2019-01-032019-01-03 22:15:00,17,22,0.285,EYW-DFW,0.0,0.0,1,124,174,2019-01-03T18:52:00.000+0000,EYW,203.0,no,yes,1,2019-01-03T18:52:00.000+0000,2019-01-03T16:43:00.000+0000,2019-01-03T21:44:00.000+0000,2019-01-03T18:10:00.000+0000,2019-01-03T16:24:00.000+0000,31.0,42.0,19.0,42.0,242.01,324.42,369.78,10.789473684210526,0.160446408186836,10.314097312687178,0,10,12,0.1594454072790294,12.46100519930676,0,147,165,0.5,135.875,1,25,1,"Map(vectorType -> sparse, length -> 85, indices -> List(0, 1, 2, 8, 9, 10, 11, 13, 16, 22, 29, 31, 32, 33, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 73, 74, 75, 76, 78, 79, 80, 81, 82, 83, 84), values -> List(1089.0, 3.0, 607.0, 0.33, 19.0, 791.0, 0.21, 1.0, 1.0, 1.0, 0.33, 0.21, 0.0018347068392541843, 0.0012491987102257365, 10164.0, 51.0, 22000.0, 16093.0, 278.0, 222.0, 10166.0, 10159.0, 1089.0690978886755, 0.285, 1.0, 124.0, 174.0, 1.0, 42.0, 242.01, 324.42, 369.78, 10.789473684210526, 0.16044640818683606, 10.314097312687178, 10.0, 12.0, 0.15944540727902945, 12.46100519930676, 147.0, 165.0, 0.5, 135.875, 1.0, 25.0, 1.0))","Map(vectorType -> dense, length -> 2, values -> List(64.5180344004762, 35.481965599523804))","Map(vectorType -> dense, length -> 2, values -> List(0.645180344004762, 0.354819655995238))",0.0
2019,1,1,4,5,2019-01-04,MQ,20398,N253NN,3574,11624,1162402,EYW,FL,12,Florida,11298,1129806,DFW,TX,48,Texas,1715,1718,3.0,3.0,0.0,0,1700-1759,1935,1949.0,14.0,14.0,0.0,0.0,211.0,1089.0,5,,,,,EYW,72201012836,24.557,-81.755,24.55610085,-81.75959778,3.0,0.4756356865333155,DFW,72259003927,32.898,-97.019,32.896801,-97.038002,607.0,1.7791117135539851,17,15,17:15,2019-01-04 17:15,2019-01-04T17:15:00.000+0000,EYW,America/New_York,2019-01-04T22:15:00.000+0000,2019-01-04T20:15:00.000+0000,2019-01-04T18:15:00.000+0000,72201012836,2019-01-04T19:53:00.000+0000,24.5571,-81.7554,0.3,FM-15,America/Chicago,17,18,17:18,2019-01-04 17:18,2019-01-04T17:18:00.000+0000,2019-01-05T01:49:00.000+0000,0,0,1,south,0,0,1,0,0,0,0,0,0,0,0,1,2019-01-04,1,25770746329,MQN253NNEYW2019-01-041715,0.0,2019-01-04T18:52:00.000+0000,203.0,yes,0,0.0,0.0,0.0,2019-01-04T21:45:00.000+0000,30.0,no,0,6.0,0.0,EYW,0,0.15,19,805,0.11,2019-1,16,15,48,N,41,22000,16093,10152,278,228,10152,10145,99999.0,41,22000.0,16093.0,10152.0,278.0,228.0,10152.0,10145.0,0,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10152.0,41.0,22000.0,16093.0,278.0,228.0,10152.0,10145.0,1089.0690978886755,0.0018347068392541,0.0012491987102257,MQN253NNEYWDFW2019-01-042019-01-04 22:15:00,17,22,0.219,EYW-DFW,0.0,10.0,1,124,174,2019-01-04T18:52:00.000+0000,EYW,203.0,no,yes,1,2019-01-04T18:52:00.000+0000,2019-01-04T16:43:00.000+0000,2019-01-04T21:45:00.000+0000,2019-01-04T18:00:00.000+0000,2019-01-04T16:01:00.000+0000,30.0,52.0,42.0,52.0,279.29,542.0,670.65,5.7368421052631575,0.160446408186836,10.314097312687178,0,10,12,0.1594454072790294,12.46100519930676,0,147,165,0.5,135.875,1,25,1,"Map(vectorType -> sparse, length -> 85, indices -> List(0, 1, 2, 8, 9, 10, 11, 13, 16, 22, 24, 29, 31, 32, 33, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 73, 74, 75, 76, 78, 79, 80, 81, 82, 83, 84), values -> List(1089.0, 3.0, 607.0, 0.15, 19.0, 805.0, 0.11, 1.0, 1.0, 1.0, 1.0, 0.15, 0.11, 0.0018347068392541843, 0.0012491987102257365, 10152.0, 41.0, 22000.0, 16093.0, 278.0, 228.0, 10152.0, 10145.0, 1089.0690978886755, 0.219, 10.0, 1.0, 124.0, 174.0, 1.0, 52.0, 279.29, 542.0, 670.65, 5.7368421052631575, 0.16044640818683606, 10.314097312687178, 10.0, 12.0, 0.15944540727902945, 12.46100519930676, 147.0, 165.0, 0.5, 135.875, 1.0, 25.0, 1.0))","Map(vectorType -> dense, length -> 2, values -> List(73.59567715733483, 26.40432284266517))","Map(vectorType -> dense, length -> 2, values -> List(0.7359567715733484, 0.2640432284266517))",0.0
2019,1,1,5,6,2019-01-05,MQ,20398,N268NN,3574,11624,1162402,EYW,FL,12,Florida,11298,1129806,DFW,TX,48,Texas,1715,1712,-3.0,0.0,0.0,-1,1700-1759,1935,1918.0,-17.0,0.0,0.0,0.0,186.0,1089.0,5,,,,,EYW,72201012836,24.557,-81.755,24.55610085,-81.75959778,3.0,0.4756356865333155,DFW,72259003927,32.898,-97.019,32.896801,-97.038002,607.0,1.7791117135539851,17,15,17:15,2019-01-05 17:15,2019-01-05T17:15:00.000+0000,EYW,America/New_York,2019-01-05T22:15:00.000+0000,2019-01-05T20:15:00.000+0000,2019-01-05T18:15:00.000+0000,72201012836,2019-01-05T19:53:00.000+0000,24.5571,-81.7554,0.3,FM-15,America/Chicago,17,12,17:12,2019-01-05 17:12,2019-01-05T17:12:00.000+0000,2019-01-06T01:18:00.000+0000,0,0,1,south,0,0,1,0,0,0,0,0,0,0,0,1,2019-01-05,1,25770747099,MQN268NNEYW2019-01-051715,0.0,2019-01-05T18:52:00.000+0000,203.0,yes,0,0.0,0.0,0.0,2019-01-05T21:31:00.000+0000,44.0,no,0,0.0,0.0,EYW,0,0.08,20,672,0.15,2019-1,16,15,48,N,67,22000,16093,10172,250,183,10173,10165,99999.0,67,22000.0,16093.0,10172.0,250.0,183.0,10173.0,10165.0,0,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10172.0,67.0,22000.0,16093.0,250.0,183.0,10173.0,10165.0,1089.0690978886755,0.0018347068392541,0.0012491987102257,MQN268NNEYWDFW2019-01-052019-01-05 22:15:00,17,22,0.175,EYW-DFW,0.0,3.0,1,124,174,2019-01-05T18:52:00.000+0000,EYW,203.0,no,yes,1,2019-01-05T18:52:00.000+0000,2019-01-05T16:43:00.000+0000,2019-01-05T21:31:00.000+0000,2019-01-05T17:46:00.000+0000,2019-01-05T15:43:00.000+0000,44.0,66.0,60.0,66.0,252.44,319.45,377.09,9.05,0.160446408186836,10.314097312687178,0,10,12,0.1594454072790294,12.46100519930676,0,147,165,0.5,135.875,1,25,1,"Map(vectorType -> sparse, length -> 85, indices -> List(0, 1, 2, 8, 9, 10, 11, 13, 16, 22, 24, 29, 31, 32, 33, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 73, 74, 75, 76, 78, 79, 80, 81, 82, 83, 84), values -> List(1089.0, 3.0, 607.0, 0.08, 20.0, 672.0, 0.15, 1.0, 1.0, 1.0, 1.0, 0.08, 0.15, 0.0018347068392541843, 0.0012491987102257365, 10172.0, 67.0, 22000.0, 16093.0, 250.0, 183.0, 10173.0, 10165.0, 1089.0690978886755, 0.175, 3.0, 1.0, 124.0, 174.0, 1.0, 66.0, 252.44, 319.45, 377.09, 9.05, 0.16044640818683606, 10.314097312687178, 10.0, 12.0, 0.15944540727902945, 12.46100519930676, 147.0, 165.0, 0.5, 135.875, 1.0, 25.0, 1.0))","Map(vectorType -> dense, length -> 2, values -> List(75.8359646341786, 24.164035365821412))","Map(vectorType -> dense, length -> 2, values -> List(0.758359646341786, 0.24164035365821412))",0.0
2019,1,1,6,7,2019-01-06,MQ,20398,N241NN,3910,11624,1162402,EYW,FL,12,Florida,11298,1129806,DFW,TX,48,Texas,1510,1507,-3.0,0.0,0.0,-1,1500-1559,1729,1712.0,-17.0,0.0,0.0,0.0,185.0,1089.0,5,,,,,EYW,72201012836,24.557,-81.755,24.55610085,-81.75959778,3.0,0.4756356865333155,DFW,72259003927,32.898,-97.019,32.896801,-97.038002,607.0,1.7791117135539851,15,10,15:10,2019-01-06 15:10,2019-01-06T15:10:00.000+0000,EYW,America/New_York,2019-01-06T20:10:00.000+0000,2019-01-06T18:10:00.000+0000,2019-01-06T16:10:00.000+0000,72201012836,2019-01-06T18:00:00.000+0000,24.5571,-81.7554,0.3,FM-12,America/Chicago,15,7,15:07,2019-01-06 15:07,2019-01-06T15:07:00.000+0000,2019-01-06T23:12:00.000+0000,0,0,1,south,0,0,1,0,0,0,0,0,0,0,0,1,2019-01-06,1,25770747808,MQN241NNEYW2019-01-061510,0.0,2019-01-06T16:45:00.000+0000,205.0,yes,0,0.0,0.0,0.0,2019-01-06T19:22:00.000+0000,48.0,no,0,0.0,0.0,EYW,0,0.08,19,756,0.17,2019-1,16,15,48,N,21,22000,16000,10199,211,139,99999,10193,,21,22000.0,16000.0,10199.0,211.0,139.0,,10193.0,0,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10199.0,21.0,22000.0,16000.0,211.0,139.0,10183.97337962963,10193.0,1089.0690978886755,0.0018347068392541,0.0012491987102257,MQN241NNEYWDFW2019-01-062019-01-06 20:10:00,15,20,0.191,EYW-DFW,0.0,0.0,1,124,174,2019-01-06T16:45:00.000+0000,EYW,205.0,no,yes,1,2019-01-06T16:45:00.000+0000,2019-01-05T21:52:00.000+0000,2019-01-06T19:22:00.000+0000,2019-01-05T23:22:00.000+0000,2019-01-05T21:02:00.000+0000,48.0,1043.0,50.0,1043.0,289.76,461.11,424.29,6.833333333333333,0.160446408186836,10.314097312687178,0,10,12,0.1594454072790294,12.46100519930676,0,147,165,0.5,135.875,1,25,1,"Map(vectorType -> sparse, length -> 85, indices -> List(0, 1, 2, 8, 9, 10, 11, 13, 16, 22, 24, 29, 31, 32, 33, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 73, 74, 75, 76, 78, 79, 80, 81, 82, 83, 84), values -> List(1089.0, 3.0, 607.0, 0.08, 19.0, 756.0, 0.17, 1.0, 1.0, 1.0, 1.0, 0.08, 0.17, 0.0018347068392541843, 0.0012491987102257365, 10199.0, 21.0, 22000.0, 16000.0, 211.0, 139.0, 10183.97337962963, 10193.0, 1089.0690978886755, 0.191, 1.0, 124.0, 174.0, 1.0, 1043.0, 289.76, 461.11, 424.29, 6.833333333333333, 0.16044640818683606, 10.314097312687178, 10.0, 12.0, 0.15944540727902945, 12.46100519930676, 147.0, 165.0, 0.5, 135.875, 1.0, 25.0, 1.0))","Map(vectorType -> dense, length -> 2, values -> List(75.07310107897175, 24.926898921028272))","Map(vectorType -> dense, length -> 2, values -> List(0.7507310107897174, 0.24926898921028268))",0.0
2019,1,1,7,1,2019-01-07,MQ,20398,N227NN,3910,11624,1162402,EYW,FL,12,Florida,11298,1129806,DFW,TX,48,Texas,1510,1503,-7.0,0.0,0.0,-1,1500-1559,1729,1712.0,-17.0,0.0,0.0,0.0,189.0,1089.0,5,,,,,EYW,72201012836,24.557,-81.755,24.55610085,-81.75959778,3.0,0.4756356865333155,DFW,72259003927,32.898,-97.019,32.896801,-97.038002,607.0,1.7791117135539851,15,10,15:10,2019-01-07 15:10,2019-01-07T15:10:00.000+0000,EYW,America/New_York,2019-01-07T20:10:00.000+0000,2019-01-07T18:10:00.000+0000,2019-01-07T16:10:00.000+0000,72201012836,2019-01-07T18:00:00.000+0000,24.5571,-81.7554,0.3,FM-12,America/Chicago,15,3,15:03,2019-01-07 15:03,2019-01-07T15:03:00.000+0000,2019-01-07T23:12:00.000+0000,0,0,1,south,0,0,1,0,0,0,0,0,0,0,0,1,2019-01-07,0,25770748608,MQN227NNEYW2019-01-071510,0.0,2019-01-07T16:45:00.000+0000,205.0,yes,0,0.0,0.0,0.0,2019-01-07T19:31:00.000+0000,39.0,no,0,0.0,0.0,EYW,0,0.12,17,792,0.11,2019-1,16,15,48,N,67,22000,16000,10206,272,206,99999,10200,,67,22000.0,16000.0,10206.0,272.0,206.0,,10200.0,0,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10206.0,67.0,22000.0,16000.0,272.0,206.0,10183.97337962963,10200.0,1089.0690978886755,0.0018347068392541,0.0012491987102257,MQN227NNEYWDFW2019-01-072019-01-07 20:10:00,15,20,0.223,EYW-DFW,0.0,0.0,1,124,174,2019-01-07T16:45:00.000+0000,EYW,205.0,no,yes,1,2019-01-07T16:45:00.000+0000,2019-01-06T21:21:00.000+0000,2019-01-07T19:31:00.000+0000,2019-01-07T00:27:00.000+0000,2019-01-06T21:49:00.000+0000,39.0,978.0,-28.0,978.0,282.24,297.79,397.22,3.0526315789473686,0.160446408186836,10.314097312687178,0,10,12,0.1594454072790294,12.46100519930676,0,147,165,0.5,135.875,1,25,1,"Map(vectorType -> sparse, length -> 85, indices -> List(0, 1, 2, 8, 9, 10, 11, 13, 16, 22, 29, 31, 32, 33, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 73, 74, 75, 76, 78, 79, 80, 81, 82, 83, 84), values -> List(1089.0, 3.0, 607.0, 0.12, 17.0, 792.0, 0.11, 1.0, 1.0, 1.0, 0.12, 0.11, 0.0018347068392541843, 0.0012491987102257365, 10206.0, 67.0, 22000.0, 16000.0, 272.0, 206.0, 10183.97337962963, 10200.0, 1089.0690978886755, 0.223, 1.0, 124.0, 174.0, 1.0, 978.0, 282.24, 297.79, 397.22, 3.0526315789473686, 0.16044640818683606, 10.314097312687178, 10.0, 12.0, 0.15944540727902945, 12.46100519930676, 147.0, 165.0, 0.5, 135.875, 1.0, 25.0, 1.0))","Map(vectorType -> dense, length -> 2, values -> List(75.80875851765559, 24.191241482344413))","Map(vectorType -> dense, length -> 2, values -> List(0.758087585176556, 0.24191241482344414))",0.0
2019,1,1,8,2,2019-01-08,MQ,20398,N264NN,3910,11624,1162402,EYW,FL,12,Florida,11298,1129806,DFW,TX,48,Texas,1510,1506,-4.0,0.0,0.0,-1,1500-1559,1729,1722.0,-7.0,0.0,0.0,0.0,196.0,1089.0,5,,,,,EYW,72201012836,24.557,-81.755,24.55610085,-81.75959778,3.0,0.4756356865333155,DFW,72259003927,32.898,-97.019,32.896801,-97.038002,607.0,1.7791117135539851,15,10,15:10,2019-01-08 15:10,2019-01-08T15:10:00.000+0000,EYW,America/New_York,2019-01-08T20:10:00.000+0000,2019-01-08T18:10:00.000+0000,2019-01-08T16:10:00.000+0000,72201012836,2019-01-08T18:00:00.000+0000,24.5571,-81.7554,0.3,FM-12,America/Chicago,15,6,15:06,2019-01-08 15:06,2019-01-08T15:06:00.000+0000,2019-01-08T23:22:00.000+0000,0,0,1,south,0,0,1,0,0,0,0,0,0,0,0,1,2019-01-08,0,25770749439,MQN264NNEYW2019-01-081510,0.0,2019-01-08T16:45:00.000+0000,205.0,yes,0,0.0,0.0,0.0,2019-01-08T19:27:00.000+0000,43.0,no,0,0.0,0.0,EYW,0,0.11,16,725,0.12,2019-1,16,15,48,N,41,22000,16000,10213,272,206,99999,10207,,41,22000.0,16000.0,10213.0,272.0,206.0,,10207.0,0,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10213.0,41.0,22000.0,16000.0,272.0,206.0,10183.97337962963,10207.0,1089.0690978886755,0.0018347068392541,0.0012491987102257,MQN264NNEYWDFW2019-01-082019-01-08 20:10:00,15,20,0.183,EYW-DFW,0.0,0.0,1,124,174,2019-01-08T16:45:00.000+0000,EYW,205.0,no,yes,1,2019-01-08T16:45:00.000+0000,2019-01-08T14:20:00.000+0000,2019-01-08T19:27:00.000+0000,2019-01-08T15:53:00.000+0000,2019-01-08T04:01:00.000+0000,43.0,52.0,619.0,52.0,256.94,353.71,400.57,7.764705882352941,0.160446408186836,10.314097312687178,0,10,12,0.1594454072790294,12.46100519930676,0,147,165,0.5,135.875,1,25,1,"Map(vectorType -> sparse, length -> 85, indices -> List(0, 1, 2, 8, 9, 10, 11, 13, 16, 22, 29, 31, 32, 33, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 73, 74, 75, 76, 78, 79, 80, 81, 82, 83, 84), values -> List(1089.0, 3.0, 607.0, 0.11, 16.0, 725.0, 0.12, 1.0, 1.0, 1.0, 0.11, 0.12, 0.0018347068392541843, 0.0012491987102257365, 10213.0, 41.0, 22000.0, 16000.0, 272.0, 206.0, 10183.97337962963, 10207.0, 1089.0690978886755, 0.183, 1.0, 124.0, 174.0, 1.0, 52.0, 256.94, 353.71, 400.57, 7.764705882352941, 0.16044640818683606, 10.314097312687178, 10.0, 12.0, 0.15944540727902945, 12.46100519930676, 147.0, 165.0, 0.5, 135.875, 1.0, 25.0, 1.0))","Map(vectorType -> dense, length -> 2, values -> List(76.98710414055437, 23.01289585944563))","Map(vectorType -> dense, length -> 2, values -> List(0.7698710414055436, 0.2301289585944563))",0.0
2019,1,1,9,3,2019-01-09,MQ,20398,N243NN,3910,11624,1162402,EYW,FL,12,Florida,11298,1129806,DFW,TX,48,Texas,1510,1511,1.0,1.0,0.0,0,1500-1559,1729,1751.0,22.0,22.0,1.0,0.0,220.0,1089.0,5,1.0,0.0,21.0,0.0,EYW,72201012836,24.557,-81.755,24.55610085,-81.75959778,3.0,0.4756356865333155,DFW,72259003927,32.898,-97.019,32.896801,-97.038002,607.0,1.7791117135539851,15,10,15:10,2019-01-09 15:10,2019-01-09T15:10:00.000+0000,EYW,America/New_York,2019-01-09T20:10:00.000+0000,2019-01-09T18:10:00.000+0000,2019-01-09T16:10:00.000+0000,72201012836,2019-01-09T18:00:00.000+0000,24.5571,-81.7554,0.3,FM-12,America/Chicago,15,11,15:11,2019-01-09 15:11,2019-01-09T15:11:00.000+0000,2019-01-09T23:51:00.000+0000,0,0,1,south,0,0,1,0,0,0,0,0,0,0,0,1,2019-01-09,0,25770750176,MQN243NNEYW2019-01-091510,0.0,2019-01-09T16:45:00.000+0000,205.0,yes,0,0.0,0.0,0.0,2019-01-09T19:19:00.000+0000,51.0,no,0,0.0,0.0,EYW,0,0.08,17,741,0.06,2019-1,16,15,48,N,57,99999,16000,10191,233,178,99999,10186,,57,,16000.0,10191.0,233.0,178.0,,10186.0,0,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10191.0,57.0,15439.117330462865,16000.0,233.0,178.0,10183.97337962963,10186.0,1089.0690978886755,0.0018347068392541,0.0012491987102257,MQN243NNEYWDFW2019-01-092019-01-09 20:10:00,15,20,0.119,EYW-DFW,0.0,0.0,1,124,174,2019-01-09T16:45:00.000+0000,EYW,205.0,no,yes,1,2019-01-09T16:45:00.000+0000,2019-01-09T14:01:00.000+0000,2019-01-09T19:19:00.000+0000,2019-01-09T15:43:00.000+0000,2019-01-09T02:18:00.000+0000,51.0,62.0,703.0,62.0,292.66,247.19,398.8,3.625,0.160446408186836,10.314097312687178,0,10,12,0.1594454072790294,12.46100519930676,0,147,165,0.5,135.875,1,25,1,"Map(vectorType -> sparse, length -> 85, indices -> List(0, 1, 2, 8, 9, 10, 11, 13, 16, 22, 29, 31, 32, 33, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 73, 74, 75, 76, 78, 79, 80, 81, 82, 83, 84), values -> List(1089.0, 3.0, 607.0, 0.08, 17.0, 741.0, 0.06, 1.0, 1.0, 1.0, 0.08, 0.06, 0.0018347068392541843, 0.0012491987102257365, 10191.0, 57.0, 15439.117330462863, 16000.0, 233.0, 178.0, 10183.97337962963, 10186.0, 1089.0690978886755, 0.119, 1.0, 124.0, 174.0, 1.0, 62.0, 292.66, 247.19, 398.8, 3.625, 0.16044640818683606, 10.314097312687178, 10.0, 12.0, 0.15944540727902945, 12.46100519930676, 147.0, 165.0, 0.5, 135.875, 1.0, 25.0, 1.0))","Map(vectorType -> dense, length -> 2, values -> List(78.6430094954053, 21.35699050459472))","Map(vectorType -> dense, length -> 2, values -> List(0.7864300949540528, 0.21356990504594717))",0.0
2019,1,1,10,4,2019-01-10,MQ,20398,N255NN,3910,11624,1162402,EYW,FL,12,Florida,11298,1129806,DFW,TX,48,Texas,1510,1507,-3.0,0.0,0.0,-1,1500-1559,1729,1747.0,18.0,18.0,1.0,0.0,220.0,1089.0,5,0.0,0.0,18.0,0.0,EYW,72201012836,24.557,-81.755,24.55610085,-81.75959778,3.0,0.4756356865333155,DFW,72259003927,32.898,-97.019,32.896801,-97.038002,607.0,1.7791117135539851,15,10,15:10,2019-01-10 15:10,2019-01-10T15:10:00.000+0000,EYW,America/New_York,2019-01-10T20:10:00.000+0000,2019-01-10T18:10:00.000+0000,2019-01-10T16:10:00.000+0000,72201012836,2019-01-10T18:00:00.000+0000,24.5571,-81.7554,0.3,FM-12,America/Chicago,15,7,15:07,2019-01-10 15:07,2019-01-10T15:07:00.000+0000,2019-01-10T23:47:00.000+0000,0,0,1,south,0,0,1,0,0,0,0,0,0,0,0,1,2019-01-10,0,25770750988,MQN255NNEYW2019-01-101510,0.0,2019-01-10T16:45:00.000+0000,205.0,yes,0,0.0,0.0,0.0,2019-01-10T19:27:00.000+0000,43.0,no,0,0.0,0.0,EYW,0,0.14,17,781,0.18,2019-1,16,15,48,N,77,99999,16000,10182,189,100,99999,10176,,77,,16000.0,10182.0,189.0,100.0,,10176.0,0,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10182.0,77.0,15439.117330462865,16000.0,189.0,100.0,10183.97337962963,10176.0,1089.0690978886755,0.0018347068392541,0.0012491987102257,MQN255NNEYWDFW2019-01-102019-01-10 20:10:00,15,20,0.124,EYW-DFW,0.0,1.0,1,124,174,2019-01-10T16:45:00.000+0000,EYW,205.0,no,yes,1,2019-01-10T16:45:00.000+0000,2019-01-10T14:20:00.000+0000,2019-01-10T19:27:00.000+0000,2019-01-10T15:52:00.000+0000,2019-01-10T05:22:00.000+0000,43.0,53.0,538.0,53.0,281.5,684.06,430.32,9.764705882352942,0.160446408186836,10.314097312687178,0,10,12,0.1594454072790294,12.46100519930676,0,147,165,0.5,135.875,1,25,1,"Map(vectorType -> sparse, length -> 85, indices -> List(0, 1, 2, 8, 9, 10, 11, 13, 16, 22, 29, 31, 32, 33, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 73, 74, 75, 76, 78, 79, 80, 81, 82, 83, 84), values -> List(1089.0, 3.0, 607.0, 0.14, 17.0, 781.0, 0.18, 1.0, 1.0, 1.0, 0.14, 0.18, 0.0018347068392541843, 0.0012491987102257365, 10182.0, 77.0, 15439.117330462863, 16000.0, 189.0, 100.0, 10183.97337962963, 10176.0, 1089.0690978886755, 0.124, 1.0, 1.0, 124.0, 174.0, 1.0, 53.0, 281.5, 684.06, 430.32, 9.764705882352942, 0.16044640818683606, 10.314097312687178, 10.0, 12.0, 0.15944540727902945, 12.46100519930676, 147.0, 165.0, 0.5, 135.875, 1.0, 25.0, 1.0))","Map(vectorType -> dense, length -> 2, values -> List(74.25511160686989, 25.744888393130115))","Map(vectorType -> dense, length -> 2, values -> List(0.7425511160686988, 0.25744888393130116))",0.0


In [0]:
feature_for_analysis = ['UniqueID','DEP_DEL15','probability','prediction','features']
RF_test_preds_fold_1 = predictions_fold_1.select(feature_for_analysis).withColumnRenamed('DEP_DEL15', 'label')

In [0]:
desired_neg_to_pos_ratio = 1.6

maxDepth=8
maxBins=64
minInstancesPerNode=10
numTrees=100
seed=12345

CV_name = 'fold_2'
train_data = df_imported.where((col('YEAR').cast('int') == 2016))
val_data = df_imported.where((col('YEAR').cast('int') == 2017))
holdout_data = df_imported.where((col('YEAR').cast('int') == 2019))
save_name = 'xgboost_test_preds_4_9_'+CV_name

new_trainDF = rebalanceDF(train_data, desired_neg_to_pos_ratio)
val = val_data
holdout = holdout_data

# Assemble features
assembler = VectorAssembler().setInputCols(feature_list_full).setOutputCol('features')
assmb_train = assembler.transform(new_trainDF)
assmb_val = assembler.transform(val)
assmb_holdout = assembler.transform(holdout)

# modeling
rf = RandomForestClassifier(featuresCol = 'features', labelCol = 'DEP_DEL15',
                            maxDepth=maxDepth, maxBins=maxBins, minInstancesPerNode=minInstancesPerNode, numTrees=numTrees, seed=seed)
rfModel = rf.fit(assmb_train)
val_fold_2 = rfModel.transform(assmb_val)
predictions_fold_2 = rfModel.transform(assmb_holdout)

In [0]:
RF_test_preds_fold_2 = predictions_fold_2.select(feature_for_analysis).withColumnRenamed('DEP_DEL15', 'label')

In [0]:
desired_neg_to_pos_ratio = 1.6

maxDepth=8
maxBins=64
minInstancesPerNode=10
numTrees=100
seed=12345

CV_name = 'fold_3'
train_data = df_imported.where((col('YEAR').cast('int') == 2017))
val_data = df_imported.where((col('YEAR').cast('int') == 2018))
holdout_data = df_imported.where((col('YEAR').cast('int') == 2019))
save_name = 'xgboost_test_preds_4_9_'+CV_name

new_trainDF = rebalanceDF(train_data, desired_neg_to_pos_ratio)
val = val_data
holdout = holdout_data

# Assemble features
assembler = VectorAssembler().setInputCols(feature_list_full).setOutputCol('features')
assmb_train = assembler.transform(new_trainDF)
assmb_val = assembler.transform(val)
assmb_holdout = assembler.transform(holdout)

# modeling
rf = RandomForestClassifier(featuresCol = 'features', labelCol = 'DEP_DEL15',
                            maxDepth=maxDepth, maxBins=maxBins, minInstancesPerNode=minInstancesPerNode, numTrees=numTrees, seed=seed)
rfModel = rf.fit(assmb_train)
val_fold_3 = rfModel.transform(assmb_val)
predictions_fold_3 = rfModel.transform(assmb_holdout)

In [0]:
RF_test_preds_fold_3 = predictions_fold_3.select(feature_for_analysis).withColumnRenamed('DEP_DEL15', 'label')

In [0]:
desired_neg_to_pos_ratio = 1.6

maxDepth=8
maxBins=64
minInstancesPerNode=10
numTrees=100
seed=12345

CV_name = 'fold_4'
train_data = df_imported.where(((col('YEAR').cast('int') == 2017) & (col('MONTH').cast('int') > 6)) | ((col('YEAR').cast('int') == 2018) & (col('MONTH').cast('int') < 7)))
val_data = df_imported.where(((col('YEAR').cast('int') == 2018) & (col('MONTH').cast('int') > 6)))
holdout_data = df_imported.where((col('YEAR').cast('int') == 2019))
save_name = 'xgboost_test_preds_4_9_'+CV_name

new_trainDF = rebalanceDF(train_data, desired_neg_to_pos_ratio)
val = val_data
holdout = holdout_data

# Assemble features
assembler = VectorAssembler().setInputCols(feature_list_full).setOutputCol('features')
assmb_train = assembler.transform(new_trainDF)
assmb_val = assembler.transform(val)
assmb_holdout = assembler.transform(holdout)

# modeling
rf = RandomForestClassifier(featuresCol = 'features', labelCol = 'DEP_DEL15',
                            maxDepth=maxDepth, maxBins=maxBins, minInstancesPerNode=minInstancesPerNode, numTrees=numTrees, seed=seed)
rfModel = rf.fit(assmb_train)
val_fold_4 = rfModel.transform(assmb_val)
predictions_fold_4 = rfModel.transform(assmb_holdout)

In [0]:
RF_test_preds_fold_4 = predictions_fold_4.select(feature_for_analysis).withColumnRenamed('DEP_DEL15', 'label')

In [0]:
desired_neg_to_pos_ratio = 1.6

maxDepth=8
maxBins=64
minInstancesPerNode=10
numTrees=100
seed=12345

CV_name = 'fold_5'
train_data = df_imported.where(((col('YEAR').cast('int') == 2016) & (col('MONTH').cast('int') > 6)) | ((col('YEAR').cast('int') == 2017) & (col('MONTH').cast('int') < 7)))
val_data = df_imported.where(((col('YEAR').cast('int') == 2017) & (col('MONTH').cast('int') > 6)) | ((col('YEAR').cast('int') == 2018) & (col('MONTH').cast('int') < 7)))
holdout_data = df_imported.where((col('YEAR').cast('int') == 2019))
save_name = 'xgboost_test_preds_4_9_'+CV_name

new_trainDF = rebalanceDF(train_data, desired_neg_to_pos_ratio)
val = val_data
holdout = holdout_data

# Assemble features
assembler = VectorAssembler().setInputCols(feature_list_full).setOutputCol('features')
assmb_train = assembler.transform(new_trainDF)
assmb_val = assembler.transform(val)
assmb_holdout = assembler.transform(holdout)

# modeling
rf = RandomForestClassifier(featuresCol = 'features', labelCol = 'DEP_DEL15',
                            maxDepth=maxDepth, maxBins=maxBins, minInstancesPerNode=minInstancesPerNode, numTrees=numTrees, seed=seed)
rfModel = rf.fit(assmb_train)
val_fold_5 = rfModel.transform(assmb_val)
predictions_fold_5 = rfModel.transform(assmb_holdout)

In [0]:
RF_test_preds_fold_5 = predictions_fold_5.select(feature_for_analysis).withColumnRenamed('DEP_DEL15', 'label')

In [0]:
feature_for_analysis = ['UniqueID','DEP_DEL15','probability','prediction','features']
RF_val_preds_fold_1 = val_fold_1.select(feature_for_analysis).withColumnRenamed('DEP_DEL15', 'label')
RF_val_preds_fold_2 = val_fold_2.select(feature_for_analysis).withColumnRenamed('DEP_DEL15', 'label')
RF_val_preds_fold_3 = val_fold_3.select(feature_for_analysis).withColumnRenamed('DEP_DEL15', 'label')
RF_val_preds_fold_4 = val_fold_4.select(feature_for_analysis).withColumnRenamed('DEP_DEL15', 'label')
RF_val_preds_fold_5 = val_fold_5.select(feature_for_analysis).withColumnRenamed('DEP_DEL15', 'label')

RF_test_preds_fold_1 = predictions_fold_1.select(feature_for_analysis).withColumnRenamed('DEP_DEL15', 'label')
RF_test_preds_fold_2 = predictions_fold_2.select(feature_for_analysis).withColumnRenamed('DEP_DEL15', 'label')
RF_test_preds_fold_3 = predictions_fold_3.select(feature_for_analysis).withColumnRenamed('DEP_DEL15', 'label')
RF_test_preds_fold_4 = predictions_fold_4.select(feature_for_analysis).withColumnRenamed('DEP_DEL15', 'label')
RF_test_preds_fold_5 = predictions_fold_5.select(feature_for_analysis).withColumnRenamed('DEP_DEL15', 'label')

In [0]:
RF_val_preds_fold_1.write.mode("overwrite").parquet(f"{blob_url}/RF_val_pred_table_cvgroup1_0410_xgbm")
RF_val_preds_fold_2.write.mode("overwrite").parquet(f"{blob_url}/RF_val_pred_table_cvgroup2_0410_xgbm")
RF_val_preds_fold_3.write.mode("overwrite").parquet(f"{blob_url}/RF_val_pred_table_cvgroup3_0410_xgbm")
RF_val_preds_fold_4.write.mode("overwrite").parquet(f"{blob_url}/RF_val_pred_table_cvgroup4_0410_xgbm")
RF_val_preds_fold_5.write.mode("overwrite").parquet(f"{blob_url}/RF_val_pred_table_cvgroup5_0410_xgbm")

RF_test_preds_fold_1.write.mode("overwrite").parquet(f"{blob_url}/RF_pred_table_cvgroup1_0410_xgbm")
RF_test_preds_fold_2.write.mode("overwrite").parquet(f"{blob_url}/RF_pred_table_cvgroup2_0410_xgbm")
RF_test_preds_fold_3.write.mode("overwrite").parquet(f"{blob_url}/RF_pred_table_cvgroup3_0410_xgbm")
RF_test_preds_fold_4.write.mode("overwrite").parquet(f"{blob_url}/RF_pred_table_cvgroup4_0410_xgbm")
RF_test_preds_fold_5.write.mode("overwrite").parquet(f"{blob_url}/RF_pred_table_cvgroup5_0410_xgbm")