**NOTE:** Many of these cells rely on earlier ones. As such, you should only execute a cell once all previous ones have finished running.

In [18]:
import datetime, warnings, scipy 
import pandas as pd
import numpy as np
import math
import seaborn as sns
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
%matplotlib inline
warnings.filterwarnings("ignore")

In [19]:
data = pd.read_csv("flights.csv", low_memory=False)
print(data.shape)

(5819079, 31)


In [20]:
# Note: Aaron changed the entry for key = 4 from 60 to 90.
monthToDaysDict = {1: 0, 2: 31, 3: 59, 4: 90, 5: 120, 6: 151, 7: 181, 8: 212, 
                   9: 243, 10: 273, 11: 304, 12: 334}
def monthToDays(month):
    return monthToDaysDict[month]

# Remove the following features: 
# YEAR, FLIGHT_NUMBER, TAIL_NUMBER, DEPARTURE_TIME, TAXI_OUT, 
# WHEELS_OFF, ELAPSED_TIME, AIR_TIME, WHEELS_ON, TAXI_IN, and ARRIVAL_TIME
data = data.drop(['YEAR', 'FLIGHT_NUMBER', 'TAIL_NUMBER', 'DEPARTURE_TIME', 
                  'TAXI_OUT', 'WHEELS_OFF', 'ELAPSED_TIME', 'AIR_TIME', 'WHEELS_ON', 
                  'TAXI_IN', 'ARRIVAL_TIME'], 1)

# Convert MONTH -> # of days before a month
data['MONTH'] = data['MONTH'].apply(monthToDays)

# Convert MONTH + DAYS -> DAY_OF_YEAR
data['DAY'] = data['DAY'] + data['MONTH']
data.rename(columns={'DAY': 'DAY_OF_YEAR'}, inplace=True)
data = data.drop("MONTH", 1)

In [21]:
data.head()

Unnamed: 0,DAY_OF_YEAR,DAY_OF_WEEK,AIRLINE,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,DEPARTURE_DELAY,SCHEDULED_TIME,DISTANCE,SCHEDULED_ARRIVAL,ARRIVAL_DELAY,DIVERTED,CANCELLED,CANCELLATION_REASON,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY
0,1,4,AS,ANC,SEA,5,-11.0,205.0,1448,430,-22.0,0,0,,,,,,
1,1,4,AA,LAX,PBI,10,-8.0,280.0,2330,750,-9.0,0,0,,,,,,
2,1,4,US,SFO,CLT,20,-2.0,286.0,2296,806,5.0,0,0,,,,,,
3,1,4,AA,LAX,MIA,20,-5.0,285.0,2342,805,-9.0,0,0,,,,,,
4,1,4,AS,SEA,ANC,25,-1.0,235.0,1448,320,-21.0,0,0,,,,,,


**The code below cleans the AIR_SYSTEM_DELAY, SECURITY_DELAY, AIRLINE_DELAY, LATE_AIRCRAFT_DELAY, and WEATHER_DELAY features.**

In [22]:
def notANumberToZero(delay):
    if (math.isnan(delay)):
        return 0
    else:
        return delay
    
data['AIR_SYSTEM_DELAY'] = data['AIR_SYSTEM_DELAY'].apply(notANumberToZero)
data['SECURITY_DELAY'] = data['SECURITY_DELAY'].apply(notANumberToZero)
data['AIRLINE_DELAY'] = data['AIRLINE_DELAY'].apply(notANumberToZero)
data['LATE_AIRCRAFT_DELAY'] = data['LATE_AIRCRAFT_DELAY'].apply(notANumberToZero)
data['WEATHER_DELAY'] = data['WEATHER_DELAY'].apply(notANumberToZero)

print("Transforming delay columns complete!")

Transforming delay columns complete!


In [23]:
data.iloc[50000:50010]

Unnamed: 0,DAY_OF_YEAR,DAY_OF_WEEK,AIRLINE,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,DEPARTURE_DELAY,SCHEDULED_TIME,DISTANCE,SCHEDULED_ARRIVAL,ARRIVAL_DELAY,DIVERTED,CANCELLED,CANCELLATION_REASON,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY
50000,4,7,DL,JAX,ATL,915,23.0,75.0,270,1030,26.0,0,0,,26.0,0.0,0.0,0.0,0.0
50001,4,7,DL,MSP,SNA,915,109.0,246.0,1522,1121,83.0,0,0,,0.0,0.0,83.0,0.0,0.0
50002,4,7,DL,MSP,PDX,915,0.0,242.0,1426,1117,-6.0,0,0,,0.0,0.0,0.0,0.0,0.0
50003,4,7,DL,ATL,SRQ,915,7.0,93.0,444,1048,8.0,0,0,,0.0,0.0,0.0,0.0,0.0
50004,4,7,DL,MSP,SFO,915,-4.0,265.0,1589,1140,-24.0,0,0,,0.0,0.0,0.0,0.0,0.0
50005,4,7,DL,DEN,ATL,915,0.0,178.0,1199,1413,-2.0,0,0,,0.0,0.0,0.0,0.0,0.0
50006,4,7,EV,ATL,ECP,915,10.0,68.0,240,923,5.0,0,0,,0.0,0.0,0.0,0.0,0.0
50007,4,7,EV,MOB,ATL,915,-1.0,69.0,302,1124,-3.0,0,0,,0.0,0.0,0.0,0.0,0.0
50008,4,7,HA,HNL,LIH,915,-4.0,38.0,102,953,3.0,0,0,,0.0,0.0,0.0,0.0,0.0
50009,4,7,MQ,DFW,PIA,915,-3.0,108.0,672,1103,11.0,0,0,,0.0,0.0,0.0,0.0,0.0


**The following code does 3 things to clean the data:**

1) Replace the NaN values in CANCELLATION_REASON with empty strings

2) Convert SCHEDULED_DEPARTURE to a numeric quantity ranging from 0 to 1440 representing the minutes in a day

3) Do the same for SCHEDULED_ARRIVAL

The reason we have to do the last two things is because SCHEDULED_DEPARTURE and SCHEDULED_ARRIVAL are floats which *actually* represent HHMM time values. We do not need to do the same for SCHEDULED_TIME as it represents estimated travel time in minutes.

In [24]:
def hhmmFloatToMinutes(time):
    timeString = "{:04d}".format(time)
    hours = float(timeString[0:2])
    minutes = float(timeString[2:4])
    totalMinutes = (hours * 60) + minutes
    return totalMinutes

**NOTE:** The cell below will take roughly 30 seconds to a minute to finish.

In [25]:
data['CANCELLATION_REASON'] = data['CANCELLATION_REASON'].fillna('')
data['SCHEDULED_DEPARTURE'] = data['SCHEDULED_DEPARTURE'].apply(hhmmFloatToMinutes)
data['SCHEDULED_ARRIVAL'] = data['SCHEDULED_ARRIVAL'].apply(hhmmFloatToMinutes)

missing_values will show how many NaN/None values are remaining in the dataset.

In [26]:
missing_values = data.isnull().sum(axis=0)
missing_values

DAY_OF_YEAR                 0
DAY_OF_WEEK                 0
AIRLINE                     0
ORIGIN_AIRPORT              0
DESTINATION_AIRPORT         0
SCHEDULED_DEPARTURE         0
DEPARTURE_DELAY         86153
SCHEDULED_TIME              6
DISTANCE                    0
SCHEDULED_ARRIVAL           0
ARRIVAL_DELAY          105071
DIVERTED                    0
CANCELLED                   0
CANCELLATION_REASON         0
AIR_SYSTEM_DELAY            0
SECURITY_DELAY              0
AIRLINE_DELAY               0
LATE_AIRCRAFT_DELAY         0
WEATHER_DELAY               0
dtype: int64

**There's an issue with the ORIGIN_AIRPORT and DESTINATION_AIRPORT features for October 2015. The problem is described in [this](https://www.kaggle.com/usdot/flight-delays/discussion/29600#latest-168452) post on Kaggle. The approach to fix it is drawn from [this](https://www.kaggle.com/srcole/fix-inconsistent-airport-codes) Kaggle kernel.** 

The datasets we're using to solve the conflict between the 5-digit airport codes and the 3-letter airport codes can be found in the comments of the kernel linked above.

In [27]:
df_threeLetterCode = pd.read_csv('L_AIRPORT.csv')
df_threeLetterCode.shape

(6429, 2)

After much trial and error, I realized one issue was due to the AUS and YUM codes being replicated as BSM and NYL, respectively. We drop BSM and NYL as the flights.csv file doesn't use these codes.

In [28]:
codesToDrop = ['BSM', 'NYL']
df_threeLetterCode = df_threeLetterCode[~df_threeLetterCode['Code'].isin(codesToDrop)]
threeLetterCodes = list(df_threeLetterCode['Code'])

In [29]:
df_fiveLetterCode = pd.read_csv('L_AIRPORT_ID.csv')
df_fiveLetterCode.shape

(6414, 2)

We set different indices for these two tables to avoid creating a new dictionary.

In [30]:
df3 = df_threeLetterCode.set_index('Description')
df5 = df_fiveLetterCode.set_index('Code')

This DataFrame can be used to double-check the results of fixing the issue at hand. 

We can do so by running the apply method with testFixAirports, outputting countTest, and then comparing it to the shape of df_October.

In [31]:
df_airports = data[['ORIGIN_AIRPORT', 'DESTINATION_AIRPORT']]
df_October = df_airports.loc[~df_airports['ORIGIN_AIRPORT'].isin(threeLetterCodes) |
                            ~df_airports['DESTINATION_AIRPORT'].isin(threeLetterCodes)]
df_October.shape

(486165, 2)

The testFixAirports function simply counts the number of 5-digit airport codes when passed to the apply method. There should be ~500k.

In [32]:
countTest = 0
nFlightsTest = 0
def testFixAirports(airport):
    global countTest
    global nFlightsTest
    nFlightsTest += 1
    if len(airport) != 3:
        countTest += 1
    if nFlightsTest % 1000000 == 0:
        print(nFlightsTest)

The fixOctoberAirports function will create a DataFrame column where the 5-digit airport codes have been replaced by 3-letter airport codes (when passed to apply).

In [33]:
nFlights = 0
def fixOctoberAirports(airport):
    global nFlights
    nFlights += 1
    
    if nFlights % 1000000 == 0:
        print(nFlights)
    
    if len(airport) != 3:
        index = int(airport)
        descriptionAsKey = df5.loc[index]['Description']
        newCode = df3.loc[descriptionAsKey]['Code']
        return newCode
    else:
        return airport

**NOTE:** The two cells below will take several minutes each to run. Specifically, after '4000000' is printed in the output, it will take several minutes to see '5000000'.

In [34]:
fixed_origin_airport = data['ORIGIN_AIRPORT'].apply(fixOctoberAirports)

1000000
2000000
3000000
4000000
5000000


In [35]:
nFlights = 0
fixed_dest_airport = data['DESTINATION_AIRPORT'].apply(fixOctoberAirports)

1000000
2000000
3000000
4000000
5000000


In [36]:
data['ORIGIN_AIRPORT'] = fixed_origin_airport
data['DESTINATION_AIRPORT'] = fixed_dest_airport

countTest should be 0 for both features now if everything worked correctly.

In [37]:
countTest = 0
nFlightsTest = 0
data['ORIGIN_AIRPORT'].apply(testFixAirports)
print()
print("countTest:", countTest)

1000000
2000000
3000000
4000000
5000000

countTest: 0


In [38]:
countTest = 0
nFlightsTest = 0
data['DESTINATION_AIRPORT'].apply(testFixAirports)
print()
print("countTest:", countTest)

1000000
2000000
3000000
4000000
5000000

countTest: 0


In [39]:
# Changed to specify delay
def delayTimeToDummy(delay, threshold):
    if delay >= threshold:
        return 1
    else:
        return 0

# cleaningOption specifies how to handle NaN/None values.
#     * "drop" = simply drop the NaN/None values
#     * "mean" = use mean imputation to replace NaN/None values
#     * "median" = use median imputation to replace NaN/None values
#
# sampleFraction specifies how much of the dataset to sample.
# sample_n specifies how many samples to return
# Note: ONLY ONE of the two parameters above should be used.
#
# dropCols specifies the columns to drop. It drops the 5 delay columns by default.
class DataSet:
    def __init__(self, cleaningOption="drop", sampleFraction=0.1, sample_n=0, 
                 dropCols=['AIR_SYSTEM_DELAY', 'SECURITY_DELAY', 'AIRLINE_DELAY', 
                       'LATE_AIRCRAFT_DELAY', 'WEATHER_DELAY']):
        # Initialize myData instance variable
        self.myData = None
        # Take care of NaN/None values
        if cleaningOption == "drop":
            self.myData = data.dropna()
        elif cleaningOption == "mean":
            self.myData = data.copy()
            self.myData['DEPARTURE_DELAY'].fillna(data['DEPARTURE_DELAY'].mean(), inplace=True)
            self.myData['SCHEDULED_TIME'].fillna(data['SCHEDULED_TIME'].mean(), inplace=True)
            self.myData['ARRIVAL_DELAY'].fillna(data['ARRIVAL_DELAY'].mean(), inplace=True)
        elif cleaningOption == "median":
            self.myData = data.copy()
            self.myData['DEPARTURE_DELAY'].fillna(data['DEPARTURE_DELAY'].median(), inplace=True)
            self.myData['SCHEDULED_TIME'].fillna(data['SCHEDULED_TIME'].median(), inplace=True)
            self.myData['ARRIVAL_DELAY'].fillna(data['ARRIVAL_DELAY'].median(), inplace=True)
            
        # Sample data
        self.sampleData(sampleFraction, sample_n)
        
        # Drop columns
        self.dropColumns(dropCols)
        
        # Create a binary target named ARRIVAL_DELAY_BIN: 1 if ARRIVAL_DELAY > 30; 0 otherwise
        self.myData['ARRIVAL_DELAY_BIN_30'] = self.createBinaryTarget(30)
        
        print(self.myData.shape)
        
    def sampleData(self, sampleFraction, sample_n):
        if sample_n > 0:
            self.myData = self.myData.sample(n=sample_n, replace=False)
        else:
            self.myData = self.myData.sample(frac=sampleFraction, replace=False)
        self.myData = self.myData.reset_index()
        self.myData = self.myData.drop(['index'], axis=1)

    def dropColumns(self, cols):
        colsToDrop = cols
        self.myData.drop(colsToDrop, axis=1, inplace=True)
        
    def createBinaryTarget(self, threshold):
        binaryTarget = self.myData['ARRIVAL_DELAY'].apply(delayTimeToDummy, threshold = threshold)
        return binaryTarget

In [96]:
def get_woe(column, dataframe):
    print(column)
    column_to_woe = dataframe[column].copy()
    total_count = dataframe[column].value_counts()
    tot_loop = total_count.shape[0]
    p = np.float(dataframe[(dataframe.ARRIVAL_DELAY_BIN_30) == 1].shape[0])
    n = dataframe.shape[0]-p
    i = 0
    for item in total_count.iteritems():
        i += 1
        # Edited for Mac and Python 3
        # print('{0} of {1}\n'.format(i, tot_loop))
        pi = dataframe[(dataframe[column] == item[0]) & (dataframe.ARRIVAL_DELAY_BIN_30)].shape[0]
        ni = np.float(item[1]-pi)
        if pi == 0 or ni == 0:
            WOE = float('nan')
        else:
            WOE = np.log((pi/p)/(ni/n))
            
        column_to_woe[column_to_woe == item[0]] = WOE
    return column_to_woe.apply(pd.to_numeric)

### Dataset 1

In [41]:
# The categorical (or nominal) variables in our dataset are: DAY_OF_WEEK, 
# AIRLINE, ORIGIN_AIRPORT, DESTINATION_AIRPORT, CANCELLATION_REASON, and DAY_OF_YEAR.
dummy_columns = ['DAY_OF_YEAR', 'DAY_OF_WEEK', 'AIRLINE', 'ORIGIN_AIRPORT', 
                 'DESTINATION_AIRPORT']

ds1 = DataSet(cleaningOption='drop', sample_n=25000, dropCols=['AIR_SYSTEM_DELAY', 'SECURITY_DELAY', 'AIRLINE_DELAY', 
                       'LATE_AIRCRAFT_DELAY', 'WEATHER_DELAY', 'CANCELLATION_REASON'])
data1 = ds1.myData
data1.head()

(25000, 14)


Unnamed: 0,DAY_OF_YEAR,DAY_OF_WEEK,AIRLINE,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,DEPARTURE_DELAY,SCHEDULED_TIME,DISTANCE,SCHEDULED_ARRIVAL,ARRIVAL_DELAY,DIVERTED,CANCELLED,ARRIVAL_DELAY_BIN_30
0,117,1,EV,ICT,ORD,1106.0,-9.0,113.0,588,1219.0,-13.0,0,0,0
1,111,2,WN,LAS,SFO,640.0,43.0,100.0,414,740.0,28.0,0,0,0
2,356,2,WN,BWI,ORF,975.0,17.0,55.0,159,1030.0,6.0,0,0,0
3,251,2,UA,IAH,LAX,1137.0,2.0,210.0,1379,1227.0,6.0,0,0,0
4,161,3,OO,MBS,ORD,766.0,-10.0,79.0,222,785.0,-18.0,0,0,0


In [44]:
# data1_missing_values = data1.isnull().sum(axis=0)
# data1_missing_values

In [45]:
data1 = pd.get_dummies(data1, columns=dummy_columns, drop_first=True)
data1.shape

(25000, 1003)

In [46]:
X1 = data1.drop(['DEPARTURE_DELAY', 'ARRIVAL_DELAY', 'DIVERTED', 'CANCELLED', 'ARRIVAL_DELAY_BIN_30'], axis=1)
y1 = data1['ARRIVAL_DELAY_BIN_30']
print(X1.shape, y1.shape)

(25000, 998) (25000,)


In [48]:
X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(X1, y1, test_size=0.33, random_state=42)
print(X_train_1.shape, X_test_1.shape, y_train_1.shape, y_test_1.shape)

(16750, 998) (8250, 998) (16750,) (8250,)


In [49]:
X_train_1.to_csv('datasets/1_X_train.csv', index=False)
X_test_1.to_csv('datasets/1_X_test.csv', index=False)
y_train_1.to_csv('datasets/1_y_train.csv', index=False)
y_test_1.to_csv('datasets/1_y_test.csv', index=False)

### Dataset 2

In [50]:
ds2 = DataSet(cleaningOption='mean', sample_n=25000, dropCols=['AIR_SYSTEM_DELAY', 'SECURITY_DELAY', 'AIRLINE_DELAY', 
                       'LATE_AIRCRAFT_DELAY', 'WEATHER_DELAY', 'CANCELLATION_REASON'])

(25000, 14)


In [51]:
data2 = ds2.myData
data2_missing_values = data2.isnull().sum(axis=0)
data2_missing_values

DAY_OF_YEAR             0
DAY_OF_WEEK             0
AIRLINE                 0
ORIGIN_AIRPORT          0
DESTINATION_AIRPORT     0
SCHEDULED_DEPARTURE     0
DEPARTURE_DELAY         0
SCHEDULED_TIME          0
DISTANCE                0
SCHEDULED_ARRIVAL       0
ARRIVAL_DELAY           0
DIVERTED                0
CANCELLED               0
ARRIVAL_DELAY_BIN_30    0
dtype: int64

In [52]:
data2 = pd.get_dummies(data2, columns=dummy_columns, drop_first=True)
data2.shape

(25000, 1003)

In [53]:
X2 = data2.drop(['DEPARTURE_DELAY', 'ARRIVAL_DELAY', 'DIVERTED', 'CANCELLED', 'ARRIVAL_DELAY_BIN_30'], axis=1)
y2 = data2['ARRIVAL_DELAY_BIN_30']
print(X2.shape, y2.shape)

(25000, 998) (25000,)


In [54]:
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(X2, y2, test_size=0.33, random_state=42)
print(X_train_2.shape, X_test_2.shape, y_train_2.shape, y_test_2.shape)

(16750, 998) (8250, 998) (16750,) (8250,)


In [57]:
# X_train_2['DISTANCE'].describe()

In [58]:
# X_train_1['DISTANCE'].describe()

In [59]:
X_train_2.to_csv('datasets/2_X_train.csv', index=False)
X_test_2.to_csv('datasets/2_X_test.csv', index=False)
y_train_2.to_csv('datasets/2_y_train.csv', index=False)
y_test_2.to_csv('datasets/2_y_test.csv', index=False)

In [60]:
test_read_X2_train = pd.read_csv('datasets/2_X_train.csv', low_memory=False)
test_read_X2_train.shape

(16750, 998)

### Dataset 3

In [69]:
ds3 = DataSet(cleaningOption='median', sample_n=25000, 
              dropCols=['AIR_SYSTEM_DELAY', 'SECURITY_DELAY', 'AIRLINE_DELAY', 
                       'LATE_AIRCRAFT_DELAY', 'WEATHER_DELAY', 'CANCELLATION_REASON'])

(25000, 14)


In [70]:
data3 = ds3.myData
data3_missing_values = data3.isnull().sum(axis=0)
data3_missing_values

DAY_OF_YEAR             0
DAY_OF_WEEK             0
AIRLINE                 0
ORIGIN_AIRPORT          0
DESTINATION_AIRPORT     0
SCHEDULED_DEPARTURE     0
DEPARTURE_DELAY         0
SCHEDULED_TIME          0
DISTANCE                0
SCHEDULED_ARRIVAL       0
ARRIVAL_DELAY           0
DIVERTED                0
CANCELLED               0
ARRIVAL_DELAY_BIN_30    0
dtype: int64

In [71]:
data3 = pd.get_dummies(data3, columns=dummy_columns, drop_first=True)

In [66]:
# data3['DISTANCE'].describe()

In [67]:
# data2['DISTANCE'].describe()

In [72]:
print(data1.shape, data2.shape, data3.shape)

(25000, 1003) (25000, 1003) (25000, 1004)


In [73]:
X3 = data3.drop(['DEPARTURE_DELAY', 'ARRIVAL_DELAY', 'DIVERTED', 'CANCELLED', 'ARRIVAL_DELAY_BIN_30'], axis=1)
y3 = data3['ARRIVAL_DELAY_BIN_30']
print(X3.shape, y3.shape)

(25000, 999) (25000,)


In [74]:
X_train_3, X_test_3, y_train_3, y_test_3 = train_test_split(X3, y3, test_size=0.33, random_state=42)
print(X_train_3.shape, X_test_3.shape, y_train_3.shape, y_test_3.shape)

(16750, 999) (8250, 999) (16750,) (8250,)


In [75]:
X_train_3.to_csv('datasets/3_X_train.csv', index=False)
X_test_3.to_csv('datasets/3_X_test.csv', index=False)
y_train_3.to_csv('datasets/3_y_train.csv', index=False)
y_test_3.to_csv('datasets/3_y_test.csv', index=False)

In [76]:
test_read_X3_train = pd.read_csv('datasets/3_X_train.csv', low_memory=False)
test_read_X3_train.shape

(16750, 999)

---

#### Here, I'm testing the cause of feature differences among first three datasets.

In [77]:
print("X2:", X2.shape)
print("X3:", X3.shape)

X2: (25000, 998)
X3: (25000, 999)


In [78]:
x2_features = list(X2.columns)

In [79]:
x3_features = list(X3.columns)

In [80]:
diff1 = list(set(x2_features) - set(x3_features))

In [81]:
sorted(diff1)

['DESTINATION_AIRPORT_AKN',
 'DESTINATION_AIRPORT_GUC',
 'DESTINATION_AIRPORT_GUM',
 'DESTINATION_AIRPORT_HOB',
 'DESTINATION_AIRPORT_ILG',
 'DESTINATION_AIRPORT_MMH',
 'DESTINATION_AIRPORT_MVY',
 'DESTINATION_AIRPORT_PBG',
 'DESTINATION_AIRPORT_PIH',
 'ORIGIN_AIRPORT_APN',
 'ORIGIN_AIRPORT_BGM',
 'ORIGIN_AIRPORT_CIU',
 'ORIGIN_AIRPORT_COD',
 'ORIGIN_AIRPORT_IAG',
 'ORIGIN_AIRPORT_MQT',
 'ORIGIN_AIRPORT_STC',
 'ORIGIN_AIRPORT_TWF']

In [82]:
len(diff1)

17

In [83]:
diff2 = list(set(x3_features) - set(x2_features))

In [84]:
sorted(diff2)

['DESTINATION_AIRPORT_ADK',
 'DESTINATION_AIRPORT_ADQ',
 'DESTINATION_AIRPORT_DLG',
 'DESTINATION_AIRPORT_GRI',
 'DESTINATION_AIRPORT_HYA',
 'DESTINATION_AIRPORT_LAR',
 'ORIGIN_AIRPORT_BTM',
 'ORIGIN_AIRPORT_DIK',
 'ORIGIN_AIRPORT_DVL',
 'ORIGIN_AIRPORT_GCK',
 'ORIGIN_AIRPORT_GST',
 'ORIGIN_AIRPORT_HOB',
 'ORIGIN_AIRPORT_HYA',
 'ORIGIN_AIRPORT_OME',
 'ORIGIN_AIRPORT_PBG',
 'ORIGIN_AIRPORT_UST',
 'ORIGIN_AIRPORT_VEL',
 'ORIGIN_AIRPORT_WYS']

In [85]:
len(diff2)

18

If there were no airports left out, there would be 364 + 6 + 13 + 321 + 321 + (1 $*$ 9) = 1034 features in the sample. Some testing shows we need roughly 5% of the total dataset to guarantee inclusion of all airports, which would result in ~250,000,000 (250,000 $*$ 1000) data points. This is too prohibitive a quantity for testing. Furthermore, I do not believe we gain much value from making sure *every* airport is included.

---

### Dataset 4

In [86]:
woe_columns = ['DAY_OF_YEAR_WOE', 'DAY_OF_WEEK_WOE', 'AIRLINE_WOE', 'ORIGIN_WOE', 'DESTINATION_WOE']
cols_to_drop = ['DAY_OF_YEAR', 'DAY_OF_WEEK', 'AIRLINE', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT']

In [87]:
ds4 = DataSet(cleaningOption='drop', sampleFraction=0.1, 
              dropCols=['AIR_SYSTEM_DELAY', 'SECURITY_DELAY', 'AIRLINE_DELAY', 
                        'LATE_AIRCRAFT_DELAY', 'WEATHER_DELAY', 'CANCELLATION_REASON'])
data4 = ds4.myData

(571401, 14)


In [88]:
data4 = data4.drop(['DEPARTURE_DELAY', 'ARRIVAL_DELAY', 'DIVERTED', 'CANCELLED'], axis=1)
data4.dtypes

DAY_OF_YEAR               int64
DAY_OF_WEEK               int64
AIRLINE                  object
ORIGIN_AIRPORT           object
DESTINATION_AIRPORT      object
SCHEDULED_DEPARTURE     float64
SCHEDULED_TIME          float64
DISTANCE                  int64
SCHEDULED_ARRIVAL       float64
ARRIVAL_DELAY_BIN_30      int64
dtype: object

In [90]:
# data4['ARRIVAL_DELAY_BIN_30'].value_counts()

In [91]:
X4 = data4.drop(['ARRIVAL_DELAY_BIN_30'], axis=1)
y4 = data4['ARRIVAL_DELAY_BIN_30']

In [92]:
X_train_4, X_test_4, y_train_4, y_test_4 = train_test_split(X4, y4, test_size=0.33, random_state=42)
print(X_train_4.shape, X_test_4.shape, y_train_4.shape, y_test_4.shape)

(382838, 9) (188563, 9) (382838,) (188563,)


In [93]:
train_data_4 = pd.concat([X_train_4, y_train_4], axis=1)
test_data_4 = pd.concat([X_test_4, y_test_4], axis=1)

In [94]:
print(train_data_4.shape, test_data_4.shape)

(382838, 10) (188563, 10)


In [97]:
# Transform training set
day_of_year_woe = get_woe('DAY_OF_YEAR', train_data_4)
train_data_4['DAY_OF_YEAR_WOE'] = day_of_year_woe
day_of_week_woe = get_woe('DAY_OF_WEEK', train_data_4)
train_data_4['DAY_OF_WEEK_WOE'] = day_of_week_woe
airline_woe = get_woe('AIRLINE', train_data_4)
train_data_4['AIRLINE_WOE'] = airline_woe
origin_woe = get_woe('ORIGIN_AIRPORT', train_data_4)
train_data_4['ORIGIN_WOE'] = origin_woe
dest_woe = get_woe('DESTINATION_AIRPORT', train_data_4)
train_data_4['DESTINATION_WOE'] = dest_woe

print()

# Transform testing set
day_of_year_woe = get_woe('DAY_OF_YEAR', test_data_4)
test_data_4['DAY_OF_YEAR_WOE'] = day_of_year_woe
day_of_week_woe = get_woe('DAY_OF_WEEK', test_data_4)
test_data_4['DAY_OF_WEEK_WOE'] = day_of_week_woe
airline_woe = get_woe('AIRLINE', test_data_4)
test_data_4['AIRLINE_WOE'] = airline_woe
origin_woe = get_woe('ORIGIN_AIRPORT', test_data_4)
test_data_4['ORIGIN_WOE'] = origin_woe
dest_woe = get_woe('DESTINATION_AIRPORT', test_data_4)
test_data_4['DESTINATION_WOE'] = dest_woe

DAY_OF_YEAR
DAY_OF_WEEK
AIRLINE
ORIGIN_AIRPORT
DESTINATION_AIRPORT

DAY_OF_YEAR
DAY_OF_WEEK
AIRLINE
ORIGIN_AIRPORT
DESTINATION_AIRPORT


In [99]:
# for c in woe_columns:
#     print(train_data_4[c].describe())
#     print()

# for c in woe_columns:
#     print(test_data_4[c].describe())
#     print()

In [100]:
train_data_4 = train_data_4.drop(cols_to_drop, axis=1)
train_data_4 = train_data_4.reset_index()
train_data_4 = train_data_4.drop(['index'], axis=1)

test_data_4 = test_data_4.drop(cols_to_drop, axis=1)
test_data_4 = test_data_4.reset_index()
test_data_4 = test_data_4.drop(['index'], axis=1)

In [101]:
print(train_data_4.shape)
train_data_4 = train_data_4.dropna(axis=0, how='any')
print(train_data_4.shape)

print(test_data_4.shape)
test_data_4 = test_data_4.dropna(axis=0, how='any')
print(test_data_4.shape)

(382838, 10)
(382434, 10)
(188563, 10)
(187970, 10)


In [103]:
# train4_missing_values = train_data_4.isnull().sum(axis=0)
# train4_missing_values

In [104]:
X_train_4 = train_data_4.drop(['ARRIVAL_DELAY_BIN_30'], axis=1)
y_train_4 = train_data_4['ARRIVAL_DELAY_BIN_30']
X_test_4 = test_data_4.drop(['ARRIVAL_DELAY_BIN_30'], axis=1)
y_test_4 = test_data_4['ARRIVAL_DELAY_BIN_30']
print(X_train_4.shape, X_test_4.shape, y_train_4.shape, y_test_4.shape)

(382434, 9) (187970, 9) (382434,) (187970,)


In [105]:
X_train_4.to_csv('datasets/4_X_train.csv', index=False)
X_test_4.to_csv('datasets/4_X_test.csv', index=False)
y_train_4.to_csv('datasets/4_y_train.csv', index=False)
y_test_4.to_csv('datasets/4_y_test.csv', index=False)

### Dataset 5

In [121]:
ds5 = DataSet(cleaningOption='mean', sampleFraction=0.1, 
              dropCols=['AIR_SYSTEM_DELAY', 'SECURITY_DELAY', 'AIRLINE_DELAY', 
                        'LATE_AIRCRAFT_DELAY', 'WEATHER_DELAY', 'CANCELLATION_REASON'])
data5 = ds5.myData
data5 = data5.drop(['DEPARTURE_DELAY', 'ARRIVAL_DELAY', 'DIVERTED', 'CANCELLED'], axis=1)
data5.dtypes

(581908, 14)


DAY_OF_YEAR               int64
DAY_OF_WEEK               int64
AIRLINE                  object
ORIGIN_AIRPORT           object
DESTINATION_AIRPORT      object
SCHEDULED_DEPARTURE     float64
SCHEDULED_TIME          float64
DISTANCE                  int64
SCHEDULED_ARRIVAL       float64
ARRIVAL_DELAY_BIN_30      int64
dtype: object

In [122]:
data5['ARRIVAL_DELAY_BIN_30'].value_counts()

0    516717
1     65191
Name: ARRIVAL_DELAY_BIN_30, dtype: int64

In [123]:
X5 = data5.drop(['ARRIVAL_DELAY_BIN_30'], axis=1)
y5 = data5['ARRIVAL_DELAY_BIN_30']

In [124]:
X_train_5, X_test_5, y_train_5, y_test_5 = train_test_split(X5, y5, test_size=0.33, random_state=42)
print(X_train_5.shape, X_test_5.shape, y_train_5.shape, y_test_5.shape)

(389878, 9) (192030, 9) (389878,) (192030,)


In [125]:
train_data_5 = pd.concat([X_train_5, y_train_5], axis=1)
test_data_5 = pd.concat([X_test_5, y_test_5], axis=1)
print(train_data_5.shape, test_data_5.shape)

(389878, 10) (192030, 10)


In [126]:
# Transform training set
day_of_year_woe = get_woe('DAY_OF_YEAR', train_data_5)
train_data_5['DAY_OF_YEAR_WOE'] = day_of_year_woe
day_of_week_woe = get_woe('DAY_OF_WEEK', train_data_5)
train_data_5['DAY_OF_WEEK_WOE'] = day_of_week_woe
airline_woe = get_woe('AIRLINE', train_data_5)
train_data_5['AIRLINE_WOE'] = airline_woe
origin_woe = get_woe('ORIGIN_AIRPORT', train_data_5)
train_data_5['ORIGIN_WOE'] = origin_woe
dest_woe = get_woe('DESTINATION_AIRPORT', train_data_5)
train_data_5['DESTINATION_WOE'] = dest_woe

print()

# Transform testing set
day_of_year_woe = get_woe('DAY_OF_YEAR', test_data_5)
test_data_5['DAY_OF_YEAR_WOE'] = day_of_year_woe
day_of_week_woe = get_woe('DAY_OF_WEEK', test_data_5)
test_data_5['DAY_OF_WEEK_WOE'] = day_of_week_woe
airline_woe = get_woe('AIRLINE', test_data_5)
test_data_5['AIRLINE_WOE'] = airline_woe
origin_woe = get_woe('ORIGIN_AIRPORT', test_data_5)
test_data_5['ORIGIN_WOE'] = origin_woe
dest_woe = get_woe('DESTINATION_AIRPORT', test_data_5)
test_data_5['DESTINATION_WOE'] = dest_woe

DAY_OF_YEAR
DAY_OF_WEEK
AIRLINE
ORIGIN_AIRPORT
DESTINATION_AIRPORT

DAY_OF_YEAR
DAY_OF_WEEK
AIRLINE
ORIGIN_AIRPORT
DESTINATION_AIRPORT


In [132]:
# for c in woe_columns:
#     print(train_data_5[c].describe())
#     print()

# for c in woe_columns:
#     print(test_data_5[c].describe())
#     print()

In [129]:
print(train_data_5.shape)
train_data_5 = train_data_5.dropna(axis=0, how='any')
print(train_data_5.shape)

print(test_data_5.shape)
test_data_5 = test_data_5.dropna(axis=0, how='any')
print(test_data_5.shape)

(389878, 15)
(389599, 15)
(192030, 15)
(191516, 15)


In [130]:
train_data_5 = train_data_5.drop(cols_to_drop, axis=1)
train_data_5 = train_data_5.reset_index()
train_data_5 = train_data_5.drop(['index'], axis=1)

test_data_5 = test_data_5.drop(cols_to_drop, axis=1)
test_data_5 = test_data_5.reset_index()
test_data_5 = test_data_5.drop(['index'], axis=1)

In [134]:
# train5_missing_values = train_data_5.isnull().sum(axis=0)
# train5_missing_values

In [135]:
X_train_5 = train_data_5.drop(['ARRIVAL_DELAY_BIN_30'], axis=1)
y_train_5 = train_data_5['ARRIVAL_DELAY_BIN_30']
X_test_5 = test_data_5.drop(['ARRIVAL_DELAY_BIN_30'], axis=1)
y_test_5 = test_data_5['ARRIVAL_DELAY_BIN_30']
print(X_train_5.shape, X_test_5.shape, y_train_5.shape, y_test_5.shape)

(389599, 9) (191516, 9) (389599,) (191516,)


In [136]:
X_train_5.to_csv('datasets/5_X_train.csv', index=False)
X_test_5.to_csv('datasets/5_X_test.csv', index=False)
y_train_5.to_csv('datasets/5_y_train.csv', index=False)
y_test_5.to_csv('datasets/5_y_test.csv', index=False)

### Dataset 6

In [137]:
ds6 = DataSet(cleaningOption='median', sampleFraction=0.1, 
              dropCols=['AIR_SYSTEM_DELAY', 'SECURITY_DELAY', 'AIRLINE_DELAY', 
                        'LATE_AIRCRAFT_DELAY', 'WEATHER_DELAY', 'CANCELLATION_REASON'])
data6 = ds6.myData
data6 = data6.drop(['DEPARTURE_DELAY', 'ARRIVAL_DELAY', 'DIVERTED', 'CANCELLED'], axis=1)
data6.dtypes

(581908, 14)


DAY_OF_YEAR               int64
DAY_OF_WEEK               int64
AIRLINE                  object
ORIGIN_AIRPORT           object
DESTINATION_AIRPORT      object
SCHEDULED_DEPARTURE     float64
SCHEDULED_TIME          float64
DISTANCE                  int64
SCHEDULED_ARRIVAL       float64
ARRIVAL_DELAY_BIN_30      int64
dtype: object

In [138]:
data6['ARRIVAL_DELAY_BIN_30'].value_counts()

0    516865
1     65043
Name: ARRIVAL_DELAY_BIN_30, dtype: int64

In [139]:
X6 = data6.drop(['ARRIVAL_DELAY_BIN_30'], axis=1)
y6 = data6['ARRIVAL_DELAY_BIN_30']

In [140]:
X_train_6, X_test_6, y_train_6, y_test_6 = train_test_split(X6, y6, test_size=0.33, random_state=42)
print(X_train_6.shape, X_test_6.shape, y_train_6.shape, y_test_6.shape)

train_data_6 = pd.concat([X_train_6, y_train_6], axis=1)
test_data_6 = pd.concat([X_test_6, y_test_6], axis=1)
print(train_data_6.shape, test_data_6.shape)

(389878, 9) (192030, 9) (389878,) (192030,)
(389878, 10) (192030, 10)


In [141]:
# Transform training set
day_of_year_woe = get_woe('DAY_OF_YEAR', train_data_6)
train_data_6['DAY_OF_YEAR_WOE'] = day_of_year_woe
day_of_week_woe = get_woe('DAY_OF_WEEK', train_data_6)
train_data_6['DAY_OF_WEEK_WOE'] = day_of_week_woe
airline_woe = get_woe('AIRLINE', train_data_6)
train_data_6['AIRLINE_WOE'] = airline_woe
origin_woe = get_woe('ORIGIN_AIRPORT', train_data_6)
train_data_6['ORIGIN_WOE'] = origin_woe
dest_woe = get_woe('DESTINATION_AIRPORT', train_data_6)
train_data_6['DESTINATION_WOE'] = dest_woe

print()

# Transform testing set
day_of_year_woe = get_woe('DAY_OF_YEAR', test_data_6)
test_data_6['DAY_OF_YEAR_WOE'] = day_of_year_woe
day_of_week_woe = get_woe('DAY_OF_WEEK', test_data_6)
test_data_6['DAY_OF_WEEK_WOE'] = day_of_week_woe
airline_woe = get_woe('AIRLINE', test_data_6)
test_data_6['AIRLINE_WOE'] = airline_woe
origin_woe = get_woe('ORIGIN_AIRPORT', test_data_6)
test_data_6['ORIGIN_WOE'] = origin_woe
dest_woe = get_woe('DESTINATION_AIRPORT', test_data_6)
test_data_6['DESTINATION_WOE'] = dest_woe

DAY_OF_YEAR
DAY_OF_WEEK
AIRLINE
ORIGIN_AIRPORT
DESTINATION_AIRPORT

DAY_OF_YEAR
DAY_OF_WEEK
AIRLINE
ORIGIN_AIRPORT
DESTINATION_AIRPORT


In [146]:
# for c in woe_columns:
#     print(train_data_6[c].describe())
#     print()

# for c in woe_columns:
#     print(test_data_6[c].describe())
#     print()

In [144]:
print(train_data_6.shape)
train_data_6 = train_data_6.dropna(axis=0, how='any')
print(train_data_6.shape)

print(test_data_6.shape)
test_data_6 = test_data_6.dropna(axis=0, how='any')
print(test_data_6.shape)

(389878, 15)
(389556, 15)
(192030, 15)
(191344, 15)


In [147]:
train_data_6 = train_data_6.drop(cols_to_drop, axis=1)
train_data_6 = train_data_6.reset_index()
train_data_6 = train_data_6.drop(['index'], axis=1)

test_data_6 = test_data_6.drop(cols_to_drop, axis=1)
test_data_6 = test_data_6.reset_index()
test_data_6 = test_data_6.drop(['index'], axis=1)

In [149]:
# train6_missing_values = train_data_6.isnull().sum(axis=0)
# train6_missing_values

In [150]:
X_train_6 = train_data_6.drop(['ARRIVAL_DELAY_BIN_30'], axis=1)
y_train_6 = train_data_6['ARRIVAL_DELAY_BIN_30']
X_test_6 = test_data_6.drop(['ARRIVAL_DELAY_BIN_30'], axis=1)
y_test_6 = test_data_6['ARRIVAL_DELAY_BIN_30']
print(X_train_6.shape, X_test_6.shape, y_train_6.shape, y_test_6.shape)

(389556, 9) (191344, 9) (389556,) (191344,)


In [151]:
X_train_6.to_csv('datasets/6_X_train.csv', index=False)
X_test_6.to_csv('datasets/6_X_test.csv', index=False)
y_train_6.to_csv('datasets/6_y_train.csv', index=False)
y_test_6.to_csv('datasets/6_y_test.csv', index=False)

---

### Dataset 7

In [224]:
dummy_columns2 = ['DAY_OF_WEEK', 'AIRLINE']
woe_columns2 = ['DAY_OF_YEAR_WOE', 'ORIGIN_WOE', 'DESTINATION_WOE']
cols_to_drop2 = ['DAY_OF_YEAR', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT']

In [225]:
ds7 = DataSet(cleaningOption='drop', sampleFraction=0.1, 
              dropCols=['AIR_SYSTEM_DELAY', 'SECURITY_DELAY', 'AIRLINE_DELAY', 
                        'LATE_AIRCRAFT_DELAY', 'WEATHER_DELAY', 'CANCELLATION_REASON'])
data7 = ds7.myData
data7.head()

(571401, 14)


Unnamed: 0,DAY_OF_YEAR,DAY_OF_WEEK,AIRLINE,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,DEPARTURE_DELAY,SCHEDULED_TIME,DISTANCE,SCHEDULED_ARRIVAL,ARRIVAL_DELAY,DIVERTED,CANCELLED,ARRIVAL_DELAY_BIN_30
0,77,3,WN,MKE,PHX,855.0,-3.0,230.0,1460,965.0,-18.0,0,0,0
1,340,7,WN,DAL,STL,1290.0,5.0,95.0,546,1385.0,2.0,0,0,0
2,225,4,WN,SAT,DAL,995.0,-1.0,60.0,248,1055.0,-2.0,0,0,0
3,359,5,UA,SFO,IAH,830.0,1.0,225.0,1635,1175.0,-5.0,0,0,0
4,183,4,DL,FLL,JFK,445.0,-5.0,170.0,1069,615.0,-20.0,0,0,0


In [226]:
data7 = ds7.myData
data7_missing_values = data7.isnull().sum(axis=0)
# data7_missing_values

In [227]:
data7 = pd.get_dummies(data7, columns=dummy_columns2, drop_first=True)
data7.shape

(571401, 31)

In [229]:
# data7.dtypes

In [230]:
X7 = data7.drop(['DEPARTURE_DELAY', 'ARRIVAL_DELAY', 'DIVERTED', 'CANCELLED', 'ARRIVAL_DELAY_BIN_30'], axis=1)
y7 = data7['ARRIVAL_DELAY_BIN_30']
print(X7.shape, y7.shape)

(571401, 26) (571401,)


In [231]:
X_train_7, X_test_7, y_train_7, y_test_7 = train_test_split(X7, y7, test_size=0.33, random_state=42)
print(X_train_7.shape, X_test_7.shape, y_train_7.shape, y_test_7.shape)

(382838, 26) (188563, 26) (382838,) (188563,)


In [232]:
train_data_7 = pd.concat([X_train_7, y_train_7], axis=1)
test_data_7 = pd.concat([X_test_7, y_test_7], axis=1)
print(train_data_7.shape, test_data_7.shape)

(382838, 27) (188563, 27)


In [233]:
# Transform training set
day_of_year_woe = get_woe('DAY_OF_YEAR', train_data_7)
train_data_7['DAY_OF_YEAR_WOE'] = day_of_year_woe
origin_woe = get_woe('ORIGIN_AIRPORT', train_data_7)
train_data_7['ORIGIN_WOE'] = origin_woe
dest_woe = get_woe('DESTINATION_AIRPORT', train_data_7)
train_data_7['DESTINATION_WOE'] = dest_woe

print()

# Transform testing set
day_of_year_woe = get_woe('DAY_OF_YEAR', test_data_7)
test_data_7['DAY_OF_YEAR_WOE'] = day_of_year_woe
origin_woe = get_woe('ORIGIN_AIRPORT', test_data_7)
test_data_7['ORIGIN_WOE'] = origin_woe
dest_woe = get_woe('DESTINATION_AIRPORT', test_data_7)
test_data_7['DESTINATION_WOE'] = dest_woe

DAY_OF_YEAR
ORIGIN_AIRPORT
DESTINATION_AIRPORT

DAY_OF_YEAR
ORIGIN_AIRPORT
DESTINATION_AIRPORT


In [235]:
# for c in woe_columns2:
#     print(train_data_7[c].describe())
#     print()

# for c in woe_columns2:
#     print(test_data_7[c].describe())
#     print()

In [236]:
train_data_7 = train_data_7.drop(cols_to_drop2, axis=1)
train_data_7 = train_data_7.reset_index()
train_data_7 = train_data_7.drop(['index'], axis=1)

test_data_7 = test_data_7.drop(cols_to_drop2, axis=1)
test_data_7 = test_data_7.reset_index()
test_data_7 = test_data_7.drop(['index'], axis=1)

print(train_data_7.shape)
train_data_7 = train_data_7.dropna(axis=0, how='any')
print(train_data_7.shape)

print(test_data_7.shape)
test_data_7 = test_data_7.dropna(axis=0, how='any')
print(test_data_7.shape)

(382838, 27)
(382464, 27)
(188563, 27)
(188120, 27)


In [238]:
# train7_missing_values = train_data_7.isnull().sum(axis=0)
# train7_missing_values

In [239]:
X_train_7 = train_data_7.drop(['ARRIVAL_DELAY_BIN_30'], axis=1)
y_train_7 = train_data_7['ARRIVAL_DELAY_BIN_30']
X_test_7 = test_data_7.drop(['ARRIVAL_DELAY_BIN_30'], axis=1)
y_test_7 = test_data_7['ARRIVAL_DELAY_BIN_30']
print(X_train_7.shape, X_test_7.shape, y_train_7.shape, y_test_7.shape)

(382464, 26) (188120, 26) (382464,) (188120,)


In [241]:
# test_data_7.dtypes

In [243]:
# X_test_7.dtypes

In [244]:
X_train_7.to_csv('datasets/7_X_train.csv', index=False)
X_test_7.to_csv('datasets/7_X_test.csv', index=False)
y_train_7.to_csv('datasets/7_y_train.csv', index=False)
y_test_7.to_csv('datasets/7_y_test.csv', index=False)

In [245]:
test_read_X7_train = pd.read_csv('datasets/7_X_train.csv', low_memory=False)
test_read_X7_train.shape

(382464, 26)

In [249]:
# test_read_X7_train.dtypes

In [248]:
# test_read_y7_train = pd.read_csv('datasets/7_y_train.csv', low_memory=False, header=None)
# test_read_y7_train = pd.Series(test_read_y7_train[0])
# test_read_y7_train.describe()

### Dataset 8

In [250]:
ds8 = DataSet(cleaningOption='mean', sampleFraction=0.1, 
              dropCols=['AIR_SYSTEM_DELAY', 'SECURITY_DELAY', 'AIRLINE_DELAY', 
                        'LATE_AIRCRAFT_DELAY', 'WEATHER_DELAY', 'CANCELLATION_REASON'])
data8 = ds8.myData
data8.head()

(581908, 14)


Unnamed: 0,DAY_OF_YEAR,DAY_OF_WEEK,AIRLINE,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,DEPARTURE_DELAY,SCHEDULED_TIME,DISTANCE,SCHEDULED_ARRIVAL,ARRIVAL_DELAY,DIVERTED,CANCELLED,ARRIVAL_DELAY_BIN_30
0,333,7,WN,DCA,MCO,935.0,-4.0,150.0,759,1085.0,-19.0,0,0,0
1,35,3,WN,MCI,PDX,1090.0,15.0,245.0,1482,1215.0,19.0,0,0,0
2,167,2,EV,AVP,DTW,1038.0,-6.0,96.0,399,1134.0,-14.0,0,0,0
3,155,4,EV,ABE,DTW,360.0,-4.0,100.0,425,460.0,8.0,0,0,0
4,136,6,UA,ORD,ANC,950.0,2.0,392.0,2846,1162.0,-9.0,0,0,0


In [252]:
data8 = ds8.myData
data8_missing_values = data8.isnull().sum(axis=0)
# data8_missing_values

In [253]:
data8 = pd.get_dummies(data8, columns=dummy_columns2, drop_first=True)
data8.shape

(581908, 31)

In [255]:
# data8.dtypes

In [256]:
X8 = data8.drop(['DEPARTURE_DELAY', 'ARRIVAL_DELAY', 'DIVERTED', 'CANCELLED', 'ARRIVAL_DELAY_BIN_30'], axis=1)
y8 = data8['ARRIVAL_DELAY_BIN_30']
print(X8.shape, y8.shape)

(581908, 26) (581908,)


In [257]:
X_train_8, X_test_8, y_train_8, y_test_8 = train_test_split(X8, y8, test_size=0.33, random_state=42)
print(X_train_8.shape, X_test_8.shape, y_train_8.shape, y_test_8.shape)

(389878, 26) (192030, 26) (389878,) (192030,)


In [258]:
train_data_8 = pd.concat([X_train_8, y_train_8], axis=1)
test_data_8 = pd.concat([X_test_8, y_test_8], axis=1)
print(train_data_8.shape, test_data_8.shape)

(389878, 27) (192030, 27)


In [259]:
# Transform training set
day_of_year_woe = get_woe('DAY_OF_YEAR', train_data_8)
train_data_8['DAY_OF_YEAR_WOE'] = day_of_year_woe
origin_woe = get_woe('ORIGIN_AIRPORT', train_data_8)
train_data_8['ORIGIN_WOE'] = origin_woe
dest_woe = get_woe('DESTINATION_AIRPORT', train_data_8)
train_data_8['DESTINATION_WOE'] = dest_woe

print()

# Transform testing set
day_of_year_woe = get_woe('DAY_OF_YEAR', test_data_8)
test_data_8['DAY_OF_YEAR_WOE'] = day_of_year_woe
origin_woe = get_woe('ORIGIN_AIRPORT', test_data_8)
test_data_8['ORIGIN_WOE'] = origin_woe
dest_woe = get_woe('DESTINATION_AIRPORT', test_data_8)
test_data_8['DESTINATION_WOE'] = dest_woe

DAY_OF_YEAR
ORIGIN_AIRPORT
DESTINATION_AIRPORT

DAY_OF_YEAR
ORIGIN_AIRPORT
DESTINATION_AIRPORT


In [261]:
# for c in woe_columns2:
#     print(train_data_8[c].describe())
#     print()

# for c in woe_columns2:
#     print(test_data_8[c].describe())
#     print()

In [262]:
train_data_8 = train_data_8.drop(cols_to_drop2, axis=1)
train_data_8 = train_data_8.reset_index()
train_data_8 = train_data_8.drop(['index'], axis=1)

test_data_8 = test_data_8.drop(cols_to_drop2, axis=1)
test_data_8 = test_data_8.reset_index()
test_data_8 = test_data_8.drop(['index'], axis=1)

print(train_data_8.shape)
train_data_8 = train_data_8.dropna(axis=0, how='any')
print(train_data_8.shape)

print(test_data_8.shape)
test_data_8 = test_data_8.dropna(axis=0, how='any')
print(test_data_8.shape)

(389878, 27)
(389695, 27)
(192030, 27)
(191497, 27)


In [264]:
# train8_missing_values = train_data_8.isnull().sum(axis=0)
# train8_missing_values

In [265]:
X_train_8 = train_data_8.drop(['ARRIVAL_DELAY_BIN_30'], axis=1)
y_train_8 = train_data_8['ARRIVAL_DELAY_BIN_30']
X_test_8 = test_data_8.drop(['ARRIVAL_DELAY_BIN_30'], axis=1)
y_test_8 = test_data_8['ARRIVAL_DELAY_BIN_30']
print(X_train_8.shape, X_test_8.shape, y_train_8.shape, y_test_8.shape)

(389695, 26) (191497, 26) (389695,) (191497,)


In [267]:
# X_test_8.dtypes

In [268]:
X_train_8.to_csv('datasets/8_X_train.csv', index=False)
X_test_8.to_csv('datasets/8_X_test.csv', index=False)
y_train_8.to_csv('datasets/8_y_train.csv', index=False)
y_test_8.to_csv('datasets/8_y_test.csv', index=False)

In [269]:
test_read_X8_train = pd.read_csv('datasets/8_X_train.csv', low_memory=False)
test_read_X8_train.shape

(389695, 26)

### Dataset 9

In [270]:
ds9 = DataSet(cleaningOption='median', sampleFraction=0.1, 
              dropCols=['AIR_SYSTEM_DELAY', 'SECURITY_DELAY', 'AIRLINE_DELAY', 
                        'LATE_AIRCRAFT_DELAY', 'WEATHER_DELAY', 'CANCELLATION_REASON'])
data9 = ds9.myData
data9.head()

(581908, 14)


Unnamed: 0,DAY_OF_YEAR,DAY_OF_WEEK,AIRLINE,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,DEPARTURE_DELAY,SCHEDULED_TIME,DISTANCE,SCHEDULED_ARRIVAL,ARRIVAL_DELAY,DIVERTED,CANCELLED,ARRIVAL_DELAY_BIN_30
0,183,4,AA,PHX,ONT,1370.0,0.0,67.0,325,1437.0,3.0,0,0,0
1,255,6,AA,TPA,ORD,1056.0,39.0,175.0,1012,1171.0,23.0,0,0,0
2,258,2,AS,SEA,DEN,665.0,-2.0,158.0,1024,883.0,-5.0,0,0,0
3,205,5,UA,ORD,AUS,803.0,-8.0,164.0,978,967.0,-26.0,0,0,0
4,75,1,DL,HNL,ATL,1000.0,-5.0,520.0,4502,440.0,-28.0,0,0,0


In [272]:
data9 = ds9.myData
data9_missing_values = data9.isnull().sum(axis=0)
# data9_missing_values

In [273]:
data9 = pd.get_dummies(data9, columns=dummy_columns2, drop_first=True)
data9.shape

(581908, 31)

In [275]:
# data9.dtypes

In [276]:
X9 = data9.drop(['DEPARTURE_DELAY', 'ARRIVAL_DELAY', 'DIVERTED', 'CANCELLED', 'ARRIVAL_DELAY_BIN_30'], axis=1)
y9 = data9['ARRIVAL_DELAY_BIN_30']
print(X9.shape, y9.shape)

(581908, 26) (581908,)


In [277]:
X_train_9, X_test_9, y_train_9, y_test_9 = train_test_split(X9, y9, test_size=0.33, random_state=42)
print(X_train_9.shape, X_test_9.shape, y_train_9.shape, y_test_9.shape)

(389878, 26) (192030, 26) (389878,) (192030,)


In [278]:
train_data_9 = pd.concat([X_train_9, y_train_9], axis=1)
test_data_9 = pd.concat([X_test_9, y_test_9], axis=1)
print(train_data_9.shape, test_data_9.shape)

(389878, 27) (192030, 27)


In [279]:
# Transform training set
day_of_year_woe = get_woe('DAY_OF_YEAR', train_data_9)
train_data_9['DAY_OF_YEAR_WOE'] = day_of_year_woe
origin_woe = get_woe('ORIGIN_AIRPORT', train_data_9)
train_data_9['ORIGIN_WOE'] = origin_woe
dest_woe = get_woe('DESTINATION_AIRPORT', train_data_9)
train_data_9['DESTINATION_WOE'] = dest_woe

print()

# Transform testing set
day_of_year_woe = get_woe('DAY_OF_YEAR', test_data_9)
test_data_9['DAY_OF_YEAR_WOE'] = day_of_year_woe
origin_woe = get_woe('ORIGIN_AIRPORT', test_data_9)
test_data_9['ORIGIN_WOE'] = origin_woe
dest_woe = get_woe('DESTINATION_AIRPORT', test_data_9)
test_data_9['DESTINATION_WOE'] = dest_woe

DAY_OF_YEAR
ORIGIN_AIRPORT
DESTINATION_AIRPORT

DAY_OF_YEAR
ORIGIN_AIRPORT
DESTINATION_AIRPORT


In [281]:
# for c in woe_columns2:
#     print(train_data_9[c].describe())
#     print()

# for c in woe_columns2:
#     print(test_data_9[c].describe())
#     print()

In [282]:
train_data_9 = train_data_9.drop(cols_to_drop2, axis=1)
train_data_9 = train_data_9.reset_index()
train_data_9 = train_data_9.drop(['index'], axis=1)

test_data_9 = test_data_9.drop(cols_to_drop2, axis=1)
test_data_9 = test_data_9.reset_index()
test_data_9 = test_data_9.drop(['index'], axis=1)

print(train_data_9.shape)
train_data_9 = train_data_9.dropna(axis=0, how='any')
print(train_data_9.shape)

print(test_data_9.shape)
test_data_9 = test_data_9.dropna(axis=0, how='any')
print(test_data_9.shape)

(389878, 27)
(389583, 27)
(192030, 27)
(191552, 27)


In [284]:
# train9_missing_values = train_data_9.isnull().sum(axis=0)
# train9_missing_values

In [285]:
X_train_9 = train_data_9.drop(['ARRIVAL_DELAY_BIN_30'], axis=1)
y_train_9 = train_data_9['ARRIVAL_DELAY_BIN_30']
X_test_9 = test_data_9.drop(['ARRIVAL_DELAY_BIN_30'], axis=1)
y_test_9 = test_data_9['ARRIVAL_DELAY_BIN_30']
print(X_train_9.shape, X_test_9.shape, y_train_9.shape, y_test_9.shape)

(389583, 26) (191552, 26) (389583,) (191552,)


In [287]:
# X_test_9.dtypes

In [288]:
X_train_9.to_csv('datasets/9_X_train.csv', index=False)
X_test_9.to_csv('datasets/9_X_test.csv', index=False)
y_train_9.to_csv('datasets/9_y_train.csv', index=False)
y_test_9.to_csv('datasets/9_y_test.csv', index=False)

In [289]:
test_read_X9_train = pd.read_csv('datasets/9_X_train.csv', low_memory=False)
test_read_X9_train.shape

(389583, 26)

---

### Setup for datasets 10 - 12

In [298]:
import category_encoders as ce
columns_to_encode = ['DAY_OF_YEAR', 'DAY_OF_WEEK', 'AIRLINE', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT']
remove_for_X = ['DEPARTURE_DELAY', 'ARRIVAL_DELAY', 'DIVERTED', 'CANCELLED', 'ARRIVAL_DELAY_BIN_30']

In [323]:
def create_test_train_split(dataset, enc, ds_num):
    iterData = dataset.myData
    iterData_missing_values = iterData.isnull().sum(axis=0)
    print(iterData_missing_values)
    print()
    X = iterData.drop(remove_for_X, axis=1)
    y = iterData['ARRIVAL_DELAY_BIN_30']
    print(X.shape, y.shape)
    print()
    
    print(enc)
    print()
    encoder = enc
    encoder.fit(X, None)
    X_transformed = encoder.transform(X)
    print(X_transformed.shape)
    print()
    
    X_train, X_test, y_train, y_test = train_test_split(X_transformed, y, test_size=0.33, random_state=42)
    print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
    print()
    
    X_train.to_csv('datasets/' + str(ds_num) + '_X_train.csv', index=False)
    print("Finished outputting X_train_" + str(ds_num) + " to csv")
    X_test.to_csv('datasets/' + str(ds_num) + '_X_test.csv', index=False)
    print("Finished outputting X_test_" + str(ds_num) + " to csv")
    y_train.to_csv('datasets/' + str(ds_num) + '_y_train.csv', index=False)
    print("Finished outputting y_train_" + str(ds_num) + " to csv")
    y_test.to_csv('datasets/' + str(ds_num) + '_y_test.csv', index=False)
    print("Finished outputting y_test_" + str(ds_num) + " to csv")

### Dataset 10

In [319]:
ds10 = DataSet(cleaningOption='drop', sample_n=25000, 
               dropCols=['AIR_SYSTEM_DELAY', 'SECURITY_DELAY', 'AIRLINE_DELAY', 
                         'LATE_AIRCRAFT_DELAY', 'WEATHER_DELAY', 'CANCELLATION_REASON'])

(25000, 14)


In [324]:
create_test_train_split(ds10, ce.HelmertEncoder(cols=columns_to_encode), 10)

DAY_OF_YEAR             0
DAY_OF_WEEK             0
AIRLINE                 0
ORIGIN_AIRPORT          0
DESTINATION_AIRPORT     0
SCHEDULED_DEPARTURE     0
DEPARTURE_DELAY         0
SCHEDULED_TIME          0
DISTANCE                0
SCHEDULED_ARRIVAL       0
ARRIVAL_DELAY           0
DIVERTED                0
CANCELLED               0
ARRIVAL_DELAY_BIN_30    0
dtype: int64

(25000, 9) (25000,)

HelmertEncoder(cols=['DAY_OF_YEAR', 'DAY_OF_WEEK', 'AIRLINE', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT'],
        drop_invariant=False, handle_unknown='impute', impute_missing=True,
        return_df=True, verbose=0)

(25000, 1009)

(16750, 1009) (8250, 1009) (16750,) (8250,)

Finished outputting X_train_10 to csv
Finished outputting X_test_10 to csv
Finished outputting y_train_10 to csv
Finished outputting y_test_10 to csv


### Dataset 11

In [359]:
ds11 = DataSet(cleaningOption='mean', sample_n=25000, 
               dropCols=['AIR_SYSTEM_DELAY', 'SECURITY_DELAY', 'AIRLINE_DELAY', 
                         'LATE_AIRCRAFT_DELAY', 'WEATHER_DELAY', 'CANCELLATION_REASON'])

(25000, 14)


In [360]:
create_test_train_split(ds11, ce.HelmertEncoder(cols=columns_to_encode), 11)

DAY_OF_YEAR             0
DAY_OF_WEEK             0
AIRLINE                 0
ORIGIN_AIRPORT          0
DESTINATION_AIRPORT     0
SCHEDULED_DEPARTURE     0
DEPARTURE_DELAY         0
SCHEDULED_TIME          0
DISTANCE                0
SCHEDULED_ARRIVAL       0
ARRIVAL_DELAY           0
DIVERTED                0
CANCELLED               0
ARRIVAL_DELAY_BIN_30    0
dtype: int64

(25000, 9) (25000,)

HelmertEncoder(cols=['DAY_OF_YEAR', 'DAY_OF_WEEK', 'AIRLINE', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT'],
        drop_invariant=False, handle_unknown='impute', impute_missing=True,
        return_df=True, verbose=0)

(25000, 1004)

(16750, 1004) (8250, 1004) (16750,) (8250,)

Finished outputting X_train_11 to csv
Finished outputting X_test_11 to csv
Finished outputting y_train_11 to csv
Finished outputting y_test_11 to csv


### Dataset 12

In [363]:
ds12 = DataSet(cleaningOption='median', sample_n=25000, 
               dropCols=['AIR_SYSTEM_DELAY', 'SECURITY_DELAY', 'AIRLINE_DELAY', 
                         'LATE_AIRCRAFT_DELAY', 'WEATHER_DELAY', 'CANCELLATION_REASON'])

(25000, 14)


In [364]:
create_test_train_split(ds12, ce.HelmertEncoder(cols=columns_to_encode), 12)

DAY_OF_YEAR             0
DAY_OF_WEEK             0
AIRLINE                 0
ORIGIN_AIRPORT          0
DESTINATION_AIRPORT     0
SCHEDULED_DEPARTURE     0
DEPARTURE_DELAY         0
SCHEDULED_TIME          0
DISTANCE                0
SCHEDULED_ARRIVAL       0
ARRIVAL_DELAY           0
DIVERTED                0
CANCELLED               0
ARRIVAL_DELAY_BIN_30    0
dtype: int64

(25000, 9) (25000,)

HelmertEncoder(cols=['DAY_OF_YEAR', 'DAY_OF_WEEK', 'AIRLINE', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT'],
        drop_invariant=False, handle_unknown='impute', impute_missing=True,
        return_df=True, verbose=0)

(25000, 1001)

(16750, 1001) (8250, 1001) (16750,) (8250,)

Finished outputting X_train_12 to csv
Finished outputting X_test_12 to csv
Finished outputting y_train_12 to csv
Finished outputting y_test_12 to csv
