In [105]:
import torch
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [82]:
# column details - 


# FL_DATE,OP_CARRIER,OP_CARRIER_FL_NUM,ORIGIN,DEST,ARR_DELAY,DEP_DELAY,CANCELLED,DISTANCE
# FL_DATE -- flight date (yyyy-mm-dd)
# OP_CARRIER -- carrier code
# OP_CARRIER_FL_NUM -- flight number
# ORIGIN -- origin place 
# DEST -- destination place
# CRS_DEP_TIME -- scheduled departure time (hhmm)
# DEP_TIME -- actual departure time (hhmm)
# DEP_DELAY -- departure delay (minutes)
# TAXI_OUT -- taxi out time (minutes)
# WHEELS_OFF -- wheels off time (hhmm)
# WHEELS_ON -- wheels on time (hhmm)
# TAXI_IN -- taxi in time (minutes)
# CRS_ARR_TIME -- scheduled arrival time (hhmm)
# ARR_TIME -- actual arrival time (hhmm)
# ARR_DELAY -- arrival delay (minutes)
# CANCELLED -- was the flight cancelled?
# CANCELLATION_CODE 
# DIVERTED -- was the flight diverted? (1 = yes, 0 = no)
# CRS_ELAPSED_TIME -- scheduled elapsed time (minutes)
# ACTUAL_ELAPSED_TIME -- actual elapsed time (minutes)
# AIR_TIME -- air time (minutes)
# DISTANCE -- distance (miles)
# CARRIER_DELAY -- carrier delay (minutes)
# WEATHER_DELAY -- weather delay (minutes)
# NAS_DELAY -- NAS delay (minutes)
# SECURITY_DELAY -- security delay (minutes)
# LATE_AIRCRAFT_DELAY -- late aircraft delay (minutes)


Tasks: 
1. do cyclic feature engineering on the data with time - FL_DATE, CRS_DEP_TIME, DEP_TIME, WHEELS_OFF, WHEELS_ON, CRS_ARR_TIME, ARR_TIME--- done 
2. Handle the OP_CARRIER data with numbers instead of strings, or possibly use one-hot encoding -- doing label encoding instead of one-hot encoding for now.
3. Similarly, implement one-hot encoding or numerical encoding on the origin and destination airports. If using one-hot encoding, find a method to combine both the destination and origin airports for each side of one-hot encoding. 

In [83]:
# import data
data = pd.read_csv('2010-17_all_labels.csv', skiprows=0)
data1 = data
data.head()

Unnamed: 0,FL_DATE,OP_CARRIER,OP_CARRIER_FL_NUM,ORIGIN,DEST,CRS_DEP_TIME,DEP_TIME,DEP_DELAY,TAXI_OUT,WHEELS_OFF,...,DIVERTED,CRS_ELAPSED_TIME,ACTUAL_ELAPSED_TIME,AIR_TIME,DISTANCE,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY
0,2010-12-31,OO,6439,CWA,EAU,2305.0,1939.0,-206.0,15.0,1954.0,...,0.0,38.0,52.0,32.0,90.0,,,,,
1,2010-06-16,AS,66,YAK,JNU,1845.0,1822.0,-23.0,4.0,1826.0,...,0.0,48.0,38.0,30.0,199.0,,,,,
2,2010-02-10,OO,4551,RAP,SLC,1645.0,1624.0,-21.0,16.0,1640.0,...,0.0,106.0,102.0,80.0,508.0,,,,,
3,2010-01-01,OH,6593,DCA,IND,2000.0,1940.0,-20.0,7.0,1947.0,...,0.0,120.0,92.0,77.0,499.0,,,,,
4,2010-07-01,UA,828,JAC,DEN,1506.0,1447.0,-19.0,14.0,1501.0,...,0.0,90.0,79.0,60.0,406.0,,,,,


In [84]:
data.columns

Index(['FL_DATE', 'OP_CARRIER', 'OP_CARRIER_FL_NUM', 'ORIGIN', 'DEST',
       'CRS_DEP_TIME', 'DEP_TIME', 'DEP_DELAY', 'TAXI_OUT', 'WHEELS_OFF',
       'WHEELS_ON', 'TAXI_IN', 'CRS_ARR_TIME', 'ARR_TIME', 'ARR_DELAY',
       'CANCELLED', 'CANCELLATION_CODE', 'DIVERTED', 'CRS_ELAPSED_TIME',
       'ACTUAL_ELAPSED_TIME', 'AIR_TIME', 'DISTANCE', 'CARRIER_DELAY',
       'WEATHER_DELAY', 'NAS_DELAY', 'SECURITY_DELAY', 'LATE_AIRCRAFT_DELAY'],
      dtype='object')

In [85]:
# data.head()
# count number of unique OP_CARRIER, ORIGIN, DEST, and total number of unique places
print(len(data['OP_CARRIER'].unique()))
print(len(data['ORIGIN'].unique()))
print(len(data['DEST'].unique()))
origin = data['ORIGIN'].unique()
dest = data['DEST'].unique()
origin = np.append(origin, dest)
origin = np.unique(origin)
print(len(origin))

20
328
329
335


In [86]:
# function for implementing cyclic encoding of categorical variables
def encode(data, col, max_val):
    data[col + '_sin'] = np.sin(2 * np.pi * data[col]/max_val)
    data[col + '_cos'] = np.cos(2 * np.pi * data[col]/max_val)
    return data

In [87]:
# fill up all the missing values in the column 'CARRIER_DELAY', 'WEATHER_DELAY', 'NAS_DELAY', 'SECURITY_DELAY', 'LATE_AIRCRAFT_DELAY' with 0
data['CARRIER_DELAY'] = data['CARRIER_DELAY'].fillna(0)
data['WEATHER_DELAY'] = data['WEATHER_DELAY'].fillna(0)
data['NAS_DELAY'] = data['NAS_DELAY'].fillna(0)
data['SECURITY_DELAY'] = data['SECURITY_DELAY'].fillna(0)
data['LATE_AIRCRAFT_DELAY'] = data['LATE_AIRCRAFT_DELAY'].fillna(0)

In [88]:
# splitting data of the form dd-mm-yyyy into day, month and year columns
data.head()
data['FL_DAY'] = data['FL_DATE'].map(lambda x: x.split('-')[2])
data['FL_MONTH'] = data['FL_DATE'].map(lambda x: x.split('-')[1])
data['FL_YEAR'] = data['FL_DATE'].map(lambda x: x.split('-')[0])
data['FL_YEAR'] = data['FL_YEAR'].astype(int)
data['FL_MONTH'] = data['FL_MONTH'].astype(int)
data['FL_DAY'] = data['FL_DAY'].astype(int)

In [89]:
# dropping the columns where time is not available, that is, where the flight was cancelled
data = data.dropna(subset=['ARR_TIME', 'DEP_TIME', 'DEP_DELAY', 'ARR_DELAY', 'CRS_DEP_TIME', 'CRS_ARR_TIME'])

In [90]:
# split the CRS_DEP_TIME into hours and minutes
data['CRS_DEP_HOUR'] = data['CRS_DEP_TIME'].map(lambda x: int(x/100))
data['CRS_DEP_MIN'] = data['CRS_DEP_TIME'].map(lambda x: int(x%100))
# split the DEP_TIME into hours and minutes
data['DEP_TIME_HOUR'] = data['DEP_TIME'].map(lambda x: int(x/100))
data['DEP_TIME_MIN'] = data['DEP_TIME'].map(lambda x: int(x%100))
# split the WHEELS_OFF into hours and minutes
data['WHEELS_OFF_HOUR'] = data['WHEELS_OFF'].map(lambda x: int(x/100))
data['WHEELS_OFF_MIN'] = data['WHEELS_OFF'].map(lambda x: int(x%100))
# split the WHEELS_ON into hours and minutes
data['WHEELS_ON_HOUR'] = data['WHEELS_ON'].map(lambda x: int(x/100))
data['WHEELS_ON_MIN'] = data['WHEELS_ON'].map(lambda x: int(x%100))
# split the CRS_ARR_TIME into hours and minutes
data['CRS_ARR_HOUR'] = data['CRS_ARR_TIME'].map(lambda x: int(x/100))
data['CRS_ARR_MIN'] = data['CRS_ARR_TIME'].map(lambda x: int(x%100))
# split the ARR_TIME into hours and minutes
data['ARR_TIME_HOUR'] = data['ARR_TIME'].map(lambda x: int(x/100))
data['ARR_TIME_MIN'] = data['ARR_TIME'].map(lambda x: int(x%100))
# drop the columns that are not required- CRS_DEP_TIME, DEP_TIME, WHEELS_OFF, WHEELS_ON, CRS_ARR_TIME, ARR_TIME
data = data.drop(columns=['CRS_DEP_TIME', 'DEP_TIME', 'WHEELS_OFF', 'WHEELS_ON', 'CRS_ARR_TIME', 'ARR_TIME'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['CRS_DEP_HOUR'] = data['CRS_DEP_TIME'].map(lambda x: int(x/100))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['CRS_DEP_MIN'] = data['CRS_DEP_TIME'].map(lambda x: int(x%100))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['DEP_TIME_HOUR'] = data['DEP_TIME'].map(lambda x: int(x/100))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['CRS_ARR_HOUR'] = data['CRS_ARR_TIME'].map(lambda x: int(x/100))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['CRS_ARR_MIN'] = data['CRS_ARR_TIME'].map(lambda x: int(x%100))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['ARR_TIME_HOUR'] = data['ARR_TIME'].map(lambda x: int(x/100))

In [91]:
# columns of the dataset
data.columns

Index(['FL_DATE', 'OP_CARRIER', 'OP_CARRIER_FL_NUM', 'ORIGIN', 'DEST',
       'DEP_DELAY', 'TAXI_OUT', 'TAXI_IN', 'ARR_DELAY', 'CANCELLED',
       'CANCELLATION_CODE', 'DIVERTED', 'CRS_ELAPSED_TIME',
       'ACTUAL_ELAPSED_TIME', 'AIR_TIME', 'DISTANCE', 'CARRIER_DELAY',
       'WEATHER_DELAY', 'NAS_DELAY', 'SECURITY_DELAY', 'LATE_AIRCRAFT_DELAY',
       'FL_DAY', 'FL_MONTH', 'FL_YEAR', 'CRS_DEP_HOUR', 'CRS_DEP_MIN',
       'DEP_TIME_HOUR', 'DEP_TIME_MIN', 'WHEELS_OFF_HOUR', 'WHEELS_OFF_MIN',
       'WHEELS_ON_HOUR', 'WHEELS_ON_MIN', 'CRS_ARR_HOUR', 'CRS_ARR_MIN',
       'ARR_TIME_HOUR', 'ARR_TIME_MIN'],
      dtype='object')

In [92]:
#cyclic feature encoding 
data = encode(data, 'CRS_DEP_HOUR', 24)
data = encode(data, 'CRS_DEP_MIN', 60)
data = encode(data, 'DEP_TIME_HOUR', 24)
data = encode(data, 'DEP_TIME_MIN', 60)
data = encode(data, 'WHEELS_OFF_HOUR', 24)
data = encode(data, 'WHEELS_OFF_MIN', 60)
data = encode(data, 'WHEELS_ON_HOUR', 24)
data = encode(data, 'WHEELS_ON_MIN', 60)
data = encode(data, 'CRS_ARR_HOUR', 24)
data = encode(data, 'CRS_ARR_MIN', 60)
data = encode(data, 'ARR_TIME_HOUR', 24)
data = encode(data, 'ARR_TIME_MIN', 60)
# reduce FL_MONTH and FL_DAY by 1
data['FL_MONTH'] = data['FL_MONTH'] - 1
data['FL_DAY'] = data['FL_DAY'] - 1
# cyclic feature encoding for FL_MONTH and FL_DAY
data = encode(data, 'FL_MONTH', 12)
data = encode(data, 'FL_DAY', 31)

In [93]:
# drop the columns that are not required- CRS_DEP_TIME, DEP_TIME, WHEELS_OFF, WHEELS_ON, CRS_ARR_TIME, ARR_TIME
data = data.drop(columns=['CRS_DEP_HOUR', 'CRS_DEP_MIN', 'DEP_TIME_HOUR', 'DEP_TIME_MIN', 'WHEELS_OFF_HOUR', 'WHEELS_OFF_MIN', 'WHEELS_ON_HOUR', 'WHEELS_ON_MIN', 'CRS_ARR_HOUR', 'CRS_ARR_MIN', 'ARR_TIME_HOUR', 'ARR_TIME_MIN', 'FL_MONTH', 'FL_DAY'])
print(data.shape)
data.head()

(47032, 50)


Unnamed: 0,FL_DATE,OP_CARRIER,OP_CARRIER_FL_NUM,ORIGIN,DEST,DEP_DELAY,TAXI_OUT,TAXI_IN,ARR_DELAY,CANCELLED,...,CRS_ARR_MIN_sin,CRS_ARR_MIN_cos,ARR_TIME_HOUR_sin,ARR_TIME_HOUR_cos,ARR_TIME_MIN_sin,ARR_TIME_MIN_cos,FL_MONTH_sin,FL_MONTH_cos,FL_DAY_sin,FL_DAY_cos
0,2010-12-31,OO,6439,CWA,EAU,-206.0,15.0,5.0,-192.0,0.0,...,-0.978148,-0.207912,-0.866025,0.5,-0.104528,-0.994522,-0.5,0.866025,-0.201299,0.97953
1,2010-06-16,AS,66,YAK,JNU,-23.0,4.0,4.0,-33.0,0.0,...,-0.309017,-0.951057,-0.965926,0.258819,0.0,1.0,0.5,-0.866025,0.101168,-0.994869
2,2010-02-10,OO,4551,RAP,SLC,-21.0,16.0,6.0,-25.0,0.0,...,-0.104528,-0.994522,-1.0,-1.83697e-16,0.587785,0.809017,0.5,0.866025,0.968077,-0.250653
3,2010-01-01,OH,6593,DCA,IND,-20.0,7.0,8.0,-48.0,0.0,...,0.0,1.0,-0.707107,0.7071068,0.951057,0.309017,0.0,1.0,0.0,1.0
4,2010-07-01,UA,828,JAC,DEN,-19.0,14.0,5.0,-30.0,0.0,...,-0.587785,-0.809017,-0.866025,-0.5,0.587785,0.809017,1.224647e-16,-1.0,0.0,1.0


In [94]:
# count number of null values in each column
nulls = data.isnull().sum()

In [95]:
# since all the values of the column "CANCELLATION_CODE" are null, we can drop this column
data = data.drop(columns=['CANCELLATION_CODE'])
# FL_DATE column already split and handled, so dropping it
data = data.drop(columns=['FL_DATE'])
# OP_CARRIER_FL_NUM is the flight number, so dropping it
data = data.drop(columns=['OP_CARRIER_FL_NUM'])
print(data.shape)
data.head()

(47032, 47)


Unnamed: 0,OP_CARRIER,ORIGIN,DEST,DEP_DELAY,TAXI_OUT,TAXI_IN,ARR_DELAY,CANCELLED,DIVERTED,CRS_ELAPSED_TIME,...,CRS_ARR_MIN_sin,CRS_ARR_MIN_cos,ARR_TIME_HOUR_sin,ARR_TIME_HOUR_cos,ARR_TIME_MIN_sin,ARR_TIME_MIN_cos,FL_MONTH_sin,FL_MONTH_cos,FL_DAY_sin,FL_DAY_cos
0,OO,CWA,EAU,-206.0,15.0,5.0,-192.0,0.0,0.0,38.0,...,-0.978148,-0.207912,-0.866025,0.5,-0.104528,-0.994522,-0.5,0.866025,-0.201299,0.97953
1,AS,YAK,JNU,-23.0,4.0,4.0,-33.0,0.0,0.0,48.0,...,-0.309017,-0.951057,-0.965926,0.258819,0.0,1.0,0.5,-0.866025,0.101168,-0.994869
2,OO,RAP,SLC,-21.0,16.0,6.0,-25.0,0.0,0.0,106.0,...,-0.104528,-0.994522,-1.0,-1.83697e-16,0.587785,0.809017,0.5,0.866025,0.968077,-0.250653
3,OH,DCA,IND,-20.0,7.0,8.0,-48.0,0.0,0.0,120.0,...,0.0,1.0,-0.707107,0.7071068,0.951057,0.309017,0.0,1.0,0.0,1.0
4,UA,JAC,DEN,-19.0,14.0,5.0,-30.0,0.0,0.0,90.0,...,-0.587785,-0.809017,-0.866025,-0.5,0.587785,0.809017,1.224647e-16,-1.0,0.0,1.0


In [96]:
data.columns

Index(['OP_CARRIER', 'ORIGIN', 'DEST', 'DEP_DELAY', 'TAXI_OUT', 'TAXI_IN',
       'ARR_DELAY', 'CANCELLED', 'DIVERTED', 'CRS_ELAPSED_TIME',
       'ACTUAL_ELAPSED_TIME', 'AIR_TIME', 'DISTANCE', 'CARRIER_DELAY',
       'WEATHER_DELAY', 'NAS_DELAY', 'SECURITY_DELAY', 'LATE_AIRCRAFT_DELAY',
       'FL_YEAR', 'CRS_DEP_HOUR_sin', 'CRS_DEP_HOUR_cos', 'CRS_DEP_MIN_sin',
       'CRS_DEP_MIN_cos', 'DEP_TIME_HOUR_sin', 'DEP_TIME_HOUR_cos',
       'DEP_TIME_MIN_sin', 'DEP_TIME_MIN_cos', 'WHEELS_OFF_HOUR_sin',
       'WHEELS_OFF_HOUR_cos', 'WHEELS_OFF_MIN_sin', 'WHEELS_OFF_MIN_cos',
       'WHEELS_ON_HOUR_sin', 'WHEELS_ON_HOUR_cos', 'WHEELS_ON_MIN_sin',
       'WHEELS_ON_MIN_cos', 'CRS_ARR_HOUR_sin', 'CRS_ARR_HOUR_cos',
       'CRS_ARR_MIN_sin', 'CRS_ARR_MIN_cos', 'ARR_TIME_HOUR_sin',
       'ARR_TIME_HOUR_cos', 'ARR_TIME_MIN_sin', 'ARR_TIME_MIN_cos',
       'FL_MONTH_sin', 'FL_MONTH_cos', 'FL_DAY_sin', 'FL_DAY_cos'],
      dtype='object')

In [103]:
X = data.drop(columns=['DEP_DELAY'])
y = data['DEP_DELAY']

In [104]:
# splitting dataset into training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(37625, 46)
(9407, 46)
(37625,)
(9407,)


In [None]:
# label encoding for OP_CARRIER, ORIGIN, DEST
labelEncoder_OP_CARRIER = LabelEncoder()
labelEncoder_ORIGIN = LabelEncoder()
labelEncoder_DEST = LabelEncoder()
X_train['OP_CARRIER'] = labelEncoder_OP_CARRIER.fit_transform(X_train['OP_CARRIER'])
X_train['ORIGIN'] = labelEncoder_ORIGIN.fit_transform(X_train['ORIGIN'])
X_train['DEST'] = labelEncoder_DEST.fit_transform(X_train['DEST'])
X_test['OP_CARRIER'] = labelEncoder_OP_CARRIER.transform(X_test['OP_CARRIER'])
X_test['ORIGIN'] = labelEncoder_ORIGIN.transform(X_test['ORIGIN'])
X_test['DEST'] = labelEncoder_DEST.transform(X_test['DEST'])

In [None]:
# standardization of data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# converting numpy arrays to tensors
X_train = torch.from_numpy(X_train)
X_test = torch.from_numpy(X_test)
y_train = torch.from_numpy(y_train.values)
y_test = torch.from_numpy(y_test.values)