# Machine Learning

In this file, instructions how to approach the challenge can be found.

In [2]:
# import pandas
import pandas as pd
import numpy as np
import copy

import seaborn as sns
import matplotlib.pyplot as plt

from scipy import stats
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
from sklearn.decomposition import PCA

from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate, cross_val_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier, plot_importance
from sklearn.metrics import accuracy_score, confusion_matrix

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

  from pandas import MultiIndex, Int64Index


### Read Preprocessed Data

In [8]:
# load data
df = pd.read_csv("data/flights_preprocessed_37k.csv", index_col=0)
df.head(3)

Unnamed: 0,fl_date,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,dest_airport_id,crs_dep_time,dep_time,dep_delay,taxi_out,...,arr_hr_sin,arr_hr_cos,fl_mnth_sin,fl_mnth_cos,fl_wkday_sin,fl_wkday_cos,day_num_of_flights,num_flights_6hrs,inbound_fl_num,inbound_fl
0,2019-04-08,PT,N645AE,4857,14100,11577,1900-01-01 21:30:00,2129.0,-1.0,37.0,...,-0.5,0.866025,0.866025,-0.5,0.0,1.0,10,6,0,0
1,2018-11-29,UA,N817UA,1249,12953,13930,1900-01-01 21:00:00,2058.0,-2.0,25.0,...,-0.5,0.866025,-0.5,0.866025,0.433884,-0.900969,1,1,0,0
2,2018-08-03,AX,N14116,4650,11292,14783,1900-01-01 12:30:00,1237.0,7.0,13.0,...,-0.707107,-0.707107,-0.866025,-0.5,-0.433884,-0.900969,5,11,0,0


In [9]:
df['label'] = df['arr_delay']

In [11]:
df.loc[df.label <= 0, 'label'] = 0
df.loc[(df.label > 0) & (df.label <= 15), 'label'] = 1
df.loc[(df.label > 15) & (df.label <= 180), 'label'] = 2
df.loc[df.label > 180, 'label'] = 3

#### More Feature Engineering

##### Transform some new features by using 'arr_delay'

In [15]:
# Split traing and test set first
# In training set, I'm going to add more feature generated by combining 'arr_delay' and some categorial features
# For test set, same features as above mentioned should be added with values computed from training set
# i.e. DON NOT touch target variable in test set from now on
df_train, df_test = train_test_split(df, test_size=0.3)

In [16]:
# calculate how many times has delay ('arr_delay' > 15) happened on each carrier/flight_num/tail_num/carrier/origin_airport/dest_airport/origin_city/origin_state/dest_city/dest_state 
# calculate average/median delay time of each ... (same as above)
# merge with df

tran_features = ['op_unique_carrier', 'tail_num',  'op_carrier_fl_num', 'origin_airport_id', 'dest_airport_id', 'origin_city', 'origin_state', 'dest_city', 'dest_state']

for col in tran_features:
    # delay count group by col
    feature_delay_ct = df_train[df_train['arr_delay'] > 15][[col, 'arr_delay']].groupby(col, as_index=False).count().rename(columns={'arr_delay': f'{col}_delayct'})
    df_train = pd.merge(df_train, feature_delay_ct, on=col, how='left').fillna(0)
    # average delay time group by col
    feature_delay_avg = df[[col, 'arr_delay']].groupby(col, as_index=False).mean().rename(columns={'arr_delay': f'{col}_delayavg'})
    df_train = pd.merge(df_train, feature_delay_avg, on=col, how='left').fillna(0)
    # median delay time group by col
    feature_delay_median = df[[col, 'arr_delay']].groupby(col, as_index=False).median().rename(columns={'arr_delay': f'{col}_delaymedian'})
    df_train = pd.merge(df_train, feature_delay_median, on=col, how='left').fillna(0)

In [17]:
# reset dtypes
categorical_features = ['op_unique_carrier',
                        'tail_num', 
                        'op_carrier_fl_num',
                        'origin_airport_id',
                        'dest_airport_id',
                        # 'share_code',
                        'origin_city',
                        'origin_state',
                        'dest_city',
                        'dest_state',
                        'fl_month',
                        'fl_weekday',
                        'season',
                        'inbound_fl']

df_train[categorical_features] = df_train[categorical_features].astype('str')
df_test[categorical_features] =df_test[categorical_features].astype('str')

In [18]:
df_test.shape

(11232, 63)

In [19]:
# add features to test set with values computed by training set
# NOTICE: THE VALUES ADDED HERE ARE STILL FROM TRAINING SET
# for example, flight No.#### used to have 7 delays in training set, then add 7 to same flight No. in test set
# It's like assigning weight to categories (assign weight of 7 to flight No.##### in this example)

origin_new = [['op_unique_carrier', 'op_unique_carrier_delayct', 'op_unique_carrier_delaymedian', 'op_unique_carrier_delayavg'],
              ['tail_num', 'tail_num_delayct', 'tail_num_delaymedian', 'tail_num_delayavg'],
              ['op_carrier_fl_num', 'op_carrier_fl_num_delayct', 'op_carrier_fl_num_delaymedian', 'op_carrier_fl_num_delayavg'],
              ['origin_airport_id', 'origin_airport_id_delayct', 'origin_airport_id_delaymedian', 'origin_airport_id_delayavg'],
              ['dest_airport_id', 'dest_airport_id_delayct', 'dest_airport_id_delaymedian', 'dest_airport_id_delayavg'],
              ['origin_city', 'origin_city_delayct', 'origin_city_delaymedian', 'origin_city_delayavg'],
              ['origin_state', 'origin_state_delayct', 'origin_state_delaymedian', 'origin_state_delayavg'],
              ['dest_city', 'dest_city_delayct', 'dest_city_delaymedian', 'dest_city_delayavg'],
              ['dest_state', 'dest_state_delayct', 'dest_state_delaymedian', 'dest_state_delayavg']]
              
for cols in origin_new:
    df_test = pd.merge(df_test, df_train[cols].drop_duplicates(), on=cols[0], how='left').fillna(0)

In [20]:
df_test.shape

(11232, 90)

## Main Task: Classification Problem
Convert delay time into 4 classes: 
- 0: on-time (<=0)
- 1: mild delay (0,15]
- 2: moderate delay (15,180]
- 3: severe delay (>180)

#### Ridge Regression

In [21]:
avail_features = [
    # 'fl_date',
    # 'op_unique_carrier',
    # 'tail_num', 
    # 'op_carrier_fl_num',
    # 'origin_airport_id',
    # 'dest_airport_id',
    # 'crs_dep_time',
    # 'crs_arr_time',
    # 'crs_elapsed_time',
    'distance',
    # 'share_code',
    # 'origin_city',
    # 'origin_state',
    # 'dest_city',
    # 'dest_state',
    # 'arr_date',
    # 'dep_datetime',
    # 'arr_datetime',
    # 'fl_month',
    # 'fl_weekday',
    # 'season',
    # 'day_num_of_flights',
    'num_flights_6hrs',
    'inbound_fl_num',
    # 'inbound_fl',
    # 'dep_min_of_day',
    # 'arr_min_of_day',
    # 'dep_hr',
    # 'arr_hr',
    'arr_min_sin',
    'arr_min_cos',
    # 'arr_hr_sin',
    # 'arr_hr_cos',
    'dep_min_sin', 
    'dep_min_cos', 
    # 'dep_hr_sin', 
    # 'dep_hr_cos',
    'fl_mnth_sin', 
    'fl_mnth_cos',
    'fl_wkday_sin',
    'fl_wkday_cos',
    'op_unique_carrier_delayct',
    'op_unique_carrier_delaymedian',
    # 'op_unique_carrier_delayavg',
    # 'tail_num_delayct', 
    # 'tail_num_delaymedian',
    'tail_num_delayavg',
    # 'op_carrier_fl_num_delayct',
    # 'op_carrier_fl_num_delaymedian', 
    'op_carrier_fl_num_delayavg',
    'origin_airport_id_delayct', 
    # 'origin_airport_id_delaymedian',
    'origin_airport_id_delayavg',
    'dest_airport_id_delayct',
    # 'dest_airport_id_delaymedian',
    'dest_airport_id_delayavg',
    'origin_city_delayct',
    'origin_city_delaymedian', 
    # 'origin_city_delayavg',
    'origin_state_delayct', 
    'origin_state_delaymedian',
    # 'origin_state_delayavg',
    'dest_city_delayct', 
    'dest_city_delaymedian',
    # 'dest_city_delayavg',
    # 'dest_state_delayct',
    # 'dest_state_delaymedian',
    'dest_state_delayavg'
]

In [28]:
X_train = df_train[avail_features]
y_train = df_train.label
X_test = df_test[avail_features]
y_test = df_test.label

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

#### XGBoost

In [36]:
xg_clf = XGBClassifier()
xg_clf.fit(X_train, y_train)





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=16,
              num_parallel_tree=1, objective='multi:softprob', predictor='auto',
              random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=None,
              subsample=1, tree_method='exact', validate_parameters=1,
              verbosity=None)

In [37]:
xg_clf.score(X_test, y_test)

0.6674679487179487

#### Random Forest

In [38]:
rf_clf = RandomForestClassifier()
rf_clf.fit(X_train, y_train)

RandomForestClassifier()

In [39]:
rf_clf.score(X_test, y_test)

0.6696047008547008