# Machine Learning

In this file, instructions how to approach the challenge can be found.

In [1]:
# import pandas
import pandas as pd
import numpy as np
import copy

import seaborn as sns
import matplotlib.pyplot as plt

from scipy import stats
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
from sklearn.decomposition import PCA

from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate, cross_val_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor, XGBClassifier, plot_importance
from sklearn.metrics import r2_score, mean_squared_error

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

  from pandas import MultiIndex, Int64Index


### Read Preprocessed Data

In [2]:
# load data
df = pd.read_csv("data/flights_preprocessed_37k.csv", index_col=0)
df.head(3)

Unnamed: 0,fl_date,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,dest_airport_id,crs_dep_time,dep_time,dep_delay,taxi_out,...,arr_hr_sin,arr_hr_cos,fl_mnth_sin,fl_mnth_cos,fl_wkday_sin,fl_wkday_cos,day_num_of_flights,num_flights_6hrs,inbound_fl_num,inbound_fl
0,2019-04-08,PT,N645AE,4857,14100,11577,1900-01-01 21:30:00,2129.0,-1.0,37.0,...,-0.5,0.866025,0.866025,-0.5,0.0,1.0,10,6,0,0
1,2018-11-29,UA,N817UA,1249,12953,13930,1900-01-01 21:00:00,2058.0,-2.0,25.0,...,-0.5,0.866025,-0.5,0.866025,0.433884,-0.900969,1,1,0,0
2,2018-08-03,AX,N14116,4650,11292,14783,1900-01-01 12:30:00,1237.0,7.0,13.0,...,-0.707107,-0.707107,-0.866025,-0.5,-0.433884,-0.900969,5,11,0,0


#### More Feature Engineering

##### Transform some new features by using 'arr_delay'

In [3]:
# Split traing and test set first
# In training set, I'm going to add more feature generated by combining 'arr_delay' and some categorial features
# For test set, same features as above mentioned should be added with values computed from training set
# i.e. DON NOT touch target variable in test set from now on
df_train, df_test = train_test_split(df, test_size=0.3)

In [4]:
# calculate how many times has delay ('arr_delay' > 15) happened on each carrier/flight_num/tail_num/carrier/origin_airport/dest_airport/origin_city/origin_state/dest_city/dest_state 
# calculate average/median delay time of each ... (same as above)
# merge with df

tran_features = ['op_unique_carrier', 'tail_num',  'op_carrier_fl_num', 'origin_airport_id', 'dest_airport_id', 'origin_city', 'origin_state', 'dest_city', 'dest_state']

for col in tran_features:
    # delay count group by col
    feature_delay_ct = df_train[df_train['arr_delay'] > 15][[col, 'arr_delay']].groupby(col, as_index=False).count().rename(columns={'arr_delay': f'{col}_delayct'})
    df_train = pd.merge(df_train, feature_delay_ct, on=col, how='left').fillna(0)
    # average delay time group by col
    feature_delay_avg = df[[col, 'arr_delay']].groupby(col, as_index=False).mean().rename(columns={'arr_delay': f'{col}_delayavg'})
    df_train = pd.merge(df_train, feature_delay_avg, on=col, how='left').fillna(0)
    # median delay time group by col
    feature_delay_median = df[[col, 'arr_delay']].groupby(col, as_index=False).median().rename(columns={'arr_delay': f'{col}_delaymedian'})
    df_train = pd.merge(df_train, feature_delay_median, on=col, how='left').fillna(0)

In [5]:
df_train.head()

Unnamed: 0,fl_date,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,dest_airport_id,crs_dep_time,dep_time,dep_delay,taxi_out,...,origin_city_delaymedian,origin_state_delayct,origin_state_delayavg,origin_state_delaymedian,dest_city_delayct,dest_city_delayavg,dest_city_delaymedian,dest_state_delayct,dest_state_delayavg,dest_state_delaymedian
0,2018-12-23,QX,N444QX,2685,14747,14057,1900-01-01 14:45:00,1449.0,4.0,10.0,...,-4.0,107.0,3.401559,-5.0,42.0,1.047506,-6.0,52,6.102128,-5.0
1,2018-03-01,DL,N950AT,45,12478,10397,1900-01-01 15:40:00,1559.0,19.0,39.0,...,-10.0,236.0,2.520643,-9.0,208.0,4.44168,-8.0,225,4.5826,-8.0
2,2018-01-06,QX,N622QX,2729,10140,14679,1900-01-01 11:55:00,1145.0,-10.0,7.0,...,-7.0,17.0,11.793548,-8.0,54.0,4.866667,-6.0,553,4.380535,-6.0
3,2018-10-26,WN,N419WN,1430,14831,10800,1900-01-01 17:30:00,1731.0,1.0,7.0,...,-6.0,504.0,4.181321,-6.0,20.0,2.712418,-3.0,553,4.380535,-6.0
4,2019-02-28,G4,223NV,1822,10676,12451,1900-01-01 17:27:00,1717.0,-10.0,10.0,...,-14.0,387.0,9.644642,-4.0,22.0,4.927374,-5.0,354,5.866817,-5.0


In [6]:
# reset dtypes
categorical_features = ['op_unique_carrier',
                        'tail_num', 
                        'op_carrier_fl_num',
                        'origin_airport_id',
                        'dest_airport_id',
                        # 'share_code',
                        'origin_city',
                        'origin_state',
                        'dest_city',
                        'dest_state',
                        'fl_month',
                        'fl_weekday',
                        'season',
                        'inbound_fl']

df_train[categorical_features] = df_train[categorical_features].astype('str')
df_test[categorical_features] =df_test[categorical_features].astype('str')

In [7]:
df_test.shape

(11232, 62)

In [8]:
# add features to test set with values computed by training set
# NOTICE: THE VALUES ADDED HERE ARE STILL FROM TRAINING SET
# for example, flight No.#### used to have 7 delays in training set, then add 7 to same flight No. in test set
# It's like assigning weight to categories (assign weight of 7 to flight No.##### in this example)

origin_new = [['op_unique_carrier', 'op_unique_carrier_delayct', 'op_unique_carrier_delaymedian', 'op_unique_carrier_delayavg'],
              ['tail_num', 'tail_num_delayct', 'tail_num_delaymedian', 'tail_num_delayavg'],
              ['op_carrier_fl_num', 'op_carrier_fl_num_delayct', 'op_carrier_fl_num_delaymedian', 'op_carrier_fl_num_delayavg'],
              ['origin_airport_id', 'origin_airport_id_delayct', 'origin_airport_id_delaymedian', 'origin_airport_id_delayavg'],
              ['dest_airport_id', 'dest_airport_id_delayct', 'dest_airport_id_delaymedian', 'dest_airport_id_delayavg'],
              ['origin_city', 'origin_city_delayct', 'origin_city_delaymedian', 'origin_city_delayavg'],
              ['origin_state', 'origin_state_delayct', 'origin_state_delaymedian', 'origin_state_delayavg'],
              ['dest_city', 'dest_city_delayct', 'dest_city_delaymedian', 'dest_city_delayavg'],
              ['dest_state', 'dest_state_delayct', 'dest_state_delaymedian', 'dest_state_delayavg']]
              
for cols in origin_new:
    df_test = pd.merge(df_test, df_train[cols].drop_duplicates(), on=cols[0], how='left').fillna(0)

In [9]:
df_test.shape

(11232, 89)

## Main Task: Regression Problem

#### Ridge Regression

In [15]:
avail_features = [
    # 'fl_date',
    # 'op_unique_carrier',
    # 'tail_num', 
    # 'op_carrier_fl_num',
    # 'origin_airport_id',
    # 'dest_airport_id',
    # 'crs_dep_time',
    # 'crs_arr_time',
    # 'crs_elapsed_time',
    'distance',
    # 'share_code',
    # 'origin_city',
    # 'origin_state',
    # 'dest_city',
    # 'dest_state',
    # 'arr_date',
    # 'dep_datetime',
    # 'arr_datetime',
    # 'fl_month',
    # 'fl_weekday',
    # 'season',
    # 'day_num_of_flights',
    'num_flights_6hrs',
    'inbound_fl_num',
    # 'inbound_fl',
    # 'dep_min_of_day',
    # 'arr_min_of_day',
    # 'dep_hr',
    # 'arr_hr',
    'arr_min_sin',
    'arr_min_cos',
    # 'arr_hr_sin',
    # 'arr_hr_cos',
    'dep_min_sin', 
    'dep_min_cos', 
    # 'dep_hr_sin', 
    # 'dep_hr_cos',
    'fl_mnth_sin', 
    'fl_mnth_cos',
    'fl_wkday_sin',
    'fl_wkday_cos',
    'op_unique_carrier_delayct',
    'op_unique_carrier_delaymedian',
    # 'op_unique_carrier_delayavg',
    # 'tail_num_delayct', 
    # 'tail_num_delaymedian',
    'tail_num_delayavg',
    # 'op_carrier_fl_num_delayct',
    # 'op_carrier_fl_num_delaymedian', 
    'op_carrier_fl_num_delayavg',
    'origin_airport_id_delayct', 
    # 'origin_airport_id_delaymedian',
    'origin_airport_id_delayavg',
    'dest_airport_id_delayct',
    # 'dest_airport_id_delaymedian',
    'dest_airport_id_delayavg',
    'origin_city_delayct',
    'origin_city_delaymedian', 
    # 'origin_city_delayavg',
    'origin_state_delayct', 
    'origin_state_delaymedian',
    # 'origin_state_delayavg',
    'dest_city_delayct', 
    'dest_city_delaymedian',
    # 'dest_city_delayavg',
    # 'dest_state_delayct',
    # 'dest_state_delaymedian',
    'dest_state_delayavg'
]

In [24]:
X_train = df_train[avail_features]
# y_train = target_train_log
y_train = np.array(target_train).reshape(-1,1)
X_test = df_test[avail_features]
y_test = np.array(target_test).reshape(-1,1)

X_train_scaled = scaler.fit_transform(X_train)
y_train_scaled = scaler.fit_transform(y_train)
X_test_scaled = scaler.fit_transform(X_test)
y_test_scaled = scaler.fit_transform(y_test)

rr = Ridge(alpha=0.1)
rr.fit(X_train_scaled, y_train_scaled)
y_pred = rr.predict(X_test_scaled)

In [25]:
r2_score(y_test_scaled, y_pred)

0.29098084693328674

#### XGBoost

In [26]:
X_train = df_train[avail_features]
# y_train = target_train_log
y_train = target_train
X_test = df_test[avail_features]
y_test = target_test

xg_reg = XGBRegressor(objective ='reg:squarederror',
                      learning_rate = 0.1,
                      max_depth = 6,
                      # reg_lambda = 10,
                      n_estimators = 300)
xg_reg.fit(X_train, y_train)
y_pred = xg_reg.predict(X_test)
# y_pred = np.exp(xg_reg.predict(X_test)) + diff

In [27]:
xg_reg.score(X_train, y_train)

0.8596270639407491

In [28]:
r2_score(y_test, y_pred)

0.4338893305657229

##### PCA

In [20]:
# pca_features = [
#     # 'op_unique_carrier',
#     # 'tail_num'.
#     # 'op_carrier_fl_num',
#     # 'origin_airport_id',
#     # 'dest_airport_id',
#     'crs_elapsed_time',
#     'distance',
#     'share_code',
#     # 'origin_city',
#     # 'origin_state',
#     # 'dest_city',
#     # 'dest_state',
#     'fl_month',
#     'fl_weekday',
#     'season',
#     'day_num_of_flights',
#     'num_flights_6hr',
#     'inbound_fl_num',
#     'inbound_fl',
#     'dep_min_of_day',
#     'arr_min_of_day',
#     'dep_hr',
#     'arr_hr',
#     'arr_hr_sin',
#     'arr_hr_cos',
#     'arr_min_sin',
#     'arr_min_cos',
#     'dep_min_sin',
#     'dep_min_cos',
#     'dep_hr_sin',
#     'dep_hr_cos',
#     'fl_mnth_sin',
#     'fl_mnth_cos',
#     'fl_wkday_sin',
#     'fl_wkday_cos',
#     'op_unique_carrier_delayct',
#     'op_unique_carrier_delaymedian',
#     'tail_num_delayct',
#     'tail_num_delaymedian',
#     'op_carrier_fl_num_delayct',
#     'op_carrier_fl_num_delaymedian',
#     'origin_airport_id_delayct',
#     'origin_airport_id_delaymedian',
#     'dest_airport_id_delayct',
#     'dest_airport_id_delaymedian',
#     'origin_city_delayct',
#     'origin_city_delaymedian',
#     'origin_state_delayct',
#     'origin_state_delaymedian',
#     'dest_city_delayct',
#     'dest_city_delaymedian',
#     'dest_state_delayct',
#     'dest_state_delaymedian'
# ]

In [21]:
# df_X = pd.concat([df_train[pca_features], df_test[pca_features]])
# df_train.shape[0]

10609

In [25]:
# X_scaled = scaler.fit_transform(df_X)

# pca = PCA(n_components='mle')
# pca.fit(X_scaled)
# X_pca = pca.transform(X_scaled)

In [26]:
# X_scaled_train = X_pca[:10609, :]
# X_scaled_test = X_pca[10609:, :]
# y_train = target_train_log
# y_test = target_test

# xg_reg = XGBRegressor(objective ='reg:squarederror',
#                       learning_rate = 0.1,
#                       max_depth = 6,
#                       # reg_lambda = 10,
#                       n_estimators = 300)
# xg_reg.fit(X_scaled_train, y_train)
# # y_pred = xg_reg.predict(X_test)
# y_pred = np.exp(xg_reg.predict(X_scaled_test)) + diff

In [27]:
# r2_score(y_test, y_pred)

0.11846731863060067

In [43]:
# features = [
#     # 'op_unique_carrier',
#     # 'tail_num'.
#     # 'op_carrier_fl_num',
#     # 'origin_airport_id',
#     # 'dest_airport_id',
#     # 'crs_elapsed_time',
#     'distance',
#     'share_code',
#     # 'origin_city',
#     # 'origin_state',
#     # 'dest_city',
#     # 'dest_state',
#     # 'fl_month',
#     # 'fl_weekday',
#     # 'season',
#     # 'day_num_of_flights',
#     # 'num_flights_6hr',
#     # 'inbound_fl_num',
#     # 'inbound_fl',
#     # 'dep_min_of_day',
#     # 'arr_min_of_day',
#     # 'dep_hr',
#     # 'arr_hr',
#     # 'arr_hr_sin',
#     # 'arr_hr_cos',
#     # 'arr_min_sin',
#     # 'arr_min_cos',
#     'dep_min_sin',
#     # 'dep_min_cos',
#     # 'dep_hr_sin',
#     # 'dep_hr_cos',
#     # 'fl_mnth_sin',
#     # 'fl_mnth_cos',
#     # 'fl_wkday_sin',
#     # 'fl_wkday_cos',
#     # 'op_unique_carrier_delayct',
#     # 'op_unique_carrier_delaymedian',
#     'tail_num_delayct',
#     # 'tail_num_delaymedian',
#     'op_carrier_fl_num_delayct',
#     # 'op_carrier_fl_num_delaymedian',
#     # 'origin_airport_id_delayct',
#     # 'origin_airport_id_delaymedian',
#     # 'dest_airport_id_delayct',
#     # 'dest_airport_id_delaymedian',
#     # 'origin_city_delayct',
#     'origin_city_delaymedian',
#     # 'origin_state_delayct',
#     'origin_state_delaymedian',
#     'dest_city_delayct',
#     # 'dest_city_delaymedian',
#     # 'dest_state_delayct',
#     'dest_state_delaymedian'
# ]

In [44]:
# scores = []
# for f in features:
#     X_train = df_train[[f]]
#     y_train = target_train_log
#     X_test = df_test[[f]]
#     y_test = target_test
    
#     xg_reg = XGBRegressor(objective ='reg:squarederror',
#                       learning_rate = 0.1,
#                       max_depth = 6,
#                       # reg_lambda = 10,
#                       n_estimators = 300)
#     xg_reg.fit(X_train, y_train)
#     y_pred = np.exp(xg_reg.predict(X_test)) + diff
#     # y_pred = xg_reg.predict(X_test)
    
#     scores.append([f, xg_reg.score(X_train, y_train), r2_score(y_test, y_pred)])

In [45]:
# s = pd.DataFrame(scores)
# s[s[2]==s[2].max()]

Unnamed: 0,0,1,2
5,origin_city_delaymedian,0.824356,0.240021
