In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
import xgboost as xgb

from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error as MSE 
import local_python_packages.features_adding as local

In [2]:
# Show every column when a frame is displayed.
pd.set_option('display.max_columns', None)

# Load the flights CSV (first column parsed as dates) and order rows by flight date.
flights_data = (
    pd.read_csv('All flights 2019_final.csv', parse_dates=[0])
    .sort_values(['fl_date'])
)

In [3]:
# Weather table; the second CSV column is parsed as dates.
df_weather = pd.read_csv(
    'cities_and_dates_weather_final.csv',
    parse_dates=[1],
)

In [4]:
# US holidays table; the first CSV column is parsed as dates.
df_holidays = pd.read_csv(
    'US holidays.csv',
    parse_dates=[0],
)

Add the extra feature columns produced by our feature-engineering helpers (`local_python_packages.features_adding`, imported as `local`).

In [5]:
# Rebind flights_data to the output of the rolling taxi-time helper.
# NOTE(review): the two 7s presumably select a 7-day rolling window (the
# feature list refers to '7d taxi_out ...' / '7d taxi_in ...' columns from
# this helper) — confirm against local.add_taxi_Ndays_rolling.
flights_data = local.add_taxi_Ndays_rolling(flights_data, 7, 7)

In [6]:
# Rebind flights_data to the output of the rolling traffic helper.
# NOTE(review): presumably adds the '7d roll flts origin_airport_id' and
# '7d roll flts dest_airport_id' columns used in the feature list; the meaning
# of the two 7s is not visible here — confirm against the helper.
flights_data = local.add_traffic_rolling(flights_data, 7, 7)

In [7]:
# Rebind flights_data to the output of the month-dummy helper, keyed on 'fl_date'.
# NOTE(review): presumably the source of the 'month_1' ... 'month_12' columns
# selected later — confirm against local.make_month_dummies.
flights_data = local.make_month_dummies(flights_data, 'fl_date')

In [8]:
# Combine the flights frame with the weather table via the project helper.
# NOTE(review): presumably the source of the origin_*/dest_* wind-speed,
# visibility and condition columns selected later; the join keys are not
# visible here — confirm against local.merging_weather_flights.
flights_data = local.merging_weather_flights(flights_data,df_weather)

In [9]:
# Rebind flights_data to the output of the rolling departure-delay helper.
# NOTE(review): the feature list expects a '7 days roll dep_delay' column from
# this helper; the two 7s presumably set the window — confirm against the helper.
flights_data = local.add_dep_delay_Ndays_rolling(flights_data, 7, 7)

In [10]:
# Rebind flights_data to the output of the per-tail-number rolling departure-delay helper.
# NOTE(review): presumably the source of '7 days roll dep_delay_per_tail_num';
# the meaning of the two 7s is not visible here — confirm against the helper.
flights_data = local.add_dep_delay_Ndays_roll_per_tail_num(flights_data, 7, 7)

In [11]:
# Enrich the flights frame with the holidays table via the project helper.
# NOTE(review): presumably the source of the 'Type_Federal holiday' column
# selected later — confirm against local.add_US_holidays.
flights_data = local.add_US_holidays(flights_data, df_holidays)

In [12]:
# Convert 'fl_date' through the project helper; the head() output further down
# shows 'fl_date' as an integer (737060) after this step.
flights_data = local.make_dates_ordinal(flights_data, 'fl_date')

In [13]:
# Rolling arrival-delay features, one per grouping key listed below.
# NOTE(review): the two 7s presumably set the rolling window — confirm against
# local.add_arr_delay_Ndays_roll.
flights_data = local.add_arr_delay_Ndays_roll(
    flights_data,
    7,
    7,
    ['origin_airport_id', 'dest_airport_id', 'tail_num', 'mkt_carrier_fl_num'],
)

In [14]:
# Rebind flights_data to the output of the interaction-feature helper.
# NOTE(review): the head() output further down shows values such as
# -5876437948029505976 in the '... X ...' columns — this looks like hashing or
# integer overflow; confirm it is intended in local.add_polynomial_features.
flights_data = local.add_polynomial_features(flights_data)

In [19]:
# Columns kept for modelling, grouped by the helper that is expected to create
# them. 'arr_delay' is the regression target: it is listed here so it survives
# the column selection and is taken out of the list before X is built.
features_list = [
    # Raw flights CSV
    'fl_date', 'arr_delay', 'crs_elapsed_time', 'air_time', 'distance',

    # add_taxi_Ndays_rolling
    '7d taxi_out by origin_airport_id',
    '7d taxi_in by dest_airport_id',
    '7d taxi_out by mkt_carrier_fl_num',
    '7d taxi_in by mkt_carrier_fl_num',

    # add_traffic_rolling
    '7d roll flts origin_airport_id',
    '7d roll flts dest_airport_id',

    # make_month_dummies: month_1 ... month_12
    *[f'month_{month}' for month in range(1, 13)],

    # merging_weather_flights
    'origin_city_wspd', 'origin_visibility',
    'dest_city_wspd', 'dest_visibility',
    *[
        f'{side}_cond_{condition}'
        for side in ('origin', 'dest')
        for condition in ('Overcast', 'Partially cloudy', 'Rain', 'Snow')
    ],
    'origin_wspd/visib', 'dest_wspd/visib',

    # add_US_holidays
    'Type_Federal holiday',

    # add_dep_delay_Ndays_rolling
    # NOTE(review): the displayed head() output lists '7 days roll dep_time',
    # not '7 days roll dep_delay' — confirm this name matches the helper output.
    '7 days roll dep_delay',

    # add_dep_delay_Ndays_roll_per_tail_num
    '7 days roll dep_delay_per_tail_num',

    # add_polynomial_features
    'dest_air X date', 'orig_air X date',
    'mkt_carrier_fl_num X date', 'tail_num X date',
    'tail_num X dest airport', 'tail_num X origin airport',
    'mkt_carrier X dest_airport', 'mkt_carrier X origin_airport',
    '7d roll taxi_out X tail_num', '7d roll taxi_in X tail_num',

    # add_arr_delay_Ndays_roll
    *[
        f'7 days roll arr_delay_per_{key}'
        for key in ('origin_airport_id', 'dest_airport_id',
                    'tail_num', 'mkt_carrier_fl_num')
    ],
]

In [20]:
# Fill missing 'arr_delay' values via the project helper (by its name, with the
# column mean).
# NOTE(review): 'arr_delay' is the prediction target and this runs before the
# train/test split, so imputed labels end up in both splits — confirm this is
# intended rather than dropping rows without a label.
flights_data = local.replace_nan_with_mean(flights_data, 'arr_delay')

In [21]:
# Preview the first five rows of the fully engineered frame.
flights_data.head(n=5)

Unnamed: 0,fl_date,mkt_carrier,mkt_carrier_fl_num,tail_num,op_carrier_fl_num,origin_airport_id,origin,origin_city_name,dest_airport_id,dest,dest_city_name,crs_dep_time,dep_time,dep_delay,taxi_out,wheels_off,wheels_on,taxi_in,crs_arr_time,arr_time,arr_delay,cancelled,cancellation_code,diverted,dup,crs_elapsed_time,actual_elapsed_time,air_time,flights,distance,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay,first_dep_time,total_add_gtime,longest_add_gtime,no_name,7d taxi_out by origin_airport_id,7d taxi_in by dest_airport_id,7d taxi_out by mkt_carrier_fl_num,7d taxi_in by mkt_carrier_fl_num,7d roll flts origin_airport_id,7d roll flts dest_airport_id,month_1,month_2,month_3,month_4,month_5,month_6,month_7,month_8,month_9,month_10,month_11,month_12,origin_city_wspd,origin_visibility,origin_wspd/visib,dest_city_wspd,dest_visibility,dest_wspd/visib,origin_cond_Overcast,origin_cond_Partially cloudy,origin_cond_Rain,origin_cond_Snow,dest_cond_Overcast,dest_cond_Partially cloudy,dest_cond_Rain,dest_cond_Snow,7 days roll dep_time,7 days roll dep_delay_per_tail_num,Type_Federal holiday,7 days roll arr_delay_per_origin_airport_id,7 days roll arr_delay_per_dest_airport_id,7 days roll arr_delay_per_tail_num,7 days roll arr_delay_per_mkt_carrier_fl_num,orig_air X date,dest_air X date,mkt_carrier_fl_num X date,tail_num X date,tail_num X dest airport,tail_num X origin airport,mkt_carrier X dest_airport,mkt_carrier X origin_airport,7d roll taxi_out X tail_num,7d roll taxi_in X tail_num
0,737060,UA,3683,N751YX,3683,13487,MSP,"Minneapolis, MN",12266,IAH,"Houston, TX",530,527.0,-3.0,31.0,558.0,842.0,8.0,835,850.0,15.0,0.0,,0.0,N,185.0,203.0,164.0,1.0,1034.0,0.0,0.0,15.0,0.0,0.0,,,,,16.347962,6.02381,21.988095,11.142857,15.285714,3.0,1,0,0,0,0,0,0,0,0,0,0,0,24.2,16.0,1.51218,18.4,12.0,1.532889,0,0,0,0,0,1,0,0,12.502838,23.404762,1,8.4409,7.083333,22.464286,-0.785714,9940728220,9040777960,2714591980,-5876437948029505976,-8958774546702137904,-8306064417713355976,45175678,49672621,1.312062e+17,4.834615e+16
1,737060,DL,1971,N847DN,1971,14869,SLC,"Salt Lake City, UT",11433,DTW,"Detroit, MI",931,928.0,-3.0,21.0,949.0,1447.0,9.0,1503,1456.0,-7.0,0.0,,0.0,N,212.0,208.0,178.0,1.0,1481.0,,,,,,,,,,14.273368,5.310005,18.952381,7.404762,89.714286,50.857143,1,0,0,0,0,0,0,0,0,0,0,0,14.1,16.0,0.881324,27.6,14.7,1.876954,0,1,0,1,0,1,1,0,22.987325,8.4,1,18.228686,3.259983,0.078571,-10.047619,10959345140,8426806980,1452745260,-5876437948029505976,-2273143371154838136,5470827911421647848,22534443,29306799,1.145558e+17,4.261727e+16
2,737060,DL,2039,N812DN,2039,14771,SFO,"San Francisco, CA",10397,ATL,"Atlanta, GA",2245,2240.0,-5.0,17.0,2257.0,603.0,6.0,620,609.0,-11.0,0.0,,0.0,N,275.0,269.0,246.0,1.0,2139.0,,,,,,,,,,13.441872,3.868603,15.908163,6.842687,37.428571,33.0,1,0,0,0,0,0,0,0,0,0,0,0,32.5,16.0,2.030606,9.4,13.8,0.68139,0,1,0,0,0,1,1,0,11.668305,4.964286,1,8.501651,21.601883,1.116667,-0.876701,10887113260,7663212820,1502865340,-5876437948029505976,-1708973536738680280,-8934063540392418216,21199483,30118069,1.078824e+17,3.10488e+16
3,737060,DL,1972,N339NB,1972,10423,AUS,"Austin, TX",12478,JFK,"New York, NY",805,805.0,0.0,14.0,819.0,1208.0,7.0,1237,1215.0,-22.0,0.0,,0.0,N,212.0,190.0,169.0,1.0,1521.0,,,,,,,,,,11.428571,5.365101,12.642857,7.059524,2.0,92.571429,1,0,0,0,0,0,0,0,0,0,0,0,22.2,13.9,1.596693,20.6,15.0,1.373085,0,1,0,0,0,1,1,0,-9.5,9.107143,1,-20.928571,11.865841,4.614286,-2.928571,7682376380,9197034680,1453482320,-5876437948029505976,-385022516707894928,-6922407595829422728,24606616,20554156,9.172393e+16,4.305946e+16
4,737060,DL,5890,N202JQ,5890,11066,CMH,"Columbus, OH",12953,LGA,"New York, NY",1207,1158.0,-9.0,13.0,1211.0,1324.0,6.0,1349,1330.0,-19.0,0.0,,0.0,N,102.0,92.0,73.0,1.0,479.0,,,,,,,,,,22.216591,5.540921,18.52381,15.785714,709.428571,13.0,1,0,0,0,0,0,0,0,0,0,0,0,37.5,15.9,2.357637,20.6,15.0,1.373085,1,0,1,0,0,1,1,0,11.903441,12.428571,1,7.487871,2.450654,13.357143,7.880952,8156305960,9547138180,4341283400,-5876437948029505976,-6234692700580939512,-3889473068343684016,76293170,65178740,1.783069e+17,4.447057e+16


In [22]:
# Keep only the modelling columns (the target 'arr_delay' is still among them).
flights_data = flights_data.loc[:, features_list]

In [23]:
# Rows with missing values are currently kept; uncomment the next line to drop them instead.
#flights_data = flights_data.dropna()

In [24]:
# Take the target out of the predictor list. Rebuilding the list (rather than
# calling list.remove) keeps this cell idempotent: running it a second time no
# longer raises ValueError because 'arr_delay' is already gone.
features_list = [column for column in features_list if column != 'arr_delay']

In [25]:
# Target: arrival delay. Predictors: every remaining column in features_list.
y = flights_data['arr_delay']
X = flights_data.loc[:, features_list]

In [26]:
# Standardising scaler (scikit-learn defaults) applied to the predictors below.
scaler = StandardScaler()

In [27]:
# Standardise the predictors; X becomes a NumPy array from here on.
# NOTE(review): the scaler is fitted on ALL rows before the train/test split,
# so test-set statistics leak into the transform. Tree boosters are insensitive
# to per-feature rescaling, so this step may be unnecessary — if it is kept,
# fit on the training split only.
X = scaler.fit_transform(X)

In [28]:
# Split into train and test parts with the project helper, 75% requested for training.
# NOTE(review): whether quick_split shuffles or splits chronologically is not
# visible here — confirm, since the rolling features are time-dependent.
X_train, X_test, y_train, y_test = local.quick_split(X,y,train_ratio=0.75)

In [29]:
# xgb_r = xgb.XGBRegressor(
#     objective='reg:squarederror',
#     n_estimators=3438,
#     random_state=17,    
#     verbosity=1,
#     n_jobs=5,
#     booster='gbtree'
# )    

In [30]:
# Wrap both splits in XGBoost DMatrix objects for the native training API.
dtrain = xgb.DMatrix(data=X_train, label=y_train)
dtest = xgb.DMatrix(data=X_test, label=y_test)

In [31]:
# Booster configuration passed to xgb.train.
params = dict(
    # Hyper-parameters we intend to tune
    max_depth=10,
    min_child_weight=1,
    eta=0.3,
    subsample=1,
    colsample_bytree=0.5,
    # Fixed settings
    objective='reg:squarederror',
    eval_metric='mae',
    booster='gbtree',
    random_state=17,
    verbosity=1,
    n_jobs=5,
)

In [32]:
# Upper bound on boosting rounds; early stopping normally ends training sooner.
num_boost_round = 999

In [None]:
# Train with the native API, stopping once the "Test" MAE has not improved for
# 30 consecutive rounds.
# NOTE(review): early stopping monitors the same split that is later used for
# the final RMSE / R2, so those scores are optimistic — consider a separate
# validation split.
xgb_r = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=num_boost_round,
    early_stopping_rounds=30,
    evals=[(dtest, "Test")],
)

[0]	Test-mae:23.10740
[1]	Test-mae:23.25806
[2]	Test-mae:23.44677
[3]	Test-mae:23.53321
[4]	Test-mae:23.61387
[5]	Test-mae:23.66853
[6]	Test-mae:23.71343
[7]	Test-mae:23.65865
[8]	Test-mae:23.66738
[9]	Test-mae:23.67387


In [None]:
# With the native API, Booster.predict uses the full model, including the
# rounds trained after the best one. Restrict prediction to the best
# early-stopped round when it is recorded; otherwise fall back to all trees.
best_round = getattr(xgb_r, 'best_iteration', None)
if best_round is None:
    y_pred = xgb_r.predict(dtest)
else:
    y_pred = xgb_r.predict(dtest, iteration_range=(0, best_round + 1))

In [None]:
# Root-mean-squared error of the predictions on the test split.
rmse = np.sqrt(MSE(y_true=y_test, y_pred=y_pred))
print("RMSE : % f" % (rmse,))


In [None]:
# Coefficient of determination on the test split.
r2_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Persist the trained booster as JSON next to the notebook.
xgb_r.save_model(fname='model_xgboost.json')

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Pairwise correlations of the selected columns (target included), drawn as a heatmap.
corr = flights_data.corr()
sns.heatmap(data=corr)

In [None]:
# Display the full correlation matrix as a table.
corr

In [None]:
# Rank every column by the absolute value of its correlation with the target.
corr.loc['arr_delay'].abs().sort_values(ascending=False)