In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
import xgboost as xgb

from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error as MSE 
import local_python_packages.features_adding as local

In [2]:
pd.set_option('display.max_columns', None)
flights_data = pd.read_csv('All flights 2019_final.csv',parse_dates=[0])
flights_data = flights_data.sort_values(['fl_date'])

In [3]:
df_weather = pd.read_csv('cities_and_dates_weather_final.csv',parse_dates=[1])

In [4]:
df_holidays = pd.read_csv('US holidays.csv',parse_dates=[0])

Adding additional columns from our feature engineering

In [5]:
flights_data = local.add_taxi_Ndays_rolling(flights_data, 30)

In [6]:
flights_data = local.add_traffic_rolling(flights_data, 30)

In [7]:
flights_data = local.make_month_dummies(flights_data, 'fl_date')

In [8]:
flights_data = local.merging_weather_flights(flights_data,df_weather)

In [9]:
flights_data = local.add_dep_delay_Ndays_rolling(flights_data, 30)

In [None]:
flights_data = local.add_dep_delay_Ndays_roll_per_tail_num(flights_data, 30)

In [10]:
flights_data = local.add_US_holidays(flights_data, df_holidays)

In [11]:
flights_data = local.make_dates_ordinal(flights_data, 'fl_date')

In [12]:
flights_data['orig_air by date']= flights_data['fl_date'] * flights_data['origin_airport_id'] 

In [13]:
flights_data['dest_air by date']= flights_data['fl_date'] * flights_data['dest_airport_id'] 

In [14]:
flights_data['mkt_carrier_fl_num by date']= flights_data['fl_date'] * flights_data['mkt_carrier_fl_num']  

In [15]:
flights_data['tail_num by date'] = flights_data['fl_date'] * hash(str(flights_data['tail_num']))

In [16]:
flights_data['tail_num by dest airport'] = flights_data['dest_airport_id'] * hash(str(flights_data['tail_num']))

In [17]:
flights_data['tail_num by origin airport'] = flights_data['origin_airport_id'] * hash(str(flights_data['tail_num']))

In [18]:
flights_data['mkt_carrier by dest_airport']= flights_data['mkt_carrier_fl_num']  * flights_data['dest_airport_id'] 

In [19]:
flights_data['mkt_carrier by origin_airport']= flights_data['mkt_carrier_fl_num']  * flights_data['origin_airport_id'] 

In [20]:
features_list = ['fl_date','taxi_out','taxi_in', 'arr_delay',
                 'crs_elapsed_time',
       'air_time', 'distance',
       '30d taxi_out', '30d taxi_in',
       '30d roll flts origin_airport_id', '30d roll flts dest_airport_id',
       'month_1', 'month_2', 'month_3', 'month_4', 'month_5', 
                 'month_6',
       'month_7', 'month_8', 'month_9', 'month_10', 'month_11', 'month_12',
       'origin_city_wspd', 'origin_visibility',
       'dest_city_wspd', 'dest_visibility', 'origin_cond_Overcast',
       'origin_cond_Partially cloudy', 'origin_cond_Rain', 'origin_cond_Snow',
       'dest_cond_Overcast', 'dest_cond_Partially cloudy', 'dest_cond_Rain',
       'dest_cond_Snow','30 days roll dep_time', 'Type_Federal holiday', 'dest_air by date' , 'orig_air by date',
                'origin_wspd/visib', 'dest_wspd/visib', 'mkt_carrier_fl_num by date', 'tail_num by date',
                 'tail_num by dest airport', 'tail_num by origin airport', 'mkt_carrier by dest_airport', 
                 'mkt_carrier by origin_airport', '30 days roll dep_delay_per_tail_num']

In [21]:
flights_data = local.replace_nan_with_mean(flights_data, 'arr_delay')

In [22]:
flights_data.head()

Unnamed: 0,fl_date,mkt_carrier,mkt_carrier_fl_num,tail_num,op_carrier_fl_num,origin_airport_id,origin,origin_city_name,dest_airport_id,dest,dest_city_name,crs_dep_time,dep_time,dep_delay,taxi_out,wheels_off,wheels_on,taxi_in,crs_arr_time,arr_time,arr_delay,cancelled,cancellation_code,diverted,dup,crs_elapsed_time,actual_elapsed_time,air_time,flights,distance,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay,first_dep_time,total_add_gtime,longest_add_gtime,no_name,30d taxi_out,30d taxi_in,30d roll flts origin_airport_id,30d roll flts dest_airport_id,month_1,month_2,month_3,month_4,month_5,month_6,month_7,month_8,month_9,month_10,month_11,month_12,origin_city_wspd,origin_visibility,origin_wspd/visib,dest_city_wspd,dest_visibility,dest_wspd/visib,origin_cond_Overcast,origin_cond_Partially cloudy,origin_cond_Rain,origin_cond_Snow,dest_cond_Overcast,dest_cond_Partially cloudy,dest_cond_Rain,dest_cond_Snow,30 days roll dep_time,Type_Federal holiday,orig_air by date,dest_air by date,mkt_carrier_fl_num by date,tail_num by date,tail_num by dest airport,tail_num by origin airport,mkt_carrier by dest_airport,mkt_carrier by origin_airport
0,737060,UA,3683,N751YX,3683,13487,MSP,"Minneapolis, MN",12266,IAH,"Houston, TX",530,527.0,-3.0,31.0,558.0,842.0,8.0,835,850.0,15.0,0.0,,0.0,N,185.0,203.0,164.0,1.0,1034.0,0.0,0.0,15.0,0.0,0.0,,,,,,,,,1,0,0,0,0,0,0,0,0,0,0,0,24.2,16.0,1.51218,18.4,12.0,1.532889,0,0,0,0,0,1,0,0,,1,9940728220,9040777960,2714591980,-2551484499696702092,3520498834488589042,-7435321552698582149,45175678,49672621
1,737060,DL,1971,N847DN,1971,14869,SLC,"Salt Lake City, UT",11433,DTW,"Detroit, MI",931,928.0,-3.0,21.0,949.0,1447.0,9.0,1503,1456.0,-7.0,0.0,,0.0,N,212.0,208.0,178.0,1.0,1481.0,,,,,,,,,,,,,,1,0,0,0,0,0,0,0,0,0,0,0,14.1,16.0,0.881324,27.6,14.7,1.876954,0,1,0,1,0,1,1,0,,1,10959345140,8426806980,1452745260,-2551484499696702092,4740192803375697341,6950537143119898137,22534443,29306799
2,737060,DL,2039,N812DN,2039,14771,SFO,"San Francisco, CA",10397,ATL,"Atlanta, GA",2245,2240.0,-5.0,17.0,2257.0,603.0,6.0,620,609.0,-11.0,0.0,,0.0,N,275.0,269.0,246.0,1.0,2139.0,,,,,,,,,,,,,,1,0,0,0,0,0,0,0,0,0,0,0,32.5,16.0,2.030606,9.4,13.8,0.68139,0,1,0,0,0,1,1,0,,1,10887113260,7663212820,1502865340,-2551484499696702092,-2733726935110537663,2753620180939663439,21199483,30118069
3,737060,DL,1972,N339NB,1972,10423,AUS,"Austin, TX",12478,JFK,"New York, NY",805,805.0,0.0,14.0,819.0,1208.0,7.0,1237,1215.0,-22.0,0.0,,0.0,N,212.0,190.0,169.0,1.0,1521.0,,,,,,,,,,,,,,1,0,0,0,0,0,0,0,0,0,0,0,22.2,13.9,1.596693,20.6,15.0,1.373085,0,1,0,0,0,1,1,0,,1,7682376380,9197034680,1453482320,-2551484499696702092,7329045384675755478,3650239137140008739,24606616,20554156
4,737060,DL,5890,N202JQ,5890,11066,CMH,"Columbus, OH",12953,LGA,"New York, NY",1207,1158.0,-9.0,13.0,1211.0,1324.0,6.0,1349,1330.0,-19.0,0.0,,0.0,N,102.0,92.0,73.0,1.0,479.0,,,,,,,,,,,,,,1,0,0,0,0,0,0,0,0,0,0,0,37.5,15.9,2.357637,20.6,15.0,1.373085,1,0,1,0,0,1,1,0,,1,8156305960,9547138180,4341283400,-2551484499696702092,-6586994047768396691,-3071085502995169662,76293170,65178740


In [23]:
flights_data = flights_data[features_list]

In [24]:
flights_data = flights_data.dropna()

In [25]:
features_list.remove('arr_delay')

In [26]:
X = flights_data[features_list]
y = flights_data['arr_delay']

In [27]:
scaler = StandardScaler()

In [28]:
X = scaler.fit_transform(X)

In [29]:
X_train, X_test, y_train, y_test = local.quick_split(X,y,train_ratio=0.75)

In [30]:
# xgb_r = xgb.XGBRegressor(
#     objective='reg:squarederror',
#     n_estimators=3438,
#     random_state=17,    
#     verbosity=1,
#     n_jobs=5,
#     booster='gbtree'
# )    

In [31]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

In [32]:
params = {
    # Parameters that we are going to tune.
    'max_depth':10,
    'min_child_weight': 1,
    'eta':.3,
    'subsample': 1,
    'colsample_bytree': 0.5,
    # Other parameters
    'objective':'reg:squarederror',
    'eval_metric':'mae',
    'booster':'gbtree',
    'n_estimators':3438,
    'random_state':17,
    'verbosity':1,
    'n_jobs':5,
    'random_state':17
}

In [33]:
num_boost_round=999

In [34]:
xgb_r = xgb.train(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=[(dtest, "Test")],
    early_stopping_rounds=30
)

Parameters: { n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	Test-mae:23.93173
[1]	Test-mae:24.06104
[2]	Test-mae:24.10266
[3]	Test-mae:24.19500
[4]	Test-mae:23.48180
[5]	Test-mae:23.11817
[6]	Test-mae:23.08426
[7]	Test-mae:22.94894
[8]	Test-mae:22.86941
[9]	Test-mae:22.85224
[10]	Test-mae:22.82483
[11]	Test-mae:22.76821
[12]	Test-mae:22.74504
[13]	Test-mae:22.70424
[14]	Test-mae:22.62978
[15]	Test-mae:22.58859
[16]	Test-mae:22.51688
[17]	Test-mae:22.49721
[18]	Test-mae:22.48144
[19]	Test-mae:22.46474
[20]	Test-mae:22.30069
[21]	Test-mae:22.28731
[22]	Test-mae:22.12900
[23]	Test-mae:22.12349
[24]	Test-mae:22.10635
[25]	Test-mae:22.03386
[26]	Test-mae:22.01291
[27]	Test-mae:22.00008
[28]	Test-mae:21.95119
[29]	Test-mae:21.91033
[30]	Test-mae:21.87525
[31]	Test

In [35]:
y_pred = xgb_r.predict(dtest) 

In [36]:
rmse = np.sqrt(MSE(y_test, y_pred)) 
print("RMSE : % f" %(rmse)) 


RMSE :  48.621521


In [37]:
r2_score(y_test, y_pred)

0.15166993746571278