In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
import xgboost as xgb

from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error as MSE 
import local_python_packages.features_adding as local

In [2]:
pd.set_option('display.max_columns', None)
flights_data = pd.read_csv('All flights 2019_final.csv',parse_dates=[0])
flights_data = flights_data.sort_values(['fl_date'])

In [3]:
df_weather = pd.read_csv('cities_and_dates_weather_final.csv',parse_dates=[1])

In [4]:
df_holidays = pd.read_csv('US holidays.csv',parse_dates=[0])

Adding additional columns from our feature engineering

In [5]:
# Feature-engineering pipeline: every cell below rebinds `flights_data` to the
# value returned by a helper from local_python_packages.features_adding, so the
# cells must be run once each, in this order, on a fresh kernel.
# NOTE(review): the meaning of the two `7` arguments passed to the rolling
# helpers is not visible in this notebook -- presumably window sizes in days;
# confirm against the helper signatures.
# The feature list later in this notebook attributes the
# '7d taxi_out by ...' / '7d taxi_in by ...' columns to this step.
flights_data = local.add_taxi_Ndays_rolling(flights_data, 7, 7)

In [6]:
# Attributed later in the notebook: the '7d roll flts ...' columns.
flights_data = local.add_traffic_rolling(flights_data, 7, 7)

In [7]:
# Attributed later in the notebook: the 'month_1' .. 'month_12' columns,
# built from the 'fl_date' column named here.
flights_data = local.make_month_dummies(flights_data, 'fl_date')

In [8]:
# Combines the flights frame with df_weather; the feature list attributes the
# origin_/dest_ wind-speed, visibility and condition columns to this step.
flights_data = local.merging_weather_flights(flights_data,df_weather)

In [9]:
# Attributed later in the notebook: '7 days roll dep_delay'.
flights_data = local.add_dep_delay_Ndays_rolling(flights_data, 7, 7)

In [10]:
# Attributed later in the notebook: '7 days roll dep_delay_per_tail_num'.
flights_data = local.add_dep_delay_Ndays_roll_per_tail_num(flights_data, 7, 7)

In [11]:
# Uses df_holidays; the feature list attributes 'Type_Federal holiday' to this step.
flights_data = local.add_US_holidays(flights_data, df_holidays)

In [12]:
# NOTE(review): judging by the helper name, 'fl_date' becomes an ordinal number
# here so it can be used as a numeric feature -- confirm in the helper.
flights_data = local.make_dates_ordinal(flights_data, 'fl_date')

In [13]:
# One call covering four grouping keys; the feature list attributes the
# '7 days roll arr_delay_per_<key>' columns to this step.
flights_data = local.add_arr_delay_Ndays_roll(flights_data, 7, 7, ['origin_airport_id', 'dest_airport_id', 'tail_num',
                                                                   'mkt_carrier_fl_num' ])

In [14]:
# NOTE(review): the saved output of this cell is `KeyError: '7d taxi_out'`, and
# every later cell is marked `In [None]` (never executed in the saved run).
# The taxi columns named in the feature list are '7d taxi_out by <key>', so the
# helper presumably looks up a column name that the earlier step does not
# produce -- verify inside add_polynomial_features before relying on the
# cells that follow.
flights_data = local.add_polynomial_features(flights_data)

KeyError: '7d taxi_out'

In [None]:
# Columns kept for modelling, grouped by the step that produces them.
# 'arr_delay' is the target; it is listed here so it survives the column
# selection and is taken out of the predictor list afterwards.
features_list = (
    # From initial dataset
    ['fl_date', 'arr_delay', 'crs_elapsed_time', 'air_time', 'distance']
    # From add_taxi_Ndays_rolling function
    + [
        '7d taxi_out by origin_airport_id',
        '7d taxi_in by dest_airport_id',
        '7d taxi_out by mkt_carrier_fl_num',
        '7d taxi_in by mkt_carrier_fl_num',
    ]
    # From add_traffic_rolling function
    + ['7d roll flts origin_airport_id', '7d roll flts dest_airport_id']
    # From make_month_dummies function: month_1 .. month_12
    + ['month_{}'.format(month) for month in range(1, 13)]
    # From merging_weather_flights function
    + [
        'origin_city_wspd',
        'origin_visibility',
        'dest_city_wspd',
        'dest_visibility',
        'origin_cond_Overcast',
        'origin_cond_Partially cloudy',
        'origin_cond_Rain',
        'origin_cond_Snow',
        'dest_cond_Overcast',
        'dest_cond_Partially cloudy',
        'dest_cond_Rain',
        'dest_cond_Snow',
        'origin_wspd/visib',
        'dest_wspd/visib',
    ]
    # From add_US_holidays function
    + ['Type_Federal holiday']
    # From add_dep_delay_Ndays_rolling function
    + ['7 days roll dep_delay']
    # From add_dep_delay_Ndays_roll_per_tail_num function
    + ['7 days roll dep_delay_per_tail_num']
    # From add_polynomial_features function
    + [
        'dest_air X date',
        'orig_air X date',
        'mkt_carrier_fl_num X date',
        'tail_num X date',
        'tail_num X dest airport',
        'tail_num X origin airport',
        'mkt_carrier X dest_airport',
        'mkt_carrier X origin_airport',
        '7d roll taxi_out X tail_num',
        '7d roll taxi_in X tail_num',
    ]
    # From add_arr_delay_Ndays_roll function
    + [
        '7 days roll arr_delay_per_origin_airport_id',
        '7 days roll arr_delay_per_dest_airport_id',
        '7 days roll arr_delay_per_tail_num',
        '7 days roll arr_delay_per_mkt_carrier_fl_num',
    ]
)

In [None]:
# Rebinds flights_data to the helper's result for the 'arr_delay' column.
# NOTE(review): 'arr_delay' is the prediction target (it becomes `y` further
# down). If the helper fills missing values with the column mean, as its name
# suggests, those rows get fabricated labels -- consider dropping them instead;
# confirm the helper's behaviour first.
flights_data = local.replace_nan_with_mean(flights_data, 'arr_delay')

In [None]:
# Preview the first rows after feature engineering.
flights_data.head()

In [None]:
# Keep only the modelling columns. This overwrites flights_data, so re-running
# this cell after 'arr_delay' has been taken out of features_list would also
# drop the target column from the frame.
flights_data = flights_data[features_list]

In [None]:
#flights_data = flights_data.dropna()

In [None]:
# Take the target out of the predictor list by rebuilding the list.
# list.remove() raises ValueError when this cell is executed a second time
# (the name is already gone); the comprehension is safe to re-run.
features_list = [column for column in features_list if column != 'arr_delay']

In [None]:
# Predictors are the columns named in features_list; the target is the
# arrival-delay column.
X, y = flights_data[features_list], flights_data['arr_delay']

In [None]:
scaler = StandardScaler()

In [None]:
X = scaler.fit_transform(X)

In [None]:
X_train, X_test, y_train, y_test = local.quick_split(X,y,train_ratio=0.75)

In [None]:
# xgb_r = xgb.XGBRegressor(
#     objective='reg:squarederror',
#     n_estimators=3438,
#     random_state=17,    
#     verbosity=1,
#     n_jobs=5,
#     booster='gbtree'
# )    

In [None]:
# Wrap each split, together with its labels, in an XGBoost DMatrix.
dtrain = xgb.DMatrix(data=X_train, label=y_train)
dtest = xgb.DMatrix(data=X_test, label=y_test)

In [None]:
# Booster configuration passed to xgb.train.
params = dict(
    # Parameters that we are going to tune.
    max_depth=10,
    min_child_weight=1,
    eta=.3,
    subsample=1,
    colsample_bytree=0.5,
    # Other parameters
    objective='reg:squarederror',
    eval_metric='mae',
    booster='gbtree',
    random_state=17,
    verbosity=1,
    n_jobs=5,
)

In [None]:
num_boost_round=999

In [None]:
xgb_r = xgb.train(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=[(dtest, "Test")],
    early_stopping_rounds=30
)

In [None]:
# Training ran with early stopping, so the booster also holds the rounds
# trained after the best one, and a native Booster.predict may use all of them.
# Restrict prediction to the rounds up to and including the best iteration.
y_pred = xgb_r.predict(dtest, iteration_range=(0, xgb_r.best_iteration + 1))

In [None]:
rmse = np.sqrt(MSE(y_test, y_pred)) 
print("RMSE : % f" %(rmse)) 


In [None]:
r2_score(y_test, y_pred)

In [None]:
xgb_r.save_model('model_xgboost.json')

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
corr=flights_data.corr()
sns.heatmap(corr)

In [None]:
corr

In [None]:
corr.loc['arr_delay', :].abs().sort_values( ascending = False)