In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDRegressor

from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score
import local_python_packages.features_adding as local

In [2]:
pd.set_option('display.max_columns', None)
flights_data = pd.read_csv('UA flights 2019.csv',parse_dates=[0])
flights_data = flights_data.sort_values(['fl_date'])

In [3]:
df_weather = pd.read_csv('cities_and_dates_weather_final.csv',parse_dates=[1])

In [4]:
df_holidays = pd.read_csv('US holidays.csv',parse_dates=[0])

Adding additional columns from our feature engineering

In [5]:
flights_data = local.add_taxi_Ndays_rolling(flights_data, 30)

In [6]:
flights_data = local.add_traffic_rolling(flights_data, 30)

In [7]:
flights_data = local.make_month_dummies(flights_data, 'fl_date')

In [8]:
flights_data = local.merging_weather_flights(flights_data,df_weather)

In [9]:
flights_data = local.add_dep_delay_Ndays_rolling(flights_data, 30)

In [10]:
flights_data = local.add_US_holidays(flights_data, df_holidays)

In [11]:
flights_data = local.make_dates_ordinal(flights_data, 'fl_date')

In [12]:
flights_data['orig_air by date']= flights_data['fl_date'] * flights_data['origin_airport_id'] 

In [13]:
flights_data['dest_air by date']= flights_data['fl_date'] * flights_data['dest_airport_id'] 

In [14]:
features_list = ['fl_date','taxi_out','taxi_in', 'arr_delay',
                 'crs_elapsed_time',
       'air_time', 'distance',
       '30d taxi_out', '30d taxi_in',
       '30d roll flts origin_airport_id', '30d roll flts dest_airport_id',
       'month_1', 'month_2', 'month_3', 'month_4', 'month_5', 'month_6',
       'month_7', 'month_8', 'month_9', 'month_10', 'month_11', 'month_12',
       'origin_city_wspd', 'origin_visibility',
       'dest_city_wspd', 'dest_visibility', 'origin_cond_Overcast',
       'origin_cond_Partially cloudy', 'origin_cond_Rain', 'origin_cond_Snow',
       'dest_cond_Overcast', 'dest_cond_Partially cloudy', 'dest_cond_Rain',
       'dest_cond_Snow','30 days roll dep_time', 'Type_Federal holiday', 'dest_air by date' , 'orig_air by date']

In [15]:
flights_data = local.replace_nan_with_mean(flights_data, 'arr_delay')

In [16]:
flights_data = flights_data[features_list]

In [17]:
flights_data = flights_data.dropna()

In [33]:
flights_data.shape

(1514251, 39)

In [18]:
features_list.remove('arr_delay')

In [19]:
X = flights_data[features_list]
y = flights_data['arr_delay']

In [20]:
scaler = StandardScaler()

In [21]:
X.columns

Index(['fl_date', 'taxi_out', 'taxi_in', 'crs_elapsed_time', 'air_time',
       'distance', '30d taxi_out', '30d taxi_in',
       '30d roll flts origin_airport_id', '30d roll flts dest_airport_id',
       'month_1', 'month_2', 'month_3', 'month_4', 'month_5', 'month_6',
       'month_7', 'month_8', 'month_9', 'month_10', 'month_11', 'month_12',
       'origin_city_wspd', 'origin_visibility', 'dest_city_wspd',
       'dest_visibility', 'origin_cond_Overcast',
       'origin_cond_Partially cloudy', 'origin_cond_Rain', 'origin_cond_Snow',
       'dest_cond_Overcast', 'dest_cond_Partially cloudy', 'dest_cond_Rain',
       'dest_cond_Snow', '30 days roll dep_time', 'Type_Federal holiday',
       'dest_air by date', 'orig_air by date'],
      dtype='object')

In [22]:
X = scaler.fit_transform(X)

In [23]:
X_train, X_test, y_train, y_test = local.quick_split(X,y,train_ratio=0.75)

In [24]:
sgd = SGDRegressor(max_iter = 30000)

In [25]:
parameters = {'penalty':  ['l2', 'l1', 'elasticnet'],
             'alpha':[0.0001, 0.001, 0.01],
             'loss':['squared_loss', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive']}

In [26]:
GrdSrch = GridSearchCV(sgd, param_grid=parameters, scoring=['neg_mean_absolute_error', 'r2'],
                       refit='r2', n_jobs = 5, verbose= 4)

In [27]:
GrdSrch.fit(X_train, y_train)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done  15 tasks      | elapsed:   58.9s
[Parallel(n_jobs=5)]: Done  88 tasks      | elapsed:  5.4min
[Parallel(n_jobs=5)]: Done 180 out of 180 | elapsed: 10.0min finished


GridSearchCV(estimator=SGDRegressor(max_iter=30000), n_jobs=5,
             param_grid={'alpha': [0.0001, 0.001, 0.01],
                         'loss': ['squared_loss', 'huber',
                                  'epsilon_insensitive',
                                  'squared_epsilon_insensitive'],
                         'penalty': ['l2', 'l1', 'elasticnet']},
             refit='r2', scoring=['neg_mean_absolute_error', 'r2'], verbose=4)

In [28]:
y_pred=GrdSrch.best_estimator_.predict(X_test)

In [29]:
r2_score(y_test, y_pred)

0.08920857002449101

In [30]:
GrdSrch.best_score_

0.08635048222366079

In [31]:
GrdSrch.best_params_

{'alpha': 0.0001, 'loss': 'squared_loss', 'penalty': 'l1'}

In [32]:
GrdSrch.best_estimator_.coef_

array([ 10.27389637,  13.83027345,   5.88736327, -59.18750438,
        60.278489  ,  -1.65003579,  -2.68786197,   1.50334959,
        -1.19308782,  -2.34551094,   4.61839131,   2.79940852,
         2.31482004,   1.81250315,   2.83184016,   1.79059622,
        -0.63212824,  -0.90803707,  -2.84180183,  -3.04221643,
        -4.39174202,  -3.66704292,   0.83592205,  -1.15178103,
         2.04322654,  -2.79349925,  -1.11239894,  -0.5016029 ,
         2.55849128,   1.68498746,  -1.18546484,   0.        ,
         1.86670831,   2.60146736,   7.31974385,  -0.67574266,
         0.        ,   0.        ])