In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDRegressor

from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score
import local_python_packages.features_adding as local

In [2]:
pd.set_option('display.max_columns', None)
flights_data = pd.read_csv('UA flights 2019.csv',parse_dates=[0])
flights_data = flights_data.sort_values(['fl_date'])

In [3]:
df_weather = pd.read_csv('cities_and_dates_weather_final.csv',parse_dates=[1])

Adding additional columns from our feature engineering

In [4]:
flights_data = local.add_taxi_Ndays_rolling(flights_data, 30)

In [5]:
flights_data = local.add_traffic_rolling(flights_data, 30)

In [6]:
flights_data = local.make_month_dummies(flights_data, 'fl_date')

In [7]:
flights_data = local.merging_weather_flights(flights_data,df_weather)

In [8]:
flights_data = local.add_dep_delay_Ndays_rolling(flights_data, 30)

In [9]:
flights_data = local.make_dates_ordinal(flights_data, 'fl_date')

In [10]:
features_list = ['fl_date','taxi_out','taxi_in', 'arr_delay',
                 'crs_elapsed_time',
       'air_time', 'distance',
       '30d taxi_out', '30d taxi_in',
       '30d roll flts origin_airport_id', '30d roll flts dest_airport_id',
       'month_1', 'month_2', 'month_3', 'month_4', 'month_5', 'month_6',
       'month_7', 'month_8', 'month_9', 'month_10', 'month_11', 'month_12',
       'origin_city_wspd', 'origin_visibility',
       'dest_city_wspd', 'dest_visibility', 'origin_cond_Overcast',
       'origin_cond_Partially cloudy', 'origin_cond_Rain', 'origin_cond_Snow',
       'dest_cond_Overcast', 'dest_cond_Partially cloudy', 'dest_cond_Rain',
       'dest_cond_Snow','30 days roll dep_time']

In [11]:
flights_data = local.distill_features(flights_data,desired_features=features_list)

In [12]:
flights_data.shape

(1571606, 36)

In [13]:
flights_data = local.replace_nan_with_mean(flights_data, 'arr_delay')

In [14]:
features_list.remove('arr_delay')

In [15]:
flights_data.shape

(1571606, 36)

In [16]:
flights_data = flights_data.dropna()

In [17]:
flights_data.shape

(1514251, 36)

In [18]:
X = flights_data[features_list]
y = flights_data['arr_delay']

In [19]:
scaler = StandardScaler()

In [20]:
X = scaler.fit_transform(X)

In [21]:
X_train, X_test, y_train, y_test = local.quick_split(X,y,train_ratio=0.75)

In [22]:
sgd = SGDRegressor(max_iter = 10000)

In [23]:
parameters = {'penalty': ['l2', 'l1', 'elasticnet'],
             'alpha':[0.0001, 0.001, 0.01],
             'loss':['squared_loss', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive']}

In [24]:
GrdSrch = GridSearchCV(sgd, param_grid=parameters, scoring=['neg_mean_absolute_error', 'r2'],
                       refit='r2', n_jobs = 5, verbose= 4)

In [25]:
GrdSrch.fit(X_train, y_train)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done  15 tasks      | elapsed:  1.0min
[Parallel(n_jobs=5)]: Done  88 tasks      | elapsed:  5.6min
[Parallel(n_jobs=5)]: Done 180 out of 180 | elapsed: 10.1min finished


GridSearchCV(estimator=SGDRegressor(max_iter=10000), n_jobs=5,
             param_grid={'alpha': [0.0001, 0.001, 0.01],
                         'loss': ['squared_loss', 'huber',
                                  'epsilon_insensitive',
                                  'squared_epsilon_insensitive'],
                         'penalty': ['l2', 'l1', 'elasticnet']},
             refit='r2', scoring=['neg_mean_absolute_error', 'r2'], verbose=4)

In [26]:
y_pred=GrdSrch.best_estimator_.predict(X_test)

In [27]:
r2_score(y_test, y_pred)

0.08414369323109483

In [28]:
GrdSrch.best_score_

0.087820674715913

In [29]:
GrdSrch.best_params_

{'alpha': 0.0001, 'loss': 'squared_loss', 'penalty': 'elasticnet'}

In [30]:
GrdSrch.best_estimator_.coef_

array([  9.69485843,  14.75409729,   6.0224707 , -58.44652208,
        60.03921092,  -1.93288192,  -2.57331817,   1.26746892,
        -1.10569416,  -1.56538108,   3.45252772,   2.90712537,
         2.23416244,   2.04996265,   2.89644806,   2.31532022,
        -0.51456452,  -1.30414867,  -2.68310714,  -3.22105057,
        -3.9748244 ,  -3.65798429,   0.81828281,  -0.46040396,
         1.71971468,  -3.00343914,  -0.96767542,   0.        ,
         1.2354035 ,   1.17186729,  -1.21532707,   0.48126688,
         1.92505809,   1.43771803,   5.62776   ])

In [33]:
features_list[4]

'air_time'