In [77]:
# import libraries
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, mean_absolute_percentage_error
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

In [78]:
data = pd.read_csv("merge_df_fl_num.csv", sep=',', index_col=0)
data.head(3)

Unnamed: 0,fl_date,mkt_carrier_fl_num,origin_airport_id,dest_airport_id,carrier,origin,destination,distance,crs_dep_time,crs_arr_time,...,day_of_week,day_of_month,month,arr_delay,binary_delay,avg_month_payload_carrier,avg_month_psngr_carrier,avg_domest_cost_month_carrier,avg_domest_gallons_month_carrier,avg_dep_scheduled_monthly_airport
0,2018-06-19,1673,14771,14747,Alaska Airlines,"San Francisco, CA","Seattle, WA",679,1830,2035,...,1,19,6,2,1,553850.0,3591.942804,80408619.0,43054527.0,16430
1,2019-06-21,1138,14771,14100,Alaska Airlines,"San Francisco, CA","Philadelphia, PA",2521,2205,645,...,4,21,6,-20,0,553850.0,3591.942804,80408619.0,43054527.0,16430
2,2018-06-03,1743,14771,14747,Alaska Airlines,"San Francisco, CA","Seattle, WA",679,800,1005,...,6,3,6,2,1,553850.0,3591.942804,80408619.0,43054527.0,16430


In [79]:
# save the columns in case we need them 
date, carrier, origin, destination = data[['fl_date', 'carrier', 'origin', 'destination']]

In [80]:
# data with categorical data dropped
data_dropped = data.copy()
data_dropped.drop(labels=['fl_date', 'mkt_carrier_fl_num', 'origin_airport_id', 'dest_airport_id', 'avg_domest_cost_month_carrier', 'avg_domest_gallons_month_carrier', 'taxi_out', 'origin', 'destination', 'binary_delay'], axis=1, inplace = True)

# get target column
y = data_dropped.arr_delay
data_dropped.drop('arr_delay', inplace=True, axis = 1)
data_dropped = pd.get_dummies(data_dropped)

data_dropped['crs_dep_time'] = pd.qcut(data_dropped['crs_dep_time'], 24, labels=[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24])
data_dropped['crs_arr_time'] = pd.qcut(data_dropped['crs_arr_time'], 24, labels=[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24])
data_dropped.columns
data_dropped.head(3)

Unnamed: 0,distance,crs_dep_time,crs_arr_time,day_of_week,day_of_month,month,avg_month_payload_carrier,avg_month_psngr_carrier,avg_dep_scheduled_monthly_airport,carrier_Alaska Airlines,...,carrier_Delta Airlines,carrier_ExpressJet,carrier_Frontier Airlines,carrier_JetBlue Airways,carrier_Republic Airways,carrier_SkyWest Airlines,carrier_Southwest Airlines,carrier_Spirit Airlines,carrier_United Airlines,carrier_ZW
0,679,20,20,1,19,6,553850.0,3591.942804,16430,1,...,0,0,0,0,0,0,0,0,0,0
1,2521,24,2,4,21,6,553850.0,3591.942804,16430,1,...,0,0,0,0,0,0,0,0,0,0
2,679,5,5,6,3,6,553850.0,3591.942804,16430,1,...,0,0,0,0,0,0,0,0,0,0


In [81]:
# split data
X_train, X_test, y_train, y_test = train_test_split(data_dropped, y, test_size=0.2, random_state = 4)

In [82]:
scaler = StandardScaler()
scaler.fit_transform(X_train)
scaler.fit(X_test)

In [83]:
regr = LinearRegression()
regr.fit(X_train, y_train)

In [84]:
y_pred = regr.predict(X_test)

In [85]:
def get_evaluators(y_test, y_pred):
    print(f'Mean Squared Error: {mean_squared_error(y_test, y_pred)}')
    print(f'Mean Absolute Error: {mean_absolute_error(y_test, y_pred)}')
    print(f'r2_score: {r2_score(y_test, y_pred)}')

In [86]:
get_evaluators(y_test, y_pred)

Mean Squared Error: 328.75798050135023
Mean Absolute Error: 13.816843994740857
r2_score: 0.030756134938464852


In [87]:
def get_cv_scores(model):
    scores = cross_val_score(model,
                             X_train,
                             y_train,
                             cv=5,
                             scoring='r2')
    
    print('CV Mean: ', np.mean(scores))
    print('STD: ', np.std(scores))
    print('\n')

In [88]:
get_cv_scores(regr)

CV Mean:  0.017135659598413476
STD:  0.004017216957957514




In [89]:
# Train model with default alpha=1
ridge = Ridge(alpha=1).fit(X_train, y_train)
# get cross val scores
get_cv_scores(ridge)

CV Mean:  0.01718943131363302
STD:  0.004006366682105704




In [90]:
# find optimal alpha with grid search
alpha = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
param_grid = dict(alpha=alpha)
grid = GridSearchCV(estimator=ridge, param_grid=param_grid, scoring='r2', verbose=1, n_jobs=-1)
grid_result = grid.fit(X_train, y_train)
print('Best Score: ', grid_result.best_score_)
print('Best Params: ', grid_result.best_params_)

Fitting 5 folds for each of 7 candidates, totalling 35 fits
Best Score:  0.018154644267294073
Best Params:  {'alpha': 100}


In [91]:
lasso = Lasso(alpha=1).fit(X_train, y_train)
get_cv_scores(lasso)
y_pred = lasso.predict(X_test)
get_evaluators(y_test, y_pred)

CV Mean:  0.014337178108704118
STD:  0.0029153586557029844


Mean Squared Error: 331.22450729411247
Mean Absolute Error: 13.884635935170841
r2_score: 0.02348432374699505


In [92]:
# find optimal alpha with grid search
alpha = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
param_grid = dict(alpha=alpha)
grid = GridSearchCV(estimator=lasso, param_grid=param_grid, scoring='r2', verbose=1, n_jobs=-1)
grid_result = grid.fit(X_train, y_train)
print('Best Score: ', grid_result.best_score_)
print('Best Params: ', grid_result.best_params_)

Fitting 5 folds for each of 7 candidates, totalling 35 fits
Best Score:  0.017920440519632774
Best Params:  {'alpha': 0.1}


In [93]:
# Train model with default alpha=1 and l1_ratio=0.5
elastic_net = ElasticNet(alpha=1, l1_ratio=0.5).fit(X_train, y_train)
# get cross val scores
get_cv_scores(elastic_net)
# find optimal alpha with grid search
alpha = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
l1_ratio = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
param_grid = dict(alpha=alpha, l1_ratio=l1_ratio)
grid = GridSearchCV(estimator=elastic_net, param_grid=param_grid, scoring='r2', verbose=1, n_jobs=-1)
grid_result = grid.fit(X_train, y_train)
print('Best Score: ', grid_result.best_score_)
print('Best Params: ', grid_result.best_params_)

CV Mean:  0.014356259471306521
STD:  0.003084323541144695


Fitting 5 folds for each of 77 candidates, totalling 385 fits
Best Score:  0.0180645682761267
Best Params:  {'alpha': 0.01, 'l1_ratio': 0}


  model = cd_fast.enet_coordinate_descent(


In [94]:
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree=3, include_bias=False)
poly_features = poly.fit_transform(data_dropped)

In [95]:
X_train, X_test, y_train, y_test = train_test_split(poly_features, y, test_size=0.3, random_state = 4)

In [96]:
lr = LinearRegression(n_jobs=-1)
lr.fit(X_train, y_train)

In [97]:
y_pred = lr.predict(X_test)

In [98]:
get_evaluators(y_test, y_pred)

Mean Squared Error: 370.7519974113753
Mean Absolute Error: 14.673078790236689
r2_score: -0.0836448558455507


In [99]:
# # data with dummy carriers
# data_dummies = data.copy()
# data_dummies.drop(labels=['fl_date', 'origin', 'destination'], axis = 1, inplace = True)
# data_dummies = pd.get_dummies(data_dummies)

# y = data_dummies.arr_delay
# data_dummies.drop('arr_delay', inplace=True, axis = 1)

# # split data
# X_train, X_test, y_train, y_test = train_test_split(data_dummies, y, test_size=0.3, random_state = 4)

# # scale data
# scaler = StandardScaler()
# scaler.fit_transform(X_train)
# scaler.fit(X_test)

# regr = LinearRegression()
# regr.fit(X_train, y_train)

# y_pred = regr.predict(X_test)

# # use metrics to evaluate model
# print(f'Mean Squared Error: {mean_squared_error(y_test, y_pred)}')
# print(f'Mean Absolute Error: {mean_absolute_error(y_test, y_pred)}')
# print(f'r2_score: {r2_score(y_test, y_pred)}')

In [100]:
# set the params for gridsearch
params = [
    
]

In [101]:
target = data_dropped.binary_delay
data_dropped.drop('binary_delay', inplace = True, axis = 1)

AttributeError: 'DataFrame' object has no attribute 'binary_delay'

In [None]:
scaler = StandardScaler()
scaler.fit_transform(X_train)
scaler.fit(X_test)

In [None]:
clf = LogisticRegression(solver = 'sag', penalty='l2')

clf.fit(X_train, y_train)