In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt



In [2]:
df = pd.read_csv('flight_df_t.csv')
df.drop(columns=df.columns[0], axis=1, inplace=True)
df = df[df['arr_delay'] < 200]

In [3]:
df['crs_dep_time'] = pd.qcut(df['crs_dep_time'], 24, labels=[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24])

In [4]:
df.columns

Index(['fl_date', 'mkt_carrier_fl_num', 'origin_airport_id', 'dest_airport_id',
       'carrier', 'origin', 'destination', 'distance', 'crs_dep_time',
       'crs_arr_time', 'day_of_week', 'day_of_month', 'month', 'arr_delay',
       'binary_delay', 'avg_month_payload_carrier', 'avg_month_psngr_carrier',
       'avg_domest_cost_month_carrier', 'avg_domest_gallons_month_carrier',
       'avg_dep_scheduled_monthly_airport', 'avg_monthly_psngr_airport'],
      dtype='object')

## Data Sets

Numerical

In [5]:
numerical_test_data = df[['distance', 'crs_dep_time', 'day_of_week', 'day_of_month', 'month', 'avg_month_psngr_carrier',
       'avg_dep_scheduled_monthly_airport', 'avg_monthly_psngr_airport']]

Categorical

In [6]:
carrier_dummies = pd.get_dummies(df['carrier'], prefix='carrier', drop_first=True)
destination_dummies = pd.get_dummies(df['destination'], prefix='dest', drop_first=True)
origin_dummies = pd.get_dummies(df['origin'], prefix='origin', drop_first=True)

In [7]:
df_with_carrier_dummies = pd.concat([df, carrier_dummies], axis=1)
df_with_dest_dummies = pd.concat([df, destination_dummies], axis=1)
df_with_origin_dummies = pd.concat([df, origin_dummies], axis=1)

Weather

In [8]:
chicago_weather = pd.read_csv('chicago_2019_weather.csv', index_col=0)

In [9]:
chicago_df = df[df['origin'] == 'Chicago, IL']
chicago_df = df[df['fl_date'] > '2018-12-31']

In [10]:
chicago_weather_df = chicago_df.merge(right=chicago_weather, how='inner')

In [11]:
chicago_weather_df.columns
chicago = chicago_weather_df[['distance', 'arr_delay', 'crs_dep_time', 'day_of_week', 'day_of_month', 'month', 'avg_month_psngr_carrier',
       'avg_dep_scheduled_monthly_airport', 'avg_monthly_psngr_airport', 'weather']]

In [12]:
# chicago['weather'] = chicago.weather.replace({'sunny': 0,
#                                               'cloudy': 1,
#                                               'rainy': 2,
#                                               'snow': 3
#                                           })

Chicago Weather with Categorical

In [13]:
chicago_with_carrier = chicago_weather_df[['carrier', 'distance', 'arr_delay', 'crs_dep_time', 'day_of_week', 'month', 'avg_month_psngr_carrier',
       'avg_dep_scheduled_monthly_airport', 'avg_monthly_psngr_airport', 'weather']]

In [14]:
# chicago_with_carrier.head(1)
# chicago_with_carrier['weather'] = chicago_with_carrier.weather.replace({'sunny': 0,
#                                               'cloudy': 1,
#                                               'rainy': 2,
#                                               'snow': 3
#                                           })

In [15]:
chicago_carrier_dummies = pd.get_dummies(chicago_with_carrier['carrier'], prefix='carrier', drop_first=True)
chicago_day_of_week_dummies = pd.get_dummies(chicago_with_carrier['day_of_week'], prefix='day_of_week', drop_first=True)
chicago_weather_dummies = pd.get_dummies(chicago_with_carrier['weather'], prefix= 'weather', drop_first=True)
chicago_dummies = pd.concat([chicago_with_carrier, chicago_carrier_dummies, chicago_day_of_week_dummies,chicago_weather_dummies], axis=1)

In [16]:
Data = chicago_dummies

In [17]:
chicago_dummies.drop(['carrier','day_of_week','weather'], axis=1, inplace=True)

In [18]:
carrier_y = chicago_dummies['arr_delay']
carrier_X = chicago_dummies.drop('arr_delay', axis=1)

In [19]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [20]:
scaler2 = StandardScaler()
scaled_X_carrier = pd.DataFrame(scaler2.fit_transform(carrier_X))
scaled_X_carrier.columns = carrier_X.columns

In [21]:
X_train, X_test, y_train, y_test = train_test_split(scaled_X_carrier, carrier_y, test_size=0.30)

# Models & Hyperparmaeters 

In [22]:
#Data 
chicago.head(4)

Unnamed: 0,distance,arr_delay,crs_dep_time,day_of_week,day_of_month,month,avg_month_psngr_carrier,avg_dep_scheduled_monthly_airport,avg_monthly_psngr_airport,weather
0,416,24,8,0,18,3,1624.565831,35220,3171023,cloudy
1,783,-2,2,0,18,3,1624.565831,35220,3171023,cloudy
2,733,5,2,0,18,3,1624.565831,35220,3171023,cloudy
3,1120,1,20,0,18,3,5879.758107,35220,3171023,cloudy


In [23]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [24]:
y = carrier_y
X = carrier_X 

In [25]:
scaler = StandardScaler()
scaled_X = pd.DataFrame(scaler.fit_transform(X))
scaled_X.columns = X.columns

In [26]:
X_train, X_test, y_train, y_test = train_test_split(scaled_X, y, test_size=0.30)

# Linear Regression 

### DecisionTreeRegressor

In [27]:
from sklearn.tree import DecisionTreeRegressor

In [28]:
dtr = DecisionTreeRegressor()
dtr.fit(X_train, y_train)

DecisionTreeRegressor()

In [29]:
y_pred = dtr.predict(X_test)

In [30]:
print(mean_squared_error(y_test, y_pred))
print(r2_score(y_test, y_pred))

1985.5561139028475
-0.9230540015909532


### DecisionTreeRegressor GridSearch

In [31]:
from sklearn.model_selection import GridSearchCV
import datetime, warnings, scipy
warnings.filterwarnings("ignore")

In [32]:
tree_reg = DecisionTreeRegressor()

In [33]:
parameters={"splitter":["best","random"],
            "max_depth" : [10,20,30,40],
           "min_samples_leaf":[5,6,7,8,9,10],
           "max_features":["auto","log2","sqrt",None],
           "max_leaf_nodes":[5,6,7]}

In [34]:
grid = GridSearchCV(estimator=tree_reg,  param_grid=parameters, n_jobs=-1)

In [35]:
grid.fit(X_train, y_train)



GridSearchCV(estimator=DecisionTreeRegressor(), n_jobs=-1,
             param_grid={'max_depth': [10, 20, 30, 40],
                         'max_features': ['auto', 'log2', 'sqrt', None],
                         'max_leaf_nodes': [5, 6, 7],
                         'min_samples_leaf': [5, 6, 7, 8, 9, 10],
                         'splitter': ['best', 'random']})

In [36]:
y_pred = grid.predict(X_test)

In [37]:
print(mean_squared_error(y_test, y_pred))
print(mean_absolute_error(y_test, y_pred))
print(r2_score(y_test, y_pred))

1005.012268524559
19.982421020235915
0.026623900930598188


In [38]:
print(grid.best_estimator_)

DecisionTreeRegressor(max_depth=10, max_leaf_nodes=7, min_samples_leaf=6,
                      splitter='random')


### LinearRegression

In [39]:
from sklearn.linear_model import LinearRegression

In [40]:
regr = LinearRegression()
regr.fit(X_train, y_train)

LinearRegression()

In [41]:
y_pred = regr.predict(X_test)

In [42]:
print(mean_squared_error(y_test, y_pred))
print(mean_absolute_error(y_test, y_pred))
print(r2_score(y_test, y_pred))

993.2945799188086
19.9240885120782
0.03797273554922398


### LinearRegression GridSearch

In [43]:
# def batch_gradient_descent(X, y, lrate, niter):
   
    
#     m = len(y)

    
#     for i in range(niter):
#         hypothesis = predict(X)

#     return predict

In [44]:
# lrates = [.5, .1, .01, .001, .0001]
# niterations = [25000, 50000, 150000]

In [45]:
# def test(X, y):
#     record = []
    
#     for niter in niterations:
#         for lrate in lrates:
#             weigths, records = batch_gradient_descent(X, y, lrate, niter)
#             delta = time.time() - start
#             record.append(dict(lrate=lrate, niter=niter, w=weigths, history=records))
    
#     return record

In [46]:
# rec = test(X, y)

###  Ridge

In [47]:
from sklearn.linear_model import Ridge

In [48]:
ridge = Ridge(alpha=1).fit(X_train, y_train)
y_pred = ridge.predict(X_test)

In [49]:
print(mean_squared_error(y_test, y_pred))
print(mean_absolute_error(y_test, y_pred))
print(r2_score(y_test, y_pred))

993.2940798205085
19.92407052602303
0.03797321990523417


### Ridge Grid Search 

In [50]:
alpha = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
param_grid = dict(alpha=alpha)
grid = GridSearchCV(estimator=ridge, param_grid=param_grid, scoring='r2', verbose=1, n_jobs=-1)
grid_result = grid.fit(X_train, y_train)

Fitting 5 folds for each of 7 candidates, totalling 35 fits


In [51]:
y_pred = grid.predict(X_test)

In [52]:
print(mean_squared_error(y_test, y_pred))
print(mean_absolute_error(y_test, y_pred))
print(r2_score(y_test, y_pred))

993.7654773928895
19.91249543889178
0.037516661170097


###  Lasso

In [53]:
from sklearn.linear_model import Lasso

In [54]:
lasso = Lasso(alpha=1).fit(X_train, y_train)
y_pred = lasso.predict(X_test)

In [55]:
print(mean_squared_error(y_test, y_pred))
print(mean_absolute_error(y_test, y_pred))
print(r2_score(y_test, y_pred))

1002.6439934507165
19.95483752129226
0.02891762651494778


### Lasso Grid Search

In [56]:
alpha = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
param_grid = dict(alpha=alpha)
grid = GridSearchCV(estimator=lasso, param_grid=param_grid, scoring='r2', verbose=1, n_jobs=-1)
grid_result = grid.fit(X_train, y_train)

Fitting 5 folds for each of 7 candidates, totalling 35 fits


In [57]:
y_pred = grid.predict(X_test)

In [58]:
print(mean_squared_error(y_test, y_pred))
print(mean_absolute_error(y_test, y_pred))
print(r2_score(y_test, y_pred))

993.2573174348506
19.921398087360537
0.038008825070130925


### ElasticNet

In [59]:
from sklearn.linear_model import ElasticNet

In [60]:
elastic_net = ElasticNet(alpha=1, l1_ratio=0.5).fit(X_train, y_train)
y_pred = lasso.predict(X_test)

In [61]:
print(mean_squared_error(y_test, y_pred))
print(mean_absolute_error(y_test, y_pred))
print(r2_score(y_test, y_pred))

1002.6439934507165
19.95483752129226
0.02891762651494778


### ElasticNet Grid Search 

In [62]:
alpha = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
l1_ratio = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
param_grid = dict(alpha=alpha, l1_ratio=l1_ratio)
grid = GridSearchCV(estimator=elastic_net, param_grid=param_grid, scoring='r2', verbose=1, n_jobs=-1)
grid_result = grid.fit(X_train, y_train)

Fitting 5 folds for each of 77 candidates, totalling 385 fits


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


In [63]:
y_pred = grid.predict(X_test)

In [64]:
print(mean_squared_error(y_test, y_pred))
print(mean_absolute_error(y_test, y_pred))
print(r2_score(y_test, y_pred))

993.7376655518794
19.905185217287396
0.037543597539092644


### Polynomial

In [65]:
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree=2, include_bias=False)
poly_features = poly.fit_transform(X)

In [66]:
X_train, X_test, y_train, y_test = train_test_split(poly_features, y, test_size=0.3, random_state = 4)

In [67]:
lr = LinearRegression(n_jobs=-1)
lr.fit(X_train, y_train)

LinearRegression(n_jobs=-1)

In [68]:
y_pred = lr.predict(X_test)

In [69]:
print(mean_squared_error(y_test, y_pred))
print(mean_absolute_error(y_test, y_pred))
print(r2_score(y_test, y_pred))

1011.1303265759516
20.299068096148318
0.02201187284798567


### Polynomial Hyperparameters 

In [70]:
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree=3, include_bias=False)
poly_features = poly.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(poly_features, y, test_size=0.3, random_state = 4)

lr = LinearRegression(n_jobs=-1)
lr.fit(X_train, y_train)

y_pred = lr.predict(X_test)

In [71]:
print(mean_squared_error(y_test, y_pred))
print(mean_absolute_error(y_test, y_pred))
print(r2_score(y_test, y_pred))

1048.5032685227186
20.471698840735325
-0.014136082108970438


###  GradientBoostingRegressor 

In [72]:
from sklearn.ensemble import GradientBoostingRegressor

In [73]:
reg = GradientBoostingRegressor(random_state=0)

In [74]:
reg.fit(X_train, y_train)

GradientBoostingRegressor(random_state=0)

In [75]:
y_pred = reg.predict(X_test)

In [76]:
print(mean_squared_error(y_test, y_pred))
print(mean_absolute_error(y_test, y_pred))
print(r2_score(y_test, y_pred))

1009.6361103370698
20.106727669428913
0.023457112598604835


### GradientBoostingRegressor Grid Search 

In [77]:
 parameters = {'learning_rate': [0.01,0.02,0.03,0.04],
                  'n_estimators' : [100],
                  'max_depth'    : [4,6,8,10]
                 }

In [None]:
grid_GBR = GridSearchCV(estimator=reg, param_grid = parameters, cv = 2, n_jobs=-1)
grid_GBR.fit(X_train, y_train)



In [None]:
y_pred = grid_GBR.predict(X_test)

In [None]:
print(mean_squared_error(y_test, y_pred))
print(mean_absolute_error(y_test, y_pred))
print(r2_score(y_test, y_pred))

# Classification 

### AdaBoostClassifier

In [None]:
from sklearn.ensemble import AdaBoostClassifier,GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

In [None]:
Data['arr_delay'].values[Data['arr_delay'] > 0] = 1
Data['arr_delay'].values[Data['arr_delay'] <= 0] = 0

In [None]:
carrier_y = Data['arr_delay']

In [None]:
# Data for classification 
X = carrier_X 
y = carrier_y 

In [None]:
scaler = StandardScaler()
scaled_X = pd.DataFrame(scaler.fit_transform(X))
scaled_X.columns = X.columns

In [None]:
X_train, X_test, y_train, y_test = train_test_split(scaled_X_carrier, carrier_y, test_size=0.30)

In [None]:
clf = AdaBoostClassifier(n_estimators=100)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
accuracy_score(y_test, y_pred)

In [None]:
ROC_AUC = roc_auc_score(y_test, y_pred)
ROC_AUC

### AdaBoostClassifier Grid Search 

In [None]:
from sklearn.model_selection import RepeatedStratifiedKFold

In [None]:
grid = dict()
grid['n_estimators'] = [10, 50, 100, 500]
grid['learning_rate'] = [0.0001, 0.001, 0.01, 0.1, 1.0]

cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

In [None]:
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy')

In [None]:
y_pred = grid_search.predict(X_test)

In [None]:
print(accuracy_score(y_test, y_pred))
print(roc_auc_score(y_test, y_pred))

In [None]:
print(mean_squared_error(y_test, y_pred))
print(mean_absolute_error(y_test, y_pred))
print(r2_score(y_test, y_pred))

### GradientBoostingClassifier

In [None]:
clf = GradientBoostingClassifier(n_estimators=100)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [None]:
accuracy_score(y_test, y_pred)

In [None]:
roc_auc_score(y_test, y_pred)

### GradientBoostingClassifier Grid Search 

In [None]:
parameters = {
    "loss":["deviance"],
    "learning_rate": [0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2],
    "min_samples_split": np.linspace(0.1, 0.5, 12),
    "min_samples_leaf": np.linspace(0.1, 0.5, 12),
    "max_depth":[3,5,8],
    "max_features":["log2","sqrt"],
    "criterion": ["friedman_mse",  "mae"],
    "subsample":[0.5, 0.618, 0.8, 0.85, 0.9, 0.95, 1.0],
    "n_estimators":[10]
    }

In [None]:
grid = GridSearchCV(GradientBoostingClassifier(), parameters, cv=10, n_jobs=-1)

In [None]:
y_pred = grid.predict(X_test)

In [None]:
print(accuracy_score(y_test, y_pred))

In [None]:
print(roc_auc_score(y_test, y_pred))

In [None]:
print(mean_squared_error(y_test, y_pred))
print(mean_absolute_error(y_test, y_pred))
print(r2_score(y_test, y_pred)

### Navie Bayes 

In [None]:
from sklearn.naive_bayes import GaussianNB

In [None]:
gnb = GaussianNB()
gnb.fit(X_train, y_train)

In [None]:
y_pred = gnb.predict(X_test)

In [None]:
print(accuracy_score(y_test, y_pred))

In [None]:
print(roc_auc_score(y_test, y_pred))

In [None]:
print(mean_squared_error(y_test, y_pred))
print(mean_absolute_error(y_test, y_pred))
print(r2_score(y_test, y_pred))

### Navie Bayes Grid Search 

In [None]:
params = {'alpha': [0.01, 0.1, 0.5, 1.0, 10.0],
          'fit_prior': [True, False],
          'class_prior': [None, [0.1,]* len(n_classes), ],
          'binarize': [None, 0.0, 8.5, 10.0]
         }

nb_grid = GridSearchCV(GaussianNB(), param_grid=params, n_jobs=-1, cv=5, verbose=5)
nb_grid.fit(X_train,Y_train)

In [None]:
y_pred = nb_grid.predict(X_test)

In [None]:
print(accuracy_score(y_test, y_pred))

In [None]:
print(roc_auc_score(y_test, y_pred))

In [None]:
print(mean_squared_error(y_test, y_pred))
print(mean_absolute_error(y_test, y_pred))
print(r2_score(y_test, y_pred))