In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.metrics import classification_report, mean_absolute_error, mean_squared_error
from sklearn.metrics import accuracy_score, precision_score, recall_score, log_loss

from sklearn.model_selection import RandomizedSearchCV
from imblearn.over_sampling import SMOTE

from sklearn.pipeline import Pipeline

import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('df_dummies.csv')

In [3]:
df= df.drop(columns=['Unnamed: 0'], axis=1)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 168018 entries, 0 to 168017
Data columns (total 11 columns):
 #   Column      Non-Null Count   Dtype
---  ------      --------------   -----
 0   Month       168018 non-null  int64
 1   DayofMonth  168018 non-null  int64
 2   ArrTime     168018 non-null  int64
 3   DepDelay    168018 non-null  int64
 4   Distance    168018 non-null  int64
 5   DayofWeek   168018 non-null  int64
 6   Dest_ATL    168018 non-null  int64
 7   Dest_DEN    168018 non-null  int64
 8   Dest_DFW    168018 non-null  int64
 9   Dest_LAX    168018 non-null  int64
 10  Dest_ORD    168018 non-null  int64
dtypes: int64(11)
memory usage: 14.1 MB


In [5]:
X = df.drop(columns=['DepDelay'], axis=1)
y= df['DepDelay']

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=.2, train_size=.8, random_state=42)

### Decision Tree Pipe

In [7]:
dtc_pipe = Pipeline([
    ('ss', StandardScaler()),
    ('dtc', DecisionTreeClassifier())])

In [8]:
dtc_pipe.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'ss', 'dtc', 'ss__copy', 'ss__with_mean', 'ss__with_std', 'dtc__ccp_alpha', 'dtc__class_weight', 'dtc__criterion', 'dtc__max_depth', 'dtc__max_features', 'dtc__max_leaf_nodes', 'dtc__min_impurity_decrease', 'dtc__min_impurity_split', 'dtc__min_samples_leaf', 'dtc__min_samples_split', 'dtc__min_weight_fraction_leaf', 'dtc__random_state', 'dtc__splitter'])

In [9]:
dtc_param_grid=[{
    'dtc__criterion':['gini', 'entropy'],
    'dtc__random_state': [1,20, 42, 100 ],
    'dtc__max_depth':[5,10,20,30,40,50,60,70],
    'dtc__min_samples_split':[5,10,20,25,30,40,50],
    'dtc__min_samples_leaf':[1,100,200,300,400,500]
            }]

In [10]:
dtc_rs_pipe = RandomizedSearchCV(dtc_pipe, dtc_param_grid)

In [11]:
dtc_rs_pipe.fit(X_train, y_train)
dtc_test = dtc_rs_pipe.predict(X_test)

In [12]:
print(classification_report(y_test, dtc_test))

              precision    recall  f1-score   support

           0       0.54      0.13      0.21     10983
           1       0.69      0.95      0.80     22621

    accuracy                           0.68     33604
   macro avg       0.61      0.54      0.50     33604
weighted avg       0.64      0.68      0.61     33604



### Random Forest Pipe

In [13]:
rfc_pipe = Pipeline([
    ('ss', StandardScaler()),
    ('rfc', RandomForestClassifier())])

In [14]:
rfc_pipe.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'ss', 'rfc', 'ss__copy', 'ss__with_mean', 'ss__with_std', 'rfc__bootstrap', 'rfc__ccp_alpha', 'rfc__class_weight', 'rfc__criterion', 'rfc__max_depth', 'rfc__max_features', 'rfc__max_leaf_nodes', 'rfc__max_samples', 'rfc__min_impurity_decrease', 'rfc__min_impurity_split', 'rfc__min_samples_leaf', 'rfc__min_samples_split', 'rfc__min_weight_fraction_leaf', 'rfc__n_estimators', 'rfc__n_jobs', 'rfc__oob_score', 'rfc__random_state', 'rfc__verbose', 'rfc__warm_start'])

In [15]:
rfc_param_grid=[{
    'rfc__criterion':['gini', 'entropy'],
    'rfc__random_state': [1,20, 42, 100 ],
    'rfc__max_depth':[5,10,20,30,40,50,60,70],
    'rfc__min_samples_split':[5,10,20,25,30,40,50],
    'rfc__min_samples_leaf':[1,100,200,300,400,500]
            }]

In [16]:
rfc_rs_pipe = RandomizedSearchCV(rfc_pipe, rfc_param_grid)

In [17]:
rfc_rs_pipe.fit(X_train, y_train)
rfc_test = rfc_rs_pipe.predict(X_test)

In [18]:
print(classification_report(y_test, rfc_test))

              precision    recall  f1-score   support

           0       0.57      0.12      0.19     10983
           1       0.69      0.96      0.80     22621

    accuracy                           0.68     33604
   macro avg       0.63      0.54      0.50     33604
weighted avg       0.65      0.68      0.60     33604



### XGBoost Pipe

In [26]:
xgb_pipe = Pipeline([
    ('ss', StandardScaler()),
    ('xgb', XGBClassifier(eval_metric='logloss'))])

In [27]:
xgb_pipe.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'ss', 'xgb', 'ss__copy', 'ss__with_mean', 'ss__with_std', 'xgb__objective', 'xgb__use_label_encoder', 'xgb__base_score', 'xgb__booster', 'xgb__colsample_bylevel', 'xgb__colsample_bynode', 'xgb__colsample_bytree', 'xgb__gamma', 'xgb__gpu_id', 'xgb__importance_type', 'xgb__interaction_constraints', 'xgb__learning_rate', 'xgb__max_delta_step', 'xgb__max_depth', 'xgb__min_child_weight', 'xgb__missing', 'xgb__monotone_constraints', 'xgb__n_estimators', 'xgb__n_jobs', 'xgb__num_parallel_tree', 'xgb__random_state', 'xgb__reg_alpha', 'xgb__reg_lambda', 'xgb__scale_pos_weight', 'xgb__subsample', 'xgb__tree_method', 'xgb__validate_parameters', 'xgb__verbosity', 'xgb__eval_metric'])

In [28]:
xgb_param_grid={
    'xgb__learning_rate': [0.1, 0.2, 0.3, 0.4],
    'xgb__gamma': [0, 1, 2, 3],
    'xgb__max_depth': [20,60],
    'xgb__min_child_weight': [1, 2],
    'xgb__subsample': [0.2, 0.5, 0.7, 0.9],
}

In [29]:
xgb_rs_pipe = RandomizedSearchCV(xgb_pipe, xgb_param_grid)

In [30]:
xgb_rs_pipe.fit(X_train, y_train)
xgb_test = xgb_rs_pipe.predict(X_test)

In [31]:
print(classification_report(y_test, xgb_test))

              precision    recall  f1-score   support

           0       0.54      0.39      0.45     10983
           1       0.74      0.84      0.79     22621

    accuracy                           0.69     33604
   macro avg       0.64      0.61      0.62     33604
weighted avg       0.67      0.69      0.68     33604



In [32]:
grids = [dtc_rs_pipe,rfc_rs_pipe,xgb_rs_pipe]

for pipe in grids:
    pipe.fit(X_train, y_train)

KeyboardInterrupt: 

In [None]:
grid_dict = {0: 'Decision Trees', 
             1: 'Random Forest',
             2: 'XGBoost'}
for i, model in enumerate(grids):
    print('{} Test Accuracy: {}'.format(grid_dict[i],
    model.score(X_test,y_test)))
    print('{} Best Params: {}'.format(grid_dict[i], model.best_params_))