### **Importing Libraries and the Dataset**

In [2]:
# Mounting the drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Used to ignore the warning given as output of the code
import warnings
warnings.filterwarnings('ignore')

# Basic libraries of python for numeric and dataframe computations
import numpy as np
import pandas as pd

# Basic library for data visualization
import matplotlib.pyplot as plt

# Slightly advanced library for data visualization
import seaborn as sns

# To compute the cosine similarity between two vectors
from sklearn.metrics.pairwise import cosine_similarity

# Simple Linear Regression

#Load function from sklearn
from sklearn import linear_model

# A dictionary output that does not raise a key error
from collections import defaultdict

# A performance metrics in sklearn
from sklearn.metrics import mean_squared_error

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics

from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

from sklearn.model_selection import train_test_split

### **Load the dataset**

In [6]:
pd.set_option('display.max_columns', None)

In [7]:
pd.set_option('display.max_rows', None)

In [8]:
def data_preprocessing_2(df_with_path, chosen_columns):
  df = pd.read_csv(df_with_path)

  ndf = df[chosen_columns]

  # numerical_features = X.select_dtypes(include=['int64', 'float64']).columns
  # categorical_features = X.select_dtypes(include=['object']).columns

  ndf_cat = ndf.select_dtypes(include=['object']).columns

  # One hot encoding for ndf_cat

  ndf = pd.get_dummies(ndf, columns=ndf_cat)

  return ndf

In [9]:
# first set of columns I am going to run models on
df_set_1 = ['Arr_Delay_At_Least_15_Minutes','Carrier_Name', 'Week_Day','Dep_Time_Block_Group', 'Scheduled_Departure_Time','Distance_Group' ]

In [10]:
df = data_preprocessing_2('/content/drive/MyDrive/downsampled_data_updated.csv',df_set_1)

In [11]:
df.shape

(2742902, 31)

In [12]:
df.columns

Index(['Arr_Delay_At_Least_15_Minutes', 'Scheduled_Departure_Time',
       'Distance_Group', 'Carrier_Name_Alaska Airlines Inc.',
       'Carrier_Name_Allegiant Air', 'Carrier_Name_American Airlines Inc.',
       'Carrier_Name_Delta Air Lines Inc.', 'Carrier_Name_Endeavor Air Inc.',
       'Carrier_Name_Envoy Air', 'Carrier_Name_Frontier Airlines Inc.',
       'Carrier_Name_Hawaiian Airlines Inc.', 'Carrier_Name_JetBlue Airways',
       'Carrier_Name_PSA Airlines Inc.', 'Carrier_Name_Republic Airline',
       'Carrier_Name_SkyWest Airlines Inc.',
       'Carrier_Name_Southwest Airlines Co.', 'Carrier_Name_Spirit Air Lines',
       'Carrier_Name_United Air Lines Inc.', 'Week_Day_Friday',
       'Week_Day_Monday', 'Week_Day_Saturday', 'Week_Day_Sunday',
       'Week_Day_Thursday', 'Week_Day_Tuesday', 'Week_Day_Wednesday',
       'Dep_Time_Block_Group_Afternoon',
       'Dep_Time_Block_Group_Early Afternoon',
       'Dep_Time_Block_Group_Early Morning', 'Dep_Time_Block_Group_Evening',
   

In [13]:
X = df.drop(['Arr_Delay_At_Least_15_Minutes'], axis=1)
y = df['Arr_Delay_At_Least_15_Minutes']

#X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=78978)


In [14]:
from xgboost import XGBClassifier

xgb_clf = XGBClassifier(use_label_encoder=False)
xgb_clf.fit(X_train, y_train)

y_pred = xgb_clf.predict(X_test)

print("Accuracy: ")
print (metrics.accuracy_score(y_test, y_pred))

print("F1: ")
print (metrics.f1_score(y_test, y_pred))

print("Precision: ")
print (metrics.precision_score(y_test, y_pred))

print("Recall: ")
print (metrics.recall_score(y_test, y_pred))

print("ROC AUC: ")
print (metrics.roc_auc_score(y_test, y_pred))

print("Confusion matrix: ")
print (metrics.confusion_matrix(y_test, y_pred))


print(metrics.classification_report(y_test, y_pred))

Accuracy: 
0.616344714818778
F1: 
0.6103679239344942
Precision: 
0.6208034103704485
Recall: 
0.6002774712970145
ROC AUC: 
0.6163642154665815
Confusion matrix: 
[[173265 100693]
 [109773 164850]]
              precision    recall  f1-score   support

           0       0.61      0.63      0.62    273958
           1       0.62      0.60      0.61    274623

    accuracy                           0.62    548581
   macro avg       0.62      0.62      0.62    548581
weighted avg       0.62      0.62      0.62    548581



In [20]:
import pickle

# Train a simple logistic regression model
#model = LogisticRegression()
#model.fit(X_train, y_train)

# Save the model
with open('model.pkl', 'wb') as file:
    pickle.dump(xgb_clf, file)

In [None]:
# XGBoost Classifier Hyperparameter Tuning

from scipy import stats
from sklearn.model_selection import RandomizedSearchCV

param_grid = dict(
    n_estimators=stats.randint(10, 1000),
    max_depth=stats.randint(1, 10),
    learning_rate=stats.uniform(0, 1)
)

xgb_clf = XGBClassifier(use_label_encoder=False)
xgb_cv = RandomizedSearchCV(
    xgb_clf, param_grid, cv=5, n_iter=150,
    scoring='accuracy', n_jobs=-1, verbose=1
)
xgb_cv.fit(X_train, y_train)
best_params = xgb_cv.best_params_
print(f"Best paramters: {best_params}")

xgb_clf = XGBClassifier(**best_params)
xgb_clf.fit(X_train, y_train)

Fitting 5 folds for each of 150 candidates, totalling 750 fits


In [15]:
y_pred = xgb_clf.predict(X_test)

print("Accuracy: ")
print (metrics.accuracy_score(y_test, y_pred))

print("F1: ")
print (metrics.f1_score(y_test, y_pred))

print("Precision: ")
print (metrics.precision_score(y_test, y_pred))

print("Recall: ")
print (metrics.recall_score(y_test, y_pred))

print("ROC AUC: ")
print (metrics.roc_auc_score(y_test, y_pred))

print("Confusion matrix: ")
print (metrics.confusion_matrix(y_test, y_pred))


print(classification_report(y_test, y_pred))

Accuracy: 
0.616344714818778
F1: 
0.6103679239344942
Precision: 
0.6208034103704485
Recall: 
0.6002774712970145
ROC AUC: 
0.6163642154665815
Confusion matrix: 
[[173265 100693]
 [109773 164850]]
              precision    recall  f1-score   support

           0       0.61      0.63      0.62    273958
           1       0.62      0.60      0.61    274623

    accuracy                           0.62    548581
   macro avg       0.62      0.62      0.62    548581
weighted avg       0.62      0.62      0.62    548581



In [None]:
#######################################################

In [None]:
#####. Best model so far was Naive Bayes


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=78978)

In [16]:
gnb = GaussianNB()
gnb.fit(X_train, y_train)

# Predict and evaluate
y_pred_class = gnb.predict(X_test)

print("Accuracy: ")
print (metrics.accuracy_score(y_test, y_pred_class))

print("F1: ")
print (metrics.f1_score(y_test, y_pred_class))

print("Precision: ")
print (metrics.precision_score(y_test, y_pred_class))

print("Recall: ")
print (metrics.recall_score(y_test, y_pred_class))

print("ROC AUC: ")
print (metrics.roc_auc_score(y_test, y_pred_class))

print("Confusion matrix: ")
print (metrics.confusion_matrix(y_test, y_pred_class))

print(metrics.classification_report(y_test, y_pred))

Accuracy: 
0.6020423601984027
F1: 
0.6032306842304225
Precision: 
0.6021603695197042
Recall: 
0.6043048105948883
ROC AUC: 
0.6020396142856832
Confusion matrix: 
[[164313 109645]
 [108667 165956]]
              precision    recall  f1-score   support

           0       0.61      0.63      0.62    273958
           1       0.62      0.60      0.61    274623

    accuracy                           0.62    548581
   macro avg       0.62      0.62      0.62    548581
weighted avg       0.62      0.62      0.62    548581



In [19]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV

param_grid_nb = {
    'var_smoothing': np.logspace(0,-9, num=100)
}

nbModel_grid = GridSearchCV(estimator=GaussianNB(), param_grid=param_grid_nb, verbose=1, cv=10, n_jobs=-1)
nbModel_grid.fit(X_train, y_train)
print(nbModel_grid.best_estimator_)


Fitting 10 folds for each of 100 candidates, totalling 1000 fits


PicklingError: Could not pickle the task to send it to the workers.

In [17]:
# Predict and evaluate
y_pred_class = nbModel_grid.predict(X_test)

print("Accuracy: ")
print (metrics.accuracy_score(y_test, y_pred_class))

print("F1: ")
print (metrics.f1_score(y_test, y_pred_class))

print("Precision: ")
print (metrics.precision_score(y_test, y_pred_class))

print("Recall: ")
print (metrics.recall_score(y_test, y_pred_class))

print("ROC AUC: ")
print (metrics.roc_auc_score(y_test, y_pred_class))

print("Confusion matrix: ")
print (metrics.confusion_matrix(y_test, y_pred_class))

print(metrics.classification_report(y_test, y_pred))

NameError: name 'nbModel_grid' is not defined

In [None]:
##############################################

In [None]:
#import pickle


#with open('models/LRmodel.pkl', 'wb') as file:
#    pickle.dump(pipeline, file)

In [None]:


# Train the model
#model = LinearRegression()
#model.fit(X, y)

# Save the model
#with open('model.pkl', 'wb') as f:
#    pickle.dump(model, f)

################################################################################