In [20]:
# Import Dependencies
from path import Path
import pandas as pd
from sklearn.preprocessing import StandardScaler,OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from collections import defaultdict

#  Adjuments Needed:
onehotencoding could not be applied - too resources intensive

In [21]:
# Import testing dataset
flights_2019 = pd.read_csv('Database/Data/jan_19_clean_data.csv')
flights_2020 = pd.read_csv('Database/Data/jan_20_clean_data.csv')

flights_2019.head()

Unnamed: 0,DAY_OF_MONTH,DAY_OF_WEEK,OP_CARRIER_AIRLINE_ID,OP_CARRIER,TAIL_NUM,OP_CARRIER_FL_NUM,ORIGIN_AIRPORT_ID,ORIGIN_AIRPORT_SEQ_ID,ORIGIN,DEST_AIRPORT_ID,DEST_AIRPORT_SEQ_ID,DEST,DEP_TIME,DEP_DEL15,DEP_TIME_BLK,ARR_TIME,ARR_DEL15,CANCELLED,DIVERTED,DISTANCE
0,1,2,20363,9E,N8688C,3280,11953,1195302,GNV,10397,1039707,ATL,601.0,0.0,0600-0659,722.0,0.0,0,0,300
1,1,2,20363,9E,N348PQ,3281,13487,1348702,MSP,11193,1119302,CVG,1359.0,0.0,1400-1459,1633.0,0.0,0,0,596
2,1,2,20363,9E,N8896A,3282,11433,1143302,DTW,11193,1119302,CVG,1215.0,0.0,1200-1259,1329.0,0.0,0,0,229
3,1,2,20363,9E,N8886A,3283,15249,1524906,TLH,10397,1039707,ATL,1521.0,0.0,1500-1559,1625.0,0.0,0,0,223
4,1,2,20363,9E,N8974C,3284,10397,1039707,ATL,11778,1177801,FSM,1847.0,0.0,1900-1959,1940.0,0.0,0,0,579


In [22]:
flights_2019.columns

Index(['DAY_OF_MONTH', 'DAY_OF_WEEK', 'OP_CARRIER_AIRLINE_ID', 'OP_CARRIER',
       'TAIL_NUM', 'OP_CARRIER_FL_NUM', 'ORIGIN_AIRPORT_ID',
       'ORIGIN_AIRPORT_SEQ_ID', 'ORIGIN', 'DEST_AIRPORT_ID',
       'DEST_AIRPORT_SEQ_ID', 'DEST', 'DEP_TIME', 'DEP_DEL15', 'DEP_TIME_BLK',
       'ARR_TIME', 'ARR_DEL15', 'CANCELLED', 'DIVERTED', 'DISTANCE'],
      dtype='object')

In [23]:
#Create a new dataframe with only relavent testing data
filteredList = ['DAY_OF_MONTH', 'DAY_OF_WEEK', 'OP_CARRIER', 'TAIL_NUM'
                , 'ORIGIN', 'DEST', 'DEP_TIME', 'DEP_DEL15', 'DISTANCE']

columnsRemoved = ['OP_CARRIER_AIRLINE_ID', 'DEST_AIRPORT_SEQ_ID', 'ORIGIN_AIRPORT_SEQ_ID' , 'ORIGIN_AIRPORT_ID'
                  , 'OP_CARRIER_FL_NUM', 'DEST_AIRPORT_ID', 'ARR_TIME', 'ARR_DEL15', 'CANCELLED', 'DIVERTED', 'DEP_TIME_BLK']

machine_model_2019_df = flights_2019.filter(filteredList)
machine_model_2019_df.head()

Unnamed: 0,DAY_OF_MONTH,DAY_OF_WEEK,OP_CARRIER,TAIL_NUM,ORIGIN,DEST,DEP_TIME,DEP_DEL15,DISTANCE
0,1,2,9E,N8688C,GNV,ATL,601.0,0.0,300
1,1,2,9E,N348PQ,MSP,CVG,1359.0,0.0,596
2,1,2,9E,N8896A,DTW,CVG,1215.0,0.0,229
3,1,2,9E,N8886A,TLH,ATL,1521.0,0.0,223
4,1,2,9E,N8974C,ATL,FSM,1847.0,0.0,579


In [24]:
machine_model_2019_df.DEP_DEL15 = machine_model_2019_df.DEP_DEL15.astype('int')
machine_model_2019_df.DEP_TIME = machine_model_2019_df.DEP_TIME.astype('int')

In [25]:
machine_model_2019_df.dtypes

DAY_OF_MONTH     int64
DAY_OF_WEEK      int64
OP_CARRIER      object
TAIL_NUM        object
ORIGIN          object
DEST            object
DEP_TIME         int32
DEP_DEL15        int32
DISTANCE         int64
dtype: object

In [26]:
machine_model_2019_df.dtypes[machine_model_2019_df.dtypes == "object"]

OP_CARRIER    object
TAIL_NUM      object
ORIGIN        object
DEST          object
dtype: object

In [27]:
# Generate our categorical variable list
flights_cat = machine_model_2019_df.dtypes[machine_model_2019_df.dtypes == "object"].index.tolist()

# Check the number of unique values in each column
machine_model_2019_df[flights_cat].nunique()

OP_CARRIER      17
TAIL_NUM      5441
ORIGIN         346
DEST           346
dtype: int64

In [28]:
# Tranform each column with LabelEncoder (https://stackoverflow.com/questions/24458645/label-encoding-across-multiple-columns-in-scikit-learn)
d = defaultdict(LabelEncoder)

df = machine_model_2019_df[flights_cat]

# Encoding the variable
encode_df = df.apply(lambda x: d[x.name].fit_transform(x))

# # Inverse the encoded
# fit.apply(lambda x: d[x.name].inverse_transform(x))

# # Using the dictionary to label future data
# df.apply(lambda x: d[x.name].transform(x))

In [29]:
encode_df.nunique()

OP_CARRIER      17
TAIL_NUM      5441
ORIGIN         346
DEST           346
dtype: int64

In [30]:
# Merge Back the encoded columns
machine_model_2019_df_encoded = machine_model_2019_df.drop(flights_cat, axis=1)
machine_model_2019_df_encoded = machine_model_2019_df_encoded.merge(encode_df,left_index=True, right_index=True)
machine_model_2019_df_encoded.head()

Unnamed: 0,DAY_OF_MONTH,DAY_OF_WEEK,DEP_TIME,DEP_DEL15,DISTANCE,OP_CARRIER,TAIL_NUM,ORIGIN,DEST
0,1,2,601,0,300,0,4316,128,19
1,1,2,1359,0,596,0,1396,225,80
2,1,2,1215,0,229,0,4472,95,80
3,1,2,1521,0,223,0,4468,325,19
4,1,2,1847,0,579,0,4529,19,120


In [31]:
# Create Feature and Target Variables
y = machine_model_2019_df_encoded["DEP_DEL15"]
X = machine_model_2019_df_encoded.drop(columns="DEP_DEL15")

In [32]:
# Create X & Y train and test variables
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=34, 
                                                    stratify=y)
X_train.shape

(424472, 8)

In [33]:
# Preprocess numerical data for neural network

# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [45]:
# Create the Logistic Regression Model
classifier = LogisticRegression(solver='lbfgs',
                                max_iter=1000,
                                random_state=20)

In [46]:
# Fit the model
classifier.fit(X_train, y_train)

LogisticRegression(max_iter=1000, random_state=20)

In [47]:
# Make predictions 
y_pred = classifier.predict(X_test)
results = pd.DataFrame({"Prediction": y_pred, "Actual": y_test}).reset_index(drop=True)
results.head(20)

Unnamed: 0,Prediction,Actual
0,0,1
1,0,0
2,0,0
3,0,1
4,0,0
5,0,0
6,0,0
7,0,0
8,0,0
9,0,0


In [48]:
# Show the accuracy score of the machine learning model 
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))

0.8263069735884261


In [49]:
confusion_matrix(y_test, y_pred)

array([[116915,      0],
       [ 24576,      0]], dtype=int64)