In [2]:
# Import Dependencies
from path import Path
import pandas as pd
from sklearn.preprocessing import StandardScaler,OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from collections import defaultdict

#  Adjuments Needed:
onehotencoding could not be applied - too resources intensive

In [3]:
# Import testing dataset
flights_2019 = pd.read_csv('Database/Data/jan_19_clean_data.csv')
flights_2020 = pd.read_csv('Database/Data/jan_20_clean_data.csv')

flights_2019.head()

Unnamed: 0,DAY_OF_MONTH,DAY_OF_WEEK,OP_CARRIER_AIRLINE_ID,OP_CARRIER,TAIL_NUM,OP_CARRIER_FL_NUM,ORIGIN_AIRPORT_ID,ORIGIN_AIRPORT_SEQ_ID,ORIGIN,DEST_AIRPORT_ID,DEST_AIRPORT_SEQ_ID,DEST,DEP_TIME,DEP_DEL15,DEP_TIME_BLK,ARR_TIME,ARR_DEL15,CANCELLED,DIVERTED,DISTANCE
0,1,2,20363,9E,N8688C,3280,11953,1195302,GNV,10397,1039707,ATL,601.0,0.0,0600-0659,722.0,0.0,0,0,300
1,1,2,20363,9E,N348PQ,3281,13487,1348702,MSP,11193,1119302,CVG,1359.0,0.0,1400-1459,1633.0,0.0,0,0,596
2,1,2,20363,9E,N8896A,3282,11433,1143302,DTW,11193,1119302,CVG,1215.0,0.0,1200-1259,1329.0,0.0,0,0,229
3,1,2,20363,9E,N8886A,3283,15249,1524906,TLH,10397,1039707,ATL,1521.0,0.0,1500-1559,1625.0,0.0,0,0,223
4,1,2,20363,9E,N8974C,3284,10397,1039707,ATL,11778,1177801,FSM,1847.0,0.0,1900-1959,1940.0,0.0,0,0,579


In [4]:
flights_2019.DAY_OF_MONTH.count()

565963

In [5]:
flights_2019.dropna(axis=0).DAY_OF_MONTH.count()

565963

In [6]:
flights_2020.DAY_OF_MONTH.count()

599268

In [7]:
flights_2020.dropna(axis=0).DAY_OF_MONTH.count()

599268

In [8]:
flights = pd.concat([flights_2019, flights_2020])

In [9]:
flights.columns

Index(['DAY_OF_MONTH', 'DAY_OF_WEEK', 'OP_CARRIER_AIRLINE_ID', 'OP_CARRIER',
       'TAIL_NUM', 'OP_CARRIER_FL_NUM', 'ORIGIN_AIRPORT_ID',
       'ORIGIN_AIRPORT_SEQ_ID', 'ORIGIN', 'DEST_AIRPORT_ID',
       'DEST_AIRPORT_SEQ_ID', 'DEST', 'DEP_TIME', 'DEP_DEL15', 'DEP_TIME_BLK',
       'ARR_TIME', 'ARR_DEL15', 'CANCELLED', 'DIVERTED', 'DISTANCE'],
      dtype='object')

In [10]:
#Create a new dataframe with only relavent testing data that we will be collecting from the user or building a database for.
filteredList = ['DAY_OF_MONTH', 'DAY_OF_WEEK', 'OP_CARRIER', 'ORIGIN', 'DEST', 'DEP_DEL15', 'DISTANCE'
                ,'DEP_TIME_BLK', 'TAIL_NUM', 'DEP_TIME']

columnsRemoved = ['OP_CARRIER_AIRLINE_ID', 'DEST_AIRPORT_SEQ_ID', 'ORIGIN_AIRPORT_SEQ_ID' , 'ORIGIN_AIRPORT_ID'
                  , 'OP_CARRIER_FL_NUM', 'DEST_AIRPORT_ID', 'ARR_TIME', 'ARR_DEL15', 'CANCELLED', 'DIVERTED', 'DEP_TIME']

machine_model_df = flights.filter(filteredList)
machine_model_df.head()

Unnamed: 0,DAY_OF_MONTH,DAY_OF_WEEK,OP_CARRIER,ORIGIN,DEST,DEP_DEL15,DISTANCE,DEP_TIME_BLK,TAIL_NUM,DEP_TIME
0,1,2,9E,GNV,ATL,0.0,300,0600-0659,N8688C,601.0
1,1,2,9E,MSP,CVG,0.0,596,1400-1459,N348PQ,1359.0
2,1,2,9E,DTW,CVG,0.0,229,1200-1259,N8896A,1215.0
3,1,2,9E,TLH,ATL,0.0,223,1500-1559,N8886A,1521.0
4,1,2,9E,ATL,FSM,0.0,579,1900-1959,N8974C,1847.0


In [12]:
machine_model_df.DEP_DEL15 = machine_model_df.DEP_DEL15.astype('int')
machine_model_df.DEP_TIME = machine_model_df.DEP_TIME.astype('int')

In [13]:
machine_model_df.dtypes

DAY_OF_MONTH     int64
DAY_OF_WEEK      int64
OP_CARRIER      object
ORIGIN          object
DEST            object
DEP_DEL15        int32
DISTANCE         int64
DEP_TIME_BLK    object
TAIL_NUM        object
DEP_TIME         int32
dtype: object

In [14]:
machine_model_df.dtypes[machine_model_df.dtypes == "object"]

OP_CARRIER      object
ORIGIN          object
DEST            object
DEP_TIME_BLK    object
TAIL_NUM        object
dtype: object

In [16]:
# Generate our categorical variable list
flights_cat = machine_model_df.dtypes[machine_model_df.dtypes == "object"].index.tolist()

# Check the number of unique values in each column
machine_model_df[flights_cat].nunique()

OP_CARRIER        17
ORIGIN           353
DEST             353
DEP_TIME_BLK      19
TAIL_NUM        5854
dtype: int64

In [17]:
# Tranform each column with LabelEncoder (https://stackoverflow.com/questions/24458645/label-encoding-across-multiple-columns-in-scikit-learn)
d = defaultdict(LabelEncoder)

df = machine_model_df[flights_cat]

# Encoding the variable
encode_df = df.apply(lambda x: d[x.name].fit_transform(x))

# # Inverse the encoded
# fit.apply(lambda x: d[x.name].inverse_transform(x))

# # Using the dictionary to label future data
# df.apply(lambda x: d[x.name].transform(x))

In [18]:
encode_df.nunique()

OP_CARRIER        17
ORIGIN           353
DEST             353
DEP_TIME_BLK      19
TAIL_NUM        5854
dtype: int64

In [19]:
# Concatenate the encoded columns
machine_model_df_encoded = machine_model_df.drop(flights_cat, axis=1)
machine_model_df_encoded = pd.concat([machine_model_df_encoded, encode_df], axis = 1)
machine_model_df_encoded.head()

Unnamed: 0,DAY_OF_MONTH,DAY_OF_WEEK,DEP_DEL15,DISTANCE,DEP_TIME,OP_CARRIER,ORIGIN,DEST,DEP_TIME_BLK,TAIL_NUM
0,1,2,0,300,601,0,130,19,1,4648
1,1,2,0,596,1359,0,227,82,9,1542
2,1,2,0,229,1215,0,97,82,7,4810
3,1,2,0,223,1521,0,331,19,10,4806
4,1,2,0,579,1847,0,19,122,14,4867


In [20]:
# Create Feature and Target Variables
y = machine_model_df_encoded["DEP_DEL15"]
X = machine_model_df_encoded.drop(columns="DEP_DEL15")

In [21]:
# Create X & Y train and test variables
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=34, 
                                                    stratify=y)
X_train.shape

(873923, 9)

In [22]:
# Preprocess numerical data for neural network

# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [43]:
# Create the Logistic Regression Model
classifier = LogisticRegression(solver='lbfgs',
                                max_iter=2000,
                                random_state=10)

In [44]:
# Fit the model
classifier.fit(X_train_scaled, y_train)

LogisticRegression(max_iter=20000, random_state=10)

In [45]:
# Make predictions 
y_pred = classifier.predict(X_test_scaled)
results = pd.DataFrame({"Prediction": y_pred, "Actual": y_test}).reset_index(drop=True)
results.head(20)

Unnamed: 0,Prediction,Actual
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0
5,0,0
6,0,0
7,0,0
8,0,0
9,0,0


In [46]:
# Because dataset is imbalanced we cant use accuracy metric
from sklearn import metrics
print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.88      1.00      0.94    246370
           1       1.00      0.26      0.41     44938

    accuracy                           0.89    291308
   macro avg       0.94      0.63      0.67    291308
weighted avg       0.90      0.89      0.86    291308



In [47]:
confusion_matrix(y_test, y_pred)

array([[246343,     27],
       [ 33373,  11565]], dtype=int64)