In [1]:
# Import Dependencies
from path import Path
import pandas as pd
from sklearn.preprocessing import StandardScaler,OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [2]:
# Import testing dataset
flights_2019 = pd.read_csv('Database/Data/jan_19_clean_data.csv')
flights_2020 = pd.read_csv('Database/Data/jan_20_clean_data.csv')

flights_2019.head()

Unnamed: 0,DAY_OF_MONTH,DAY_OF_WEEK,OP_CARRIER_AIRLINE_ID,OP_CARRIER,TAIL_NUM,OP_CARRIER_FL_NUM,ORIGIN_AIRPORT_ID,ORIGIN_AIRPORT_SEQ_ID,ORIGIN,DEST_AIRPORT_ID,DEST_AIRPORT_SEQ_ID,DEST,DEP_TIME,DEP_DEL15,DEP_TIME_BLK,ARR_TIME,ARR_DEL15,CANCELLED,DIVERTED,DISTANCE
0,1,2,20363,9E,N8688C,3280,11953,1195302,GNV,10397,1039707,ATL,601.0,0.0,0600-0659,722.0,0.0,0,0,300
1,1,2,20363,9E,N348PQ,3281,13487,1348702,MSP,11193,1119302,CVG,1359.0,0.0,1400-1459,1633.0,0.0,0,0,596
2,1,2,20363,9E,N8896A,3282,11433,1143302,DTW,11193,1119302,CVG,1215.0,0.0,1200-1259,1329.0,0.0,0,0,229
3,1,2,20363,9E,N8886A,3283,15249,1524906,TLH,10397,1039707,ATL,1521.0,0.0,1500-1559,1625.0,0.0,0,0,223
4,1,2,20363,9E,N8974C,3284,10397,1039707,ATL,11778,1177801,FSM,1847.0,0.0,1900-1959,1940.0,0.0,0,0,579


In [3]:
flights_2019.columns

Index(['DAY_OF_MONTH', 'DAY_OF_WEEK', 'OP_CARRIER_AIRLINE_ID', 'OP_CARRIER',
       'TAIL_NUM', 'OP_CARRIER_FL_NUM', 'ORIGIN_AIRPORT_ID',
       'ORIGIN_AIRPORT_SEQ_ID', 'ORIGIN', 'DEST_AIRPORT_ID',
       'DEST_AIRPORT_SEQ_ID', 'DEST', 'DEP_TIME', 'DEP_DEL15', 'DEP_TIME_BLK',
       'ARR_TIME', 'ARR_DEL15', 'CANCELLED', 'DIVERTED', 'DISTANCE'],
      dtype='object')

In [4]:
#Create a new dataframe with only relavent testing data
filteredList = ['DAY_OF_MONTH', 'DAY_OF_WEEK', 'OP_CARRIER', 'TAIL_NUM'
                , 'ORIGIN', 'DEST', 'DEP_TIME', 'DEP_DEL15', 'DISTANCE']

columnsRemoved = ['OP_CARRIER_AIRLINE_ID', 'DEST_AIRPORT_SEQ_ID', 'ORIGIN_AIRPORT_SEQ_ID' , 'ORIGIN_AIRPORT_ID'
                  , 'OP_CARRIER_FL_NUM', 'DEST_AIRPORT_ID', 'ARR_TIME', 'ARR_DEL15', 'CANCELLED', 'DIVERTED', 'DEP_TIME_BLK']

machine_model_2019_df = flights_2019.filter(filteredList)
machine_model_2019_df.head()

Unnamed: 0,DAY_OF_MONTH,DAY_OF_WEEK,OP_CARRIER,TAIL_NUM,ORIGIN,DEST,DEP_TIME,DEP_DEL15,DISTANCE
0,1,2,9E,N8688C,GNV,ATL,601.0,0.0,300
1,1,2,9E,N348PQ,MSP,CVG,1359.0,0.0,596
2,1,2,9E,N8896A,DTW,CVG,1215.0,0.0,229
3,1,2,9E,N8886A,TLH,ATL,1521.0,0.0,223
4,1,2,9E,N8974C,ATL,FSM,1847.0,0.0,579


In [5]:
machine_model_2019_df.DEP_DEL15 = machine_model_2019_df.DEP_DEL15.astype('int')
machine_model_2019_df.DEP_TIME = machine_model_2019_df.DEP_TIME.astype('int')

In [6]:
machine_model_2019_df.dtypes

DAY_OF_MONTH     int64
DAY_OF_WEEK      int64
OP_CARRIER      object
TAIL_NUM        object
ORIGIN          object
DEST            object
DEP_TIME         int32
DEP_DEL15        int32
DISTANCE         int64
dtype: object

In [7]:
machine_model_2019_df.dtypes[machine_model_2019_df.dtypes == "object"]

OP_CARRIER    object
TAIL_NUM      object
ORIGIN        object
DEST          object
dtype: object

In [8]:
# Generate our categorical variable list
flights_cat = machine_model_2019_df.dtypes[machine_model_2019_df.dtypes == "object"].index.tolist()

# Check the number of unique values in each column
machine_model_2019_df[flights_cat].nunique()

OP_CARRIER      17
TAIL_NUM      5441
ORIGIN         346
DEST           346
dtype: int64

In [9]:
flights_cat

['OP_CARRIER', 'TAIL_NUM', 'ORIGIN', 'DEST']

In [10]:
# Too many options in Tail number to one hot encode
#Only OneHotEncode ORIGIN, DEST, and OP_CARRIER
flights_cat.remove('TAIL_NUM')

In [11]:
flights_cat

['OP_CARRIER', 'ORIGIN', 'DEST']

In [12]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(machine_model_2019_df[flights_cat]))

# Add the encoded variable names to the DataFrame
encode_df.columns = enc.get_feature_names(flights_cat)
encode_df.head()

Unnamed: 0,OP_CARRIER_9E,OP_CARRIER_AA,OP_CARRIER_AS,OP_CARRIER_B6,OP_CARRIER_DL,OP_CARRIER_EV,OP_CARRIER_F9,OP_CARRIER_G4,OP_CARRIER_HA,OP_CARRIER_MQ,...,DEST_TYS,DEST_UIN,DEST_USA,DEST_VEL,DEST_VLD,DEST_VPS,DEST_WRG,DEST_XNA,DEST_YAK,DEST_YUM
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
# Merge one-hot encoded features and drop the originals
machine_model_2019_df_encoded = machine_model_2019_df.merge(encode_df,left_index=True, right_index=True)
machine_model_2019_df_encoded = machine_model_2019_df_encoded.drop(flights_cat,1)
machine_model_2019_df_encoded.head()

Unnamed: 0,DAY_OF_MONTH,DAY_OF_WEEK,TAIL_NUM,DEP_TIME,DEP_DEL15,DISTANCE,OP_CARRIER_9E,OP_CARRIER_AA,OP_CARRIER_AS,OP_CARRIER_B6,...,DEST_TYS,DEST_UIN,DEST_USA,DEST_VEL,DEST_VLD,DEST_VPS,DEST_WRG,DEST_XNA,DEST_YAK,DEST_YUM
0,1,2,N8688C,601,0,300,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,2,N348PQ,1359,0,596,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,2,N8896A,1215,0,229,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,2,N8886A,1521,0,223,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,2,N8974C,1847,0,579,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
# Need to LabelEncode the Tail Numbers possibly
le = LabelEncoder()
le.fit(machine_model_2019_df_encoded['TAIL_NUM'])
tail_num_enc = le.transform(machine_model_2019_df_encoded['TAIL_NUM'])
machine_model_2019_df_encoded['TAIL_NUM_ENC'] = tail_num_enc
machine_model_2019_df_encoded = machine_model_2019_df_encoded.drop(['TAIL_NUM'], axis = 1)
machine_model_2019_df_encoded.head()

Unnamed: 0,DAY_OF_MONTH,DAY_OF_WEEK,DEP_TIME,DEP_DEL15,DISTANCE,OP_CARRIER_9E,OP_CARRIER_AA,OP_CARRIER_AS,OP_CARRIER_B6,OP_CARRIER_DL,...,DEST_UIN,DEST_USA,DEST_VEL,DEST_VLD,DEST_VPS,DEST_WRG,DEST_XNA,DEST_YAK,DEST_YUM,TAIL_NUM_ENC
0,1,2,601,0,300,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4316
1,1,2,1359,0,596,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1396
2,1,2,1215,0,229,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4472
3,1,2,1521,0,223,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4468
4,1,2,1847,0,579,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4529


In [15]:
machine_model_2019_df_encoded.dtypes[machine_model_2019_df_encoded.dtypes == "object"]

Series([], dtype: object)

In [18]:
machine_model_2019_df_encoded.to_csv("encoded.csv")

In [16]:
# Create Feature and Target Variables
y = machine_model_2019_df_encoded["DEP_DEL15"]
X = machine_model_2019_df_encoded.drop(columns="DEP_DEL15")

In [17]:
# Create X & Y train and test variables
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=34, 
                                                    stratify=y)
X_train.shape

MemoryError: Unable to allocate 2.24 GiB for an array with shape (709, 424472) and data type float64

In [None]:
# Preprocess numerical data for neural network

# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [None]:
# Create the Logistic Regression Model
classifier = LogisticRegression(solver='lbfgs',
                                max_iter=200,
                                random_state=34)

In [None]:
# Fit the model
classifier.fit(X_train, y_train)

In [None]:
# Make predictions 
y_pred = classifier.predict(X_test)
results = pd.DataFrame({"Prediction": y_pred, "Actual": y_test}).reset_index(drop=True)
results.head(20)

In [None]:
# Show the accuracy score of the machine learning model 
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))