In [1]:
#importing our dependencies

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression

from sklearn.metrics import confusion_matrix, classification_report

## Filtering and Processing Data

In [3]:
#import flights_csv
flights_df = pd.read_csv('Resources/flights_update.csv')
flights_df.head(20)

Unnamed: 0,MONTH,DATE,DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,TAIL_NUMBER,ORIGIN_AIRPORT,DESTINATION_AIRPORT,DISTANCE_Miles,MINUTES_DELAY
0,1,1,4,AS,98,N407AS,ANC,SEA,1448,-22.0
1,1,1,4,AA,2336,N3KUAA,LAX,PBI,2330,-9.0
2,1,1,4,US,840,N171US,SFO,CLT,2296,5.0
3,1,1,4,AA,258,N3HYAA,LAX,MIA,2342,-9.0
4,1,1,4,AS,135,N527AS,SEA,ANC,1448,-21.0
5,1,1,4,DL,806,N3730B,SFO,MSP,1589,8.0
6,1,1,4,NK,612,N635NK,LAS,MSP,1299,-17.0
7,1,1,4,US,2013,N584UW,LAX,CLT,2125,-10.0
8,1,1,4,AA,1112,N3LAAA,SFO,DFW,1464,-13.0
9,1,1,4,DL,1173,N826DN,LAS,ATL,1747,-15.0


In [4]:
#check dtypes
flights_df.dtypes

MONTH                    int64
DATE                     int64
DAY_OF_WEEK              int64
AIRLINE                 object
FLIGHT_NUMBER            int64
TAIL_NUMBER             object
ORIGIN_AIRPORT          object
DESTINATION_AIRPORT     object
DISTANCE_Miles           int64
MINUTES_DELAY          float64
dtype: object

In [6]:
#using minutes_delayed as y value, has to be int
flights_df['MINUTES_DELAY'] = flights_df['MINUTES_DELAY'].values.astype(np.int64)

In [7]:
#checking dtypes again
flights_df.dtypes

MONTH                   int64
DATE                    int64
DAY_OF_WEEK             int64
AIRLINE                object
FLIGHT_NUMBER           int64
TAIL_NUMBER            object
ORIGIN_AIRPORT         object
DESTINATION_AIRPORT    object
DISTANCE_Miles          int64
MINUTES_DELAY           int64
dtype: object

In [9]:
#drop columns SORRY MU COMPUTER SUCKS, ADD COLUMNS BACK IN 
flights_df = flights_df.drop(['FLIGHT_NUMBER', 'TAIL_NUMBER', 'DISTANCE_Miles', 'DAY_OF_WEEK'], axis=1)
flights_df

Unnamed: 0,MONTH,DATE,AIRLINE,ORIGIN_AIRPORT,DESTINATION_AIRPORT,MINUTES_DELAY
0,1,1,AS,ANC,SEA,-22
1,1,1,AA,LAX,PBI,-9
2,1,1,US,SFO,CLT,5
3,1,1,AA,LAX,MIA,-9
4,1,1,AS,SEA,ANC,-21
...,...,...,...,...,...,...
1048570,3,10,EV,RDU,EWR,-16
1048571,3,10,UA,LGA,IAH,-2
1048572,3,10,UA,SAN,ORD,3
1048573,3,10,EV,MSY,ORD,-13


In [10]:
#1048574
flights_df1 = flights_df.loc[flights_df["MINUTES_DELAY"] <= -500 ]
print(flights_df1)

         MONTH  DATE AIRLINE ORIGIN_AIRPORT DESTINATION_AIRPORT  \
32           1     1      AS            ANC                 SEA   
42           1     1      AA            PHX                 DFW   
68           1     1      OO            MAF                 IAH   
82           1     1      MQ            SGF                 DFW   
90           1     1      OO            RDD                 SFO   
...        ...   ...     ...            ...                 ...   
1048386      3    10      EV            DTW                 ATW   
1048431      3    10      EV            DFW                 MOB   
1048460      3    10      EV            MSP                 CLE   
1048470      3    10      MQ            ORD                 DBQ   
1048574      3    10      MQ            CID                 ORD   

               MINUTES_DELAY  
32      -9223372036854775808  
42      -9223372036854775808  
68      -9223372036854775808  
82      -9223372036854775808  
90      -9223372036854775808  
...      

In [11]:
#check null values
flights_df.isna().mean() >= 0.25

MONTH                  False
DATE                   False
AIRLINE                False
ORIGIN_AIRPORT         False
DESTINATION_AIRPORT    False
MINUTES_DELAY          False
dtype: bool

In [13]:
#filtering out flights that were late
late_flights_df = flights_df.loc[flights_df['MINUTES_DELAY'] >= 0]
late_flights_df.head(20)

Unnamed: 0,MONTH,DATE,AIRLINE,ORIGIN_AIRPORT,DESTINATION_AIRPORT,MINUTES_DELAY
2,1,1,US,SFO,CLT,5
5,1,1,DL,SFO,MSP,8
14,1,1,DL,SEA,MSP,8
20,1,1,NK,LAS,MCI,6
21,1,1,AA,SEA,MIA,1
27,1,1,NK,MSP,FLL,25
29,1,1,AA,DEN,MIA,2
30,1,1,NK,PHX,ORD,43
33,1,1,DL,SLC,ATL,10
35,1,1,HA,LAS,HNL,15


In [14]:
#filtering out flights that were late
early_flights_df = flights_df.loc[
early_flights_df.head(20)

SyntaxError: unexpected EOF while parsing (<ipython-input-14-7ed3d51d6c04>, line 3)

In [15]:
flights_df['MINUTES_DELAY'] = flights_df['MINUTES_DELAY'].replace([flights_df['MINUTES_DELAY'] <= 0],'-1')
flights_df

Unnamed: 0,MONTH,DATE,AIRLINE,ORIGIN_AIRPORT,DESTINATION_AIRPORT,MINUTES_DELAY
0,1,1,AS,ANC,SEA,-22
1,1,1,AA,LAX,PBI,-9
2,1,1,US,SFO,CLT,5
3,1,1,AA,LAX,MIA,-9
4,1,1,AS,SEA,ANC,-21
...,...,...,...,...,...,...
1048570,3,10,EV,RDU,EWR,-16
1048571,3,10,UA,LGA,IAH,-2
1048572,3,10,UA,SAN,ORD,3
1048573,3,10,EV,MSY,ORD,-13


## Functions to make data ready for machine learning

In [16]:
# one Hot Encoder
def onehot_encode(flights_df, column_dict):
    flights_df = flights_df.copy()
    for column, prefix in column_dict.items():
        dummies = pd.get_dummies(flights_df[column], prefix=prefix)
        flights_df = pd.concat([flights_df, dummies], axis=1)
        flights_df = flights_df.drop(column, axis=1)
        
    return flights_df
        
  

In [19]:
#some functions to help 
#first were gonna make a copy of the flights_df

def preprocessing_inputs(flights_df):
    flights_df = flights_df.copy()
    
    #one-hot encoder nominal feature columns
    flights_df = onehot_encode(
        flights_df,
        column_dict={
            'AIRLINE': 'AL',
            'ORIGIN_AIRPORT': 'OA',
            'DESTINATION_AIRPORT': 'DA'
        })
    
    #fill remaining missing values with columns means
    #Makes everything numaric and fills in null values
    remaining_na_columns = flights_df.loc[:, flights_df.isna().sum() > 0].columns
    for column in remaining_na_columns:
        flights_df[column] = flights_df[column].fillna(flights_df[column].mean())
        
    #training the data in X and y
    #I CHANGED IT TO MINUTES_DELAYED, HE HAD IT AS THE 'CANCELLED' COLUMN WITH WE REMOVED
    #Maybe thats why it wont show anything??

    y = flights_df['MINUTES_DELAY'].copy()
    X = flights_df.drop('MINUTES_DELAY', axis=1).copy()
    
    #Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=123)
    
    #scaling the data with standard scaler
    scaler = StandardScaler()
    scaler.fit(X_train)
    
    X_train = pd.DataFrame(scaler.transform(X_train), columns=X.columns)
    X_test = pd.DataFrame(scaler.transform(X_test), columns= X.columns)
    
    return X_train, X_test, y_train, y_test


In [20]:
def evaluate_model(model, X_test, y_test):
    
    model_acc = model.score(X_test, y_test)
    print("Test Accuracy: {:.2f}%".format(model_acc * 100))
    
    #confusion matrix
    y_true = np.array(y_test)
    y_pred = modelpredict(X_test)
    
    cm = confusion_matrix(y_true, y_pred)
    clr = classification_report(y_true, y_pred)
    
    plt.figure(figsize=(8,8))
    sns.heatmap(cm, annot=True, vmin=0, fmt='g', cmap='Blues', cbar=False)
    plt.xticks(np.arange(2) + 0.5, ["ON TIME", "DELAYED"])
    plt.yticks(np.arange(2) + 0.5, ["ON TIME", "DELAYED"])
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.title("Confusion Matrix")
    plt.show()
    
    print("Classification Report:\n----------------------\n", clr)

In [21]:
#checking dtypes again
flights_df.dtypes

MONTH                   int64
DATE                    int64
AIRLINE                object
ORIGIN_AIRPORT         object
DESTINATION_AIRPORT    object
MINUTES_DELAY           int64
dtype: object

In [22]:
X_train, X_test, y_train, y_test = preprocessing_inputs(flights_df)

In [23]:
X_train

Unnamed: 0,MONTH,DATE,AL_AA,AL_AS,AL_B6,AL_DL,AL_EV,AL_F9,AL_HA,AL_MQ,...,DA_TYR,DA_TYS,DA_UST,DA_VEL,DA_VLD,DA_VPS,DA_WRG,DA_XNA,DA_YAK,DA_YUM
0,0.434231,1.053215,-0.320013,-0.171003,-0.219569,-0.404727,-0.343752,-0.119135,-0.116928,-0.258238,...,-0.020322,-0.036226,-0.006176,-0.010506,-0.013309,-0.028387,-0.011789,-0.040803,-0.011556,-0.019147
1,1.852229,-0.895772,-0.320013,-0.171003,-0.219569,-0.404727,-0.343752,-0.119135,-0.116928,-0.258238,...,-0.020322,-0.036226,-0.006176,-0.010506,-0.013309,-0.028387,-0.011789,-0.040803,-0.011556,-0.019147
2,-0.983767,-0.895772,-0.320013,-0.171003,4.554384,-0.404727,-0.343752,-0.119135,-0.116928,-0.258238,...,-0.020322,-0.036226,-0.006176,-0.010506,-0.013309,-0.028387,-0.011789,-0.040803,-0.011556,-0.019147
3,-0.983767,1.626446,-0.320013,-0.171003,-0.219569,-0.404727,-0.343752,-0.119135,-0.116928,-0.258238,...,-0.020322,-0.036226,-0.006176,-0.010506,-0.013309,-0.028387,-0.011789,-0.040803,-0.011556,-0.019147
4,1.852229,-0.666479,-0.320013,-0.171003,-0.219569,2.470799,-0.343752,-0.119135,-0.116928,-0.258238,...,-0.020322,-0.036226,-0.006176,-0.010506,-0.013309,-0.028387,-0.011789,-0.040803,-0.011556,-0.019147
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
733997,-0.983767,-0.093248,-0.320013,-0.171003,-0.219569,-0.404727,-0.343752,-0.119135,-0.116928,-0.258238,...,-0.020322,-0.036226,-0.006176,-0.010506,-0.013309,-0.028387,-0.011789,-0.040803,-0.011556,-0.019147
733998,-0.983767,-1.354357,-0.320013,-0.171003,-0.219569,-0.404727,2.909070,-0.119135,-0.116928,-0.258238,...,-0.020322,-0.036226,-0.006176,-0.010506,-0.013309,-0.028387,-0.011789,-0.040803,-0.011556,-0.019147
733999,-0.983767,-1.354357,-0.320013,-0.171003,-0.219569,-0.404727,2.909070,-0.119135,-0.116928,-0.258238,...,-0.020322,-0.036226,-0.006176,-0.010506,-0.013309,-0.028387,-0.011789,-0.040803,-0.011556,-0.019147
734000,-0.983767,0.594630,-0.320013,5.847836,-0.219569,-0.404727,-0.343752,-0.119135,-0.116928,-0.258238,...,-0.020322,-0.036226,-0.006176,-0.010506,-0.013309,-0.028387,-0.011789,-0.040803,-0.011556,-0.019147


In [24]:
y_train.value_counts()

-9223372036854775808    30064
-8                      19694
-9                      19655
-7                      19647
-6                      19571
                        ...  
 1366                       1
 856                        1
 858                        1
 1371                       1
 511                        1
Name: MINUTES_DELAY, Length: 811, dtype: int64

In [None]:
#APPARENTLLY THIS IS STILL TOO MUCH DATA IDK HOW TO FIX
#UNLESS ITS MY COMPUTER
model = LogisticRegression()
model.fit(X_train, y_train)

In [None]:
evaluate_model(model, X_test, y_test)