In [1]:
#importing our dependencies

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression

from sklearn.metrics import confusion_matrix, classification_report

## Filtering and Processing Data

In [2]:
#import flights_csv
flights_df = pd.read_csv('flights_update.csv')
flights_df.head(20)

Unnamed: 0,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,TAIL_NUMBER,ORIGIN_AIRPORT,DESTINATION_AIRPORT,DISTANCE,MINUTES_DELAYED
0,1,1,4,AS,98,N407AS,ANC,SEA,1448,-22.0
1,1,1,4,AA,2336,N3KUAA,LAX,PBI,2330,-9.0
2,1,1,4,US,840,N171US,SFO,CLT,2296,5.0
3,1,1,4,AA,258,N3HYAA,LAX,MIA,2342,-9.0
4,1,1,4,AS,135,N527AS,SEA,ANC,1448,-21.0
5,1,1,4,DL,806,N3730B,SFO,MSP,1589,8.0
6,1,1,4,NK,612,N635NK,LAS,MSP,1299,-17.0
7,1,1,4,US,2013,N584UW,LAX,CLT,2125,-10.0
8,1,1,4,AA,1112,N3LAAA,SFO,DFW,1464,-13.0
9,1,1,4,DL,1173,N826DN,LAS,ATL,1747,-15.0


In [3]:
#check dtypes
flights_df.dtypes

MONTH                    int64
DAY_OF_MONTH             int64
DAY_OF_WEEK              int64
AIRLINE                 object
FLIGHT_NUMBER            int64
TAIL_NUMBER             object
ORIGIN_AIRPORT          object
DESTINATION_AIRPORT     object
DISTANCE                 int64
MINUTES_DELAYED        float64
dtype: object

In [4]:
#using minutes_delayed as y value, has to be int
flights_df['MINUTES_DELAYED'] = flights_df['MINUTES_DELAYED'].values.astype(np.int64)

In [5]:
#checking dtypes again
flights_df.dtypes

MONTH                   int64
DAY_OF_MONTH            int64
DAY_OF_WEEK             int64
AIRLINE                object
FLIGHT_NUMBER           int64
TAIL_NUMBER            object
ORIGIN_AIRPORT         object
DESTINATION_AIRPORT    object
DISTANCE                int64
MINUTES_DELAYED         int64
dtype: object

In [6]:
#drop columns SORRY MU COMPUTER SUCKS, ADD COLUMNS BACK IN 
flights_df = flights_df.drop(['FLIGHT_NUMBER', 'TAIL_NUMBER', 'DISTANCE', 'DAY_OF_WEEK'], axis=1)
flights_df

Unnamed: 0,MONTH,DAY_OF_MONTH,AIRLINE,ORIGIN_AIRPORT,DESTINATION_AIRPORT,MINUTES_DELAYED
0,1,1,AS,ANC,SEA,-22
1,1,1,AA,LAX,PBI,-9
2,1,1,US,SFO,CLT,5
3,1,1,AA,LAX,MIA,-9
4,1,1,AS,SEA,ANC,-21
...,...,...,...,...,...,...
1048570,3,10,EV,RDU,EWR,-16
1048571,3,10,UA,LGA,IAH,-2
1048572,3,10,UA,SAN,ORD,3
1048573,3,10,EV,MSY,ORD,-13


In [7]:
#1048574
flights_df1 = flights_df.loc[flights_df["MINUTES_DELAYED"] <= -500 ]
print(flights_df1)

         MONTH  DAY_OF_MONTH AIRLINE ORIGIN_AIRPORT DESTINATION_AIRPORT  \
32           1             1      AS            ANC                 SEA   
42           1             1      AA            PHX                 DFW   
68           1             1      OO            MAF                 IAH   
82           1             1      MQ            SGF                 DFW   
90           1             1      OO            RDD                 SFO   
...        ...           ...     ...            ...                 ...   
1048386      3            10      EV            DTW                 ATW   
1048431      3            10      EV            DFW                 MOB   
1048460      3            10      EV            MSP                 CLE   
1048470      3            10      MQ            ORD                 DBQ   
1048574      3            10      MQ            CID                 ORD   

             MINUTES_DELAYED  
32      -9223372036854775808  
42      -9223372036854775808  
68    

In [8]:
#creating new df to remove 
new_flights_df = flights_df.loc[flights_df["MINUTES_DELAYED"] >= -100 ]
print(new_flights_df)

         MONTH  DAY_OF_MONTH AIRLINE ORIGIN_AIRPORT DESTINATION_AIRPORT  \
0            1             1      AS            ANC                 SEA   
1            1             1      AA            LAX                 PBI   
2            1             1      US            SFO                 CLT   
3            1             1      AA            LAX                 MIA   
4            1             1      AS            SEA                 ANC   
...        ...           ...     ...            ...                 ...   
1048569      3            10      DL            PHX                 ATL   
1048570      3            10      EV            RDU                 EWR   
1048571      3            10      UA            LGA                 IAH   
1048572      3            10      UA            SAN                 ORD   
1048573      3            10      EV            MSY                 ORD   

         MINUTES_DELAYED  
0                    -22  
1                     -9  
2                 

In [9]:
#filtering out flights that were late
late_flights_df = new_flights_df.loc[new_flights_df['MINUTES_DELAYED'] >= 0]
late_flights_df

Unnamed: 0,MONTH,DAY_OF_MONTH,AIRLINE,ORIGIN_AIRPORT,DESTINATION_AIRPORT,MINUTES_DELAYED
2,1,1,US,SFO,CLT,5
5,1,1,DL,SFO,MSP,8
14,1,1,DL,SEA,MSP,8
20,1,1,NK,LAS,MCI,6
21,1,1,AA,SEA,MIA,1
...,...,...,...,...,...,...
1048563,3,10,EV,IND,EWR,6
1048564,3,10,EV,DEN,SHV,11
1048567,3,10,AS,SCC,BRW,11
1048569,3,10,DL,PHX,ATL,19


In [10]:
#filtering out flights that were early
early_flights_df = new_flights_df.loc[new_flights_df['MINUTES_DELAYED'] <= 0]
early_flights_df

Unnamed: 0,MONTH,DAY_OF_MONTH,AIRLINE,ORIGIN_AIRPORT,DESTINATION_AIRPORT,MINUTES_DELAYED
0,1,1,AS,ANC,SEA,-22
1,1,1,AA,LAX,PBI,-9
3,1,1,AA,LAX,MIA,-9
4,1,1,AS,SEA,ANC,-21
6,1,1,NK,LAS,MSP,-17
...,...,...,...,...,...,...
1048566,3,10,UA,TPA,CLE,-21
1048568,3,10,B6,BOS,STT,-18
1048570,3,10,EV,RDU,EWR,-16
1048571,3,10,UA,LGA,IAH,-2


In [11]:
#new_flights_df datatypes to make sure the minutes_delayed is int
new_flights_df.dtypes

MONTH                   int64
DAY_OF_MONTH            int64
AIRLINE                object
ORIGIN_AIRPORT         object
DESTINATION_AIRPORT    object
MINUTES_DELAYED         int64
dtype: object


## Functions to make data ready for machine learning

In [12]:
# one Hot Encoder
def onehot_encode(new_flights_df, column_dict):
    new_flights_df = new_flights_df.copy()
    for column, prefix in column_dict.items():
        dummies = pd.get_dummies(new_flights_df[column], prefix=prefix)
        new_flights_df = pd.concat([new_flights_df, dummies], axis=1)
        new_flights_df = new_flights_df.drop(column, axis=1)
        
    return new_flights_df
        
  

In [13]:
#some functions to help 
#first were gonna make a copy of the flights_df

def preprocessing_inputs(new_flights_df):
    new_flights_df = new_flights_df.copy()
    
    #one-hot encoder nominal feature columns
    new_flights_df = onehot_encode(
        new_flights_df,
        column_dict={
            'AIRLINE': 'AL',
            'ORIGIN_AIRPORT': 'OA',
            'DESTINATION_AIRPORT': 'DA'
        })
    
    #fill remaining missing values with columns means
    #Makes everything numaric and fills in null values
    remaining_na_columns = new_flights_df.loc[:, new_flights_df.isna().sum() > 0].columns
    for column in remaining_na_columns:
        new_flights_df[column] = new_flights_df[column].fillna(new_flights_df[column].mean())
        
    #training the data in X and y
    #I CHANGED IT TO MINUTES_DELAYED, HE HAD IT AS THE 'CANCELLED' COLUMN WITH WE REMOVED
    #Maybe thats why it wont show anything??

    y = new_flights_df['MINUTES_DELAYED'].copy()
    X = new_flights_df.drop('MINUTES_DELAYED', axis=1).copy()
    
    #Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=123)
    
    #scaling the data with standard scaler
    scaler = StandardScaler()
    scaler.fit(X_train)
    
    X_train = pd.DataFrame(scaler.transform(X_train), columns=X.columns)
    X_test = pd.DataFrame(scaler.transform(X_test), columns= X.columns)
    
    return X_train, X_test, y_train, y_test


In [14]:
def evaluate_model(model, X_test, y_test):
    
    model_acc = model.score(X_test, y_test)
    print("Test Accuracy: {:.2f}%".format(model_acc * 100))
    
    #confusion matrix
    y_true = np.array(y_test)
    y_pred = modelpredict(X_test)
    
    cm = confusion_matrix(y_true, y_pred)
    clr = classification_report(y_true, y_pred)
    
    plt.figure(figsize=(8,8))
    sns.heatmap(cm, annot=True, vmin=0, fmt='g', cmap='Blues', cbar=False)
    plt.xticks(np.arange(2) + 0.5, ["ON TIME", "DELAYED"])
    plt.yticks(np.arange(2) + 0.5, ["ON TIME", "DELAYED"])
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.title("Confusion Matrix")
    plt.show()
    
    print("Classification Report:\n----------------------\n", clr)

In [15]:
#checking dtypes again
new_flights_df.dtypes

MONTH                   int64
DAY_OF_MONTH            int64
AIRLINE                object
ORIGIN_AIRPORT         object
DESTINATION_AIRPORT    object
MINUTES_DELAYED         int64
dtype: object

In [16]:
X_train, X_test, y_train, y_test = preprocessing_inputs(new_flights_df)

In [17]:
X_train

Unnamed: 0,MONTH,DAY_OF_MONTH,AL_AA,AL_AS,AL_B6,AL_DL,AL_EV,AL_F9,AL_HA,AL_MQ,...,DA_TYR,DA_TYS,DA_UST,DA_VEL,DA_VLD,DA_VPS,DA_WRG,DA_XNA,DA_YAK,DA_YUM
0,0.447081,1.403629,-0.319194,-0.173198,-0.217516,-0.410074,-0.341179,-0.1201,-0.118741,-0.246779,...,-0.019806,-0.03522,-0.004915,-0.010185,-0.013487,-0.028643,-0.012215,-0.040595,-0.011861,-0.019223
1,0.447081,-0.558470,-0.319194,-0.173198,-0.217516,2.438583,-0.341179,-0.1201,-0.118741,-0.246779,...,-0.019806,-0.03522,-0.004915,-0.010185,-0.013487,-0.028643,-0.012215,-0.040595,-0.011861,-0.019223
2,-0.972994,-1.366392,-0.319194,-0.173198,-0.217516,-0.410074,-0.341179,-0.1201,-0.118741,-0.246779,...,-0.019806,-0.03522,-0.004915,-0.010185,-0.013487,-0.028643,-0.012215,-0.040595,-0.011861,-0.019223
3,-0.972994,-0.443052,-0.319194,-0.173198,-0.217516,-0.410074,-0.341179,-0.1201,-0.118741,-0.246779,...,-0.019806,-0.03522,-0.004915,-0.010185,-0.013487,-0.028643,-0.012215,-0.040595,-0.011861,-0.019223
4,-0.972994,0.364871,3.132892,-0.173198,-0.217516,-0.410074,-0.341179,-0.1201,-0.118741,-0.246779,...,-0.019806,-0.03522,-0.004915,-0.010185,-0.013487,-0.028643,-0.012215,-0.040595,-0.011861,-0.019223
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
703847,-0.972994,-0.096799,3.132892,-0.173198,-0.217516,-0.410074,-0.341179,-0.1201,-0.118741,-0.246779,...,-0.019806,-0.03522,-0.004915,-0.010185,-0.013487,-0.028643,-0.012215,-0.040595,-0.011861,-0.019223
703848,-0.972994,-1.366392,-0.319194,-0.173198,-0.217516,-0.410074,2.931012,-0.1201,-0.118741,-0.246779,...,-0.019806,-0.03522,-0.004915,-0.010185,-0.013487,-0.028643,-0.012215,-0.040595,-0.011861,-0.019223
703849,-0.972994,-1.366392,-0.319194,-0.173198,-0.217516,-0.410074,-0.341179,-0.1201,-0.118741,-0.246779,...,-0.019806,-0.03522,-0.004915,-0.010185,-0.013487,-0.028643,-0.012215,-0.040595,-0.011861,-0.019223
703850,-0.972994,0.595706,-0.319194,-0.173198,-0.217516,-0.410074,-0.341179,-0.1201,-0.118741,-0.246779,...,-0.019806,-0.03522,-0.004915,-0.010185,-0.013487,-0.028643,-0.012215,-0.040595,-0.011861,-0.019223


In [18]:
y_train.value_counts()

-8       19810
-7       19797
-6       19591
-9       19492
-10      19183
         ...  
 1165        1
-78          1
 653         1
 946         1
 755         1
Name: MINUTES_DELAYED, Length: 812, dtype: int64

In [19]:
#APPARENTLLY THIS IS STILL TOO MUCH DATA IDK HOW TO FIX
#UNLESS ITS MY COMPUTER
model = LogisticRegression()
model.fit(X_train, y_train)

MemoryError: Unable to allocate 2.13 GiB for an array with shape (703852, 812) and data type int32

In [20]:
evaluate_model(model, X_test, y_test)

AttributeError: 'LogisticRegression' object has no attribute 'coef_'

In [None]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(solver='lbfgs', random_state=1)
classifier

In [None]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
   intercept_scaling=1, max_iter=100, multi_class='warn', penalty='12',
   random_state=1, solver='lbfgs', tol=0.0001, warm_start=False)

In [None]:
classifier.fit(X_train, y_train)