# Classification

In [50]:
# import the libraries

%matplotlib inline

import pandas as pd
import numpy as np
import scipy
import matplotlib.pyplot as plt
import seaborn as sns

# sklearn :: utils
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold

# sklearn :: models
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# sklearn :: evaluation metrics
from sklearn.metrics import cohen_kappa_score

# convert scientific notation to decimals
pd.set_option('display.float_format', lambda x: '%.2f' % x)
sns.set_style('whitegrid')

________________________
# Load Data

In [51]:
df_flights = pd.read_csv('../data/flightsmerged.csv', low_memory=False)

In [52]:
df_flights.head()

Unnamed: 0,MONTH,FLIGHT_NUMBER,TAIL_NUMBER,ORIGIN_AC,DESTINATION_AC,SCHEDULED_DEPARTURE,DEPARTURE_TIME,DEPARTURE_DELAY,SCHEDULED_TIME,ELAPSED_TIME,...,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY,DATE,AIRLINE_CODE,AIRLINE_NAME,ORIGIN_AIRPORT,DESTINATION_AIRPORT
0,1,98,N407AS,ANC,SEA,00:05:00,23:54:00,-11.0,205.0,194.0,...,,,,,,2015-01-01,AS,Alaska Airlines Inc.,Ted Stevens Anchorage International Airport,Seattle-Tacoma International Airport
1,1,2336,N3KUAA,LAX,PBI,00:10:00,00:02:00,-8.0,280.0,279.0,...,,,,,,2015-01-01,AA,American Airlines Inc.,Los Angeles International Airport,Palm Beach International Airport
2,1,840,N171US,SFO,CLT,00:20:00,00:18:00,-2.0,286.0,293.0,...,,,,,,2015-01-01,US,US Airways Inc.,San Francisco International Airport,Charlotte Douglas International Airport
3,1,258,N3HYAA,LAX,MIA,00:20:00,00:15:00,-5.0,285.0,281.0,...,,,,,,2015-01-01,AA,American Airlines Inc.,Los Angeles International Airport,Miami International Airport
4,1,135,N527AS,SEA,ANC,00:25:00,00:24:00,-1.0,235.0,215.0,...,,,,,,2015-01-01,AS,Alaska Airlines Inc.,Seattle-Tacoma International Airport,Ted Stevens Anchorage International Airport


____

# Feature Engineering

## Add Columns

In [53]:
# Add column for flight Class (Early, On_Time, Delayed, Cancelled)

# Default value:
df_flights['CLASS'] = 'On_Time'

In [54]:
# 1- Cancelled flights

cancelled_flights = list(df_flights[df_flights['CANCELLED'] == 1].index)    
df_flights.at[cancelled_flights, 'CLASS'] = 'Cancelled'
    
print("# Cancelled flights = ", len(df_flights[df_flights['CLASS'] == 'Cancelled']))

# Cancelled flights =  3117


In [55]:
# 2- Delayed flights
    
delayed_flights = list(df_flights[(df_flights['DEPARTURE_DELAY'] > 0) | (df_flights['ARRIVAL_DELAY'] > 0)].index)    
df_flights.at[delayed_flights, 'CLASS'] = 'Delayed'

print("# Delayed flights = ", len(df_flights[df_flights['CLASS'] == 'Delayed']))

# Delayed flights =  2582805


In [56]:
# 3- Early flights

early_flights = list(df_flights[(df_flights['DEPARTURE_DELAY'] < 0) & (df_flights['ARRIVAL_DELAY'] < 0)].index)
df_flights.at[early_flights, 'CLASS'] = 'Early'
print("# Early flights = ", len(early_flights))

# Remove the early flights
#df = df[~df.index.isin(early_flights)]
#print("Was: ", df.shape, " Now: ", df.shape)


# Early flights =  2708248


In [57]:
# 4- On-time flights

print("On-time flights = ",len(df_flights[df_flights['CLASS'] == 'On_Time']))

On-time flights =  319120


In [58]:
df_flights['CLASS'].value_counts()

Early        2708248
Delayed      2582805
On_Time       319120
Cancelled       1191
Name: CLASS, dtype: int64

## Missing Values

In [59]:
df_flights.isnull().sum()

MONTH                        0
FLIGHT_NUMBER                0
TAIL_NUMBER                  0
ORIGIN_AC                    0
DESTINATION_AC               0
SCHEDULED_DEPARTURE          0
DEPARTURE_TIME               0
DEPARTURE_DELAY              0
SCHEDULED_TIME               4
ELAPSED_TIME             17369
AIR_TIME                 17369
DISTANCE                     0
SCHEDULED_ARRIVAL            0
ARRIVAL_TIME              5319
ARRIVAL_DELAY            17369
DIVERTED                     0
CANCELLED                    0
CANCELLATION_REASON          0
AIR_SYSTEM_DELAY       4667937
SECURITY_DELAY         4667937
AIRLINE_DELAY          4667937
LATE_AIRCRAFT_DELAY    4667937
WEATHER_DELAY          4667937
DATE                         0
AIRLINE_CODE                 0
AIRLINE_NAME                 0
ORIGIN_AIRPORT          477872
DESTINATION_AIRPORT     477872
CLASS                        0
dtype: int64

In [60]:
msv_columns = ['AIR_SYSTEM_DELAY', 'SECURITY_DELAY' ,'AIRLINE_DELAY' ,'LATE_AIRCRAFT_DELAY' ,'WEATHER_DELAY']
df_flights[msv_columns] = df_flights[msv_columns].fillna(0.0)

In [61]:
df_flights.isnull().sum()

MONTH                       0
FLIGHT_NUMBER               0
TAIL_NUMBER                 0
ORIGIN_AC                   0
DESTINATION_AC              0
SCHEDULED_DEPARTURE         0
DEPARTURE_TIME              0
DEPARTURE_DELAY             0
SCHEDULED_TIME              4
ELAPSED_TIME            17369
AIR_TIME                17369
DISTANCE                    0
SCHEDULED_ARRIVAL           0
ARRIVAL_TIME             5319
ARRIVAL_DELAY           17369
DIVERTED                    0
CANCELLED                   0
CANCELLATION_REASON         0
AIR_SYSTEM_DELAY            0
SECURITY_DELAY              0
AIRLINE_DELAY               0
LATE_AIRCRAFT_DELAY         0
WEATHER_DELAY               0
DATE                        0
AIRLINE_CODE                0
AIRLINE_NAME                0
ORIGIN_AIRPORT         477872
DESTINATION_AIRPORT    477872
CLASS                       0
dtype: int64

## Categorical Columns

In [62]:
# get_dummies

def get_dum(df):
    categorical = ['MONTH','ORIGIN_AC','DESTINATION_AC', 'AIRLINE_CODE']
    df_dummies = pd.get_dummies(df[categorical])
    new_df = pd.concat([df_dummies, df], axis =1)    
    
    return new_df, df_dummies

In [63]:
new_df, df_dummies = get_dum(df_flights)

MemoryError: 

__________________
# Training The Models

_________
# Testing The Models

____________
# Model Evaluation