# Classification

In [11]:
# import the libraries

%matplotlib inline

import pandas as pd
import numpy as np
import scipy
import matplotlib.pyplot as plt
import seaborn as sns

# sklearn :: utils
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold

# sklearn :: models
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# sklearn :: evaluation metrics
from sklearn.metrics import cohen_kappa_score

# Plot styling: seaborn's white-grid theme for every figure.
sns.set_style('whitegrid')
# Show floats with two fixed decimals instead of scientific notation.
pd.set_option('display.float_format', lambda value: '%.2f' % value)

________________________
# Load Data

In [12]:
# Load the cleaned, merged flights table from the local data directory.
# Cleaned data can be downloaded here:
# TODO: the download link is missing — add the URL.
# low_memory=False makes pandas parse the file in one pass rather than in
# chunks, so each column gets a single consistently inferred dtype.

df_flights = pd.read_csv('../data/flightsmerged.csv', low_memory=False)

In [13]:
# Preview the first rows to confirm the columns loaded as expected.
df_flights.head()

Unnamed: 0,MONTH,FLIGHT_NUMBER,TAIL_NUMBER,ORIGIN_AC,DESTINATION_AC,SCHEDULED_DEPARTURE,DEPARTURE_TIME,DEPARTURE_DELAY,SCHEDULED_TIME,DISTANCE,...,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY,DATE,AIRLINE_CODE,AIRLINE_NAME,ORIGIN_AIRPORT,DESTINATION_AIRPORT
0,1,98,N407AS,ANC,SEA,00:05:00,23:54:00,-11.0,205.0,1448,...,,,,,,2015-01-01,AS,Alaska Airlines Inc.,Ted Stevens Anchorage International Airport,Seattle-Tacoma International Airport
1,1,2336,N3KUAA,LAX,PBI,00:10:00,00:02:00,-8.0,280.0,2330,...,,,,,,2015-01-01,AA,American Airlines Inc.,Los Angeles International Airport,Palm Beach International Airport
2,1,840,N171US,SFO,CLT,00:20:00,00:18:00,-2.0,286.0,2296,...,,,,,,2015-01-01,US,US Airways Inc.,San Francisco International Airport,Charlotte Douglas International Airport
3,1,258,N3HYAA,LAX,MIA,00:20:00,00:15:00,-5.0,285.0,2342,...,,,,,,2015-01-01,AA,American Airlines Inc.,Los Angeles International Airport,Miami International Airport
4,1,135,N527AS,SEA,ANC,00:25:00,00:24:00,-1.0,235.0,1448,...,,,,,,2015-01-01,AS,Alaska Airlines Inc.,Seattle-Tacoma International Airport,Ted Stevens Anchorage International Airport


_____________________
# Missing Values

In [15]:
# Count the null entries in every column, then report them.
null_counts = df_flights.isna().sum()
print("Delayed Flights Missing Values:\n", null_counts)

Delayed Flights Missing Values:
 MONTH                        0
FLIGHT_NUMBER                0
TAIL_NUMBER              14721
ORIGIN_AC                    0
DESTINATION_AC               0
SCHEDULED_DEPARTURE          0
DEPARTURE_TIME           86153
DEPARTURE_DELAY          86153
SCHEDULED_TIME               4
DISTANCE                     0
SCHEDULED_ARRIVAL            0
ARRIVAL_TIME             91472
ARRIVAL_DELAY           103522
DIVERTED                     0
CANCELLED                    0
CANCELLATION_REASON          0
AIR_SYSTEM_DELAY       4754090
SECURITY_DELAY         4754090
AIRLINE_DELAY          4754090
LATE_AIRCRAFT_DELAY    4754090
WEATHER_DELAY          4754090
DATE                         0
AIRLINE_CODE                 0
AIRLINE_NAME                 0
ORIGIN_AIRPORT          480211
DESTINATION_AIRPORT     480211
dtype: int64


In [29]:
# How often each flight number occurs, most frequent first.
df_flights['FLIGHT_NUMBER'].value_counts()

469     3913
326     3460
327     3459
188     3304
667     3298
403     3266
315     3248
407     3235
61      3226
761     3197
223     3194
657     3172
330     3128
719     3127
404     3122
520     3119
604     3116
511     3112
466     3108
357     3105
311     3090
470     3074
711     3068
50      3057
660     3054
612     3014
15      3011
746     3003
720     3002
406     2948
        ... 
6812       1
6799       1
6645       1
6625       1
9320       1
8445       1
6793       1
8442       1
6828       1
6794       1
6783       1
6777       1
6623       1
6850       1
6752       1
9793       1
6689       1
6775       1
6690       1
6657       1
6765       1
6731       1
6705       1
8409       1
8410       1
6663       1
6757       1
6758       1
6759       1
6096       1
Name: FLIGHT_NUMBER, Length: 6952, dtype: int64

### DEPARTURE_DELAY & ARRIVAL_DELAY

In [20]:
# Compare the number of missing delay values with the number of cancellations.
dep_delay_nulls = df_flights['DEPARTURE_DELAY'].isna().sum()
arr_delay_nulls = df_flights['ARRIVAL_DELAY'].isna().sum()
cancelled_total = df_flights['CANCELLED'].sum()

print("Departure Delays Missing Values = ", dep_delay_nulls)
print("Arrival Delays Missing Values = ", arr_delay_nulls)
print("Cancelled flights = ", cancelled_total)

Departure Delays Missing Values =  86153
Arrival Delays Missing Values =  103522
Cancelled flights =  89270


In [23]:
# Boolean masks for the three conditions being cross-checked.
dep_delay_missing = df_flights['DEPARTURE_DELAY'].isna()
arr_delay_missing = df_flights['ARRIVAL_DELAY'].isna()
was_cancelled = df_flights['CANCELLED'] == 1

# Summing a boolean mask counts the rows where it is True.
print("Flights without departure and arrival delays = ", (dep_delay_missing & arr_delay_missing).sum())

print("Cancelled flights without arrival delay = ", (arr_delay_missing & was_cancelled).sum())

print("Cancelled flights without departure delay = ", (dep_delay_missing & was_cancelled).sum())

Flights without departure and arrival delays =  86153
Cancelled flights without arrival delay =  89270
Cancelled flights without departure delay =  86153


#### Conclusions:
- Flights without a departure delay don't have an arrival delay either. However, some flights have a departure delay but no arrival delay.
- All cancelled flights have no arrival delay, which makes sense.

In [None]:
# Fill missing values with mean delay for each airline.
# NOTE(review): disabled experiment, kept for reference. It groups by an
# 'AIRLINE' column, but the missing-values listing earlier in this notebook
# shows only AIRLINE_CODE and AIRLINE_NAME — confirm the column name before
# re-enabling.

# airlines_dep_delays = df_flights.groupby('AIRLINE', sort=False)['DEPARTURE_DELAY'].mean()
# airlines_arr_delays = df_flights.groupby('AIRLINE', sort=False)['ARRIVAL_DELAY'].mean()

# print(airlines_dep_delays.sort_values())
# print(airlines_arr_delays.sort_values())

In [None]:

# NOTE(review): disabled imputation loop, kept for reference. It needs
# `airlines_dep_delays` and `airlines_arr_delays`, which are only defined in
# commented-out code, and it reads an 'AIRLINE' column — confirm both before
# re-enabling.
# for idx in df_flights.index[df_flights['DEPARTURE_DELAY'].isnull()]:    
    
#     # If the flight is cancelled, delay = max delay
#     if(df_flights.loc[idx,'CANCELLED'] == 1):
#         df_flights.at[idx,'DEPARTURE_DELAY'] = airlines_dep_delays.max()
#         df_flights.at[idx,'ARRIVAL_DELAY'] = airlines_arr_delays.max()
    
#     else:
#         airline_code = df_flights.loc[idx,'AIRLINE']
#         df_flights.at[idx,'DEPARTURE_DELAY'] = airlines_dep_delays.at[airline_code]
#         df_flights.at[idx,'ARRIVAL_DELAY'] = airlines_arr_delays.at[airline_code]


In [19]:
# Remove flights with missing delays

# Index labels of every row lacking a departure delay, an arrival delay, or both.
delay_is_missing = df_flights['DEPARTURE_DELAY'].isna() | df_flights['ARRIVAL_DELAY'].isna()
missing = list(df_flights[delay_is_missing].index)

len(missing)

18918

In [21]:
# Drop the rows whose index labels were collected in `missing`.
# The explicit .copy() makes the result an independent frame, so adding or
# modifying columns on it later does not trigger pandas'
# SettingWithCopyWarning.
df_flights = df_flights.loc[~df_flights.index.isin(missing)].copy()
print(len(df_flights))

5714008


____

# Feature Engineering

## Add Columns

In [4]:
# Add column for flight Class (Early, On_Time, Delayed, Cancelled)

# Default value: every flight starts as 'On_Time'; the cells that follow
# overwrite the label for cancelled, delayed and early flights.
df_flights['CLASS'] = 'On_Time'

In [9]:
# Distribution of the CANCELLED flag in the current frame.
df_flights['CANCELLED'].value_counts()

0    5592910
Name: CANCELLED, dtype: int64

In [7]:
# 1- Cancelled flights

# Rows flagged CANCELLED == 1 receive the 'Cancelled' label.
# NOTE(review): the earlier checks show every cancelled flight lacks an
# arrival delay, and rows with missing delays were removed, so this mask is
# expected to match no rows at this point — confirm that is intended.
is_cancelled = df_flights['CANCELLED'] == 1
cancelled_flights = list(df_flights[is_cancelled].index)
# .loc with a boolean mask is the supported way to assign to many rows;
# .at addresses exactly one scalar cell and rejects a list of labels.
df_flights.loc[is_cancelled, 'CLASS'] = 'Cancelled'

print("# Cancelled flights = ", len(df_flights[df_flights['CLASS'] == 'Cancelled']))

# Cancelled flights =  0


In [6]:
# 2- Delayed flights

# A flight counts as delayed when it departed late or arrived late.
is_delayed = (df_flights['DEPARTURE_DELAY'] > 0) | (df_flights['ARRIVAL_DELAY'] > 0)
delayed_flights = list(df_flights[is_delayed].index)
# .loc with a boolean mask is the supported way to assign to many rows;
# .at addresses exactly one scalar cell and rejects a list of labels.
df_flights.loc[is_delayed, 'CLASS'] = 'Delayed'

print("# Delayed flights = ", len(df_flights[df_flights['CLASS'] == 'Delayed']))

# Delayed flights =  2572700


In [None]:
# 3- Early flights

# A flight counts as early only when it both departed and arrived ahead of
# schedule (both delays negative).
is_early = (df_flights['DEPARTURE_DELAY'] < 0) & (df_flights['ARRIVAL_DELAY'] < 0)
early_flights = list(df_flights[is_early].index)
# .loc with a boolean mask is the supported way to assign to many rows;
# .at addresses exactly one scalar cell and rejects a list of labels.
df_flights.loc[is_early, 'CLASS'] = 'Early'
print("# Early flights = ", len(early_flights))

# Remove the early flights
#df = df[~df.index.isin(early_flights)]
#print("Was: ", df.shape, " Now: ", df.shape)


In [None]:
# 4- On-time flights

# Rows still carrying the default label are the on-time flights.
on_time_count = (df_flights['CLASS'] == 'On_Time').sum()
print("On-time flights = ",on_time_count)

In [None]:
# Number of flights per CLASS label, to see how balanced the classes are.
df_flights['CLASS'].value_counts()

## Missing Values

In [None]:
# Null count per column before the delay-reason columns are filled.
df_flights.isnull().sum()

In [None]:
# Delay-reason columns: replace their nulls with 0.0.
msv_columns = [
    'AIR_SYSTEM_DELAY',
    'SECURITY_DELAY',
    'AIRLINE_DELAY',
    'LATE_AIRCRAFT_DELAY',
    'WEATHER_DELAY',
]
df_flights[msv_columns] = df_flights[msv_columns].fillna(value=0.0)

In [None]:
# Re-check the null counts after filling the delay-reason columns.
df_flights.isnull().sum()

## Categorical Columns

In [None]:
# get_dummies

def get_dum(df):
    """One-hot encode the categorical flight columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns 'MONTH', 'ORIGIN_AC', 'DESTINATION_AC'
        and 'AIRLINE_CODE'.

    Returns
    -------
    new_df : pandas.DataFrame
        The indicator columns followed by every original column of ``df``.
    df_dummies : pandas.DataFrame
        The indicator columns alone.
    """
    categorical = ['MONTH','ORIGIN_AC','DESTINATION_AC', 'AIRLINE_CODE']
    # Name the columns explicitly: without `columns=`, get_dummies only
    # encodes object/string/category columns, so an integer MONTH would pass
    # through unencoded and then appear twice after the concat below.
    df_dummies = pd.get_dummies(df[categorical], columns=categorical)
    new_df = pd.concat([df_dummies, df], axis=1)

    return new_df, df_dummies

In [None]:
# new_df: dummy columns plus the original columns; df_dummies: the dummy columns alone.
new_df, df_dummies = get_dum(df_flights)

__________________
# Training The Models

_________
# Testing The Models

____________
# Model Evaluation