# Classification

In [None]:
# import the libraries

%matplotlib inline

import pandas as pd
import numpy as np
import scipy
import matplotlib.pyplot as plt
import seaborn as sns

# sklearn :: utils
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold

# sklearn :: models
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier


# sklearn :: evaluation metrics
from sklearn.metrics import cohen_kappa_score

# Show floats as fixed-point numbers with two decimals rather than scientific notation
pd.set_option('display.float_format', '{:.2f}'.format)
# Draw every plot in the notebook with seaborn's white-grid style
sns.set_style('whitegrid')

________________________
# Load Data

In [None]:
# Cleaned data can be downloaded here:
# TODO(review): the download URL is missing from this comment — add it.

# low_memory=False makes pandas parse the file in one pass instead of in chunks,
# which avoids mixed-dtype inference within a column.
df_flights = pd.read_csv('../data/flightsmerged.csv', low_memory=False)

In [None]:
# Display the column labels of the loaded frame
df_flights.columns

____________________
# Change Format

In [None]:
# Turn the 0/1 indicator columns into real booleans.
# The previous `replace(to_replace=[0, 1], value=[False, True])` relied on pandas
# silently downcasting the replaced result back to bool; recent pandas releases
# deprecate that downcast, so cast explicitly for a stable bool dtype.
bool_cols = ['CANCELLED', 'DIVERTED']

# Fail loudly on anything other than 0/1 (e.g. NaN): astype(bool) would
# otherwise convert such values to True without any warning.
assert df_flights[bool_cols].isin([0, 1]).all().all(), \
    "CANCELLED/DIVERTED must contain only 0/1 before the boolean cast"

df_flights[bool_cols] = df_flights[bool_cols].astype(bool)
df_flights[bool_cols].head()

_____________________
# Missing Values

In [None]:
# Print the number of missing (null) entries in every column
print("Missing Values:\n", df_flights.isnull().sum())

## DELAY CAUSES

In [None]:
# Columns that break a delay down by cause
delay_cause_cols = [
    'AIR_SYSTEM_DELAY',
    'SECURITY_DELAY',
    'AIRLINE_DELAY',
    'LATE_AIRCRAFT_DELAY',
    'WEATHER_DELAY',
]

# A missing cause is filled with 0.0 minutes
df_flights[delay_cause_cols] = df_flights[delay_cause_cols].fillna(value=0.0)

## DEPARTURE_DELAY & ARRIVAL_DELAY

In [None]:
# Count the missing entries in each delay column
departure_nulls = df_flights['DEPARTURE_DELAY'].isna().sum()
arrival_nulls = df_flights['ARRIVAL_DELAY'].isna().sum()
# Summing the CANCELLED flags counts the cancelled flights
cancelled_total = df_flights['CANCELLED'].sum()

print("Departure Delays Missing Values = ", departure_nulls)
print("Arrival Delays Missing Values = ", arrival_nulls)
print("Cancelled flights = ", cancelled_total)

In [None]:
# Row masks reused by the three counts below
dep_missing = df_flights['DEPARTURE_DELAY'].isnull()
arr_missing = df_flights['ARRIVAL_DELAY'].isnull()
was_cancelled = df_flights['CANCELLED'] == 1

# Summing a boolean mask counts the rows it selects
print("Flights without departure and arrival delays = ",
      (dep_missing & arr_missing).sum())

print("Cancelled flights without arrival delay = ",
      (arr_missing & was_cancelled).sum())

print("Cancelled flights without departure delay = ",
      (dep_missing & was_cancelled).sum())

#### Conclusions:
- Flights without a departure delay don't have an arrival delay either. However, some flights have a departure delay but no arrival delay.
- All cancelled flights lack an arrival delay, which makes sense.

In [None]:
# Average departure and arrival delay per airline.
# sort=False keeps the airlines in order of first appearance; mean() skips missing values.
flights_by_airline = df_flights.groupby('AIRLINE_NAME', sort=False)

airlines_dep_delays = flights_by_airline['DEPARTURE_DELAY'].mean()
airlines_arr_delays = flights_by_airline['ARRIVAL_DELAY'].mean()

# Show both rankings in ascending order of mean delay
for mean_delays in (airlines_dep_delays, airlines_arr_delays):
    print(mean_delays.sort_values())

In [None]:

# Impute delays for flights that have no DEPARTURE_DELAY and were not cancelled:
# both DEPARTURE_DELAY and ARRIVAL_DELAY are set to the mean delay of the
# flight's airline. This uses one boolean mask and Series.map instead of a
# Python-level loop over rows, which is much slower on a large frame.
needs_airline_mean = df_flights['DEPARTURE_DELAY'].isnull() & (df_flights['CANCELLED'] == 0)
airline_of_row = df_flights.loc[needs_airline_mean, 'AIRLINE_NAME']

# map() looks each airline name up in the per-airline mean Series; the result
# keeps the row index, so .loc aligns it back onto exactly the selected rows.
df_flights.loc[needs_airline_mean, 'DEPARTURE_DELAY'] = airline_of_row.map(airlines_dep_delays)
df_flights.loc[needs_airline_mean, 'ARRIVAL_DELAY'] = airline_of_row.map(airlines_arr_delays)


In [None]:
# For the remaining missing values in ARRIVAL_DELAY:
# if the flight was not cancelled, reuse its DEPARTURE_DELAY as the arrival delay.
# A boolean mask with .loc replaces the former row-by-row Python loop.
arrival_gap = df_flights['ARRIVAL_DELAY'].isnull() & (df_flights['CANCELLED'] == 0)
df_flights.loc[arrival_gap, 'ARRIVAL_DELAY'] = df_flights.loc[arrival_gap, 'DEPARTURE_DELAY']


In [None]:

# Rows that still lack a departure delay or an arrival delay after imputation
delay_still_missing = df_flights['DEPARTURE_DELAY'].isnull() | df_flights['ARRIVAL_DELAY'].isnull()
missing = list(df_flights[delay_still_missing].index)

len(missing)

#### Missing values in Arrival and Departure Delays = Cancelled flights

In [None]:
# Re-check the per-column count of missing values after the imputation cells
df_flights.isnull().sum()

________________
## Add Columns

In [None]:
# Add a CLASS column holding each flight's label (Early, On_Time, Delayed, Cancelled)

# Every row starts as 'On_Time'; the following cells overwrite it for cancelled, delayed and early flights
df_flights['CLASS'] = 'On_Time'

In [None]:
# Show how many rows carry each CANCELLED value
df_flights['CANCELLED'].value_counts()

In [None]:
# 1- Cancelled flights

# DataFrame.at is a scalar accessor (one row label, one column label) and is not
# meant to receive a list of labels; use .loc with a boolean mask to label many rows.
is_cancelled = df_flights['CANCELLED'] == True
# Kept as a plain list of index labels, as before
cancelled_flights = df_flights.index[is_cancelled].tolist()
df_flights.loc[is_cancelled, 'CLASS'] = 'Cancelled'

print("# Cancelled flights = ", len(df_flights[df_flights['CLASS'] == 'Cancelled']))

In [None]:
# 2- Delayed flights

# A flight is delayed when it departed late or arrived late.
is_delayed = (df_flights['DEPARTURE_DELAY'] > 0) | (df_flights['ARRIVAL_DELAY'] > 0)

# Cancelled flights are excluded: a cancelled flight that recorded a positive
# departure delay would otherwise be relabelled from 'Cancelled' to 'Delayed'.
not_cancelled = df_flights['CANCELLED'] != 1
delayed_mask = is_delayed & not_cancelled

# DataFrame.at is a scalar accessor and is not meant to receive a list of labels;
# use .loc with a boolean mask to label many rows at once.
delayed_flights = df_flights.index[delayed_mask].tolist()
df_flights.loc[delayed_mask, 'CLASS'] = 'Delayed'

print("# Delayed flights = ", len(df_flights[df_flights['CLASS'] == 'Delayed']))

In [None]:
# 3- Early flights

# A flight is early only when it both departed and arrived ahead of schedule.
is_early = (df_flights['DEPARTURE_DELAY'] < 0) & (df_flights['ARRIVAL_DELAY'] < 0)

# DataFrame.at is a scalar accessor and is not meant to receive a list of labels;
# use .loc with a boolean mask to label many rows at once.
early_flights = df_flights.index[is_early].tolist()
df_flights.loc[is_early, 'CLASS'] = 'Early'
print("# Early flights = ", len(early_flights))

# Remove the early flights
#df = df[~df.index.isin(early_flights)]
#print("Was: ", df.shape, " Now: ", df.shape)


In [None]:
# 4- On-time flights
# These are the rows whose CLASS is still the 'On_Time' default, i.e. not relabelled by the cells above

print("On-time flights = ",len(df_flights[df_flights['CLASS'] == 'On_Time']))

In [None]:
# Number of flights per CLASS label
df_flights['CLASS'].value_counts()

____

# Feature Engineering

## Categorical Columns

In [None]:
# get_dummies

def get_dum(df, categorical=None):
    """One-hot encode the categorical columns of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns to encode. It is not modified.
    categorical : list of str, optional
        Columns to encode. Defaults to MONTH, FLIGHT_NUMBER and AIRLINE_NAME.
        A name is matched exactly first and then with surrounding whitespace
        ignored, so a header such as ``'FLIGHT_NUMBER '`` is still found.

    Returns
    -------
    new_df : pandas.DataFrame
        The dummy columns followed by every original column of ``df``.
    df_dummies : pandas.DataFrame
        Only the dummy columns.

    Raises
    ------
    KeyError
        If a requested column is not present in ``df``.
    """
    if categorical is None:
        categorical = ['MONTH', 'FLIGHT_NUMBER', 'AIRLINE_NAME']

    # Resolve each requested name to the header actually present in df.
    headers_by_stripped_name = {str(col).strip(): col for col in df.columns}
    resolved = []
    for name in categorical:
        if name in df.columns:
            resolved.append(name)
        elif str(name).strip() in headers_by_stripped_name:
            resolved.append(headers_by_stripped_name[str(name).strip()])
        else:
            raise KeyError(f"Categorical column {name!r} not found in the DataFrame")

    # Without `columns=`, get_dummies only encodes object/category columns, so
    # numeric ones (e.g. MONTH) pass through unchanged and then collide with
    # the originals in the concat below. Naming the columns forces the encoding.
    # NOTE(review): a high-cardinality column such as FLIGHT_NUMBER may yield a
    # very wide frame — confirm that memory use is acceptable on the full data.
    df_dummies = pd.get_dummies(df[resolved], columns=resolved)
    new_df = pd.concat([df_dummies, df], axis=1)

    return new_df, df_dummies

In [None]:
# new_df: the get_dummies output concatenated with the original columns; df_dummies: the get_dummies output alone
new_df, df_dummies = get_dum(df_flights)

__________________
# Training The Models

_________
# Testing The Models

____________
# Model Evaluation