# Classification

In [4]:
# import the libraries

%matplotlib inline

import pandas as pd
import numpy as np
import scipy
import matplotlib.pyplot as plt
import seaborn as sns

# sklearn :: utils
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold

# sklearn :: models
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier

# sklearn :: evaluation metrics
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

# Render floats with two fixed decimals instead of scientific notation
pd.set_option('display.float_format', '{:.2f}'.format)
sns.set_style('whitegrid')

# Problem definition

Predict the class of a flight (Early, On-Time, Delayed, or Cancelled).

________________________
# Load Data

In [5]:
# Location of the merged flights CSV, relative to this notebook
flights_csv_path = '../../Data/flightsmerged.csv'
# low_memory=False: parse the file in one pass instead of in chunks, which
# avoids mixed-type inference within a column
df_flights = pd.read_csv(flights_csv_path, low_memory=False)

In [6]:
# Preview the first rows to sanity-check the load
df_flights.head()

Unnamed: 0,MONTH,FLIGHT_NUMBER,TAIL_NUMBER,ORIGIN_AC,DESTINATION_AC,SCHEDULED_DEPARTURE,DEPARTURE_TIME,DEPARTURE_DELAY,SCHEDULED_TIME,DISTANCE,...,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY,DATE,CLASS,AIRLINE_CODE,AIRLINE_NAME,ORIGIN_AIRPORT,DESTINATION_AIRPORT
0,1,98,N407AS,ANC,SEA,00:05:00,23:54:00,-11.0,205.0,1448,...,0.0,0.0,0.0,0.0,2015-01-01,Early,AS,Alaska Airlines Inc.,Ted Stevens Anchorage International Airport,Seattle-Tacoma International Airport
1,1,2336,N3KUAA,LAX,PBI,00:10:00,00:02:00,-8.0,280.0,2330,...,0.0,0.0,0.0,0.0,2015-01-01,Early,AA,American Airlines Inc.,Los Angeles International Airport,Palm Beach International Airport
2,1,840,N171US,SFO,CLT,00:20:00,00:18:00,-2.0,286.0,2296,...,0.0,0.0,0.0,0.0,2015-01-01,Delayed,US,US Airways Inc.,San Francisco International Airport,Charlotte Douglas International Airport
3,1,258,N3HYAA,LAX,MIA,00:20:00,00:15:00,-5.0,285.0,2342,...,0.0,0.0,0.0,0.0,2015-01-01,Early,AA,American Airlines Inc.,Los Angeles International Airport,Miami International Airport
4,1,135,N527AS,SEA,ANC,00:25:00,00:24:00,-1.0,235.0,1448,...,0.0,0.0,0.0,0.0,2015-01-01,Early,AS,Alaska Airlines Inc.,Seattle-Tacoma International Airport,Ted Stevens Anchorage International Airport


In [7]:
# List the available columns before any feature engineering
df_flights.columns

Index(['MONTH', 'FLIGHT_NUMBER', 'TAIL_NUMBER', 'ORIGIN_AC', 'DESTINATION_AC',
       'SCHEDULED_DEPARTURE', 'DEPARTURE_TIME', 'DEPARTURE_DELAY',
       'SCHEDULED_TIME', 'DISTANCE', 'SCHEDULED_ARRIVAL', 'ARRIVAL_TIME',
       'ARRIVAL_DELAY', 'DIVERTED', 'CANCELLED', 'CANCELLATION_REASON',
       'AIR_SYSTEM_DELAY', 'SECURITY_DELAY', 'AIRLINE_DELAY',
       'LATE_AIRCRAFT_DELAY', 'WEATHER_DELAY', 'DATE', 'CLASS', 'AIRLINE_CODE',
       'AIRLINE_NAME', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT'],
      dtype='object')

____

# Feature Engineering

## One-hot encoding

In [8]:
# One-hot encode the categorical features with get_dummies.
# Each listed column is replaced by one indicator column per distinct value,
# named '<column>_<value>' (e.g. 'MONTH_1').

categorical = ['AIRLINE_NAME','MONTH']

# Only encode the columns that are still present. After the first run the
# original columns are gone, so re-running this cell is a no-op instead of
# raising a KeyError.
pending = [col for col in categorical if col in df_flights.columns]

if pending:
    # The untouched columns keep their order; the indicator columns are
    # appended after them, grouped in the order given in `pending`.
    # The original columns are removed by get_dummies itself.
    df_flights = pd.get_dummies(df_flights, columns=pending)


In [9]:
# Check the column list again after the encoding step
df_flights.columns

Index(['FLIGHT_NUMBER', 'TAIL_NUMBER', 'ORIGIN_AC', 'DESTINATION_AC',
       'SCHEDULED_DEPARTURE', 'DEPARTURE_TIME', 'DEPARTURE_DELAY',
       'SCHEDULED_TIME', 'DISTANCE', 'SCHEDULED_ARRIVAL', 'ARRIVAL_TIME',
       'ARRIVAL_DELAY', 'DIVERTED', 'CANCELLED', 'CANCELLATION_REASON',
       'AIR_SYSTEM_DELAY', 'SECURITY_DELAY', 'AIRLINE_DELAY',
       'LATE_AIRCRAFT_DELAY', 'WEATHER_DELAY', 'DATE', 'CLASS', 'AIRLINE_CODE',
       'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT',
       'AIRLINE_NAME_Alaska Airlines Inc.',
       'AIRLINE_NAME_American Airlines Inc.',
       'AIRLINE_NAME_American Eagle Airlines Inc.',
       'AIRLINE_NAME_Atlantic Southeast Airlines',
       'AIRLINE_NAME_Delta Air Lines Inc.',
       'AIRLINE_NAME_Frontier Airlines Inc.',
       'AIRLINE_NAME_Hawaiian Airlines Inc.', 'AIRLINE_NAME_JetBlue Airways',
       'AIRLINE_NAME_Skywest Airlines Inc.',
       'AIRLINE_NAME_Southwest Airlines Co.', 'AIRLINE_NAME_Spirit Air Lines',
       'AIRLINE_NAME_US Airways Inc.', 'AI

## Select Model Columns

In [16]:
# Model inputs: the airline and month indicator columns plus the flight distance.

airline_names = [
    'Alaska Airlines Inc.',
    'American Airlines Inc.',
    'American Eagle Airlines Inc.',
    'Atlantic Southeast Airlines',
    'Delta Air Lines Inc.',
    'Frontier Airlines Inc.',
    'Hawaiian Airlines Inc.',
    'JetBlue Airways',
    'Skywest Airlines Inc.',
    'Southwest Airlines Co.',
    'Spirit Air Lines',
    'US Airways Inc.',
    'United Air Lines Inc.',
    'Virgin America',
]
airline_columns = ['AIRLINE_NAME_' + name for name in airline_names]

# Month numbers ordered as strings: '1', '10', '11', '12', '2', ..., '9'
month_labels = sorted(str(month) for month in range(1, 13))
month_columns = ['MONTH_' + label for label in month_labels]

X_columns = airline_columns + month_columns + ['DISTANCE']

# Target column, kept as a one-element list so that df_flights[y_column]
# is a one-column DataFrame rather than a Series
y_column = ['CLASS']

In [17]:
# Shuffled hold-out split; `threshold` is the fraction of rows kept for training.

threshold = 0.7

X = df_flights.loc[:, X_columns]
y = df_flights.loc[:, y_column]

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=1.0-threshold,
    shuffle=True,
    random_state=50,
)

# Report the shape of every partition as a quick sanity check
partitions = (
    ('X_train', X_train),
    ('y_train', y_train),
    ('X_test', X_test),
    ('y_test', y_test),
)
for label, part in partitions:
    print(label, part.shape)

X_train (3988254, 27)
y_train (3988254, 1)
X_test (1709252, 27)
y_test (1709252, 1)


_________
# Training and Testing The Models

In [18]:
# # sklearn :: utils
# from sklearn.model_selection import train_test_split
# from sklearn.model_selection import KFold
# from sklearn.metrics import confusion_matrix
# from sklearn.metrics import precision_score
# from sklearn.metrics import recall_score

# models = [
#     ('Naive Bayes', GaussianNB()),
#     ('RandomForestClassifier-100', RandomForestClassifier(n_estimators=100, random_state=42)),
#     ('KNeighborsClassifier-10', KNeighborsClassifier(n_neighbors=10)),
#     ('GradientBoostingClassifier-150', GradientBoostingClassifier(n_estimators=150))
# ]

# results = []
# for m in models:
#     print('MODEL', m[0])
#     model = m[1]
#     model.fit(X_train, y_train.values.ravel())
#     y_pred = model.predict(X_test)
#     precision = precision_score(y_test, y_pred, average='weighted')
#     recall = recall_score(y_test, y_pred, average='weighted')
#     print(confusion_matrix(y_test, y_pred))
#     print('Precision', precision)
#     print('Recall', recall)
#     results.append([m[0], precision, recall])
    
#     # print the top 10 feature importances
#     importance = []
    
#     if hasattr(model, 'feature_importances_'):
#         print('Feature Importance')
#         importance = []
#         for i in range(len(X_columns)):
#             importance.append([X_columns[i], model.feature_importances_[i]])
#         print(pd.DataFrame(importance).sort_values(by=1, ascending=False).head(10))
#     elif hasattr(model, 'coef_'):
#         print('Feature Importance')
#         for i in range(len(X_columns)):
#             importance.append([X_columns[i], model.coef_[i]])
#         print(pd.DataFrame(importance).sort_values(by=1, ascending=False).head(10))
        
#     print('')


In [19]:

# # sort the results and print as a table
# df_results = pd.DataFrame(results)
# df_results.columns = ['model', 'precision', 'recall']
# df_results = df_results.sort_values(by='precision', ascending=False)
# df_results

In [20]:
# Keep only the first rows of each partition (presumably to keep training time
# manageable - confirm). The split above was shuffled, so this is not a
# date-ordered slice.
max_train_rows = 1000000
max_test_rows = 200000

X_train, y_train = X_train.head(max_train_rows), y_train.head(max_train_rows)
X_test, y_test = X_test.head(max_test_rows), y_test.head(max_test_rows)

In [21]:
# Fit a random forest on the training partition and predict the test partition.
# A fixed random_state makes the forest, and so the scores reported below,
# reproducible across runs.
rf_model = RandomForestClassifier(random_state=42)
# y_train is a one-column frame; the estimator expects a 1-D label array
rf_model.fit(X_train, y_train.values.ravel())
rf_pred = rf_model.predict(X_test)



In [22]:
# Fit gradient boosting on the training partition and predict the test partition.
# NOTE: the saved run of this cell ended in a KeyboardInterrupt, so `gb_pred`
# only exists once this cell has been allowed to finish.
# A fixed random_state makes the fitted model reproducible across runs.
gb_model = GradientBoostingClassifier(random_state=42)
# y_train is a one-column frame; the estimator expects a 1-D label array
gb_model.fit(X_train, y_train.values.ravel())
gb_pred = gb_model.predict(X_test)

KeyboardInterrupt: 

____________
# Model Evaluation

In [None]:
# Evaluate the random forest predictions on the test partition.
# y_test has shape (n, 1); flatten it to match the 1-D predictions.
y_test_labels = np.ravel(y_test)

# Unweighted kappa: the classes are string labels, which sklearn orders
# alphabetically, so a quadratic weighting would penalise disagreements by an
# arbitrary "distance". To use a weighted kappa, pass labels=[...] in a
# meaningful ordinal order together with weights='quadratic'.
rf_kappa = cohen_kappa_score(y_test_labels, rf_pred)
print('kappa', round(rf_kappa, 4))
# Rows are the true classes, columns the predicted classes (sorted label order)
print(confusion_matrix(y_test_labels, rf_pred))

In [None]:
# Evaluate the gradient boosting predictions on the test partition.
# y_test has shape (n, 1); flatten it to match the 1-D predictions.
y_test_labels = np.ravel(y_test)

# Unweighted kappa: the classes are string labels, which sklearn orders
# alphabetically, so a quadratic weighting would penalise disagreements by an
# arbitrary "distance". To use a weighted kappa, pass labels=[...] in a
# meaningful ordinal order together with weights='quadratic'.
gb_kappa = cohen_kappa_score(y_test_labels, gb_pred)
print('kappa', round(gb_kappa, 4))
# Rows are the true classes, columns the predicted classes (sorted label order)
print(confusion_matrix(y_test_labels, gb_pred))

## Using Cross-Validation

In [None]:
# k-fold cross-validation on the full feature matrix, scored with Cohen's kappa.
k = 10
results = []

# Create the estimator here so the cell runs on a fresh kernel instead of
# relying on a `model` variable defined by another cell.
# NOTE(review): presumably the random forest was the intended model - confirm.
cv_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Shuffle before splitting, as the hold-out split does; unshuffled folds would
# be contiguous slices of the file's row order.
kf = KFold(n_splits=k, shuffle=True, random_state=50)

# Convert once instead of on every fold
X_values = X.values
y_values = y.values.ravel()

for train_index, test_index in kf.split(X_values):
    # Fold-local names: the hold-out X_train / X_test / y_train / y_test used
    # by the other cells must not be overwritten here.
    X_fold_train, X_fold_test = X_values[train_index], X_values[test_index]
    y_fold_train, y_fold_test = y_values[train_index], y_values[test_index]
    cv_model.fit(X_fold_train, y_fold_train)
    y_fold_pred = cv_model.predict(X_fold_test)
    # Unweighted kappa: the classes are string labels ordered alphabetically by
    # sklearn, so a quadratic weighting would use an arbitrary class distance.
    kappa = cohen_kappa_score(y_fold_test, y_fold_pred)
    results.append(round(kappa, 4))

print('Kappa for each fold:', results)
print('AVG(kappa)', round(np.mean(results), 4))
print('STD(kappa)', round(np.std(results), 4))

# Tuning the Thresholds


In [None]:
# One-vs-rest threshold sweep for a single class.
#
# The target holds several string classes, so a probability column turned into
# 0/1 predictions can only be scored against a 0/1 version of the truth:
# "is this flight POSITIVE_CLASS or not?".
POSITIVE_CLASS = 'Delayed'

model = RandomForestClassifier(n_estimators=100, random_state=42)
# np.ravel flattens the (n, 1) label column into the 1-D array fit() expects
model.fit(X_train, np.ravel(y_train))

# predict_proba columns follow model.classes_, so look the column up by label
# instead of assuming it is column 1 (raises ValueError if the class is absent).
positive_col = list(model.classes_).index(POSITIVE_CLASS)
# The probabilities do not depend on the cut-off, so compute them only once
positive_proba = model.predict_proba(X_test)[:, positive_col]
y_true = (np.ravel(y_test) == POSITIVE_CLASS).astype(int)

for i in range(1,10):
    print(i)
    cutoff = i/10.0
    # 1 = predicted POSITIVE_CLASS, 0 = any other class
    y_pred = (positive_proba > cutoff).astype(int)
    precision = precision_score(y_true, y_pred,average='weighted')
    recall = recall_score(y_true, y_pred,average='weighted')
    print(confusion_matrix(y_true, y_pred))
    print('Precision', precision)
    print('Recall', recall)