In [48]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns


from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score

from sklearn.cluster import DBSCAN, KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler


from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from lightgbm import LGBMClassifier

from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import RFECV
####################################################
# Load the data
df = pd.read_csv("n/full_data_flightdelay.csv.xz", compression='xz')
# df = pd.read_csv("test.csv.xz", compression='xz')
#print(df.head())

# take a random sample of 1000 rows
df = df.sample(n=10000)

ontime = df[df['DEP_DEL15'] == 0]
delayed = df[df['DEP_DEL15'] == 1]

In [49]:
# DATA CLEANING
# encode the categorical data
le = LabelEncoder()

def clean_labels_encoder(list_of_labels, df):
    for label in list_of_labels:
        df[label] = le.fit_transform(df[label])
    return df

# clean the labels
list_of_labels = ['CARRIER_NAME', 'DEPARTING_AIRPORT', 'PREVIOUS_AIRPORT', 'DEP_TIME_BLK']
df = clean_labels_encoder(list_of_labels, df)

# Drop the columns that are not needed
# df = df.drop(['CARRIER_HISTORICAL', 'DEP_AIRPORT_HIST', 'DAY_HISTORICAL',
#        'DEP_BLOCK_HIST'], axis=1)

# Fill the missing values
df.fillna(df.mean(), inplace=True)

In [50]:

# Perform Random Forest, KNN, SVM, and LGBM with the scaled data
def perform_classification(df):
    X = df.drop(['DEP_DEL15'], axis=1)
    y = df['DEP_DEL15']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    classifiers = {RandomForestClassifier(): 'Random Forest', 
               KNeighborsClassifier(n_neighbors=5): 'KNN', 
               SVC(): 'SVM',
               LGBMClassifier(verbose=-1): 'LGBM'}

    scalers = {StandardScaler():"Standard Scaler", MinMaxScaler(): "MinMax Scaler"}
    for scaler, name_scaler in scalers.items():
        print(name_scaler)
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)
        for clf, name in classifiers.items():
            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)
            print(name)
            print(accuracy_score(y_test, y_pred))
            print(confusion_matrix(y_test, y_pred))
            print(classification_report(y_test, y_pred))
perform_classification(df)

Standard Scaler
Random Forest
0.8115
[[1608   21]
 [ 356   15]]
              precision    recall  f1-score   support

           0       0.82      0.99      0.90      1629
           1       0.42      0.04      0.07       371

    accuracy                           0.81      2000
   macro avg       0.62      0.51      0.48      2000
weighted avg       0.74      0.81      0.74      2000

KNN
0.7905
[[1546   83]
 [ 336   35]]
              precision    recall  f1-score   support

           0       0.82      0.95      0.88      1629
           1       0.30      0.09      0.14       371

    accuracy                           0.79      2000
   macro avg       0.56      0.52      0.51      2000
weighted avg       0.72      0.79      0.74      2000

SVM
0.8145
[[1628    1]
 [ 370    1]]
              precision    recall  f1-score   support

           0       0.81      1.00      0.90      1629
           1       0.50      0.00      0.01       371

    accuracy                           0.8

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [51]:
# Perform Random Forest, KNN, SVM, and LGBM with the PCA data
def perform_classification_pca(df):
    X = df.drop(['DEP_DEL15'], axis=1)
    y = df['DEP_DEL15']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    classifiers = {RandomForestClassifier(): 'Random Forest', 
               KNeighborsClassifier(n_neighbors=5): 'KNN', 
               SVC(): 'SVM',
               LGBMClassifier(verbose=-1): 'LGBM'}

    scalers = {StandardScaler():"Standard Scaler", MinMaxScaler(): "MinMax Scaler"}
    for scaler, name_scaler in scalers.items():
        print(name_scaler)
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)
        pca = PCA(n_components=2)
        X_train = pca.fit_transform(X_train)
        X_test = pca.transform(X_test)
        for clf, name in classifiers.items():
            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)
            print(name)
            print(accuracy_score(y_test, y_pred))
            print(confusion_matrix(y_test, y_pred))
            print(classification_report(y_test, y_pred))

perform_classification_pca(df)


Standard Scaler
Random Forest
0.7865
[[1554   75]
 [ 352   19]]
              precision    recall  f1-score   support

           0       0.82      0.95      0.88      1629
           1       0.20      0.05      0.08       371

    accuracy                           0.79      2000
   macro avg       0.51      0.50      0.48      2000
weighted avg       0.70      0.79      0.73      2000

KNN
0.7865
[[1553   76]
 [ 351   20]]
              precision    recall  f1-score   support

           0       0.82      0.95      0.88      1629
           1       0.21      0.05      0.09       371

    accuracy                           0.79      2000
   macro avg       0.51      0.50      0.48      2000
weighted avg       0.70      0.79      0.73      2000

SVM
0.8145
[[1629    0]
 [ 371    0]]
              precision    recall  f1-score   support

           0       0.81      1.00      0.90      1629
           1       0.00      0.00      0.00       371

    accuracy                           0.8

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Random Forest
0.783
[[1547   82]
 [ 352   19]]
              precision    recall  f1-score   support

           0       0.81      0.95      0.88      1629
           1       0.19      0.05      0.08       371

    accuracy                           0.78      2000
   macro avg       0.50      0.50      0.48      2000
weighted avg       0.70      0.78      0.73      2000

KNN
0.7855
[[1551   78]
 [ 351   20]]
              precision    recall  f1-score   support

           0       0.82      0.95      0.88      1629
           1       0.20      0.05      0.09       371

    accuracy                           0.79      2000
   macro avg       0.51      0.50      0.48      2000
weighted avg       0.70      0.79      0.73      2000

SVM
0.8145
[[1629    0]
 [ 371    0]]
              precision    recall  f1-score   support

           0       0.81      1.00      0.90      1629
           1       0.00      0.00      0.00       371

    accuracy                           0.81      2000
   ma

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [52]:
# Perform Random Forest, KNN, SVM, and LGBM with weighted data

def perform_classification_weighted(df):
    X = df.drop(['DEP_DEL15'], axis=1)
    y = df['DEP_DEL15']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    weights = (y_train == 0).sum() / (1.0 * (y_train == 1).sum())
    classifiers = {RandomForestClassifier(class_weight={0: 1, 1: weights}): 'Random Forest', 
               KNeighborsClassifier(n_neighbors=5): 'KNN', 
               SVC(class_weight={0: 1, 1: weights}): 'SVM',
               LGBMClassifier(boosting_type='dart', verbose=-1, 
                              class_weight={0: 1, 1: weights}, 
                              random_state=42): 'LGBM'}

    scalers = {StandardScaler():"Standard Scaler", MinMaxScaler(): "MinMax Scaler"}
    for scaler, name_scaler in scalers.items():
        print(name_scaler)
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)
        for clf, name in classifiers.items():
            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)
            print(name)
            print(accuracy_score(y_test, y_pred))
            print(confusion_matrix(y_test, y_pred))
            print(classification_report(y_test, y_pred))

perform_classification_weighted(df)

Standard Scaler
Random Forest
0.812
[[1614   15]
 [ 361   10]]
              precision    recall  f1-score   support

           0       0.82      0.99      0.90      1629
           1       0.40      0.03      0.05       371

    accuracy                           0.81      2000
   macro avg       0.61      0.51      0.47      2000
weighted avg       0.74      0.81      0.74      2000

KNN
0.7905
[[1546   83]
 [ 336   35]]
              precision    recall  f1-score   support

           0       0.82      0.95      0.88      1629
           1       0.30      0.09      0.14       371

    accuracy                           0.79      2000
   macro avg       0.56      0.52      0.51      2000
weighted avg       0.72      0.79      0.74      2000

SVM
0.616
[[1049  580]
 [ 188  183]]
              precision    recall  f1-score   support

           0       0.85      0.64      0.73      1629
           1       0.24      0.49      0.32       371

    accuracy                           0.62 

In [53]:
# Perform Recursive Feature Elimination

from sklearn.ensemble import RandomForestClassifier


def perform_rfe(df):
    X = df.drop(['DEP_DEL15'], axis=1)
    y = df['DEP_DEL15']


    clf = RandomForestClassifier()
    selector = RFECV(clf, step=1, scoring='neg_mean_squared_error',cv=5, verbose=1, n_jobs=-1)
    selector.fit(X, y)
    selector.transform(X)
    print(selector)
    print("Optimal number of features {}".format(selector.n_features_))
    print(np.where(selector.support_ == False)[0])
    print(X.columns[selector.support_ == False])
    X.drop(X.columns[selector.support_ == False], axis=1, inplace=True)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    weights = (y_train == 0).sum() / (1.0 * (y_train == 1).sum())
    classifiers = {RandomForestClassifier(): 'Random Forest', 
               KNeighborsClassifier(n_neighbors=5): 'KNN', 
               SVC(): 'SVM',
               LGBMClassifier(verbose=-1, class_weight={0: 1, 1: weights}): 'LGBM'}
    
    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    for clf, name in classifiers.items():
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        print(name)
        print(accuracy_score(y_test, y_pred))
        print(confusion_matrix(y_test, y_pred))
        print(classification_report(y_test, y_pred))

perform_rfe(df)


Fitting estimator with 25 features.
Fitting estimator with 24 features.
Fitting estimator with 23 features.
Fitting estimator with 22 features.
Fitting estimator with 21 features.
RFECV(cv=5, estimator=RandomForestClassifier(), n_jobs=-1,
      scoring='neg_mean_squared_error', verbose=1)
Optimal number of features 20
[ 7 12 13 21 22]
Index(['CARRIER_NAME', 'AVG_MONTHLY_PASS_AIRLINE', 'FLT_ATTENDANTS_PER_PASS',
       'SNOW', 'SNWD'],
      dtype='object')
Random Forest
0.811
[[1610   19]
 [ 359   12]]
              precision    recall  f1-score   support

           0       0.82      0.99      0.89      1629
           1       0.39      0.03      0.06       371

    accuracy                           0.81      2000
   macro avg       0.60      0.51      0.48      2000
weighted avg       0.74      0.81      0.74      2000

KNN
0.7815
[[1533   96]
 [ 341   30]]
              precision    recall  f1-score   support

           0       0.82      0.94      0.88      1629
           1      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [54]:
# Perform Hyperparameter tuning on the LGBM model


def perform_hyperparameter_tuning(df):
    X = df.drop(['DEP_DEL15'], axis=1)
    y = df['DEP_DEL15']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    weights = (y_train == 0).sum() / (1.0 * (y_train == 1).sum())
    lgbm = LGBMClassifier(boosting_type='dart', verbose=-1, 
                          class_weight={0: 1, 1: weights}, 
                          random_state=42)
    param_grid = {
        'num_leaves': [31, 127],
        'reg_alpha': [0.1, 0.5],
        'min_data_in_leaf': [30, 50, 100, 300, 400],
        'lambda_l1': [0, 1, 1.5],
        'lambda_l2': [0, 1],
        
    }
    grid_search = GridSearchCV(lgbm, param_grid, cv=5, verbose=1, n_jobs=-1, scoring='recall')
    grid_search.fit(X_train, y_train)
    print(grid_search.best_params_)
    print(grid_search.best_score_)
    y_pred = grid_search.predict(X_test)
    print(accuracy_score(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))

perform_hyperparameter_tuning(df)

Fitting 5 folds for each of 120 candidates, totalling 600 fits
{'lambda_l1': 1.5, 'lambda_l2': 1, 'min_data_in_leaf': 400, 'num_leaves': 31, 'reg_alpha': 0.1}
0.6020408163265306
0.6215
[[1038  591]
 [ 166  205]]
              precision    recall  f1-score   support

           0       0.86      0.64      0.73      1629
           1       0.26      0.55      0.35       371

    accuracy                           0.62      2000
   macro avg       0.56      0.59      0.54      2000
weighted avg       0.75      0.62      0.66      2000

