# Imports

In [1]:
import pandas as pd

In [2]:
# Load the training split: feature matrix plus the DEP_DELAY label column.
# NOTE(review): the displayed frame contains an 'Unnamed: 0' column, which
# looks like a saved row index being fed to the models as a feature -
# confirm whether it should be dropped (consistently for train and test).
X_train = pd.read_csv('data/X_train.csv')
y_train = pd.read_csv('data/y_train.csv')['DEP_DELAY']

# Bare last expression so the notebook renders the feature frame.
X_train

Unnamed: 0,OP_UNIQUE_CARRIER,DEST,CRS_ELAPSED_TIME,DISTANCE,CRS_DEP_M,DEP_TIME_M,CRS_ARR_M,Temperature,Dew Point,Humidity,...,Drizzle,Snow,Wintry Mix,Freezing Rain,MONTH_sin,MONTH_cos,DAY_OF_MONTH_sin,DAY_OF_MONTH_cos,DAY_OF_WEEK_sin,DAY_OF_WEEK_cos
0,3,20,194,1069,750,744,944,47,42,93,...,2,0,0,0,5.000000e-01,0.866025,0.485302,-0.874347,7.818315e-01,0.623490
1,1,56,401,2586,420,414,641,35,7,40,...,0,0,0,0,-5.000000e-01,0.866025,-0.101168,-0.994869,-7.818315e-01,0.623490
2,1,56,405,2586,1275,1274,60,47,22,46,...,0,0,0,0,-5.000000e-01,0.866025,0.101168,-0.994869,-9.749279e-01,-0.222521
3,0,4,72,187,636,628,708,33,11,52,...,0,0,0,0,5.000000e-01,0.866025,-0.201299,0.979530,-4.338837e-01,-0.900969
4,3,28,393,2475,1169,1170,1382,45,42,10,...,0,0,0,0,5.000000e-01,0.866025,0.485302,-0.874347,7.818315e-01,0.623490
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16405,4,58,227,1598,490,491,777,42,39,10,...,0,0,0,0,-2.449294e-16,1.000000,-0.201299,0.979530,7.818315e-01,0.623490
16406,1,28,390,2475,750,740,960,44,22,51,...,0,0,0,0,5.000000e-01,0.866025,-0.571268,0.820763,9.749279e-01,-0.222521
16407,3,28,374,2475,630,625,824,49,31,59,...,0,0,0,0,-2.449294e-16,1.000000,0.101168,-0.994869,-2.449294e-16,1.000000
16408,0,50,93,264,1335,64,1428,25,7,60,...,0,0,0,0,-2.449294e-16,1.000000,-0.651372,-0.758758,-4.338837e-01,-0.900969


## Creating different models, and doing the training

Models considered
- Logistic Regression
- Random Forest
- XGBoost
- LightGBM
- CatBoost
- SVM

In [3]:
# Candidate classifiers: LR, SVM, RF, XGBoost, LightGBM, CatBoost
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

from sklearn.model_selection import GridSearchCV

# Candidate classifiers as (label, estimator) pairs, all with library defaults.
# Order matters: the training loop looks up the grid for the i-th model
# at params[i].
models = [
    ('LR', LogisticRegression()),
    ('SVM', SVC()),
    ('RF', RandomForestClassifier()),
    ('XGB', XGBClassifier()),
    ('LGBM', LGBMClassifier()),
    ('CatBoost', CatBoostClassifier()),
]

# Hyper-parameter grids, one per model, listed in the same order as `models`
# (LR, SVM, RF, XGBoost, LightGBM, CatBoost).
params = [
    # Logistic Regression
    dict(C=[0.01, 0.1, 1, 10, 100], max_iter=[100, 200, 300]),
    # SVM
    dict(C=[0.01, 0.1, 1, 10, 100]),
    # Random Forest
    dict(n_estimators=[10, 50, 100, 200, 300, 400, 500]),
    # XGBoost
    dict(
        min_child_weight=[1, 5, 10],
        gamma=[0.5, 1, 1.5, 2, 5],
        subsample=[0.6, 0.8, 1.0],
        colsample_bytree=[0.6, 0.8, 1.0],
        max_depth=[3, 4, 5],
    ),
    # LightGBM
    dict(num_leaves=[30, 50, 70]),
    # CatBoost
    dict(iterations=[100, 200, 300], learning_rate=[0.01, 0.1, 1], depth=[4, 5, 6]),
]



In [None]:
# Tune every candidate with an exhaustive grid search (5-fold CV, scored by
# accuracy) and print its best CV score together with the winning parameters.

for (name, model), grid in zip(models, params):
    clf = GridSearchCV(estimator=model, param_grid=grid, scoring='accuracy', cv=5)
    clf.fit(X_train, y_train)
    print(name, clf.best_score_, clf.best_params_)

First round of training: best 5-fold cross-validation accuracy and the corresponding parameters for each model
| Model | Accuracy | Parameters |
| --- | --- | --- |
| Logistic Regression | 0.8655697745277269 | {'C': 0.1, 'max_iter': 100} |
| SVM | 0.9606337599024986 | {'C': 100} |
| Random Forest | 0.894759293113955 | {'n_estimators': 100} |
| XGBoost | 0.9725776965265084 | {'colsample_bytree': 1.0, 'gamma': 0.5, 'max_depth': 4, 'min_child_weight': 1, 'subsample': 1.0} |
| LightGBM | 0.9828762949421085 | {'num_leaves': 70} |
| CatBoost | 0.9729433272394882 | {'depth': 4, 'iterations': 300, 'learning_rate': 1} |


In [None]:
# Evaluate the tuned models on the held-out test set.

# confusion_matrix is imported alongside the other metrics: the loop below
# calls it, and without the import the cell raises NameError on a fresh kernel.
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)

# importing test data
X_test = pd.read_csv('data/X_test.csv')
y_test = pd.read_csv('data/y_test.csv')

y_test = y_test['DEP_DELAY']

# models rebuilt with the best parameters reported by the grid search
best_models = []
best_models.append(('LR', LogisticRegression(C=0.1, max_iter=100)))
best_models.append(('SVM', SVC(C=100)))
best_models.append(('RF', RandomForestClassifier(n_estimators=100)))
best_models.append(('XGB', XGBClassifier(colsample_bytree=1.0, gamma=0.5, max_depth=4, min_child_weight=1, subsample=1)))
best_models.append(('LGBM', LGBMClassifier(num_leaves=70)))
best_models.append(('CatBoost', CatBoostClassifier(depth=4, iterations=300, learning_rate=1)))

# Fit each model on the training split, predict the test split, then print a
# tab-separated row (name, accuracy, precision, recall, F1) followed by the
# confusion matrix.
for name, model in best_models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # Built-in round() is used instead of the .round() method: it works for
    # NumPy scalars and plain Python floats alike, whereas a plain float has
    # no .round() attribute.
    print(name,
          round(accuracy_score(y_test, y_pred), 3),
          round(precision_score(y_test, y_pred), 3),
          round(recall_score(y_test, y_pred), 3),
          round(f1_score(y_test, y_pred), 3),
          sep='\t')
    print(confusion_matrix(y_test, y_pred))


Metrics on the held-out test data

| Model               | Accuracy           | Precision          | Recall             | F1 Score             |
| ------------------- | ------------------ | ------------------ | ------------------ | -------------------- |
| Logistic Regression | 0.8698513282963685 | 0.8                | 0.0074487895716946 | 0.014760147601476016 |
| SVM                 | 0.9658786253960516 | 1.0                | 0.7392923649906891 | 0.8501070663811564   |
| Random Forest       | 0.8986107726054107 | 0.8903225806451613 | 0.2569832402234637 | 0.3988439306358382   |
| XGBoost             | 0.9744089690470388 | 0.9716157205240175 | 0.8286778398510242 | 0.8944723618090451   |
| LightGBM            | 0.9836704850109675 | 0.9644268774703557 | 0.9087523277467412 | 0.9357622243528283   |
| CatBoost            | 0.9746526931513527 | 0.9409368635437881 | 0.8603351955307262 | 0.8988326848249028   |


### Training on the whole dataset

In [16]:
from sklearn.model_selection import train_test_split

# Read the complete preprocessed dataset.
df_full = pd.read_csv('data/df_preprocessed.csv')

# DEP_DELAY is the target; every other column is used as a feature.
y = df_full['DEP_DELAY']
X = df_full.drop(columns='DEP_DELAY')

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train_full, X_test_full, y_train_full, y_test_full = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Re-fit the tuned models on the train/test split of the full dataset.
# The *_full variables are the ones produced by train_test_split on df_full;
# using X_train / X_test here would silently repeat the earlier experiment.

# Imported locally so the cell does not depend on an earlier cell having
# brought these names (confusion_matrix in particular) into scope.
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)

for name, model in best_models:
    model.fit(X_train_full, y_train_full)
    y_pred = model.predict(X_test_full)
    # Tab-separated row: name, accuracy, precision, recall, F1.
    # Built-in round() works for NumPy scalars and plain Python floats alike,
    # whereas a plain float has no .round() attribute.
    print(name,
          round(accuracy_score(y_test_full, y_pred), 3),
          round(precision_score(y_test_full, y_pred), 3),
          round(recall_score(y_test_full, y_pred), 3),
          round(f1_score(y_test_full, y_pred), 3),
          sep='\t')
    print(confusion_matrix(y_test_full, y_pred))

| Model               | Accuracy | Precision | Recall | F1 Score |
|---------------------|----------|--------|-----------|----------|
| Logistic Regression | 0.865    | 0.778  | 0.018     | 0.035    |
| SVM                 | 0.964    | 0.997  | 0.743     | 0.852    |
| Random Forest       | 0.895    | 0.864  | 0.281     | 0.424    |
| XGBoost             | 0.974    | 0.978  | 0.829     | 0.897    |
| LightGBM            | 0.982    | 0.983  | 0.886     | 0.932    |
| CatBoost            | 0.979    | 0.965  | 0.881     | 0.921    |