In [38]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, recall_score, accuracy_score, f1_score, precision_score

In [2]:
train = pd.read_csv('flight_delays_train.csv')
test = pd.read_csv('flight_delays_test.csv')

In [3]:
train.head()

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance,dep_delayed_15min
0,c-8,c-21,c-7,1934,AA,ATL,DFW,732,N
1,c-4,c-20,c-3,1548,US,PIT,MCO,834,N
2,c-9,c-2,c-5,1422,XE,RDU,CLE,416,N
3,c-11,c-25,c-6,1015,OO,DEN,MEM,872,N
4,c-10,c-7,c-6,1828,WN,MDW,OMA,423,Y


In [4]:
test.head()

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance
0,c-7,c-25,c-3,615,YV,MRY,PHX,598
1,c-4,c-17,c-2,739,WN,LAS,HOU,1235
2,c-12,c-2,c-7,651,MQ,GSP,ORD,577
3,c-3,c-25,c-7,1614,WN,BWI,MHT,377
4,c-6,c-6,c-3,1505,UA,ORD,STL,258


**We need to map Origin/Destination points and Carriers, so we can use these features in CatBoost.**

In [5]:
places = pd.Series((test['Origin'].append(test['Dest']).append(train['Origin']).append(train['Dest'])).unique()).to_dict()
places_map = {v: k for k, v in places.items()}
places_map;

In [6]:
carriers = pd.Series((test['UniqueCarrier'].append(test['UniqueCarrier']).append(train['UniqueCarrier']).append(train['UniqueCarrier'])).unique()).to_dict()
carriers_map = {v: k for k, v in carriers.items()}
carriers

{0: 'YV',
 1: 'WN',
 2: 'MQ',
 3: 'UA',
 4: 'NW',
 5: 'B6',
 6: 'US',
 7: 'AA',
 8: 'OH',
 9: 'OO',
 10: 'FL',
 11: 'DL',
 12: 'EV',
 13: 'CO',
 14: 'XE',
 15: '9E',
 16: 'HA',
 17: 'AS',
 18: 'AQ',
 19: 'F9',
 20: 'DH',
 21: 'TZ',
 22: 'HP'}

**Function to prepare test and train dataframes for CatBoost:**

In [7]:
def prepare_df(df):
    df_copy = df.copy()
    df_copy['Dest'] = df_copy['Dest'].map(places_map)
    df_copy['Origin'] = df_copy['Origin'].map(places_map)
    df_copy['UniqueCarrier'] = df_copy['UniqueCarrier'].map(carriers_map)
    df_copy['Month'] = df_copy['Month'].str.replace('c-', '').astype(int)
    df_copy['DayofMonth'] = df_copy['DayofMonth'].str.replace('c-', '').astype(int)
    df_copy['DayOfWeek'] = df_copy['DayOfWeek'].str.replace('c-', '').astype(int)
    return df_copy

In [8]:
X_train, y_train = prepare_df(train), train['dep_delayed_15min'].map({'Y': 1, 'N': 0}).values
X_train = X_train.drop(columns=['dep_delayed_15min'])
X_test = prepare_df(test)

In [9]:
X_train_part, X_valid, y_train_part, y_valid = \
    train_test_split(X_train, y_train, 
                     test_size=0.3, random_state=17)

**Scaling**

In [17]:
col_names = ['DepTime','Distance']
X_train_scale = X_train.copy()
X_test_scale = X_test.copy()

scaler = StandardScaler()
X_train_scale[col_names] = scaler.fit(X_train[col_names])

X_test_scale[col_names] = scaler.transform(X_test[col_names])
X_train_part[col_names] = scaler.transform(X_train_part[col_names])
X_valid[col_names] = scaler.transform(X_valid[col_names])

**Training CatBoost and estimating holdout ROC AUC:**

In [16]:
model = CatBoostClassifier(random_state=17, learning_rate=0.1, max_depth=5, verbose=False)

model.fit(X_train_part, y_train_part)
model_valid_pred = model.predict_proba(X_valid)[:, 1]

roc_auc_score(y_valid, model_valid_pred)

0.7433514032850161

**Metrics (tutorial)**

In [37]:
#from sklearn.metrics import confusion_matrix, recall_score, accuracy_score, f1_score, precision_score

y_pred_bin = [1 if i > 0.5 else 0 for i in model_valid_pred]
print('Confusion matrix')
print(confusion_matrix(y_valid, y_pred_bin))
print()
print('Accuracy:  %.4f' % accuracy_score(y_valid, y_pred_bin))
print('Precision: %.4f' % precision_score(y_valid, y_pred_bin))
print('Recall:    %.4f' % recall_score(y_valid, y_pred_bin))
print('F1:        %.4f' % f1_score(y_valid, y_pred_bin))

Confusion matrix
[[23913   422]
 [ 4926   739]]

Accuracy:  0.8217
Precision: 0.6365
Recall:    0.1305
F1:        0.2165


In [19]:
actual = pd.Series(y_valid, name='Actual')
predict = pd.Series(y_pred_bin, name='Predicted')
df_confusion = pd.crosstab(actual, predict, rownames=['Actual'], colnames=['Predicted'], margins=True)
df_confusion

Predicted,0,1,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,23913,422,24335
1,4926,739,5665
All,28839,1161,30000


**Making predictions to test set and forming a submission file:**

In [20]:
model.fit(X_train, y_train)
model_test_pred = model.predict_proba(X_test)[:, 1]

pd.Series(model_test_pred, name='dep_delayed_15min').to_csv('cat_fe_sub.csv', index_label='id', header=True)

**Lightgbm + GridSearchCV**

In [36]:
# from lightgbm import LGBMClassifier
# from sklearn.model_selection import GridSearchCV

gbm = LGBMClassifier(n_estimators=200)
gbm.fit(X_train_part, y_train_part,
        eval_set=[(X_valid, y_valid)],
        eval_metric='l1',
        early_stopping_rounds=5,
        verbose=0)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        importance_type='split', learning_rate=0.1, max_depth=-1,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=200, n_jobs=-1, num_leaves=31, objective=None,
        random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
        subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [25]:
params = {'boosting_type': 'gbdt',
          'max_depth' : -1,
          'objective': 'binary',
          'nthread': 3,
          'num_leaves': 64,
          'learning_rate': 0.05,
          'max_bin': 512,
          'subsample_for_bin': 200,
          'subsample': 1,
          'subsample_freq': 1,
          'colsample_bytree': 0.8,
          'reg_alpha': 5,
          'reg_lambda': 10,
          'min_split_gain': 0.5,
          'min_child_weight': 1,
          'min_child_samples': 5,
          'scale_pos_weight': 1,
          'num_class' : 1,
          'metric' : 'binary_error'}

# Create parameters to search
gridParams = {
    'learning_rate': [0.5], # 0.005
    'n_estimators': [400], # 40, 1000
    'num_leaves': [5, 10, 15],
    'boosting_type' : ['gbdt'],
    'objective' : ['binary'],
    'random_state' : [501], 
    'colsample_bytree' : [0.66],
    'subsample' : [0.75],
    'reg_alpha' : [1,1.2],
    'reg_lambda' : [1,1.4],
    }

# Create classifier to use
mdl = LGBMClassifier(boosting_type= 'gbdt',
          objective = 'auc',
          n_jobs = 3, 
          silent = True,
          max_depth = params['max_depth'],
          max_bin = params['max_bin'],
          subsample_for_bin = params['subsample_for_bin'],
          subsample = params['subsample'],
          subsample_freq = params['subsample_freq'],
          min_split_gain = params['min_split_gain'],
          min_child_weight = params['min_child_weight'],
          min_child_samples = params['min_child_samples'],
          scale_pos_weight = params['scale_pos_weight'])

In [31]:
grid = GridSearchCV(mdl, gridParams,
                    verbose=3,
                    cv=4,
                    n_jobs=-1)
# Run the grid
grid.fit(X_train, y_train)

Fitting 4 folds for each of 12 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   52.6s
[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:  2.0min finished


GridSearchCV(cv=4, error_score='raise-deprecating',
       estimator=LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        importance_type='split', learning_rate=0.1, max_bin=512,
        max_depth=-1, min_child_samples=5, min_child_weight=1,
        min_split_gain=0.5, n_estimators=100, n_jobs=3, num_leaves=31,
        objective='auc', random_state=None, reg_alpha=0.0, reg_lambda=0.0,
        scale_pos_weight=1, silent=True, subsample=1,
        subsample_for_bin=200, subsample_freq=1),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'learning_rate': [0.5], 'n_estimators': [400], 'num_leaves': [5, 10, 15], 'boosting_type': ['gbdt'], 'objective': ['binary'], 'random_state': [501], 'colsample_bytree': [0.66], 'subsample': [0.75], 'reg_alpha': [1, 1.2], 'reg_lambda': [1, 1.4]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=3)

In [32]:
grid.best_params_, grid.best_score_

({'boosting_type': 'gbdt',
  'colsample_bytree': 0.66,
  'learning_rate': 0.5,
  'n_estimators': 400,
  'num_leaves': 5,
  'objective': 'binary',
  'random_state': 501,
  'reg_alpha': 1,
  'reg_lambda': 1.4,
  'subsample': 0.75},
 0.81831)

In [33]:
gbm = LGBMClassifier(**grid.best_params_)
gbm.fit(X_train, y_train)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=0.66,
        importance_type='split', learning_rate=0.5, max_depth=-1,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=400, n_jobs=-1, num_leaves=5, objective='binary',
        random_state=501, reg_alpha=1, reg_lambda=1.4, silent=True,
        subsample=0.75, subsample_for_bin=200000, subsample_freq=0)

In [34]:
my_submission = pd.DataFrame({'id': X_test.index, 'dep_delayed_15min': gbm.predict_proba(X_test)[:, 1]})
my_submission.to_csv('light_grig_sub.csv', index=False)