<img src="https://habrastorage.org/webt/fs/42/ms/fs42ms0r7qsoj-da4x7yfntwrbq.jpeg" width=40% />

In [1]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score

In [2]:
train = pd.read_csv('flight_delays_train.csv')
test = pd.read_csv('flight_delays_test.csv')

In [3]:
train.head()

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance,dep_delayed_15min
0,c-8,c-21,c-7,1934,AA,ATL,DFW,732,N
1,c-4,c-20,c-3,1548,US,PIT,MCO,834,N
2,c-9,c-2,c-5,1422,XE,RDU,CLE,416,N
3,c-11,c-25,c-6,1015,OO,DEN,MEM,872,N
4,c-10,c-7,c-6,1828,WN,MDW,OMA,423,Y


In [4]:
test.head()

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance
0,c-7,c-25,c-3,615,YV,MRY,PHX,598
1,c-4,c-17,c-2,739,WN,LAS,HOU,1235
2,c-12,c-2,c-7,651,MQ,GSP,ORD,577
3,c-3,c-25,c-7,1614,WN,BWI,MHT,377
4,c-6,c-6,c-3,1505,UA,ORD,STL,258


In [5]:
test.shape, train.shape

((100000, 8), (100000, 9))

**Кросс-валидация для временных рядов (хотя у нас порядок не соблюден)**

In [6]:
from sklearn.model_selection import TimeSeriesSplit, cross_val_score

time_split = TimeSeriesSplit(n_splits=5)

**Отберем только 2 признака, выполним масштабирование. Разобъем на обучающую и тестовую выбоки.**

In [41]:
X_train, y_train = train[['Distance', 'DepTime']].values, train['dep_delayed_15min'].map({'Y': 1, 'N': 0}).values
X_test = test[['Distance', 'DepTime']].values

In [42]:
from sklearn.model_selection import train_test_split

X_train_part, X_valid, y_train_part, y_valid = train_test_split(X_train, y_train, test_size=0.3, random_state=17)

In [43]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaler = scaler.fit_transform(X_train)
X_test_scaler = scaler.transform(X_test)
X_train_part = scaler.transform(X_train_part)
X_valid = scaler.transform(X_valid)

In [46]:
X_valid[:5]

array([[ 0.38565514, -1.34247694],
       [-0.87954204, -1.33198103],
       [-0.68288829,  0.54258839],
       [-0.244333  ,  1.65935315],
       [-0.17124045, -0.04518254]])

**Выбор модели**

In [52]:
%%time
from sklearn.linear_model import LogisticRegression

logit = LogisticRegression()

logit.fit(X_train_part, y_train_part)
logit_valid_pred = logit.predict_proba(X_valid)[:, 1]

print('Train test split LogisticRegression score:% s ROC AUC' % round(roc_auc_score(y_valid, logit_valid_pred), 4))
cross_score_lr = np.mean(cross_val_score(logit, X_train, y_train, scoring = 'roc_auc', cv=time_split))
print('Cross validation LogisticRegression score:% s ROC AUC' % round(cross_score_lr, 4))

Train test split LogisticRegression score:0.6796 ROC AUC
Cross validation LogisticRegression score:0.6812 ROC AUC
Wall time: 743 ms


In [50]:
%%time
from sklearn.ensemble import RandomForestClassifier

clf_rf = RandomForestClassifier(random_state = 42, n_estimators = 20)
clf_rf.fit(X_train_part, y_train_part)
preds_rf = clf_rf.predict_proba(X_valid)[:, 1]
print('Train test split RandomForestClassifier score:% s ROC AUC' % round(roc_auc_score(y_valid, preds_rf), 6))
cross_score_rf = np.mean(cross_val_score(clf_rf, X_train, y_train, scoring='roc_auc', cv=time_split))
print('Cross validation RandomForestClassifier score:% s ROC AUC' % round(cross_score_rf, 6))

Train test split RandomForestClassifier score:0.663494 ROC AUC
Cross validation RandomForestClassifier score:0.65498 ROC AUC
Wall time: 12.5 s


In [53]:
%%time
from xgboost import XGBRegressor

reg_xgb = XGBRegressor(2, 0.01, 50, objective='binary:logistic', random_state=42, booster='gblinear', scale_pos_weight=109)
reg_xgb.fit(X_train_part, y_train_part)
preds_xgb_reg = reg_xgb.predict(X_valid)
print('Train test split XGBRegressor score:% s ROC AUC' % round(roc_auc_score(y_valid, preds_xgb_reg), 4))
cross_score_xgb_reg = np.mean(cross_val_score(reg_xgb, X_train, y_train, scoring='roc_auc', cv=time_split))
print('Cross validation XGBRegressor score:% s ROC AUC' % round(cross_score_xgb_reg, 4))

Train test split XGBRegressor score:0.6796 ROC AUC
Cross validation XGBRegressor score:0.6107 ROC AUC
Wall time: 9.24 s


**Подберем оптимальные параметры для XGBoost**

In [54]:
param_grid = [
    {'max_depth' : [2],
    'colsample_bytree' : [1],
    'subsample' : [1],
    'gamma' : [0],
    'reg_alpha' : [1],
    'reg_lambda' : [0.9]}]

In [55]:
%%time
from sklearn.model_selection import GridSearchCV

grid_search = GridSearchCV(estimator=reg_xgb, param_grid = param_grid,
                                  scoring='roc_auc', n_jobs=-1, cv=time_split, verbose=1)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   12.6s finished


Wall time: 18.4 s


In [56]:
grid_search.best_params_, grid_search.best_score_

({'colsample_bytree': 1,
  'gamma': 0,
  'max_depth': 2,
  'reg_alpha': 1,
  'reg_lambda': 0.9,
  'subsample': 1},
 0.611662318903449)

In [57]:
%%time
from catboost import CatBoostClassifier

ctb = CatBoostClassifier(random_seed=17, verbose=0)
ctb.fit(X_train_part, y_train_part, verbose=0)
logit_valid_pred = ctb.predict_proba(X_valid)[:, 1]
print('Train test split CatBoostClassifier score:% s ROC AUC' % round(roc_auc_score(y_valid, logit_valid_pred), 4))
cross_score_lr = np.mean(cross_val_score(ctb, X_train, y_train, scoring = 'roc_auc', cv=2, verbose=0)) # time_split
print('Cross validation CatBoostClassifier score:% s ROC AUC' % round(cross_score_lr, 4))

Train test split CatBoostClassifier score:0.7022 ROC AUC
Cross validation CatBoostClassifier score:0.7009 ROC AUC
Wall time: 1min 23s


**Сделаем прогнозов для тестовой выборки и запишим в файл**

In [59]:
ctb.fit(X_train_scaled, y_train, verbose=0)
pred = ctb.predict_proba(X_test_scaled)[:, 1]

pd.Series(pred, name='dep_delayed_15min').to_csv('ctb_sub.csv', index_label='id', header=True)