In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier

In [2]:
train = pd.read_csv("../../data/flight_delays_train.csv")
test = pd.read_csv("../../data/flight_delays_test.csv")

In [3]:
train.head()

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance,dep_delayed_15min
0,c-8,c-21,c-7,1934,AA,ATL,DFW,732,N
1,c-4,c-20,c-3,1548,US,PIT,MCO,834,N
2,c-9,c-2,c-5,1422,XE,RDU,CLE,416,N
3,c-11,c-25,c-6,1015,OO,DEN,MEM,872,N
4,c-10,c-7,c-6,1828,WN,MDW,OMA,423,Y


In [4]:
test.head()

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance
0,c-7,c-25,c-3,615,YV,MRY,PHX,598
1,c-4,c-17,c-2,739,WN,LAS,HOU,1235
2,c-12,c-2,c-7,651,MQ,GSP,ORD,577
3,c-3,c-25,c-7,1614,WN,BWI,MHT,377
4,c-6,c-6,c-3,1505,UA,ORD,STL,258


Итак, надо по времени вылета самолета, коду авиакомпании-перевозчика, месту вылета и прилета и расстоянию между аэропортами вылета и прилета предсказать задержку вылета более 15 минут. В качестве простейшего бенчмарка возьмем логистическую регрессию и два признака, которые проще всего взять: `DepTime` и `Distance`. У такой модели результат – 0.68202 на LB.

In [5]:
X_train, y_train = (
    train[["Distance", "DepTime"]],
    train["dep_delayed_15min"].map({"Y": 1, "N": 0}),
)
X_test = test[["Distance", "DepTime"]]

X_train_part, X_valid, y_train_part, y_valid = train_test_split(
    X_train, y_train, test_size=0.3, random_state=17
)

In [6]:
logit = LogisticRegression(random_state=17)

logit.fit(X_train_part, y_train_part)
logit_valid_pred = logit.predict_proba(X_valid)[:, 1]

roc_auc_score(y_valid, logit_valid_pred)

0.6795697123357751

In [7]:
logit.fit(X_train, y_train)
logit_test_pred = logit.predict_proba(X_test)[:, 1]

pd.Series(logit_test_pred, name="dep_delayed_15min").to_csv(
    "logit_2feat.csv", index_label="id", header=True
)

Как был получен бенчмарк в соревновании:
- Признаки `Distance` и  `DepTime` брались без изменений
- Создан признак "маршрут" из исходных `Origin` и `Dest`
- К признакам `Month`, `DayofMonth`, `DayOfWeek`, `UniqueCarrier` и "маршрут" применено OHE-преобразование (`LabelBinarizer`)
- Выделена отложенная выборка
- Обучалась логистическая регрессия и градиентный бустинг (xgboost), гиперпараметры бустинга настраивались на кросс-валидации, сначала те, что отвечают за сложность модели, затем число деревьев фиксировалось равным 500 и настраивался шаг градиентного спуска
- С помощью `cross_val_predict` делались прогнозы обеих моделей на кросс-валидации (именно предсказанные вероятности), настраивалась линейная смесь ответов логистической регрессии и градиентного бустинга вида $w_1 * p_{logit} + (1 - w_1) * p_{xgb}$, где $p_{logit}$ – предсказанные логистической регрессией вероятности класса 1, $p_{xgb}$ – аналогично. Вес $w_1$ подбирался вручную. 
- В качестве ответа для тестовой выборки бралась аналогичная комбинация ответов двух моделей, но уже обученных на всей обучающей выборке.

Описанный план ни к чему не обязывает – это просто то, как решение получил автор задания. Возможно, мы не захотите следовать намеченному плану, а добавите, скажем, пару хороших признаков и обучите лес из тысячи деревьев.

Удачи!

## Заметный дисбаланс классов


In [8]:
train.value_counts("dep_delayed_15min")

dep_delayed_15min
N    80956
Y    19044
Name: count, dtype: int64

## Просто случайный лес, без предобработки и без категориальных признаков


In [9]:
from sklearn.model_selection import StratifiedKFold

scv = StratifiedKFold(n_splits=5, shuffle=True, random_state=17)

In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

params = {
    "n_jobs" : -1,
    "n_estimators" : 100
}

rf_no_tune_cv = cross_val_score(RandomForestClassifier(**params), X_train, y_train, cv=scv, scoring="roc_auc")

In [11]:
print(f"Результат на кроссвалидации: {np.mean(rf_no_tune_cv)}")

Результат на кроссвалидации: 0.6782710381641122


In [12]:
rf_no_tune = RandomForestClassifier(**params).fit(X_train_part, y_train_part)
prob_rf_no_tune = rf_no_tune.predict_proba(X_valid)[:, 1]
prob_rf_no_tune

array([0.    , 0.01  , 0.    , ..., 0.39  , 0.0175, 0.    ])

In [13]:
print(f"Результат на отложенной выборке: {roc_auc_score(y_valid, prob_rf_no_tune)}")

Результат на отложенной выборке: 0.6763220645335384


## Воспользуемся LabelEncoder

In [14]:
y = train["dep_delayed_15min"].map({"Y": 1, "N": 0})
train = train.drop("dep_delayed_15min", axis=1)

In [15]:
cat_features =  list(train.columns[train.dtypes == "object"])
cat_features

['Month', 'DayofMonth', 'DayOfWeek', 'UniqueCarrier', 'Origin', 'Dest']

In [16]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
train_label_encoded = train.copy()
for feature in cat_features:
    train_label_encoded[feature] = label_encoder.fit_transform(train_label_encoded[feature])
train_label_encoded

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance
0,10,13,6,1934,0,18,78,732
1,6,12,2,1548,18,217,171,834
2,11,11,4,1422,20,228,59,416
3,2,17,5,1015,15,78,175,872
4,1,28,5,1828,19,174,199,423
...,...,...,...,...,...,...,...,...
99995,7,25,2,1618,15,246,224,199
99996,0,9,2,804,4,92,72,884
99997,0,16,1,1901,13,85,131,1076
99998,6,19,3,1515,12,79,107,140


In [17]:
rf_label_encoded_cv = cross_val_score(RandomForestClassifier(**params), train_label_encoded, y, cv=scv, scoring="roc_auc")

In [18]:
print(f"Результат на кроссвалидации: {np.mean(rf_label_encoded_cv)}")

Результат на кроссвалидации: 0.727701233281608


## OneHotEncoding для всех категориальных признаков признаков

In [19]:
pd.get_dummies(train, dtype=float)

Unnamed: 0,DepTime,Distance,Month_c-1,Month_c-10,Month_c-11,Month_c-12,Month_c-2,Month_c-3,Month_c-4,Month_c-5,...,Dest_TYS,Dest_VCT,Dest_VIS,Dest_VLD,Dest_VPS,Dest_WRG,Dest_WYS,Dest_XNA,Dest_YAK,Dest_YUM
0,1934,732,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1548,834,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1422,416,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1015,872,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1828,423,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,1618,199,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99996,804,884,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99997,1901,1076,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99998,1515,140,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
# Другой вариант
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
train_onehot_encoded = train.copy()
train_onehot_encoded = train_onehot_encoded[["DepTime", "Distance"]]
one_hot_encoded = encoder.fit_transform(train[cat_features])

In [21]:
one_hot_df = pd.DataFrame(one_hot_encoded, columns=encoder.get_feature_names_out())
train_onehot_encoded = pd.concat([train_onehot_encoded, one_hot_df], axis=1)

In [22]:
X_train_onehot, X_valid_onehot, y_train_onehot, y_valid_onehot = train_test_split(
    train_onehot_encoded, y, test_size=0.3, random_state=17, stratify=y)

rf_onehot = RandomForestClassifier(**params).fit(X_train_onehot, y_train_onehot)

In [23]:
prob_onehot = rf_onehot.predict_proba(X_valid_onehot)[:, 1]
print(f"Результат на отложенной выборке: {roc_auc_score(y_valid_onehot, prob_onehot)}")

Результат на отложенной выборке: 0.7343303805920667


## Попробуем XGBoost

In [24]:
import xgboost as xgb

xgb_params = {
    "max_depth": 1,
    "eta": 0.3,
    "silent": 0,
    "objective": "binary:logistic",
    "eval_metric": "auc"
}

dtrain = xgb.DMatrix(X_train_onehot, y_train_onehot)
dvalid = xgb.DMatrix(X_valid_onehot, y_valid_onehot)

In [25]:
watchlist = [(dvalid, "valid"), (dtrain, "train")]
boost = xgb.train(xgb_params, dtrain, 1, watchlist)

Parameters: { "silent" } are not used.



[0]	valid-auc:0.63607	train-auc:0.63047


In [26]:
test_encoded = encoder.transform(test[cat_features])
one_hot_df = pd.DataFrame(test_encoded, columns=encoder.get_feature_names_out())
test_encoded = pd.concat([test[["DepTime", "Distance"]], one_hot_df], axis=1)

In [27]:
test_encoded

Unnamed: 0,DepTime,Distance,Month_c-1,Month_c-10,Month_c-11,Month_c-12,Month_c-2,Month_c-3,Month_c-4,Month_c-5,...,Dest_TYS,Dest_VCT,Dest_VIS,Dest_VLD,Dest_VPS,Dest_WRG,Dest_WYS,Dest_XNA,Dest_YAK,Dest_YUM
0,615,598,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,739,1235,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,651,577,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1614,377,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1505,258,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,852,187,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99996,1446,1515,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99997,1509,438,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99998,804,761,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [28]:
dtest = xgb.DMatrix(test_encoded)
xgb_test_pred = boost.predict(dtest)

pd.Series(xgb_test_pred, name="dep_delayed_15min").to_csv(
    "xgb_1.csv", index_label="id", header=True
)

## План:
* Добавить новые признаки в модель
* Подобрать гиперпараметры с помощью Hyperopt

In [29]:
train

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance
0,c-8,c-21,c-7,1934,AA,ATL,DFW,732
1,c-4,c-20,c-3,1548,US,PIT,MCO,834
2,c-9,c-2,c-5,1422,XE,RDU,CLE,416
3,c-11,c-25,c-6,1015,OO,DEN,MEM,872
4,c-10,c-7,c-6,1828,WN,MDW,OMA,423
...,...,...,...,...,...,...,...,...
99995,c-5,c-4,c-3,1618,OO,SFO,RDD,199
99996,c-1,c-18,c-3,804,CO,EWR,DAB,884
99997,c-1,c-24,c-2,1901,NW,DTW,IAH,1076
99998,c-4,c-27,c-4,1515,MQ,DFW,GGG,140


In [30]:
num_features = list(train.columns[train.dtypes != "object"])
num_features

['DepTime', 'Distance']

## Сделаем всё красиво с помощью пайплайна

In [31]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

std_scaler = StandardScaler()
pipe_num = Pipeline([("scaler", std_scaler)])

one_hot_encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
pipe_cat = Pipeline([("encoder", one_hot_encoder)])

col_transformer = ColumnTransformer([("num", pipe_num, num_features),
                                     ("cat", pipe_cat, cat_features)])

In [32]:
res = col_transformer.fit_transform(train)

## Вернём названия для столбцов

In [33]:
res_df = pd.DataFrame(res, columns= [col.split("__")[-1] for col in col_transformer.get_feature_names_out()])
res_df

Unnamed: 0,DepTime,Distance,Month_c-1,Month_c-10,Month_c-11,Month_c-12,Month_c-2,Month_c-3,Month_c-4,Month_c-5,...,Dest_TYS,Dest_VCT,Dest_VIS,Dest_VLD,Dest_VPS,Dest_WRG,Dest_WYS,Dest_XNA,Dest_YAK,Dest_YUM
0,1.243715,0.004530,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.433431,0.182040,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.168934,-0.545405,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-0.685433,0.248172,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.021202,-0.533223,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,0.580374,-0.923050,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99996,-1.128360,0.269055,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99997,1.174442,0.603192,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99998,0.364158,-1.025727,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Можно добавить модель и параметры к ней через "model__param="

In [34]:
model1 = Pipeline([("preproc", col_transformer),
                   ("rf", RandomForestClassifier(n_jobs=-1))])

model1.set_params(rf__n_estimators=1)
t = model1.fit(train, y)
t

In [35]:
train_new = train.copy()
test_new = test.copy()

In [36]:
train_new["Route"] = train_new["Origin"] +"_"+ train_new["Dest"]
test_new["Route"] = test_new["Origin"] +"_"+ test_new["Dest"]

In [37]:
train_new.drop(["Origin", "Dest"], axis=1)
test_new.drop(["Origin", "Dest"], axis=1)

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Distance,Route
0,c-7,c-25,c-3,615,YV,598,MRY_PHX
1,c-4,c-17,c-2,739,WN,1235,LAS_HOU
2,c-12,c-2,c-7,651,MQ,577,GSP_ORD
3,c-3,c-25,c-7,1614,WN,377,BWI_MHT
4,c-6,c-6,c-3,1505,UA,258,ORD_STL
...,...,...,...,...,...,...,...
99995,c-6,c-5,c-2,852,WN,187,CRP_HOU
99996,c-11,c-24,c-6,1446,UA,1515,ORD_LAS
99997,c-1,c-30,c-2,1509,OO,438,ORD_SGF
99998,c-1,c-5,c-5,804,DL,761,LGA_ATL


In [38]:
train_new = col_transformer.fit_transform(train_new)
test_new = col_transformer.transform(test_new)

In [39]:
X_train_new, X_valid_new, y_train_new, y_valid_new = train_test_split(train_new, y, test_size=0.3, stratify=y, shuffle=True)

In [40]:
xgb_params = {
    "max_depth": 1,
    "eta": 0.3,
    "verbosity": 2,
    "objective": "binary:logistic",
    "eval_metric": "auc"
}

dtrain = xgb.DMatrix(X_train_new, y_train_new)
dvalid = xgb.DMatrix(X_valid_new, y_valid_new)
watchlist = [(dvalid, "valid"), (dtrain, "train")]
boost = xgb.train(xgb_params, dtrain, 100, watchlist)



[0]	valid-auc:0.63669	train-auc:0.63043
[1]	valid-auc:0.64932	train-auc:0.64454
[2]	valid-auc:0.67061	train-auc:0.66557
[3]	valid-auc:0.67688	train-auc:0.67241
[4]	valid-auc:0.68155	train-auc:0.67678
[5]	valid-auc:0.68823	train-auc:0.68511
[6]	valid-auc:0.68824	train-auc:0.68484
[7]	valid-auc:0.68939	train-auc:0.68639
[8]	valid-auc:0.68936	train-auc:0.68638
[9]	valid-auc:0.68963	train-auc:0.68657
[10]	valid-auc:0.69065	train-auc:0.68796
[11]	valid-auc:0.69171	train-auc:0.68903
[12]	valid-auc:0.69277	train-auc:0.68964
[13]	valid-auc:0.69386	train-auc:0.69113
[14]	valid-auc:0.69539	train-auc:0.69308
[15]	valid-auc:0.69664	train-auc:0.69475
[16]	valid-auc:0.69669	train-auc:0.69474
[17]	valid-auc:0.69707	train-auc:0.69532
[18]	valid-auc:0.69701	train-auc:0.69521
[19]	valid-auc:0.69835	train-auc:0.69656
[20]	valid-auc:0.69922	train-auc:0.69764
[21]	valid-auc:0.70026	train-auc:0.69861
[22]	valid-auc:0.70069	train-auc:0.69929
[23]	valid-auc:0.70065	train-auc:0.69924
[24]	valid-auc:0.70076	tra

In [41]:
dtrain_full = xgb.DMatrix(train_new, y)
boost_final = xgb.train(xgb_params, dtrain_full, 1)

In [42]:
dtest = xgb.DMatrix(test_new)
xgb_test_pred = boost_final.predict(dtest)

# pd.Series(xgb_test_pred, name="dep_delayed_15min").to_csv(
#     "xgb_2.csv", index_label="id", header=True
# )

## Попытаемся настроить параметры

In [43]:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from sklearn.metrics import log_loss

def score(params):
    print("Params:", params)
    dtrain = xgb.DMatrix(X_train_new, y_train_new)
    dvalid = xgb.DMatrix(X_valid_new, y_valid_new)
    model = xgb.train(params, dtrain, params["num_round"])
    pred = model.predict(dvalid)
    score = roc_auc_score(y_valid_new, pred)
    print(f"Score: {score}")
    return {"loss": -score, "status": STATUS_OK}

def optimize(trials, space, max_evals=10):
    best = fmin(score, space, algo=tpe.suggest, trials=trials, max_evals=max_evals)
    return best


In [44]:
space = {
    "num_round": 100,
    "learning_rate": 0.1,
    "max_depth": hp.choice("max_depth", np.arange(1, 14, dtype=int)),
    "min_child_weight": hp.quniform("min_child_weight", 1, 10, 1),
    "subsample": hp.quniform("subsample", 0.5, 1, 0.05),
    "gamma": hp.quniform("gamma", 0.5, 1, 0.01),
    "colsample_bytree": hp.quniform("colsample_bytree", 0.4, 1, 0.05),
    "eval_metric": "auc",
    "objective": "binary:logistic",
}

trials = Trials()
best_params = optimize(trials, space, 1)
best_params

Params:                                              
{'colsample_bytree': 0.9500000000000001, 'eval_metric': 'auc', 'gamma': 0.87, 'learning_rate': 0.1, 'max_depth': 5, 'min_child_weight': 8.0, 'num_round': 100, 'objective': 'binary:logistic', 'subsample': 0.8}
  0%|          | 0/1 [00:00<?, ?trial/s, best loss=?]

Parameters: { "num_round" } are not used.




Score: 0.7358211810857921                            
100%|██████████| 1/1 [00:03<00:00,  3.71s/trial, best loss: -0.7358211810857921]


{'colsample_bytree': 0.9500000000000001,
 'gamma': 0.87,
 'max_depth': 4,
 'min_child_weight': 8.0,
 'subsample': 0.8}

In [45]:
best_params

{'colsample_bytree': 0.9500000000000001,
 'gamma': 0.87,
 'max_depth': 4,
 'min_child_weight': 8.0,
 'subsample': 0.8}

In [46]:
best_params_saved = {
    ## Настраивались
    'colsample_bytree': 0.65,
    'gamma': 0.98,
    'max_depth': 12,
    'min_child_weight': 3.0,
    'subsample': 0.9,
    ## Не настраивались
    "eval_metric": "auc",
    "objective": "binary:logistic",
    "silent": 0,
    "learning_rate": 0.1
}

In [47]:
boost_final_tuned = xgb.train(best_params_saved, dtrain_full, 1000)

Parameters: { "silent" } are not used.



In [48]:
xgb_test_pred = boost_final_tuned.predict(dtest)

# pd.Series(xgb_test_pred, name="dep_delayed_15min").to_csv(
#     "xgb_3.csv", index_label="id", header=True
# )

## Теперь настроим learning_rate

In [49]:
space_lr = {
    "num_round": 1500,
    'colsample_bytree': 0.65,
    'gamma': 0.98,
    'max_depth': 12,
    'min_child_weight': 3.0,
    'subsample': 0.9,
    "eval_metric": "auc",
    "objective": "binary:logistic",
    "silent": 0,
    "learning_rate": hp.quniform("eta", 0.005, 0.05, 0.005),
}

trials = Trials()
best_params = optimize(trials, space_lr, max_evals=15)
best_params

Params:                                               
{'colsample_bytree': 0.65, 'eval_metric': 'auc', 'gamma': 0.98, 'learning_rate': 0.015, 'max_depth': 12, 'min_child_weight': 3.0, 'num_round': 1500, 'objective': 'binary:logistic', 'silent': 0, 'subsample': 0.9}
  0%|          | 0/15 [00:00<?, ?trial/s, best loss=?]

Parameters: { "num_round", "silent" } are not used.




Score: 0.7554562764022572                             
Params:                                                                          
{'colsample_bytree': 0.65, 'eval_metric': 'auc', 'gamma': 0.98, 'learning_rate': 0.01, 'max_depth': 12, 'min_child_weight': 3.0, 'num_round': 1500, 'objective': 'binary:logistic', 'silent': 0, 'subsample': 0.9}
  7%|▋         | 1/15 [00:46<10:43, 46.00s/trial, best loss: -0.7554562764022572]

Parameters: { "num_round", "silent" } are not used.




  7%|▋         | 1/15 [01:14<17:26, 74.74s/trial, best loss: -0.7554562764022572]


KeyboardInterrupt: 

In [103]:
best_params_saved_final = {
    'colsample_bytree': 0.65,
    'gamma': 0.98,
    'max_depth': 12,
    'min_child_weight': 3.0,
    'subsample': 0.9,
    "eval_metric": "auc",
    "objective": "binary:logistic",
    "learning_rate": 0.03
}

In [52]:
boost_final_tuned = xgb.train(best_params_saved_final, dtrain_full, 1500)

In [None]:
xgb_test_pred = boost_final_tuned.predict(dtest)

pd.Series(xgb_test_pred, name="dep_delayed_15min").to_csv(
    "xgb_5.csv", index_label="id", header=True
)

## Обучим и настроим линейную модель

In [None]:
space_logit = {
    "penalty" : hp.choice("penalty", ["l1", "l2"]),
    "C" : hp.loguniform("C", -4 * np.log(10), 4 * np.log(10)),
    "solver" : "liblinear"
}

def score_logit(params):
    print("Params: ", params)
    lin_model = LogisticRegression(**params)
    lin_model.fit(X_train_new, y_train_new)
    pred = lin_model.predict_proba(X_valid_new)[:, 1]
    score = roc_auc_score(y_valid_new, pred)
    print(f"Score:{score}")
    return {"loss": -score, "status": STATUS_OK}

def optimize_logit(trials, space, max_evals=10):
    best = fmin(score_logit, space, algo=tpe.suggest, trials=trials, max_evals=max_evals)
    return best

In [None]:
trials = Trials()
optimize_logit(trials, space_logit, 1)

In [71]:
best_logit_params = {
    'C': 0.18126298865195695,
    'penalty': "l1",
    'solver' : 'liblinear'
}

## Объединим и настроим смесь моделей

In [None]:
logit_train_tuned = LogisticRegression(**best_logit_params).fit(X_train_new, y_train_new)
train_logit_pred = logit_train_tuned.predict_proba(X_valid_new)[:, 1]

In [None]:
boost_train_tuned = xgb.train(best_params_saved_final, xgb.DMatrix(X_train_new, y_train_new), 1500)
train_xgb_pred = boost_train_tuned.predict(xgb.DMatrix(X_valid_new))

In [None]:
weights = {
    "weight" : hp.uniform("w", 0, 1)
}

def score_my_model(params):
    print(params)
    w = params["weight"]
    pred = w * train_xgb_pred + (1 - w) * train_logit_pred
    score = roc_auc_score(y_valid_new, pred)
    print(f"Score:{score}")
    return {"loss": -score, "status": STATUS_OK}

def optimize_my_model(trials, space, max_evals=10):
    best = fmin(score_my_model, space, algo=tpe.suggest, trials=trials, max_evals=max_evals)
    return best

In [None]:
trials = Trials()
best_my_model_weight = optimize_my_model(trials, weights, 100)

In [None]:
best_my_model_weight

In [54]:
best_weight = {'w': 0.9833484263607899}

## Настройка на кроссвалидации

In [146]:
from sklearn.base import BaseEstimator

class My_Model(BaseEstimator):
    def __init__(self, w=None, logit_params=None, xgb_params=None):
        self.logit = None
        self.boost = None
        self.w = w
        self.logit_params = logit_params
        self.xgb_params = xgb_params

    def get_params(self, deep=True):
        return {"w": self.w, "logit_params" : self.logit_params, "xgb_params" : self.xgb_params}

    def set_params(self, **parameters):
        for parameter, value in parameters.items():
            setattr(self, parameter, value)
        return self

    def fit(self, X: np.array, y: np.array):
        dtrain = xgb.DMatrix(X, y)
        self.boost = xgb.train(self.xgb_params, dtrain, 1500)
        self.logit = LogisticRegression(**self.logit_params).fit(X, y)
        return self

    def predict(self, X):
        dtest = xgb.DMatrix(X)
        boost_pred = self.boost.predict(dtest)
        logit_pred = self.logit.predict_proba(X)[:, 1]
        return self.w * boost_pred + (1 - self.w) * logit_pred

In [155]:
mm = My_Model(0.9833484263607899, best_logit_params, best_params_saved_final)

In [156]:
mm.fit(train_new, y)

In [157]:
my_model_test_pred = mm.predict(test_new)

In [158]:
pd.Series(my_model_test_pred, name="dep_delayed_15min").to_csv(
    "mm_6.4.csv", index_label="id", header=True
)