In [1]:
!pip -q install faiss-cpu

import faiss
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
import lightgbm as lgb
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from tqdm.auto import tqdm
from scipy.optimize import differential_evolution
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
import warnings
import shap

warnings.filterwarnings('ignore')

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m47.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25h

In [2]:
init_path = '/kaggle/input/spaceship-titanic/'
train_data = pd.read_csv(init_path + 'train.csv')
test_data = pd.read_csv(init_path + 'test.csv')
test_ids = test_data['PassengerId']

In [3]:
def knn_cluster_mean(X_train, y_train, X_test, k=5, metric="cosine"):
    X_train = X_train.astype("float32")
    X_test = X_test.astype("float32")
    
    if metric == "cosine":
        X_train /= np.linalg.norm(X_train, axis=1, keepdims=True) + 1e-9
        X_test /= np.linalg.norm(X_test, axis=1, keepdims=True) + 1e-9
    
    kmeans = faiss.Kmeans(X_train.shape[1], k, niter=25, verbose=False)
    kmeans.train(X_train)
    
    _, lbl_train = kmeans.index.search(X_train, 1)
    lbl_train = lbl_train.ravel()
    
    s = np.bincount(lbl_train, weights=y_train, minlength=k)
    c = np.bincount(lbl_train, minlength=k)
    cluster_means = s / (c + 1e-9)
    
    _, lbl_test = kmeans.index.search(X_test, 1)
    lbl_test = lbl_test.ravel()
    
    train_result = np.column_stack((lbl_train, cluster_means[lbl_train]))
    test_result = np.column_stack((lbl_test, cluster_means[lbl_test]))
    
    return train_result, test_result

def process_Cabin(series):
    values = []
    for cabin in series:
        if str(cabin) == 'nan':
            values.append([np.nan] * 3)
        else:
            sample = cabin.split('/')
            sample[1] = int(sample[1])
            values.append(sample)

    values = np.array(values).T
    cols = ['cabin_type', 'cabin_num', 'cabin_side']
    values = pd.DataFrame({col: lst for col, lst in zip(cols, values)})

    return values, ['cabin_type', 'cabin_side'], ['cabin_num']

def process_PassengerId(series):
    groups = series.apply(lambda value: int(value.split('_')[0]))
    numbers = series.apply(lambda value: int(value.split('_')[1]))

    group_sizes = {}
    for group in groups:
        if group not in group_sizes:
            group_sizes[group] = 1
        else:
            group_sizes[group] += 1
    group_sizes = [group_sizes[group] for group in groups]

    values = pd.DataFrame({'group': groups, 'number': numbers, 'group_size': group_sizes})

    return values, [], ['group', 'number', 'group_size']

def process_Name(series, max_features=1000):
    vectorizer = TfidfVectorizer(analyzer='char', max_features=max_features, ngram_range=(2, 4))
    data = vectorizer.fit_transform(series.fillna('No_Name').tolist()).toarray()
    data[series.isna()] = [np.nan] * max_features

    cols = [f'Name_TF-IDF_{i + 1}' for i in range(max_features)]
    df = pd.DataFrame(data, columns=cols)

    return df, [], cols

def oof_nan_imputer(all_data):
    all_data = all_data.copy()
    cols_with_nan_values = [col for col in all_data.columns if all_data[col].isna().sum() > 0]
    
    all_test_preds = []
    for col in tqdm(cols_with_nan_values, desc='NaN imputer'):
        col_is_cat = str(all_data[col].dtype) in ['object', 'category']
    
        y_train = all_data[col][all_data[col].notna()].astype('category' if col_is_cat else 'float')
        all_x = all_data.drop([col, 'Transported'], axis=1)
        x_train = all_x[all_data[col].notna()]
        x_test = all_x[all_data[col].isna()]
        
        oof_train_preds = np.zeros((len(x_train),))
        test_preds = np.zeros((len(x_test),))
    
        N_SPLITS = 5
        kfold = (StratifiedKFold if col_is_cat else KFold)(n_splits=5, shuffle=True, random_state=42)
    
        for train_idx, val_idx in tqdm(list(kfold.split(x_train, y_train)), leave=False, desc=col):
            x_train_fold, x_val_fold = x_train.iloc[train_idx], x_train.iloc[val_idx]
            y_train_fold, y_val_fold = y_train.iloc[train_idx], y_train.iloc[val_idx]
    
            model = (lgb.LGBMClassifier if col_is_cat else lgb.LGBMRegressor)(
                n_estimators=5000,
                learning_rate=0.03,
                max_depth=7,
                verbosity=-1,
                seed=42,
                device='gpu',
            )
            model.fit(
                x_train_fold, y_train_fold,
                eval_set=[(x_val_fold, y_val_fold)],
                eval_metric='auc' if col_is_cat else 'mae',
                callbacks=[lgb.early_stopping(50, verbose=False)]
            )
            oof_train_preds[val_idx] = model.predict(x_val_fold)
            test_preds += (model.predict_proba(x_test)[:, 1] if col_is_cat else model.predict(x_test)) / N_SPLITS
    
        all_test_preds.append(test_preds)
    
    for col, test_preds in zip(cols_with_nan_values, all_test_preds):
        all_data[col][all_data[col].isna()] = test_preds
    
    return all_data

In [4]:
all_data = pd.concat([train_data, test_data], axis=0)

cat_cols = ['HomePlanet', 'CryoSleep', 'VIP', 'Destination', 'Has_Cabin']
fairs ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
num_cols = [*fairs, 'Age', 'TotalFair']

all_data['Has_Cabin'] = all_data['Cabin'].notna().astype(int)
all_data['TotalFair'] = (all_data['RoomService'] + all_data['FoodCourt'] + all_data['ShoppingMall'] + \
                         all_data['Spa'] + all_data['VRDeck'])

all_data = all_data.drop('Name', axis=1)
# name_df, new_cat_cols, new_num_cols = process_Name(all_data['Name'], max_features=500)
# cat_cols.extend(new_cat_cols); num_cols.extend(new_num_cols)
# all_data = pd.concat([all_data.drop('Name', axis=1).reset_index(drop=True), name_df], axis=1)

cabin_df, new_cat_cols, new_num_cols = process_Cabin(all_data['Cabin'])
cat_cols.extend(new_cat_cols); num_cols.extend(new_num_cols)
all_data = pd.concat([all_data.drop('Cabin', axis=1).reset_index(drop=True), cabin_df], axis=1)

passengerid_df, new_cat_cols, new_num_cols = process_PassengerId(all_data['PassengerId'])
cat_cols.extend(new_cat_cols); num_cols.extend(new_num_cols)
all_data = pd.concat([all_data.drop('PassengerId', axis=1).reset_index(drop=True), passengerid_df], axis=1)

all_data['Spa_binned'] = pd.qcut(all_data['Spa'], q=25, duplicates='drop')
all_data['VRDeck_binned'] = pd.qcut(all_data['VRDeck'], q=25, duplicates='drop')
cat_cols.extend(['Spa_binned', 'VRDeck_binned'])

fairs = all_data[fairs]

raise

for col in cat_cols:
    all_data[col] = LabelEncoder().fit_transform(all_data[col])
    all_data[col] = all_data[col].astype('str').astype('category')

for col in num_cols:
    all_data[col] = all_data[col].astype('float')

# all_data = oof_nan_imputer(all_data)

train_data, test_data = all_data[:len(train_data)], all_data[len(train_data):].drop('Transported', axis=1)

In [5]:
y_train = train_data['Transported'].astype(int)
x_train = train_data.drop('Transported', axis=1)

In [6]:
train_result, test_result = knn_cluster_mean(x_train[num_cols].fillna(-1).to_numpy(), y_train.values,
                                             test_data[num_cols].fillna(-1).to_numpy(), k=5)

cols = ['cluster', 'cluster_mean_label']
train_result = pd.DataFrame(train_result, columns=cols)
test_result = pd.DataFrame(test_result, columns=cols)

cat_cols.append(cols[0]); num_cols.append(cols[1])
train_result[cols[0]] = train_result[cols[0]].astype(int).astype('category')
test_result[cols[0]] = test_result[cols[0]].astype(int).astype('category')

x_train = pd.concat([x_train.reset_index(drop=True), train_result], axis=1)
test_data = pd.concat([test_data.reset_index(drop=True), test_result], axis=1)

In [7]:
def cross_val_score(model, X, y, cv=5):
    skf = StratifiedKFold(n_splits=cv, shuffle=True, random_state=42)
    scores = []
    
    for train_idx, val_idx in skf.split(X, y):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
        
        model_clone = model.__class__(**model.get_params())
        
        model_clone.fit(
            X_train, y_train,
            eval_set=(X_val, y_val)
        )
        
        score = model_clone.score(X_val, y_val)
        scores.append(score)
    
    return np.array(scores)

def add_new_features(x_train, test_data):
    x_train = x_train.copy()
    test_data = test_data.copy()

    features = x_train.columns.tolist()
    
    model = CatBoostClassifier(
        iterations=1500,
        eval_metric='Accuracy',
        learning_rate=0.03,
        max_depth=6,
        verbose=100,
        l2_leaf_reg=2,
        early_stopping_rounds=200,
        random_state=42,
        cat_features=cat_cols
    )
    x_t, x_v, y_t, y_v = train_test_split(x_train, y_train, random_state=42, test_size=0.2, shuffle=True, stratify=y_train)
    model.fit(x_t, y_t, eval_set=(x_v, y_v))
    
    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(x_train)
    
    shap_interaction_values = explainer.shap_interaction_values(x_train)
    
    if isinstance(shap_interaction_values, list):
        shap_interaction_values = shap_interaction_values[1]
    
    interaction_matrix = np.abs(shap_interaction_values).mean(0)
    
    interaction_df = pd.DataFrame(
        interaction_matrix,
        index=features,
        columns=features
    )
    
    np.fill_diagonal(interaction_df.values, 0)
    
    print("Top 10 interactions:")
    interactions = []
    for i in range(len(features)):
        for j in range(i+1, len(features)):
            interactions.append({
                'feature1': features[i],
                'feature2': features[j],
                'interaction_strength': interaction_df.iloc[i, j]
            })
    
    interactions_df = pd.DataFrame(interactions).sort_values('interaction_strength', ascending=False)
    print(interactions_df.head(10))
    
    model = lgb.LGBMClassifier(
        n_estimators=1000,
        learning_rate=0.01,
        max_depth=4,
        lambda_l2=2.0,
        verbosity=-1,
        seed=42,
        # device='gpu'
    )
    
    baseline_score = cross_val_score(model, x_train[features], y_train, cv=5).mean()
    print(f"Baseline: {baseline_score:.5f}\n")
    
    current_score = baseline_score
    NEW_FEATURES = 20
    new_features_counter = 0
    added_features = []
    
    for idx, row in tqdm(list(interactions_df.iterrows())):
        if new_features_counter >= NEW_FEATURES:
            break
            
        f1, f2 = row['feature1'], row['feature2']
        
        both_cat = f1 in cat_cols and f2 in cat_cols
        both_num = f1 not in cat_cols and f2 not in cat_cols
        
        if not both_cat and not both_num:
            continue
        
        x_temp = x_train.copy()
        
        if both_cat:
            feat_name = f'{f1}_X_{f2}'
            x_temp[feat_name] = x_temp[f1].astype(str) + '_' + x_temp[f2].astype(str)
            x_temp[feat_name] = x_temp[feat_name].astype('category')
        else:
            feat_name = f'{f1}_x_{f2}'
            x_temp[feat_name] = x_temp[f1] * x_temp[f2]
        
        test_score = cross_val_score(model, x_temp, y_train, cv=3).mean()
        improvement = test_score - current_score
        
        if improvement > 0.0001:
            if both_cat:
                x_train[feat_name] = x_train[f1].astype(str) + '_' + x_train[f2].astype(str)
                test_data[feat_name] = test_data[f1].astype(str) + '_' + test_data[f2].astype(str)
                x_train[feat_name] = x_train[feat_name].astype('category')
                test_data[feat_name] = test_data[feat_name].astype('category')
                cat_cols.append(feat_name)
            else:
                x_train[feat_name] = x_train[f1] * x_train[f2]
                test_data[feat_name] = test_data[f1] * test_data[f2]
            
            current_score = test_score
            new_features_counter += 1
            added_features.append(feat_name)
            print(f'✅ {feat_name:40s} {test_score:.5f} (+{improvement:.5f})')
        else:
            print(f'❌ {feat_name:40s} {test_score:.5f} ({improvement:+.5f})')
    
    final_score = cross_val_score(model, x_train, y_train, cv=5).mean()
    
    print(f'\nBaseline: {baseline_score:.5f}')
    print(f'Final: {final_score:.5f}')
    print(f'Improvement: {final_score - baseline_score:+.5f}')
    print(f'Added features: {new_features_counter}')

    return x_train, test_data

In [8]:
x_train, test_data = add_new_features(x_train, test_data)

New features adding:   0%|          | 0/2 [00:00<?, ?it/s]

0:	learn: 0.7490653	test: 0.7383554	best: 0.7383554 (0)	total: 82.5ms	remaining: 2m 3s
100:	learn: 0.8129134	test: 0.7952846	best: 0.7952846 (66)	total: 1.59s	remaining: 22s
200:	learn: 0.8231234	test: 0.7998850	best: 0.7998850 (195)	total: 3.05s	remaining: 19.7s
300:	learn: 0.8327581	test: 0.8016101	best: 0.8016101 (299)	total: 4.51s	remaining: 18s
400:	learn: 0.8435433	test: 0.8062105	best: 0.8062105 (400)	total: 6.08s	remaining: 16.7s
500:	learn: 0.8528904	test: 0.8067855	best: 0.8085106 (438)	total: 7.61s	remaining: 15.2s
600:	learn: 0.8625252	test: 0.8050604	best: 0.8085106 (438)	total: 9.11s	remaining: 13.6s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.8085106383
bestIteration = 438

Shrink model to first 439 iterations.
Top 10 interactions:
        feature1     feature2  interaction_strength
0     HomePlanet    CryoSleep              0.081463
195    cabin_num   cabin_side              0.052231
24     CryoSleep  RoomService              0.044124
27     Cry

  0%|          | 0/231 [00:00<?, ?it/s]

✅ HomePlanet_X_CryoSleep                   0.80847 (+0.00230)
❌ Spa_x_VRDeck                             0.80732 (-0.00115)
✅ cabin_type_X_cabin_side                  0.80858 (+0.00011)
❌ RoomService_x_VRDeck                     0.80663 (-0.00196)
✅ HomePlanet_X_cabin_type                  0.80870 (+0.00012)
✅ RoomService_x_Spa                        0.81031 (+0.00161)
❌ HomePlanet_X_cluster                     0.80858 (-0.00173)
❌ FoodCourt_x_TotalFair                    0.80927 (-0.00104)
❌ HomePlanet_X_cabin_side                  0.80916 (-0.00115)
❌ RoomService_x_TotalFair                  0.80973 (-0.00058)
❌ cabin_num_x_group                        0.81008 (-0.00023)
❌ FoodCourt_x_Spa                          0.80973 (-0.00058)
✅ Destination_X_cabin_type                 0.81054 (+0.00023)
✅ HomePlanet_X_VRDeck_binned               0.81203 (+0.00150)
❌ Destination_X_VRDeck_binned              0.81100 (-0.00104)
❌ cabin_type_X_VRDeck_binned               0.80743 (-0.00460)
✅ Age_x_

  0%|          | 0/630 [00:00<?, ?it/s]

✅ Spa_x_VRDeck                             0.81422 (+0.00011)
❌ ShoppingMall_x_ShoppingMall_x_cluster_mean_label 0.81364 (-0.00058)
❌ HomePlanet_X_CryoSleep                   0.81422 (+0.00000)
❌ CryoSleep_X_HomePlanet_X_cabin_type      0.81387 (-0.00035)
❌ cabin_num_x_group                        0.81341 (-0.00081)
❌ RoomService_x_Spa                        0.81422 (+0.00000)
✅ FoodCourt_x_Age_x_FoodCourt              0.81663 (+0.00242)
❌ RoomService_x_RoomService_x_group        0.81399 (-0.00265)
❌ CryoSleep_X_VRDeck_binned                0.81295 (-0.00368)
❌ Age_x_cabin_num                          0.81468 (-0.00196)
❌ RoomService_x_VRDeck                     0.81272 (-0.00391)
❌ FoodCourt_x_cluster_mean_label           0.81491 (-0.00173)
❌ Age_x_ShoppingMall_x_ShoppingMall_x_cluster_mean_label 0.81445 (-0.00219)
❌ VRDeck_binned_X_HomePlanet_X_cabin_type  0.81054 (-0.00610)
❌ Age_x_TotalFair_x_TotalFair_x_group      0.81353 (-0.00311)
❌ FoodCourt_x_TotalFair                    0.813

In [9]:
def optimize_weights_acc(predictions, y_true):
    def objective(weights):
        weights = weights / weights.sum()
        probs = np.dot(predictions, weights)
        preds = (probs >= 0.5).astype(int)
        score = accuracy_score(y_true, preds)
        return -score

    bounds = [(0, 1) for _ in range(predictions.shape[1])]
    result = differential_evolution(objective, bounds, seed=42, maxiter=10000, polish=True)

    weights = result.x / result.x.sum()
    best_f1 = -result.fun

    return weights, best_f1

In [10]:
ITERATIONS = 1
N = 20

best_score = 0
best_test_preds = None
used_indexes = set()
additional_x_trains = []
additional_y_trains = []
for _ in tqdm(range(ITERATIONS)):
    oof_train_preds_1 = np.zeros((len(x_train), 3))
    test_preds_1 = np.zeros((len(test_data), 3))
    
    N_SPLITS = 5
    kfold = StratifiedKFold(n_splits=N_SPLITS, random_state=42, shuffle=True)
    splits = list(kfold.split(x_train, y_train))

    for train_idx, val_idx in tqdm(splits, desc='1/2 ensemble\'s level', leave=False):
        x_train_fold, x_val_fold = pd.concat([x_train.iloc[train_idx], *additional_x_trains], axis=0), x_train.iloc[val_idx]
        y_train_fold, y_val_fold = pd.concat([y_train.iloc[train_idx], *additional_y_trains], axis=0), y_train.iloc[val_idx]
    
        model = CatBoostClassifier(
            iterations=1500,
            eval_metric='Accuracy',
            learning_rate=0.03,
            max_depth=6,
            verbose=50,
            l2_leaf_reg=2,
            early_stopping_rounds=200,
            random_state=42,
            cat_features=cat_cols,
            # task_type='GPU'
        )
        model.fit(
            x_train_fold, y_train_fold,
            eval_set=(x_val_fold, y_val_fold)
        )
        oof_train_preds_1[val_idx, 0] = model.predict_proba(x_val_fold)[:, 1]
        test_preds_1[:, 0] += model.predict_proba(test_data)[:, 1] / N_SPLITS

        model = lgb.LGBMClassifier(
            n_estimators=1500,
            learning_rate=0.03,
            max_depth=5,
            lambda_l2=4.8,
            verbosity=-1,
            seed=42,
            # device='gpu'
        )
        model.fit(
            x_train_fold, y_train_fold,
            eval_set=[(x_val_fold, y_val_fold)],
            eval_metric='acc',
            callbacks=[
                lgb.log_evaluation(200),
                lgb.early_stopping(200, verbose=True)
            ]
        )
        oof_train_preds_1[val_idx, 1] = model.predict_proba(x_val_fold)[:, 1]
        test_preds_1[:, 1] += model.predict_proba(test_data)[:, 1] / N_SPLITS

        model = XGBClassifier(
            enable_categorical=True,
            n_estimators=1500,
            learning_rate=0.03,
            max_depth=7,
            early_stopping_rounds=200,
            seed=42,
            # device='cuda'
        )
        model.fit(
            x_train_fold, y_train_fold,
            eval_set=[(x_val_fold, y_val_fold)],
            verbose=200
        )
        oof_train_preds_1[val_idx, 2] = model.predict_proba(x_val_fold)[:, 1]
        test_preds_1[:, 2] += model.predict_proba(test_data)[:, 1] / N_SPLITS

    weights, score = optimize_weights_acc(oof_train_preds_1, y_train)
    print(f'1st level; Metric: {round(score, 5)}; Weights: {weights.round(5)}')

    # oof_train_preds_2 = np.zeros((len(oof_train_preds_1), 4))
    # test_preds_2 = np.zeros((len(test_preds_1), 4))

    # oof_train_preds_2[:, 3] = np.dot(oof_train_preds_1, weights)
    # test_preds_2[:, 3] = np.dot(test_preds_1, weights)

    # for train_idx, val_idx in tqdm(splits, desc='2/2 ensemble\'s level', leave=False):
    #     x_train_fold, x_val_fold = oof_train_preds_1[train_idx], oof_train_preds_1[val_idx]
    #     y_train_fold, y_val_fold = y_train.iloc[train_idx], y_train.iloc[val_idx]

    #     model = CatBoostClassifier(
    #         iterations=1500,
    #         eval_metric='Accuracy',
    #         learning_rate=0.01,
    #         max_depth=4,
    #         verbose=50,
    #         l2_leaf_reg=2,
    #         early_stopping_rounds=50,
    #         random_state=42,
    #         # task_type='GPU'
    #     )
    #     model.fit(
    #         x_train_fold, y_train_fold,
    #         eval_set=(x_val_fold, y_val_fold)
    #     )
    #     oof_train_preds_2[val_idx, 0] = model.predict_proba(x_val_fold)[:, 1]
    #     test_preds_2[:, 0] += model.predict_proba(test_preds_1)[:, 1] / N_SPLITS

    #     model = lgb.LGBMClassifier(
    #         n_estimators=1000,
    #         learning_rate=0.01,
    #         max_depth=4,
    #         lambda_l2=2.0,
    #         verbosity=-1,
    #         seed=42,
    #         # device='gpu'
    #     )
    #     model.fit(
    #         x_train_fold, y_train_fold,
    #         eval_set=[(x_val_fold, y_val_fold)],
    #         eval_metric='acc',
    #         callbacks=[
    #             lgb.log_evaluation(200),
    #             lgb.early_stopping(200, verbose=True)
    #         ]
    #     )
    #     oof_train_preds_2[val_idx, 1] = model.predict_proba(x_val_fold)[:, 1]
    #     test_preds_2[:, 1] += model.predict_proba(test_preds_1)[:, 1] / N_SPLITS

    #     model = XGBClassifier(
    #         n_estimators=1000,
    #         learning_rate=0.01,
    #         max_depth=4,
    #         early_stopping_rounds=50,
    #         seed=42,
    #         # device='cuda'
    #     )
    #     model.fit(
    #         x_train_fold, y_train_fold,
    #         eval_set=[(x_val_fold, y_val_fold)],
    #         verbose=200
    #     )
    #     oof_train_preds_2[val_idx, 2] = model.predict_proba(x_val_fold)[:, 1]
    #     test_preds_2[:, 2] += model.predict_proba(test_preds_1)[:, 1] / N_SPLITS

    # weights, score = optimize_weights_acc(oof_train_preds_2, y_train)
    # print(f'2nd level; Metric: {round(score, 5)}; Weights: {weights.round(5)}')

    # if score > best_score:
    #     best_score = score
    #     best_test_preds = test_preds_2
    # else:
    #     print('Метрика не улучшилась. Цикл прекращается.')
    #     break
    break

    test_preds_2 = np.dot(test_preds_2, weights)
    indexes = abs(test_preds_2 - 0.5).argsort()[::-1]
    sorted_test_preds = test_preds[indexes]
    additional_indexes = np.concatenate([
        indexes[test_preds_2[indexes] < 0.5][:N],
        indexes[test_preds_2[indexes] >= 0.5][:N]
    ], axis=0)
    additional_indexes = list(set(additional_indexes) - used_indexes)
    used_indexes.update(additional_indexes)

    if len(additional_indexes) == 0:
        print('Нет новых уверенных сэмплов. Цикл прекращается.')
        break

    additional_x_trains.append(test_data.iloc[additional_indexes])
    additional_y_trains.append(pd.Series(test_preds[additional_indexes] >= 0.5).astype(int))

  0%|          | 0/1 [00:00<?, ?it/s]

1/2 ensemble's level:   0%|          | 0/5 [00:00<?, ?it/s]

0:	learn: 0.7517975	test: 0.7561817	best: 0.7561817 (0)	total: 36.1ms	remaining: 54.2s
50:	learn: 0.7947944	test: 0.7975848	best: 0.7981599 (46)	total: 1.32s	remaining: 37.6s
100:	learn: 0.8083118	test: 0.8102358	best: 0.8102358 (90)	total: 2.64s	remaining: 36.6s
150:	learn: 0.8150705	test: 0.8154112	best: 0.8159862 (148)	total: 3.94s	remaining: 35.2s
200:	learn: 0.8202473	test: 0.8182864	best: 0.8188614 (199)	total: 5.21s	remaining: 33.7s
250:	learn: 0.8261432	test: 0.8171363	best: 0.8194365 (231)	total: 6.52s	remaining: 32.4s
300:	learn: 0.8337647	test: 0.8125359	best: 0.8194365 (231)	total: 7.87s	remaining: 31.3s
350:	learn: 0.8385102	test: 0.8119609	best: 0.8194365 (231)	total: 9.16s	remaining: 30s
400:	learn: 0.8467069	test: 0.8148361	best: 0.8194365 (231)	total: 10.4s	remaining: 28.6s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.8194364577
bestIteration = 231

Shrink model to first 232 iterations.
Training until validation scores don't improve for 200 roun

In [12]:
# preds = np.dot(test_preds, weights) >= 0.5
# preds = preds.astype(bool)
preds = (np.dot(test_preds_1, weights) > 0.5).astype(bool)
submission = pd.DataFrame({'PassengerId': test_ids, 'Transported': preds})
submission.to_csv('submission.csv', index=False)
submission.head(10)

Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,True
5,0027_01,True
6,0029_01,True
7,0032_01,True
8,0032_02,True
9,0033_01,True
