In [62]:
!pip -q install faiss-cpu

import faiss
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
import lightgbm as lgb
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from tqdm.auto import tqdm
from scipy.optimize import differential_evolution
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
import warnings

warnings.filterwarnings('ignore')

In [63]:
# Directory holding the competition CSVs inside the Kaggle runtime.
init_path = '/kaggle/input/spaceship-titanic/'
train_data, test_data = (
    pd.read_csv(f'{init_path}{split}.csv') for split in ('train', 'test')
)
# Keep the raw ids for the submission file: the PassengerId column itself is
# replaced by derived features during preprocessing.
test_ids = test_data['PassengerId']

In [64]:
def knn_cluster_mean(X_train, y_train, X_test, k=5, metric="cosine"):
    """Cluster the rows with FAISS k-means and attach each cluster's mean label.

    Despite the name, this fits k-means centroids (not a k-NN model) on
    ``X_train`` and assigns every train/test row to its closest centroid.

    Args:
        X_train: 2-D numeric array used to fit the centroids.
        y_train: 1-D numeric labels aligned with ``X_train`` rows.
        X_test: 2-D numeric array assigned to the fitted centroids.
        k: Number of centroids.
        metric: When equal to ``"cosine"`` every row is scaled to unit L2 norm
            before clustering; any other value leaves the rows unscaled.

    Returns:
        ``(train_result, test_result)``: two arrays of shape ``(n_rows, 2)``
        whose columns are the cluster id and the mean of ``y_train`` over the
        training rows in that cluster. The training-row means include each
        row's own label.
    """
    def _prepare(matrix):
        # astype() copies, so the caller's array is never modified.
        matrix = matrix.astype("float32")
        if metric == "cosine":
            # Small epsilon keeps all-zero rows from dividing by zero.
            matrix = matrix / (np.linalg.norm(matrix, axis=1, keepdims=True) + 1e-9)
        return matrix

    train_vecs = _prepare(X_train)
    test_vecs = _prepare(X_test)

    clusterer = faiss.Kmeans(train_vecs.shape[1], k, niter=25, verbose=False)
    clusterer.train(train_vecs)

    def _nearest_centroid(vectors):
        # A 1-nearest search against the centroid index yields the cluster id.
        _, nearest = clusterer.index.search(vectors, 1)
        return nearest.ravel()

    train_labels = _nearest_centroid(train_vecs)
    test_labels = _nearest_centroid(test_vecs)

    # Per-cluster label mean; a cluster with no members ends up with mean 0.
    label_sums = np.bincount(train_labels, weights=y_train, minlength=k)
    label_counts = np.bincount(train_labels, minlength=k)
    cluster_means = label_sums / (label_counts + 1e-9)

    return (
        np.column_stack((train_labels, cluster_means[train_labels])),
        np.column_stack((test_labels, cluster_means[test_labels])),
    )

def process_Cabin(series):
    """Split ``deck/num/side`` cabin strings into three typed feature columns.

    The previous implementation pushed the mixed str/int/NaN rows through
    ``np.array``, which coerces everything to unicode: missing cabins came back
    as the literal string ``'nan'`` and ``cabin_num`` as strings. Records are
    now built directly so missing values stay real NaN and ``cabin_num`` is
    numeric.

    Args:
        series: Iterable of cabin strings such as ``'B/0/P'``; missing entries
            may be NaN/None (or the literal string ``'nan'``).

    Returns:
        ``(values, cat_cols, num_cols)`` where ``values`` is a DataFrame with a
        fresh 0..n-1 index and columns ``cabin_type``, ``cabin_num`` (float),
        ``cabin_side``; ``cat_cols``/``num_cols`` name the categorical and
        numeric columns among them.

    Raises:
        ValueError: If a non-missing cabin does not have exactly three
            ``/``-separated parts or its middle part is not a number.
    """
    cols = ['cabin_type', 'cabin_num', 'cabin_side']

    records = []
    for cabin in series:
        # str(...) == 'nan' keeps the original treatment of a literal 'nan'.
        if pd.isna(cabin) or str(cabin) == 'nan':
            records.append((np.nan, np.nan, np.nan))
            continue
        cabin_type, cabin_num, cabin_side = str(cabin).split('/')
        # float (not int) so the column can hold NaN for missing cabins.
        records.append((cabin_type, float(cabin_num), cabin_side))

    values = pd.DataFrame(records, columns=cols)

    return values, ['cabin_type', 'cabin_side'], ['cabin_num']

def process_PassengerId(series):
    """Derive group features from ``gggg_pp`` passenger ids.

    Args:
        series: pandas Series of id strings made of a group part and a member
            number joined by ``_`` (e.g. ``'0032_02'``).

    Returns:
        ``(values, cat_cols, num_cols)`` where ``values`` has the integer
        columns ``group``, ``number`` and ``group_size`` (how many ids in
        ``series`` share the row's group). All three are reported as numeric
        columns; the categorical list is empty.
    """
    id_parts = series.apply(lambda passenger_id: passenger_id.split('_'))
    groups = id_parts.apply(lambda parts: int(parts[0]))
    numbers = id_parts.apply(lambda parts: int(parts[1]))

    # Occurrences of each group id, broadcast back onto the rows in order.
    members_per_group = groups.value_counts()
    group_sizes = groups.map(members_per_group).tolist()

    values = pd.DataFrame({'group': groups, 'number': numbers, 'group_size': group_sizes})

    return values, [], ['group', 'number', 'group_size']

def process_Name(series, max_features=1000):
    """Encode names as character n-gram TF-IDF features.

    Args:
        series: pandas Series of names; missing names are allowed.
        max_features: Upper bound on the vocabulary size handed to
            ``TfidfVectorizer``. The fitted vocabulary can be smaller, so the
            number of output columns is taken from the fitted matrix rather
            than assumed to equal this value.

    Returns:
        ``(df, cat_cols, num_cols)`` where ``df`` has one
        ``Name_TF-IDF_<i>`` column per fitted feature (rows of missing names
        are all NaN), ``cat_cols`` is empty and ``num_cols`` lists the feature
        column names.
    """
    vectorizer = TfidfVectorizer(analyzer='char', max_features=max_features, ngram_range=(2, 4))
    data = vectorizer.fit_transform(series.fillna('No_Name').tolist()).toarray()

    # Size everything from the fitted matrix: with a small corpus the
    # vocabulary has fewer than max_features entries, which previously broke
    # both the NaN row assignment and the column-name list.
    n_features = data.shape[1]

    # Rows that were filled with the 'No_Name' placeholder carry no signal.
    data[series.isna().to_numpy()] = np.nan

    cols = [f'Name_TF-IDF_{i + 1}' for i in range(n_features)]
    df = pd.DataFrame(data, columns=cols)

    return df, [], cols

def oof_nan_imputer(all_data, target_col='Transported', n_splits=5):
    """Fill missing values column by column with cross-validated LightGBM models.

    For every column containing NaN (except ``target_col``) a model is trained
    on the rows where the column is known, using all other non-target columns
    as features. ``n_splits`` fold models are fitted with early stopping on
    the held-out fold and their predictions for the missing rows are averaged.

    Changes relative to the earlier version:
      * the target column is no longer "imputed" (its NaNs are just the
        unlabeled test rows);
      * categorical columns receive a class label (argmax of the fold-averaged
        class probabilities) instead of the probability of class 1;
      * values are written back with ``.loc`` instead of chained indexing,
        which is unreliable and a no-op under pandas Copy-on-Write;
      * the unused out-of-fold train predictions are no longer computed.

    Args:
        all_data: DataFrame of features (plus, optionally, ``target_col``).
            Feature columns must be numeric or ``category`` dtype so LightGBM
            can consume them.
        target_col: Column excluded both from the features and from
            imputation. Ignored if absent.
        n_splits: Number of cross-validation folds per column.

    Returns:
        A copy of ``all_data`` with the missing feature values filled in.
    """
    all_data = all_data.copy()
    cols_with_nan_values = [
        col for col in all_data.columns
        if col != target_col and all_data[col].isna().any()
    ]

    # Predictions are collected first and written back at the end so that one
    # column's imputed values never become features for another column.
    imputed_values = {}
    for col in tqdm(cols_with_nan_values, desc='NaN imputer'):
        col_is_cat = str(all_data[col].dtype) in ['object', 'category']
        known = all_data[col].notna()

        y_train = all_data.loc[known, col].astype('category' if col_is_cat else 'float')
        all_x = all_data.drop(columns=[c for c in (col, target_col) if c in all_data.columns])
        x_train = all_x[known]
        x_test = all_x[~known]

        if col_is_cat:
            y_train = y_train.cat.remove_unused_categories()
            classes = list(y_train.cat.categories)
            test_preds = np.zeros((len(x_test), len(classes)))
            # 'auc' is a binary-only metric in LightGBM.
            eval_metric = 'auc' if len(classes) == 2 else 'multi_logloss'
        else:
            classes = None
            test_preds = np.zeros((len(x_test),))
            eval_metric = 'mae'

        kfold = (StratifiedKFold if col_is_cat else KFold)(n_splits=n_splits, shuffle=True, random_state=42)

        for train_idx, val_idx in tqdm(list(kfold.split(x_train, y_train)), leave=False, desc=col):
            x_train_fold, x_val_fold = x_train.iloc[train_idx], x_train.iloc[val_idx]
            y_train_fold, y_val_fold = y_train.iloc[train_idx], y_train.iloc[val_idx]

            model = (lgb.LGBMClassifier if col_is_cat else lgb.LGBMRegressor)(
                n_estimators=5000,
                learning_rate=0.03,
                max_depth=7,
                verbosity=-1,
                seed=42,
                device='gpu',
            )
            model.fit(
                x_train_fold, y_train_fold,
                eval_set=[(x_val_fold, y_val_fold)],
                eval_metric=eval_metric,
                callbacks=[lgb.early_stopping(50, verbose=False)]
            )

            if col_is_cat:
                # Align on class labels in case a rare class is missing from
                # this fold's training split.
                fold_proba = pd.DataFrame(model.predict_proba(x_test), columns=model.classes_)
                test_preds += fold_proba.reindex(columns=classes, fill_value=0.0).to_numpy() / n_splits
            else:
                test_preds += model.predict(x_test) / n_splits

        if col_is_cat:
            imputed_values[col] = np.asarray(classes, dtype=object)[test_preds.argmax(axis=1)]
        else:
            imputed_values[col] = test_preds

    for col, values in imputed_values.items():
        all_data.loc[all_data[col].isna(), col] = values

    return all_data

In [65]:
# Feature engineering runs on train and test stacked together (train rows
# first), so both splits get identical encodings; the frame is cut back apart
# by position at the end of the cell.
# NOTE(review): this cell rebinds train_data/test_data, so re-running it
# without first re-running the loading cell operates on already-processed
# frames.
all_data = pd.concat([train_data, test_data], axis=0)

# Running lists of categorical / numeric feature names; the process_* helpers
# below extend them, and later cells read (and further extend) them.
cat_cols = ['HomePlanet', 'CryoSleep', 'VIP', 'Destination', 'Has_Cabin']
num_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Age', 'TotalFair']

# Missing-cabin indicator, and the sum of the five spend columns (the sum is
# NaN whenever any of its components is NaN).
all_data['Has_Cabin'] = all_data['Cabin'].notna().astype(int)
all_data['TotalFair'] = (all_data['RoomService'] + all_data['FoodCourt'] + all_data['ShoppingMall'] + \
                         all_data['Spa'] + all_data['VRDeck'])

# Name is currently dropped; the TF-IDF name features below are disabled.
all_data = all_data.drop('Name', axis=1)
# name_df, new_cat_cols, new_num_cols = process_Name(all_data['Name'], max_features=500)
# cat_cols.extend(new_cat_cols); num_cols.extend(new_num_cols)
# all_data = pd.concat([all_data.drop('Name', axis=1).reset_index(drop=True), name_df], axis=1)

# Replace Cabin with its parsed parts. reset_index is needed because the
# concat above kept the original per-file indices while the helper returns a
# fresh 0..n-1 index.
cabin_df, new_cat_cols, new_num_cols = process_Cabin(all_data['Cabin'])
cat_cols.extend(new_cat_cols); num_cols.extend(new_num_cols)
all_data = pd.concat([all_data.drop('Cabin', axis=1).reset_index(drop=True), cabin_df], axis=1)

# Replace PassengerId with group id, within-group number and group size.
passengerid_df, new_cat_cols, new_num_cols = process_PassengerId(all_data['PassengerId'])
cat_cols.extend(new_cat_cols); num_cols.extend(new_num_cols)
all_data = pd.concat([all_data.drop('PassengerId', axis=1).reset_index(drop=True), passengerid_df], axis=1)

# Quantile-binned copies of two spend columns, treated as categoricals.
# duplicates='drop' merges bins whose edges coincide.
all_data['Spa_binned'] = pd.qcut(all_data['Spa'], q=25, duplicates='drop')
all_data['VRDeck_binned'] = pd.qcut(all_data['VRDeck'], q=25, duplicates='drop')
cat_cols.extend(['Spa_binned', 'VRDeck_binned'])

# NOTE(review): casting through 'str' appears to turn missing values into the
# literal category 'nan', so these columns would contain no NaN by the time
# oof_nan_imputer runs and only numeric gaps get model-imputed — confirm this
# is intended.
for col in cat_cols:
    all_data[col] = all_data[col].astype('str').astype('category')

for col in num_cols:
    all_data[col] = all_data[col].astype('float')

all_data = oof_nan_imputer(all_data)

# Split back by position: the first len(train_data) rows are the training
# rows. Test rows have no label, so the target column is dropped there.
train_data, test_data = all_data[:len(train_data)], all_data[len(train_data):].drop('Transported', axis=1)

In [66]:
# Separate the processed training frame into the feature matrix and the
# 0/1-encoded target.
x_train = train_data.drop(columns='Transported')
y_train = train_data['Transported'].astype(int)

In [67]:
# Cluster rows on the numeric features (remaining NaNs replaced by -1) and
# derive two extra features per row: the cluster id and that cluster's mean
# training label.
# NOTE(review): the cluster means are computed from y_train over ALL training
# rows before any cross-validation split, so each row's 'cluster_mean_label'
# includes its own label and the labels of rows that later land in its
# validation fold — OOF scores may be optimistic; confirm this is acceptable.
train_result, test_result = knn_cluster_mean(x_train[num_cols].fillna(-1).to_numpy(), y_train.values,
                                             test_data[num_cols].fillna(-1).to_numpy(), k=5)

cols = ['cluster', 'cluster_mean_label']
train_result = pd.DataFrame(train_result, columns=cols)
test_result = pd.DataFrame(test_result, columns=cols)

# Register the new features. This must happen after the call above, which
# reads num_cols; note the appends repeat if the cell is run again.
cat_cols.append(cols[0]); num_cols.append(cols[1])
# column_stack produced floats; restore the cluster id to an integer category.
train_result[cols[0]] = train_result[cols[0]].astype(int).astype('category')
test_result[cols[0]] = test_result[cols[0]].astype(int).astype('category')

# Indices are reset so the positional 0..n-1 result frames line up row by row.
x_train = pd.concat([x_train.reset_index(drop=True), train_result], axis=1)
test_data = pd.concat([test_data.reset_index(drop=True), test_result], axis=1)

In [68]:
def optimize_weights_acc(predictions, y_true):
    """Search for blend weights that maximise accuracy at a 0.5 threshold.

    Args:
        predictions: Array of shape ``(n_samples, n_models)`` holding each
            model's predicted probability of the positive class.
        y_true: 1-D array-like of 0/1 labels aligned with ``predictions``.

    Returns:
        ``(weights, best_acc)``: non-negative weights summing to 1 (one per
        model) and the accuracy they reach on ``predictions``.
    """
    # Convert once up front: the objective is evaluated thousands of times.
    predictions = np.asarray(predictions, dtype=float)
    y_true = np.asarray(y_true).ravel()

    def objective(weights):
        total = weights.sum()
        if total <= 0:
            # An all-zero candidate cannot be normalised; score it as the
            # worst possible blend instead of propagating NaN.
            return 0.0
        probs = np.dot(predictions, weights / total)
        preds = (probs >= 0.5).astype(int)
        # Negated because differential_evolution minimises.
        return -float(np.mean(preds == y_true))

    bounds = [(0, 1) for _ in range(predictions.shape[1])]
    result = differential_evolution(objective, bounds, seed=42, maxiter=10000, polish=True)

    total = result.x.sum()
    if total > 0:
        weights = result.x / total
    else:
        # Degenerate optimum: fall back to an equal-weight blend.
        weights = np.full(len(result.x), 1.0 / len(result.x))
    best_acc = -result.fun

    return weights, best_acc

In [88]:
# Pseudo-labelling loop. Each round trains CatBoost / LightGBM / XGBoost with
# stratified CV, tunes blend weights on the out-of-fold predictions, and (if
# the blended OOF accuracy improved) adds the most confident test rows, with
# their predicted labels, to the training folds of the next round.
ITERATIONS = 1  # number of pseudo-labelling rounds
N = 20          # max confident test rows taken per class per round

best_score = 0
best_test_preds = None
# Blend weights belonging to best_test_preds. Only updated when a round
# improves the score, so later cells never pair the best predictions with the
# weights of a worse round.
weights = None
used_indexes = set()
additional_x_trains = []
additional_y_trains = []
for _ in tqdm(range(ITERATIONS)):
    # One column per model: 0 = CatBoost, 1 = LightGBM, 2 = XGBoost.
    oof_train_preds = np.zeros((len(x_train), 3))
    test_preds = np.zeros((len(test_data), 3))

    N_SPLITS = 15
    kfold = StratifiedKFold(n_splits=N_SPLITS, random_state=42, shuffle=True)

    for train_idx, val_idx in tqdm(list(kfold.split(x_train, y_train)), leave=False):
        # Pseudo-labelled test rows are appended to the training part only;
        # validation folds always contain genuine training rows.
        x_train_fold, x_val_fold = pd.concat([x_train.iloc[train_idx], *additional_x_trains], axis=0), x_train.iloc[val_idx]
        y_train_fold, y_val_fold = pd.concat([y_train.iloc[train_idx], *additional_y_trains], axis=0), y_train.iloc[val_idx]

        model = CatBoostClassifier(
            iterations=1500,
            eval_metric='Accuracy',
            learning_rate=0.03,
            max_depth=6,
            verbose=50,
            l2_leaf_reg=2,
            early_stopping_rounds=200,
            random_state=42,
            cat_features=cat_cols,
            task_type='GPU'
        )
        model.fit(
            x_train_fold, y_train_fold,
            eval_set=(x_val_fold, y_val_fold)
        )
        oof_train_preds[val_idx, 0] = model.predict_proba(x_val_fold)[:, 1]
        test_preds[:, 0] += model.predict_proba(test_data)[:, 1] / N_SPLITS

        model = lgb.LGBMClassifier(
            n_estimators=1500,
            learning_rate=0.03,
            max_depth=5,
            lambda_l2=4.8,
            verbosity=-1,
            seed=42,
            device='gpu'
        )
        # NOTE(review): 'acc' does not look like a built-in LightGBM metric
        # name (the binary error metric is 'binary_error'); early stopping may
        # effectively run on the default log-loss — confirm.
        model.fit(
            x_train_fold, y_train_fold,
            eval_set=[(x_val_fold, y_val_fold)],
            eval_metric='acc',
            callbacks=[
                lgb.log_evaluation(200),
                lgb.early_stopping(200, verbose=True)
            ]
        )
        oof_train_preds[val_idx, 1] = model.predict_proba(x_val_fold)[:, 1]
        test_preds[:, 1] += model.predict_proba(test_data)[:, 1] / N_SPLITS

        model = XGBClassifier(
            enable_categorical=True,
            n_estimators=1500,
            learning_rate=0.03,
            max_depth=7,
            early_stopping_rounds=200,
            seed=42,
            device='gpu'
        )
        model.fit(
            x_train_fold, y_train_fold,
            eval_set=[(x_val_fold, y_val_fold)],
            verbose=0
        )
        oof_train_preds[val_idx, 2] = model.predict_proba(x_val_fold)[:, 1]
        test_preds[:, 2] += model.predict_proba(test_data)[:, 1] / N_SPLITS

    round_weights, score = optimize_weights_acc(oof_train_preds, y_train)
    print(round_weights, score)

    if score > best_score:
        best_score = score
        best_test_preds = test_preds
        weights = round_weights
    else:
        print('Метрика не улучшилась. Цикл прекращается.')
        break

    # Blend the three models and rank test rows by confidence (distance of the
    # blended probability from 0.5, most confident first).
    test_preds = np.dot(test_preds, weights)
    indexes = abs(test_preds - 0.5).argsort()[::-1]
    sorted_test_preds = test_preds[indexes]
    # Take up to N most confident rows from each predicted class, skipping
    # rows already used as pseudo-labels in earlier rounds.
    additional_indexes = np.concatenate([
        indexes[test_preds[indexes] < 0.5][:N],
        indexes[test_preds[indexes] >= 0.5][:N]
    ], axis=0)
    additional_indexes = list(set(additional_indexes) - used_indexes)
    used_indexes.update(additional_indexes)

    if len(additional_indexes) == 0:
        print('Нет новых уверенных сэмплов. Цикл прекращается.')
        break

    additional_x_trains.append(test_data.iloc[additional_indexes])
    additional_y_trains.append(pd.Series(test_preds[additional_indexes] >= 0.5).astype(int))

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

0:	learn: 0.7735116	test: 0.7711328	best: 0.7711328 (0)	total: 73.9ms	remaining: 1m 50s
50:	learn: 0.8005464	test: 0.7970098	best: 0.7987349 (47)	total: 2.4s	remaining: 1m 8s
100:	learn: 0.8070175	test: 0.8033353	best: 0.8039103 (97)	total: 4.63s	remaining: 1m 4s
150:	learn: 0.8153581	test: 0.8085106	best: 0.8108108 (140)	total: 6.86s	remaining: 1m 1s
200:	learn: 0.8188093	test: 0.8102358	best: 0.8108108 (140)	total: 9.03s	remaining: 58.4s
250:	learn: 0.8206787	test: 0.8131110	best: 0.8136860 (247)	total: 11.2s	remaining: 55.6s
300:	learn: 0.8249928	test: 0.8136860	best: 0.8148361 (280)	total: 13.3s	remaining: 53.1s
350:	learn: 0.8281565	test: 0.8148361	best: 0.8159862 (324)	total: 15.4s	remaining: 50.3s
400:	learn: 0.8301697	test: 0.8142611	best: 0.8165612 (365)	total: 17.5s	remaining: 47.8s
450:	learn: 0.8326143	test: 0.8171363	best: 0.8182864 (442)	total: 19.5s	remaining: 45.5s
500:	learn: 0.8341961	test: 0.8148361	best: 0.8182864 (442)	total: 21.6s	remaining: 43.2s
550:	learn: 0.83

  0%|          | 0/5 [00:00<?, ?it/s]

0:	learn: 0.7735202	test: 0.7682576	best: 0.7682576 (0)	total: 83.4ms	remaining: 2m 4s
50:	learn: 0.8052617	test: 0.7981599	best: 0.7981599 (41)	total: 2.28s	remaining: 1m 4s
100:	learn: 0.8084072	test: 0.8039103	best: 0.8044853 (61)	total: 4.44s	remaining: 1m 1s
150:	learn: 0.8152702	test: 0.8062105	best: 0.8085106 (146)	total: 6.56s	remaining: 58.6s
200:	learn: 0.8191307	test: 0.8062105	best: 0.8090857 (151)	total: 8.71s	remaining: 56.3s
250:	learn: 0.8235631	test: 0.8090857	best: 0.8102358 (232)	total: 10.9s	remaining: 54.3s
300:	learn: 0.8264226	test: 0.8079356	best: 0.8102358 (232)	total: 13.1s	remaining: 52.1s
350:	learn: 0.8292822	test: 0.8119609	best: 0.8119609 (345)	total: 15.2s	remaining: 49.6s
400:	learn: 0.8338576	test: 0.8119609	best: 0.8119609 (345)	total: 17.3s	remaining: 47.5s
450:	learn: 0.8365742	test: 0.8119609	best: 0.8131110 (422)	total: 19.4s	remaining: 45.1s
500:	learn: 0.8384329	test: 0.8108108	best: 0.8131110 (422)	total: 21.6s	remaining: 43s
550:	learn: 0.8402

In [72]:
# test_preds = np.dot(test_preds, weights)
# test_preds = test_preds[abs(test_preds - 0.5).argsort()[::-1]]

In [73]:
# N = 30
# used_indexes = set()

# test_preds = np.dot(test_preds, weights)
# indexes = abs(test_preds - 0.5).argsort()[::-1]
# sorted_test_preds = test_preds[indexes]
# additional_indexes = np.concatenate([
#     indexes[test_preds[indexes] < 0.5][:N],
#     indexes[test_preds[indexes] >= 0.5][:N]
# ], axis=0)
# additional_indexes = set(additional_indexes) - used_indexes
# if len(additional_indexes) == 0:
#     print('Нет новых уверенных сэмплов. Цикл прекращается.')
#     break

In [99]:
# preds = np.dot(test_preds, weights) >= 0.5
# preds = preds.astype(bool)
# Blend the best round's per-model test probabilities and apply the same
# ">= 0.5" decision rule that the blend weights were tuned with (the previous
# strict ">" disagreed on probabilities of exactly 0.5). The comparison
# already yields booleans, which is the format the submission expects.
preds = np.dot(best_test_preds, weights) >= 0.5
submission = pd.DataFrame({'PassengerId': test_ids, 'Transported': preds})
submission.to_csv('submission.csv', index=False)
submission.head(10)

Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,True
5,0027_01,True
6,0029_01,True
7,0032_01,True
8,0032_02,True
9,0033_01,True
