In [1]:
import pandas as pd
import numpy as np
import pathlib
from sklearn import model_selection
import xgboost as xgb

project_path = pathlib.Path('/home/mpuscian/Desktop/repozytoria/MINI_projects/AML_Project_2')
data_path = project_path.joinpath('data')

X_train_path = data_path.joinpath("X_train.parquet")
y_train_path = data_path.joinpath("y_train.parquet")

X_test_path = data_path.joinpath("X_test.parquet")
y_test_path = data_path.joinpath("y_test.parquet")


In [2]:
X_train = pd.read_parquet(X_train_path)
y_train = pd.read_parquet(y_train_path)

X_test = pd.read_parquet(X_test_path)
y_test = pd.read_parquet(y_test_path)

In [24]:
# X_train.to_parquet(project_path.joinpath('data/X_train.parquet'))
# y_train.to_parquet(project_path.joinpath('data/y_train.parquet'))
# X_test.to_parquet(project_path.joinpath('data/X_test.parquet'))
# y_test.to_parquet(project_path.joinpath('data/y_test.parquet'))

In [None]:
def euros_gained(y_true, y_pred_proba, num_of_features : int):
    """
    Metric used in this project specific that it can be used in xgboost.cv

    Args
    ----
    y_pred
        predictions of the model
    data
        X data of type xgb.DMatrix

    Returns
    -------
    list of length two: [metric : str, value : numeric scalar] 
    """
    n_selected = int(1/5 * len(y_true))
    positive_class_probas = y_pred_proba[:, 1]

    # Choosing 20% of households
    top_k_indices = np.argsort(positive_class_probas)[-n_selected:]
    true_positives = y_true[top_k_indices].sum()
    
    max_reward = n_selected * 10
    reward = true_positives * 10 * 10000/max_reward
    cost = num_of_features * 200 # 200 euros for each feature
    return 'Euros_gained',- (reward - cost)

def xgb_euros_gained(y_pred : np.ndarray, data : xgb.DMatrix):
    """
    Metric used in this project specific that it can be used in xgboost.cv

    Args
    ----
    y_pred
        predictions of the model
    data
        X data of type xgb.DMatrix

    Returns
    -------
    list of length two: [metric : str, value : numeric scalar] 
    """
    y_true = data.get_label()
    
    n_selected = int(1/5 * len(y_true))
    top_k_probas = np.argsort(y_pred)[-n_selected:]
    true_positives = y_true[top_k_probas].sum()
    
    max_reward = n_selected * 10
    reward = true_positives * 10 * 10000/max_reward
    cost = data.num_col() * 200
    # print(f"reward: {reward}, cost: {cost}, max_reward: {max_reward}")
    return 'Euros_gained',- (reward - cost)

class SaveBestModel(xgb.callback.TrainingCallback):
    def __init__(self, cvboosters):
        self._cvboosters = cvboosters
    
    def after_training(self, model):
        self._cvboosters[:] = [cvpack.bst for cvpack in model.cvfolds]
        return model

In [81]:
data_dmatrix = xgb.DMatrix(data=X_train,label=y_train)
params = {'objective':'binary:logistic'
          ,'eval_metric':'logloss',
          'eta':0.01,
          'max_depth' : 8,
          'device' : 'cuda',
          'colsample_bytree':0.85,
          'verbosity' : 1,
          'alpha' : 7,}


cvboosters = []
xgb_cv = xgb.cv(
    dtrain=data_dmatrix,
    params=params,
    nfold=5,
    metrics = 'logloss',
    seed=42,
    num_boost_round = 200,
    custom_metric = xgb_euros_gained,
    #early_stopping_rounds=20,
    callbacks=[SaveBestModel(cvboosters), ],
    ) 

In [95]:
X_train_fs, X_val_fs, y_train_fs, y_val_fs = model_selection.train_test_split(X_train, y_train, stratify=y_train, test_size=0.2)

In [119]:
from sklearn.inspection import permutation_importance

X_train_fs_iter = X_train_fs.copy(deep=True)
X_val_fs_iter =X_val_fs.copy(deep=True)
feature_args = np.array([i for i in range(500)])
deleted_features_arg = []
for i in range(500):
    clf = xgb.XGBClassifier(eta=0.01, device = 'cuda', n_estimators=150, max_depth=10)
    clf.fit(X_train_fs_iter, y_train_fs)
    r = permutation_importance(clf, X_val_fs_iter, y_val_fs,n_repeats=5,random_state=0)
    min_importance_arg = r.importances_mean.argsort()[0]
    min_importance = r.importances_mean[min_importance_arg]
    if len(feature_args) > 10:
        print(f"Deleting feature no. {min_importance_arg} of importance score {min_importance}")
        feature_args = np.delete(feature_args, min_importance_arg)
        X_train_fs_iter = X_train_fs.copy(deep=True).iloc[:, feature_args]
        X_val_fs_iter = X_val_fs.copy(deep=True).iloc[:, feature_args]
    else:
        break
        


Deleting feature no. 264 of importance score -0.0011764705882352678
Deleting feature no. 265 of importance score -0.0028235294117647134
Deleting feature no. 296 of importance score -0.004470588235294138
Deleting feature no. 75 of importance score -0.00352941176470587
Deleting feature no. 1 of importance score -0.005647058823529361
Deleting feature no. 79 of importance score -0.0021176470588234906
Deleting feature no. 2 of importance score -0.003529411764705914
Deleting feature no. 211 of importance score -0.008235294117647073
Deleting feature no. 315 of importance score -0.004235294117646981
Deleting feature no. 54 of importance score -0.003294117647058803
Deleting feature no. 219 of importance score -0.0023529411764705577
Deleting feature no. 255 of importance score -0.002823529411764669
Deleting feature no. 68 of importance score -0.0035294117647059363
Deleting feature no. 441 of importance score -0.003999999999999937
Deleting feature no. 414 of importance score -0.002588235294117669

In [285]:
import re

def get_feature_args_from_str(log_text : str, drop_limit = 498):
    f_args = list(range(500))
    deleted_features = list(map(int, re.findall(r"Deleting feature no\. (\d+)", log_text)))
    for i, d in enumerate(deleted_features):
        f_args = np.delete(f_args, d)

        if i >= drop_limit:
            break

    return f_args

In [None]:
from sklearn.inspection import permutation_importance

feature_args = np.array(selected_features)
X_train_fs_iter = X_train_fs.copy(deep=True).iloc[:, feature_args]
X_val_fs_iter =X_val_fs.copy(deep=True).iloc[:, feature_args]
deleted_features_arg = []
for i in range(500):
    clf = xgb.XGBClassifier(eta=0.01, device = 'cuda', n_estimators=150, max_depth=10)
    clf.fit(X_train_fs_iter, y_train_fs)
    r = permutation_importance(clf, X_val_fs_iter, y_val_fs,n_repeats=5,random_state=0)
    min_importance_arg = r.importances_mean.argsort()[0]
    min_importance = r.importances_mean[min_importance_arg]
    if len(feature_args) > 2:
        print(f"Deleting feature no. {min_importance_arg} of importance score {min_importance}")
        feature_args = np.delete(feature_args, min_importance_arg)
        X_train_fs_iter = X_train_fs.copy(deep=True).iloc[:, feature_args]
        X_val_fs_iter = X_val_fs.copy(deep=True).iloc[:, feature_args]
    else:
        break
        


Deleting feature no. 1 of importance score 0.08211764705882355
Deleting feature no. 0 of importance score 0.08094117647058828


IndexError: list index out of range

# Model on selected features

In [None]:
params = {'objective':'binary:logistic',
          'eval_metric':'logloss',
          'eta':0.1,
          'n_estimators' : 400,
          'max_depth' : 8,
          'device' : 'cuda',
          'colsample_bytree':0.8,
          'verbosity' : 1,
          'alpha' : 7,}

In [825]:
log_text = """
Deleting feature no. 264 of importance score -0.0011764705882352678
Deleting feature no. 265 of importance score -0.0028235294117647134
Deleting feature no. 296 of importance score -0.004470588235294138
Deleting feature no. 75 of importance score -0.00352941176470587
Deleting feature no. 1 of importance score -0.005647058823529361
Deleting feature no. 79 of importance score -0.0021176470588234906
Deleting feature no. 2 of importance score -0.003529411764705914
Deleting feature no. 211 of importance score -0.008235294117647073
Deleting feature no. 315 of importance score -0.004235294117646981
Deleting feature no. 54 of importance score -0.003294117647058803
Deleting feature no. 219 of importance score -0.0023529411764705577
Deleting feature no. 255 of importance score -0.002823529411764669
Deleting feature no. 68 of importance score -0.0035294117647059363
Deleting feature no. 441 of importance score -0.003999999999999937
Deleting feature no. 414 of importance score -0.002588235294117669
Deleting feature no. 0 of importance score -0.002823529411764669
Deleting feature no. 483 of importance score -0.003529411764705892
Deleting feature no. 305 of importance score -0.004705882352941182
Deleting feature no. 16 of importance score -0.0023529411764705356
Deleting feature no. 344 of importance score -0.002823529411764758
Deleting feature no. 212 of importance score -0.001882352941176424
Deleting feature no. 208 of importance score -0.0028235294117646913
Deleting feature no. 350 of importance score -0.0025882352941176247
Deleting feature no. 461 of importance score -0.002117647058823513
Deleting feature no. 55 of importance score -0.0032941176470588696
Deleting feature no. 197 of importance score -0.0021176470588234906
Deleting feature no. 222 of importance score -0.002588235294117647
Deleting feature no. 287 of importance score -0.003294117647058825
Deleting feature no. 160 of importance score -0.0023529411764705577
Deleting feature no. 444 of importance score -0.001882352941176424
Deleting feature no. 350 of importance score -0.003294117647058803
Deleting feature no. 58 of importance score -0.0037647058823529144
Deleting feature no. 364 of importance score -0.0023529411764706466
Deleting feature no. 451 of importance score -0.0023529411764705577
Deleting feature no. 35 of importance score -0.0023529411764706466
Deleting feature no. 196 of importance score -0.0021176470588235353
Deleting feature no. 1 of importance score -0.0037647058823529144
Deleting feature no. 419 of importance score -0.0021176470588235353
Deleting feature no. 398 of importance score -0.0025882352941176247
Deleting feature no. 448 of importance score -0.001882352941176535
Deleting feature no. 220 of importance score -0.0021176470588235353
Deleting feature no. 293 of importance score -0.0018823529411764683
Deleting feature no. 331 of importance score -0.0021176470588236017
Deleting feature no. 420 of importance score -0.001882352941176535
Deleting feature no. 33 of importance score -0.0030588235294118247
Deleting feature no. 340 of importance score -0.0021176470588235353
Deleting feature no. 318 of importance score -0.003529411764705914
Deleting feature no. 358 of importance score -0.00235294117647058
Deleting feature no. 344 of importance score -0.0016470588235294238
Deleting feature no. 200 of importance score -0.0023529411764705577
Deleting feature no. 74 of importance score -0.0023529411764705577
Deleting feature no. 284 of importance score -0.0016470588235293793
Deleting feature no. 441 of importance score -0.001882352941176424
Deleting feature no. 260 of importance score -0.0018823529411764683
Deleting feature no. 186 of importance score -0.00235294117647058
Deleting feature no. 81 of importance score -0.004235294117647115
Deleting feature no. 6 of importance score -0.002588235294117669
Deleting feature no. 387 of importance score -0.0030588235294117805
Deleting feature no. 287 of importance score -0.001411764705882379
Deleting feature no. 101 of importance score -0.0021176470588236017
Deleting feature no. 48 of importance score -0.0030588235294118026
Deleting feature no. 313 of importance score -0.002117647058823513
Deleting feature no. 323 of importance score -0.00235294117647058
Deleting feature no. 168 of importance score -0.002352941176470602
Deleting feature no. 434 of importance score -0.0030588235294118026
Deleting feature no. 427 of importance score -0.0037647058823529144
Deleting feature no. 314 of importance score -0.0021176470588236017
Deleting feature no. 93 of importance score -0.003294117647058803
Deleting feature no. 171 of importance score -0.0016470588235294238
Deleting feature no. 241 of importance score -0.0016470588235293793
Deleting feature no. 166 of importance score -0.006352941176470584
Deleting feature no. 33 of importance score -0.002588235294117691
Deleting feature no. 72 of importance score -0.003294117647058803
Deleting feature no. 202 of importance score -0.0014117647058824012
Deleting feature no. 218 of importance score -0.001882352941176535
Deleting feature no. 237 of importance score -0.0025882352941176247
Deleting feature no. 106 of importance score -0.002823529411764669
Deleting feature no. 160 of importance score -0.0021176470588234906
Deleting feature no. 84 of importance score -0.002588235294117713
Deleting feature no. 122 of importance score -0.0023529411764706466
Deleting feature no. 29 of importance score -0.001647058823529468
Deleting feature no. 277 of importance score -0.0021176470588235795
Deleting feature no. 32 of importance score -0.001882352941176535
Deleting feature no. 339 of importance score -0.0035294117647058478
Deleting feature no. 325 of importance score -0.0023529411764705577
Deleting feature no. 34 of importance score -0.002588235294117669
Deleting feature no. 318 of importance score -0.0021176470588236017
Deleting feature no. 98 of importance score -0.0023529411764706245
Deleting feature no. 125 of importance score -0.001882352941176535
Deleting feature no. 336 of importance score -0.001647058823529468
Deleting feature no. 330 of importance score -0.002117647058823513
Deleting feature no. 274 of importance score -0.0018823529411764683
Deleting feature no. 301 of importance score -0.0030588235294117584
Deleting feature no. 343 of importance score -0.003764705882352981
Deleting feature no. 378 of importance score -0.002823529411764647
Deleting feature no. 203 of importance score -0.0021176470588234906
Deleting feature no. 112 of importance score -0.002588235294117691
Deleting feature no. 34 of importance score -0.003529411764705781
Deleting feature no. 329 of importance score -0.004941176470588204
Deleting feature no. 190 of importance score -0.0030588235294117584
Deleting feature no. 137 of importance score -0.002352941176470602
Deleting feature no. 10 of importance score -0.002352941176470602
Deleting feature no. 371 of importance score -0.0021176470588236017
Deleting feature no. 25 of importance score -0.0021176470588235353
Deleting feature no. 187 of importance score -0.0023529411764705577
Deleting feature no. 58 of importance score -0.0023529411764705577
Deleting feature no. 141 of importance score -0.0018823529411764461
Deleting feature no. 254 of importance score -0.0023529411764706466
Deleting feature no. 67 of importance score -0.0018823529411764461
Deleting feature no. 196 of importance score -0.0025882352941176247
Deleting feature no. 293 of importance score -0.001647058823529468
Deleting feature no. 366 of importance score -0.0014117647058823125
Deleting feature no. 261 of importance score -0.002823529411764647
Deleting feature no. 182 of importance score -0.0028235294117647134
Deleting feature no. 361 of importance score -0.0025882352941176247
Deleting feature no. 83 of importance score -0.0016470588235294238
Deleting feature no. 291 of importance score -0.0018823529411765127
Deleting feature no. 1 of importance score -0.0016470588235294238
Deleting feature no. 219 of importance score -0.0030588235294117137
Deleting feature no. 259 of importance score -0.0023529411764706466
Deleting feature no. 368 of importance score -0.002823529411764736
Deleting feature no. 116 of importance score -0.001882352941176424
Deleting feature no. 191 of importance score -0.002588235294117713
Deleting feature no. 275 of importance score -0.002588235294117669
Deleting feature no. 220 of importance score -0.0023529411764706245
Deleting feature no. 36 of importance score -0.0023529411764705134
Deleting feature no. 176 of importance score -0.0028235294117646913
Deleting feature no. 171 of importance score -0.001882352941176535
Deleting feature no. 92 of importance score -0.0023529411764705577
Deleting feature no. 278 of importance score -0.0023529411764705577
Deleting feature no. 341 of importance score -0.0018823529411764683
Deleting feature no. 91 of importance score -0.00235294117647058
Deleting feature no. 355 of importance score -0.0028235294117647134
Deleting feature no. 361 of importance score -0.002588235294117713
Deleting feature no. 302 of importance score -0.001647058823529468
Deleting feature no. 282 of importance score -0.0021176470588234906
Deleting feature no. 38 of importance score -0.0021176470588234685
Deleting feature no. 202 of importance score -0.0023529411764706466
Deleting feature no. 316 of importance score -0.001647058823529468
Deleting feature no. 261 of importance score -0.001647058823529468
Deleting feature no. 84 of importance score -0.0021176470588235353
Deleting feature no. 70 of importance score -0.0016470588235294014
Deleting feature no. 27 of importance score -0.0025882352941176247
Deleting feature no. 291 of importance score -0.0021176470588235353
Deleting feature no. 317 of importance score -0.0016470588235294238
Deleting feature no. 85 of importance score -0.001882352941176535
Deleting feature no. 328 of importance score -0.002823529411764736
Deleting feature no. 62 of importance score -0.001882352941176535
Deleting feature no. 274 of importance score -0.0023529411764706466
Deleting feature no. 167 of importance score -0.0021176470588234906
Deleting feature no. 103 of importance score -0.0025882352941176247
Deleting feature no. 233 of importance score -0.001882352941176535
Deleting feature no. 146 of importance score -0.002352941176470602
Deleting feature no. 148 of importance score -0.0021176470588234906
Deleting feature no. 303 of importance score -0.001882352941176424
Deleting feature no. 276 of importance score -0.003764705882352981
Deleting feature no. 209 of importance score -0.0021176470588236017
Deleting feature no. 318 of importance score -0.002588235294117647
Deleting feature no. 259 of importance score -0.0011764705882352233
Deleting feature no. 232 of importance score -0.0030588235294117137
Deleting feature no. 213 of importance score -0.0011764705882353343
Deleting feature no. 182 of importance score -0.001882352941176535
Deleting feature no. 304 of importance score -0.0016470588235294238
Deleting feature no. 194 of importance score -0.001411764705882379
Deleting feature no. 44 of importance score -0.002588235294117602
Deleting feature no. 264 of importance score -0.0016470588235294238
Deleting feature no. 268 of importance score -0.003294117647058825
Deleting feature no. 8 of importance score -0.0023529411764705577
Deleting feature no. 26 of importance score -0.0021176470588235574
Deleting feature no. 123 of importance score -0.0021176470588235353
Deleting feature no. 14 of importance score -0.0016470588235294238
Deleting feature no. 5 of importance score -0.00258823529411758
Deleting feature no. 189 of importance score -0.0030588235294117137
Deleting feature no. 132 of importance score -0.0023529411764705577
Deleting feature no. 107 of importance score -0.002823529411764758
Deleting feature no. 233 of importance score -0.002117647058823513
Deleting feature no. 233 of importance score -0.002823529411764736
Deleting feature no. 23 of importance score -0.0023529411764706687
Deleting feature no. 241 of importance score -0.00258823529411758
Deleting feature no. 132 of importance score -0.00235294117647058
Deleting feature no. 80 of importance score -0.0021176470588235574
Deleting feature no. 275 of importance score -0.0023529411764705577
Deleting feature no. 160 of importance score -0.004235294117647048
Deleting feature no. 133 of importance score -0.002823529411764758
Deleting feature no. 312 of importance score -0.0030588235294117584
Deleting feature no. 299 of importance score -0.003294117647058825
Deleting feature no. 185 of importance score -0.004235294117647026
Deleting feature no. 125 of importance score -0.0023529411764706466
Deleting feature no. 134 of importance score -0.0028235294117646913
Deleting feature no. 231 of importance score -0.003294117647058803
Deleting feature no. 221 of importance score -0.0021176470588235353
Deleting feature no. 166 of importance score -0.002823529411764736
Deleting feature no. 257 of importance score -0.0021176470588234906
Deleting feature no. 143 of importance score -0.0032941176470588696
Deleting feature no. 184 of importance score -0.0018823529411764906
Deleting feature no. 172 of importance score -0.002823529411764758
Deleting feature no. 97 of importance score -0.0028235294117646913
Deleting feature no. 18 of importance score -0.0028235294117647134
Deleting feature no. 16 of importance score -0.0030588235294117584
Deleting feature no. 228 of importance score -0.0028235294117647134
Deleting feature no. 268 of importance score -0.002352941176470602
Deleting feature no. 118 of importance score -0.0023529411764706687
Deleting feature no. 33 of importance score -0.002823529411764669
Deleting feature no. 171 of importance score -0.002588235294117647
Deleting feature no. 24 of importance score -0.001411764705882379
Deleting feature no. 133 of importance score -0.002823529411764758
Deleting feature no. 206 of importance score -0.002117647058823513
Deleting feature no. 84 of importance score -0.002588235294117691
Deleting feature no. 127 of importance score -0.0028235294117647134
Deleting feature no. 181 of importance score -0.0018823529411764906
Deleting feature no. 198 of importance score -0.0023529411764706466
Deleting feature no. 256 of importance score -0.002588235294117647
Deleting feature no. 214 of importance score -0.0023529411764705577
Deleting feature no. 260 of importance score -0.0028235294117647134
Deleting feature no. 114 of importance score -0.002352941176470602
Deleting feature no. 149 of importance score -0.003294117647058825
Deleting feature no. 17 of importance score -0.0011764705882353343
Deleting feature no. 190 of importance score -0.002588235294117713
Deleting feature no. 16 of importance score -0.001882352941176424
Deleting feature no. 102 of importance score -0.0018823529411764906
Deleting feature no. 98 of importance score -0.0030588235294118473
Deleting feature no. 227 of importance score -0.0021176470588234906
Deleting feature no. 163 of importance score -0.0021176470588235795
Deleting feature no. 229 of importance score -0.002588235294117669
Deleting feature no. 194 of importance score -0.0018823529411764906
Deleting feature no. 263 of importance score -0.0018823529411764017
Deleting feature no. 266 of importance score -0.0014117647058824012
Deleting feature no. 103 of importance score -0.0021176470588235353
Deleting feature no. 162 of importance score -0.003529411764705914
Deleting feature no. 103 of importance score -0.0021176470588235574
Deleting feature no. 73 of importance score -0.0016470588235294238
Deleting feature no. 23 of importance score -0.0016470588235293793
Deleting feature no. 162 of importance score -0.0021176470588234685
Deleting feature no. 31 of importance score -0.001882352941176424
Deleting feature no. 218 of importance score -0.0028235294117646913
Deleting feature no. 224 of importance score -0.0030588235294118247
Deleting feature no. 215 of importance score -0.004235294117646981
Deleting feature no. 137 of importance score -0.0018823529411764683
Deleting feature no. 199 of importance score -0.002823529411764758
Deleting feature no. 78 of importance score -0.0028235294117646913
Deleting feature no. 3 of importance score -0.0016470588235294238
Deleting feature no. 14 of importance score -0.0030588235294116916
Deleting feature no. 34 of importance score -0.0018823529411764683
Deleting feature no. 86 of importance score -0.002823529411764647
Deleting feature no. 208 of importance score -0.0011764705882352678
Deleting feature no. 235 of importance score -0.0014117647058824012
Deleting feature no. 143 of importance score -0.0014117647058823125
Deleting feature no. 200 of importance score -0.0018823529411764683
Deleting feature no. 191 of importance score -0.0030588235294117137
Deleting feature no. 18 of importance score -0.001411764705882379
Deleting feature no. 11 of importance score -0.001882352941176424
Deleting feature no. 150 of importance score -0.0030588235294117805
Deleting feature no. 171 of importance score -0.0021176470588235353
Deleting feature no. 83 of importance score -0.002823529411764647
Deleting feature no. 74 of importance score -0.003294117647058803
Deleting feature no. 53 of importance score -0.0030588235294117805
Deleting feature no. 34 of importance score -0.00235294117647058
Deleting feature no. 135 of importance score -0.0021176470588235353
Deleting feature no. 171 of importance score -0.0030588235294118026
Deleting feature no. 127 of importance score -0.0030588235294116916
Deleting feature no. 18 of importance score -0.002823529411764736
Deleting feature no. 204 of importance score -0.00258823529411758
Deleting feature no. 195 of importance score -0.0018823529411764461
Deleting feature no. 161 of importance score -0.002588235294117647
Deleting feature no. 177 of importance score -0.001647058823529468
Deleting feature no. 229 of importance score -0.002588235294117691
Deleting feature no. 106 of importance score -0.002117647058823513
Deleting feature no. 228 of importance score -0.0028235294117646913
Deleting feature no. 153 of importance score -0.002823529411764669
Deleting feature no. 32 of importance score -0.001882352941176424
Deleting feature no. 220 of importance score -0.002117647058823513
Deleting feature no. 121 of importance score -0.0016470588235293793
Deleting feature no. 61 of importance score -0.0018823529411764906
Deleting feature no. 63 of importance score -0.002588235294117647
Deleting feature no. 134 of importance score -0.001411764705882379
Deleting feature no. 104 of importance score -0.001882352941176424
Deleting feature no. 5 of importance score -0.002117647058823513
Deleting feature no. 146 of importance score -0.001647058823529357
Deleting feature no. 146 of importance score -0.001882352941176424
Deleting feature no. 117 of importance score -0.001647058823529468
Deleting feature no. 163 of importance score -0.001411764705882379
Deleting feature no. 24 of importance score -0.0018823529411764906
Deleting feature no. 154 of importance score -0.0021176470588235574
Deleting feature no. 37 of importance score -0.002823529411764758
Deleting feature no. 72 of importance score -0.002823529411764758
Deleting feature no. 49 of importance score -0.0023529411764706245
Deleting feature no. 127 of importance score -0.0016470588235294459
Deleting feature no. 40 of importance score -0.0011764705882353343
Deleting feature no. 133 of importance score -0.0021176470588235795
Deleting feature no. 66 of importance score -0.002823529411764647
Deleting feature no. 50 of importance score -0.002588235294117602
Deleting feature no. 205 of importance score -0.0011764705882352454
Deleting feature no. 13 of importance score -0.002823529411764758
Deleting feature no. 126 of importance score -0.004000000000000026
Deleting feature no. 202 of importance score -0.00235294117647058
Deleting feature no. 9 of importance score -0.0021176470588236017
Deleting feature no. 4 of importance score -0.0023529411764706245
Deleting feature no. 86 of importance score -0.002117647058823513
Deleting feature no. 55 of importance score -0.0018823529411764461
Deleting feature no. 3 of importance score -0.001882352941176424
Deleting feature no. 103 of importance score -0.003294117647058825
Deleting feature no. 158 of importance score -0.0021176470588234906
Deleting feature no. 187 of importance score -0.002117647058823513
Deleting feature no. 153 of importance score -0.0018823529411764461
Deleting feature no. 136 of importance score -0.0023529411764705577
Deleting feature no. 104 of importance score -0.004235294117647026
Deleting feature no. 151 of importance score -0.0018823529411764906
Deleting feature no. 112 of importance score -0.002823529411764625
Deleting feature no. 149 of importance score -0.002117647058823513
Deleting feature no. 134 of importance score -0.002588235294117713
Deleting feature no. 143 of importance score -0.0035294117647058478
Deleting feature no. 124 of importance score -0.0032941176470588475
Deleting feature no. 106 of importance score -0.0016470588235294014
Deleting feature no. 74 of importance score -0.004470588235294138
Deleting feature no. 14 of importance score -0.0011764705882353343
Deleting feature no. 8 of importance score -0.0023529411764706245
Deleting feature no. 182 of importance score -0.0023529411764706687
Deleting feature no. 172 of importance score -0.001882352941176424
Deleting feature no. 58 of importance score -0.0023529411764706687
Deleting feature no. 155 of importance score -0.0014117647058823125
Deleting feature no. 106 of importance score -0.002823529411764669
Deleting feature no. 18 of importance score -0.0023529411764705577
Deleting feature no. 72 of importance score -0.0018823529411764683
Deleting feature no. 120 of importance score -0.002588235294117647
Deleting feature no. 168 of importance score -0.0028235294117646913
Deleting feature no. 117 of importance score -0.0023529411764705577
Deleting feature no. 143 of importance score -0.00235294117647058
Deleting feature no. 171 of importance score -0.0014117647058822902
Deleting feature no. 60 of importance score -0.0014117647058823125
Deleting feature no. 124 of importance score -0.002588235294117647
Deleting feature no. 46 of importance score -0.0016470588235293793
Deleting feature no. 90 of importance score -0.0028235294117646913
Deleting feature no. 52 of importance score -0.0014117647058823125
Deleting feature no. 13 of importance score -0.0018823529411764906
Deleting feature no. 142 of importance score -0.0021176470588235353
Deleting feature no. 34 of importance score -0.0030588235294116916
Deleting feature no. 12 of importance score -0.0021176470588236017
Deleting feature no. 101 of importance score -0.001882352941176424
Deleting feature no. 44 of importance score -0.003294117647058825
Deleting feature no. 86 of importance score -0.0028235294117646913
Deleting feature no. 75 of importance score -0.002352941176470602
Deleting feature no. 113 of importance score -0.0023529411764706466
Deleting feature no. 11 of importance score -0.0021176470588235574
Deleting feature no. 35 of importance score -0.0016470588235294238
Deleting feature no. 128 of importance score -0.00235294117647058
Deleting feature no. 13 of importance score -0.001882352941176535
Deleting feature no. 141 of importance score -0.0014117647058822902
Deleting feature no. 60 of importance score -0.0014117647058823125
Deleting feature no. 3 of importance score -0.0009411764705882675
Deleting feature no. 13 of importance score -0.0011764705882353343
Deleting feature no. 52 of importance score -0.0032941176470587586
Deleting feature no. 66 of importance score -0.002823529411764669
Deleting feature no. 18 of importance score -0.002117647058823513
Deleting feature no. 101 of importance score -0.0030588235294118026
Deleting feature no. 127 of importance score -0.002588235294117647
Deleting feature no. 47 of importance score -0.0018823529411765127
Deleting feature no. 42 of importance score -0.001647058823529468
Deleting feature no. 115 of importance score -0.00352941176470587
Deleting feature no. 104 of importance score -0.0018823529411764461
Deleting feature no. 119 of importance score -0.002823529411764625
Deleting feature no. 128 of importance score -0.002823529411764736
Deleting feature no. 65 of importance score -0.002588235294117647
Deleting feature no. 51 of importance score -0.0014117647058823567
Deleting feature no. 122 of importance score -0.004705882352941138
Deleting feature no. 2 of importance score -0.0028235294117646913
Deleting feature no. 31 of importance score -0.0037647058823529144
Deleting feature no. 87 of importance score -0.0009411764705882008
Deleting feature no. 45 of importance score -0.005647058823529472
Deleting feature no. 89 of importance score -0.0009411764705882675
Deleting feature no. 63 of importance score -0.003529411764705803
Deleting feature no. 31 of importance score -0.002588235294117647
Deleting feature no. 16 of importance score -0.002588235294117669
Deleting feature no. 21 of importance score -0.0016470588235293793
Deleting feature no. 121 of importance score -0.0016470588235294459
Deleting feature no. 15 of importance score -0.0018823529411764017
Deleting feature no. 12 of importance score -0.00235294117647058
Deleting feature no. 23 of importance score -0.0014117647058823125
Deleting feature no. 79 of importance score -0.0030588235294117805
Deleting feature no. 121 of importance score -0.0014117647058823125
Deleting feature no. 109 of importance score -0.001882352941176424
Deleting feature no. 112 of importance score -0.001882352941176424
Deleting feature no. 113 of importance score -0.0014117647058823567
Deleting feature no. 14 of importance score -0.0014117647058823125
Deleting feature no. 62 of importance score -0.0021176470588235795
Deleting feature no. 69 of importance score -0.0030588235294117584
Deleting feature no. 31 of importance score -0.001882352941176424
Deleting feature no. 38 of importance score -0.002588235294117647
Deleting feature no. 105 of importance score -0.0014117647058823567
Deleting feature no. 42 of importance score -0.0025882352941176247
Deleting feature no. 37 of importance score -0.0018823529411764461
Deleting feature no. 82 of importance score -0.0014117647058823346
Deleting feature no. 5 of importance score -0.002588235294117669
Deleting feature no. 61 of importance score -0.002588235294117669
Deleting feature no. 12 of importance score -0.00023529411764704465
Deleting feature no. 49 of importance score -0.0014117647058823567
Deleting feature no. 25 of importance score -0.0016470588235294238
Deleting feature no. 61 of importance score -0.0014117647058823125
Deleting feature no. 98 of importance score -0.0016470588235294238
Deleting feature no. 33 of importance score -0.00047058823529415593
Deleting feature no. 76 of importance score -0.003529411764705914
Deleting feature no. 37 of importance score -0.0016470588235294014
Deleting feature no. 70 of importance score -0.0030588235294117584
Deleting feature no. 38 of importance score -0.0037647058823530033
Deleting feature no. 87 of importance score -0.001882352941176424
Deleting feature no. 82 of importance score -0.002823529411764758
Deleting feature no. 20 of importance score -0.003294117647058825
Deleting feature no. 46 of importance score -0.004705882352941226
Deleting feature no. 25 of importance score -0.0051764705882353605
Deleting feature no. 2 of importance score -0.0030588235294118026
Deleting feature no. 38 of importance score -0.004470588235294138
Deleting feature no. 58 of importance score -0.004000000000000026
Deleting feature no. 14 of importance score -0.001882352941176424
Deleting feature no. 87 of importance score -0.0014117647058823125
Deleting feature no. 85 of importance score -0.003294117647058825
Deleting feature no. 43 of importance score -0.00235294117647058
Deleting feature no. 58 of importance score -0.0023529411764706466
Deleting feature no. 63 of importance score -0.0014117647058823125
Deleting feature no. 8 of importance score -0.0030588235294117362
Deleting feature no. 39 of importance score -0.002588235294117647
Deleting feature no. 11 of importance score -0.001647058823529468
Deleting feature no. 45 of importance score -0.0018823529411764461
Deleting feature no. 12 of importance score -0.003529411764705914
Deleting feature no. 7 of importance score -0.0018823529411764461
Deleting feature no. 8 of importance score -0.0021176470588235795
Deleting feature no. 29 of importance score -0.002823529411764758
Deleting feature no. 55 of importance score -0.0018823529411764461
Deleting feature no. 30 of importance score -0.0004705882352940893
Deleting feature no. 9 of importance score -0.003294117647058825
Deleting feature no. 66 of importance score -0.0030588235294117805
Deleting feature no. 50 of importance score -0.0016470588235294014
Deleting feature no. 55 of importance score -0.00235294117647058
Deleting feature no. 44 of importance score -0.002588235294117691
Deleting feature no. 26 of importance score -0.001882352941176535
Deleting feature no. 17 of importance score -0.0016470588235294459
Deleting feature no. 16 of importance score -0.0009411764705882231
Deleting feature no. 47 of importance score -0.002588235294117647
Deleting feature no. 25 of importance score -0.0032941176470588475
Deleting feature no. 12 of importance score -0.003529411764705892
Deleting feature no. 11 of importance score -0.0018823529411764461
Deleting feature no. 6 of importance score -0.004470588235294115
Deleting feature no. 33 of importance score -0.002352941176470602
Deleting feature no. 34 of importance score -0.0035294117647059363
Deleting feature no. 56 of importance score -0.0025882352941176247
Deleting feature no. 25 of importance score -0.0028235294117647134
Deleting feature no. 53 of importance score -0.0021176470588234906
Deleting feature no. 30 of importance score -0.0016470588235294459
Deleting feature no. 13 of importance score -0.0011764705882352678
Deleting feature no. 7 of importance score -0.0014117647058823125
Deleting feature no. 11 of importance score -0.0021176470588236017
Deleting feature no. 18 of importance score -0.0011764705882352454
Deleting feature no. 2 of importance score -0.002823529411764736
Deleting feature no. 18 of importance score -0.001647058823529357
Deleting feature no. 12 of importance score -0.0009411764705882453
Deleting feature no. 8 of importance score -0.0028235294117647802
Deleting feature no. 29 of importance score -0.002117647058823513
Deleting feature no. 19 of importance score -0.0014117647058824012
Deleting feature no. 43 of importance score -0.0021176470588235353
Deleting feature no. 19 of importance score -0.0014117647058824012
Deleting feature no. 24 of importance score -0.0021176470588235795
Deleting feature no. 9 of importance score -0.0011764705882353122
Deleting feature no. 23 of importance score -0.0032941176470588918
Deleting feature no. 18 of importance score -0.0056470588235294494
Deleting feature no. 30 of importance score -0.0007058823529411118
Deleting feature no. 14 of importance score -0.002823529411764669
Deleting feature no. 25 of importance score -0.0032941176470588475
Deleting feature no. 12 of importance score -0.00023529411764702246
Deleting feature no. 11 of importance score -0.0028235294117647134
Deleting feature no. 22 of importance score -0.0009411764705882453
Deleting feature no. 28 of importance score -0.0030588235294117137
Deleting feature no. 10 of importance score -0.002117647058823513
Deleting feature no. 7 of importance score -0.0021176470588235574
Deleting feature no. 27 of importance score -0.004705882352941204
Deleting feature no. 20 of importance score -0.0016470588235294459
Deleting feature no. 24 of importance score 0.00047058823529406714
Deleting feature no. 22 of importance score -0.00023529411764706688
Deleting feature no. 8 of importance score -0.0007058823529411562
Deleting feature no. 6 of importance score -0.0009411764705882453
Deleting feature no. 8 of importance score 0.0014117647058823567
Deleting feature no. 3 of importance score -0.0021176470588235353
Deleting feature no. 13 of importance score -0.0007058823529411562
Deleting feature no. 11 of importance score -0.0011764705882353122
Deleting feature no. 3 of importance score -0.0025882352941176247
Deleting feature no. 3 of importance score 0.0009411764705881786
Deleting feature no. 9 of importance score -0.0009411764705882453
Deleting feature no. 14 of importance score -0.0040000000000000036
Deleting feature no. 10 of importance score 0.0014117647058823567
Deleting feature no. 6 of importance score 0.003529411764705892
Deleting feature no. 6 of importance score 0.0011764705882352678
Deleting feature no. 4 of importance score 0.0018823529411764461
Deleting feature no. 4 of importance score 0.0028235294117647134
Deleting feature no. 2 of importance score 0.00117647058823529
Deleting feature no. 5 of importance score 0.00352941176470587
Deleting feature no. 4 of importance score 0.0051764705882353605
Deleting feature no. 2 of importance score 0.008941176470588253
Deleting feature no. 2 of importance score 0.010823529411764742
Deleting feature no. 1 of importance score 0.022823529411764687
Deleting feature no. 0 of importance score 0.04070588235294115
Deleting feature no. 0 of importance score 0.07670588235294114
Deleting feature no. 1 of importance score 0.08211764705882355
Deleting feature no. 0 of importance score 0.08094117647058828
"""

selected_features = get_feature_args_from_str(log_text, drop_limit=494)
selected_features

array([  2,   6, 414, 425, 462])

In [1196]:
get_feature_args_from_str(log_text, drop_limit=490)

array([  2,   6, 115, 283, 351, 402, 414, 425, 462])

In [634]:
X_train_selected = X_train.copy(deep=True).iloc[:, selected_features]
X_test_selected = X_test.copy(deep=True).iloc[:, selected_features]

data_dmatrix = xgb.DMatrix(data=X_train_selected,label=y_train)
params = {'objective':'binary:logistic',
          'eval_metric':'logloss',
          'eta':0.1,
          'max_depth' : 8,
          'device' : 'cuda',
          'colsample_bytree':0.8,
          'verbosity' : 1,
          'alpha' : 7,}


cvboosters = []
xgb_cv = xgb.cv(
    dtrain=data_dmatrix,
    params=params,
    nfold=5,
    metrics = 'logloss',
    seed=42,
    num_boost_round = 300,
    custom_metric = xgb_euros_gained,
    early_stopping_rounds=100,
    stratified = True,
    callbacks=[SaveBestModel(cvboosters), ],
    ) 

In [635]:
xgb_cv

Unnamed: 0,train-logloss-mean,train-logloss-std,test-logloss-mean,test-logloss-std,train-Euros_gained-mean,train-Euros_gained-std,test-Euros_gained-mean,test-Euros_gained-std
0,0.673799,0.000529,0.674892,0.000552,-6479.411817,88.529007,-6352.941211,283.331713
1,0.657935,0.000618,0.660421,0.00108,-6514.705859,119.471159,-6482.352832,353.724654
2,0.644744,0.00069,0.648497,0.00147,-6561.764648,107.423356,-6494.117578,389.481074
3,0.633896,0.000961,0.639543,0.001911,-6723.529394,79.683724,-6388.235449,209.13391
4,0.62432,0.000964,0.630936,0.002359,-6708.823437,59.843954,-6364.705957,215.007737
5,0.616631,0.001138,0.624038,0.002973,-6691.176563,53.429105,-6388.235254,156.076457
6,0.609959,0.001352,0.618481,0.003601,-6708.823535,49.565646,-6447.058691,184.522185
7,0.604361,0.001498,0.613936,0.003874,-6685.294141,59.263109,-6423.529297,172.102838
8,0.598819,0.001944,0.60999,0.004065,-6764.705859,108.065741,-6529.411719,178.420661
9,0.594645,0.001963,0.606667,0.004485,-6823.529297,92.542017,-6517.64707,172.102911


# Catboost

In [609]:
from catboost import CatBoostClassifier, Pool, cv
from sklearn.model_selection import StratifiedKFold
import numpy as np

# Format: (metric_name, value, is_higher_better)
def catboost_euros_gained(y_true, y_pred_proba, feature_num):
    n_selected = int(1 / 5 * len(y_true))
    top_k_indices = np.argsort(y_pred_proba[:, 1])[-n_selected:]
    true_positives = y_true.iloc[top_k_indices].sum()

    max_reward = n_selected * 10
    reward = true_positives * 10 * 10000 / max_reward
    cost = feature_num * 200
    euros = reward - cost

    return "Euros_gained", euros, True

In [1045]:

X_train_selected = X_train.copy(deep=True).iloc[:, selected_features]
X_test_selected = X_test.copy(deep=True).iloc[:, selected_features]


train_pool = Pool(data=X_train_selected, label=y_train)
cat_params = {
    'loss_function': 'Logloss',
    'eval_metric': 'Logloss',
    'learning_rate': 0.02,
    'depth': 1,
    'random_seed': 42,
    'bootstrap_type': 'Bayesian',
    'task_type': 'GPU',
    'devices': '0',
    'verbose': 0,
    'reg_lambda' : 30,
    'bagging_temperature' : 0
}

cv_data = cv(
    pool=train_pool,
    params=cat_params,
    fold_count=5,
    stratified=True,
    early_stopping_rounds=10,
    iterations=500,
    partition_random_seed=42,
    verbose=False,
    as_pandas=True
)



Training on fold [0/5]
bestTest = 0.6049666982
bestIteration = 166
Training on fold [1/5]
bestTest = 0.6051198957
bestIteration = 203
Training on fold [2/5]
bestTest = 0.5848068776
bestIteration = 213
Training on fold [3/5]
bestTest = 0.6056954212
bestIteration = 176
Training on fold [4/5]
bestTest = 0.6022611333
bestIteration = 208


In [1041]:
cv_data

Unnamed: 0,iterations,test-Logloss-mean,test-Logloss-std,train-Logloss-mean,train-Logloss-std
0,0,0.689642,0.000164,0.689609,0.000093
1,1,0.686267,0.000313,0.686211,0.000177
2,2,0.683082,0.000478,0.682971,0.000255
3,3,0.680024,0.000580,0.679856,0.000335
4,4,0.677050,0.000687,0.676850,0.000427
...,...,...,...,...,...
211,211,0.600346,0.008299,0.597846,0.002008
212,212,0.600354,0.008283,0.597844,0.002004
213,213,0.600353,0.008283,0.597844,0.002004
214,214,0.600356,0.008284,0.597843,0.002005


In [1073]:
best_iteration = np.argmin(cv_data['test-Logloss-mean'])
model = CatBoostClassifier(iterations=170, **cat_params)
model.fit(train_pool)

test_pool = Pool(data=X_test_selected, label=y_test)
catboost_euros_gained(y_test, model.predict_proba(test_pool), len(selected_features))



('Euros_gained',
 0    7600.0
 dtype: float64,
 True)

In [1102]:
from sklearn import metrics

print(metrics.accuracy_score(y_test,  model.predict(test_pool)))
print(metrics.precision_score(y_test, model.predict(test_pool)))
print(metrics.f1_score(y_test, model.predict(test_pool)))

0.7213333333333334
0.7138964577656676
0.7148703956343793


# Mutual information

In [617]:
from sklearn import feature_selection
mi_scores = feature_selection.mutual_info_classif(X_train, y_train, discrete_features='auto', random_state=42)

# Create a DataFrame to easily inspect scores
mi_df = pd.DataFrame({
    'feature': X_train.columns,
    'score': mi_scores
}).sort_values(by='score', ascending=False)

  y = column_or_1d(y, warn=True)


In [1185]:
top_k = 1
selected_features = mi_df.head(top_k)['feature'].tolist()
#selected_features = [2]
print("Top features selected by mutual_info_classif:")
print(mi_df.head(top_k))

X_train_selected = X_train.copy(deep=True).iloc[:, selected_features]
X_test_selected = X_test.copy(deep=True).iloc[:, selected_features]

data_dmatrix = xgb.DMatrix(data=X_train_selected,label=y_train)
params = {'objective':'binary:logistic',
          'eta':0.03,
          'max_depth' : 1,
          'device' : 'cuda',
          'subsample':0.9,
          'verbosity' : 1,
          'alpha' : 8,
          'lambda' : 400}


cvboosters = []
xgb_cv = xgb.cv(
    dtrain=data_dmatrix,
    params=params,
    nfold=5,
    metrics = 'logloss',
    seed=42,
    num_boost_round = 146,
    custom_metric = xgb_euros_gained,
    #early_stopping_rounds=100,
    stratified = True,
    callbacks=[SaveBestModel(cvboosters), ],
    ) 

Top features selected by mutual_info_classif:
   feature     score
2        2  0.090186


In [1153]:
print(xgb_cv.iloc[:, 6].min())
xgb_cv

-7458.823535


Unnamed: 0,train-logloss-mean,train-logloss-std,test-logloss-mean,test-logloss-std,train-Euros_gained-mean,train-Euros_gained-std,test-Euros_gained-mean,test-Euros_gained-std
0,0.683091,0.000318,0.683260,0.000579,-7208.823535,79.683522,-7117.647070,316.992884
1,0.674474,0.000539,0.674703,0.001135,-7200.000000,72.879381,-7152.941211,302.240849
2,0.666764,0.000835,0.667089,0.001757,-7200.000000,72.879381,-7152.941211,302.240849
3,0.659915,0.001151,0.660437,0.002380,-7200.000000,72.879381,-7141.176562,318.734526
4,0.653896,0.001499,0.654630,0.002991,-7200.000000,72.879381,-7141.176562,318.734526
...,...,...,...,...,...,...,...,...
141,5.891640,0.266482,6.003245,0.483047,-7267.646973,128.000361,-7411.764648,256.675557
142,5.891928,0.266229,6.003500,0.483078,-7267.646973,128.000361,-7411.764648,256.675557
143,5.891536,0.266365,6.003149,0.483119,-7267.646973,128.000361,-7411.764648,256.675557
144,5.891370,0.266688,6.003117,0.482908,-7264.705859,132.385558,-7458.823535,349.789958


In [1160]:
X_full_selected = pd.concat([X_train_selected, X_test_selected])
y_full = pd.concat([y_train, y_test])

In [1169]:
X_test_final = pd.read_csv('/home/mpuscian/Desktop/repozytoria/MINI_projects/AML_Project_2/x_test.txt', sep=' ', header=None).iloc[:, selected_features]

In [1187]:
a = clf.predict_proba(X_test_final)[:, 1]
a.sort()

In [1189]:
np.savetxt('/home/mpuscian/Desktop/repozytoria/MINI_projects/AML_Project_2/305995_obs2.txt', clf.predict_proba(X_test_final)[:, 1].argsort()[-1000:], delimiter='\n', fmt='%i')

In [1186]:
X_train_selected = X_train.copy(deep=True).iloc[:, selected_features]
X_test_selected = X_test.copy(deep=True).iloc[:, selected_features]

params = {'objective':'binary:logistic',
          'eta':0.03,
          'max_depth' : 1,
          'device' : 'cuda',
          'subsample':0.9,
          'verbosity' : 1,
          'alpha' : 8,
          'lambda' : 400}

clf = xgb.XGBClassifier(**params, n_estimators = 147)
clf.fit(X_full_selected, y_full)

euros_gained(y_test.to_numpy(), clf.predict_proba(X_test_selected), len(selected_features))

('Euros_gained', np.float64(-7400.0))

In [None]:
clf.predict_proba()

In [1164]:
from sklearn import metrics

print(metrics.accuracy_score(y_test,  clf.predict(X_test_selected)))
print(metrics.precision_score(y_test, clf.predict(X_test_selected)))
print(metrics.f1_score(y_test, clf.predict(X_test_selected)))

0.7266666666666667
0.7293447293447294
0.7140864714086471


# Ensemble

In [1150]:
def get_ensemble_proba(catboost_model, xgb_model, pool, X_selected, method='max'):
    proba_cat = catboost_model.predict_proba(pool)
    proba_xgb = xgb_model.predict_proba(X_selected)

    assert proba_cat.shape == proba_xgb.shape, "Mismatch in prediction shape between models"

    if method == 'max':
        proba_cat_max = np.max(proba_cat, axis=1)
        proba_xgb_max = np.max(proba_xgb, axis=1)

        use_cat = proba_cat_max > proba_xgb_max
        ensemble_proba = np.where(use_cat[:, None], proba_cat, proba_xgb)
    else:
        raise ValueError(f"Unsupported method: {method}")
    
    return ensemble_proba

def get_ensemble_prediction(catboost_model, xgb_model, pool, X_selected, method='max'):
    proba = get_ensemble_proba(catboost_model, xgb_model, pool, X_selected, method=method)
    return np.argmax(proba, axis=1)


In [None]:
euros_gained(y_test.to_numpy(), get_ensemble_proba(model, clf, test_pool, X_test_selected, method='max'), len(selected_features))

('Euros_gained', np.float64(-7533.333333333333))

In [1148]:
euros_gained(y_test.to_numpy(), clf.predict_proba(X_test_selected), len(selected_features))

('Euros_gained', np.float64(-7466.666666666667))

In [1149]:
euros_gained(y_test.to_numpy(), model.predict_proba(test_pool), len(selected_features))

('Euros_gained', np.float64(-7600.0))

In [1151]:
print(metrics.accuracy_score(y_test,  get_ensemble_prediction(model, clf, test_pool, X_test_selected, method='max')))
print(metrics.precision_score(y_test, get_ensemble_prediction(model, clf, test_pool, X_test_selected, method='max')))
print(metrics.f1_score(y_test, get_ensemble_prediction(model, clf, test_pool, X_test_selected, method='max')))

0.7266666666666667
0.7293447293447294
0.7140864714086471
