In [27]:
import numpy as np
import pandas as pd
import polars as pl
from catboost import CatBoostClassifier, Pool, cv
import lightgbm as lgb
import xgboost as xgb
from sklearn.metrics import roc_auc_score, roc_curve, auc, accuracy_score, recall_score
from functools import partial 
from sklearn.base import BaseEstimator
from tqdm.auto import trange
from sklearn.model_selection import train_test_split, StratifiedKFold, StratifiedGroupKFold

In [4]:
# Read the tab-separated train and test tables, then preview the first
# five training rows as the cell output.
train_data = pd.read_csv(
    'vserosotb/train (1)/train_share.tsv',
    delimiter='\t',
)
test_data = pd.read_csv(
    'vserosotb/test-10/test_share.tsv',
    delimiter='\t',
)
train_data.head(n=5)

Unnamed: 0,graph_1,graph_2,graph_3,feature_1,bki_1,graph_4,feature_2,graph_5,feature_3,graph_6,...,graph_22,application_2,bki_14,bki_15,feature_37,bki_16,bki_17,mb_15,feature_38,target
0,73.0,25.0,63.0,0.0,-3.307791,59.0,36.0,19544.0,0.0,91.0,...,0.0,0,44211.0,31.0,390.414429,0.571132,0.0,12527.0,0.0,0
1,64.0,37.0,68.0,0.0,-2.527839,106.0,36.0,1457.0,0.0,180.0,...,6.0,0,5600.0,3.0,663.7025,1.447663,0.1,10276.0,0.0,0
2,73.0,34.0,80.0,0.0,-2.985949,72.0,44.0,13071.0,0.0,151.0,...,1.0,0,250889.0,8.0,,0.234987,11.24,4588.0,0.0,0
3,170.0,99.0,203.0,0.0,-3.392264,228.0,,7000.0,,388.0,...,2.0,0,,,471.623097,,,22968.0,0.0,0
4,65.0,20.0,77.0,0.0,-2.657559,115.0,18.0,21725.0,0.0,190.0,...,13.0,1,0.0,0.0,411.1624,0.0,29.402,15687.0,0.0,0


In [127]:
# Scratch check: writing a scalar to a list of integer positions overwrites
# exactly those elements of the array in place.
d = np.array([1, 2, 3])
np.put(d, [1, 2], 1)
d

array([1, 1, 1])

In [133]:
def recall_topk(y_t, y_p, top_frac=0.15):
    """Recall obtained when the top-scored fraction of objects is labelled positive.

    The ``int(len(y_p) * top_frac)`` objects with the highest scores are
    predicted as class 1, everything else as class 0, and the recall of that
    labelling against ``y_t`` is returned.

    Parameters
    ----------
    y_t : array-like of shape (n,)
        True binary labels.
    y_p : array-like of shape (n,)
        Scores (higher means more likely positive). The input is NOT modified.
    top_frac : float, default 0.15
        Fraction of objects to flag as positive.

    Returns
    -------
    float
        Value returned by ``recall_score`` for the derived labelling.
    """
    # Work on a read-only view of the scores: the previous implementation
    # zeroed ``y_p`` in place, which destroyed the caller's predictions and
    # broke on plain Python lists (``list *= 0`` empties the list).
    scores = np.asarray(y_p)
    top_count = int(len(scores) * top_frac)
    top_idx = np.argsort(scores)[::-1][:top_count]

    labels = np.zeros(len(scores), dtype=int)
    labels[top_idx] = 1
    return recall_score(y_t, labels)

class RecallTop15Metric:
    """Custom CatBoost evaluation metric built on ``recall_topk``.

    Implements the three methods CatBoost calls on a user-defined metric
    object: ``is_max_optimal``, ``evaluate`` and ``get_final_error``.
    """

    def is_max_optimal(self):
        """Return True: a larger metric value is better."""
        return True

    def evaluate(self, approxes, target, weight):
        """Score one set of raw predictions.

        Parameters
        ----------
        approxes : sequence containing exactly one indexable of raw scores.
        target : indexable of true labels, same length as ``approxes[0]``.
        weight : ignored.

        Returns
        -------
        (float, int)
            The ``recall_topk`` score and a constant weight of 1.
        """
        assert len(approxes) == 1
        assert len(target) == len(approxes[0])

        # Hand the scorer a private copy so the buffer supplied by the
        # training loop can never be altered by the metric computation.
        y_pred = np.array(approxes[0], dtype=float)
        y_true = np.array(target).astype(int)

        output_weight = 1  # per-object weights are not used

        score = recall_topk(y_true, y_pred)
        return score, output_weight

    def get_final_error(self, error, weight):
        """Return the accumulated error unchanged (weight is ignored)."""
        return error


# Keyword arguments unpacked into CatBoostClassifier by EnsembleClassifier.
cb_params = dict(
    iterations=1500,
    learning_rate=0.05,
    loss_function='CrossEntropy',
    max_depth=6,
    eval_metric=RecallTop15Metric(),
    use_best_model=True,
    task_type='GPU',
    random_seed=56,
)

# Settings intended for the LightGBM member of the ensemble.
# Previously tried values, currently disabled:
#   lambda_l1=0.08758718919397321, lambda_l2=0.0039689175176025465,
#   num_leaves=103, colsample_bytree=0.8329551585827726,
#   colsample_bynode=0.4025961355653304, bagging_fraction=0.7738954452473223,
#   bagging_freq=4, min_data_in_leaf=85, scale_pos_weight=2.7984184778875543
lgb_params = dict(
    enable_categorical=True,
    objective='binary',
    verbosity=-1,
    n_iter=500,
    boosting_type='gbdt',
    random_state=56,
    learning_rate=0.03,
    max_depth=6,
)

# Bundle of per-library settings; its keys match the constructor arguments
# of EnsembleClassifier.
params = dict(
    cb_params=cb_params,
    lgb_params=lgb_params,
    xgb_params=None,
)

# Column roles: no categorical features, 'target' is the label, nothing dropped.
cat_cols = list()
label_col = 'target'
drop_cols = list()

In [134]:
class EnsembleClassifier(BaseEstimator):
    """Wrapper around the boosted models of the ensemble.

    Only the CatBoost model is built and used at the moment; ``lgb_params``
    and ``xgb_params`` are accepted and stored so LightGBM / XGBoost members
    can be added later without changing the constructor signature.

    Parameters
    ----------
    cb_params : dict
        Keyword arguments unpacked into ``CatBoostClassifier``.
    lgb_params : dict or None, default None
        Reserved for a LightGBM member (currently unused).
    xgb_params : dict or None, default None
        Reserved for an XGBoost member (currently unused).
    """

    def __init__(self, cb_params, lgb_params=None, xgb_params=None):
        # BaseEstimator.get_params() (used by repr() and clone()) reads every
        # constructor argument back from an attribute of the same name, so
        # they must all be stored on the instance.
        self.cb_params = cb_params
        self.lgb_params = lgb_params
        self.xgb_params = xgb_params
        self.cbm = CatBoostClassifier(**cb_params)

    def fit(self, X, y, X_val, y_val, cat_features=None, verbose=False):
        """Fit the CatBoost model on (X, y) using (X_val, y_val) as eval set.

        ``cat_features`` is passed to both CatBoost pools and ``verbose`` is
        forwarded to ``CatBoostClassifier.fit``. Returns ``self``.
        """
        train_pool = Pool(X, label=y, cat_features=cat_features)
        eval_pool = Pool(X_val, label=y_val, cat_features=cat_features)
        self.cbm.fit(train_pool, eval_set=eval_pool, verbose=verbose)
        return self

    def predict_proba(self, X_test, cat_features=None):
        """Return the positive-class probability for every row of ``X_test``.

        Note: the result is the 1-D column ``[:, 1]`` of CatBoost's
        ``predict_proba`` output, not a two-column matrix.
        """
        test_pool = Pool(X_test, cat_features=cat_features)
        cb_preds = self.cbm.predict_proba(test_pool)[:, 1]
        return cb_preds

In [135]:
class CustomBoostKfoldWraper(BaseEstimator):
    """Repeated stratified K-fold trainer that keeps one model per fold.

    Parameters
    ----------
    num_folds : int
        Number of stratified folds per repetition.
    num_repits : int
        Number of repetitions; repetition ``i`` shuffles with seed
        ``random_state + i``.
    params : dict
        Keyword arguments unpacked into ``EnsembleClassifier``.
    random_state : int, default 56
        Base seed for the fold shuffling.
    score_func : callable or None, default None
        ``score_func(y_true, y_score)`` evaluated on every held-out fold.
        When None, no fold scores are computed.
    """

    def __init__(self, num_folds, num_repits, params, random_state=56, score_func=None):
        self.models = []
        self.params = params
        self.random_state = random_state
        self.num_folds = num_folds
        self.num_repits = num_repits
        self.score_func = score_func

    def fit(self, train_data, cat_features=None, drop_cols=None, label_col=None, verbose=False):
        """Train one ``EnsembleClassifier`` per fold and record fold scores.

        Parameters
        ----------
        train_data : pandas.DataFrame
            Table holding the features and the label column.
        cat_features : list or None
            Categorical feature names forwarded to the models.
        drop_cols : list or None
            Extra columns removed from the feature matrix (None means none).
        label_col : str
            Name of the label column; required.
        verbose : bool or int
            Forwarded to the underlying model's ``fit``.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``label_col`` is not given.
        """
        if label_col is None:
            raise ValueError("label_col must be provided")

        # drop_cols defaults to None in the signature; treat that as "nothing
        # extra to drop" instead of failing on list concatenation.
        non_feature_cols = [label_col] + list(drop_cols or [])

        # Start from a clean slate so refitting does not mix models from a
        # previous call into the averaged prediction.
        self.models = []
        self.scores = []

        for i in trange(self.num_repits):
            kfold = StratifiedKFold(self.num_folds, random_state=self.random_state + i, shuffle=True)
            for train_index, test_index in kfold.split(train_data, train_data[label_col]):
                train_df = train_data.iloc[train_index]
                test_df = train_data.iloc[test_index]
                # Built once and reused for both early stopping and scoring.
                X_val = test_df.drop(non_feature_cols, axis=1)

                model = EnsembleClassifier(**self.params)
                model.fit(
                    X=train_df.drop(non_feature_cols, axis=1),
                    y=train_df[label_col],
                    X_val=X_val,
                    y_val=test_df[label_col],
                    cat_features=cat_features,
                    verbose=verbose,
                )

                if self.score_func is not None:
                    cb_preds = model.predict_proba(X_val, cat_features)
                    avg_preds = cb_preds  # single active model, nothing to average yet
                    self.scores += [[self.score_func(test_df[label_col], avg_preds)]]
                    print(self.scores[-1])
                self.models += [model]

        if self.scores:
            print(f"Total Score {np.mean([x[0] for x in self.scores])}")
        return self

    def _require_models(self):
        """Raise if no fold model is available yet."""
        if not self.models:
            raise RuntimeError("No trained models available; call fit() first")

    def predict(self, test_data, drop_cols=None, cat_features=None):
        """Return the mean positive-class probability over all fold models.

        ``drop_cols`` (None means none) are removed from ``test_data`` before
        prediction. Raises ``RuntimeError`` when called before ``fit``.
        """
        self._require_models()
        features = test_data.drop(list(drop_cols or []), axis=1)
        preds = np.mean([
            model.predict_proba(features, cat_features=cat_features)
            for model in self.models
        ], axis=0)
        return preds

    def get_feature_importance(self, type='FeatureImportance'):
        """Average CatBoost feature importances over the fold models.

        ``type`` is forwarded to CatBoost's ``get_feature_importance``. The
        result is indexed by 'Feature Id' and ordered by 'Importances',
        largest first. Raises ``RuntimeError`` when called before ``fit``.
        """
        self._require_models()
        tables = [
            model.cbm.get_feature_importance(prettified=True, type=type).set_index('Feature Id')
            for model in self.models
        ]
        total = tables[0]
        for table in tables[1:]:
            total = total + table
        return (total / len(tables)).sort_values(by='Importances')[::-1]

In [136]:
# Build the cross-validation wrapper: 5 stratified folds, one repetition,
# fold shuffling seeded with 5656, each held-out fold scored by recall_topk.
model = CustomBoostKfoldWraper(
    num_folds=5,
    num_repits=1,
    params=params,
    random_state=5656,
    score_func=recall_topk
)

In [None]:
# Train one model per fold on train_data; `verbose` is forwarded down to the
# CatBoost fit call.
# NOTE(review): this cell has no execution count and its saved output stops
# during the second fold — the run looks interrupted; re-run to completion
# before relying on the prediction cells.
model.fit(
    train_data=train_data,
    cat_features=cat_cols,
    drop_cols=drop_cols,
    label_col=label_col,
    verbose=350,
)

  0%|          | 0/1 [00:00<?, ?it/s]

  _check_train_params(params)
Default metric period is 5 because PythonUserDefinedPerObject is/are not implemented for GPU


0:	learn: 0.3206419	test: 0.3239893	best: 0.3239893 (0)	total: 570ms	remaining: 14m 14s
350:	learn: 0.4491250	test: 0.4484621	best: 0.4484621 (350)	total: 46.7s	remaining: 2m 32s
700:	learn: 0.4629994	test: 0.4546857	best: 0.4548915 (625)	total: 1m 32s	remaining: 1m 45s
1050:	learn: 0.4729005	test: 0.4575147	best: 0.4576175 (1045)	total: 2m 18s	remaining: 59.3s
1400:	learn: 0.4813486	test: 0.4566917	best: 0.4580804 (1110)	total: 3m 4s	remaining: 13s
1499:	learn: 0.4832388	test: 0.4578233	best: 0.4580804 (1110)	total: 3m 17s	remaining: 0us
bestTest = 0.4580804444
bestIteration = 1110
Shrink model to first 1111 iterations.
[0.4580804443987244]


  _check_train_params(params)
Default metric period is 5 because PythonUserDefinedPerObject is/are not implemented for GPU


0:	learn: 0.3099178	test: 0.3048040	best: 0.3048040 (0)	total: 498ms	remaining: 12m 26s


In [112]:
# NOTE(review): in the visible notebook `preds` is first assigned in the
# following cell, so on a fresh kernel this cell must be run after it.
preds.shape

(301489,)

In [113]:
# Mean of the fold models' positive-class probabilities for the test table.
preds = model.predict(test_data,drop_cols=drop_cols,cat_features=cat_cols)

In [114]:
# Load the table that will carry the predictions
# (presumably the sample-submission template — TODO confirm).
sub = pd.read_csv('simple_sub-4.csv')

In [115]:
# Store the predictions in the 'target' column.
# Assumes `sub` has the same number of rows, in the same order, as test_data — TODO confirm.
sub['target'] = preds

In [118]:
# Write the submission file.
# NOTE(review): to_csv writes the row index as an extra unnamed first column
# by default; confirm the expected format and pass index=False if it is not wanted.
sub.to_csv('dump_subv2.csv')