In [1]:
import numpy as np
import pandas as pd
import polars as pl
from catboost import CatBoostClassifier, Pool, cv
import lightgbm as lgb
import xgboost as xgb
from sklearn.metrics import roc_auc_score, roc_curve, auc, accuracy_score
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from functools import partial 
from sklearn.base import BaseEstimator
from scipy.stats import rankdata
from tqdm.auto import trange
from sklearn.model_selection import train_test_split, StratifiedKFold, StratifiedGroupKFold

In [24]:
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')
train_data.head(5)

Unnamed: 0.1,Unnamed: 0,index,assessment,tags,text,trend_id_res0,trend_id_res1,trend_id_res2,trend_id_res3,trend_id_res4,...,trend_id_res40,trend_id_res41,trend_id_res42,trend_id_res43,trend_id_res44,trend_id_res45,trend_id_res46,trend_id_res47,trend_id_res48,trend_id_res49
0,0,5652,6.0,"{ASSORTMENT,PROMOTIONS,DELIVERY}","Маленький выбор товаров, хотелось бы ассортиме...",0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,18092,4.0,"{ASSORTMENT,PRICE,PRODUCTS_QUALITY,DELIVERY}",Быстро,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,13845,6.0,"{DELIVERY,PROMOTIONS,PRICE,ASSORTMENT,SUPPORT}",Доставка постоянно задерживается,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,3,25060,6.0,"{PRICE,PROMOTIONS,ASSORTMENT}",Наценка и ассортимент расстраивают,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,1428,6.0,"{PRICE,PROMOTIONS}",Можно немного скинуть минимальную сумму заказа...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [34]:
train_data.iloc[:,5:].sum(axis=0).sort_values()[::-1]

trend_id_res0     837
trend_id_res12    624
trend_id_res2     597
trend_id_res27    530
trend_id_res28    420
trend_id_res3     361
trend_id_res19    360
trend_id_res1     347
trend_id_res30    306
trend_id_res18    231
trend_id_res16    191
trend_id_res36    167
trend_id_res8     140
trend_id_res20    132
trend_id_res4     126
trend_id_res11    108
trend_id_res21     99
trend_id_res10     96
trend_id_res15     81
trend_id_res14     73
trend_id_res40     72
trend_id_res23     65
trend_id_res29     61
trend_id_res35     58
trend_id_res5      52
trend_id_res31     50
trend_id_res37     46
trend_id_res43     44
trend_id_res39     44
trend_id_res42     35
trend_id_res7      35
trend_id_res13     34
trend_id_res47     31
trend_id_res48     29
trend_id_res33     28
trend_id_res38     28
trend_id_res44     27
trend_id_res26     27
trend_id_res22     25
trend_id_res6      21
trend_id_res41     21
trend_id_res32     13
trend_id_res25     13
trend_id_res49     12
trend_id_res45     12
trend_id_r

In [3]:
TAG_LIST = [
    'DELIVERY',
    'ASSORTMENT',
    'PRICE',
    'PROMOTIONS',
    'PRODUCTS_QUALITY',
    'SUPPORT',
    'CATALOG_NAVIGATION',
    'PAYMENT',
]

def get_tags_feats(df):
    for tag in TAG_LIST:
        df[f'tag_{tag}'] = df['tags'].fillna('').apply(lambda x: tag in x).astype(int)
    return df

train_data = get_tags_feats(train_data)
test_data = get_tags_feats(test_data)

In [4]:
def get_static_text_features(df):
    df['length'] = df['text'].fillna('').apply(len)
    df['words_number'] = df['text'].fillna('').apply(lambda x: len(x.split()))
    df['end_with_.'] = df['text'].fillna('a').apply(lambda x: x[-1] == '.').astype(int)
    df['sentence_count'] = df['text'].fillna('').apply(lambda x:x.count('.'))
    df['angry'] = df['text'].fillna('').apply(lambda x:x.count('!'))
    return df

train_data = get_static_text_features(train_data)
test_data = get_static_text_features(test_data)

In [5]:
deberta_oof = pd.read_parquet('gemma_preds.parquet')

In [6]:
for i in range(50):
    train_data[f'preds_{i}'] = deberta_oof['preds'].apply(lambda x:x[i])
#for i in range(50):
#    train_data[f'rank_preds_{i}'] = deberta_oof['preds'].apply(lambda x: rankdata(x[i]))

In [9]:
idxes = np.load('first_foldn.npy')

In [11]:
train_data = train_data.iloc[idxes]

In [12]:
cb_params = {
    'iterations': 1000,
    'learning_rate': 0.05,
    'loss_function': 'MultiCrossEntropy',
    'max_depth': 4,
    'eval_metric': 'Accuracy',
    'random_seed': 56
}

params = {
    'cb_params':cb_params,
    'lgb_params': None,
    'xgb_params': None,
}

cat_cols = []
labels = [f'trend_id_res{i}' for i in range(50)]
drop_cols = [
    'Unnamed: 0',
    'tags',
    'text'
]

In [13]:
class EnsembleClassifier(BaseEstimator):
    def __init__(self,cb_params,lgb_params,xgb_params):
        self.cbm = CatBoostClassifier(**cb_params)
        #self.lgbm = lgb.LGBMClassifier(**lgb_params)
        #self.xgbm = xgb.XGBClassifier(**xgb_params)
    
    def fit(self,X,y,X_val,y_val,cat_features=None,verbose=False):
        train_pool = Pool(X,label=y,cat_features=cat_features)
        eval_pool = Pool(X_val,label=y_val,cat_features=cat_features)
        self.cbm.fit(train_pool,eval_set=eval_pool,verbose=verbose)
        #self.lgbm.fit(X, y,eval_set=[(X_val,y_val)],eval_metric=lgb_custom_metric,categorical_feature=cat_features)
        #self.xgbm.fit(X,y,eval_set=[(X_val,y_val)],verbose=False)
    
    def predict_proba(self,X_test,cat_features):
        test_pool = Pool(X_test,cat_features=cat_features)
        cb_preds = self.cbm.predict_proba(test_pool)
        #lgb_preds = self.lgbm.predict_proba(X_test)[:,1]
        #xgb_preds = self.xgbm.predict_proba(X_test)[:,1]
        return cb_preds #, lgb_preds, xgb_preds

In [14]:
class CustomBoostKfoldWraper(BaseEstimator):
    def __init__(self,num_folds,num_repits,params,random_state=56,score_func=None):
        self.models = []
        self.params = params
        self.random_state = random_state
        self.num_folds = num_folds
        self.num_repits = num_repits
        self.score_func = score_func
        
    def fit(self,train_data,cat_features=None,drop_cols=None,label_col=None,verbose=False):
        self.scores = []
        
        for i in trange(self.num_repits):
            kfold = MultilabelStratifiedKFold(self.num_folds,random_state=self.random_state+i,shuffle=True)
            for train_index, test_index in (kfold.split(train_data,train_data[label_col].values)):
                train_df = train_data.iloc[train_index]
                test_df = train_data.iloc[test_index]
                
                model = EnsembleClassifier(**self.params)
                model.fit(
                    X = train_df.drop(label_col+drop_cols,axis=1),
                    y = train_df[label_col],
                    X_val = test_df.drop(label_col+drop_cols,axis=1),
                    y_val = test_df[label_col],
                    cat_features = cat_features,
                    verbose = verbose
                )
                cb_preds = model.predict_proba(test_df.drop(label_col+drop_cols,axis=1),cat_features)
                avg_preds =  cb_preds#(cb_preds + lgb_preds + xgb_preds) / 3
                self.scores += [[
                    self.score_func(
                        test_df.apply(lambda x: [x[f'trend_id_res{i}'] for i in range(50)],axis=1).tolist(),
                        cb_preds
                    )
                ]]
                print(self.scores[-1])
                self.models += [model]
                
        print(f"Total Score {np.mean([x[0] for x in self.scores])}")
            
    def predict(self,test_data,drop_cols=None,cat_features=None):
        preds = np.mean([
            model.predict_proba(test_data.drop(drop_cols,axis=1),cat_features=cat_features)
            for model in self.models
        ],axis=0)
        return preds
    
    def get_feature_importance(self,type='FeatureImportance'):
        imp_0 = self.models[0].cbm.get_feature_importance(prettified=True,type=type).set_index('Feature Id')
        for i in range(1,len(self.models)):
            imp_0 += self.models[i].cbm.get_feature_importance(prettified=True,type=type).set_index('Feature Id')
        return (imp_0 / len(self.models)).sort_values(by='Importances')[::-1]

In [15]:
def rank_accuracy(y_true,y_preds):
    preds = [list(map(lambda y: str(round(y)),x)) for x in y_preds]
    labels = [''.join(map(str,x)) for x in y_true]
    for i in range(len(y_preds)):
        preds[i][np.argmax(y_preds[i])] = '1'
    preds = [''.join(x) for x in preds]
    return accuracy_score(preds,labels)

In [16]:
model = CustomBoostKfoldWraper(
    num_folds=5,
    num_repits=3,
    params=params,
    random_state=5656,
    score_func=rank_accuracy
)

In [17]:
model.fit(
    train_data=train_data,
    cat_features=cat_cols,
    drop_cols=drop_cols,
    label_col=labels,
    verbose=350,
)

  0%|          | 0/3 [00:00<?, ?it/s]

0:	learn: 0.0026350	test: 0.0000000	best: 0.0000000 (0)	total: 72.3ms	remaining: 1m 12s
350:	learn: 0.8985507	test: 0.4836957	best: 0.4891304 (252)	total: 7.75s	remaining: 14.3s
700:	learn: 1.0000000	test: 0.4782609	best: 0.4945652 (445)	total: 15.5s	remaining: 6.59s
999:	learn: 1.0000000	test: 0.5000000	best: 0.5000000 (831)	total: 22s	remaining: 0us

bestTest = 0.5
bestIteration = 831

Shrink model to first 832 iterations.
[0.5543478260869565]
0:	learn: 0.0026882	test: 0.0000000	best: 0.0000000 (0)	total: 22.5ms	remaining: 22.5s
350:	learn: 0.8911290	test: 0.4924623	best: 0.4924623 (343)	total: 8.58s	remaining: 15.9s
700:	learn: 1.0000000	test: 0.4924623	best: 0.5025126 (441)	total: 16.1s	remaining: 6.88s
999:	learn: 1.0000000	test: 0.5025126	best: 0.5025126 (441)	total: 23.3s	remaining: 0us

bestTest = 0.5025125628
bestIteration = 441

Shrink model to first 442 iterations.
[0.5728643216080402]


CatBoostError: catboost/libs/metrics/metric.cpp:6930: All train targets are equal

In [16]:
model.get_feature_importance()[:20]

Unnamed: 0_level_0,Importances
Feature Id,Unnamed: 1_level_1
preds_0,6.809859
preds_2,5.542084
preds_27,5.302952
preds_12,4.833376
preds_19,4.071223
preds_18,3.672766
preds_28,3.606547
preds_3,3.570526
preds_1,3.450128
preds_30,3.018386


In [24]:
mean_state = np.average([
    np.load('gemma__test_fold_0.npy'),
    np.load('gemma__test_fold_1.npy'),
    np.load('gemma__test_fold_2.npy'),
    np.load('gemma__test_fold_3.npy'),
    np.load('gemma__test_fold_4.npy'),
    np.load('IlyaGusev_gemma-2-9b-it-abliterated-test-fold_attnlrnk-0.npy'),
    #np.load('IlyaGusev_gemma-2-9b-it-abliterated-test-fold_attnlrnk-0.npy'),
    np.load('IlyaGusev_gemma-2-9b-it-abliterated-test-fold-0.npy'),  # last_token_pool
    np.load('e5_preds.npy'), np.load('e5_preds.npy'), # mean_pool
    np.load('/notebooks/BAAI-bge-multilingual-gemma2-test-fold_attnlrnk-0.npy'), # last_token_pool
    np.load('labse_en_ru_preds.npy'),# attn_pool
    np.load('IlyaGusev-gemma-2-9b-it-abliterated_mlp_old-0.npy'),
    np.load('IlyaGusev-gemma-2-9b-it-abliterated_mlp_old-0.npy'),
],axis=0,weights = [
    0.5,0.5,0.5,0.5,0.5,
    1,1,1,1,1,1,1,1
])

In [33]:
mean_state_0 = np.mean([
    np.load('gemma__test_fold_0.npy'),
    np.load('gemma__test_fold_1.npy'),
    np.load('gemma__test_fold_2.npy'),
    np.load('gemma__test_fold_3.npy'),
    np.load('gemma__test_fold_4.npy'),
    np.load('IlyaGusev_gemma-2-9b-it-abliterated-test-fold_attnlrnk-0.npy'),
    #np.load('IlyaGusev_gemma-2-9b-it-abliterated-test-fold_attnlrnk-0.npy'),
    np.load('IlyaGusev_gemma-2-9b-it-abliterated-test-fold-0.npy'),  # last_token_pool
    np.load('e5_preds.npy'), np.load('e5_preds.npy'), # mean_pool
    np.load('/notebooks/BAAI-bge-multilingual-gemma2-test-fold_attnlrnk-0.npy'), # last_token_pool
    np.load('labse_en_ru_preds.npy'),# attn_pool
    np.load('IlyaGusev-gemma-2-9b-it-abliterated_mlp_old-0.npy'),
    np.load('IlyaGusev-gemma-2-9b-it-abliterated_mlp_old-0.npy'),
],axis=0)

#mean_state_1 = np.load('IlyaGusev-gemma-2-9b-it-abliterated_mlp_old-0.npy')

mean_state = mean_state_0#(mean_state_0 + mean_state_1 * 2) / 3

In [21]:
for i in range(50):
    test_data[f'preds_{i}'] = mean_state[:,i]

In [44]:
preds = model.predict(test_data,drop_cols=drop_cols,cat_features=cat_cols)

NameError: name 'model' is not defined

In [25]:
simple_sub = pd.read_csv('sample_submission_formated.csv')

In [26]:
simple_sub

Unnamed: 0,index,target
0,3135,
1,4655,12
2,22118,2
3,23511,0
4,45,
...,...,...
9010,3523,
9011,24925,
9012,6327,
9013,530,


In [34]:
def prepare_predict(pred):
    preds_r = list(map(lambda y: str(round(y)),pred))
    preds_r[np.argmax(pred)] = '1'
    return ' '.join([str(i) for i,x in enumerate(preds_r) if x == '1'])

In [35]:
simple_sub['target'] = [prepare_predict(x) for x in mean_state]

In [36]:
simple_sub.to_csv('dump_subv22.csv',index=False)

In [10]:
df_old = pd.read_csv('dump_subv5.csv')

In [11]:
(df_old['target'] != simple_sub['target']).sum()

1016

In [30]:
simple_sub

Unnamed: 0,index,target
0,3135,1 2 12
1,4655,12
2,22118,35
3,23511,0
4,45,16
...,...,...
9010,3523,37
9011,24925,12 28
9012,6327,8
9013,530,15
