In [1]:
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split, TimeSeriesSplit, StratifiedKFold, StratifiedGroupKFold
from sklearn.metrics import roc_auc_score
import random
from statistics import mean
from tqdm.auto import tqdm, trange
from sklearn.base import BaseEstimator

In [29]:
train_data = pd.read_csv('dodohack/Data Secrets First Cup/train_target.csv',parse_dates=['LocalBeginDate','LocalEndDate'])
test_data = pd.read_csv('dodohack/Data Secrets First Cup/test.csv',parse_dates=['LocalBeginDate','LocalEndDate'])
sample_submit = pd.read_csv('dodohack/Data Secrets First Cup/submit.csv')
orders_data = pd.read_csv('dodohack/Data Secrets First Cup/orders.csv',parse_dates=['SaleDate','Date'])
mobile_data = pd.read_csv('dodohack/Data Secrets First Cup/mobile_events.csv',parse_dates=['Timestamp'])
clients_data = pd.read_csv('dodohack/Data Secrets First Cup/clients_promo_october.csv',parse_dates=['LocalBeginDate','LocalEndDate'])

In [3]:
def mounth_count_day(x):
    months = [31,28,31,30,31,30,31,31,30,31,30,31]
    return sum(months[:x])

def create_time_features(df,time_col):
    df['month'] = df[time_col].dt.month
    df['day'] = df[time_col].dt.day
    df['hour'] = df[time_col].dt.hour
    df['year'] = df[time_col].dt.year
    df['weekofyear'] = df[time_col].apply(lambda x: x.weekofyear)
    df['dayofweek'] = df[time_col].dt.dayofweek
    df['dayofyear'] = df[time_col].dt.dayofyear
    df['absolute_time'] = (df[time_col] - df[time_col].min()).dt.days
    
    df['all_day_time'] = df[time_col].apply(lambda x: x.hour * 3600 + x.minute * 60 + x.second)
    df['all_week_time'] = df[time_col].apply(lambda x: x.dayofweek * 24 + x.hour)
    
    df['cl_early_morning'] = ((df['hour'] > 4) & (df['hour'] <= 8)).astype('int16')
    df['cl_is_weekend'] = (df['dayofweek'] > 4).astype('int16')
    return df

In [4]:
mobile_data = create_time_features(mobile_data,'Timestamp')
orders_data = create_time_features(orders_data,'SaleDate')

In [5]:
mobile_data

Unnamed: 0,ClientUUId,VisitToken,EventName,Platform,Timestamp,month,day,hour,year,weekofyear,dayofweek,dayofyear,absolute_time,all_day_time,all_week_time,cl_early_morning,cl_is_weekend
0,000D3A22FA54A81611EB315CF5443815,976627AD-76DC-41D9-981E-F2A6CA14B3A9,screen_menu,ios,2023-10-29 16:14:21.343000+00:00,10,29,16,2023,43,6,302,387,58461,160,0,1
1,2A4A64AF719E994511EE3F2D8B31E188,5b22e610-c71e-4b6a-a842-71e8575ece87,screen_menu,android,2023-10-29 11:04:36.484000+00:00,10,29,11,2023,43,6,302,386,39876,155,0,1
2,000D3AAC977BBB2F11ECDD081F5481AE,BDCEB72F-2B6A-4565-801A-6CB729279C6F,open_app,ios,2023-10-29 11:14:34.723000+00:00,10,29,11,2023,43,6,302,386,40474,155,0,1
3,000D3A21DA51A81411EAE468856F96F1,A66E2DA6-4E2E-4C37-9FBF-627732623212,add_to_cart,ios,2023-10-29 14:03:08.363000+00:00,10,29,14,2023,43,6,302,387,50588,158,0,1
4,000D3A23B0DC80D811E67F4D0A9CA345,1BE8139F-A3D3-439F-A51B-5CBD4D3FFF1B,screen_menu,ios,2023-10-29 08:35:40.127000+00:00,10,29,8,2023,43,6,302,386,30940,152,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3916649,000D3A25D54580E011E707D4762E0C84,C5EC0144-6C3C-45C2-B0A4-A7A2B6B45444,open_app,ios,2023-10-14 06:00:21.558000+00:00,10,14,6,2023,41,5,287,371,21621,126,1,1
3916650,CEA31E584572ADD411ED0E4D10ED070E,5a8b6b6c-e4ad-4fd9-aad9-895fa987ddf1,screen_profile,android,2023-10-13 12:55:57.752000+00:00,10,13,12,2023,41,4,286,370,46557,108,0,0
3916651,000D3A25D54580E011E707D4762E0C84,C5EC0144-6C3C-45C2-B0A4-A7A2B6B45444,close_app,ios,2023-10-14 06:00:21.515000+00:00,10,14,6,2023,41,5,287,371,21621,126,1,1
3916652,CEA31E584572ADD411ED0E4D10ED070E,5a8b6b6c-e4ad-4fd9-aad9-895fa987ddf1,screen_menu,android,2023-10-13 12:55:56.303000+00:00,10,13,12,2023,41,4,286,370,46556,108,0,0


In [6]:
def get_shift_features(data,time_col='absolute_time',user_col='ClientUUId',prefix=''):
    shift_time = data.groupby(user_col)[time_col].agg(lambda x: np.diff(np.sort(x)).tolist())
    shift_features = pd.DataFrame(index=shift_time.index)
    shift_features[f'{prefix}_mean'] = shift_time.apply(np.mean)
    shift_features[f'{prefix}_min'] = shift_time.apply(np.min)
    shift_features[f'{prefix}_max'] = shift_time.apply(np.max)
    shift_features[f'{prefix}_std'] = shift_time.apply(np.std)
    shift_features[f'{prefix}_polyfit'] = shift_time.apply(lambda x:np.polyfit(range(len(x)), x, 1)[1] if len(x) > 10 else -100)
    shift_features[f'{prefix}_polyfit_st'] = shift_time.apply(lambda x:np.polyfit(range(len(x)), x, 1)[0] if len(x) > 10 else -100)
    return shift_features

def get_shift_features(data,time_col='absolute_time',user_col='ClientUUId',prefix=''):
    shift_time = data.groupby(user_col)[time_col].agg(lambda x: np.diff(np.sort(x)).tolist())
    shift_features = pd.DataFrame(index=shift_time.index)
    shift_features[f'{prefix}_mean'] = shift_time.apply(np.mean)
    shift_features[f'{prefix}_min'] = shift_time.apply(np.min)
    shift_features[f'{prefix}_max'] = shift_time.apply(np.max)
    shift_features[f'{prefix}_std'] = shift_time.apply(np.std)
    shift_features[f'{prefix}_polyfit'] = shift_time.apply(lambda x:np.polyfit(range(len(x)), x, 1)[1] if len(x) > 10 else -100)
    shift_features[f'{prefix}_polyfit_st'] = shift_time.apply(lambda x:np.polyfit(range(len(x)), x, 1)[0] if len(x) > 10 else -100)
    return shift_features


def get_mobile_data_features(mobile_data):
    time_day_feats = mobile_data.groupby('ClientUUId')['absolute_time'].agg(['min','max','mean','std','nunique','count'])
    time_day_feats['diff'] = time_day_feats['max'] - time_day_feats['min']
    time_day_feats['otn_diff'] = time_day_feats['diff'] / time_day_feats['mean']
    time_day_feats['designity'] = time_day_feats['nunique'] / time_day_feats['count']
    time_day_feats.columns = [f'mobile_time_{x}' for x in time_day_feats.columns]
    
    time_shift_feats = get_shift_features(mobile_data,prefix='mobile_data')
    
    time_hour_features = mobile_data.groupby('ClientUUId')['hour'].agg(['min','max','mean','std','nunique'])
    time_hour_features.columns = [f'mobile_hour_{x}' for x in time_hour_features.columns]
    
    
    platform_featues = pd.pivot_table(
                             mobile_data,
                             index =['ClientUUId'],
                             values=['absolute_time'],
                             columns=['Platform'],
                             aggfunc=['count','nunique','mean','min','max','std']
    ).fillna(-1).sort_index()
    
    event_featues = pd.pivot_table(
                             mobile_data,
                             index =['ClientUUId'],
                             values=['absolute_time'],
                             columns=['EventName'],
                             aggfunc=['count','nunique','mean','min','max','std']
    ).fillna(-1).sort_index()
    
    day_of_week_featues = pd.pivot_table(
                             mobile_data,
                             index =['ClientUUId'],
                             values=['dayofweek'],
                             columns=['EventName'],
                             aggfunc=['count','nunique','mean']
    ).fillna(-1).sort_index()

    
    hour_counters = pd.pivot_table(
                             mobile_data,
                             index =['ClientUUId'],
                             values=['absolute_time'],
                             columns=['hour'],
                             aggfunc=['count','nunique','mean']
    ).fillna(-1).sort_index()
    
    platform_featues.columns = [f'mobile_platform_{x[0]}_{x[2]}' for x in platform_featues.columns]
    event_featues.columns = [f'mobile_event_{x[0]}_{x[2]}' for x in event_featues.columns]
    hour_counters.columns = [f'mobile_hour_{x[0]}_{x[2]}' for x in hour_counters.columns]
    day_of_week_featues.columns = [f'mobile_dayofweek_{x[0]}_{x[2]}' for x in day_of_week_featues.columns]
    
    mobile_data['VisitToken_count'] = mobile_data.groupby('VisitToken')['EventName'].transform('count')
    visitors_features = mobile_data.groupby('ClientUUId')['VisitToken_count'].agg(['min','max','mean','std'])
    visitors_features.columns = [f'mobile_visitors_{x}' for x in visitors_features.columns]
    visitors_features['mobile_visitors_nunique'] = mobile_data.groupby('ClientUUId')['VisitToken'].agg('nunique')
    
    for col in event_featues.columns:
        event_featues[f'{col}_otn'] = event_featues[col] / time_day_feats['mobile_time_count']
    for col in platform_featues.columns:
        platform_featues[f'{col}_otn'] = platform_featues[col] / time_day_feats['mobile_time_count']
    
    return pd.concat([
        time_day_feats,
        time_hour_features,
        platform_featues,
        event_featues,
        hour_counters,
        visitors_features,
        time_shift_feats
    ],axis=1)

In [7]:
mobile_features = get_mobile_data_features(mobile_data)

In [8]:
def get_clients_features(clients_data):
    clients_data['vugoda'] = clients_data['Discount'] / clients_data['OrderPrice']
    clients_data['Id+OrderType'] = clients_data['Id'].astype(str) + clients_data['OrderType'].astype(str)
    
    order_price_feats = clients_data.groupby('ClientUUId')['OrderPrice'].agg(['mean','std','sum','mean','max','count'])
    Discount_feats = clients_data.groupby('ClientUUId')['Discount'].agg(['mean','std','sum','mean','max'])
    vugoda_feats = clients_data.groupby('ClientUUId')['vugoda'].agg(['mean','std','sum','mean','max'])
    
    order_price_feats.columns = [f'clients_order_price_{x}' for x in order_price_feats.columns]
    Discount_feats.columns = [f'clients_discount_{x}' for x in Discount_feats.columns]
    vugoda_feats.columns = [f'clients_vugoda_{x}' for x in vugoda_feats.columns]
    
    id_features = pd.pivot_table(
                             clients_data,
                             index =['ClientUUId'],
                             values=['OrderPrice',],
                             columns=['Id'],
                             aggfunc=['count','nunique']
    ).fillna(-1).sort_index()
    
    order_features = pd.pivot_table(
                             clients_data,
                             index =['ClientUUId'],
                             values=['OrderPrice',],
                             columns=['OrderType'],
                             aggfunc=['count','nunique']
    ).fillna(-1).sort_index()
    
    orderid_features = pd.pivot_table(
                             clients_data,
                             index =['ClientUUId'],
                             values=['OrderPrice'],
                             columns=['Id+OrderType'],
                             aggfunc=['count',]
    ).fillna(-1).sort_index()
    
    id_features.columns = [f'clients_id_{x[0]}_{x[1]}_{x[2]}' for x in id_features.columns]
    order_features.columns = [f'clients_order_{x[0]}_{x[1]}_{x[2]}' for x in order_features.columns]
    orderid_features.columns = [f'clients_orderid_{x[0]}_{x[2]}' for x in orderid_features.columns]

    return pd.concat([
        id_features,
        order_features,
        orderid_features,
        order_price_feats,
        Discount_feats,
        vugoda_feats
    ],axis=1)

In [9]:
clients_features = get_clients_features(clients_data)

In [10]:
def get_order_data_feats(orders_data):
    time_day_feats = mobile_data.groupby('ClientUUId')['absolute_time'].agg(['min','max','mean','std','nunique','count'])
    time_day_feats['diff'] = time_day_feats['max'] - time_day_feats['min']
    time_day_feats['otn_diff'] = time_day_feats['diff'] / time_day_feats['mean']
    time_day_feats['designity'] = time_day_feats['nunique'] / time_day_feats['count']
    time_day_feats.columns = [f'order_time_{x}' for x in time_day_feats.columns]
    
    time_shift_feats = get_shift_features(orders_data,prefix='orders_data')
    
    orders_data = orders_data.fillna(0)
    orders_data['OrderUUId_count'] = orders_data.groupby('OrderUUId').transform('count')['NewClient']
    orders_data['addressId_count'] = orders_data.groupby('addressId').transform('count')['NewClient']
    orders_data['deliverySectorId_count'] = orders_data.groupby('deliverySectorId').transform('count')['NewClient']
    orders_data['UnitUUId_count'] = orders_data.groupby('UnitUUId').transform('count')['NewClient']
    orders_data['ProductUUId_count'] = orders_data.groupby('ProductUUId').transform('count')['NewClient']
    
    orders_data['OrderUUId_appl_mean'] = orders_data.groupby('OrderUUId')['apply_promo'].agg('mean').loc[orders_data['OrderUUId']].values
    orders_data['addressId_appl_mean'] = orders_data.groupby('addressId')['apply_promo'].agg('mean').loc[orders_data['addressId']].values
    orders_data['deliverySectorId_appl_mean'] = orders_data.groupby('deliverySectorId')['apply_promo'].agg('mean').loc[orders_data['deliverySectorId']].values
    orders_data['UnitUUId_appl_mean'] = orders_data.groupby('UnitUUId')['apply_promo'].agg('mean').loc[orders_data['UnitUUId']].values
    orders_data['ProductUUId_appl_mean'] = orders_data.groupby('ProductUUId')['apply_promo'].agg('mean').loc[orders_data['ProductUUId']].values

    orders_data['Skidka'] = orders_data['MenuPrice'] - orders_data['ProductTotalPrice']
    orders_data['Skidka_otn'] = orders_data['Skidka'] / orders_data['ProductTotalPrice']
    
    appl_features = orders_data.groupby('ClientUUId')['apply_promo'].agg(['mean','count','std','sum'])
    newcl_features = orders_data.groupby('ClientUUId')['NewClient'].agg(['sum'])
    appl_features.columns = [f'order_appl_{x}' for x in appl_features.columns]
    newcl_features.columns = [f'order_newcl_{x}' for x in newcl_features.columns]
    
    category_features = pd.pivot_table(
                             orders_data,
                             index =['ClientUUId'],
                             values=['MenuPrice'],
                             columns=['apply_promo','CategoryId'],
                             aggfunc=['count','sum']
    ).fillna(-1).sort_index()
    
    payment_features = pd.pivot_table(
                             orders_data,
                             index =['ClientUUId'],
                             values=['MenuPrice'],
                             columns=['apply_promo','OrderPaymentType'],
                             aggfunc=['count','nunique','sum','mean']
    ).fillna(-1).sort_index()

    
    order_features = pd.pivot_table(
                             orders_data,
                             index =['ClientUUId'],
                             values=['MenuPrice'],
                             columns=['apply_promo','OrderType'],
                             aggfunc=['count','nunique','sum','mean']
    ).fillna(-1).sort_index()
    
    order_state_features = pd.pivot_table(
                             orders_data,
                             index =['ClientUUId'],
                             values=['MenuPrice'],
                             columns=['apply_promo','OrderState'],
                             aggfunc=['count','nunique','sum','mean']
    ).fillna(-1).sort_index()


    features = pd.pivot_table(
                             orders_data,
                             index =['ClientUUId'],
                             values=['MenuPrice','ClientOrderNumber',
                                     'ProductTotalPrice','OrderTotalPrice',
                                     'OrderUUId_count','Skidka','Skidka_otn',
                                     'addressId_count','deliverySectorId_count',
                                     'UnitUUId_count','ProductUUId_count',
                                     'OrderUUId_appl_mean','addressId_appl_mean',
                                     'deliverySectorId_appl_mean',
                                     'UnitUUId_appl_mean','ProductUUId_appl_mean',
                                     'hour','absolute_time','dayofweek'],
                             columns=['apply_promo'],
                             aggfunc=['mean','sum','min','max','std','nunique',lambda x:np.max(x) - np.min(x)]
        
    ).fillna(-1).sort_index()
    
    features.columns = [f'order_feats_{x[0]}_{x[1]}_{x[2]}' for x in features.columns]
    payment_features.columns = [f'order_payment_{x[0]}_{x[1]}_{x[2]}_{x[3]}' for x in payment_features.columns]
    order_features.columns = [f'order_order_{x[0]}_{x[1]}_{x[2]}_{x[3]}' for x in order_features.columns]
    category_features.columns = [f'order_order_{x[0]}_{x[1]}_{x[2]}_{x[3]}' for x in category_features.columns]
    order_state_features.columns = [f'order_order_{x[0]}_{x[1]}_{x[2]}_{x[3]}' for x in order_state_features.columns]
    
    return pd.concat([
        appl_features,
        newcl_features,
        payment_features,
        order_features,
        time_day_feats,
        order_state_features,
        time_shift_feats,
        features,
    ],axis=1)

In [11]:
order_features = get_order_data_feats(orders_data)

  aggfunc=['mean','sum','min','max','std','nunique',lambda x:np.max(x) - np.min(x)]


In [12]:
mobile_embeds = pd.read_parquet('mobile_embeds.parquet')
order_embeds = pd.read_parquet('order_embeds.parquet')

In [13]:
mobile_embeds.columns = [f"mobile_{x}" for x in mobile_embeds.columns]
order_embeds.columns = [f"order_{x}" for x in order_embeds.columns]

In [14]:
all_features = pd.concat([
    order_features,
    clients_features,
    mobile_features,
    mobile_embeds,
    order_embeds
],axis=1).fillna(-100)

In [15]:
all_features = all_features.T.drop_duplicates().T
all_features['ClientUUId'] = all_features.index
all_features.index = range(len(all_features))

In [16]:
all_features.shape

(30000, 774)

In [17]:
best_feats = np.load('best_features4.npy')

In [18]:
best_feats

array(['Discount', 'order_appl_mean', 'order_appl_std',
       'order_payment_count_MenuPrice_0_1',
       'order_payment_mean_MenuPrice_1_0',
       'order_order_count_MenuPrice_0_1', 'order_order_sum_MenuPrice_0_1',
       'order_order_sum_MenuPrice_1_3', 'order_order_sum_MenuPrice_1_4',
       'order_order_mean_MenuPrice_1_4',
       'order_feats_mean_ProductTotalPrice_1',
       'order_feats_mean_dayofweek_1',
       'order_feats_sum_ClientOrderNumber_1',
       'order_feats_sum_OrderUUId_count_0', 'order_feats_sum_Skidka_1',
       'order_feats_sum_hour_1', 'order_feats_min_ClientOrderNumber_1',
       'order_feats_min_ProductTotalPrice_1',
       'order_feats_min_deliverySectorId_count_1',
       'order_feats_max_ClientOrderNumber_1',
       'order_feats_max_ProductTotalPrice_1',
       'order_feats_max_ProductUUId_count_1', 'order_feats_max_Skidka_1',
       'order_feats_max_UnitUUId_appl_mean_1',
       'order_feats_std_MenuPrice_1', 'order_feats_std_OrderTotalPrice_1',
       

In [19]:
train_data = train_data.merge(all_features)

In [37]:
params = {
    'iterations':1000,
    'learning_rate':0.01,
    'loss_function':'CrossEntropy',
    'max_depth':7,
    'eval_metric':'AUC',
    'task_type':'GPU',
    'random_seed':56
}

drop_cols = [
    'LocalBeginDate',
    'LocalEndDate',
    'ClientUUId',
]

cat_cols = [
    'Id',
    'OrderType',
]

label_col = 'apply_promo'
num_fold = 5
n_fold_test = 10
test_c_stop = 3
num_repits = 1

In [38]:
train_data = train_data[best_feats.tolist() + drop_cols + [label_col,"Id",'OrderType']]

KeyError: "['order_appl_mean', 'order_appl_std', 'order_payment_count_MenuPrice_0_1', 'order_payment_mean_MenuPrice_1_0', 'order_order_count_MenuPrice_0_1', 'order_order_sum_MenuPrice_0_1', 'order_order_sum_MenuPrice_1_3', 'order_order_sum_MenuPrice_1_4', 'order_order_mean_MenuPrice_1_4', 'order_feats_mean_ProductTotalPrice_1', 'order_feats_mean_dayofweek_1', 'order_feats_sum_ClientOrderNumber_1', 'order_feats_sum_OrderUUId_count_0', 'order_feats_sum_Skidka_1', 'order_feats_sum_hour_1', 'order_feats_min_ClientOrderNumber_1', 'order_feats_min_ProductTotalPrice_1', 'order_feats_min_deliverySectorId_count_1', 'order_feats_max_ClientOrderNumber_1', 'order_feats_max_ProductTotalPrice_1', 'order_feats_max_ProductUUId_count_1', 'order_feats_max_Skidka_1', 'order_feats_max_UnitUUId_appl_mean_1', 'order_feats_std_MenuPrice_1', 'order_feats_std_OrderTotalPrice_1', 'order_feats_std_absolute_time_1', 'order_feats_std_hour_1', 'order_feats_nunique_ClientOrderNumber_1', 'order_feats_nunique_OrderTotalPrice_1', 'order_feats_nunique_ProductTotalPrice_1', 'order_feats_nunique_Skidka_1', 'order_feats_nunique_hour_1', 'order_feats_<lambda>_ProductTotalPrice_1', 'order_feats_<lambda>_ProductUUId_count_1', 'order_feats_<lambda>_absolute_time_1', 'order_feats_<lambda>_hour_1', 'clients_id_count_OrderPrice_7', 'clients_orderid_count_71,2,3', 'mobile_event_nunique_apply_personal_offer', 'mobile_event_nunique_close_app', 'mobile_event_nunique_open_app', 'mobile_event_min_apply_personal_offer', 'mobile_event_max_apply_personal_offer', 'mobile_event_std_apply_personal_offer', 'mobile_event_nunique_apply_personal_offer_otn', 'mobile_event_nunique_screen_cart_otn', 'mobile_event_mean_apply_personal_offer_otn', 'mobile_event_mean_open_bonusaction_otn', 'mobile_event_max_apply_personal_offer_otn', 'mobile_event_std_apply_personal_offer_otn', 'mobile_hour_nunique_6', 'mobile_hour_nunique_7', 'mobile_hour_nunique_8', 'mobile_hour_nunique_9', 'mobile_hour_nunique_20', 'mobile_hour_nunique_22', 'mobile_hour_mean_10', 'mobile_hour_mean_22', 'mobile_emb_0008', 'order_emb_0001', 'order_emb_0014', 'order_emb_0052', 'order_emb_0061'] not in index"

In [None]:
class CatBoostKfoldWraper(BaseEstimator):
    def __init__(self,num_folds,num_repits,params,random_state=56):
        self.models = []
        self.params = params
        self.random_state = random_state
        self.num_folds = num_folds
        
    def fit(self,train_data,cat_features=None,drop_cols=None,label_col=None,verbose=False):
        scores = []
        
        for i in trange(num_repits):
            kfold = StratifiedGroupKFold(self.num_folds,random_state=self.random_state+i,shuffle=True)
            for train_index, test_index in (kfold.split(train_data,train_data[label_col],groups=train_data['ClientUUId'])):
                train_df = train_data.iloc[train_index]
                test_df = train_data.iloc[test_index]

                train_pool = Pool(
                    train_df.drop([label_col]+drop_cols,axis=1),
                    label = train_df[label_col],
                    cat_features = cat_features
                )

                eval_pool = Pool(
                    test_df.drop([label_col]+drop_cols,axis=1),
                    label = test_df[label_col],
                    cat_features = cat_features
                )

                cbm = CatBoostClassifier(**self.params)
                cbm.fit(train_pool,eval_set=eval_pool,verbose=verbose)

                score = roc_auc_score(test_df[label_col],cbm.predict_proba(eval_pool)[:,1])
                scores += [score]
                self.models += [cbm]
        #print(f"Total Score {np.mean(scores)}")
            
    def predict(self,test_data,drop_cols=None,cat_features=None):
        test_pool = Pool(
            test_data.drop(drop_cols,axis=1),
            cat_features=cat_features
        )
        preds = np.mean([model.predict_proba(test_pool)[:,1] for model in self.models],axis=0)
        return preds
    
    def get_feature_importance(self):
        imp_0 = self.models[0].get_feature_importance(prettified=True).set_index('Feature Id')
        for i in range(1,len(self.models)):
            imp_0 += self.models[i].get_feature_importance(prettified=True).set_index('Feature Id')
        return (imp_0 / len(self.models)).sort_values(by='Importances')[::-1]

In [39]:
def model_builder(train_data):
    model = CatBoostKfoldWraper(num_fold,num_repits,params)
    
    model.fit(
        train_data,
        cat_features=cat_cols,
        drop_cols=drop_cols,
        label_col=label_col,
        verbose=500
    )
    
    return model

def model_predicter(model,test_data):
    return model.predict(test_data,drop_cols=drop_cols,cat_features=cat_cols)

In [40]:
class TestKFoldWrapper():
    def __init__(self,num_folds=10,top_c=1,random_state=56):
        self.top_c = top_c
        self.kfold = StratifiedGroupKFold(num_folds,random_state=random_state,shuffle=True)
    
    def run_experiments(self,model_builder,model_predicter,train_data,label_col=None):
        self.models = []
        self.scores = []
        c = 0
        
        for train_index, test_index in tqdm(self.kfold.split(train_data,train_data[label_col],groups=train_data['ClientUUId'])):
            train_df = train_data.iloc[train_index]
            test_df = train_data.iloc[test_index]
            
            model = model_builder(train_df)
            score = roc_auc_score(test_df[label_col],model_predicter(model,test_df))
            self.scores.append(score)
            self.models.append(model)
            c += 1
            if c >= self.top_c:
                break
        
        print(f"Total Score {np.mean(self.scores)}")


In [41]:
evaluator = TestKFoldWrapper(n_fold_test,test_c_stop)

evaluator.run_experiments(
    model_builder=model_builder,
    model_predicter=model_predicter,
    train_data=train_data,
    label_col=label_col
)

0it [00:00, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5291578	best: 0.5291578 (0)	total: 25.2ms	remaining: 25.1s
500:	test: 0.5681150	best: 0.5723687 (185)	total: 10.8s	remaining: 10.7s
999:	test: 0.5614500	best: 0.5723687 (185)	total: 23.1s	remaining: 0us
bestTest = 0.5723686516
bestIteration = 185
Shrink model to first 186 iterations.


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5498991	best: 0.5498991 (0)	total: 16.7ms	remaining: 16.7s
500:	test: 0.5816314	best: 0.5854330 (315)	total: 11.1s	remaining: 11.1s
999:	test: 0.5759275	best: 0.5854330 (315)	total: 23.8s	remaining: 0us
bestTest = 0.5854329765
bestIteration = 315
Shrink model to first 316 iterations.


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5465097	best: 0.5465097 (0)	total: 15.5ms	remaining: 15.5s
500:	test: 0.5407131	best: 0.5613647 (5)	total: 11s	remaining: 10.9s
999:	test: 0.5427504	best: 0.5613647 (5)	total: 23.5s	remaining: 0us
bestTest = 0.5613647252
bestIteration = 5
Shrink model to first 6 iterations.


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5541841	best: 0.5541841 (0)	total: 14ms	remaining: 14s
500:	test: 0.5910054	best: 0.5924819 (430)	total: 12.4s	remaining: 12.3s
999:	test: 0.5981155	best: 0.5994857 (900)	total: 25.1s	remaining: 0us
bestTest = 0.5994856656
bestIteration = 900
Shrink model to first 901 iterations.


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5240374	best: 0.5240374 (0)	total: 14.4ms	remaining: 14.4s
500:	test: 0.5476362	best: 0.5483635 (480)	total: 10.4s	remaining: 10.4s
999:	test: 0.5541857	best: 0.5569019 (870)	total: 25.1s	remaining: 0us
bestTest = 0.556901902
bestIteration = 870
Shrink model to first 871 iterations.


  0%|          | 0/1 [00:00<?, ?it/s]

Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5421959	best: 0.5421959 (0)	total: 22.8ms	remaining: 22.8s
500:	test: 0.5547471	best: 0.5619089 (45)	total: 12.3s	remaining: 12.2s
999:	test: 0.5474137	best: 0.5619089 (45)	total: 26s	remaining: 0us
bestTest = 0.5619088858
bestIteration = 45
Shrink model to first 46 iterations.


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5373290	best: 0.5373290 (0)	total: 18ms	remaining: 18s
500:	test: 0.5556414	best: 0.5556414 (500)	total: 12.4s	remaining: 12.4s
999:	test: 0.5509958	best: 0.5606341 (705)	total: 25.9s	remaining: 0us
bestTest = 0.5606341064
bestIteration = 705
Shrink model to first 706 iterations.


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5341925	best: 0.5341925 (0)	total: 15ms	remaining: 15s
500:	test: 0.5528321	best: 0.5556626 (5)	total: 10.9s	remaining: 10.8s
999:	test: 0.5564939	best: 0.5585209 (750)	total: 23.8s	remaining: 0us
bestTest = 0.5585209429
bestIteration = 750
Shrink model to first 751 iterations.


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5405416	best: 0.5405416 (0)	total: 14.4ms	remaining: 14.4s
500:	test: 0.5580577	best: 0.5673683 (65)	total: 9.66s	remaining: 9.62s
999:	test: 0.5596487	best: 0.5673683 (65)	total: 21s	remaining: 0us
bestTest = 0.5673682988
bestIteration = 65
Shrink model to first 66 iterations.


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5479324	best: 0.5479324 (0)	total: 15.8ms	remaining: 15.8s
500:	test: 0.5664322	best: 0.5691692 (15)	total: 9.85s	remaining: 9.81s
999:	test: 0.5626872	best: 0.5691692 (15)	total: 24s	remaining: 0us
bestTest = 0.5691692382
bestIteration = 15
Shrink model to first 16 iterations.


  0%|          | 0/1 [00:00<?, ?it/s]

Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5591819	best: 0.5591819 (0)	total: 15.5ms	remaining: 15.5s
500:	test: 0.5334298	best: 0.5688922 (130)	total: 10.8s	remaining: 10.7s
999:	test: 0.5227798	best: 0.5688922 (130)	total: 23.4s	remaining: 0us
bestTest = 0.5688921809
bestIteration = 130
Shrink model to first 131 iterations.


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5508027	best: 0.5508027 (0)	total: 14.3ms	remaining: 14.3s
500:	test: 0.5319194	best: 0.5508027 (0)	total: 11.1s	remaining: 11.1s
999:	test: 0.5118139	best: 0.5508027 (0)	total: 24.2s	remaining: 0us
bestTest = 0.5508027226
bestIteration = 0
Shrink model to first 1 iterations.


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5255332	best: 0.5255332 (0)	total: 16.2ms	remaining: 16.2s
500:	test: 0.5534925	best: 0.5565796 (430)	total: 11.2s	remaining: 11.2s
999:	test: 0.5535735	best: 0.5565796 (430)	total: 22.6s	remaining: 0us
bestTest = 0.5565796196
bestIteration = 430
Shrink model to first 431 iterations.


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5433789	best: 0.5433789 (0)	total: 14.8ms	remaining: 14.8s
500:	test: 0.5561716	best: 0.5627559 (300)	total: 9.89s	remaining: 9.85s
999:	test: 0.5677490	best: 0.5686696 (905)	total: 22.9s	remaining: 0us
bestTest = 0.5686695576
bestIteration = 905
Shrink model to first 906 iterations.


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5314177	best: 0.5314177 (0)	total: 14.4ms	remaining: 14.4s
500:	test: 0.5417268	best: 0.5426583 (420)	total: 9.67s	remaining: 9.63s
999:	test: 0.5460235	best: 0.5470753 (785)	total: 22.4s	remaining: 0us
bestTest = 0.5470752716
bestIteration = 785
Shrink model to first 786 iterations.
Total Score 0.5662169399812113


In [None]:
evaluator.models[0].get_feature_importance()[:60]

In [71]:
num_repits = 3
model = model_builder(train_data)

  0%|          | 0/3 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [30]:
test_data = test_data.merge(all_features)

In [31]:
test_data = test_data[best_feats.tolist() + drop_cols + ['Id','OrderType']]

In [32]:
preds = evaluator.models[0].predict(test_data,drop_cols=drop_cols,cat_features=cat_cols)
preds += evaluator.models[1].predict(test_data,drop_cols=drop_cols,cat_features=cat_cols)
preds += evaluator.models[2].predict(test_data,drop_cols=drop_cols,cat_features=cat_cols)
preds /= 3
#preds = model.predict(test_data,drop_cols=drop_cols,cat_features=cat_cols)

In [35]:
sample_submit['apply_promo'] = preds  + pd.read_csv('./submits/sub_dodo15.csv').apply_promo

In [36]:
sample_submit.to_csv('./submits/sub_dodo17.csv',index=False)