In [102]:
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split, TimeSeriesSplit, StratifiedKFold, StratifiedGroupKFold
from sklearn.metrics import roc_auc_score
import random
from statistics import mean
from tqdm.auto import tqdm, trange
from sklearn.base import BaseEstimator

In [103]:
train_data = pd.read_csv('dodohack/Data Secrets First Cup/train_target.csv',parse_dates=['LocalBeginDate','LocalEndDate'])
test_data = pd.read_csv('dodohack/Data Secrets First Cup/test.csv',parse_dates=['LocalBeginDate','LocalEndDate'])
sample_submit = pd.read_csv('dodohack/Data Secrets First Cup/submit.csv')
orders_data = pd.read_csv('dodohack/Data Secrets First Cup/orders.csv',parse_dates=['SaleDate','Date'])
mobile_data = pd.read_csv('dodohack/Data Secrets First Cup/mobile_events.csv',parse_dates=['Timestamp'])
clients_data = pd.read_csv('dodohack/Data Secrets First Cup/clients_promo_october.csv',parse_dates=['LocalBeginDate','LocalEndDate'])

In [104]:
def mounth_count_day(x):
    months = [31,28,31,30,31,30,31,31,30,31,30,31]
    return sum(months[:x])

def create_time_features(df,time_col):
    df['month'] = df[time_col].dt.month
    df['day'] = df[time_col].dt.day
    df['hour'] = df[time_col].dt.hour
    df['year'] = df[time_col].dt.year
    df['weekofyear'] = df[time_col].apply(lambda x: x.weekofyear)
    df['dayofweek'] = df[time_col].dt.dayofweek
    df['dayofyear'] = df[time_col].dt.dayofyear
    df['absolute_time'] = (df[time_col] - df[time_col].min()).dt.days
    
    df['all_day_time'] = df[time_col].apply(lambda x: x.hour * 3600 + x.minute * 60 + x.second)
    df['all_week_time'] = df[time_col].apply(lambda x: x.dayofweek * 24 + x.hour)
    
    df['cl_early_morning'] = ((df['hour'] > 4) & (df['hour'] <= 8)).astype('int16')
    df['cl_is_weekend'] = (df['dayofweek'] > 4).astype('int16')
    return df

In [105]:
mobile_data = create_time_features(mobile_data,'Timestamp')
orders_data = create_time_features(orders_data,'SaleDate')

In [106]:
def get_shift_features(data,time_col='absolute_time',user_col='ClientUUId',prefix=''):
    shift_time = data.groupby(user_col)[time_col].agg(lambda x: np.diff(np.sort(x)).tolist())
    shift_features = pd.DataFrame(index=shift_time.index)
    shift_features[f'{prefix}_mean'] = shift_time.apply(np.mean)
    shift_features[f'{prefix}_min'] = shift_time.apply(np.min)
    shift_features[f'{prefix}_max'] = shift_time.apply(np.max)
    shift_features[f'{prefix}_std'] = shift_time.apply(np.std)
    shift_features[f'{prefix}_polyfit'] = shift_time.apply(lambda x:np.polyfit(range(len(x)), x, 1)[1] if len(x) > 10 else -100)
    shift_features[f'{prefix}_polyfit_st'] = shift_time.apply(lambda x:np.polyfit(range(len(x)), x, 1)[0] if len(x) > 10 else -100)
    return shift_features


def get_mobile_data_features(mobile_data):
    time_day_feats = mobile_data.groupby('ClientUUId')['absolute_time'].agg(['min','max','mean','std','nunique','count'])
    time_day_feats['diff'] = time_day_feats['max'] - time_day_feats['min']
    time_day_feats['otn_diff'] = time_day_feats['diff'] / time_day_feats['mean']
    time_day_feats['designity'] = time_day_feats['nunique'] / time_day_feats['count']
    time_day_feats.columns = [f'mobile_time_{x}' for x in time_day_feats.columns]
    
    time_shift_feats = get_shift_features(mobile_data,prefix='mobile_data')
    
    time_hour_features = mobile_data.groupby('ClientUUId')['hour'].agg(['min','max','mean','std','nunique'])
    time_hour_features.columns = [f'mobile_hour_{x}' for x in time_hour_features.columns]
    
    
    platform_featues = pd.pivot_table(
                             mobile_data,
                             index =['ClientUUId'],
                             values=['absolute_time'],
                             columns=['Platform'],
                             aggfunc=['count','nunique','mean','min','max','std']
    ).fillna(-1).sort_index()
    
    event_featues = pd.pivot_table(
                             mobile_data,
                             index =['ClientUUId'],
                             values=['absolute_time'],
                             columns=['EventName'],
                             aggfunc=['count','nunique','mean','min','max','std']
    ).fillna(-1).sort_index()
    
    day_of_week_featues = pd.pivot_table(
                             mobile_data,
                             index =['ClientUUId'],
                             values=['dayofweek'],
                             columns=['EventName'],
                             aggfunc=['count','nunique','mean']
    ).fillna(-1).sort_index()

    
    hour_counters = pd.pivot_table(
                             mobile_data,
                             index =['ClientUUId'],
                             values=['absolute_time'],
                             columns=['hour'],
                             aggfunc=['count','nunique','mean']
    ).fillna(-1).sort_index()
    
    platform_featues.columns = [f'mobile_platform_{x[0]}_{x[2]}' for x in platform_featues.columns]
    event_featues.columns = [f'mobile_event_{x[0]}_{x[2]}' for x in event_featues.columns]
    hour_counters.columns = [f'mobile_hour_{x[0]}_{x[2]}' for x in hour_counters.columns]
    day_of_week_featues.columns = [f'mobile_dayofweek_{x[0]}_{x[2]}' for x in day_of_week_featues.columns]
    
    mobile_data['VisitToken_count'] = mobile_data.groupby('VisitToken')['EventName'].transform('count')
    visitors_features = mobile_data.groupby('ClientUUId')['VisitToken_count'].agg(['min','max','mean','std'])
    visitors_features.columns = [f'mobile_visitors_{x}' for x in visitors_features.columns]
    visitors_features['mobile_visitors_nunique'] = mobile_data.groupby('ClientUUId')['VisitToken'].agg('nunique')
    
    for col in event_featues.columns:
        event_featues[f'{col}_otn'] = event_featues[col] / time_day_feats['mobile_time_count']
    for col in platform_featues.columns:
        platform_featues[f'{col}_otn'] = platform_featues[col] / time_day_feats['mobile_time_count']
    
    return pd.concat([
        time_day_feats,
        time_hour_features,
        platform_featues,
        event_featues,
        hour_counters,
        visitors_features,
        time_shift_feats
    ],axis=1)

In [107]:
mobile_features = get_mobile_data_features(mobile_data)

In [108]:
def get_clients_features(clients_data):
    clients_data['vugoda'] = clients_data['Discount'] / clients_data['OrderPrice']
    clients_data['Id+OrderType'] = clients_data['Id'].astype(str) + clients_data['OrderType'].astype(str)
    
    order_price_feats = clients_data.groupby('ClientUUId')['OrderPrice'].agg(['mean','std','sum','mean','max','count'])
    Discount_feats = clients_data.groupby('ClientUUId')['Discount'].agg(['mean','std','sum','mean','max','nunique'])
    vugoda_feats = clients_data.groupby('ClientUUId')['vugoda'].agg(['mean','std','sum','mean','max','nunique'])
    
    order_price_feats.columns = [f'clients_order_price_{x}' for x in order_price_feats.columns]
    Discount_feats.columns = [f'clients_discount_{x}' for x in Discount_feats.columns]
    vugoda_feats.columns = [f'clients_vugoda_{x}' for x in vugoda_feats.columns]
    
    id_features = pd.pivot_table(
                             clients_data,
                             index =['ClientUUId'],
                             values=['OrderPrice',],
                             columns=['Id'],
                             aggfunc=['count','nunique','sum']
    ).fillna(-1).sort_index()
    
    order_features = pd.pivot_table(
                             clients_data,
                             index =['ClientUUId'],
                             values=['OrderPrice',],
                             columns=['OrderType'],
                             aggfunc=['count','nunique','sum']
    ).fillna(-1).sort_index()
    
    orderid_features = pd.pivot_table(
                             clients_data,
                             index =['ClientUUId'],
                             values=['OrderPrice'],
                             columns=['Id+OrderType'],
                             aggfunc=['count']
    ).fillna(-1).sort_index()
    
    id_features.columns = [f'clients_id_{x[0]}_{x[1]}_{x[2]}' for x in id_features.columns]
    order_features.columns = [f'clients_order_{x[0]}_{x[1]}_{x[2]}' for x in order_features.columns]
    orderid_features.columns = [f'clients_orderid_{x[0]}_{x[2]}' for x in orderid_features.columns]

    return pd.concat([
        id_features,
        order_features,
        orderid_features,
        order_price_feats,
        Discount_feats,
        vugoda_feats
    ],axis=1)

In [109]:
clients_features = get_clients_features(clients_data)

In [110]:
def get_order_data_feats(orders_data):
    time_day_feats = mobile_data.groupby('ClientUUId')['absolute_time'].agg(['min','max','mean','std','nunique','count'])
    time_day_feats['diff'] = time_day_feats['max'] - time_day_feats['min']
    time_day_feats['otn_diff'] = time_day_feats['diff'] / time_day_feats['mean']
    time_day_feats['designity'] = time_day_feats['nunique'] / time_day_feats['count']
    time_day_feats.columns = [f'order_time_{x}' for x in time_day_feats.columns]
    
    time_shift_feats = get_shift_features(orders_data,prefix='orders_data')
    
    orders_data = orders_data.fillna(0)
    orders_data['OrderUUId_count'] = orders_data.groupby('OrderUUId').transform('count')['NewClient']
    orders_data['addressId_count'] = orders_data.groupby('addressId').transform('count')['NewClient']
    orders_data['deliverySectorId_count'] = orders_data.groupby('deliverySectorId').transform('count')['NewClient']
    orders_data['UnitUUId_count'] = orders_data.groupby('UnitUUId').transform('count')['NewClient']
    orders_data['ProductUUId_count'] = orders_data.groupby('ProductUUId').transform('count')['NewClient']
    
    orders_data['OrderUUId_appl_mean'] = orders_data.groupby('OrderUUId')['apply_promo'].agg('mean').loc[orders_data['OrderUUId']].values
    orders_data['addressId_appl_mean'] = orders_data.groupby('addressId')['apply_promo'].agg('mean').loc[orders_data['addressId']].values
    orders_data['deliverySectorId_appl_mean'] = orders_data.groupby('deliverySectorId')['apply_promo'].agg('mean').loc[orders_data['deliverySectorId']].values
    orders_data['UnitUUId_appl_mean'] = orders_data.groupby('UnitUUId')['apply_promo'].agg('mean').loc[orders_data['UnitUUId']].values
    orders_data['ProductUUId_appl_mean'] = orders_data.groupby('ProductUUId')['apply_promo'].agg('mean').loc[orders_data['ProductUUId']].values
    
    orders_data['OrderUUId_appl_sum'] = orders_data.groupby('OrderUUId')['apply_promo'].agg('sum').loc[orders_data['OrderUUId']].values
    orders_data['addressId_appl_sum'] = orders_data.groupby('addressId')['apply_promo'].agg('sum').loc[orders_data['addressId']].values
    orders_data['deliverySectorId_appl_sum'] = orders_data.groupby('deliverySectorId')['apply_promo'].agg('sum').loc[orders_data['deliverySectorId']].values
    orders_data['UnitUUId_appl_sum'] = orders_data.groupby('UnitUUId')['apply_promo'].agg('sum').loc[orders_data['UnitUUId']].values
    orders_data['ProductUUId_appl_sum'] = orders_data.groupby('ProductUUId')['apply_promo'].agg('sum').loc[orders_data['ProductUUId']].values


    orders_data['Skidka'] = orders_data['MenuPrice'] - orders_data['ProductTotalPrice']
    orders_data['Skidka_otn'] = orders_data['Skidka'] / orders_data['ProductTotalPrice']
    
    appl_features = orders_data.groupby('ClientUUId')['apply_promo'].agg(['mean','count','std','sum'])
    newcl_features = orders_data.groupby('ClientUUId')['NewClient'].agg(['sum'])
    appl_features.columns = [f'order_appl_{x}' for x in appl_features.columns]
    newcl_features.columns = [f'order_newcl_{x}' for x in newcl_features.columns]
    
    category_features = pd.pivot_table(
                             orders_data,
                             index =['ClientUUId'],
                             values=['MenuPrice','absolute_time'],
                             columns=['apply_promo','CategoryId'],
                             aggfunc=['count','sum','mean','std']
    ).fillna(-1).sort_index()
    
    payment_features = pd.pivot_table(
                             orders_data,
                             index =['ClientUUId'],
                             values=['MenuPrice','absolute_time'],
                             columns=['apply_promo','OrderPaymentType'],
                             aggfunc=['count','nunique','sum','mean','std']
    ).fillna(-1).sort_index()

    
    order_features = pd.pivot_table(
                             orders_data,
                             index =['ClientUUId'],
                             values=['MenuPrice','absolute_time'],
                             columns=['apply_promo','OrderType'],
                             aggfunc=['count','nunique','sum','mean','std']
    ).fillna(-1).sort_index()
    
    order_state_features = pd.pivot_table(
                             orders_data,
                             index =['ClientUUId'],
                             values=['MenuPrice'],
                             columns=['apply_promo','OrderState'],
                             aggfunc=['count','nunique','sum','mean']
    ).fillna(-1).sort_index()


    features = pd.pivot_table(
                             orders_data,
                             index =['ClientUUId'],
                             values=['MenuPrice','ClientOrderNumber',
                                     'ProductTotalPrice','OrderTotalPrice',
                                     'OrderUUId_count','Skidka','Skidka_otn',
                                     'addressId_count','deliverySectorId_count',
                                     'UnitUUId_count','ProductUUId_count',
                                     'OrderUUId_appl_mean','addressId_appl_mean',
                                     'deliverySectorId_appl_mean',
                                     'UnitUUId_appl_mean','ProductUUId_appl_mean',
                                     'OrderUUId_appl_sum','addressId_appl_sum',
                                     'deliverySectorId_appl_sum',
                                     'UnitUUId_appl_sum','ProductUUId_appl_sum',
                                     'hour','absolute_time','dayofweek','month','all_day_time'],
                             columns=['apply_promo'],
                             aggfunc=['mean','sum','min','max','std','nunique',lambda x:np.max(x) - np.min(x)]
        
    ).fillna(-1).sort_index()
    
    features.columns = [f'order_feats_{x[0]}_{x[1]}_{x[2]}' for x in features.columns]
    payment_features.columns = [f'order_payment_{x[0]}_{x[1]}_{x[2]}_{x[3]}' for x in payment_features.columns]
    order_features.columns = [f'order_order_{x[0]}_{x[1]}_{x[2]}_{x[3]}' for x in order_features.columns]
    category_features.columns = [f'order_order_{x[0]}_{x[1]}_{x[2]}_{x[3]}' for x in category_features.columns]
    order_state_features.columns = [f'order_order_{x[0]}_{x[1]}_{x[2]}_{x[3]}' for x in order_state_features.columns]
    
    return pd.concat([
        appl_features,
        newcl_features,
        payment_features,
        order_features,
        time_day_feats,
        order_state_features,
        time_shift_feats,
        features,
    ],axis=1)

In [111]:
order_features = get_order_data_feats(orders_data)

  aggfunc=['mean','sum','min','max','std','nunique',lambda x:np.max(x) - np.min(x)]


In [112]:
mobile_embeds = pd.read_parquet('mobile_embeds.parquet')
order_embeds = pd.read_parquet('order_embeds.parquet')

In [113]:
mobile_embeds.columns = [f"mobile_{x}" for x in mobile_embeds.columns]
order_embeds.columns = [f"order_{x}" for x in order_embeds.columns]

In [114]:
all_features = pd.concat([
    order_features,
    clients_features,
    mobile_features,
    mobile_embeds,
    order_embeds
],axis=1).fillna(-100)

In [115]:
all_features = all_features.T.drop_duplicates().T
all_features['ClientUUId'] = all_features.index
all_features.index = range(len(all_features))

In [116]:
all_features.shape

(30000, 919)

In [117]:
best_feats = np.load('best_features1.npy')

In [118]:
train_data['vigoda'] = train_data['Discount'] / train_data['OrderPrice']
train_data['pr'] =  train_data['OrderPrice'] - train_data['Discount']

In [158]:
params = {
    'iterations':1000,
    'learning_rate':0.01,
    'loss_function':'CrossEntropy',
    'max_depth':7,
    'eval_metric':'AUC',
    'task_type':'GPU',
    'random_seed':56
}

drop_cols = [
    'LocalBeginDate',
    'LocalEndDate',
    'ClientUUId',
]

cat_cols = [
    'Id',
    'OrderType',
]

label_col = 'apply_promo'
num_fold = 5
n_fold_test = 10
test_c_stop = 100
num_repits = 1

In [120]:
train_data = train_data.merge(all_features)

In [121]:
train_data = train_data[best_feats.tolist() + drop_cols + [label_col,'OrderPrice','vigoda','pr']]

In [159]:
class CatBoostKfoldWraper(BaseEstimator):
    def __init__(self,num_folds,num_repits,params,random_state=56):
        self.models = []
        self.params = params
        self.random_state = random_state
        self.num_folds = num_folds
        
    def fit(self,train_data,cat_features=None,drop_cols=None,label_col=None,verbose=False):
        scores = []
        
        for i in trange(num_repits):
            kfold = StratifiedGroupKFold(self.num_folds,random_state=self.random_state+i,shuffle=True)
            for train_index, test_index in (kfold.split(train_data,train_data[label_col],groups=train_data['ClientUUId'])):
                train_df = train_data.iloc[train_index]
                test_df = train_data.iloc[test_index]

                train_pool = Pool(
                    train_df.drop([label_col]+drop_cols,axis=1),
                    label = train_df[label_col],
                    cat_features = cat_features
                )

                eval_pool = Pool(
                    test_df.drop([label_col]+drop_cols,axis=1),
                    label = test_df[label_col],
                    cat_features = cat_features
                )

                cbm = CatBoostClassifier(**self.params)
                cbm.fit(train_pool,eval_set=eval_pool,verbose=verbose)

                score = roc_auc_score(test_df[label_col],cbm.predict_proba(eval_pool)[:,1])
                scores += [score]
                self.models += [cbm]
        #print(f"Total Score {np.mean(scores)}")
            
    def predict(self,test_data,drop_cols=None,cat_features=None):
        test_pool = Pool(
            test_data.drop(drop_cols,axis=1),
            cat_features=cat_features
        )
        preds = np.mean([model.predict_proba(test_pool)[:,1] for model in self.models],axis=0)
        return preds
    
    def get_feature_importance(self):
        imp_0 = self.models[0].get_feature_importance(prettified=True).set_index('Feature Id')
        for i in range(1,len(self.models)):
            imp_0 += self.models[i].get_feature_importance(prettified=True).set_index('Feature Id')
        return (imp_0 / len(self.models)).sort_values(by='Importances')[::-1]

In [160]:
def model_builder(train_data):
    model = CatBoostKfoldWraper(num_fold,num_repits,params)
    
    model.fit(
        train_data,
        cat_features=cat_cols,
        drop_cols=drop_cols,
        label_col=label_col,
        verbose=500
    )
    
    return model

def model_predicter(model,test_data):
    return model.predict(test_data,drop_cols=drop_cols,cat_features=cat_cols)

In [161]:
class TestKFoldWrapper():
    def __init__(self,num_folds=10,top_c=1,random_state=56):
        self.top_c = top_c
        self.kfold = StratifiedGroupKFold(num_folds,random_state=random_state,shuffle=True)
    
    def run_experiments(self,model_builder,model_predicter,train_data,label_col=None):
        self.models = []
        self.scores = []
        c = 0
        
        for train_index, test_index in tqdm(self.kfold.split(train_data,train_data[label_col],groups=train_data['ClientUUId'])):
            train_df = train_data.iloc[train_index]
            test_df = train_data.iloc[test_index]
            
            model = model_builder(train_df)
            score = roc_auc_score(test_df[label_col],model_predicter(model,test_df))
            self.scores.append(score)
            self.models.append(model)
            c += 1
            if c >= self.top_c:
                break
        
        print(f"Total Score {np.mean(self.scores)}")


In [162]:
evaluator = TestKFoldWrapper(n_fold_test,test_c_stop)

evaluator.run_experiments(
    model_builder=model_builder,
    model_predicter=model_predicter,
    train_data=train_data,
    label_col=label_col
)

0it [00:00, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5291578	best: 0.5291578 (0)	total: 13.9ms	remaining: 13.8s
500:	test: 0.7770278	best: 0.7790359 (355)	total: 23.5s	remaining: 23.5s
999:	test: 0.7727700	best: 0.7790359 (355)	total: 47.5s	remaining: 0us
bestTest = 0.7790359259
bestIteration = 355
Shrink model to first 356 iterations.


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5498991	best: 0.5498991 (0)	total: 21ms	remaining: 21s
500:	test: 0.7694749	best: 0.7695445 (490)	total: 23.8s	remaining: 23.7s
999:	test: 0.7705436	best: 0.7724399 (815)	total: 47.9s	remaining: 0us
bestTest = 0.7724398971
bestIteration = 815
Shrink model to first 816 iterations.


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5465097	best: 0.5465097 (0)	total: 22ms	remaining: 22s
500:	test: 0.7797034	best: 0.7891747 (75)	total: 23.6s	remaining: 23.5s
999:	test: 0.7727748	best: 0.7891747 (75)	total: 47.8s	remaining: 0us
bestTest = 0.7891747355
bestIteration = 75
Shrink model to first 76 iterations.


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5541841	best: 0.5541841 (0)	total: 18.2ms	remaining: 18.1s
500:	test: 0.7896367	best: 0.7903101 (405)	total: 24.1s	remaining: 24s
999:	test: 0.7938451	best: 0.7938451 (999)	total: 48.5s	remaining: 0us
bestTest = 0.7938451171
bestIteration = 999


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5240374	best: 0.5240374 (0)	total: 16.2ms	remaining: 16.2s
500:	test: 0.7700291	best: 0.7701522 (490)	total: 23.8s	remaining: 23.7s
999:	test: 0.7739676	best: 0.7740786 (860)	total: 47.9s	remaining: 0us
bestTest = 0.7740786374
bestIteration = 860
Shrink model to first 861 iterations.


  0%|          | 0/1 [00:00<?, ?it/s]

Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5421959	best: 0.5421959 (0)	total: 14.6ms	remaining: 14.6s
500:	test: 0.7950425	best: 0.7961564 (430)	total: 23.8s	remaining: 23.7s
999:	test: 0.7915820	best: 0.7961564 (430)	total: 48.9s	remaining: 0us
bestTest = 0.7961564064
bestIteration = 430
Shrink model to first 431 iterations.


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5373290	best: 0.5373290 (0)	total: 17ms	remaining: 16.9s
500:	test: 0.7641284	best: 0.7647828 (470)	total: 24.1s	remaining: 24s
999:	test: 0.7622771	best: 0.7652158 (670)	total: 48.9s	remaining: 0us
bestTest = 0.7652157545
bestIteration = 670
Shrink model to first 671 iterations.


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5341925	best: 0.5341925 (0)	total: 15.4ms	remaining: 15.4s
500:	test: 0.7702050	best: 0.7711396 (445)	total: 24.1s	remaining: 24s
999:	test: 0.7635940	best: 0.7711396 (445)	total: 48.5s	remaining: 0us
bestTest = 0.7711396217
bestIteration = 445
Shrink model to first 446 iterations.


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5405416	best: 0.5405416 (0)	total: 15.7ms	remaining: 15.7s
500:	test: 0.7697665	best: 0.7712349 (255)	total: 23.7s	remaining: 23.6s
999:	test: 0.7615603	best: 0.7712349 (255)	total: 48.3s	remaining: 0us
bestTest = 0.77123487
bestIteration = 255
Shrink model to first 256 iterations.


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5479324	best: 0.5479324 (0)	total: 15.2ms	remaining: 15.1s
500:	test: 0.7678386	best: 0.7699664 (170)	total: 23.8s	remaining: 23.7s
999:	test: 0.7679067	best: 0.7699664 (170)	total: 48.3s	remaining: 0us
bestTest = 0.7699663639
bestIteration = 170
Shrink model to first 171 iterations.


  0%|          | 0/1 [00:00<?, ?it/s]

Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5591819	best: 0.5591819 (0)	total: 16.1ms	remaining: 16s
500:	test: 0.7327316	best: 0.7327316 (500)	total: 24s	remaining: 23.9s
999:	test: 0.7339741	best: 0.7355404 (700)	total: 48.6s	remaining: 0us
bestTest = 0.7355404496
bestIteration = 700
Shrink model to first 701 iterations.


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5508027	best: 0.5508027 (0)	total: 16ms	remaining: 16s
500:	test: 0.8009461	best: 0.8084205 (230)	total: 24.1s	remaining: 24s
999:	test: 0.7871029	best: 0.8084205 (230)	total: 48.4s	remaining: 0us
bestTest = 0.8084205389
bestIteration = 230
Shrink model to first 231 iterations.


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5255332	best: 0.5255332 (0)	total: 14.8ms	remaining: 14.8s
500:	test: 0.8057029	best: 0.8098621 (355)	total: 23.9s	remaining: 23.8s
999:	test: 0.7913927	best: 0.8098621 (355)	total: 48.3s	remaining: 0us
bestTest = 0.8098620772
bestIteration = 355
Shrink model to first 356 iterations.


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5433789	best: 0.5433789 (0)	total: 14.5ms	remaining: 14.5s
500:	test: 0.7436132	best: 0.7436216 (495)	total: 24.3s	remaining: 24.2s
999:	test: 0.7529610	best: 0.7531469 (985)	total: 49s	remaining: 0us
bestTest = 0.7531468868
bestIteration = 985
Shrink model to first 986 iterations.


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5314177	best: 0.5314177 (0)	total: 16.2ms	remaining: 16.1s
500:	test: 0.7430389	best: 0.7476655 (385)	total: 23.7s	remaining: 23.6s
999:	test: 0.7403809	best: 0.7476655 (385)	total: 47.8s	remaining: 0us
bestTest = 0.7476655245
bestIteration = 385
Shrink model to first 386 iterations.


  0%|          | 0/1 [00:00<?, ?it/s]

Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5291526	best: 0.5291526 (0)	total: 15.7ms	remaining: 15.7s
500:	test: 0.7320710	best: 0.7320710 (500)	total: 23.9s	remaining: 23.8s
999:	test: 0.7350286	best: 0.7354928 (990)	total: 48.6s	remaining: 0us
bestTest = 0.7354928255
bestIteration = 990
Shrink model to first 991 iterations.


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5450152	best: 0.5450152 (0)	total: 16ms	remaining: 16s
500:	test: 0.7555572	best: 0.7558942 (490)	total: 23.9s	remaining: 23.8s
999:	test: 0.7598435	best: 0.7613446 (885)	total: 48.2s	remaining: 0us
bestTest = 0.761344552
bestIteration = 885
Shrink model to first 886 iterations.


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5396785	best: 0.5396785 (0)	total: 13.7ms	remaining: 13.7s
500:	test: 0.7577391	best: 0.7702219 (145)	total: 23.8s	remaining: 23.7s
999:	test: 0.7503097	best: 0.7702219 (145)	total: 47.9s	remaining: 0us
bestTest = 0.770221889
bestIteration = 145
Shrink model to first 146 iterations.


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5571058	best: 0.5571058 (0)	total: 15.4ms	remaining: 15.4s
500:	test: 0.8007773	best: 0.8017529 (410)	total: 23.8s	remaining: 23.7s
999:	test: 0.7946445	best: 0.8017529 (410)	total: 48.4s	remaining: 0us
bestTest = 0.8017529249
bestIteration = 410
Shrink model to first 411 iterations.


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5303495	best: 0.5303495 (0)	total: 15.2ms	remaining: 15.2s
500:	test: 0.7826564	best: 0.7840828 (380)	total: 24.2s	remaining: 24.1s
999:	test: 0.7760787	best: 0.7840828 (380)	total: 49s	remaining: 0us
bestTest = 0.7840827703
bestIteration = 380
Shrink model to first 381 iterations.


  0%|          | 0/1 [00:00<?, ?it/s]

Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5403728	best: 0.5403728 (0)	total: 15.1ms	remaining: 15.1s
500:	test: 0.7584716	best: 0.7649713 (330)	total: 23.9s	remaining: 23.8s
999:	test: 0.7506155	best: 0.7649713 (330)	total: 48.6s	remaining: 0us
bestTest = 0.7649713159
bestIteration = 330
Shrink model to first 331 iterations.


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5469549	best: 0.5469549 (0)	total: 15ms	remaining: 15s
500:	test: 0.7563549	best: 0.7568886 (460)	total: 23.7s	remaining: 23.6s
999:	test: 0.7571365	best: 0.7582009 (775)	total: 47.8s	remaining: 0us
bestTest = 0.7582008839
bestIteration = 775
Shrink model to first 776 iterations.


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5419255	best: 0.5419255 (0)	total: 14.9ms	remaining: 14.9s
500:	test: 0.7907553	best: 0.7918285 (475)	total: 23.6s	remaining: 23.5s
999:	test: 0.7934318	best: 0.7952258 (855)	total: 48.2s	remaining: 0us
bestTest = 0.7952257991
bestIteration = 855
Shrink model to first 856 iterations.


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5307242	best: 0.5307242 (0)	total: 14.2ms	remaining: 14.2s
500:	test: 0.7659968	best: 0.7666491 (485)	total: 24.1s	remaining: 24s
999:	test: 0.7720178	best: 0.7728277 (815)	total: 48.6s	remaining: 0us
bestTest = 0.7728276849
bestIteration = 815
Shrink model to first 816 iterations.


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5312662	best: 0.5312662 (0)	total: 18.1ms	remaining: 18.1s
500:	test: 0.7805716	best: 0.7853457 (245)	total: 24.1s	remaining: 24s
999:	test: 0.7719502	best: 0.7853457 (245)	total: 48.8s	remaining: 0us
bestTest = 0.7853457332
bestIteration = 245
Shrink model to first 246 iterations.


  0%|          | 0/1 [00:00<?, ?it/s]

Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5385603	best: 0.5385603 (0)	total: 31.5ms	remaining: 31.5s
500:	test: 0.7703426	best: 0.7730598 (365)	total: 23.9s	remaining: 23.8s
999:	test: 0.7594776	best: 0.7730598 (365)	total: 48.6s	remaining: 0us
bestTest = 0.773059845
bestIteration = 365
Shrink model to first 366 iterations.


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5258165	best: 0.5258165 (0)	total: 15.2ms	remaining: 15.2s
500:	test: 0.8008124	best: 0.8028598 (380)	total: 24s	remaining: 24s
999:	test: 0.7976022	best: 0.8028598 (380)	total: 48.3s	remaining: 0us
bestTest = 0.8028597832
bestIteration = 380
Shrink model to first 381 iterations.


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5522988	best: 0.5522988 (0)	total: 15.3ms	remaining: 15.3s
500:	test: 0.7690729	best: 0.7701645 (410)	total: 23.9s	remaining: 23.8s
999:	test: 0.7595562	best: 0.7701645 (410)	total: 48.2s	remaining: 0us
bestTest = 0.7701645494
bestIteration = 410
Shrink model to first 411 iterations.


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5321336	best: 0.5321336 (0)	total: 15.6ms	remaining: 15.6s
500:	test: 0.7495285	best: 0.7498019 (490)	total: 24.1s	remaining: 24s
999:	test: 0.7556458	best: 0.7560738 (990)	total: 48.7s	remaining: 0us
bestTest = 0.7560738325
bestIteration = 990
Shrink model to first 991 iterations.


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5577211	best: 0.5577211 (0)	total: 15ms	remaining: 15s
500:	test: 0.7631039	best: 0.7777802 (5)	total: 24.1s	remaining: 24s
999:	test: 0.7656584	best: 0.7777802 (5)	total: 48.6s	remaining: 0us
bestTest = 0.777780205
bestIteration = 5
Shrink model to first 6 iterations.


  0%|          | 0/1 [00:00<?, ?it/s]

Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5500532	best: 0.5500532 (0)	total: 15.4ms	remaining: 15.4s
500:	test: 0.7520258	best: 0.7532638 (355)	total: 23.8s	remaining: 23.7s
999:	test: 0.7445852	best: 0.7532638 (355)	total: 48.3s	remaining: 0us
bestTest = 0.7532638311
bestIteration = 355
Shrink model to first 356 iterations.


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5354156	best: 0.5354156 (0)	total: 15.8ms	remaining: 15.8s
500:	test: 0.7719339	best: 0.7723988 (485)	total: 23.9s	remaining: 23.8s
999:	test: 0.7656330	best: 0.7723988 (485)	total: 48.3s	remaining: 0us
bestTest = 0.7723987699
bestIteration = 485
Shrink model to first 486 iterations.


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5504219	best: 0.5504219 (0)	total: 14.3ms	remaining: 14.3s
500:	test: 0.7733331	best: 0.7734794 (455)	total: 23.7s	remaining: 23.7s
999:	test: 0.7725923	best: 0.7756743 (660)	total: 47.8s	remaining: 0us
bestTest = 0.7756743431
bestIteration = 660
Shrink model to first 661 iterations.


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5313740	best: 0.5313740 (0)	total: 15.4ms	remaining: 15.4s
500:	test: 0.8036038	best: 0.8047867 (400)	total: 23.6s	remaining: 23.5s
999:	test: 0.7949976	best: 0.8047867 (400)	total: 47.9s	remaining: 0us
bestTest = 0.8047867417
bestIteration = 400
Shrink model to first 401 iterations.


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5506247	best: 0.5506247 (0)	total: 14.8ms	remaining: 14.8s
500:	test: 0.7539645	best: 0.7545996 (485)	total: 24s	remaining: 23.9s
999:	test: 0.7538155	best: 0.7546250 (920)	total: 48.2s	remaining: 0us
bestTest = 0.7546249628
bestIteration = 920
Shrink model to first 921 iterations.


  0%|          | 0/1 [00:00<?, ?it/s]

Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5353601	best: 0.5353601 (0)	total: 14.7ms	remaining: 14.6s
500:	test: 0.7889388	best: 0.7893353 (465)	total: 23.7s	remaining: 23.7s
999:	test: 0.7825478	best: 0.7895210 (625)	total: 47.9s	remaining: 0us
bestTest = 0.7895209789
bestIteration = 625
Shrink model to first 626 iterations.


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5468577	best: 0.5468577 (0)	total: 15.3ms	remaining: 15.3s
500:	test: 0.7586719	best: 0.7586719 (500)	total: 23.5s	remaining: 23.4s
999:	test: 0.7545487	best: 0.7591575 (645)	total: 48s	remaining: 0us
bestTest = 0.7591575384
bestIteration = 645
Shrink model to first 646 iterations.


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5502254	best: 0.5502254 (0)	total: 16.6ms	remaining: 16.6s
500:	test: 0.7865313	best: 0.7873600 (435)	total: 23.6s	remaining: 23.5s
999:	test: 0.7902030	best: 0.7902030 (999)	total: 47.9s	remaining: 0us
bestTest = 0.7902029753
bestIteration = 999


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5315709	best: 0.5315709 (0)	total: 14.7ms	remaining: 14.7s
500:	test: 0.8184702	best: 0.8216069 (260)	total: 24.4s	remaining: 24.3s
999:	test: 0.8126925	best: 0.8216069 (260)	total: 48.7s	remaining: 0us
bestTest = 0.8216068745
bestIteration = 260
Shrink model to first 261 iterations.


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5361612	best: 0.5361612 (0)	total: 15ms	remaining: 15s
500:	test: 0.7109149	best: 0.7140241 (365)	total: 24.2s	remaining: 24.1s
999:	test: 0.7048585	best: 0.7140241 (365)	total: 48.7s	remaining: 0us
bestTest = 0.7140240669
bestIteration = 365
Shrink model to first 366 iterations.


  0%|          | 0/1 [00:00<?, ?it/s]

Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5431921	best: 0.5431921 (0)	total: 16.7ms	remaining: 16.7s
500:	test: 0.8052099	best: 0.8052099 (500)	total: 24s	remaining: 23.9s
999:	test: 0.8090035	best: 0.8103076 (885)	total: 48.6s	remaining: 0us
bestTest = 0.8103075624
bestIteration = 885
Shrink model to first 886 iterations.


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5283962	best: 0.5283962 (0)	total: 15.9ms	remaining: 15.9s
500:	test: 0.7480584	best: 0.7511666 (385)	total: 23.5s	remaining: 23.4s
999:	test: 0.7382894	best: 0.7511666 (385)	total: 48s	remaining: 0us
bestTest = 0.7511665821
bestIteration = 385
Shrink model to first 386 iterations.


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5483219	best: 0.5483219 (0)	total: 15.8ms	remaining: 15.7s
500:	test: 0.7670751	best: 0.7717038 (300)	total: 24.2s	remaining: 24.1s
999:	test: 0.7493001	best: 0.7717038 (300)	total: 48.5s	remaining: 0us
bestTest = 0.7717037797
bestIteration = 300
Shrink model to first 301 iterations.


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5358802	best: 0.5358802 (0)	total: 14.3ms	remaining: 14.3s
500:	test: 0.7816867	best: 0.7816867 (500)	total: 23.7s	remaining: 23.6s
999:	test: 0.7795882	best: 0.7844696 (650)	total: 48.4s	remaining: 0us
bestTest = 0.7844696045
bestIteration = 650
Shrink model to first 651 iterations.


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5487879	best: 0.5487879 (0)	total: 14.8ms	remaining: 14.8s
500:	test: 0.7446346	best: 0.7446346 (500)	total: 24s	remaining: 23.9s
999:	test: 0.7485260	best: 0.7491128 (910)	total: 48.5s	remaining: 0us
bestTest = 0.7491128445
bestIteration = 910
Shrink model to first 911 iterations.


  0%|          | 0/1 [00:00<?, ?it/s]

Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5406818	best: 0.5406818 (0)	total: 15.8ms	remaining: 15.7s
500:	test: 0.7761292	best: 0.7766906 (410)	total: 24s	remaining: 23.9s
999:	test: 0.7772942	best: 0.7772971 (995)	total: 48.4s	remaining: 0us
bestTest = 0.7772971392
bestIteration = 995
Shrink model to first 996 iterations.


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5541808	best: 0.5541808 (0)	total: 14.6ms	remaining: 14.6s
500:	test: 0.7468292	best: 0.7468292 (500)	total: 24.5s	remaining: 24.4s
999:	test: 0.7535661	best: 0.7536559 (995)	total: 49.2s	remaining: 0us
bestTest = 0.7536558509
bestIteration = 995
Shrink model to first 996 iterations.


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5442208	best: 0.5442208 (0)	total: 16ms	remaining: 15.9s
500:	test: 0.7789164	best: 0.7796447 (85)	total: 24s	remaining: 23.9s
999:	test: 0.7714816	best: 0.7796447 (85)	total: 48.6s	remaining: 0us
bestTest = 0.7796447277
bestIteration = 85
Shrink model to first 86 iterations.


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5236980	best: 0.5236980 (0)	total: 15.7ms	remaining: 15.7s
500:	test: 0.8105212	best: 0.8105212 (500)	total: 24s	remaining: 23.9s
999:	test: 0.8018584	best: 0.8106487 (505)	total: 48.9s	remaining: 0us
bestTest = 0.8106486797
bestIteration = 505
Shrink model to first 506 iterations.


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5409864	best: 0.5409864 (0)	total: 14.6ms	remaining: 14.6s
500:	test: 0.7757629	best: 0.7763822 (395)	total: 24.4s	remaining: 24.3s
999:	test: 0.7722932	best: 0.7763822 (395)	total: 49.6s	remaining: 0us
bestTest = 0.7763822079
bestIteration = 395
Shrink model to first 396 iterations.
Total Score 0.7713927758654888


In [34]:
evaluator.models[0].get_feature_importance()[10:70]

Unnamed: 0_level_0,Importances
Feature Id,Unnamed: 1_level_1
mobile_platform_count_ios_otn,2.139177
clients_id_count_OrderPrice_7,2.064758
order_feats_min_absolute_time_0,2.064053
order_feats_sum_ProductUUId_count_0,2.048825
order_feats_mean_hour_1,2.03761
mobile_event_count_screen_menu_otn,2.011414
order_emb_0120,1.993875
order_feats_min_UnitUUId_appl_mean_1,1.973122
order_feats_std_hour_1,1.884915
order_feats_nunique_ClientOrderNumber_1,1.819715


In [165]:
test_data = test_data.merge(all_features)

In [166]:
test_data['vigoda'] = test_data['Discount'] / test_data['OrderPrice']
test_data['pr'] =  test_data['OrderPrice'] - test_data['Discount']

In [167]:
test_data = test_data[best_feats.tolist() + drop_cols +  ['OrderPrice','vigoda','pr']]

In [213]:
preds = evaluator.models[].predict(test_data,drop_cols=drop_cols,cat_features=cat_cols)
#preds = evaluator.models[1].predict(test_data,drop_cols=drop_cols,cat_features=cat_cols)
#preds = evaluator.models[2].predict(test_data,drop_cols=drop_cols,cat_features=cat_cols)
#preds /= 3
#preds = model.predict(test_data,drop_cols=drop_cols,cat_features=cat_cols)

In [214]:
preds = preds * 0.05
preds += pd.read_csv('submits/sub_dodo35.csv').apply_promo

In [215]:
sample_submit['apply_promo'] = preds

In [217]:
sample_submit.to_csv('./submits/sub_dodo65.csv',index=False)

In [61]:
test_pool = Pool(
    test_data.drop(drop_cols,axis=1),
    cat_features=cat_cols
)

In [154]:
preds = new_models.models[4].predict_proba(test_pool)[:,1] * 0.05

In [155]:
preds += pd.read_csv('submits/sub_dodo35.csv')['apply_promo']

In [146]:
new_models = model_builder(train_data)

  0%|          | 0/1 [00:00<?, ?it/s]

KeyboardInterrupt: 