In [1]:
import cudf
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from lifelines.utils import concordance_index
from sklearn.model_selection import train_test_split,KFold,StratifiedKFold
from sklearn.base import BaseEstimator
import itertools
from catboost import CatBoostClassifier,CatBoostRegressor,Pool,cv
import xgboost as xgb
from xgbse.non_parametric import get_time_bins
from xgbse import (
    XGBSEKaplanNeighbors,
    XGBSEKaplanTree,
    XGBSEDebiasedBCE,
    XGBSEBootstrapEstimator
)
from xgbse.converters import (
    convert_data_to_xgb_format,
    convert_to_structured
)

In [25]:
import pandas as pd
import warnings

warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

In [2]:
transaction_data = cudf.read_csv('./transactions.csv',parse_dates=['transaction_dttm'])
clients_data = cudf.read_csv('./clients.csv')
train_data = cudf.read_csv('./train.csv').to_pandas()
report_data = cudf.read_csv('./report_dates.csv')
simple_sol = cudf.read_csv('./sample_submit_naive.csv')

In [3]:
def mounth_count_day(x):
    mounths = [31,28,31,30,31,30,31,31,30,31,30,31]
    return sum(mounths[:x])

def create_time_features(df):
    df['month'] = df['transaction_dttm'].dt.month
    df['day'] = df['transaction_dttm'].dt.day
    df['week'] = df['transaction_dttm'].dt.day.map(lambda x: x // 7)
    df['minute'] = df['transaction_dttm'].dt.minute
    df['second'] = df['transaction_dttm'].dt.second
    df['month_count'] = df['transaction_dttm'].dt.month.map(lambda x: 30 * x)
    df['hour'] = df['transaction_dttm'].dt.hour
    df['year'] = 2021 - df['transaction_dttm'].dt.year

    df['weekofyear'] = df['transaction_dttm'].dt.dayofyear.map(lambda x: x // 7)
    df['dayofweek'] = df['transaction_dttm'].dt.dayofweek
    df['all_day_time'] = df.apply(lambda x:x.hour * 3600 + x.minute * 60 + x.second)
    df['all_week_time'] = df.apply(lambda x:x.dayofweek * 24 + x.hour)
    df['all_year_time'] = df.apply(lambda x: x.month_count + x.day + x.hour / 24)
    df['all_time'] = df.apply(lambda x:(x.year - 2021)*365 + x.month_count + x.day + x.hour / 24)
    df['all_month_time'] = df.apply(lambda x:x.day * 24 + x.hour + x.minute / 60)
    df['hour_time'] = df.apply(lambda x:x.minute * 60 + x.second)

    df['cl_early_morning'] = ((df['hour'] > 4) & (df['hour'] <= 8)).astype('int16')
    df['cl_is_weekend'] = (df['dayofweek'] > 4).astype('int16')
    return df

In [4]:
transaction_data = create_time_features(transaction_data)

In [5]:
def generate_time_features(df): ## ADD cnt / (max - min)
    time_day_features = df.groupby('user_id')['all_day_time'].agg(['mean', 'std', 'min', 'max', 'median','count'])
    time_day_features.columns = [f'time_day_features_{c}' for c in time_day_features.columns]
    time_day_features['time_day_features_diff'] = time_day_features['time_day_features_max'] - time_day_features['time_day_features_min']

    time_year_featues = df.groupby('user_id')['all_year_time'].agg(['mean', 'std', 'min', 'max', 'median',lambda x: x.quantile(0.25),lambda x: x.quantile(0.75),'count'])
    time_year_featues.columns = [f'time_year_featues_{c}' for c in time_year_featues.columns]
    time_year_featues['time_year_featues_diff'] = time_year_featues['time_year_featues_max'] - time_year_featues['time_year_featues_min']
    time_year_featues['time_year_featues_interval'] = time_year_featues['time_year_featues_count'] / time_year_featues['time_year_featues_diff']    

    all_time_featues = df.groupby('user_id')['all_time'].agg(['mean', 'std', 'min', 'max','count','median'])
    all_time_featues.columns = [f'all_time_featues_{c}' for c in all_time_featues.columns]
    all_time_featues['all_time_featues_diff'] = all_time_featues['all_time_featues_count'] / (all_time_featues['all_time_featues_max'] - all_time_featues['all_time_featues_min'])

    time_month_features = df.groupby('user_id')['all_month_time'].agg(['mean', 'std', 'min', 'max', 'median'])
    time_month_features.columns = [f'time_month_features_{c}' for c in time_month_features.columns]
    time_month_features['time_month_features_diff'] = time_month_features['time_month_features_max'] - time_month_features['time_month_features_min']


    dayofweek_embeds = cudf.pivot_table(
                             transaction_data,
                             index =['user_id'],
                             values=['transaction_amt'],
                             columns=['dayofweek'],
                             aggfunc=['count','mean','max','std','min']).fillna(0).sort_index() # 'median','sum'
    
    hour_embeds = cudf.pivot_table(
                             transaction_data,
                             index =['user_id'],
                             values=['transaction_amt'],
                             columns=['hour'],
                             aggfunc=['count','mean','max','std','min',]).fillna(0).sort_index()
    

    month_embeds = cudf.pivot_table(
                             transaction_data,
                             index =['user_id'],
                             values=['transaction_amt'],
                             columns=['month'],
                             aggfunc=['count','mean','max','std','min']).fillna(0).sort_index()
    
    dayofweek_embeds = cudf.DataFrame(dayofweek_embeds.values)
    hour_embeds = cudf.DataFrame(hour_embeds.values)
    month_embeds = cudf.DataFrame(month_embeds.values)
    
    dayofweek_embeds.columns = [f'dayofweek_embeds_{i}' for i in dayofweek_embeds.columns]
    hour_embeds.columns = [f'hour_embeds_{i}' for i in hour_embeds.columns]
    month_embeds.columns = [f'month_embeds_{i}' for i in month_embeds.columns]
    
    dayofweek_embeds.index = time_day_features.index
    hour_embeds.index = time_day_features.index
    month_embeds.index = time_day_features.index

    
    time_feartures = cudf.concat([time_day_features.sort_index(),
                                time_year_featues.sort_index(),
                                time_month_features.sort_index(),
                                dayofweek_embeds,
                                hour_embeds,
                                month_embeds],axis=1)

    return time_feartures

In [6]:
def get_corr_cov_features(df):
    feat_year_time_corr_ps = df[['transaction_amt','all_time','user_id']].groupby('user_id').corr('pearson')
    feat_year_time_corr_ps = feat_year_time_corr_ps.values[:,1][::2]
    #feat_year_time_corr_sp = df[['transaction_amt','all_time','user_id']].groupby('user_id').corr('spearman')
    #feat_year_time_corr_sp = feat_year_time_corr_sp.values[:,1][::2]
    
    feat_hour_corr_ps = df[['transaction_amt','hour','user_id']].groupby('user_id').corr('pearson')
    feat_hour_corr_ps = feat_hour_corr_ps.values[:,1][::2]
    #feat_hour_corr_sp = df[['transaction_amt','hour','user_id']].groupby('user_id').corr('spearman')
    #feat_hour_corr_sp = feat_hour_corr_sp.values[:,1][::2]
    
    feat_dayofweek_corr_ps = df[['transaction_amt','dayofweek','user_id']].groupby('user_id').corr('pearson')
    feat_dayofweek_corr_ps = feat_dayofweek_corr_ps.values[:,1][::2]
    #feat_dayofweek_corr_sp = df[['transaction_amt','dayofweek','user_id']].groupby('user_id').corr('spearman')
    #feat_dayofweek_corr_sp = feat_dayofweek_corr_sp.values[:,1][::2]
    
    feat_year_time_cov = df[['transaction_amt','all_time','user_id']].groupby('user_id').cov()
    feat_year_time_cov_0 = feat_year_time_cov.values[:,1][::2]
    feat_year_time_cov_1 = feat_year_time_cov.values[:,0][::2]
    
    feat_hour_cov = df[['transaction_amt','hour','user_id']].groupby('user_id').cov()
    feat_hour_cov_0 = feat_year_time_cov.values[:,1][::2]
    feat_hour_cov_1 = feat_year_time_cov.values[:,0][::2]    

    feat_dayofweek_cov = df[['transaction_amt','dayofweek','user_id']].groupby('user_id').cov()
    feat_dayofweek_cov_0 = feat_year_time_cov.values[:,1][::2]
    feat_dayofweek_cov_1 = feat_year_time_cov.values[:,0][::2]    
    
    feature_df = cudf.DataFrame({'feat_year_time_corr_ps':feat_year_time_corr_ps,
                               #'feat_year_time_corr_sp':feat_year_time_corr_sp,
                               'feat_hour_corr_ps':feat_hour_corr_ps,
                               #'feat_hour_corr_sp':feat_hour_corr_sp,
                               'feat_dayofweek_corr_ps':feat_dayofweek_corr_ps,
                               #'feat_dayofweek_corr_sp':feat_dayofweek_corr_sp,
                               'feat_year_time_cov_0':feat_year_time_cov_0,
                               'feat_year_time_cov_1':feat_year_time_cov_1,
                               'feat_hour_cov_0':feat_hour_cov_0,
                               'feat_hour_cov_1':feat_hour_cov_1,
                               'feat_dayofweek_cov_0':feat_dayofweek_cov_0,
                               'feat_dayofweek_cov_1':feat_dayofweek_cov_1})
    return feature_df

In [7]:
time_features = generate_time_features(transaction_data)

In [8]:
corr_features = get_corr_cov_features(transaction_data)

In [9]:
corr_features.index = time_features.index

In [10]:
def prepare_mcc(df):
    value_counts = transaction_data['mcc_code'].value_counts()
    important_mcc = value_counts[value_counts > 40_000].index.values.tolist()
    df['important_mcc'] = [x if x in important_mcc else -1 for x in df['mcc_code'].to_pandas()]
    return df

transaction_data = prepare_mcc(transaction_data)

In [11]:
def make_money_features(df):
    simple_features = df.groupby('user_id')['transaction_amt'].agg(['mean', 'std', 'min', 'max', 'median','sum',lambda x:x.quantile(0.25),lambda x: x.quantile(0.75)])
    simple_features.columns = [f'simple_features_{c}' for c in simple_features.columns]
    simple_features['simple_features_diff'] = simple_features['simple_features_max'] - simple_features['simple_features_min']
    simple_features['simple_features_step'] =  simple_features['simple_features_mean'] / simple_features['simple_features_diff']
    
    month_embeds = cudf.pivot_table(
                             transaction_data,
                             index =['user_id'],
                             values=['transaction_amt'],
                             columns=['important_mcc'],
                             aggfunc=['count','mean','max','std','min','median']).fillna(0).sort_index()
    
    df['currency_rk'] = df['currency_rk'].apply(lambda x: 0 if x > 1 else x)
    curr_embeds = cudf.pivot_table(
                             transaction_data,
                             index =['user_id'],
                             values=['transaction_amt'],
                             columns=['currency_rk'],
                             aggfunc=['count','mean','max','std','min','median']).fillna(0).sort_index()

    
    month_embeds = cudf.DataFrame(month_embeds.values)
    month_embeds.columns = [f'mony_embeds_{i}' for i in month_embeds.columns]
    month_embeds.index = simple_features.index
    
    curr_embeds = cudf.DataFrame(month_embeds.values)
    curr_embeds.columns = [f'curr_embeds_{i}' for i in month_embeds.columns]
    curr_embeds.index = simple_features.index

    
    money_features = cudf.concat([simple_features.sort_index(),
                                curr_embeds,month_embeds],axis=1)

    return money_features

In [12]:
mony_features = make_money_features(transaction_data)

In [13]:
clients_data = clients_data.set_index('user_id')
all_data = cudf.concat([mony_features,time_features,clients_data,corr_features],axis=1)
all_data

Unnamed: 0_level_0,simple_features_mean,simple_features_std,simple_features_min,simple_features_max,simple_features_median,simple_features_sum,simple_features_<lambda>,simple_features_diff,simple_features_step,curr_embeds_mony_embeds_0,...,customer_age,feat_year_time_corr_ps,feat_hour_corr_ps,feat_dayofweek_corr_ps,feat_year_time_cov_0,feat_year_time_cov_1,feat_hour_cov_0,feat_hour_cov_1,feat_dayofweek_cov_0,feat_dayofweek_cov_1
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3,1246.037876,60860.880477,-153866.890625,104011.960937,4549.455078,13706.416641,16861.306152,257878.851562,0.004832,1.0,...,3,0.575034,0.458957,0.366283,1.424515e+06,3.704047e+09,1.424515e+06,3.704047e+09,1.424515e+06,3.704047e+09
9,-3593.718520,11797.055119,-90147.617188,-45.579891,-840.509674,-323434.666813,-434.467491,90102.037296,-0.039885,18.0,...,3,0.156064,-0.097424,-0.020805,5.753988e+05,1.391705e+08,5.753988e+05,1.391705e+08,5.753988e+05,1.391705e+08
13,-5668.971780,23507.953991,-58740.300781,70322.828125,-10529.004883,-124717.379150,5211.026978,129063.128906,-0.043924,0.0,...,2,0.043895,0.268719,-0.086992,5.212550e+04,5.526239e+08,5.212550e+04,5.526239e+08,5.212550e+04,5.526239e+08
37,-1053.522538,3913.114835,-35782.984375,5487.140625,-236.420776,-331859.599463,-141.720711,41270.125000,-0.025527,14.0,...,2,0.067020,-0.034972,0.048981,1.193260e+04,1.531247e+07,1.193260e+04,1.531247e+07,1.193260e+04,1.531247e+07
41,-6786.663385,5251.737982,-16841.208984,-290.766998,-6328.293701,-108586.614166,-1537.244461,16550.441986,-0.410059,5.0,...,2,-0.439175,0.394066,-0.255335,-6.061952e+05,2.758075e+07,-6.061952e+05,2.758075e+07,-6.061952e+05,2.758075e+07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
562043,-799.493409,2136.038913,-13020.519531,-40.769005,-239.170517,-29581.256115,-102.463661,12979.750526,-0.061595,1.0,...,2,0.010758,0.134854,-0.141449,2.447804e+03,4.562662e+06,2.447804e+03,4.562662e+06,2.447804e+03,4.562662e+06
562205,-268.158335,590.682233,-6220.171387,1595.461060,-147.305283,-40491.908630,-27.259905,7815.632446,-0.034311,14.0,...,1,0.082231,-0.043995,-0.094764,1.568401e+04,3.489055e+05,1.568401e+04,3.489055e+05,1.568401e+04,3.489055e+05
562312,-331.032523,309.587795,-1372.377075,-28.292030,-271.973526,-18537.821270,-138.175877,1344.085045,-0.246288,0.0,...,0,-0.130948,-0.141864,0.110958,-1.143809e+04,9.584460e+04,-1.143809e+04,9.584460e+04,-1.143809e+04,9.584460e+04
562721,-1929.467785,4172.623511,-18981.269531,5412.773926,-399.343048,-164004.761685,-22.946484,24394.043457,-0.079096,2.0,...,2,0.085521,-0.238338,-0.043696,9.285091e+04,1.741079e+07,9.285091e+04,1.741079e+07,9.285091e+04,1.741079e+07


In [14]:
all_data['customer_age'] = all_data['customer_age'].astype('category')
all_data['employee_count_nm'] = all_data['employee_count_nm'].astype('category')
all_train_df = all_data.loc[train_data['user_id']].to_pandas()
all_train_df['label'] = train_data['target'].values
all_train_df['time'] = train_data['time'].values

In [15]:
all_train_df.columns = [x.replace('>','').replace('<','') for x in all_train_df.columns]

In [16]:
def build_xgb_cox_dmatrix(X, T, E):
    """Builds a XGB DMatrix using specified Data Frame of features (X)
        arrays of times (T) and censors/events (E).

    Args:
        X ([pd.DataFrame, np.array]): Data Frame to be converted to XGBDMatrix format.
        T ([np.array, pd.Series]): Array of times.
        E ([np.array, pd.Series]): Array of censors(False) / events(True).

    Returns:
        (DMatrix): A XGB DMatrix is returned including features and target.
    """

    target = np.where(E, T, -T)

    return xgb.DMatrix(X, label=target,enable_categorical=True)


In [31]:
class XGBKfoldWraper(BaseEstimator):
    def __init__(self,num_folds,params,random_state=56):
        self.models = []
        self.params = params
        self.kfold = StratifiedKFold(n_splits=num_folds,random_state=random_state,shuffle=True)
        
    def fit(self,train_data,cat_features=None,drop_cols=['time'],label_col='label',verbose=False):
        scores = []
        for train_index, test_index in tqdm(self.kfold.split(train_data,train_data[label_col])):
            train_df = train_data.iloc[train_index]
            test_df = train_data.iloc[test_index]
            
            train_pool = build_xgb_cox_dmatrix(train_df.drop(['label','time'],axis=1),train_df['time'],train_df['label'])
            eval_pool = build_xgb_cox_dmatrix(test_df.drop(['label','time'],axis=1),test_df['time'],test_df['label'])
            
            bst = xgb.train(
                            self.params,
                            train_pool,
                            num_boost_round=1000,
                            early_stopping_rounds=100,
                            evals=[(eval_pool, 'val')],
                            verbose_eval=50
                            )
            
            val_preds = bst.predict(eval_pool)
            score = concordance_index(test_df['time'], -val_preds, test_df['label'])
            print(score)
            scores += [score]
            self.models += [bst]
        print(f"Total Score {np.mean(scores)}")
        return scores
            
    def predict(self,test_data,cat_features=None):
        test_pool = xgb.DMatrix(test_data,enable_categorical=True)
        preds = np.mean([model.predict(test_pool) for model in self.models],axis=0)
        return preds

In [33]:
PARAMS_XGB_COX = {
    'objective': 'survival:cox',
    #'tree_method': 'hist', 
    'learning_rate': 0.01, 
    'max_cat_to_onehot':32,
    'max_depth': 5, 
    #'booster':'dart',
    'subsample':0.5,
    'min_child_weight': 50, 
    'colsample_bynode':0.5
}

In [26]:
model = XGBKfoldWraper(num_folds=5,params=PARAMS_XGB_COX,random_state=56)
model.fit(all_train_df)

0it [00:00, ?it/s]

[0]	val-cox-nloglik:9.26172
[50]	val-cox-nloglik:9.00208
[100]	val-cox-nloglik:8.89972
[150]	val-cox-nloglik:8.85002
[200]	val-cox-nloglik:8.82310
[250]	val-cox-nloglik:8.80432
[300]	val-cox-nloglik:8.79277
[350]	val-cox-nloglik:8.78611
[400]	val-cox-nloglik:8.77957
[450]	val-cox-nloglik:8.77517
[500]	val-cox-nloglik:8.77219
[550]	val-cox-nloglik:8.76943
[600]	val-cox-nloglik:8.76760
[650]	val-cox-nloglik:8.76586
[700]	val-cox-nloglik:8.76517
[750]	val-cox-nloglik:8.76447
[800]	val-cox-nloglik:8.76351
[850]	val-cox-nloglik:8.76331
[900]	val-cox-nloglik:8.76216
[950]	val-cox-nloglik:8.76180
[999]	val-cox-nloglik:8.76078
0.750204791579568
[0]	val-cox-nloglik:9.25311
[50]	val-cox-nloglik:9.00007
[100]	val-cox-nloglik:8.89893
[150]	val-cox-nloglik:8.84931
[200]	val-cox-nloglik:8.81953
[250]	val-cox-nloglik:8.79799
[300]	val-cox-nloglik:8.78599
[350]	val-cox-nloglik:8.77552
[400]	val-cox-nloglik:8.76819
[450]	val-cox-nloglik:8.76356
[500]	val-cox-nloglik:8.75948
[550]	val-cox-nloglik:8.7558

[0.750204791579568,
 0.7583705719759748,
 0.7526148574938034,
 0.7475887798531414,
 0.7687597143904629]

In [28]:
sample_sub = pd.read_csv('sample_submit_naive.csv')

In [39]:
all_data = all_data#.to_pandas()
test_data = all_data.loc[sample_sub['user_id']]
test_data.columns = [x.replace('>','').replace('<','') for x in test_data.columns]

In [40]:
preds = model.predict(test_data)

In [42]:
preds.mean()

0.8030144

In [43]:
sample_sub['predict'] = preds

In [44]:
sample_sub.to_csv('xgb_5f.csv',index=False)

In [46]:
catboost_sub = pd.read_csv('basev9.csv')
catboost_sub

Unnamed: 0,user_id,predict
0,9,0.025671
1,61,0.019911
2,62,0.041590
3,80,0.010960
4,88,0.147119
...,...,...
31995,561362,0.065828
31996,561419,0.060738
31997,561895,0.050331
31998,561908,0.076354


In [47]:
catboost_sub['predict'].mean()

0.0821827622456092

In [48]:
preds += catboost_sub['predict'] * 10
preds /= 2
sample_sub['predict'] = preds
sample_sub.to_csv('ens_5f.csv',index=False)

In [50]:
preds

0        0.232916
1        0.190738
2        0.376666
3        0.113898
4        1.341840
           ...   
31995    0.598635
31996    0.525355
31997    0.507601
31998    0.678707
31999    1.142556
Name: predict, Length: 32000, dtype: float32