In [1]:
import cudf
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from lifelines.utils import concordance_index
from sklearn.model_selection import train_test_split,KFold,StratifiedKFold
from sklearn.base import BaseEstimator
from catboost import CatBoostClassifier,CatBoostRegressor,Pool,cv
from sksurv.ensemble import ComponentwiseGradientBoostingSurvivalAnalysis
from sksurv.ensemble import GradientBoostingSurvivalAnalysis

In [2]:
# Load every input table into a cuDF DataFrame.
# 'transaction_dttm' is parsed as a datetime so the .dt accessor can be used on it later.
transaction_data = cudf.read_csv('./transactions.csv',parse_dates=['transaction_dttm'])
clients_data = cudf.read_csv('./clients.csv')
train_data = cudf.read_csv('./train.csv')
report_data = cudf.read_csv('./report_dates.csv')
# presumably the sample submission file; it is not used in the cells visible here -- TODO confirm
simple_sol = cudf.read_csv('./sample_submit_naive.csv')

In [3]:
def mounth_count_day(x):
    """Return the number of days in the first ``x`` months of a non-leap year.

    ``x`` is used as a slice bound over the twelve month lengths, so ``0``
    gives ``0``, ``1`` gives ``31`` and ``12`` gives ``365``.
    """
    days_per_month = (31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31)
    total = 0
    for days in days_per_month[:x]:
        total += days
    return total

def create_time_features(df, reference_year=2021):
    """Add calendar and time-of-day columns derived from ``transaction_dttm``.

    The frame is modified in place and also returned.

    Parameters
    ----------
    df : DataFrame
        Must contain a datetime column named ``transaction_dttm``.
    reference_year : int, default 2021
        Year that ``year`` and ``all_time`` are measured against.

    Returns
    -------
    DataFrame
        The same ``df`` object with these columns added:

        * ``month``, ``day``, ``hour``, ``minute``, ``second``, ``dayofweek`` --
          raw datetime components.
        * ``year`` -- ``reference_year - calendar_year`` (years *before* the
          reference year).
        * ``week`` -- day of month modulo 7 (NOT a week number).
        * ``weekofyear`` -- day of year modulo 7 (NOT a week number).
        * ``month_count`` -- ``30 * month``, a rough "days up to this month".
        * ``all_day_time`` -- seconds since midnight.
        * ``hour_time`` -- seconds since the start of the hour.
        * ``all_week_time`` -- hours since the start of the week.
        * ``all_month_time`` -- fractional hours, ``day * 24 + hour + minute / 60``.
        * ``all_year_time`` -- fractional days, ``month_count + day + hour / 24``.
        * ``all_time`` -- approximate day number relative to ``reference_year``.
        * ``cl_early_morning`` -- 1 when ``4 < hour <= 8``.
        * ``cl_is_weekend`` -- 1 when ``dayofweek > 4``.
    """
    stamp = df['transaction_dttm'].dt

    # Raw components, stored exactly as the datetime accessor returns them.
    df['month'] = stamp.month
    df['day'] = stamp.day
    df['minute'] = stamp.minute
    df['second'] = stamp.second
    df['hour'] = stamp.hour
    df['dayofweek'] = stamp.dayofweek
    df['year'] = reference_year - stamp.year

    # Datetime components can be narrow integers (cuDF returns int16), so they
    # are widened before arithmetic: hour * 3600 does not fit into int16.
    month = df['month'].astype('int64')
    day = df['day'].astype('int64')
    hour = df['hour'].astype('int64')
    minute = df['minute'].astype('int64')
    second = df['second'].astype('int64')
    dayofweek = df['dayofweek'].astype('int64')
    years_before_reference = df['year'].astype('int64')

    # NOTE: despite their names these two are residues modulo 7, not week numbers.
    df['week'] = day % 7
    df['weekofyear'] = stamp.dayofyear.astype('int64') % 7
    # Every month is approximated as 30 days.
    df['month_count'] = month * 30
    month_count = df['month_count']

    df['all_day_time'] = hour * 3600 + minute * 60 + second
    df['all_week_time'] = dayofweek * 24 + hour
    df['all_year_time'] = month_count + day + hour / 24
    # 'year' already holds reference_year - calendar_year, so the year offset in
    # days is -year * 365. Subtracting the reference year from it a second time
    # would make later calendar years produce smaller values.
    df['all_time'] = month_count + day - years_before_reference * 365
    df['all_month_time'] = day * 24 + hour + minute / 60
    df['hour_time'] = minute * 60 + second

    df['cl_early_morning'] = ((df['hour'] > 4) & (df['hour'] <= 8)).astype('int16')
    df['cl_is_weekend'] = (df['dayofweek'] > 4).astype('int16')
    return df

In [4]:
# Add the derived calendar / time-of-day columns to the transactions
# (create_time_features writes the columns onto the frame it is given and returns it).
transaction_data = create_time_features(transaction_data)

In [5]:
def _time_user_stats(df, value_col, prefix):
    """Summarise ``value_col`` per user, sorted by ``user_id``.

    Columns are ``{prefix}_mean/std/min/max/median`` plus ``{prefix}_diff``
    (max - min).
    """
    stats = df.groupby('user_id')[value_col].agg(['mean', 'std', 'min', 'max', 'median'])
    stats.columns = [f'{prefix}_{c}' for c in stats.columns]
    stats[f'{prefix}_diff'] = stats[f'{prefix}_max'] - stats[f'{prefix}_min']
    return stats.sort_index()


def _time_amount_embeds(df, by_col, prefix, user_index):
    """Pivot ``transaction_amt`` statistics per user over the values of ``by_col``.

    The pivot is flattened to positional columns ``{prefix}_0 .. {prefix}_k`` and
    labelled with ``user_index``, which must list the same users in ascending
    ``user_id`` order (the pivot rows are sorted the same way).
    """
    pivot = cudf.pivot_table(
        df,
        index=['user_id'],
        values=['transaction_amt'],
        columns=[by_col],
        aggfunc=['count', 'mean', 'max', 'std', 'min'],
    ).fillna(0).sort_index()

    embeds = cudf.DataFrame(pivot.values)
    embeds.columns = [f'{prefix}_{i}' for i in embeds.columns]
    embeds.index = user_index
    return embeds


def generate_time_features(df):
    """Build the per-user time feature table.

    Parameters
    ----------
    df : DataFrame
        Transactions with the columns ``user_id``, ``transaction_amt``,
        ``all_day_time``, ``all_year_time``, ``all_month_time``, ``dayofweek``,
        ``hour`` and ``month``.

    Returns
    -------
    DataFrame
        One row per ``user_id`` (sorted ascending) holding, in this order:
        summary statistics of ``all_day_time``, ``all_year_time`` and
        ``all_month_time``, followed by the flattened ``transaction_amt`` pivots
        over ``dayofweek``, ``hour`` and ``month``.

    Notes
    -----
    All aggregations read from ``df`` itself, so the function also works on a
    subset or a differently named frame.

    TODO: add a transaction-rate feature, count / (max - min) over ``all_time``.
    """
    # Column prefixes (including the 'featues' spelling) are kept as they are
    # because downstream code refers to the resulting column names.
    day_stats = _time_user_stats(df, 'all_day_time', 'time_day_features')
    year_stats = _time_user_stats(df, 'all_year_time', 'time_year_featues')
    month_stats = _time_user_stats(df, 'all_month_time', 'time_month_features')

    # cuDF groupby does not sort its result by default, while the pivots below are
    # explicitly sorted. Labelling them with the *sorted* user index keeps every
    # row attached to the right user.
    user_index = day_stats.index

    dayofweek_embeds = _time_amount_embeds(df, 'dayofweek', 'dayofweek_embeds', user_index)
    hour_embeds = _time_amount_embeds(df, 'hour', 'hour_embeds', user_index)
    month_embeds = _time_amount_embeds(df, 'month', 'month_embeds', user_index)

    return cudf.concat(
        [day_stats, year_stats, month_stats, dayofweek_embeds, hour_embeds, month_embeds],
        axis=1,
    )

In [6]:
# Build the per-user time feature table from the transactions
# (requires the columns added by create_time_features).
time_features = generate_time_features(transaction_data)

In [7]:
def prepare_mcc(df, min_count=40_000):
    """Add an ``important_mcc`` column that keeps only frequent MCC codes.

    Parameters
    ----------
    df : DataFrame
        Must contain an ``mcc_code`` column. Modified in place.
    min_count : int, default 40_000
        A code is kept when it occurs strictly more than ``min_count`` times in
        ``df``; every other code is replaced by ``-1``.

    Returns
    -------
    DataFrame
        The same ``df`` object with ``important_mcc`` added.
    """
    # Count on the frame that was passed in, not on a notebook-level global.
    code_counts = df['mcc_code'].value_counts()
    frequent_codes = code_counts[code_counts > min_count].index.values.tolist()

    # Column-wise replacement: no per-row Python loop and no copy of the whole
    # column to host memory.
    is_frequent = df['mcc_code'].isin(frequent_codes)
    df['important_mcc'] = df['mcc_code'].where(is_frequent, -1)
    return df

# Add the 'important_mcc' column; make_money_features pivots on it.
transaction_data = prepare_mcc(transaction_data)

In [10]:
def make_money_features(df):
    """Build the per-user spending feature table.

    Parameters
    ----------
    df : DataFrame
        Transactions with the columns ``user_id``, ``transaction_amt`` and
        ``important_mcc``.

    Returns
    -------
    DataFrame
        One row per ``user_id`` (sorted ascending) with:

        * ``simple_features_mean/std/min/max/median/sum`` of ``transaction_amt``;
        * ``simple_features_diff`` -- max - min;
        * ``simple_features_step`` -- mean / diff (not finite when a user's
          amounts are all equal, because diff is then 0);
        * ``mony_embeds_0 .. mony_embeds_k`` -- flattened pivot of
          count/mean/max/std/min of ``transaction_amt`` per ``important_mcc``.
    """
    simple_features = df.groupby('user_id')['transaction_amt'].agg(
        ['mean', 'std', 'min', 'max', 'median', 'sum'])
    simple_features.columns = [f'simple_features_{c}' for c in simple_features.columns]
    simple_features['simple_features_diff'] = (
        simple_features['simple_features_max'] - simple_features['simple_features_min'])
    simple_features['simple_features_step'] = (
        simple_features['simple_features_mean'] / simple_features['simple_features_diff'])
    # cuDF groupby does not sort its result by default; sort now so that this
    # index matches the row order of the sorted pivot below.
    simple_features = simple_features.sort_index()

    # Pivot the frame that was passed in, not a notebook-level global.
    mcc_pivot = cudf.pivot_table(
        df,
        index=['user_id'],
        values=['transaction_amt'],
        columns=['important_mcc'],
        aggfunc=['count', 'mean', 'max', 'std', 'min'],
    ).fillna(0).sort_index()

    # Flatten the pivot's column MultiIndex into positional names. The
    # 'mony_embeds_' prefix is kept because downstream code uses these names.
    mcc_embeds = cudf.DataFrame(mcc_pivot.values)
    mcc_embeds.columns = [f'mony_embeds_{i}' for i in mcc_embeds.columns]
    mcc_embeds.index = simple_features.index

    return cudf.concat([simple_features, mcc_embeds], axis=1)

In [11]:
# Per-user spending features.
mony_features = make_money_features(transaction_data)

# Index the client attributes by user_id so they line up with the feature tables.
# Guarded so that re-running this cell does not fail once 'user_id' has already
# been moved into the index.
if 'user_id' in clients_data.columns:
    clients_data = clients_data.set_index('user_id')

# Column-wise join of all per-user tables, aligned on their index.
all_data = cudf.concat([mony_features, time_features, clients_data], axis=1)

In [12]:
# Pick the feature rows of the users listed in train_data and convert the
# result to a pandas DataFrame in one step.
all_train_df = all_data.loc[train_data['user_id']].to_pandas()
all_train_df

Unnamed: 0_level_0,simple_features_mean,simple_features_std,simple_features_min,simple_features_max,simple_features_median,simple_features_sum,simple_features_diff,simple_features_step,mony_embeds_0,mony_embeds_1,...,month_embeds_54,month_embeds_55,month_embeds_56,month_embeds_57,month_embeds_58,month_embeds_59,report,employee_count_nm,bankemplstatus,customer_age
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3,1246.037876,60860.880477,-153866.890625,104011.960937,4549.455078,13706.416641,257878.851562,0.004832,1.0,7.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,2,ОТ 101 ДО 500,0,3
13,-5668.971780,23507.953991,-58740.300781,70322.828125,-10529.004883,-124717.379150,129063.128906,-0.043924,0.0,0.0,...,-17234.970703,-16453.037109,-16394.193359,0.000000,0.000000,0.000000,6,ОТ 501 ДО 1000,0,2
37,-1053.522538,3913.114835,-35782.984375,5487.140625,-236.420776,-331859.599463,41270.125000,-0.025527,14.0,2.0,...,-5205.989258,-26518.769531,0.000000,0.000000,0.000000,0.000000,5,БОЛЕЕ 1001,0,2
41,-6786.663385,5251.737982,-16841.208984,-290.766998,-6328.293701,-108586.614166,16550.441986,-0.410059,5.0,0.0,...,0.000000,0.000000,0.000000,0.000000,-16841.208984,-11552.795898,1,ОТ 101 ДО 500,0,2
42,193.721817,6735.085596,-19526.582031,45706.445312,-321.756958,11429.587215,65233.027344,0.002970,6.0,1.0,...,0.000000,0.000000,-1110.479004,-3498.598633,-898.395203,-2185.843994,12,ДО 10,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
561824,-504.927428,1046.917405,-6747.823730,1270.197021,-280.176743,-74729.259329,8018.020752,-0.062974,22.0,4.0,...,0.000000,0.000000,-984.325684,-6747.823730,-6155.848633,-6629.865723,12,,0,0
562043,-799.493409,2136.038913,-13020.519531,-40.769005,-239.170517,-29581.256115,12979.750526,-0.061595,1.0,0.0,...,0.000000,0.000000,0.000000,-13020.519531,-1346.094482,-2458.228760,12,,0,2
562312,-331.032523,309.587795,-1372.377075,-28.292030,-271.973526,-18537.821270,1344.085045,-0.246288,0.0,0.0,...,0.000000,0.000000,-1308.028442,-1372.377075,-868.479248,-1017.174133,12,,0,0
562721,-1929.467785,4172.623511,-18981.269531,5412.773926,-399.343048,-164004.761685,24394.043457,-0.079096,2.0,19.0,...,0.000000,0.000000,-429.593597,-10965.872070,-12050.644531,-18981.269531,12,,0,2


In [13]:
# Convert the training table to pandas. Guarded so that re-running this cell
# is a no-op once train_data is already a pandas DataFrame (which has no .to_pandas()).
if isinstance(train_data, cudf.DataFrame):
    train_data = train_data.to_pandas()