In [117]:
import cudf
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from lifelines.utils import concordance_index
from sklearn.model_selection import train_test_split,KFold,StratifiedKFold
from scipy.stats import dweibull, skew, kstat, expectile
from seglearn.feature_functions import (willison_amplitude, emg_var, waveform_length, \
                      slope_sign_changes, zero_crossing, hist, mean_crossings, \
                      mse, means_abs_diff, variation)
import scipy.stats as sts
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import roc_auc_score
from sklearn.base import BaseEstimator
import matplotlib.pyplot as plt
from tsfresh.feature_extraction import feature_calculators as ts_calcs
from lifetimes import BetaGeoFitter, GammaGammaFitter
from lifetimes.plotting import plot_probability_alive_matrix
import itertools
from catboost import CatBoostClassifier,CatBoostRegressor,Pool,cv
import warnings
warnings.simplefilter('ignore', np.RankWarning)

In [2]:
transaction_data = cudf.read_csv('./transactions.csv',parse_dates=['transaction_dttm'])
clients_data = cudf.read_csv('./clients.csv')
train_data = cudf.read_csv('./train.csv')
report_data = cudf.read_csv('./report_dates.csv')
simple_sol = cudf.read_csv('./sample_submit_naive.csv')
reports_data = cudf.read_csv('report_dates.csv',parse_dates=['report_dt'])

In [4]:
clients_data_merge = clients_data.merge(reports_data)[['report_dt','user_id']]
transaction_data = transaction_data.merge(clients_data_merge)

In [10]:
def mounth_count_day(x):
    mounths = [31,28,31,30,31,30,31,31,30,31,30,31]
    return sum(mounths[:x])

def create_time_features(df):
    df['month'] = df['transaction_dttm'].dt.month
    df['day'] = df['transaction_dttm'].dt.day
    df['week'] = df['transaction_dttm'].dt.day.map(lambda x: x // 7)
    df['minute'] = df['transaction_dttm'].dt.minute
    df['second'] = df['transaction_dttm'].dt.second
    df['month_count'] = df['transaction_dttm'].dt.month.map(lambda x: 30 * x)
    df['hour'] = df['transaction_dttm'].dt.hour
    df['year'] = df['transaction_dttm'].dt.year
    
    df['weekofyear'] = df['transaction_dttm'].dt.dayofyear.map(lambda x: x // 7)
    df['dayofweek'] = df['transaction_dttm'].dt.dayofweek
    df['all_day_time'] = df.apply(lambda x:x.hour * 3600 + x.minute * 60 + x.second)
    df['all_week_time'] = df.apply(lambda x:x.dayofweek * 24 + x.hour)
    df['all_year_time'] = df.apply(lambda x: x.month_count + x.day + x.hour / 24)
    df['all_time'] = df.apply(lambda x:(x.year - 2021)*365 + x.month_count + x.day + x.hour / 24)
    df['all_month_time'] = df.apply(lambda x:x.day * 24 + x.hour + x.minute / 60)
    df['all_sec_time'] = df.apply(lambda x: (x.year - 2021)*365 * 24 * 3600 + x.month * 30 * 24 * 3600 + x.day * 24 * 3600 + x.hour * 3600 + x.minute * 60 + x.second)
    df['all_sec_time_report'] = (df['report_dt'].dt.year - 2021)*365 * 24 * 3600 + df['report_dt'].dt.month * 30 * 24 * 3600 + df['report_dt'].dt.day * 24 * 3600 + df['report_dt'].dt.hour * 3600 + df['report_dt'].dt.minute * 60 + df['report_dt'].dt.second
    df['delta_all_sec_time'] = df['all_sec_time_report'] - df['all_sec_time']
    df['hour_time'] = df.apply(lambda x:x.minute * 60 + x.second)
    df['days_to_reprort'] = (df['report_dt'] - df['transaction_dttm']).dt.days
    df['mounth_until_report'] = (df['report_dt'].dt.year-df['transaction_dttm'].dt.year)*12+(df['report_dt'].dt.month-df['transaction_dttm'].dt.month)

    df['cl_early_morning'] = ((df['hour'] > 4) & (df['hour'] <= 8)).astype('int16')
    df['cl_is_weekend'] = (df['dayofweek'] > 4).astype('int16')
    return df

In [11]:
transaction_data = create_time_features(transaction_data)

In [21]:
trashold_days = 30 + 92.5
transaction_data['days_on_target_time'] = transaction_data['days_to_reprort'].map(lambda x: 1 if x < trashold_days else 0)

In [28]:
targets = transaction_data.groupby('user_id')['days_on_target_time'].sum()

In [33]:
drop_idxes = transaction_data[transaction_data['days_on_target_time'] == 1].index
transaction_data = transaction_data.drop(drop_idxes)

In [61]:
user_cnt = transaction_data.groupby(['user_id']).agg('count')['day']
bad_users = user_cnt[user_cnt < 10].index

In [66]:
bad_users_set = set(bad_users.values.tolist())
bad_users_idx = [idx for x,idx in zip(transaction_data['user_id'].values.tolist(),transaction_data.index.values.tolist()) if x in bad_users_set]

In [69]:
transaction_data = transaction_data.drop(bad_users_idx)

In [70]:
def generate_time_features(df): ## ADD cnt / (max - min)
    time_day_features = df.groupby('user_id')['all_day_time'].agg(['mean', 'std', 'min', 'max', 'median','count'])
    time_day_features.columns = [f'time_day_features_{c}' for c in time_day_features.columns]
    time_day_features['time_day_features_diff'] = time_day_features['time_day_features_max'] - time_day_features['time_day_features_min']
    
    all_sec_features = df.groupby('user_id')['all_sec_time'].agg(['mean', 'std', 'min', 'max', 'median','count','var',lambda x: x.quantile(0.25),lambda x: x.quantile(0.75)])
    all_sec_features.columns = [f'all_sec_time_features_{c}' for c in all_sec_features.columns]
    all_sec_features['all_sec_time_features_diff'] = all_sec_features['all_sec_time_features_max'] - all_sec_features['all_sec_time_features_min']
    all_sec_features['all_sec_time_scale'] = all_sec_features['all_sec_time_features_mean'] - all_sec_features['all_sec_time_features_median']
    
    delta_all_sec_features = df.groupby('user_id')['delta_all_sec_time'].agg(['mean', 'std', 'min', 'max', 'median','count','var',lambda x: x.quantile(0.25),lambda x: x.quantile(0.75)])
    delta_all_sec_features.columns = [f'delta_all_sec_time_features_{c}' for c in delta_all_sec_features.columns]
    delta_all_sec_features['delta_all_sec_features_diff'] = delta_all_sec_features['delta_all_sec_time_features_max'] - delta_all_sec_features['delta_all_sec_time_features_min']
    delta_all_sec_features['delta_all_sec_features_scale'] = delta_all_sec_features['delta_all_sec_time_features_mean'] - delta_all_sec_features['delta_all_sec_time_features_median']


    time_year_featues = df.groupby('user_id')['all_year_time'].agg(['mean', 'std', 'min', 'max', 'median',lambda x: x.quantile(0.25),lambda x: x.quantile(0.75),'count'])
    time_year_featues.columns = [f'time_year_featues_{c}' for c in time_year_featues.columns]
    time_year_featues['time_year_featues_diff'] = time_year_featues['time_year_featues_max'] - time_year_featues['time_year_featues_min']
    time_year_featues['time_year_featues_interval'] = time_year_featues['time_year_featues_count'] / time_year_featues['time_year_featues_diff']  
    

    all_time_featues = df.groupby('user_id')['all_time'].agg(['mean', 'std', 'min', 'max','count','median'])
    all_time_featues.columns = [f'all_time_featues_{c}' for c in all_time_featues.columns]
    all_time_featues['all_time_featues_diff'] = all_time_featues['all_time_featues_count'] / (all_time_featues['all_time_featues_max'] - all_time_featues['all_time_featues_min'])

    time_month_features = df.groupby('user_id')['all_month_time'].agg(['mean', 'std', 'min', 'max', 'median'])
    time_month_features.columns = [f'time_month_features_{c}' for c in time_month_features.columns]
    time_month_features['time_month_features_diff'] = time_month_features['time_month_features_max'] - time_month_features['time_month_features_min']


    dayofweek_embeds = cudf.pivot_table(
                             df,
                             index =['user_id'],
                             values=['transaction_amt'],
                             columns=['dayofweek'],
                             aggfunc=['count','mean','max','std','min']).fillna(0).sort_index() # 'median','sum'
    
    hour_embeds = cudf.pivot_table(
                             df,
                             index =['user_id'],
                             values=['transaction_amt'],
                             columns=['hour'],
                             aggfunc=['count','mean','max','std','min',]).fillna(0).sort_index()
    

    month_embeds = cudf.pivot_table(
                             df,
                             index =['user_id'],
                             values=['transaction_amt'],
                             columns=['month'],
                             aggfunc=['count','mean','max','std','min']).fillna(0).sort_index()
    
    dayofweek_embeds = cudf.DataFrame(dayofweek_embeds.values)
    hour_embeds = cudf.DataFrame(hour_embeds.values)
    month_embeds = cudf.DataFrame(month_embeds.values)
    
    dayofweek_embeds.columns = [f'dayofweek_embeds_{i}' for i in dayofweek_embeds.columns]
    hour_embeds.columns = [f'hour_embeds_{i}' for i in hour_embeds.columns]
    month_embeds.columns = [f'month_embeds_{i}' for i in month_embeds.columns]
    
    dayofweek_embeds.index = time_day_features.index
    hour_embeds.index = time_day_features.index
    month_embeds.index = time_day_features.index

    
    time_feartures = cudf.concat([time_day_features.sort_index(),
                                 time_year_featues.sort_index(),
                                 delta_all_sec_features.sort_index(),
                                 time_month_features.sort_index(),
                                 all_time_featues.sort_index(),
                                 all_sec_features.sort_index(),
                                 dayofweek_embeds,
                                 hour_embeds,
                                 month_embeds],axis=1)
 
    return time_feartures

In [71]:
def get_corr_cov_features(df):
    feat_year_time_corr_ps = df[['transaction_amt','all_sec_time','user_id']].to_pandas().groupby('user_id').corr('pearson')
    feat_year_time_corr_ps = feat_year_time_corr_ps.values[:,1][::2]
    feat_year_time_corr_sp = df[['transaction_amt','all_time','user_id']].to_pandas().groupby('user_id').corr('spearman')
    feat_year_time_corr_sp = feat_year_time_corr_sp.values[:,1][::2]
    
    feat_hour_corr_ps = df[['transaction_amt','hour','user_id']].to_pandas().groupby('user_id').corr('pearson')
    feat_hour_corr_ps = feat_hour_corr_ps.values[:,1][::2]
    feat_hour_corr_sp = df[['transaction_amt','hour','user_id']].to_pandas().groupby('user_id').corr('spearman')
    feat_hour_corr_sp = feat_hour_corr_sp.values[:,1][::2]
    
    feat_dayofweek_corr_ps = df[['transaction_amt','dayofweek','user_id']].to_pandas().groupby('user_id').corr('pearson')
    feat_dayofweek_corr_ps = feat_dayofweek_corr_ps.values[:,1][::2]
    feat_dayofweek_corr_sp = df[['transaction_amt','dayofweek','user_id']].to_pandas().groupby('user_id').corr('spearman')
    feat_dayofweek_corr_sp = feat_dayofweek_corr_sp.values[:,1][::2]
    
    feat_year_time_cov = df[['transaction_amt','all_sec_time','user_id']].to_pandas().groupby('user_id').cov()
    feat_year_time_cov_0 = feat_year_time_cov.values[:,1][::2]
    feat_year_time_cov_1 = feat_year_time_cov.values[:,0][::2]
    
    feat_hour_cov = df[['transaction_amt','hour','user_id']].to_pandas().groupby('user_id').cov()
    feat_hour_cov_0 = feat_year_time_cov.values[:,1][::2]
    feat_hour_cov_1 = feat_year_time_cov.values[:,0][::2]    

    feat_dayofweek_cov = df[['transaction_amt','dayofweek','user_id']].to_pandas().groupby('user_id').cov()
    feat_dayofweek_cov_0 = feat_year_time_cov.values[:,1][::2]
    feat_dayofweek_cov_1 = feat_year_time_cov.values[:,0][::2]    
    
    feature_df = cudf.DataFrame({'feat_year_time_corr_ps':feat_year_time_corr_ps,
                               'feat_year_time_corr_sp':feat_year_time_corr_sp,
                               'feat_hour_corr_ps':feat_hour_corr_ps,
                               'feat_hour_corr_sp':feat_hour_corr_sp,
                               'feat_dayofweek_corr_ps':feat_dayofweek_corr_ps,
                               'feat_dayofweek_corr_sp':feat_dayofweek_corr_sp,
                               'feat_year_time_cov_0':feat_year_time_cov_0,
                               'feat_year_time_cov_1':feat_year_time_cov_1,
                               'feat_hour_cov_0':feat_hour_cov_0,
                               'feat_hour_cov_1':feat_hour_cov_1,
                               'feat_dayofweek_cov_0':feat_dayofweek_cov_0,
                               'feat_dayofweek_cov_1':feat_dayofweek_cov_1})
    return feature_df

In [72]:
def make_money_features(df):
    simple_features = df.groupby('user_id')['transaction_amt'].agg(['mean', 'std', 'min', 'max', 'median','sum',lambda x:x.quantile(0.25),lambda x: x.quantile(0.75)])
    simple_features.columns = [f'simple_features_{c}' for c in simple_features.columns]
    simple_features['simple_features_diff'] = simple_features['simple_features_max'] - simple_features['simple_features_min']
    simple_features['simple_features_step'] =  simple_features['simple_features_mean'] / simple_features['simple_features_diff']
    
    month_embeds = cudf.pivot_table(
                             df,
                             index =['user_id'],
                             values=['transaction_amt'],
                             columns=['important_mcc'],
                             aggfunc=['count','mean','max','std','min','median']).fillna(0).sort_index()
    
    #df['currency_rk'] = df['currency_rk'].apply(lambda x: 0 if x > 1 else x)
    curr_embeds = cudf.pivot_table(
                             df,
                             index =['user_id'],
                             values=['transaction_amt'],
                             columns=['currency_rk'],
                             aggfunc=['count','mean','max','std','min','median']).fillna(0).sort_index()

    
    month_embeds = cudf.DataFrame(month_embeds.values)
    month_embeds.columns = [f'mony_embeds_{i}' for i in month_embeds.columns]
    month_embeds.index = simple_features.index
    
    curr_embeds = cudf.DataFrame(curr_embeds.values)
    curr_embeds.columns = [f'curr_embeds_{i}' for i in curr_embeds.columns]
    curr_embeds.index = simple_features.index

    
    money_features = cudf.concat([simple_features.sort_index(),
                                curr_embeds,month_embeds],axis=1)

    return money_features


In [73]:
def scipy_features(df):
    df['delta_all_sec_time_abs'] = df['delta_all_sec_time'].abs()
    group = df[['delta_all_sec_time_abs','user_id']].to_pandas().groupby('user_id')['delta_all_sec_time_abs']
    
    features = cudf.DataFrame()
    features['delta_time_differential_entropy'] = group.progress_aggregate(lambda x: sts.differential_entropy(x.values))
    features['delta_time_entropy'] = group.progress_aggregate(lambda x: sts.entropy(x.values))    

    
    return features

In [74]:
def prepare_mcc(df):
    value_counts = transaction_data['mcc_code'].value_counts()
    important_mcc = value_counts[value_counts > 40_000].index.values.tolist()
    df['important_mcc'] = [x if x in important_mcc else -1 for x in df['mcc_code'].to_pandas()]
    return df

transaction_data = prepare_mcc(transaction_data)

In [75]:
def distance_calc(x):
    dist = []
    for i in range(len(x)-1):
        dist += [x[i+1] - x[i]]
    return dist

def calc_corr_synth(x):
    return cudf.DataFrame({'feature':x,'idx':range(len(x))}).corr()['feature']['idx']
    
def polifit_second(x):
    if len(x) % 2 == 0:
        return np.polyfit(range(len(x) // 2), x[len(x) // 2:], 1)[1]
    else:
        return np.polyfit(range(len(x) // 2 + 1), x[len(x) // 2:], 1)[1]

def shift_features(df,shifter_f=distance_calc):
    df_pandas = df[['user_id','all_sec_time']].to_pandas()
    
    times_featues = df_pandas.groupby('user_id').agg(lambda x:sorted(x.tolist()))
    times_featues['distances'] = times_featues['all_sec_time'].map(shifter_f)
    
    feature_df = cudf.DataFrame()
    feature_df['time_step_max'] = times_featues['distances'].progress_map(np.max)
    feature_df['time_step_min'] = times_featues['distances'].progress_map(np.min)
    feature_df['time_step_mean'] = times_featues['distances'].progress_map(np.mean)
    feature_df['time_step_std'] = times_featues['distances'].progress_map(np.std)
    feature_df['time_step_median'] = times_featues['distances'].progress_map(np.median)
    feature_df['time_step_q_75'] = times_featues['distances'].progress_map(lambda x:np.quantile(x,0.75))
    feature_df['time_step_q_25'] = times_featues['distances'].progress_map(lambda x:np.quantile(x,0.25))
    feature_df['time_step_q_90'] = times_featues['distances'].progress_map(lambda x:np.quantile(x,0.90))
    feature_df['time_step_dist_entropy'] = times_featues['distances'].progress_map(lambda x: sts.differential_entropy(x))
    feature_df['time_step_polyfit'] = times_featues['distances'].progress_map(lambda x:np.polyfit(range(len(x)), x, 1)[1])
    feature_df['time_step_polyfit_st'] = times_featues['distances'].progress_map(lambda x:np.polyfit(range(len(x)), x, 1)[0])
    feature_df['time_step_polyfit_first_per'] = times_featues['distances'].progress_map(lambda x:np.polyfit(range(len(x) // 2), x[:len(x) // 2], 1)[1])
    feature_df['time_step_polyfit_second_per'] = times_featues['distances'].progress_map(polifit_second)
    feature_df['time_step_polyfit_otn'] = feature_df['time_step_polyfit_first_per'] - feature_df['time_step_polyfit_second_per']
    
    feature_df['time_step_skew'] = times_featues['distances'].progress_map(lambda x: skew(x))
    feature_df['time_step_last'] = times_featues['distances'].progress_map(lambda x: x[-1])

    #feature_df['time_step_polyfit_st_2'] = times_featues['distances'].progress_map(lambda x:np.polyfit(range(len(x)), x, 2)[2])
    feature_df.index = times_featues.index.tolist()
    
    return feature_df

In [76]:
def nununique_features(df):
    df_pandas = df[['user_id','days_to_reprort']].to_pandas()
    
    times_featues = df_pandas.groupby('user_id').agg(lambda x:sorted(x.tolist()))
    times_featues['steps'] = times_featues['days_to_reprort'].map(np.diff)

    feature_df = cudf.DataFrame()
    feature_df['mcc_nuniue'] = df.groupby('user_id')['mcc_code'].nunique()
    feature_df['hour_nuniue'] = df.groupby('user_id')['hour'].nunique()
    feature_df['days_nuniue'] = df.groupby('user_id')['days_to_reprort'].nunique()
    feature_df['currency_nuniue'] = df.groupby('user_id')['currency_rk'].nunique()
    feature_df['step_nuniue'] = times_featues['steps'].map(lambda x:len(np.unique(x)))
    return feature_df

In [77]:
def last_count(df,bins=[5,10,15,20,25,30,35,45,50,60]):
    df['transaction_amt_abs'] = df['transaction_amt'].abs()
    feature_df = cudf.DataFrame(index=df.groupby('user_id')['mcc_code'].agg('count').index)
    for x in bins:
        feature_df[f'count_{x}_prev'] = df[df['days_to_reprort'] > x + 100].groupby('user_id')['mcc_code'].agg('count')
        feature_df[f'count_{x}_last'] = df[df['days_to_reprort'] <= x + 100].groupby('user_id')['mcc_code'].agg('count')
        feature_df[f'percent_{x}'] =  feature_df[f'count_{x}_last'] / (feature_df[f'count_{x}_last'] + feature_df[f'count_{x}_prev'])
        
        sum_prev = df[df['days_to_reprort'] > x + 100].groupby('user_id')['transaction_amt_abs'].agg('sum')
        sum_last = df[df['days_to_reprort'] <= x + 100].groupby('user_id')['transaction_amt_abs'].agg('sum')
        feature_df[f'sum_percent_{x}'] = sum_last / (sum_last + sum_prev)
        
        mean_prev = df[df['days_to_reprort'] > x + 100].groupby('user_id')['transaction_amt_abs'].agg('mean')
        mean_last = df[df['days_to_reprort'] <= x + 100].groupby('user_id')['transaction_amt_abs'].agg('mean')
        feature_df[f'mean_percent_{x}'] = mean_last / (mean_prev + mean_last)
    
        std_prev = df[df['days_to_reprort'] > x + 100].groupby('user_id')['transaction_amt_abs'].agg('std')
        std_last = df[df['days_to_reprort'] <= x + 100].groupby('user_id')['transaction_amt_abs'].agg('std')
        feature_df[f'std_percent_{x}'] = std_last / (std_last + std_prev)
        
        max_prev = df[df['days_to_reprort'] > x + 100].groupby('user_id')['transaction_amt_abs'].agg('max')
        max_last = df[df['days_to_reprort'] <= x + 100].groupby('user_id')['transaction_amt_abs'].agg('max')
        feature_df[f'max_percent_{x}'] = max_last / max_prev

    return feature_df.fillna(-1)

In [79]:
tqdm.pandas()
last_count_features = last_count(transaction_data)
sts_features = scipy_features(transaction_data)
shift_feature = shift_features(transaction_data)
time_features = generate_time_features(transaction_data)
mony_features = make_money_features(transaction_data)
nununique_feature = nununique_features(transaction_data)

  0%|          | 0/94606 [00:00<?, ?it/s]

  0%|          | 0/94606 [00:00<?, ?it/s]

  0%|          | 0/94606 [00:00<?, ?it/s]

  0%|          | 0/94606 [00:00<?, ?it/s]

  0%|          | 0/94606 [00:00<?, ?it/s]

  0%|          | 0/94606 [00:00<?, ?it/s]

  0%|          | 0/94606 [00:00<?, ?it/s]

  0%|          | 0/94606 [00:00<?, ?it/s]

  0%|          | 0/94606 [00:00<?, ?it/s]

  0%|          | 0/94606 [00:00<?, ?it/s]

  0%|          | 0/94606 [00:00<?, ?it/s]

  logs = np.log(n * differences / (ci * m))


  0%|          | 0/94606 [00:00<?, ?it/s]

  0%|          | 0/94606 [00:00<?, ?it/s]

  0%|          | 0/94606 [00:00<?, ?it/s]

  0%|          | 0/94606 [00:00<?, ?it/s]

  0%|          | 0/94606 [00:00<?, ?it/s]

  0%|          | 0/94606 [00:00<?, ?it/s]

In [81]:
clients_data = clients_data.set_index('user_id')
clients_data = clients_data.drop(bad_users)

In [87]:
targets = targets.drop(bad_users)

In [88]:
all_data = cudf.concat([mony_features,
                        last_count_features,
                        nununique_feature,
                        shift_feature,
                        sts_features,
                        time_features,
                        clients_data,
                        targets,
                       ],axis=1)
all_data

Unnamed: 0_level_0,simple_features_mean,simple_features_std,simple_features_min,simple_features_max,simple_features_median,simple_features_sum,simple_features_<lambda>,simple_features_diff,simple_features_step,curr_embeds_0,...,month_embeds_55,month_embeds_56,month_embeds_57,month_embeds_58,month_embeds_59,report,employee_count_nm,bankemplstatus,customer_age,days_on_target_time
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
9,-3782.258305,12339.183590,-90147.617188,-45.579891,-873.457520,-310145.181049,-482.653419,90102.037296,-0.041978,0.0,...,0.000000,0.000000,-4931.944336,-4486.468750,-5923.344238,1,БОЛЕЕ 1001,0,3,8
13,-5956.430383,24297.512592,-58740.300781,70322.828125,-10529.004883,-119128.607666,4827.161987,129063.128906,-0.046151,1.0,...,-16453.037109,0.000000,0.000000,0.000000,0.000000,6,ОТ 501 ДО 1000,0,2,2
37,-1014.892038,3761.359619,-35782.984375,5487.140625,-246.406715,-298378.259188,-141.679279,41270.125000,-0.024591,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,5,БОЛЕЕ 1001,0,2,21
41,-7181.512026,5452.863168,-16841.208984,-290.766998,-6526.302002,-100541.168365,-1782.529892,16550.441986,-0.433917,0.0,...,0.000000,0.000000,0.000000,-16841.208984,-11552.795898,1,ОТ 101 ДО 500,0,2,2
42,-264.964694,1733.098156,-3677.014404,4436.269043,-334.707733,-12188.375912,-51.089417,8113.283447,-0.032658,0.0,...,0.000000,-1110.479004,-3498.598633,-898.395203,-2185.843994,12,ДО 10,0,3,13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
562043,-799.493409,2136.038913,-13020.519531,-40.769005,-239.170517,-29581.256115,-102.463661,12979.750526,-0.061595,0.0,...,0.000000,0.000000,-13020.519531,-1346.094482,-2458.228760,12,,0,2,0
562205,-218.410403,323.091342,-1926.202271,1595.461060,-144.195938,-30359.046066,-27.315047,3521.663330,-0.062019,0.0,...,0.000000,-734.152161,-813.717102,-1048.412842,-977.024231,12,,0,1,12
562312,-315.871324,290.704075,-1372.377075,-28.292030,-271.298309,-17372.922832,-137.594276,1344.085045,-0.235008,0.0,...,0.000000,-1308.028442,-1372.377075,-868.479248,-1017.174133,12,,0,0,1
562721,-1724.093623,3857.289641,-18981.269531,5412.773926,-335.201294,-143099.770719,-22.930573,24394.043457,-0.070677,0.0,...,0.000000,-429.593597,-10965.872070,-12050.644531,-18981.269531,12,,0,2,2


In [89]:
all_data['employee_count_nm'] = all_data['employee_count_nm'].fillna('NaN')
all_data['customer_age'] = all_data['customer_age'].fillna('NaN')

In [93]:
all_data = all_data.rename({'days_on_target_time':'label'},axis=1)

In [99]:
all_train_df = all_data.to_pandas()

In [104]:
class CatBoostKfoldWraper(BaseEstimator):
    def __init__(self,num_folds,params,random_state=56):
        self.models = []
        self.params = params
        self.kfold = KFold(n_splits=num_folds,random_state=random_state,shuffle=True)
        
    def fit(self,train_data,cat_features=None,drop_cols=[],label_col='label',verbose=False):
        scores = []
        for train_index, test_index in tqdm(self.kfold.split(train_data)):
            train_df = train_data.iloc[train_index]
            test_df = train_data.iloc[test_index]
            
            train_pool = Pool(train_df.drop([label_col]+drop_cols,axis=1),
                              label = train_df[label_col],
                              cat_features = cat_features)

            eval_pool = Pool(test_df.drop([label_col]+drop_cols,axis=1),
                             label = test_df[label_col],
                             cat_features = cat_features)
            
            cbm = CatBoostRegressor(**self.params)
            cbm.fit(train_pool,eval_set=eval_pool,verbose=verbose)
            
            val_preds = cbm.predict(eval_pool)
            score = mae(val_preds, test_df['label'])
            print(score)
            scores += [score]
            self.models += [cbm]
        print(f"Total Score {np.mean(scores)}")
            
    def predict(self,test_data,cat_features=None):
        test_pool = Pool(test_data,cat_features=cat_features)
        preds = np.mean([model.predict(test_pool) for model in self.models],axis=0)
        return preds

In [105]:
cat_features = ['employee_count_nm','customer_age']

In [106]:
params = {'iterations':6_000,
          'loss_function':'MAE',
          'use_best_model':True,
          'learning_rate':0.01,
          'task_type':'GPU',
          #'min_child_samples':20,
          'max_depth':6,
          'eval_metric':'MAE',
          'random_seed':56}

In [107]:
model = CatBoostKfoldWraper(num_folds=5,params=params,random_state=56)
model.fit(all_train_df,cat_features,verbose=500)

0it [00:00, ?it/s]

Default metric period is 5 because MAE is/are not implemented for GPU


0:	learn: 13.4988457	test: 13.3510237	best: 13.3510237 (0)	total: 31.6ms	remaining: 3m 9s
500:	learn: 12.4229994	test: 12.2912464	best: 12.2912464 (500)	total: 14s	remaining: 2m 34s
1000:	learn: 11.5474466	test: 11.4318096	best: 11.4318096 (1000)	total: 28.6s	remaining: 2m 22s
1500:	learn: 10.8663648	test: 10.7668378	best: 10.7668378 (1500)	total: 43s	remaining: 2m 9s
2000:	learn: 10.3542626	test: 10.2667306	best: 10.2667306 (2000)	total: 57.6s	remaining: 1m 55s
2500:	learn: 9.9353259	test: 9.8597537	best: 9.8597537 (2500)	total: 1m 12s	remaining: 1m 41s
3000:	learn: 9.5814461	test: 9.5202288	best: 9.5202288 (3000)	total: 1m 27s	remaining: 1m 26s
3500:	learn: 9.2737439	test: 9.2253546	best: 9.2253546 (3500)	total: 1m 40s	remaining: 1m 11s
4000:	learn: 9.0047607	test: 8.9667286	best: 8.9667286 (4000)	total: 1m 54s	remaining: 57.2s
4500:	learn: 8.7686755	test: 8.7378469	best: 8.7378469 (4500)	total: 2m 8s	remaining: 42.7s
5000:	learn: 8.5607599	test: 8.5367672	best: 8.5367672 (5000)	tota

Default metric period is 5 because MAE is/are not implemented for GPU


0:	learn: 13.4721555	test: 13.4577041	best: 13.4577041 (0)	total: 27.5ms	remaining: 2m 45s
500:	learn: 12.3979560	test: 12.3911951	best: 12.3911951 (500)	total: 13.7s	remaining: 2m 30s
1000:	learn: 11.5226045	test: 11.5262774	best: 11.5262774 (1000)	total: 27.8s	remaining: 2m 18s
1500:	learn: 10.8425268	test: 10.8597314	best: 10.8597314 (1500)	total: 41.8s	remaining: 2m 5s
2000:	learn: 10.3304197	test: 10.3578429	best: 10.3578429 (2000)	total: 55.8s	remaining: 1m 51s
2500:	learn: 9.9155374	test: 9.9489329	best: 9.9489329 (2500)	total: 1m 9s	remaining: 1m 37s
3000:	learn: 9.5646732	test: 9.6002366	best: 9.6002366 (3000)	total: 1m 23s	remaining: 1m 23s
3500:	learn: 9.2608456	test: 9.2974369	best: 9.2974369 (3500)	total: 1m 37s	remaining: 1m 9s
4000:	learn: 8.9941676	test: 9.0329637	best: 9.0329637 (4000)	total: 1m 51s	remaining: 55.5s
4500:	learn: 8.7590642	test: 8.8007635	best: 8.8007635 (4500)	total: 2m 5s	remaining: 41.7s
5000:	learn: 8.5532452	test: 8.5963555	best: 8.5963555 (5000)	t

Default metric period is 5 because MAE is/are not implemented for GPU


0:	learn: 13.4507540	test: 13.5435319	best: 13.5435319 (0)	total: 27.6ms	remaining: 2m 45s
500:	learn: 12.3790602	test: 12.4663614	best: 12.4663614 (500)	total: 14.1s	remaining: 2m 34s
1000:	learn: 11.5075263	test: 11.5905166	best: 11.5905166 (1000)	total: 28s	remaining: 2m 19s
1500:	learn: 10.8327299	test: 10.9079571	best: 10.9079571 (1500)	total: 42s	remaining: 2m 5s
2000:	learn: 10.3238266	test: 10.3942008	best: 10.3942008 (2000)	total: 55.6s	remaining: 1m 51s
2500:	learn: 9.9091959	test: 9.9766145	best: 9.9766145 (2500)	total: 1m 9s	remaining: 1m 36s
3000:	learn: 9.5587748	test: 9.6231329	best: 9.6231329 (3000)	total: 1m 23s	remaining: 1m 23s
3500:	learn: 9.2542549	test: 9.3167482	best: 9.3167482 (3500)	total: 1m 36s	remaining: 1m 9s
4000:	learn: 8.9875959	test: 9.0486415	best: 9.0486415 (4000)	total: 1m 50s	remaining: 55.3s
4500:	learn: 8.7518094	test: 8.8143209	best: 8.8143209 (4500)	total: 2m 4s	remaining: 41.6s
5000:	learn: 8.5439829	test: 8.6095672	best: 8.6095672 (5000)	total

Default metric period is 5 because MAE is/are not implemented for GPU


0:	learn: 13.4919159	test: 13.3787491	best: 13.3787491 (0)	total: 30.1ms	remaining: 3m
500:	learn: 12.4152668	test: 12.3180774	best: 12.3180774 (500)	total: 14.2s	remaining: 2m 35s
1000:	learn: 11.5398097	test: 11.4558163	best: 11.4558163 (1000)	total: 27.9s	remaining: 2m 19s
1500:	learn: 10.8611297	test: 10.7867837	best: 10.7867837 (1500)	total: 41.7s	remaining: 2m 4s
2000:	learn: 10.3493130	test: 10.2842400	best: 10.2842400 (2000)	total: 55.5s	remaining: 1m 50s
2500:	learn: 9.9335545	test: 9.8768300	best: 9.8768300 (2500)	total: 1m 9s	remaining: 1m 37s
3000:	learn: 9.5817621	test: 9.5324216	best: 9.5324216 (3000)	total: 1m 23s	remaining: 1m 23s
3500:	learn: 9.2761102	test: 9.2339880	best: 9.2339880 (3500)	total: 1m 37s	remaining: 1m 9s
4000:	learn: 9.0081187	test: 8.9733472	best: 8.9733472 (4000)	total: 1m 51s	remaining: 55.8s
4500:	learn: 8.7729139	test: 8.7417453	best: 8.7417453 (4500)	total: 2m 4s	remaining: 41.6s
5000:	learn: 8.5667359	test: 8.5378323	best: 8.5378323 (5000)	total

Default metric period is 5 because MAE is/are not implemented for GPU


0:	learn: 13.4327592	test: 13.6154453	best: 13.6154453 (0)	total: 30.3ms	remaining: 3m 1s
500:	learn: 12.3668228	test: 12.5272557	best: 12.5272557 (500)	total: 14.3s	remaining: 2m 36s
1000:	learn: 11.5006980	test: 11.6371678	best: 11.6371678 (1000)	total: 28.6s	remaining: 2m 22s
1500:	learn: 10.8277611	test: 10.9451306	best: 10.9451306 (1500)	total: 42.6s	remaining: 2m 7s
2000:	learn: 10.3214948	test: 10.4235788	best: 10.4235788 (2000)	total: 56.6s	remaining: 1m 53s
2500:	learn: 9.9081740	test: 9.9989331	best: 9.9989331 (2500)	total: 1m 11s	remaining: 1m 39s
3000:	learn: 9.5585562	test: 9.6411225	best: 9.6411225 (3000)	total: 1m 25s	remaining: 1m 25s
3500:	learn: 9.2541657	test: 9.3297274	best: 9.3297274 (3500)	total: 1m 39s	remaining: 1m 10s
4000:	learn: 8.9881100	test: 9.0580428	best: 9.0580428 (4000)	total: 1m 53s	remaining: 56.5s
4500:	learn: 8.7532837	test: 8.8210349	best: 8.8210349 (4500)	total: 2m 7s	remaining: 42.3s
5000:	learn: 8.5471283	test: 8.6126498	best: 8.6126498 (5000)	

In [111]:
model.models[0].get_feature_importance(prettified=True)[:60]

Unnamed: 0,Feature Id,Importances
0,count_50_last,19.379408
1,days_nuniue,11.42642
2,count_60_last,7.887247
3,time_step_std,7.046849
4,count_30_last,6.067397
5,count_45_last,5.779727
6,count_35_last,5.41319
7,count_25_last,3.954199
8,delta_time_differential_entropy,3.688116
9,std_percent_25,2.468533


In [112]:
class CatBoostKfoldWraperCLS(BaseEstimator):
    def __init__(self,num_folds,params,random_state=56):
        self.models = []
        self.params = params
        self.kfold = KFold(n_splits=num_folds,random_state=random_state,shuffle=True)
        
    def fit(self,train_data,cat_features=None,drop_cols=['label'],label_col='label_bin',verbose=False):
        scores = []
        for train_index, test_index in tqdm(self.kfold.split(train_data)):
            train_df = train_data.iloc[train_index]
            test_df = train_data.iloc[test_index]
            
            train_pool = Pool(train_df.drop([label_col]+drop_cols,axis=1),
                              label = train_df[label_col],
                              cat_features = cat_features)

            eval_pool = Pool(test_df.drop([label_col]+drop_cols,axis=1),
                             label = test_df[label_col],
                             cat_features = cat_features)
            
            cbm = CatBoostClassifier(**self.params)
            cbm.fit(train_pool,eval_set=eval_pool,verbose=verbose)
            
            val_preds = cbm.predict_proba(eval_pool)[:,1]
            score = roc_auc_score(test_df['label_bin'],val_preds)
            print(score)
            scores += [score]
            self.models += [cbm]
        print(f"Total Score {np.mean(scores)}")
            
    def predict(self,test_data,cat_features=None):
        test_pool = Pool(test_data,cat_features=cat_features)
        preds = np.mean([model.predict_proba(test_pool)[:,1] for model in self.models],axis=0)
        return preds

In [113]:
params_cls = {'iterations':6_000,
          'loss_function':'CrossEntropy',
          'use_best_model':True,
          'learning_rate':0.01,
          'task_type':'GPU',
          #'min_child_samples':20,
          'max_depth':6,
          'eval_metric':'AUC',
          'random_seed':56}

In [114]:
all_train_df['label_bin'] = all_train_df['label'].map(lambda x: 1 if x == 0 else 0)

In [118]:
model_cls = CatBoostKfoldWraperCLS(num_folds=5,params=params_cls,random_state=56)
model_cls.fit(all_train_df,cat_features,verbose=500)

0it [00:00, ?it/s]

Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.7812243	best: 0.7812243 (0)	total: 45.4ms	remaining: 4m 32s
500:	test: 0.8459478	best: 0.8459478 (500)	total: 23.6s	remaining: 4m 19s
1000:	test: 0.8492687	best: 0.8492687 (1000)	total: 46.3s	remaining: 3m 51s
1500:	test: 0.8507929	best: 0.8507929 (1500)	total: 1m 9s	remaining: 3m 27s
2000:	test: 0.8517871	best: 0.8517902 (1990)	total: 1m 32s	remaining: 3m 5s
2500:	test: 0.8525160	best: 0.8525160 (2500)	total: 1m 54s	remaining: 2m 40s
3000:	test: 0.8528947	best: 0.8529198 (2980)	total: 2m 17s	remaining: 2m 17s
3500:	test: 0.8533904	best: 0.8533904 (3500)	total: 2m 38s	remaining: 1m 53s
4000:	test: 0.8537021	best: 0.8537141 (3970)	total: 3m 1s	remaining: 1m 30s
4500:	test: 0.8538495	best: 0.8538789 (4390)	total: 3m 25s	remaining: 1m 8s
5000:	test: 0.8538205	best: 0.8538789 (4390)	total: 3m 50s	remaining: 46s
5500:	test: 0.8539244	best: 0.8539261 (5495)	total: 4m 14s	remaining: 23.1s
5999:	test: 0.8539964	best: 0.8540130 (5860)	total: 4m 36s	remaining: 0us
bestTest = 0.8540130

Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.7911034	best: 0.7911034 (0)	total: 43.3ms	remaining: 4m 19s
500:	test: 0.8427327	best: 0.8427327 (500)	total: 21.7s	remaining: 3m 57s
1000:	test: 0.8453903	best: 0.8453903 (1000)	total: 44.2s	remaining: 3m 40s
1500:	test: 0.8465663	best: 0.8465663 (1500)	total: 1m 9s	remaining: 3m 28s
2000:	test: 0.8472577	best: 0.8472577 (2000)	total: 1m 32s	remaining: 3m 5s
2500:	test: 0.8478036	best: 0.8478036 (2500)	total: 1m 55s	remaining: 2m 41s
3000:	test: 0.8481816	best: 0.8481816 (3000)	total: 2m 17s	remaining: 2m 17s
3500:	test: 0.8484374	best: 0.8484406 (3490)	total: 2m 39s	remaining: 1m 53s
4000:	test: 0.8487257	best: 0.8487379 (3985)	total: 3m 1s	remaining: 1m 30s
4500:	test: 0.8487863	best: 0.8488240 (4405)	total: 3m 22s	remaining: 1m 7s
5000:	test: 0.8490245	best: 0.8490394 (4980)	total: 3m 44s	remaining: 44.8s
5500:	test: 0.8491808	best: 0.8492003 (5480)	total: 4m 8s	remaining: 22.5s
5999:	test: 0.8492549	best: 0.8493056 (5700)	total: 4m 30s	remaining: 0us
bestTest = 0.849305

Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.7837349	best: 0.7837349 (0)	total: 43.9ms	remaining: 4m 23s
500:	test: 0.8435640	best: 0.8435640 (500)	total: 21.8s	remaining: 3m 59s
1000:	test: 0.8468685	best: 0.8468685 (1000)	total: 45.6s	remaining: 3m 47s
1500:	test: 0.8484707	best: 0.8484707 (1500)	total: 1m 7s	remaining: 3m 22s
2000:	test: 0.8490570	best: 0.8490602 (1965)	total: 1m 29s	remaining: 2m 58s
2500:	test: 0.8495403	best: 0.8495403 (2500)	total: 1m 51s	remaining: 2m 35s
3000:	test: 0.8498541	best: 0.8498541 (3000)	total: 2m 13s	remaining: 2m 13s
3500:	test: 0.8503319	best: 0.8503399 (3470)	total: 2m 35s	remaining: 1m 50s
4000:	test: 0.8505243	best: 0.8505249 (3830)	total: 2m 57s	remaining: 1m 28s
4500:	test: 0.8507848	best: 0.8507848 (4500)	total: 3m 20s	remaining: 1m 6s
5000:	test: 0.8507023	best: 0.8508062 (4525)	total: 3m 44s	remaining: 44.8s
5500:	test: 0.8506757	best: 0.8508062 (4525)	total: 4m 6s	remaining: 22.3s
5999:	test: 0.8507959	best: 0.8508062 (4525)	total: 4m 27s	remaining: 0us
bestTest = 0.8508

Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.7713842	best: 0.7713842 (0)	total: 45.7ms	remaining: 4m 34s
500:	test: 0.8460447	best: 0.8460447 (500)	total: 22.7s	remaining: 4m 9s
1000:	test: 0.8497750	best: 0.8497750 (1000)	total: 44.3s	remaining: 3m 41s
1500:	test: 0.8513299	best: 0.8513299 (1500)	total: 1m 5s	remaining: 3m 16s
2000:	test: 0.8524649	best: 0.8524649 (2000)	total: 1m 27s	remaining: 2m 54s
2500:	test: 0.8530754	best: 0.8530791 (2490)	total: 1m 48s	remaining: 2m 32s
3000:	test: 0.8534693	best: 0.8534757 (2995)	total: 2m 10s	remaining: 2m 10s
3500:	test: 0.8537921	best: 0.8538120 (3490)	total: 2m 32s	remaining: 1m 48s
4000:	test: 0.8540331	best: 0.8540899 (3860)	total: 2m 56s	remaining: 1m 27s
4500:	test: 0.8542594	best: 0.8542632 (4415)	total: 3m 21s	remaining: 1m 7s
5000:	test: 0.8544489	best: 0.8544489 (5000)	total: 3m 46s	remaining: 45.2s
5500:	test: 0.8547009	best: 0.8547121 (5480)	total: 4m 8s	remaining: 22.5s
5999:	test: 0.8547492	best: 0.8548217 (5690)	total: 4m 29s	remaining: 0us
bestTest = 0.85482

Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.7851593	best: 0.7851593 (0)	total: 45.5ms	remaining: 4m 32s
500:	test: 0.8413987	best: 0.8413987 (500)	total: 21.7s	remaining: 3m 57s
1000:	test: 0.8444180	best: 0.8444180 (1000)	total: 44.3s	remaining: 3m 41s
1500:	test: 0.8457172	best: 0.8457220 (1495)	total: 1m 6s	remaining: 3m 19s
2000:	test: 0.8467214	best: 0.8467214 (2000)	total: 1m 28s	remaining: 2m 57s
2500:	test: 0.8471644	best: 0.8471823 (2440)	total: 1m 50s	remaining: 2m 34s
3000:	test: 0.8474253	best: 0.8474334 (2890)	total: 2m 12s	remaining: 2m 12s
3500:	test: 0.8476453	best: 0.8476775 (3430)	total: 2m 34s	remaining: 1m 50s
4000:	test: 0.8478485	best: 0.8478869 (3965)	total: 2m 56s	remaining: 1m 28s
4500:	test: 0.8479721	best: 0.8479938 (4400)	total: 3m 18s	remaining: 1m 6s
5000:	test: 0.8480948	best: 0.8481154 (4890)	total: 3m 41s	remaining: 44.2s
5500:	test: 0.8481131	best: 0.8481757 (5120)	total: 4m 2s	remaining: 22s
5999:	test: 0.8481459	best: 0.8481757 (5120)	total: 4m 25s	remaining: 0us
bestTest = 0.848175

In [119]:
transaction_data = cudf.read_csv('./transactions.csv',parse_dates=['transaction_dttm'])
clients_data = cudf.read_csv('./clients.csv')

clients_data_merge = clients_data.merge(reports_data)[['report_dt','user_id']]
transaction_data = transaction_data.merge(clients_data_merge)
transaction_data = create_time_features(transaction_data)
transaction_data = prepare_mcc(transaction_data)

In [120]:
tqdm.pandas()
last_count_features = last_count(transaction_data)
sts_features = scipy_features(transaction_data)
shift_feature = shift_features(transaction_data)
time_features = generate_time_features(transaction_data)
mony_features = make_money_features(transaction_data)
nununique_feature = nununique_features(transaction_data)

  0%|          | 0/96000 [00:00<?, ?it/s]

  0%|          | 0/96000 [00:00<?, ?it/s]

  0%|          | 0/96000 [00:00<?, ?it/s]

  0%|          | 0/96000 [00:00<?, ?it/s]

  0%|          | 0/96000 [00:00<?, ?it/s]

  0%|          | 0/96000 [00:00<?, ?it/s]

  0%|          | 0/96000 [00:00<?, ?it/s]

  0%|          | 0/96000 [00:00<?, ?it/s]

  0%|          | 0/96000 [00:00<?, ?it/s]

  0%|          | 0/96000 [00:00<?, ?it/s]

  0%|          | 0/96000 [00:00<?, ?it/s]

  logs = np.log(n * differences / (ci * m))


  0%|          | 0/96000 [00:00<?, ?it/s]

  0%|          | 0/96000 [00:00<?, ?it/s]

  0%|          | 0/96000 [00:00<?, ?it/s]

  0%|          | 0/96000 [00:00<?, ?it/s]

  0%|          | 0/96000 [00:00<?, ?it/s]

  0%|          | 0/96000 [00:00<?, ?it/s]

In [123]:
clients_data = clients_data.set_index('user_id')
all_data = cudf.concat([mony_features,
                        last_count_features,
                        nununique_feature,
                        shift_feature,
                        sts_features,
                        time_features,
                        clients_data,
                       ],axis=1)
all_data

Unnamed: 0_level_0,simple_features_mean,simple_features_std,simple_features_min,simple_features_max,simple_features_median,simple_features_sum,simple_features_<lambda>,simple_features_diff,simple_features_step,curr_embeds_0,...,month_embeds_54,month_embeds_55,month_embeds_56,month_embeds_57,month_embeds_58,month_embeds_59,report,employee_count_nm,bankemplstatus,customer_age
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3,1246.037876,60860.880477,-153866.890625,104011.960937,4549.455078,13706.416641,16861.306152,257878.851562,0.004832,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,2,ОТ 101 ДО 500,0,3
9,-3593.718520,11797.055119,-90147.617188,-45.579891,-840.509674,-323434.666813,-434.467491,90102.037296,-0.039885,0.0,...,0.000000,0.000000,0.000000,-4931.944336,-4486.468750,-5923.344238,1,БОЛЕЕ 1001,0,3
13,-5668.971780,23507.953991,-58740.300781,70322.828125,-10529.004883,-124717.379150,5211.026978,129063.128906,-0.043924,1.0,...,-17234.970703,-16453.037109,-16394.193359,0.000000,0.000000,0.000000,6,ОТ 501 ДО 1000,0,2
37,-1053.522538,3913.114835,-35782.984375,5487.140625,-236.420776,-331859.599463,-141.720711,41270.125000,-0.025527,0.0,...,-5205.989258,-26518.769531,0.000000,0.000000,0.000000,0.000000,5,БОЛЕЕ 1001,0,2
41,-6786.663385,5251.737982,-16841.208984,-290.766998,-6328.293701,-108586.614166,-1537.244461,16550.441986,-0.410059,0.0,...,0.000000,0.000000,0.000000,0.000000,-16841.208984,-11552.795898,1,ОТ 101 ДО 500,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
562043,-799.493409,2136.038913,-13020.519531,-40.769005,-239.170517,-29581.256115,-102.463661,12979.750526,-0.061595,0.0,...,0.000000,0.000000,0.000000,-13020.519531,-1346.094482,-2458.228760,12,,0,2
562205,-268.158335,590.682233,-6220.171387,1595.461060,-147.305283,-40491.908630,-27.259905,7815.632446,-0.034311,0.0,...,0.000000,0.000000,-734.152161,-813.717102,-1048.412842,-977.024231,12,,0,1
562312,-331.032523,309.587795,-1372.377075,-28.292030,-271.973526,-18537.821270,-138.175877,1344.085045,-0.246288,0.0,...,0.000000,0.000000,-1308.028442,-1372.377075,-868.479248,-1017.174133,12,,0,0
562721,-1929.467785,4172.623511,-18981.269531,5412.773926,-399.343048,-164004.761685,-22.946484,24394.043457,-0.079096,0.0,...,0.000000,0.000000,-429.593597,-10965.872070,-12050.644531,-18981.269531,12,,0,2


In [125]:
all_data = all_data.to_pandas()

In [127]:
all_data['employee_count_nm'] = all_data['employee_count_nm'].fillna('NaN')
all_data['customer_age'] = all_data['customer_age'].fillna('NaN')

In [131]:
reg_preds = model.predict(all_data,cat_features=cat_features)

In [133]:
cls_preds = model_cls.predict(all_data,cat_features=cat_features)

In [136]:
pd.DataFrame({'cls_score':cls_preds,'reg_preds':reg_preds},index=all_data.index).to_csv('catboost_pretrain.csv')

In [137]:
pd.read_csv('catboost_pretrain.csv')

Unnamed: 0,user_id,cls_score,reg_preds
0,3,0.096575,4.265687
1,9,0.005318,13.994430
2,13,0.019885,3.194905
3,37,0.002578,32.528032
4,41,0.058630,3.371407
...,...,...,...
95995,562043,0.431636,1.411513
95996,562205,0.007119,23.359255
95997,562312,0.069000,6.938552
95998,562721,0.020286,8.009974
