# Бейзлайн-решение 

## Подготовка

Установим нужные библиотечки

In [1]:
import sys
!{sys.executable} -m pip install numpy pandas catboost scikit-learn scikit-multilearn --quiet

Импортируем их

In [1]:
import random
import numpy as np
import pandas as pd
import catboost as cb
import sklearn.utils as sku
from skmultilearn.model_selection import iterative_train_test_split
import os
import json
from sklearn.metrics import fbeta_score, classification_report
from sklearn import preprocessing
from tqdm import tqdm
from collections import Counter

Зафиксируем random seed

In [2]:
# One seed for the whole notebook: it initialises Python's and NumPy's global
# RNGs here and is passed explicitly as random_state / random_seed further down.
SEED = 0xCAFEC0DE

random.seed(SEED)
np.random.seed(SEED)

Укажем пути до файлов

In [3]:
# Input CSV locations (relative paths).
PAYMENTS_TRAIN_PATH = 'data/payments_train.csv'
TARGET_TRAIN_PATH = 'data/target_train.csv'
PAYMENTS_TEST_PATH = 'data/payments_test.csv'
# NOTE(review): not referenced anywhere else in the visible notebook.
CLIENT_ID_TEST_PATH = 'data/client_id_test.csv'

## Загружаем датасет

In [4]:
# Explicit column dtypes for the payments CSV files (passed to pd.read_csv).
# Identifiers are kept as strings; the twelve flag_* columns are booleans.
payments_dtypes = {
    'client_id': str,
    'contractor_id': str,
    'is_outgoing': bool,
    'amount': 'uint64',
    'dt_day': 'uint16',
    'dt_hour': 'uint8',
    'channel': pd.CategoricalDtype(),
    **{f'flag_{flag_no}': bool for flag_no in range(12)},
}

In [None]:
# Load the training payments using the explicit dtypes in payments_dtypes.
payments = pd.read_csv(PAYMENTS_TRAIN_PATH, dtype=payments_dtypes)
# Hour index on a single axis: day number * 24 + hour of day.
payments['time'] = payments.dt_day * 24 + payments.dt_hour
# Sort rows by time: generate_features takes differences of consecutive
# `time` values per client (seq_diff), which relies on this row order.
payments.sort_values(by=['time'], ascending=[True], inplace=True)
payments.head(5)

Unnamed: 0,client_id,contractor_id,is_outgoing,amount,dt_day,dt_hour,channel,flag_0,flag_1,flag_2,flag_3,flag_4,flag_5,flag_6,flag_7,flag_8,flag_9,flag_10,flag_11,time
3235304,314686,,True,293839,0,0,pos,True,False,False,False,False,False,False,False,False,False,False,False,0
22050015,546749,,True,1167711,0,0,pos,True,False,False,False,False,False,False,False,False,False,False,False,0
11623866,650458,650458.0,True,38942012,0,0,,False,False,False,False,False,False,False,False,False,False,False,False,0
1626288,792359,,True,733870,0,0,pos,True,False,False,False,False,False,False,False,False,False,False,False,0
19279193,713483,,True,282132,0,0,pos,True,False,False,False,False,False,False,False,False,False,False,False,0


In [None]:
# Explicit column dtypes for the target CSV: a string client id plus the
# 35 integer label columns type_0 .. type_34.
target_dtypes = {
    'client_id': str,
    **{f'type_{type_no}': int for type_no in range(35)},
}

In [None]:
# Load the label table and index it by client_id so rows can later be
# selected with .loc by client id (see stratified_split_cached).
target = pd.read_csv(TARGET_TRAIN_PATH, dtype=target_dtypes).set_index('client_id')
target.head(5)

Unnamed: 0_level_0,type_0,type_1,type_2,type_3,type_4,type_5,type_6,type_7,type_8,type_9,...,type_25,type_26,type_27,type_28,type_29,type_30,type_31,type_32,type_33,type_34
client_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
775943,1,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
992314,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
255821,1,0,0,0,0,0,0,0,0,1,...,1,0,0,0,0,1,0,0,1,0
188791,0,0,1,0,0,0,0,0,0,1,...,1,0,0,0,0,0,1,0,0,0
46092,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


## Feature Engineering

Сгенерируем следующие фичи:
* Статистика сумм транзакций по клиентам
* Количество транзакций с флагами с 0 по 11 по клиентам

Обернём весь feature engineering в функцию, чтобы потом переиспользовать её для генерации фичей для тестовой выборки

In [None]:
# Eyeball ten random rows.
# NOTE(review): no random_state is passed; presumably this draws from the
# global NumPy RNG seeded above, so running it shifts later draws — confirm.
payments.sample(10)

Unnamed: 0,client_id,contractor_id,is_outgoing,amount,dt_day,dt_hour,channel,flag_0,flag_1,flag_2,flag_3,flag_4,flag_5,flag_6,flag_7,flag_8,flag_9,flag_10,flag_11,time
14785445,330309,,False,16167599,104,23,,False,False,False,False,False,False,False,False,False,False,False,True,2519
3821509,149011,,True,1031205,263,22,,False,False,False,True,False,False,False,False,False,False,False,False,6334
7111087,874887,874887.0,False,6527900,51,7,,False,False,False,False,False,False,False,False,False,False,False,False,1231
1394525,123708,,True,35141,93,14,,False,False,False,True,False,False,False,False,False,False,False,False,2246
17791003,359834,,True,26963,243,12,,False,False,False,True,False,False,False,False,False,False,False,False,5844
15159343,192411,,True,42753958,11,13,,True,False,False,False,False,False,False,False,False,False,False,False,277
7343377,444826,,True,297378784,274,5,web,False,False,False,False,False,False,False,False,False,False,False,False,6581
17572083,537237,,True,105445560,310,13,web,False,False,False,False,False,False,False,False,False,False,False,False,7453
3783157,195055,,True,455455,343,2,pos,True,False,False,False,False,False,False,False,False,False,False,False,8234
16563083,212324,735159.0,True,120033319,319,18,app,False,False,False,False,False,False,False,False,False,False,False,False,7674


In [None]:
def encode_and_bind(original_dataframe, feature_to_encode):
    """Return a new frame: the input plus one-hot columns for one feature.

    The indicator columns are produced by ``pd.get_dummies`` on the single
    column ``feature_to_encode`` and appended on the right; the original
    column itself is kept.
    """
    one_hot = pd.get_dummies(original_dataframe[[feature_to_encode]])
    return pd.concat([original_dataframe, one_hot], axis=1)


def seq_diff(seq):
    """Return the differences between consecutive values of ``seq``.

    ``seq`` must offer ``.shape`` and ``.to_numpy()`` (a pandas Series).
    With fewer than two values there is nothing to subtract, so a
    one-element zero array is returned, which callers can still reduce
    with ``max``/``std``.
    """
    if seq.shape[0] < 2:
        return np.zeros(1)
    # The values are concatenated with a list-wrapped zero before subtracting,
    # so NumPy promotes the dtype first (an unsigned 16-bit input yields
    # negative differences instead of wrapping around).
    padded = np.concatenate(([0], seq.to_numpy()))
    # padded[2:] is seq[1:], padded[1:-1] is seq[:-1].
    return padded[2:] - padded[1:-1]


def seq_diff_when(seq, func):
    """Reduce consecutive differences separately for flagged and unflagged rows.

    ``seq`` is a two-column frame: the first column holds the values, the
    second a 0/1 indicator. ``func`` is applied to ``seq_diff`` of the
    values where the indicator equals 1 and where it equals 0.

    Returns a two-element Series ordered ``[result_for_0, result_for_1]``.
    """
    value_col, flag_col = seq.columns
    values = seq[value_col]
    indicator = seq[flag_col]
    on_result = func(seq_diff(values[indicator == 1]))
    off_result = func(seq_diff(values[indicator == 0]))
    return pd.Series([off_result, on_result])


def max_count(seq):
    """Return how many times the most frequent value occurs in ``seq``.

    Raises IndexError for an empty ``seq``.
    """
    top = Counter(seq).most_common(1)
    _value, occurrences = top[0]
    return occurrences


def most_popular(seq):
    """Return the most frequent value of ``seq``.

    Raises IndexError for an empty ``seq``.
    """
    top = Counter(seq).most_common(1)
    value, _occurrences = top[0]
    return value


def categorical_first_k_top(seq, top_k=None):
    """Return the ``top_k`` most frequent non-missing values of ``seq``.

    Parameters
    ----------
    seq : pandas.Series
        Categorical values; missing entries are ignored.
    top_k : int, optional
        Number of values to return. When omitted, the module-level ``k`` is
        used, so existing single-argument callers (e.g. ``groupby.apply``)
        behave as before.

    Returns
    -------
    pandas.Series
        Exactly ``top_k`` entries, most frequent first, padded with the
        string ``'unknown'`` when fewer distinct values exist.
    """
    if top_k is None:
        top_k = k
    present = seq[~seq.isna()]
    popular_ids = [value for value, _count in Counter(present).most_common(top_k)]
    # Pad to a fixed width so every client yields the same number of columns.
    popular_ids.extend(['unknown'] * (top_k - len(popular_ids)))
    return pd.Series(popular_ids)


def categorical_max_count(seq, top_k=None):
    """Return the occurrence counts of the ``top_k`` most frequent values.

    Parameters
    ----------
    seq : pandas.Series
        Categorical values; missing entries are ignored.
    top_k : int, optional
        Number of counts to return. When omitted, the module-level ``k`` is
        used, so existing single-argument callers (e.g. ``groupby.apply``)
        behave as before.

    Returns
    -------
    pandas.Series
        Exactly ``top_k`` counts in descending order, padded with ``0`` when
        fewer distinct values exist.
    """
    if top_k is None:
        top_k = k
    present = seq[~seq.isna()]
    counts = [count for _value, count in Counter(present).most_common(top_k)]
    # Pad to a fixed width so every client yields the same number of columns.
    counts.extend([0] * (top_k - len(counts)))
    return pd.Series(counts)


def cat_notna(seq):
    """Return the number of non-missing entries in the Series ``seq``."""
    present = seq[seq.notna()]
    return len(present)


def count_cat(seq):
    """Return the number of distinct non-missing values in the Series ``seq``.

    An empty or all-missing Series yields 0.
    """
    present = seq[seq.notna()]
    return len(set(present))

def count_cat_flag_yes(df):
    """Count distinct non-missing values of column 0 where column 1 is truthy.

    ``df`` is a frame whose first column holds the categorical values and
    whose second column is cast to ``bool`` and used as the row mask.
    """
    value_col, flag_col = df.columns[0], df.columns[1]
    mask = pd.Series(df[flag_col], dtype='bool')
    return count_cat(df[value_col][mask])

def count_cat_flag_no(df):
    """Count distinct non-missing values of column 0 where column 1 is falsy.

    ``df`` is a frame whose first column holds the categorical values and
    whose second column is cast to ``bool``; rows where it is False are kept.
    """
    value_col, flag_col = df.columns[0], df.columns[1]
    mask = pd.Series(df[flag_col], dtype='bool')
    return count_cat(df[value_col][~mask])

def get_popular_contractor_counts(seq, contractors=None):
    """Count how often each reference contractor occurs in ``seq``.

    Parameters
    ----------
    seq : pandas.Series
        Contractor ids; missing entries are ignored. The remaining values
        must be convertible to ``int`` (ids are compared as integers).
    contractors : sequence, optional
        Reference contractor ids, each convertible with ``int()``. When
        omitted, the module-level ``popular_contractors`` list is used, so
        existing single-argument callers (e.g. ``groupby.apply``) behave as
        before.

    Returns
    -------
    pandas.Series
        One count per reference contractor, in the order given.
    """
    if contractors is None:
        contractors = popular_contractors
    present = seq[~seq.isna()]
    ids = np.array(present).astype(int)
    # Tally once instead of scanning the whole array per reference contractor.
    tally = Counter(ids.tolist())
    return pd.Series([tally[int(contractor)] for contractor in contractors])

In [None]:
# Frequency of every contractor_id value over all training payments.
c = Counter(payments.contractor_id)
# Ranks 2..50 by frequency. The single most frequent key is skipped —
# presumably it is the missing-value bucket; verify against the data.
popular_contractors = [contractor for contractor, _occurrences in c.most_common(50)[1:]]
# How many per-client top contractors the categorical_* helpers return.
k = 9

In [None]:
def generate_features(pay):
    """Aggregate raw payment rows into one feature row per client.

    Parameters
    ----------
    pay : pandas.DataFrame
        Payment rows. The code reads the columns ``client_id``,
        ``contractor_id``, ``is_outgoing``, ``amount``, ``dt_day``,
        ``dt_hour``, ``time``, ``channel`` and ``flag_0`` .. ``flag_11``.
        Helper columns are added to the frame returned by
        ``encode_and_bind(...).drop(...)``, to which ``pay`` is rebound.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``client_id``. Column groups, in order: amount
        statistics, time statistics, row count / outgoing share, flag
        shares, per-channel counts and shares, contractor statistics.

    Notes
    -----
    * Uses the module-level ``k`` (number of top contractors per client;
      must be at least 5 for the ``in_top*`` columns) and
      ``popular_contractors``.
    * Prints a progress line before each feature group.
    * Several ratios divide by a count that may be zero and are left as the
      division produces them.
    """
    pay = encode_and_bind(pay, 'channel').drop(columns='channel')

    # Six-hour day-part indicators: 04-09, 10-15, 16-21 and 22-03.
    pay['dt_0'] = (pay.dt_hour >= 4) & (pay.dt_hour <= 9)
    pay['dt_1'] = (pay.dt_hour >= 10) & (pay.dt_hour <= 15)
    pay['dt_2'] = (pay.dt_hour >= 16) & (pay.dt_hour <= 21)
    pay['dt_3'] = (pay.dt_hour >= 22) | (pay.dt_hour <= 3)

    # Signed amount: negative for outgoing payments, positive otherwise.
    pay['delta'] = (1 - 2 * pay.is_outgoing) * pay.amount

    # Conjunctions of distinct "good" flags. Only pairs of different flags are
    # aggregated below, so only those are materialised here.
    good_flags = [4, 8]
    pair_flags = []
    for j, second in enumerate(good_flags):
        for first in good_flags[:j]:
            pair_name = f'flag_{first}_{second}'
            pay[pair_name] = pay[f'flag_{first}'] & pay[f'flag_{second}']
            pair_flags.append(pair_name)

    gb = pay.groupby('client_id')

    print('amount statistics')
    fts = gb['amount'].agg(['mean', 'median', 'std', 'min', 'max'])
    # Despite its name this is the occurrence count of the most frequent amount.
    fts['median_amount_count'] = gb['amount'].apply(max_count)
    fts['delta'] = gb['delta'].sum()

    print('time statistics')
    fts['mean_hour'] = gb['dt_hour'].mean()
    fts['mean_time'] = gb['time'].mean()
    fts['time_range'] = gb['time'].apply(lambda seq: seq.max() - seq.min())

    # Gaps between consecutive payments, in the row order of `pay`.
    fts['max_diff'] = gb['time'].apply(lambda seq: seq_diff(seq).max())
    fts['std_diff'] = gb['time'].apply(lambda seq: seq_diff(seq).std())
    fts['popular_hour'] = gb['dt_hour'].apply(most_popular)
    fts['most_frq_time_count'] = gb['time'].apply(max_count)
    fts['most_frq_day_count'] = gb['dt_day'].apply(max_count)
    fts[[f'dt_{i}_mean' for i in range(4)]] = gb[[f'dt_{i}' for i in range(4)]].mean()

    print('misc')
    fts['size'] = gb.size()
    fts['is_outgoing_mean'] = gb['is_outgoing'].mean()

    print('flag statistics')
    fts[[f'flag_{i}_mean' for i in range(12)]] = gb[[f'flag_{i}' for i in range(12)]].mean()

    fts[[flag + '_count' for flag in pair_flags]] = gb[pair_flags].sum()
    fts[[flag + '_mean' for flag in pair_flags]] = gb[pair_flags].mean()

    print('channel statistics')
    # One-hot channel columns created by encode_and_bind above.
    channel_columns = [column for column in pay.columns if column.startswith('channel')]
    fts[channel_columns] = gb[channel_columns].sum()
    # Denominator: payments with any known channel. It is summed over the
    # channel columns actually present rather than four fixed names, so a
    # missing or additional channel does not break or skew the shares.
    known_channel_count = fts[channel_columns].sum(axis=1)
    for channel in channel_columns:
        fts[f'{channel}_percent'] = fts[channel] / known_channel_count

    print('contractor statistics')
    fts['different_contractor'] = gb['contractor_id'].apply(count_cat)
    fts['different_contractor_in'] = gb[['contractor_id', 'is_outgoing']].apply(count_cat_flag_no)
    fts['different_contractor_out'] = gb[['contractor_id', 'is_outgoing']].apply(count_cat_flag_yes)

    fts['known_contractor_count'] = gb['contractor_id'].apply(cat_notna)
    fts['known_contractor_percent'] = fts.known_contractor_count / fts['size']

    print('#1')
    fts[[f'top{i}_contractor_count' for i in range(k)]] = gb['contractor_id'].apply(categorical_max_count).unstack()

    print('#2')
    for i in range(k):
        # Denominator clamped to 1 so clients without known contractors get 0.
        fts[f'top{i}_contractor_fraction'] = fts[f'top{i}_contractor_count'] / np.maximum(fts.known_contractor_count, 1)

    print('#3')
    fts[[f'top{i}_contractor' for i in range(k)]] = gb['contractor_id'].apply(categorical_first_k_top).unstack()

    fts[[f'contractor_{contractor}_count' for contractor in popular_contractors]] = gb['contractor_id'].apply(get_popular_contractor_counts).unstack()
    for contractor in popular_contractors:
        fts[f'contractor_{contractor}_percent'] = fts[f'contractor_{contractor}_count'] / fts.known_contractor_count

    # Cumulative membership: in_topN is True when the contractor is among the
    # client's N most frequent contractors, for N = 1..5.
    check_top = ['471487', '964772', '58960', '919849', '935243', '826499']
    for contractor in check_top:
        in_top = (fts.top0_contractor == contractor)
        fts[f'contractor_{contractor}_in_top1'] = in_top
        for rank in range(1, 5):
            in_top = in_top | (fts[f'top{rank}_contractor'] == contractor)
            fts[f'contractor_{contractor}_in_top{rank + 1}'] = in_top

    return fts

In [None]:
# Build the per-client feature table for the training payments.
features = generate_features(payments)
features.head(5)

amount statistics
time statistics
misc
flag statistics
channel statistics
contractor statistics
#1
#2
#3


  self[k1] = value[k2]
  fts[f'contractor_{contractor}_percent'] = fts[f'contractor_{contractor}_count'] / fts.known_contractor_count
  fts[f'contractor_{contractor}_in_top1'] = (fts.top0_contractor == contractor)
  fts[f'contractor_{contractor}_in_top2'] = fts[f'contractor_{contractor}_in_top1'] | (fts.top1_contractor == contractor)
  fts[f'contractor_{contractor}_in_top3'] = fts[f'contractor_{contractor}_in_top2'] | (fts.top2_contractor == contractor)
  fts[f'contractor_{contractor}_in_top4'] = fts[f'contractor_{contractor}_in_top3'] | (fts.top3_contractor == contractor)
  fts[f'contractor_{contractor}_in_top5'] = fts[f'contractor_{contractor}_in_top4'] | (fts.top4_contractor == contractor)


Unnamed: 0_level_0,mean,median,std,min,max,median_amount_count,delta,mean_hour,mean_time,time_range,...,contractor_935243_in_top1,contractor_935243_in_top2,contractor_935243_in_top3,contractor_935243_in_top4,contractor_935243_in_top5,contractor_826499_in_top1,contractor_826499_in_top2,contractor_826499_in_top3,contractor_826499_in_top4,contractor_826499_in_top5
client_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100045,8362072.0,1706470.0,17336870.0,22,124737631,2,87820672.0,8.597484,5167.037736,8746,...,False,False,False,False,False,False,False,False,False,False
100055,22012620.0,4010524.0,60299310.0,2238,766121312,1,928630158.0,10.65745,4458.325653,8739,...,False,False,False,False,False,False,False,False,False,False
100068,37822830.0,3232108.0,84930120.0,34043,681967564,1,140373789.0,5.645472,6740.350674,4160,...,False,False,False,False,False,False,False,False,False,False
100076,11555910.0,1522173.0,35897450.0,29,255864840,1,-124991605.0,7.043011,1521.107527,3214,...,False,False,False,False,False,False,False,False,False,False
100089,30656950.0,21997923.0,44856930.0,1519,645257028,1,810141987.0,13.543561,4973.104167,8781,...,False,False,False,False,False,False,False,False,False,False


### Разбиение датасета на train / val

Реализуем функцию для разбиения датасета на обучающую и валидационную выборки в пропорциях 85%/15% соответственно, примерно сохраняя распределение таргетов в обеих выборках. Из соображений быстродействия и детерминированности будем кешировать разбиение в JSON-файл.

Для стратификации используется функция [iterative_train_test_split](http://scikit.ml/_modules/skmultilearn/model_selection/iterative_stratification.html#iterative_train_test_split) из библиотечки scikit-multilearn.

In [None]:
def stratified_split_cached(X, y, split_idx_file):
    """Split ``X``/``y`` into train and validation parts, caching the split.

    If ``split_idx_file`` exists, the index labels stored under its
    ``'train'`` and ``'val'`` keys are reused as is. Otherwise ``y`` is
    shuffled with ``SEED``, an iterative multi-label stratified split with
    ``test_size=0.15`` is computed over its index labels, and the result is
    written to ``split_idx_file`` as JSON.

    Returns ``(X_train, y_train, X_val, y_val)``, selected with ``.loc``.
    """
    if not os.path.isfile(split_idx_file):
        shuffled = sku.shuffle(y, random_state=SEED)
        # The splitter works on 2-D arrays, so the index labels are passed as
        # a single-column matrix and squeezed back afterwards.
        label_matrix = np.expand_dims(shuffled.index, 1)
        train_idx, _, val_idx, _ = iterative_train_test_split(label_matrix, np.array(shuffled), test_size=0.15)
        train_idx = train_idx.squeeze(1)
        val_idx = val_idx.squeeze(1)
        with open(split_idx_file, 'w') as f:
            json.dump({'train': list(train_idx), 'val': list(val_idx)}, f)
    else:
        with open(split_idx_file, 'r') as f:
            cached = json.load(f)
        train_idx = cached['train']
        val_idx = cached['val']
    return X.loc[train_idx], y.loc[train_idx], X.loc[val_idx], y.loc[val_idx]

In [None]:
# Feature columns excluded before training. The same list is applied to the
# test features further down, so both tables lose the same columns.
to_drop = [
 'flag_5_mean',
 'contractor_837693_count',
 'contractor_935119_percent',
 'contractor_20761_count',
 'contractor_129950_percent',
 'contractor_951341_percent',
 'contractor_535023_percent',
 'contractor_964772_percent',
 'contractor_459243_count',
 'contractor_39133_percent',
 'contractor_151306_count',
 'contractor_801640_percent',
 'contractor_756565_count',
 'contractor_935243_count',
 'contractor_427394_percent',
 'contractor_363393_count',
 'contractor_247294_percent',
 'contractor_770815_percent',
 'contractor_994449_count',
 'contractor_535023_count',
 'contractor_41319_percent',
 'contractor_48721_count',
 'contractor_350553_count',
 'contractor_404012_percent',
 'contractor_174391_count',
 'contractor_307488_percent',
 'contractor_945049_percent',
 'contractor_780284_percent',
 'contractor_129950_count',
 'contractor_528229_count',
 'contractor_189490_percent',
 'contractor_231144_percent',
 'contractor_747644_count',
 'contractor_454881_count',
 'contractor_935119_count',
 'contractor_943310_percent',
 'contractor_780284_count',
 'contractor_363393_percent',
 'contractor_951341_count',
 'contractor_459394_percent',
 'contractor_767680_count',
 'contractor_231144_count',
 'contractor_598708_percent',
 'contractor_894368_count',
 'contractor_654042_percent',
 'contractor_918674_percent',
 'contractor_383395_percent',
 'contractor_869701_count',
 'contractor_716216_count',
 'contractor_404012_count',
 'contractor_943310_count',
 'contractor_918674_count',
 'contractor_894368_percent',
 'contractor_427394_count',
 'contractor_716216_percent',
 'contractor_247294_count',
 'contractor_48721_percent',
 'contractor_39133_count',
 'contractor_307488_count',
 'contractor_770815_count',
 'contractor_801640_count',
 'contractor_753765_count',
 'contractor_945049_count',
 'contractor_41319_count',
 'contractor_383395_count',
 'contractor_189490_count',
 'contractor_654042_count',
 'contractor_459394_count',
 'contractor_598708_count',
 'flag_7_mean',
 'contractor_753765_percent',
 'contractor_837693_percent',
 'contractor_935243_percent',
 'contractor_459243_percent',
 'contractor_869701_percent',
 'contractor_174391_percent',
 'contractor_756565_percent',
 'contractor_767680_percent',
 'contractor_994449_percent',
 'contractor_151306_percent',
 'contractor_454881_percent',
 'contractor_350553_percent',
 'contractor_58960_in_top2',
 'contractor_964772_in_top3',
 'contractor_20761_percent',
 'contractor_826499_percent',
 'contractor_964772_in_top5',
 'contractor_58960_in_top4',
 'contractor_58960_in_top5',
 'contractor_935243_in_top4',
 'contractor_58960_in_top3',
 'contractor_919849_in_top4',
 'contractor_935243_in_top5',
 'contractor_964772_in_top2',
 'contractor_964772_in_top4',
 'contractor_826499_in_top4',
 'contractor_919849_in_top5',
 'contractor_471487_in_top4',
 'contractor_826499_in_top5',
 'contractor_826499_in_top2',
 'contractor_935243_in_top3',
 'contractor_826499_in_top3',
 'contractor_58960_in_top1',
 'contractor_935243_in_top2',
 'contractor_964772_in_top1',
 'contractor_919849_in_top2',
 'contractor_935243_in_top1',
 'contractor_919849_in_top3',
 'contractor_826499_in_top1',
 'contractor_471487_in_top1',
 'contractor_919849_in_top1',
]
# Additionally drop every raw "*_count" feature from the contractor_, dt_,
# top and flag_ families. Names already listed in to_drop are appended
# again; the duplicates are kept as before.
for feature in features.columns:
    if not feature.endswith('_count'):
        continue
    if feature.startswith(('contractor_', 'dt_', 'top', 'flag_')):
        to_drop.append(feature)

In [None]:
# Train/validation split of the reduced feature table. The split is read
# from 'split_cache.json' when that file already exists.
X_train, y_train, X_val, y_val = stratified_split_cached(features.drop(columns=to_drop), target, 'split_cache.json')

In [None]:
# Sanity check: features and labels must have matching row counts per part.
len(X_train), len(y_train), len(X_val), len(y_val)

(57169, 57169, 9836, 9836)

In [None]:
# (rows, feature columns) of the training part.
X_train.shape

(57169, 74)

In [None]:
# Feature columns that remain after dropping to_drop.
X_train.columns

Index(['mean', 'median', 'std', 'min', 'max', 'median_amount_count', 'delta',
       'mean_hour', 'mean_time', 'time_range', 'max_diff', 'std_diff',
       'popular_hour', 'most_frq_time_count', 'most_frq_day_count',
       'dt_0_mean', 'dt_1_mean', 'dt_2_mean', 'dt_3_mean', 'size',
       'is_outgoing_mean', 'flag_0_mean', 'flag_1_mean', 'flag_2_mean',
       'flag_3_mean', 'flag_4_mean', 'flag_6_mean', 'flag_8_mean',
       'flag_9_mean', 'flag_10_mean', 'flag_11_mean', 'flag_4_8_mean',
       'channel_app', 'channel_atm', 'channel_pos', 'channel_web',
       'channel_app_percent', 'channel_atm_percent', 'channel_pos_percent',
       'channel_web_percent', 'different_contractor',
       'different_contractor_in', 'different_contractor_out',
       'known_contractor_count', 'known_contractor_percent',
       'top0_contractor_fraction', 'top1_contractor_fraction',
       'top2_contractor_fraction', 'top3_contractor_fraction',
       'top4_contractor_fraction', 'top5_contractor_fraction

## Моделирование

Будем использовать 35 CatBoostClassifier'ов, по одному на каждый род деятельности.

Функция для создания Pool из двух датафреймов

In [None]:
def make_pool(X, categorical_features_indices=None, y=None):
    """Wrap features (and optionally labels) in a ``catboost.Pool``.

    ``categorical_features_indices`` is forwarded as ``cat_features``;
    ``y`` is the label and may be omitted for prediction-only pools.
    """
    pool = cb.Pool(data=X, label=y, cat_features=categorical_features_indices)
    return pool

Обучение набора моделей

In [None]:
# One binary CatBoost classifier per target column type_0 .. type_34.
models = []
# Positions of the object-dtype columns; they are passed to CatBoost as
# categorical features.
categorical_features_indices = np.where(X_train.dtypes == object)[0]
for i in tqdm(range(35)):
    target_column = f'type_{i}'
    pool_train = make_pool(X_train, categorical_features_indices, y_train[target_column])
    pool_val = make_pool(X_val, categorical_features_indices, y_val[target_column])

    model = cb.CatBoostClassifier(
        iterations=250,
        loss_function='Logloss',
        random_seed=SEED,
        max_depth=5,
        early_stopping_rounds=200,
    )
    # The validation pool serves as eval_set for early stopping.
    model.fit(pool_train, eval_set=pool_val, plot=False, verbose=0)

    models.append(model)

100%|██████████████████████████████████████████████████████████████████████████████████| 35/35 [02:52<00:00,  4.92s/it]Custom logger is already specified. Specify more than one logger at same time is not thread safe.


## Проверяем качество модели на валидации

Функция формирования датафрейма с предсказаниями

In [None]:
def predict(X, model_zoo, threshold=None):
    """Predict all target columns for ``X`` with one model per column.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table; its index becomes the index of the result.
    model_zoo : iterable
        Fitted classifiers; the n-th model produces column ``type_n``.
    threshold : float, optional
        When given, a label is 1 where the positive-class probability from
        ``predict_proba`` exceeds ``threshold``. When omitted, each model's
        own ``predict`` is used. An earlier note in this notebook recorded
        the validation score going 0.408 -> 0.448 with a 0.33 cut-off.

    Returns
    -------
    pandas.DataFrame
        Integer labels, one row per row of ``X`` and one ``type_n`` column
        per model.

    Notes
    -----
    Uses the module-level ``make_pool`` and ``categorical_features_indices``.
    """
    # The pool depends only on X, so it is built once and shared by all models.
    pool = make_pool(X, categorical_features_indices)
    if threshold is None:
        preds = [model.predict(pool) for model in model_zoo]
    else:
        preds = [model.predict_proba(pool)[:, 1] > threshold for model in model_zoo]
    # Column names follow the number of models instead of a fixed 35.
    columns = [f'type_{i}' for i in range(len(preds))]
    preds = pd.DataFrame(np.array(preds).transpose(1, 0), index=X.index, columns=columns).astype(int)
    return preds

Предскажем значения для валидационного набора

In [None]:
# Micro-averaged F0.5 on the validation part ...
val_preds = predict(X_val, models)
val_score = fbeta_score(y_val, val_preds, beta=0.5, average='micro', zero_division=0)
print('val: ' + str(val_score))

# ... and on the training part, to see how far the two scores are apart.
train_preds = predict(X_train, models)
train_score = fbeta_score(y_train, train_preds, beta=0.5, average='micro', zero_division=0)
print('train: ' + str(train_score))

val: 0.44832923062554625
train: 0.8844344659572138


In [None]:
# Rank features by their highest importance across the 35 models.
# pool_train is whatever the training loop assigned last, i.e. the pool
# built for the final target column.
importance_per_model = pd.DataFrame({
    f'feature_importance_{i}': models[i].get_feature_importance(pool_train) for i in range(35)
})
importance_ranking = pd.DataFrame({
    'feature_importance': importance_per_model.max(axis=1),
    'feature': X_train.columns
})
importance_ranking = importance_ranking.sort_values(by=['feature_importance'], ascending=False)
list(importance_ranking['feature'])


['flag_11_mean',
 'different_contractor_out',
 'top0_contractor',
 'top1_contractor',
 'max',
 'top3_contractor',
 'top2_contractor',
 'is_outgoing_mean',
 'flag_6_mean',
 'contractor_602806_percent',
 'contractor_919849_percent',
 'different_contractor_in',
 'contractor_471487_in_top5',
 'mean',
 'time_range',
 'contractor_747644_percent',
 'std',
 'different_contractor',
 'top8_contractor',
 'contractor_58960_percent',
 'top4_contractor',
 'contractor_471487_in_top3',
 'top6_contractor',
 'top7_contractor',
 'known_contractor_percent',
 'top5_contractor',
 'mean_time',
 'flag_9_mean',
 'dt_0_mean',
 'median',
 'channel_app',
 'top8_contractor_fraction',
 'median_amount_count',
 'flag_8_mean',
 'flag_4_8_mean',
 'top5_contractor_fraction',
 'top6_contractor_fraction',
 'std_diff',
 'flag_1_mean',
 'flag_0_mean',
 'channel_web',
 'top0_contractor_fraction',
 'channel_atm',
 'dt_2_mean',
 'flag_2_mean',
 'contractor_471487_percent',
 'top4_contractor_fraction',
 'contractor_666627_perce

## Загружаем тестовые данные

In [None]:
# Load the test payments with the same dtypes as the training payments.
payments_test = pd.read_csv(PAYMENTS_TEST_PATH, dtype=payments_dtypes)
payments_test.head()

## Генерируем фичи для тестовых данных

In [380]:
# Same preprocessing as for the training payments: hour index on a single
# axis, then rows sorted by time before feature generation.
payments_test['time'] = payments_test.dt_day * 24 + payments_test.dt_hour
payments_test.sort_values(by=['time'], ascending=[True], inplace=True)
features_test = generate_features(payments_test)
# NOTE(review): leftover experiment; `scaler` is not defined in the visible notebook.
# features_test = pd.DataFrame(scaler.transform(features_test))
features_test.head()

amount statistics
misc
flag statistics
channel statistics
contractor statistics


  fts[f'contractor_{contractor}_percent'] = fts[f'contractor_{contractor}_count'] / fts.known_contractor_count


time statistics


  fts['mean_hour'] = gb['dt_hour'].mean()
  fts['mean_time'] = gb['time'].mean()
  fts['time_range'] = gb['time'].apply(lambda seq: seq.max() - seq.min())
  fts['max_diff'] = gb['time'].apply(lambda seq: max(seq_diff(seq)))
  fts['min_diff'] = gb['time'].apply(lambda seq: min(seq_diff(seq)))
  fts['popular_hour'] = gb['dt_hour'].apply(most_popular)
  fts['most_frq_time_count'] = gb['time'].apply(max_count)


Unnamed: 0_level_0,mean,median,std,min,max,size,is_outgoing_mean,flag_0_count,flag_1_count,flag_2_count,...,contractor_247294_percent,contractor_894368_percent,contractor_716216_percent,mean_hour,mean_time,time_range,max_diff,min_diff,popular_hour,most_frq_time_count
client_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100050,35369200.0,4800238.5,77028270.0,798,474601237,160,0.73125,7,0,0,...,0.0,0.0,0.0,12.725,4572.725,8134,645,0,16,4
100128,207492400.0,42574180.0,521388200.0,2401,4263233615,97,0.804124,0,0,0,...,0.0,0.0,0.0,14.340206,3932.030928,7297,1670,0,15,3
100159,151667500.0,79896060.0,208179200.0,17615,1348514255,746,0.600536,0,0,0,...,0.0,0.0,0.0,11.292225,4091.967828,8770,195,0,9,4
10018,54241480.0,26540880.5,104207200.0,43,808270306,350,0.28,0,0,0,...,0.0,0.0,0.0,13.945714,1980.025714,7137,424,0,15,4
100237,200586500.0,50737877.0,393988600.0,31511,2521806684,106,0.839623,16,0,12,...,0.0,0.0,0.0,11.915094,5767.839623,5488,713,0,11,2


## Предскажем значения для тестовых данных и сгенерируем сабмит

In [381]:
# Keep exactly the columns the models were trained on, in training order.
# X_train was built as features.drop(columns=to_drop), so selecting its
# columns also removes the to_drop features. A training column missing from
# the test features raises KeyError here, and the cell is safe to re-run.
features_test = features_test[X_train.columns]

In [385]:
# Print test/train column names side by side to compare names and order.
# zip stops at the shorter of the two, so a length mismatch is not shown.
print(*list(zip(features_test.columns, X_train.columns)), sep='\n')

('mean', 'mean')
('median', 'median')
('std', 'std')
('min', 'min')
('max', 'max')
('size', 'size')
('is_outgoing_mean', 'is_outgoing_mean')
('flag_0_count', 'flag_0_count')
('flag_1_count', 'flag_1_count')
('flag_2_count', 'flag_2_count')
('flag_3_count', 'flag_3_count')
('flag_4_count', 'flag_4_count')
('flag_6_count', 'flag_6_count')
('flag_8_count', 'flag_8_count')
('flag_9_count', 'flag_9_count')
('flag_10_count', 'flag_10_count')
('flag_11_count', 'flag_11_count')
('flag_0_mean', 'flag_0_mean')
('flag_1_mean', 'flag_1_mean')
('flag_2_mean', 'flag_2_mean')
('flag_3_mean', 'flag_3_mean')
('flag_4_mean', 'flag_4_mean')
('flag_6_mean', 'flag_6_mean')
('flag_8_mean', 'flag_8_mean')
('flag_9_mean', 'flag_9_mean')
('flag_10_mean', 'flag_10_mean')
('flag_11_mean', 'flag_11_mean')
('channel_app', 'channel_app')
('channel_atm', 'channel_atm')
('channel_pos', 'channel_pos')
('channel_web', 'channel_web')
('channel_app_percent', 'channel_app_percent')
('channel_atm_percent', 'channel_atm_per

In [383]:
# Predict all 35 target columns for the test clients.
preds_test = predict(features_test, models)
preds_test

Unnamed: 0_level_0,type_0,type_1,type_2,type_3,type_4,type_5,type_6,type_7,type_8,type_9,...,type_25,type_26,type_27,type_28,type_29,type_30,type_31,type_32,type_33,type_34
client_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100050,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
100128,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
100159,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10018,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
100237,1,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999572,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
99966,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
999662,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
999674,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [384]:
# Write the submission; the frame's index (taken from features_test) is
# written as the first column.
preds_test.to_csv('submission.csv')