### Здесь считаются признаки основанные на target таблице

In [1]:
import os
import pandas as pd
import numpy as np
import time

from sklearn.preprocessing import LabelEncoder, OneHotEncoder


In [2]:
start_time = time.time()

In [3]:
files = ['Datasets/' + file for file in sorted(os.listdir('Datasets'))]

In [4]:
target = pd.read_csv('alfabattle2_abattle_train_target.csv')
sample_submission = pd.read_csv('alfabattle2_abattle_sample_prediction.csv')
pred_ts = pd.read_csv('alfabattle2_prediction_session_timestamp.csv')

In [5]:
to_pred = pd.merge(sample_submission, pred_ts)
to_pred.columns = ['client_pin', 'multi_class_target', 'timestamp']
to_pred['session_id'] = np.arange(len(to_pred))

In [6]:
# Мержим тестовый файл с тренировочным, чтобы легче было собирать данные с предыдущих сессий
target = pd.concat([target, to_pred])

In [7]:
target.sort_values(['client_pin', 'timestamp'], inplace=True)
target['timestamp'] = pd.to_datetime(target['timestamp'])


In [8]:
enc = LabelEncoder()
target['multi_class_target'] = enc.fit_transform(target['multi_class_target'])

In [9]:
enc.classes_

array(['card2card_transfer', 'card_recharge', 'chat', 'credit_info',
       'invest', 'main_screen', 'mobile_recharge', 'own_transfer',
       'phone_money_transfer', 'statement'], dtype=object)

In [10]:
target_names = target['multi_class_target'].unique()

In [11]:
# Количство сессий до и после текущей сессии

target['num'] = target.groupby('client_pin')['timestamp'].transform(lambda x: np.arange(len(x))[::-1])
target['num_rev'] = target.groupby('client_pin')['timestamp'].transform(lambda x: np.arange(len(x)))

In [12]:
# Количество сессий и времени, прощедших с последней сессии с соответсвующим таргетом

for name in target_names:
    print(name)
    name = str(name)
    target.loc[:, 'last_' + name] = target['timestamp']
    target.loc[target['multi_class_target'] != int(name), 'last_' + name] = None
    
    target['last_' + name] = target.groupby('client_pin')['last_' + name].fillna(method='ffill')
    target['last_' + name] = target.groupby('client_pin')['last_' + name].shift()
    
    target['last_' + name] = target['timestamp'] - target['last_' + name]
    target['last_' + name] = target['last_' + name].dt.total_seconds() / 3600 / 24
    
    target.loc[:, 'last_n_' + name] = np.arange(len(target))
    target.loc[target['multi_class_target'] != int(name), 'last_n_' + name] = None
    
    target['last_n_' + name] = target.groupby('client_pin')['last_n_' + name].fillna(method='ffill')
    target['last_n_' + name] = target.groupby('client_pin')['last_n_' + name].shift()
    target['last_n_' + name] = np.arange(len(target)) - target['last_n_' + name]

9
5
6
2
4
3
1
8
0
7


In [13]:
# 10 предыдущих таргетов
for i in range(1, 10):
    target['prev_target' + str(i)] = target.groupby('client_pin')['multi_class_target'].shift(i)


In [14]:
# предыдущая сессия для каждого пользователя
target['prev_session'] = target.groupby('client_pin')['session_id'].transform('shift')

In [15]:
# Временные признаки
target['weekday'] = target.timestamp.dt.weekday
target['hour'] = target.timestamp.dt.hour
target['day'] = target.timestamp.dt.day

In [16]:
# признаки, основанные на скользящих суммах from_last признаков

target['from_last'] = target.groupby('client_pin')['timestamp'].diff().dt.total_seconds() / 3600

for i in range(2, 10):
    target['from_last' + str(i)] = target.groupby('client_pin')['from_last'].transform(
        lambda x: x.rolling(i).sum())

In [17]:
# OHE енкодер

ohe = OneHotEncoder()
ohe.fit(target['prev_target1'].fillna(5).values.reshape(-1, 1))

OneHotEncoder(categories='auto', drop=None, dtype=<class 'numpy.float64'>,
              handle_unknown='error', sparse=True)

In [18]:
# Добавляем ohe признаки в основную таблицу

target = pd.concat([target, pd.DataFrame(
            ohe.fit_transform(target['prev_target1'].fillna(5).values.reshape(-1, 1)).toarray(),
            columns=['ohe_' + str(i) for i in range(10)], index=target.index)], axis=1)

 Далее считаются кумулятивные суммы ohe признаков с разными параметрами

In [19]:
temp = target.groupby('client_pin')[['ohe_' + str(i) for i in range(10)]].cumsum()
temp.columns = ['cs_' + str(i) for i in range(10)]

temp = temp.div(temp.sum(axis=1), axis=0)

target = pd.concat([target, temp], axis=1)
del temp

In [20]:
temp = target.groupby('client_pin')[['ohe_' + str(i) for i in range(10)]].apply(
    lambda x: x.ewm(alpha=0.05).mean())
temp.columns = ['cs_ewm_0.05' + str(i) for i in range(10)]


target = pd.concat([target, temp], axis=1)
del temp

In [21]:
temp = target.groupby('client_pin')[['ohe_' + str(i) for i in range(10)]].apply(
    lambda x: x.ewm(alpha=0.1).mean())
temp.columns = ['cs_ewm_0.1' + str(i) for i in range(10)]


target = pd.concat([target, temp], axis=1)
del temp

In [22]:
temp = target.groupby('client_pin')[['ohe_' + str(i) for i in range(10)]].apply(
    lambda x: x.ewm(alpha=0.3).mean())
temp.columns = ['cs_ewm_0.3' + str(i) for i in range(10)]


target = pd.concat([target, temp], axis=1)
del temp

In [23]:
k = 2

In [24]:
# Считаем количство случаев, когда признак Х следует за признаком Y

temp = pd.pivot_table(target[(target.num > k + 7)], index='prev_target1', 
                      columns='multi_class_target', values='timestamp', aggfunc='count')

In [25]:
target = pd.merge(target, temp, how='left', on='prev_target1')

In [26]:
target.to_csv('data_for_model.csv', index=False)

In [29]:
print(f'The full process takes {(time.time() - start_time) / 60 / 60} minutes')

The full process takes 0.4741194432973862 minutes
