In [1]:
%load_ext autoreload
%autoreload 2

import tensorflow as tf
import keras.backend as K
import os
import pandas as pd
import sys
import pickle
import numpy as np

from tensorflow import config
from tensorflow import keras
from tensorflow.keras import layers as L
from tensorflow.keras import Model

from sklearn.model_selection import train_test_split
from tqdm import tqdm

# Add the parent and grandparent directories to the import path; they hold the shared
# helper modules for data processing that this notebook imports.
sys.path.append('../../')
sys.path.append('../')

In [2]:
# This cell is needed so that TensorFlow handles GPU memory correctly:
# memory growth is switched on for every visible GPU.
gpus = config.experimental.list_physical_devices('GPU')
if len(gpus) > 0:
    try:
        # Memory growth currently has to be configured the same way on all GPUs.
        for gpu in gpus:
            config.experimental.set_memory_growth(gpu, True)
        logical_gpus = config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as err:
        # Memory growth must be set before the GPUs have been initialized.
        print(err)

1 Physical GPUs, 1 Logical GPUs


In [3]:
# Root directory of the competition data. Every path below is derived from it, so
# relocating the dataset only requires changing this one constant.
DATA_DIR = '/media/DATA/AlfaBattle'

# Raw transaction dumps.
TRAIN_TRANSACTIONS_PATH = f'{DATA_DIR}/train_transactions_contest/'
TEST_TRANSACTIONS_PATH = f'{DATA_DIR}/test_transactions_contest/'

# Training targets and preprocessed transactions.
TRAIN_TARGET_PATH = f'{DATA_DIR}/train_target.csv'
PRE_TRANSACTIONS_PATH = f'{DATA_DIR}/preprocessed_transactions/'  # previously contained a stray '//'
PRE_TEST_TRANSACTIONS_PATH = f'{DATA_DIR}/preprocessed_test_transactions/'

# Pickled buckets used for validation, training and test scoring.
PICKLE_VAL_BUCKET_PATH = f'{DATA_DIR}/val_buckets/'
PICKLE_VAL_TRAIN_BUCKET_PATH = f'{DATA_DIR}/val_train_buckets/'
PICKLE_VAL_TEST_BUCKET_PATH = f'{DATA_DIR}/val_test_buckets/'

# Directory where model checkpoints of this notebook are written.
CHECKPOINTS_ADV_PATH = f'{DATA_DIR}/checkpoints/tf_advanced_baseline/'

In [4]:
# Build the list of bucket files under PICKLE_VAL_BUCKET_PATH in sorted order
# and display it as the cell output.
path_to_dataset = PICKLE_VAL_BUCKET_PATH
dir_with_datasets = os.listdir(path_to_dataset)
dataset_val = [os.path.join(path_to_dataset, bucket_name) for bucket_name in sorted(dir_with_datasets)]
dataset_val

['/media/DATA/AlfaBattle/val_buckets/processed_chunk_000.pkl',
 '/media/DATA/AlfaBattle/val_buckets/processed_chunk_001.pkl',
 '/media/DATA/AlfaBattle/val_buckets/processed_chunk_002.pkl',
 '/media/DATA/AlfaBattle/val_buckets/processed_chunk_003.pkl',
 '/media/DATA/AlfaBattle/val_buckets/processed_chunk_004.pkl',
 '/media/DATA/AlfaBattle/val_buckets/processed_chunk_005.pkl',
 '/media/DATA/AlfaBattle/val_buckets/processed_chunk_006.pkl',
 '/media/DATA/AlfaBattle/val_buckets/processed_chunk_007.pkl',
 '/media/DATA/AlfaBattle/val_buckets/processed_chunk_008.pkl',
 '/media/DATA/AlfaBattle/val_buckets/processed_chunk_009.pkl']

In [5]:
# Build the list of bucket files under PICKLE_VAL_TRAIN_BUCKET_PATH in sorted order
# and display it as the cell output.
path_to_dataset = PICKLE_VAL_TRAIN_BUCKET_PATH
dir_with_datasets = os.listdir(path_to_dataset)
dataset_train = [os.path.join(path_to_dataset, bucket_name) for bucket_name in sorted(dir_with_datasets)]
dataset_train

['/media/DATA/AlfaBattle/val_train_buckets/processed_chunk_000.pkl',
 '/media/DATA/AlfaBattle/val_train_buckets/processed_chunk_001.pkl',
 '/media/DATA/AlfaBattle/val_train_buckets/processed_chunk_002.pkl',
 '/media/DATA/AlfaBattle/val_train_buckets/processed_chunk_003.pkl',
 '/media/DATA/AlfaBattle/val_train_buckets/processed_chunk_004.pkl',
 '/media/DATA/AlfaBattle/val_train_buckets/processed_chunk_005.pkl',
 '/media/DATA/AlfaBattle/val_train_buckets/processed_chunk_006.pkl',
 '/media/DATA/AlfaBattle/val_train_buckets/processed_chunk_007.pkl',
 '/media/DATA/AlfaBattle/val_train_buckets/processed_chunk_008.pkl',
 '/media/DATA/AlfaBattle/val_train_buckets/processed_chunk_009.pkl']

* Для создания модели используется фреймворк `tensorflow`. В нём есть всё, чтобы писать произвольные сложные архитектуры и быстро экспериментировать.

Используются следующие функции:

*  `data_generators.batches_generator` - функция-генератор, итеративно возвращает батчи. В зависимости от флага `is_train` может быть использована для генерации батчей на train/val/test стадию.
* функция `tf_training.train_epoch` - обучает модель одну эпоху.
* функция `tf_training.eval_model` - проверяет качество модели на отложенной выборке и возвращает roc_auc_score.
* функция `tf_training.inference` - делает предикты на новых данных и готовит фрейм для проверяющей системы.
* класс `training_aux.EarlyStopping` - реализует early_stopping, сохраняя лучшую модель. 

In [6]:
from data_generators import batches_generator, transaction_features
from tf_training import train_epoch, eval_model, inference
from training_aux import EarlyStopping

* Все признаки в модели будут категориальными. Для их представления в модели используются категориальные эмбеддинги. Для этого нужно каждому категориальному признаку задать размерность латентного пространства. Размерность считается по [формуле](https://forums.fast.ai/t/size-of-embedding-for-categorical-variables/42608) из библиотеки `fast.ai`. 

In [4]:
# Embedding configuration per categorical feature.
# Each value is a pair (source_size, projection): the model builder unpacks it in this
# order and creates an embedding with source_size + 1 rows and `projection` columns.
embedding_projections = {
    'currency': (11, 6),
    'operation_kind': (7, 5),
    'card_type': (175, 29),
    'operation_type': (22, 9),
    'operation_type_group': (4, 3),
    'ecommerce_flag': (3, 3),
    'payment_system': (7, 5),
    'income_flag': (3, 3),
    'mcc': (108, 22),
    'country': (24, 9),
    'city': (163, 28),
    'mcc_category': (28, 10),
    'day_of_week': (7, 5),
    'hour': (24, 9),
    'weekofyear': (53, 15),
    'amnt': (10, 6),
    'days_before': (23, 9),
    'hour_diff': (10, 6),
    'product': (5, 4),
}

### 2. Создание модели

* Реализуем модель. Все входные признаки представим в виде эмбеддингов, сконкатенируем, чтобы получить векторное представление транзакции. Используем SpatialDropout, чтобы регуляризовать эмбеддинги. Подадим последовательности в `BiGRU` рекуррентную сеть. Используем все скрытые состояния сети, чтобы получить агрегированное представление об истории транзакций - пропустим все скрытые состояния `BiGRU` через `AvgPooling` и через `MaxPooling`. Представим признак `product` в виде отдельного эмбеддинга. Сконкатенируем его с результатами пулингов. На основе такого входа построим небольшой `MLP`, выступающий классификатором для целевой задачи. Используем градиентный спуск, чтобы решить оптимизационную задачу.

In [7]:
from data_generators import batches_generator, transaction_features
from tf_training import train_epoch, eval_model, inference
from training_aux import EarlyStopping

In [8]:
def build_transactions_rnn(transactions_cat_features, embedding_projections, product_col_name='product',
                          rnn_units=128, classifier_units=32, optimizer=None):
    """Build and compile a bidirectional-GRU binary classifier over transaction sequences.

    Each categorical transaction feature gets its own sequence input and embedding layer.
    The embeddings are concatenated, passed through spatial dropout and a bidirectional
    GRU that returns the full output sequence. That sequence is global-average- and
    global-max-pooled, concatenated with the embedding of the single product feature, and
    fed to a dense ReLU layer followed by a one-unit sigmoid output.

    Args:
        transactions_cat_features: iterable of feature names. Every name must be a key of
            ``embedding_projections``; one sequence input is created per name, in order.
        embedding_projections: mapping ``name -> (source_size, projection)``. The embedding
            of a feature has ``source_size + 1`` rows and ``projection`` columns.
        product_col_name: key of the product feature in ``embedding_projections``; it is
            also used in the names of the product input and embedding layers.
        rnn_units: number of units of the GRU in each direction.
        classifier_units: width of the hidden dense layer of the classifier head.
        optimizer: optimizer handed to ``model.compile``. When ``None``, Adam with a
            learning rate of 1e-3 is used.

    Returns:
        A compiled Keras ``Model`` with binary cross-entropy loss. Its inputs are the
        per-feature sequence inputs followed by the product input; its output is a
        single sigmoid value.

    Raises:
        KeyError: if a feature name (or ``product_col_name``) is missing from
            ``embedding_projections``.
    """
    if optimizer is None:
        # `learning_rate` is the supported argument name; `lr` is a deprecated alias.
        optimizer = keras.optimizers.Adam(learning_rate=1e-3)

    inputs = []
    cat_embeds = []

    # One variable-length integer sequence input plus embedding per transaction feature.
    for feature_name in transactions_cat_features:
        inp = L.Input(shape=(None, ), dtype='uint32', name=f'input_{feature_name}')
        inputs.append(inp)
        source_size, projection = embedding_projections[feature_name]
        emb = L.Embedding(source_size+1, projection, trainable=True, mask_zero=False,
                          name=f'embedding_{feature_name}')(inp)
        cat_embeds.append(emb)

    # Product feature: a single value per sample, embedded separately and appended as
    # the last model input.
    inp = L.Input(shape=(1, ), dtype='uint32', name=f'input_{product_col_name}')
    inputs.append(inp)
    source_size, projection = embedding_projections[product_col_name]
    product_emb = L.Embedding(source_size+1, projection, trainable=True, mask_zero=False,
                              name=f'embedding_{product_col_name}')(inp)
    # Drop the length-1 sequence axis so the product vector can join the pooled features.
    product_emb_reshape = L.Reshape((projection, ))(product_emb)

    concated_cat_embeds = L.concatenate(cat_embeds)

    dropout_embeds = L.SpatialDropout1D(0.05)(concated_cat_embeds)

    sequences = L.Bidirectional(L.GRU(units=rnn_units, return_sequences=True))(dropout_embeds)

    # Aggregate all GRU outputs over the time axis in two complementary ways.
    pooled_avg_sequences = L.GlobalAveragePooling1D()(sequences)
    pooled_max_sequences = L.GlobalMaxPooling1D()(sequences)

    # TODO (original author's note): consider adding dropout=0.5 here.
    concated = L.concatenate([pooled_avg_sequences, pooled_max_sequences, product_emb_reshape])

    dense_intermediate = L.Dense(classifier_units, activation='relu',
                                 kernel_regularizer=keras.regularizers.L1L2(1e-7, 1e-5))(concated)

    proba = L.Dense(1, activation='sigmoid')(dense_intermediate)

    model = Model(inputs=inputs, outputs=proba)
    model.compile(loss='binary_crossentropy', optimizer=optimizer)
    return model

### 3. Training

In [9]:
! rm -r {CHECKPOINTS_ADV_PATH}
! mkdir {CHECKPOINTS_ADV_PATH}

* Для того, чтобы детектировать переобучение используем EarlyStopping.

In [9]:
# Early stopping on the validation ROC-AUC (mode='max'); the helper is given the path
# under which it stores the best checkpoint.
# NOTE(review): the exact meaning of `patience` / `save_format` is defined in
# training_aux.EarlyStopping, which is not shown here.
path_to_checkpoints = CHECKPOINTS_ADV_PATH
es = EarlyStopping(
    patience=3,
    mode='max',
    verbose=True,
    save_path=os.path.join(path_to_checkpoints, 'best_checkpoint.pt'),
    metric_name='ROC-AUC',
    save_format='tf',
)

In [10]:
# Training hyper-parameters.
num_epochs = 20
train_batch_size = 128
val_batch_size = 128
# Backward-compatible alias: other cells still refer to the original misspelled name.
val_batch_szie = val_batch_size

In [11]:
# Instantiate the network with a 128-unit classifier head; all other settings keep
# the defaults of build_transactions_rnn.
model = build_transactions_rnn(
    transactions_cat_features=transaction_features,
    embedding_projections=embedding_projections,
    classifier_units=128,
)

In [12]:
# Print the layer-by-layer overview of the model (output shapes and parameter counts).
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_currency (InputLayer)     [(None, None)]       0                                            
__________________________________________________________________________________________________
input_operation_kind (InputLaye [(None, None)]       0                                            
__________________________________________________________________________________________________
input_card_type (InputLayer)    [(None, None)]       0                                            
__________________________________________________________________________________________________
input_operation_type (InputLaye [(None, None)]       0                                            
______________________________________________________________________________________________

* Запустим цикл обучения, каждую эпоху будем логировать лосс, а также roc-auc на валидации и на обучении. Будем сохранять веса после каждой эпохи, а также лучшие с помощью early_stopping.

In [14]:
# Value passed to train_epoch as `steps_per_epoch`; named here instead of being an
# inline magic number so it is easy to find and adjust.
steps_per_epoch = 7270

history_list = []
for epoch in range(num_epochs):
    print(f'Starting epoch {epoch+1}')
    # Train for one epoch and keep whatever train_epoch returns for later inspection.
    history_list.append(train_epoch(model, dataset_train, batch_size=train_batch_size, shuffle=True,
                                    cur_epoch=epoch, steps_per_epoch=steps_per_epoch))

    # Score on the validation buckets and keep a per-epoch snapshot of the weights whose
    # file name records the epoch number and the validation ROC-AUC.
    val_roc_auc = eval_model(model, dataset_val, batch_size=val_batch_szie)
    model.save_weights(os.path.join(path_to_checkpoints, f'epoch_{epoch+1}_val_{val_roc_auc:.3f}.hdf5'))

    # Hand the validation score and the model to the early-stopping helper.
    es(val_roc_auc, model)

    # Train-set ROC-AUC is only reported, to make the train/validation gap visible.
    train_roc_auc = eval_model(model, dataset_train, batch_size=val_batch_szie)
    print(f'Epoch {epoch+1} completed. Train roc-auc: {train_roc_auc}, Val roc-auc: {val_roc_auc}')

    if es.early_stop:
        print('Early stopping reached. Stop training...')
        break

Starting epoch 1
Validation ROC-AUC improved (-inf --> 0.786162).  Saving model ...
Epoch 1 completed. Train roc-auc: 0.785723912049941, Val roc-auc: 0.7861615906419561
Starting epoch 2
Epoch 2/2
Validation ROC-AUC improved (0.786162 --> 0.790268).  Saving model ...
Epoch 2 completed. Train roc-auc: 0.7972057133800797, Val roc-auc: 0.7902679270976908
Starting epoch 3
Epoch 3/3
Validation ROC-AUC improved (0.790268 --> 0.796795).  Saving model ...
Epoch 3 completed. Train roc-auc: 0.8113689883022634, Val roc-auc: 0.7967951177457846
Starting epoch 4
Epoch 4/4
Validation ROC-AUC improved (0.796795 --> 0.798660).  Saving model ...
Epoch 4 completed. Train roc-auc: 0.8216081534972254, Val roc-auc: 0.7986599699508234
Starting epoch 5
Epoch 5/5
Validation ROC-AUC improved (0.798660 --> 0.801650).  Saving model ...
Epoch 5 completed. Train roc-auc: 0.8367869210888826, Val roc-auc: 0.801649878077491
Starting epoch 6
Epoch 6/6
No imporvement in Validation ROC-AUC. Current: 0.794003. Current best

In [15]:
# Show the objects collected from train_epoch, one per completed epoch.
history_list

[<tensorflow.python.keras.callbacks.History at 0x7f5430f2d610>,
 <tensorflow.python.keras.callbacks.History at 0x7f5359176550>,
 <tensorflow.python.keras.callbacks.History at 0x7f544546d760>,
 <tensorflow.python.keras.callbacks.History at 0x7f544517efd0>,
 <tensorflow.python.keras.callbacks.History at 0x7f5445425280>,
 <tensorflow.python.keras.callbacks.History at 0x7f544545bdc0>,
 <tensorflow.python.keras.callbacks.History at 0x7f54451e7160>,
 <tensorflow.python.keras.callbacks.History at 0x7f543d884ee0>]

In [17]:
# Pick the entry recorded for the first epoch.
history = history_list[0]

In [18]:
# Inspect the metrics stored in that entry (here: the training loss).
history.history

{'loss': [0.11559984087944031]}

In [None]:
### 4. Submission

In [13]:
# Load the test-set frame (columns app_id and product) and preview its first rows.
test_frame = pd.read_csv('/media/DATA/AlfaBattle/test_target_contest.csv')
test_frame.head()

Unnamed: 0,app_id,product
0,1063620,0
1,1063621,0
2,1063622,1
3,1063623,1
4,1063624,2


In [16]:
# Build the list of bucket files under PICKLE_VAL_TEST_BUCKET_PATH in sorted order
# and display it as the cell output.
path_to_test_dataset = PICKLE_VAL_TEST_BUCKET_PATH
dir_with_test_datasets = os.listdir(path_to_test_dataset)
dataset_test = [os.path.join(path_to_test_dataset, bucket_name) for bucket_name in sorted(dir_with_test_datasets)]

dataset_test

['/media/DATA/AlfaBattle/val_test_buckets/processed_chunk_000.pkl',
 '/media/DATA/AlfaBattle/val_test_buckets/processed_chunk_001.pkl',
 '/media/DATA/AlfaBattle/val_test_buckets/processed_chunk_002.pkl',
 '/media/DATA/AlfaBattle/val_test_buckets/processed_chunk_003.pkl',
 '/media/DATA/AlfaBattle/val_test_buckets/processed_chunk_004.pkl']

In [17]:
# List the checkpoint files written during training.
path_to_checkpoints = CHECKPOINTS_ADV_PATH
! ls $path_to_checkpoints

best_checkpoint.pt.data-00000-of-00001	epoch_4_val_0.799.hdf5
best_checkpoint.pt.index		epoch_5_val_0.802.hdf5
checkpoint				epoch_6_val_0.794.hdf5
epoch_1_val_0.786.hdf5			epoch_7_val_0.796.hdf5
epoch_2_val_0.790.hdf5			epoch_8_val_0.788.hdf5
epoch_3_val_0.797.hdf5


In [18]:
# Restore the weights of the epoch with the highest validation ROC-AUC in the run above.
# NOTE(review): the file name is specific to that run (epoch 5, val 0.802) and must be
# updated by hand after retraining.
model.load_weights(os.path.join(path_to_checkpoints, 'epoch_5_val_0.802.hdf5'))

In [22]:
# Sanity check: show the first test bucket path.
dataset_test[0]

'/media/DATA/AlfaBattle/val_test_buckets/processed_chunk_000.pkl'

In [19]:
# Score every test bucket with the restored model; the result has app_id and score columns.
test_preds = inference(model, dataset_test, batch_size=128)

In [20]:
# Preview the first predictions.
test_preds.head()

Unnamed: 0,app_id,score
0,1063655,0.020486
1,1063672,0.111355
2,1063694,0.006555
3,1063709,0.059587
4,1063715,0.018048


In [25]:
# Write the submission file without the DataFrame index. `index` is documented as a
# boolean, so pass False explicitly instead of relying on None being falsy.
test_preds.to_csv('rnn_advanced_baseline_submission.csv', index=False)