# COLAB

In [1]:
# Mount Google Drive into the Colab runtime so files stored there can be read by path.
from google.colab import drive
drive.mount('/content/drive')

# Root folder on the mounted Drive; the data paths used later are built from it.
ROOT_DIR = '/content/drive/MyDrive/alpha/data'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install numpy==1.21.6

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/pip/_vendor/pkg_resources/__init__.py", line 3108, in _dep_map
    return self.__dep_map
  File "/usr/local/lib/python3.10/dist-packages/pip/_vendor/pkg_resources/__init__.py", line 2901, in __getattr__
    raise AttributeError(attr)
AttributeError: _DistInfoDistribution__dep_map

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/cli/base_command.py", line 169, in exc_logging_wrapper
    status = run_func(*args)
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/cli/req_command.py", line 242, in wrapper
    return func(self, options, args)
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/commands/install.py", line 441, in run
    conflicts = self._determine_conflicts(to_install)
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/commands/install.py", line 

In [3]:
import numpy as np
np.__version__

'1.21.6'

In [4]:
%load_ext autoreload
%autoreload 2

import os
import random
import pandas as pd
import sys
import pickle
import tqdm
import numpy as np
import torch
import torch.nn as nn
from torchvision.ops import sigmoid_focal_loss

from scipy.stats import gmean, hmean
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold

import warnings
warnings.filterwarnings("ignore")



# !!! change "1" below to the index of a CUDA device that is available to you
#os.environ["CUDA_VISIBLE_DEVICES"] = "1"
#pd.set_option("display.max_columns", None)

# добавим родительскую директорию, в ней лежат все необходимые полезные функции для обработки данных
sys.path.append("../")


In [5]:
def seed_everything(seed: int):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU and all CUDA
    devices), exports ``PYTHONHASHSEED`` and puts cuDNN into deterministic mode.

    Args:
        seed: Value used to seed all generators.
    """
    import random, os
    import numpy as np
    import torch

    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not only the current one (a no-op when CUDA is unavailable).
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode auto-tunes the convolution algorithm per run, which may pick a
    # different kernel each time; it has to be off for repeatable results.
    torch.backends.cudnn.benchmark = False

# Seed the random number generators once, before any data processing or model code runs.
SEED = 1171
seed_everything(SEED)

In [6]:
# Folders with the raw train / test data, both located under ROOT_DIR.
TRAIN_DATA_PATH, TEST_DATA_PATH = (
    os.path.join(ROOT_DIR, folder) for folder in ("train_data", "test_data")
)

# CSV file with the train target, also under ROOT_DIR.
TRAIN_TARGET_PATH = os.path.join(ROOT_DIR, "train_target.csv")

In [7]:
# Load the train target table (it has `id` and `flag` columns) and display it.
train_target = pd.read_csv(TRAIN_TARGET_PATH)
train_target

Unnamed: 0,id,flag
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0
...,...,...
2999995,2999995,0
2999996,2999996,0
2999997,2999997,0
2999998,2999998,0


In [8]:
from utils import read_parquet_dataset_from_local
from dataset_preprocessing_utils_with_mask import features, transform_credits_to_sequences, create_padded_buckets

В дальнейшем при построении рекуррентной нейронной сети нам понадобятся следующие статистики по тренировочной и тестовой выборкам: распределение длин кредитных историй и множество уникальных значений каждого категориального признака. Посчитаем эти статистики:

In [9]:
from collections import defaultdict


def _collect_credit_stats(path_to_dataset, num_parts_total, uniques, desc, num_parts_at_once=2):
    """Scan a partitioned credits dataset and gather length / value statistics.

    Args:
        path_to_dataset: Directory passed on to ``read_parquet_dataset_from_local``.
        num_parts_total: Number of partitions to scan.
        uniques: Mapping ``feature -> set``; updated in place with every value seen.
        desc: Caption of the progress bar.
        num_parts_at_once: Number of partitions loaded per step.

    Returns:
        1-D NumPy array holding, for every loaded step, the maximum ``rn`` of each ``id``.
    """
    # `import tqdm` alone does not load the `notebook` submodule, so import it explicitly.
    import tqdm.notebook

    lens_per_step = []
    for step in tqdm.notebook.tqdm(range(0, num_parts_total, num_parts_at_once), desc=desc):
        credits_frame = read_parquet_dataset_from_local(path_to_dataset, step, num_parts_at_once, verbose=True)
        # The largest `rn` of an id is taken as the length of its credit history.
        lens_per_step.append(credits_frame.groupby("id").agg(seq_len=("rn", "max"))["seq_len"].values)

        # `id` and `rn` are excluded: only the remaining columns are treated as features.
        for feat in credits_frame.columns.drop(["id", "rn"]):
            uniques[feat].update(credits_frame[feat].unique())
    # One concatenation of per-step arrays instead of stacking millions of scalars.
    return np.concatenate(lens_per_step)


uniques = defaultdict(set)
train_lens = _collect_credit_stats(TRAIN_DATA_PATH, 12, uniques, desc="Count statistics on train data")
test_lens = _collect_credit_stats(TEST_DATA_PATH, 2, uniques, desc="Count statistics on test data")
# Freeze into a plain dict so that a mistyped feature name raises KeyError instead of creating a key.
uniques = dict(uniques)

Count statistics on train data:   0%|          | 0/6 [00:00<?, ?it/s]

Reading chunks:
/content/drive/MyDrive/alpha/data/train_data/train_data_0.pq
/content/drive/MyDrive/alpha/data/train_data/train_data_1.pq


Reading dataset with pandas:   0%|          | 0/2 [00:00<?, ?it/s]

Reading chunks:
/content/drive/MyDrive/alpha/data/train_data/train_data_2.pq
/content/drive/MyDrive/alpha/data/train_data/train_data_3.pq


Reading dataset with pandas:   0%|          | 0/2 [00:00<?, ?it/s]

Reading chunks:
/content/drive/MyDrive/alpha/data/train_data/train_data_4.pq
/content/drive/MyDrive/alpha/data/train_data/train_data_5.pq


Reading dataset with pandas:   0%|          | 0/2 [00:00<?, ?it/s]

Reading chunks:
/content/drive/MyDrive/alpha/data/train_data/train_data_6.pq
/content/drive/MyDrive/alpha/data/train_data/train_data_7.pq


Reading dataset with pandas:   0%|          | 0/2 [00:00<?, ?it/s]

Reading chunks:
/content/drive/MyDrive/alpha/data/train_data/train_data_8.pq
/content/drive/MyDrive/alpha/data/train_data/train_data_9.pq


Reading dataset with pandas:   0%|          | 0/2 [00:00<?, ?it/s]

Reading chunks:
/content/drive/MyDrive/alpha/data/train_data/train_data_10.pq
/content/drive/MyDrive/alpha/data/train_data/train_data_11.pq


Reading dataset with pandas:   0%|          | 0/2 [00:00<?, ?it/s]

Count statistics on test data:   0%|          | 0/1 [00:00<?, ?it/s]

Reading chunks:
/content/drive/MyDrive/alpha/data/test_data/test_data_0.pq
/content/drive/MyDrive/alpha/data/test_data/test_data_1.pq


Reading dataset with pandas:   0%|          | 0/2 [00:00<?, ?it/s]

In [11]:
from collections import Counter


# Histograms of sequence lengths: index = length, value = how many times that length occurs.
train_len_counter = pd.Series(Counter(train_lens)).sort_index()
test_len_counter = pd.Series(Counter(test_lens)).sort_index()

Один из аргументов функции `dataset_preprocessing_utils_with_mask.create_padded_buckets` &ndash; `bucket_info` &ndash; словарь, где для конкретной длины последовательности указано, до какой длины нужно делать паддинг. Для данного бейзлайна возьмем простое разбиение на 43 бакета:
| Длина последовательности | Длина после паддинга |
| :-: | :-: |
| 1 &ndash; 40 | без изменений |
| 41 &ndash; 45 | 45 |
| 46 &ndash; 50 | 50 |
| 51 &ndash; 58 | 58 |

In [12]:
# Map every raw sequence length (1..58) to the length it gets padded to.
# Lengths up to 40 keep a bucket of their own; longer ones share the buckets 45, 50 and 58.
keys_ = [*range(1, 59)]
lens_ = [*range(1, 41)] + [
    padded_len
    for padded_len, bucket_width in ((45, 5), (50, 5), (58, 8))
    for _ in range(bucket_width)
]
bucket_info = {seq_len: padded_len for seq_len, padded_len in zip(keys_, lens_)}
bucket_info

{1: 1,
 2: 2,
 3: 3,
 4: 4,
 5: 5,
 6: 6,
 7: 7,
 8: 8,
 9: 9,
 10: 10,
 11: 11,
 12: 12,
 13: 13,
 14: 14,
 15: 15,
 16: 16,
 17: 17,
 18: 18,
 19: 19,
 20: 20,
 21: 21,
 22: 22,
 23: 23,
 24: 24,
 25: 25,
 26: 26,
 27: 27,
 28: 28,
 29: 29,
 30: 30,
 31: 31,
 32: 32,
 33: 33,
 34: 34,
 35: 35,
 36: 36,
 37: 37,
 38: 38,
 39: 39,
 40: 40,
 41: 45,
 42: 45,
 43: 45,
 44: 45,
 45: 45,
 46: 50,
 47: 50,
 48: 50,
 49: 50,
 50: 50,
 51: 58,
 52: 58,
 53: 58,
 54: 58,
 55: 58,
 56: 58,
 57: 58,
 58: 58}

Также рассмотрим уникальные значения признаков

In [13]:
# Print the set of observed values of every feature, one feature per line.
for feature_name in uniques:
    print(f"Feature: {feature_name}, unique values: {uniques[feature_name]}")

Feature: pre_since_opened, unique values: {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19}
Feature: pre_since_confirmed, unique values: {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}
Feature: pre_pterm, unique values: {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}
Feature: pre_fterm, unique values: {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
Feature: pre_till_pclose, unique values: {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
Feature: pre_till_fclose, unique values: {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}
Feature: pre_loans_credit_limit, unique values: {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19}
Feature: pre_loans_next_pay_summ, unique values: {0, 1, 2, 3, 4, 5, 6, 7}
Feature: pre_loans_outstanding, unique values: {1, 2, 3, 4, 5}
Feature: pre_loans_total_overdue, unique values: {0, 1}
Feature: pre_loans_max_overdue_sum, unique values: {0, 1, 2, 3}
Feature: pre_lo

Поскольку паддинг будет производиться нулями, а категориальные признаки закодированы, начиная с 0, перед паддингом будем сдвигать все значения на 1.

Вся предобработка данных реализована в виде функции `create_buckets_from_credits`:

In [14]:
def create_buckets_from_credits(path_to_dataset, bucket_info, save_to_path, frame_with_ids = None,
                                num_parts_to_preprocess_at_once: int = 1,
                                num_parts_total=50, has_target=False, shuffle_rn: bool = False, num_last_credits: int = 0):
    """Preprocess a partitioned credits dataset into padded, bucketed sequence files.

    The dataset is read ``num_parts_to_preprocess_at_once`` partitions at a time. Every
    portion is converted to sequences and handed to ``create_padded_buckets``, which is
    asked to save it as ``<save_to_path>/processed_chunk_NNN.pkl`` (NNN = 000, 001, ...).

    Args:
        path_to_dataset: Directory passed on to ``read_parquet_dataset_from_local``.
        bucket_info: Mapping ``sequence length -> padded length``.
        save_to_path: Output directory; created when it does not exist yet.
        frame_with_ids: Optional frame with an ``id`` column. When given, the sequences
            are merged with it on ``id`` (pandas' default inner join), so only ids
            present in the frame are kept.
        num_parts_to_preprocess_at_once: Number of partitions loaded per step.
        num_parts_total: Total number of partitions to process.
        has_target: Forwarded to ``create_padded_buckets``.
        shuffle_rn: Forwarded to ``transform_credits_to_sequences``.
        num_last_credits: Forwarded to ``transform_credits_to_sequences``.
    """
    # `import tqdm` alone does not load the `notebook` submodule, so import it explicitly.
    import tqdm.notebook

    if save_to_path:
        os.makedirs(save_to_path, exist_ok=True)

    steps = range(0, num_parts_total, num_parts_to_preprocess_at_once)
    for block, step in enumerate(tqdm.notebook.tqdm(steps, desc="Preparing credit data")):
        credits_frame = read_parquet_dataset_from_local(path_to_dataset, step, num_parts_to_preprocess_at_once, verbose=True)
        # Shift all feature codes by one so that 0 stays free for padding.
        credits_frame.loc[:, features] += 1
        seq = transform_credits_to_sequences(credits_frame, num_last_credits=num_last_credits, shuffle_rn=shuffle_rn)
        print("Transforming credits to sequences is done.")

        if frame_with_ids is not None:
            seq = seq.merge(frame_with_ids, on="id")

        # Zero-pad to a fixed width of three so that sorted file names follow processing
        # order (the manual padding produced "0100" for block 100, which sorts before "099").
        chunk_name = f"processed_chunk_{block:03d}.pkl"
        create_padded_buckets(seq, bucket_info=bucket_info, has_target=has_target,
                              save_to_file_path=os.path.join(save_to_path, chunk_name))

### K-FOLDS

Разобьем данные на 5 фолдов для обучения моделей и получения Out-of-Fold (OOF) прогнозов. Для нейросетевых методов будем использовать два варианта разбиения на фолды. В первом варианте порядок строк (исторических данных по кредитам клиента) остается неизменным. Во втором случае происходит случайное перемешивание строк.

**Первый вариант разбиения:**

In [15]:
from sklearn.model_selection import StratifiedKFold
# Five shuffled, stratified folds; the fixed random_state keeps the split repeatable.
cv = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

In [16]:
# split() is lazy: evaluating it here only displays the generator object; no fold is computed yet.
cv.split(train_target, train_target['flag'])

<generator object _BaseKFold.split at 0x798e269829d0>

In [1]:
# Folds whose buckets are (re)built by this cell. The training cell reads folds 1-5,
# so all of them are prepared by default; narrow this (e.g. to {5}) to rebuild one fold.
FOLDS_TO_PREPARE = range(1, 6)

for fold_, (train_idx, val_idx) in enumerate(cv.split(train_target, train_target['flag']), 1):
    if fold_ not in FOLDS_TO_PREPARE:
        continue
    print(fold_)

    # NOTE: the path strings are kept exactly as they were because the training cell
    # rebuilds the very same strings to locate these folders.
    TRAIN_BUCKETS_PATH = f"data_k_folds\\fold_{fold_}\\train_buckets_rnn"
    VAL_BUCKETS_PATH = f"data_k_folds\\fold_{fold_}\\val_buckets_rnn"

    # Idempotent replacement for the shell `mkdir` calls: re-running the cell is safe.
    os.makedirs(TRAIN_BUCKETS_PATH, exist_ok=True)
    os.makedirs(VAL_BUCKETS_PATH, exist_ok=True)

    # Rows of the target table that belong to the train / validation part of this fold.
    train = train_target.iloc[train_idx]
    val = train_target.iloc[val_idx]

    create_buckets_from_credits(TRAIN_DATA_PATH,
                                bucket_info=bucket_info,
                                save_to_path=TRAIN_BUCKETS_PATH,
                                frame_with_ids=train,
                                num_parts_to_preprocess_at_once=2,
                                num_parts_total=12, has_target=True)

    create_buckets_from_credits(TRAIN_DATA_PATH,
                                bucket_info=bucket_info,
                                save_to_path=VAL_BUCKETS_PATH,
                                frame_with_ids=val,
                                num_parts_to_preprocess_at_once=2,
                                num_parts_total=12, has_target=True)

NameError: name 'cv' is not defined

### Тестовая выборка:

In [None]:
# Folder (relative to the working directory) that holds the preprocessed test buckets.
TEST_BUCKETS_PATH = "test_buckets_rnn"

In [None]:
%%time
тестовая выборка
create_buckets_from_credits(TEST_DATA_PATH,
                            bucket_info=bucket_info,
                            save_to_path=TEST_BUCKETS_PATH, num_parts_to_preprocess_at_once=2,
                            num_parts_total=2)

dataset_test = sorted([os.path.join(TEST_BUCKETS_PATH, x) for x in os.listdir(TEST_BUCKETS_PATH)])
dataset_test

Wall time: 0 ns


['G:\\Alfa_Bank_competition\\test_buckets_rnn\\processed_chunk_000.pkl']

### Обучение нейросетевых моделей

In [None]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print('Using device:', device)

Using device: cuda


Для создания модели будем использовать фреймворк `torch`.

Для того, чтобы можно было обучать нейронную сеть и отслеживать ее качество, предоставлены следующие функции:
 - функция `data_generators_with_mask.batches_generator` &ndash; функция-генератор, итеративно возвращает батчи, поддерживает батчи для `torch.nn.Module` моделей. В зависимости от флага `is_train` может быть использована для генерации батчей на train/val/test стадии.
 - функция `train_models.train_epoch` &ndash; обучает модель одну эпоху.
 - функция `train_models.eval_model` &ndash; проверяет качество модели на отложенной выборке и возвращает roc_auc_score.
 - функция `train_models.inference` &ndash; делает предикты на новых данных и готовит фрейм для проверяющей системы.
 - класс `training_aux.EarlyStopping` &ndash; реализует early_stopping, сохраняя лучшую модель. Пример использования приведен ниже.

In [None]:
from data_generators_with_mask import batches_generator
from train_models import train, train_epoch, eval_model, inference
from training_aux import EarlyStopping

Все признаки, описывающие кредитную историю клиентов &ndash; категориальные. Для их представления в модели используем категориальные эмбеддинги. Для этого нужно каждому категориальному признаку задать размерность латентного пространства. Используем [формулу](https://forums.fast.ai/t/size-of-embedding-for-categorical-variables/42608) из библиотеки `fast.ai`.

In [None]:
def compute_embed_dim(n_cat: int) -> int:
    """Suggest an embedding width for a categorical feature with ``n_cat`` levels.

    Applies the rule of thumb ``round(1.6 * n_cat ** 0.56)`` and caps the result at 600.
    """
    suggested = round(1.6 * n_cat ** 0.56)
    if suggested > 600:
        return 600
    return suggested

In [None]:
# For every feature: (cardinality, embedding width), where cardinality = largest observed code + 1.
# NOTE(review): the codes are shifted by one before padding — confirm that the model
# reserves the extra embedding row for that shift.
embedding_projections = {}
for feat, uniq in uniques.items():
    cardinality = max(uniq) + 1
    embedding_projections[feat] = (cardinality, compute_embed_dim(cardinality))
embedding_projections

{'pre_since_opened': (20, 9),
 'pre_since_confirmed': (18, 8),
 'pre_pterm': (18, 8),
 'pre_fterm': (17, 8),
 'pre_till_pclose': (17, 8),
 'pre_till_fclose': (16, 8),
 'pre_loans_credit_limit': (20, 9),
 'pre_loans_next_pay_summ': (8, 5),
 'pre_loans_outstanding': (6, 4),
 'pre_loans_total_overdue': (2, 2),
 'pre_loans_max_overdue_sum': (4, 3),
 'pre_loans_credit_cost_rate': (14, 7),
 'pre_loans5': (18, 8),
 'pre_loans530': (20, 9),
 'pre_loans3060': (10, 6),
 'pre_loans6090': (6, 4),
 'pre_loans90': (20, 9),
 'is_zero_loans5': (2, 2),
 'is_zero_loans530': (2, 2),
 'is_zero_loans3060': (2, 2),
 'is_zero_loans6090': (2, 2),
 'is_zero_loans90': (2, 2),
 'pre_util': (20, 9),
 'pre_over2limit': (20, 9),
 'pre_maxover2limit': (20, 9),
 'is_zero_util': (2, 2),
 'is_zero_over2limit': (2, 2),
 'is_zero_maxover2limit': (2, 2),
 'enc_paym_0': (4, 3),
 'enc_paym_1': (4, 3),
 'enc_paym_2': (4, 3),
 'enc_paym_3': (4, 3),
 'enc_paym_4': (4, 3),
 'enc_paym_5': (4, 3),
 'enc_paym_6': (4, 3),
 'enc_pay

**BIDIRECTIONAL RNN + LAST Hidden**

In [None]:
# Folder for the per-fold model checkpoints: passed to train() and read back at inference time.
path_to_checkpoints = "model_RNN_LAST_HIDDEN_FOLDS"

In [None]:
from models import CreditsRNN_LAST_HIDDEN

In [None]:
# Train one model per fold with identical hyper-parameters.
loss_function = sigmoid_focal_loss
# Passed to train() right after the loss function; presumably the focal-loss alpha — confirm in train_models.
alpha = 0.9

num_epochs = 5
train_batch_size = 64
val_batch_size = 64

for fold_ in range(1, 6):
    print(f"Training fold {fold_}:")
    # Same path strings as the ones the fold-preparation cell writes its buckets to.
    TRAIN_BUCKETS_PATH = f"data_k_folds\\fold_{fold_}\\train_buckets_rnn"
    VAL_BUCKETS_PATH = f"data_k_folds\\fold_{fold_}\\val_buckets_rnn"

    # Sorted so that the order of the bucket files does not depend on os.listdir.
    dataset_train = sorted([os.path.join(TRAIN_BUCKETS_PATH, x) for x in os.listdir(TRAIN_BUCKETS_PATH)])
    dataset_val = sorted([os.path.join(VAL_BUCKETS_PATH, x) for x in os.listdir(VAL_BUCKETS_PATH)])

    best_model_name = f"best_checkpoint_rnn_LAST_HIDDEN_fold_{fold_}.pt"

    # A fresh model, optimizer and scheduler per fold: nothing is carried over between folds.
    model_RNN_LAST_HIDDEN = CreditsRNN_LAST_HIDDEN(features,
                                               embedding_projections,
                                               rnn_units=128,
                                               top_classifier_units=32,
                                               dropout=0,
                                               spatial_dropout=0).to(device)

    optimizer = torch.optim.AdamW(lr=5e-4, params=model_RNN_LAST_HIDDEN.parameters())

    # mode='max': the monitored value is expected to grow (presumably validation ROC-AUC — confirm in train()).
    # Each reduction multiplies the learning rate by 1/sqrt(10), never going below min_lr.
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer, mode='max', factor=1/np.sqrt(10),
            patience=1,
            verbose=True, threshold=0.01,
            threshold_mode='abs', cooldown=0,
            min_lr=1e-6, eps=1e-08)

    # `lenght` is the keyword exactly as spelled in this call; its meaning is defined in train_models.
    train(model_RNN_LAST_HIDDEN, num_epochs, optimizer, loss_function, alpha, dataset_train, dataset_val,
          path_to_checkpoints, best_model_name, scheduler, train_batch_size, val_batch_size,
          shuffle=True, print_loss_every_n_batches=500, device=device, lenght=True)


Training fold 1:
Starting epoch 1


Training: 0it [00:00, ?it/s]

Training loss after epoch: 0.015751436352729797

Evaluating model: 0it [00:00, ?it/s]

Validation ROC-AUC improved (-inf --> 0.773711).  Saving model...


Evaluating model: 0it [00:00, ?it/s]

Epoch 1 completed. Train ROC AUC: 0.7770036490243711, val ROC AUC: 0.7737108338943375
Starting epoch 2


Training: 0it [00:00, ?it/s]

Training loss after epoch: 0.015347626991569996

Evaluating model: 0it [00:00, ?it/s]

Validation ROC-AUC improved (0.773711 --> 0.780416).  Saving model...


Evaluating model: 0it [00:00, ?it/s]

Epoch 2 completed. Train ROC AUC: 0.7860739515441013, val ROC AUC: 0.7804156591989901
Starting epoch 3


Training: 0it [00:00, ?it/s]

Training loss after epoch: 0.015172006562352186

Evaluating model: 0it [00:00, ?it/s]

No imporvement in validation ROC-AUC. Current: 0.776783. Current best: 0.780416
EarlyStopping counter: 1 out of 3
Epoch 00003: reducing learning rate of group 0 to 1.5811e-04.


Evaluating model: 0it [00:00, ?it/s]

Epoch 3 completed. Train ROC AUC: 0.7842383174311967, val ROC AUC: 0.776783199093555
Starting epoch 4


Training: 0it [00:00, ?it/s]

Training loss after epoch: 0.014849732629954815

Evaluating model: 0it [00:00, ?it/s]

No imporvement in validation ROC-AUC. Current: 0.779983. Current best: 0.780416
EarlyStopping counter: 2 out of 3


Evaluating model: 0it [00:00, ?it/s]

Epoch 4 completed. Train ROC AUC: 0.8001122132571945, val ROC AUC: 0.7799831127776533
Starting epoch 5


Training: 0it [00:00, ?it/s]

Training loss after epoch: 0.014679970219731331

Evaluating model: 0it [00:00, ?it/s]

No imporvement in validation ROC-AUC. Current: 0.779353. Current best: 0.780416
EarlyStopping counter: 3 out of 3
Early stopping reached. Stop training...
Training fold 2:
Starting epoch 1


Training: 0it [00:00, ?it/s]

Training loss after epoch: 0.015747772529721263

Evaluating model: 0it [00:00, ?it/s]

Validation ROC-AUC improved (-inf --> 0.773056).  Saving model...


Evaluating model: 0it [00:00, ?it/s]

Epoch 1 completed. Train ROC AUC: 0.7757133823329072, val ROC AUC: 0.7730557238091074
Starting epoch 2


Training: 0it [00:00, ?it/s]

Training loss after epoch: 0.015334466472268105

Evaluating model: 0it [00:00, ?it/s]

Validation ROC-AUC improved (0.773056 --> 0.778673).  Saving model...


Evaluating model: 0it [00:00, ?it/s]

Epoch 2 completed. Train ROC AUC: 0.7840023528009195, val ROC AUC: 0.77867254949599
Starting epoch 3


Training: 0it [00:00, ?it/s]

Training loss after epoch: 0.015159752219915398

Evaluating model: 0it [00:00, ?it/s]

No imporvement in validation ROC-AUC. Current: 0.775384. Current best: 0.778673
EarlyStopping counter: 1 out of 3
Epoch 00003: reducing learning rate of group 0 to 1.5811e-04.


Evaluating model: 0it [00:00, ?it/s]

Epoch 3 completed. Train ROC AUC: 0.784784037026769, val ROC AUC: 0.775384378348853
Starting epoch 4


Training: 0it [00:00, ?it/s]

Training loss after epoch: 0.014805042184889317

Evaluating model: 0it [00:00, ?it/s]

Validation ROC-AUC improved (0.778673 --> 0.779463).  Saving model...


Evaluating model: 0it [00:00, ?it/s]

Epoch 4 completed. Train ROC AUC: 0.8021661875012662, val ROC AUC: 0.7794633904200502
Starting epoch 5


Training: 0it [00:00, ?it/s]

Training loss after epoch: 0.014605188742280006

Evaluating model: 0it [00:00, ?it/s]

Validation ROC-AUC improved (0.779463 --> 0.780918).  Saving model...
Epoch 00005: reducing learning rate of group 0 to 5.0000e-05.


Evaluating model: 0it [00:00, ?it/s]

Epoch 5 completed. Train ROC AUC: 0.8096950465625067, val ROC AUC: 0.78091835514003
Training fold 3:
Starting epoch 1


Training: 0it [00:00, ?it/s]

Training loss after epoch: 0.015741558745503426

Evaluating model: 0it [00:00, ?it/s]

Validation ROC-AUC improved (-inf --> 0.769151).  Saving model...


Evaluating model: 0it [00:00, ?it/s]

Epoch 1 completed. Train ROC AUC: 0.7765645706018005, val ROC AUC: 0.7691507376669753
Starting epoch 2


Training: 0it [00:00, ?it/s]

Training loss after epoch: 0.015327190980315208

Evaluating model: 0it [00:00, ?it/s]

Validation ROC-AUC improved (0.769151 --> 0.772390).  Saving model...


Evaluating model: 0it [00:00, ?it/s]

Epoch 2 completed. Train ROC AUC: 0.7833744014613515, val ROC AUC: 0.7723904640610401
Starting epoch 3


Training: 0it [00:00, ?it/s]

Training loss after epoch: 0.015154223889112473

Evaluating model: 0it [00:00, ?it/s]

Validation ROC-AUC improved (0.772390 --> 0.773862).  Saving model...
Epoch 00003: reducing learning rate of group 0 to 1.5811e-04.


Evaluating model: 0it [00:00, ?it/s]

Epoch 3 completed. Train ROC AUC: 0.7889804776563297, val ROC AUC: 0.7738616665369613
Starting epoch 4


Training: 0it [00:00, ?it/s]

Training loss after epoch: 0.014826933853328228

Evaluating model: 0it [00:00, ?it/s]

Validation ROC-AUC improved (0.773862 --> 0.776063).  Saving model...


Evaluating model: 0it [00:00, ?it/s]

Epoch 4 completed. Train ROC AUC: 0.8011950161996071, val ROC AUC: 0.7760625812304207
Starting epoch 5


Training: 0it [00:00, ?it/s]

Training loss after epoch: 0.014658680185675621

Evaluating model: 0it [00:00, ?it/s]

No imporvement in validation ROC-AUC. Current: 0.775405. Current best: 0.776063
EarlyStopping counter: 1 out of 3
Epoch 00005: reducing learning rate of group 0 to 5.0000e-05.


Evaluating model: 0it [00:00, ?it/s]

Epoch 5 completed. Train ROC AUC: 0.8063034575981931, val ROC AUC: 0.7754054613431123
Training fold 4:
Starting epoch 1


Training: 0it [00:00, ?it/s]

Training loss after epoch: 0.015745621174573948

Evaluating model: 0it [00:00, ?it/s]

Validation ROC-AUC improved (-inf --> 0.770635).  Saving model...


Evaluating model: 0it [00:00, ?it/s]

Epoch 1 completed. Train ROC AUC: 0.7768961248077156, val ROC AUC: 0.7706354720171745
Starting epoch 2


Training: 0it [00:00, ?it/s]

Training loss after epoch: 0.015323743224143982

Evaluating model: 0it [00:00, ?it/s]

Validation ROC-AUC improved (0.770635 --> 0.775791).  Saving model...


Evaluating model: 0it [00:00, ?it/s]

Epoch 2 completed. Train ROC AUC: 0.7858087533499766, val ROC AUC: 0.7757907111728621
Starting epoch 3


Training: 0it [00:00, ?it/s]

Training loss after epoch: 0.015147854574024677

Evaluating model: 0it [00:00, ?it/s]

No imporvement in validation ROC-AUC. Current: 0.774816. Current best: 0.775791
EarlyStopping counter: 1 out of 3
Epoch 00003: reducing learning rate of group 0 to 1.5811e-04.


Evaluating model: 0it [00:00, ?it/s]

Epoch 3 completed. Train ROC AUC: 0.7869408241250896, val ROC AUC: 0.7748161399423831
Starting epoch 4


Training: 0it [00:00, ?it/s]

Training loss after epoch: 0.014820588752627373

Evaluating model: 0it [00:00, ?it/s]

Validation ROC-AUC improved (0.775791 --> 0.779045).  Saving model...


Evaluating model: 0it [00:00, ?it/s]

Epoch 4 completed. Train ROC AUC: 0.8020288882470499, val ROC AUC: 0.7790451910546548
Starting epoch 5


Training: 0it [00:00, ?it/s]

Training loss after epoch: 0.014644358307123184

Evaluating model: 0it [00:00, ?it/s]

No imporvement in validation ROC-AUC. Current: 0.777173. Current best: 0.779045
EarlyStopping counter: 1 out of 3
Epoch 00005: reducing learning rate of group 0 to 5.0000e-05.


Evaluating model: 0it [00:00, ?it/s]

Epoch 5 completed. Train ROC AUC: 0.8081114452754024, val ROC AUC: 0.7771734777479598
Training fold 5:
Starting epoch 1


Training: 0it [00:00, ?it/s]

Training loss after epoch: 0.015760296955704697

Evaluating model: 0it [00:00, ?it/s]

Validation ROC-AUC improved (-inf --> 0.772522).  Saving model...


Evaluating model: 0it [00:00, ?it/s]

Epoch 1 completed. Train ROC AUC: 0.7770552906313137, val ROC AUC: 0.7725222380278798
Starting epoch 2


Training: 0it [00:00, ?it/s]

Training loss after epoch: 0.015336341224610806

Evaluating model: 0it [00:00, ?it/s]

Validation ROC-AUC improved (0.772522 --> 0.777088).  Saving model...


Evaluating model: 0it [00:00, ?it/s]

Epoch 2 completed. Train ROC AUC: 0.7854092778408805, val ROC AUC: 0.7770882929430654
Starting epoch 3


Training: 0it [00:00, ?it/s]

Training loss after epoch: 0.015176887623965742

Evaluating model: 0it [00:00, ?it/s]

No imporvement in validation ROC-AUC. Current: 0.776888. Current best: 0.777088
EarlyStopping counter: 1 out of 3
Epoch 00003: reducing learning rate of group 0 to 1.5811e-04.


Evaluating model: 0it [00:00, ?it/s]

Epoch 3 completed. Train ROC AUC: 0.7874179667112435, val ROC AUC: 0.7768878253926795
Starting epoch 4


Training: 0it [00:00, ?it/s]

Training loss after epoch: 0.014845321886241436

Evaluating model: 0it [00:00, ?it/s]

Validation ROC-AUC improved (0.777088 --> 0.780638).  Saving model...


Evaluating model: 0it [00:00, ?it/s]

Epoch 4 completed. Train ROC AUC: 0.8018348266048133, val ROC AUC: 0.7806383196390706
Starting epoch 5


Training: 0it [00:00, ?it/s]

Training loss after epoch: 0.014673798345029354

Evaluating model: 0it [00:00, ?it/s]

Validation ROC-AUC improved (0.780638 --> 0.781359).  Saving model...
Epoch 00005: reducing learning rate of group 0 to 5.0000e-05.


Evaluating model: 0it [00:00, ?it/s]

Epoch 5 completed. Train ROC AUC: 0.8095920036894951, val ROC AUC: 0.7813588927634885


Предсказания для тестовых данных:

In [None]:
# Score the test set with the best checkpoint of every fold and collect the
# per-fold scores side by side (columns score_fold_1 ... score_fold_5), keyed by `id`.
model_RNN_LAST_HIDDEN = CreditsRNN_LAST_HIDDEN(features,
                                               embedding_projections,
                                               rnn_units=128,
                                               top_classifier_units=32,
                                               dropout=0,
                                               spatial_dropout=0).to(device)

test_preds_RNN_LAST_HIDDEN_TOTAL = None
for fold_ in range(1, 6):
    checkpoint_path = os.path.join(path_to_checkpoints, f'best_checkpoint_rnn_LAST_HIDDEN_fold_{fold_}.pt')
    # map_location lets a checkpoint saved on a GPU be restored on a CPU-only runtime as well.
    model_RNN_LAST_HIDDEN.load_state_dict(torch.load(checkpoint_path, map_location=device))

    test_preds_RNN_LAST_HIDDEN = inference(model_RNN_LAST_HIDDEN, dataset_test, batch_size=512, device=device, lenght=True)

    # rename() returns a new frame, so the raw predictions of the fold stay untouched.
    fold_scores = test_preds_RNN_LAST_HIDDEN.rename(columns={'score': f'score_fold_{fold_}'})
    if test_preds_RNN_LAST_HIDDEN_TOTAL is None:
        test_preds_RNN_LAST_HIDDEN_TOTAL = fold_scores
    else:
        test_preds_RNN_LAST_HIDDEN_TOTAL = test_preds_RNN_LAST_HIDDEN_TOTAL.merge(fold_scores, on='id')

Test predictions: 0it [00:00, ?it/s]

Test predictions: 0it [00:00, ?it/s]

Test predictions: 0it [00:00, ?it/s]

Test predictions: 0it [00:00, ?it/s]

Test predictions: 0it [00:00, ?it/s]

In [None]:
# Final score = plain arithmetic mean of the five per-fold scores.
first_fold_column, *other_fold_columns = [f'score_fold_{fold_number}' for fold_number in range(1, 6)]
test_preds_RNN_LAST_HIDDEN_TOTAL['score'] = sum(
    (test_preds_RNN_LAST_HIDDEN_TOTAL[column] for column in other_fold_columns),
    test_preds_RNN_LAST_HIDDEN_TOTAL[first_fold_column],
) / 5


test_preds_RNN_LAST_HIDDEN_TOTAL

Unnamed: 0,id,score_fold_1,score_fold_2,score_fold_3,score_fold_4,score_fold_5,score
0,3047012,0.002171,7.252386e-12,0.000021,6.111582e-09,1.221641e-16,0.000438
1,3000786,0.002288,3.075226e-12,0.000112,2.206464e-08,2.846257e-16,0.000480
2,3019111,0.037445,1.411738e-04,0.025352,2.313488e-03,1.011724e-02,0.015074
3,3345810,0.154487,1.758348e-07,0.015331,1.157137e-04,1.018275e-05,0.033989
4,3434512,0.144362,2.115846e-11,0.002245,7.086300e-05,9.491834e-06,0.029337
...,...,...,...,...,...,...,...
499995,3464951,0.417768,4.333285e-01,0.447533,4.291284e-01,4.227760e-01,0.430107
499996,3028533,0.353061,3.774460e-01,0.390526,3.735240e-01,3.987354e-01,0.378658
499997,3464956,0.423485,4.115852e-01,0.406487,3.954434e-01,4.739827e-01,0.422197
499998,3366101,0.345639,3.429430e-01,0.333768,3.110878e-01,3.826981e-01,0.343227


In [None]:
# Write the submission file: `id` plus the fold-averaged `score`, without the frame index.
submission_columns = ['id', 'score']
test_preds_RNN_LAST_HIDDEN_TOTAL[submission_columns].to_csv("submission_RNN_LAST_HIDDEN_TOTAL_5_folds.csv", index=False)