<a href="https://colab.research.google.com/github/Dancingtree95/AlfaBattle2.0/blob/master/AlfaTransactions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:
import torch

In [15]:
torch.cuda.is_available()

False

In [1]:
%%capture
!wget https://storage.yandexcloud.net/ds-ods/files/materials/6e991b7f/train_transactions_contest.zip
!wget https://storage.yandexcloud.net/ds-ods/files/materials/fc0f1aa3/test_transactions_contest.zip
!wget https://storage.yandexcloud.net/ds-ods/files/materials/3634ff95/train_target.csv
!wget https://storage.yandexcloud.net/ds-ods/files/materials/b2216108/test_target_contest.csv

In [2]:
%%capture
!unzip train_transactions_contest.zip
!rm train_transactions_contest.zip

In [3]:
!git clone https://github.com/smirnovevgeny/AlfaBattle2.0.git

Cloning into 'AlfaBattle2.0'...
remote: Enumerating objects: 57, done.[K
remote: Counting objects: 100% (57/57), done.[K
remote: Compressing objects: 100% (48/48), done.[K
remote: Total 57 (delta 22), reused 35 (delta 8), pack-reused 0[K
Unpacking objects: 100% (57/57), done.


In [9]:
import os
import pandas as pd
import sys
import pickle
import numpy as np
import torch
import torch.nn as nn

from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm

os.environ["CUDA_VISIBLE_DEVICES"] = '1'

In [2]:
sys.path.append('AlfaBattle2.0')
sys.path.append('AlfaBattle2.0/rnn_baseline')
sys.path.append('AlfaBattle2.0/rnn_baseline/baseline')

In [3]:
TRAIN_TRANSACTIONS_PATH = 'train_transactions_contest'
TRAIN_TARGET_PATH = 'train_target.csv'

In [4]:
target_frame = pd.read_csv(TRAIN_TARGET_PATH)
target_frame.head()

Unnamed: 0,app_id,product,flag
0,0,3,0
1,1,1,0
2,2,1,0
3,3,1,0
4,4,1,0


In [5]:
from utils import read_parquet_dataset_from_local
from dataset_preprocessing_utils import transform_transactions_to_sequences, create_padded_buckets

In [6]:
import pickle

with open('/content/AlfaBattle2.0/rnn_baseline/constants/buckets_info.pkl', 'rb') as f:
    mapping_seq_len_to_padded_len = pickle.load(f)
    
with open('/content/AlfaBattle2.0/rnn_baseline/constants/dense_features_buckets.pkl', 'rb') as f:
    dense_features_buckets = pickle.load(f)

with open('/content/AlfaBattle2.0/rnn_baseline/constants/embedding_projections.pkl', 'rb') as f:
  embedding_projections = pickle.load(f)

In [7]:
def create_buckets_from_transactions(path_to_dataset, save_to_path, frame_with_ids = None, 
                                     num_parts_to_preprocess_at_once: int = 1, 
                                     num_parts_total=50, has_target=False):
    block = 0
    for step in tqdm(range(0, num_parts_total, num_parts_to_preprocess_at_once), 
                                   desc="Transforming transactions data"):
        transactions_frame = read_parquet_dataset_from_local(path_to_dataset, step, num_parts_to_preprocess_at_once, 
                                                             verbose=True)
        for dense_col in ['amnt', 'days_before', 'hour_diff']:
            transactions_frame[dense_col] = np.digitize(transactions_frame[dense_col], bins=dense_features_buckets[dense_col])
            
        seq = transform_transactions_to_sequences(transactions_frame)
        seq['sequence_length'] = seq.sequences.apply(lambda x: len(x[1]))
        
        if frame_with_ids is not None:
            seq = seq.merge(frame_with_ids, on='app_id')

        block_as_str = str(block)
        if len(block_as_str) == 1:
            block_as_str = '00' + block_as_str
        else:
            block_as_str = '0' + block_as_str
            
        processed_fragment =  create_padded_buckets(seq, mapping_seq_len_to_padded_len, has_target=has_target, 
                                                    save_to_file_path=os.path.join(save_to_path, 
                                                                                   f'processed_chunk_{block_as_str}.pkl'))
        block += 1

In [8]:
train, val = train_test_split(target_frame, random_state=42, test_size=0.1)
train.shape, val.shape

((867429, 3), (96382, 3))

In [9]:
! rm -r val_buckets
! mkdir val_buckets

In [10]:
create_buckets_from_transactions(TRAIN_TRANSACTIONS_PATH, 
                                save_to_path='val_buckets',
                                frame_with_ids=val, num_parts_to_preprocess_at_once=2, num_parts_total=50, has_target=True)

Transforming transactions data:   0%|          | 0/25 [00:00<?, ?it/s]

Reading chunks:

train_transactions_contest/part_000_0_to_23646.parquet
train_transactions_contest/part_001_23647_to_47415.parquet


Reading dataset with pandas:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting buckets:   0%|          | 0/100 [00:00<?, ?it/s]

  'padded_sequences': np.array(padded_seq),
  'targets': np.array(targets) if targets else [],
  'app_id': np.array(app_ids),
  'products': np.array(products),


Reading chunks:

train_transactions_contest/part_002_47416_to_70092.parquet
train_transactions_contest/part_003_70093_to_92989.parquet


Reading dataset with pandas:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting buckets:   0%|          | 0/100 [00:00<?, ?it/s]

Reading chunks:

train_transactions_contest/part_004_92990_to_115175.parquet
train_transactions_contest/part_005_115176_to_138067.parquet


Reading dataset with pandas:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting buckets:   0%|          | 0/100 [00:00<?, ?it/s]

Reading chunks:

train_transactions_contest/part_006_138068_to_159724.parquet
train_transactions_contest/part_007_159725_to_180735.parquet


Reading dataset with pandas:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting buckets:   0%|          | 0/100 [00:00<?, ?it/s]

Reading chunks:

train_transactions_contest/part_008_180736_to_202834.parquet
train_transactions_contest/part_009_202835_to_224283.parquet


Reading dataset with pandas:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting buckets:   0%|          | 0/100 [00:00<?, ?it/s]

Reading chunks:

train_transactions_contest/part_010_224284_to_245233.parquet
train_transactions_contest/part_011_245234_to_265281.parquet


Reading dataset with pandas:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting buckets:   0%|          | 0/100 [00:00<?, ?it/s]

Reading chunks:

train_transactions_contest/part_012_265282_to_285632.parquet
train_transactions_contest/part_013_285633_to_306877.parquet


Reading dataset with pandas:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting buckets:   0%|          | 0/100 [00:00<?, ?it/s]

Reading chunks:

train_transactions_contest/part_014_306878_to_329680.parquet
train_transactions_contest/part_015_329681_to_350977.parquet


Reading dataset with pandas:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting buckets:   0%|          | 0/100 [00:00<?, ?it/s]

Reading chunks:

train_transactions_contest/part_016_350978_to_372076.parquet
train_transactions_contest/part_017_372077_to_392692.parquet


Reading dataset with pandas:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting buckets:   0%|          | 0/100 [00:00<?, ?it/s]

Reading chunks:

train_transactions_contest/part_018_392693_to_413981.parquet
train_transactions_contest/part_019_413982_to_434478.parquet


Reading dataset with pandas:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting buckets:   0%|          | 0/100 [00:00<?, ?it/s]

Reading chunks:

train_transactions_contest/part_020_434479_to_455958.parquet
train_transactions_contest/part_021_455959_to_477221.parquet


Reading dataset with pandas:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting buckets:   0%|          | 0/100 [00:00<?, ?it/s]

Reading chunks:

train_transactions_contest/part_022_477222_to_496751.parquet
train_transactions_contest/part_023_496752_to_517332.parquet


Reading dataset with pandas:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting buckets:   0%|          | 0/100 [00:00<?, ?it/s]

Reading chunks:

train_transactions_contest/part_024_517333_to_537036.parquet
train_transactions_contest/part_025_537037_to_557423.parquet


Reading dataset with pandas:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting buckets:   0%|          | 0/100 [00:00<?, ?it/s]

Reading chunks:

train_transactions_contest/part_026_557424_to_576136.parquet
train_transactions_contest/part_027_576137_to_595745.parquet


Reading dataset with pandas:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting buckets:   0%|          | 0/100 [00:00<?, ?it/s]

Reading chunks:

train_transactions_contest/part_028_595746_to_615602.parquet
train_transactions_contest/part_029_615603_to_635004.parquet


Reading dataset with pandas:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting buckets:   0%|          | 0/100 [00:00<?, ?it/s]

Reading chunks:

train_transactions_contest/part_030_635005_to_654605.parquet
train_transactions_contest/part_031_654606_to_673656.parquet


Reading dataset with pandas:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting buckets:   0%|          | 0/100 [00:00<?, ?it/s]

Reading chunks:

train_transactions_contest/part_032_673657_to_696025.parquet
train_transactions_contest/part_033_696026_to_714545.parquet


Reading dataset with pandas:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting buckets:   0%|          | 0/100 [00:00<?, ?it/s]

Reading chunks:

train_transactions_contest/part_034_714546_to_733168.parquet
train_transactions_contest/part_035_733169_to_752514.parquet


Reading dataset with pandas:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting buckets:   0%|          | 0/100 [00:00<?, ?it/s]

Reading chunks:

train_transactions_contest/part_036_752515_to_770940.parquet
train_transactions_contest/part_037_770941_to_788380.parquet


Reading dataset with pandas:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting buckets:   0%|          | 0/100 [00:00<?, ?it/s]

Reading chunks:

train_transactions_contest/part_038_788381_to_805771.parquet
train_transactions_contest/part_039_805772_to_823299.parquet


Reading dataset with pandas:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting buckets:   0%|          | 0/100 [00:00<?, ?it/s]

Reading chunks:

train_transactions_contest/part_040_823300_to_841218.parquet
train_transactions_contest/part_041_841219_to_859270.parquet


Reading dataset with pandas:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting buckets:   0%|          | 0/100 [00:00<?, ?it/s]

Reading chunks:

train_transactions_contest/part_042_859271_to_878521.parquet
train_transactions_contest/part_043_878522_to_896669.parquet


Reading dataset with pandas:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting buckets:   0%|          | 0/100 [00:00<?, ?it/s]

Reading chunks:

train_transactions_contest/part_044_896670_to_916056.parquet
train_transactions_contest/part_045_916057_to_935131.parquet


Reading dataset with pandas:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting buckets:   0%|          | 0/100 [00:00<?, ?it/s]

Reading chunks:

train_transactions_contest/part_046_935132_to_951695.parquet
train_transactions_contest/part_047_951696_to_970383.parquet


Reading dataset with pandas:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting buckets:   0%|          | 0/100 [00:00<?, ?it/s]

Reading chunks:

train_transactions_contest/part_048_970384_to_987313.parquet
train_transactions_contest/part_049_987314_to_1003050.parquet


Reading dataset with pandas:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting buckets:   0%|          | 0/100 [00:00<?, ?it/s]

In [11]:
path_to_dataset = 'val_buckets'
dir_with_datasets = os.listdir(path_to_dataset)
dataset_val = sorted([os.path.join(path_to_dataset, x) for x in dir_with_datasets])
dataset_val

['val_buckets/processed_chunk_000.pkl',
 'val_buckets/processed_chunk_001.pkl',
 'val_buckets/processed_chunk_002.pkl',
 'val_buckets/processed_chunk_003.pkl',
 'val_buckets/processed_chunk_004.pkl',
 'val_buckets/processed_chunk_005.pkl',
 'val_buckets/processed_chunk_006.pkl',
 'val_buckets/processed_chunk_007.pkl',
 'val_buckets/processed_chunk_008.pkl',
 'val_buckets/processed_chunk_009.pkl',
 'val_buckets/processed_chunk_010.pkl',
 'val_buckets/processed_chunk_011.pkl',
 'val_buckets/processed_chunk_012.pkl',
 'val_buckets/processed_chunk_013.pkl',
 'val_buckets/processed_chunk_014.pkl',
 'val_buckets/processed_chunk_015.pkl',
 'val_buckets/processed_chunk_016.pkl',
 'val_buckets/processed_chunk_017.pkl',
 'val_buckets/processed_chunk_018.pkl',
 'val_buckets/processed_chunk_019.pkl',
 'val_buckets/processed_chunk_020.pkl',
 'val_buckets/processed_chunk_021.pkl',
 'val_buckets/processed_chunk_022.pkl',
 'val_buckets/processed_chunk_023.pkl',
 'val_buckets/processed_chunk_024.pkl']

In [12]:
! rm -r train_buckets
! mkdir train_buckets

rm: cannot remove 'train_buckets': No such file or directory


In [13]:
create_buckets_from_transactions(TRAIN_TRANSACTIONS_PATH, 
                                save_to_path='train_buckets',
                                frame_with_ids=train, num_parts_to_preprocess_at_once=2, num_parts_total=50, has_target=True)

Transforming transactions data:   0%|          | 0/25 [00:00<?, ?it/s]

Reading chunks:

train_transactions_contest/part_000_0_to_23646.parquet
train_transactions_contest/part_001_23647_to_47415.parquet


Reading dataset with pandas:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting buckets:   0%|          | 0/100 [00:00<?, ?it/s]

  'padded_sequences': np.array(padded_seq),
  'targets': np.array(targets) if targets else [],
  'app_id': np.array(app_ids),
  'products': np.array(products),


Reading chunks:

train_transactions_contest/part_002_47416_to_70092.parquet
train_transactions_contest/part_003_70093_to_92989.parquet


Reading dataset with pandas:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting buckets:   0%|          | 0/100 [00:00<?, ?it/s]

Reading chunks:

train_transactions_contest/part_004_92990_to_115175.parquet
train_transactions_contest/part_005_115176_to_138067.parquet


Reading dataset with pandas:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting buckets:   0%|          | 0/100 [00:00<?, ?it/s]

Reading chunks:

train_transactions_contest/part_006_138068_to_159724.parquet
train_transactions_contest/part_007_159725_to_180735.parquet


Reading dataset with pandas:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting buckets:   0%|          | 0/100 [00:00<?, ?it/s]

Reading chunks:

train_transactions_contest/part_008_180736_to_202834.parquet
train_transactions_contest/part_009_202835_to_224283.parquet


Reading dataset with pandas:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting buckets:   0%|          | 0/100 [00:00<?, ?it/s]

Reading chunks:

train_transactions_contest/part_010_224284_to_245233.parquet
train_transactions_contest/part_011_245234_to_265281.parquet


Reading dataset with pandas:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting buckets:   0%|          | 0/100 [00:00<?, ?it/s]

Reading chunks:

train_transactions_contest/part_012_265282_to_285632.parquet
train_transactions_contest/part_013_285633_to_306877.parquet


Reading dataset with pandas:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting buckets:   0%|          | 0/100 [00:00<?, ?it/s]

Reading chunks:

train_transactions_contest/part_014_306878_to_329680.parquet
train_transactions_contest/part_015_329681_to_350977.parquet


Reading dataset with pandas:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting buckets:   0%|          | 0/100 [00:00<?, ?it/s]

Reading chunks:

train_transactions_contest/part_016_350978_to_372076.parquet
train_transactions_contest/part_017_372077_to_392692.parquet


Reading dataset with pandas:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting buckets:   0%|          | 0/100 [00:00<?, ?it/s]

Reading chunks:

train_transactions_contest/part_018_392693_to_413981.parquet
train_transactions_contest/part_019_413982_to_434478.parquet


Reading dataset with pandas:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting buckets:   0%|          | 0/100 [00:00<?, ?it/s]

Reading chunks:

train_transactions_contest/part_020_434479_to_455958.parquet
train_transactions_contest/part_021_455959_to_477221.parquet


Reading dataset with pandas:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting buckets:   0%|          | 0/100 [00:00<?, ?it/s]

Reading chunks:

train_transactions_contest/part_022_477222_to_496751.parquet
train_transactions_contest/part_023_496752_to_517332.parquet


Reading dataset with pandas:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting buckets:   0%|          | 0/100 [00:00<?, ?it/s]

Reading chunks:

train_transactions_contest/part_024_517333_to_537036.parquet
train_transactions_contest/part_025_537037_to_557423.parquet


Reading dataset with pandas:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting buckets:   0%|          | 0/100 [00:00<?, ?it/s]

Reading chunks:

train_transactions_contest/part_026_557424_to_576136.parquet
train_transactions_contest/part_027_576137_to_595745.parquet


Reading dataset with pandas:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting buckets:   0%|          | 0/100 [00:00<?, ?it/s]

Reading chunks:

train_transactions_contest/part_028_595746_to_615602.parquet
train_transactions_contest/part_029_615603_to_635004.parquet


Reading dataset with pandas:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting buckets:   0%|          | 0/100 [00:00<?, ?it/s]

Reading chunks:

train_transactions_contest/part_030_635005_to_654605.parquet
train_transactions_contest/part_031_654606_to_673656.parquet


Reading dataset with pandas:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting buckets:   0%|          | 0/100 [00:00<?, ?it/s]

Reading chunks:

train_transactions_contest/part_032_673657_to_696025.parquet
train_transactions_contest/part_033_696026_to_714545.parquet


Reading dataset with pandas:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting buckets:   0%|          | 0/100 [00:00<?, ?it/s]

Reading chunks:

train_transactions_contest/part_034_714546_to_733168.parquet
train_transactions_contest/part_035_733169_to_752514.parquet


Reading dataset with pandas:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting buckets:   0%|          | 0/100 [00:00<?, ?it/s]

Reading chunks:

train_transactions_contest/part_036_752515_to_770940.parquet
train_transactions_contest/part_037_770941_to_788380.parquet


Reading dataset with pandas:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting buckets:   0%|          | 0/100 [00:00<?, ?it/s]

Reading chunks:

train_transactions_contest/part_038_788381_to_805771.parquet
train_transactions_contest/part_039_805772_to_823299.parquet


Reading dataset with pandas:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting buckets:   0%|          | 0/100 [00:00<?, ?it/s]

Reading chunks:

train_transactions_contest/part_040_823300_to_841218.parquet
train_transactions_contest/part_041_841219_to_859270.parquet


Reading dataset with pandas:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting buckets:   0%|          | 0/100 [00:00<?, ?it/s]

Reading chunks:

train_transactions_contest/part_042_859271_to_878521.parquet
train_transactions_contest/part_043_878522_to_896669.parquet


Reading dataset with pandas:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting buckets:   0%|          | 0/100 [00:00<?, ?it/s]

Reading chunks:

train_transactions_contest/part_044_896670_to_916056.parquet
train_transactions_contest/part_045_916057_to_935131.parquet


Reading dataset with pandas:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting buckets:   0%|          | 0/100 [00:00<?, ?it/s]

Reading chunks:

train_transactions_contest/part_046_935132_to_951695.parquet
train_transactions_contest/part_047_951696_to_970383.parquet


Reading dataset with pandas:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting buckets:   0%|          | 0/100 [00:00<?, ?it/s]

Reading chunks:

train_transactions_contest/part_048_970384_to_987313.parquet
train_transactions_contest/part_049_987314_to_1003050.parquet


Reading dataset with pandas:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting buckets:   0%|          | 0/100 [00:00<?, ?it/s]

In [14]:
path_to_dataset = 'train_buckets'
dir_with_datasets = os.listdir(path_to_dataset)
dataset_train = sorted([os.path.join(path_to_dataset, x) for x in dir_with_datasets])
dataset_train

['train_buckets/processed_chunk_000.pkl',
 'train_buckets/processed_chunk_001.pkl',
 'train_buckets/processed_chunk_002.pkl',
 'train_buckets/processed_chunk_003.pkl',
 'train_buckets/processed_chunk_004.pkl',
 'train_buckets/processed_chunk_005.pkl',
 'train_buckets/processed_chunk_006.pkl',
 'train_buckets/processed_chunk_007.pkl',
 'train_buckets/processed_chunk_008.pkl',
 'train_buckets/processed_chunk_009.pkl',
 'train_buckets/processed_chunk_010.pkl',
 'train_buckets/processed_chunk_011.pkl',
 'train_buckets/processed_chunk_012.pkl',
 'train_buckets/processed_chunk_013.pkl',
 'train_buckets/processed_chunk_014.pkl',
 'train_buckets/processed_chunk_015.pkl',
 'train_buckets/processed_chunk_016.pkl',
 'train_buckets/processed_chunk_017.pkl',
 'train_buckets/processed_chunk_018.pkl',
 'train_buckets/processed_chunk_019.pkl',
 'train_buckets/processed_chunk_020.pkl',
 'train_buckets/processed_chunk_021.pkl',
 'train_buckets/processed_chunk_022.pkl',
 'train_buckets/processed_chunk_02

In [11]:
!nvidia-smi

Fri Jun 24 13:04:54 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   41C    P8     9W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [10]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)

Using device: cpu


In [12]:
torch.cuda.is_available()

False

In [16]:
from data_generators import batches_generator, transaction_features
from pytorch_training import train_epoch, eval_model, inference
from training_aux import EarlyStopping

In [17]:
class TransactionsRnn(nn.Module):
    def __init__(self, transactions_cat_features, embedding_projections, product_col_name='product', rnn_units=128, top_classifier_units=32):
        super(TransactionsRnn, self).__init__()
        self._transaction_cat_embeddings = nn.ModuleList([self._create_embedding_projection(*embedding_projections[feature]) 
                                                          for feature in transactions_cat_features])
                
        self._product_embedding = self._create_embedding_projection(*embedding_projections[product_col_name], padding_idx=None)
        
        self._gru = nn.GRU(input_size=sum([embedding_projections[x][1] for x in transactions_cat_features]),
                             hidden_size=rnn_units, batch_first=True, bidirectional=False)
        
        self._hidden_size = rnn_units
                
        self._top_classifier = nn.Linear(in_features=rnn_units+embedding_projections[product_col_name][1], 
                                         out_features=top_classifier_units)
        self._intermediate_activation = nn.ReLU()
        
        self._head = nn.Linear(in_features=top_classifier_units, out_features=1)
    
    def forward(self, transactions_cat_features, product_feature):
        batch_size = product_feature.shape[0]
        
        embeddings = [embedding(transactions_cat_features[i]) for i, embedding in enumerate(self._transaction_cat_embeddings)]
        concated_embeddings = torch.cat(embeddings, dim=-1)
        
        _, last_hidden = self._gru(concated_embeddings)
        last_hidden = torch.reshape(last_hidden.permute(1, 2, 0), shape=(batch_size, self._hidden_size))
        
        product_embed = self._product_embedding(product_feature)
        
        intermediate_concat = torch.cat([last_hidden, product_embed], dim=-1)
                
        classification_hidden = self._top_classifier(intermediate_concat)
        activation = self._intermediate_activation(classification_hidden)
        
        logit = self._head(activation)
        
        return logit
    
    @classmethod
    def _create_embedding_projection(cls, cardinality, embed_size, add_missing=True, padding_idx=0):
        add_missing = 1 if add_missing else 0
        return nn.Embedding(num_embeddings=cardinality+add_missing, embedding_dim=embed_size, padding_idx=padding_idx)

In [18]:
! rm -r checkpoints/pytorch_baseline
! mkdir checkpoints/pytorch_baseline

rm: cannot remove 'checkpoints/pytorch_baseline': No such file or directory
mkdir: cannot create directory ‘checkpoints/pytorch_baseline’: No such file or directory


In [19]:
path_to_checkpoints = 'checkpoints/pytorch_baseline/'
es = EarlyStopping(patience=3, mode='max', verbose=True, save_path=os.path.join(path_to_checkpoints, 'best_checkpoint.pt'), 
                   metric_name='ROC-AUC', save_format='torch')

In [20]:
num_epochs = 15
train_batch_size = 128
val_batch_szie = 128

In [21]:
model = TransactionsRnn(transaction_features, embedding_projections).to(device)

In [22]:
optimizer = torch.optim.Adam(lr=1e-3, params=model.parameters())

In [23]:
!rm test_transactions_contest.zip

rm: cannot remove 'test_transactions_contest.zip': No such file or directory


In [24]:
for epoch in range(num_epochs):
    print(f'Starting epoch {epoch+1}')
    train_epoch(model, optimizer, dataset_train, batch_size=train_batch_size, 
                shuffle=True, print_loss_every_n_batches=500, device=device)
    
    val_roc_auc = eval_model(model, dataset_val, batch_size=val_batch_szie, device=device)
    es(val_roc_auc, model)
    
    if es.early_stop:
        print('Early stopping reached. Stop training...')
        break
    torch.save(model.state_dict(), os.path.join(path_to_checkpoints, f'epoch_{epoch+1}_val_{val_roc_auc:.3f}.pt'))
    
    train_roc_auc = eval_model(model, dataset_train, batch_size=val_batch_szie, device=device)
    print(f'Epoch {epoch+1} completed. Train roc-auc: {train_roc_auc}, Val roc-auc: {val_roc_auc}')

Starting epoch 1


Training: 0it [00:00, ?it/s]

KeyboardInterrupt: ignored