In [1]:
# Optional Google Colab bootstrap, kept disabled (every line is commented out).
# If enabled and no 'Hackathon' entry exists in the working directory, it mounts
# Google Drive, unpacks drive/MyDrive/Hackathon.zip with 7z and pip-installs the
# dependencies (polars and pytorch_lightning are pinned, pytorch-lifestream is not).
# import os
# if 'Hackathon' not in os.listdir():
#     from google.colab import drive
#     drive.mount('/content/drive')
#     !7z x drive/MyDrive/Hackathon.zip
#     !pip install pytorch-lifestream
#     !pip install polars==0.20.31
#     !pip install pytorch_lightning==1.9.0

In [None]:
import os
import polars as pl
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import gc
from datetime import datetime
import pickle
from transformers import AdamW, AutoConfig, AutoModel, AutoTokenizer, get_cosine_schedule_with_warmup

from functools import partial
from ptls.frames.supervised import SeqToTargetDataset, SequenceToTarget
from ptls.frames import PtlsDataModule
from ptls.preprocessing import PandasDataPreprocessor
from ptls.data_load.utils import collate_feature_dict
from ptls.data_load.padded_batch import PaddedBatch
from ptls.nn import TrxEncoder, RnnSeqEncoder, Head
import torch
import torchmetrics
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from sklearn.model_selection import KFold
from  sklearn.metrics import roc_auc_score, f1_score

In [None]:
from utils import read_parquet, read_geo_parquet, read_data, SberDataset, Collate, val_step, SberModel, standart_split

In [None]:
# Root folder of the unpacked data; edit this single line to run the notebook
# against a copy of the data stored somewhere else.
DATA_DIR = '/content/Hackathon'

# Parquet inputs: one train/test pair per source (target, trx, dial, geo).
TEST_TARGET_PATH = f'{DATA_DIR}/test_target_b.parquet'
TRAIN_TARGET_PATH = f'{DATA_DIR}/train_target.parquet'

TEST_TXN_PATH = f'{DATA_DIR}/trx_test.parquet'
TRAIN_TXN_PATH = f'{DATA_DIR}/trx_train.parquet'

TEST_DIAL_PATH = f'{DATA_DIR}/dial_test.parquet'
TRAIN_DIAL_PATH = f'{DATA_DIR}/dial_train.parquet'

TEST_GEO_PATH = f'{DATA_DIR}/geo_test.parquet'
TRAIN_GEO_PATH = f'{DATA_DIR}/geo_train.parquet'

# Target columns aggregated per client and fed to the target encoder.
TARGET_LIST = ['target_1', 'target_2', 'target_3', 'target_4']

# Categorical transaction columns that receive an embedding table.
EMBED_LIST = [
    'event_type', 'event_subtype', 'currency',
    'src_type11', 'src_type12', 'dst_type11', 'dst_type12',
    'src_type21', 'src_type22', 'src_type31', 'src_type32',
]

# Alias of EMBED_LIST (the very same list object, not a copy).
embed_cols_train = EMBED_LIST

In [None]:
# Load all sources through the project helper `read_data`. The arguments are
# positional, so the train/test ordering of each pair below must be kept.
(
    transactions,
    target_data,
    embed_max_data,
    dict_client_ids,
    dial,
    geo,
) = read_data(
    TRAIN_TARGET_PATH, TEST_TARGET_PATH,
    TRAIN_TXN_PATH, TEST_TXN_PATH,
    TRAIN_GEO_PATH, TEST_GEO_PATH,
    TRAIN_DIAL_PATH, TEST_DIAL_PATH,
    EMBED_LIST, TARGET_LIST,
)

  0%|          | 0/11 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

In [None]:
def _collapse_rare_values(frame, col, min_count, max_types, fill_value):
    """Overwrite infrequent values of ``col`` in ``frame`` with ``fill_value``.

    A value is kept when it occurs more than ``min_count`` times and is among
    the first ``max_types`` entries of the count-sorted ``value_counts``
    result; every other value of the column is replaced by ``fill_value``.

    Args:
        frame: polars DataFrame holding ``col``.
        col: name of the column to rewrite.
        min_count: a value must occur strictly more often than this to be kept.
        max_types: upper bound on how many distinct values are kept.
        fill_value: replacement written over every value that is not kept.

    Returns:
        ``(new_frame, kept_values)`` - the frame with ``col`` rewritten and
        the list of values that were left untouched.
    """
    counts = frame[col].value_counts(sort=True)
    kept_values = counts.filter(counts['count'] > min_count)[col].to_list()[:max_types]
    rewritten = (
        pl.when(pl.col(col).is_in(kept_values))
        .then(pl.col(col))
        .otherwise(fill_value)
    )
    return frame.with_columns(rewritten), kept_values


# NOTE: this cell overwrites `transactions` and `geo` in place, and the geohash
# maximum below is read from `geo` itself, so running the cell a second time on
# the already-collapsed frames yields different ids. Run it once per kernel.
MIN_COUNT_TYPE = 5000
MAX_NUMBER_TYPES = 300
dict_vc_col = {}  # column name -> values that were kept as-is

for col in tqdm(EMBED_LIST):
    # Rare transaction categories share one id just above the column maximum.
    transactions, kept = _collapse_rare_values(
        transactions, col, MIN_COUNT_TYPE, MAX_NUMBER_TYPES, embed_max_data[col] + 1
    )
    dict_vc_col[col] = kept

# The geohash column uses the same count threshold but may keep more values.
MAX_NUMBER_TYPES = 500

embed_max_data['geohash_4'] = geo['geohash_4'].max() + 1

for col in tqdm(['geohash_4']):
    geo, kept = _collapse_rare_values(
        geo, col, MIN_COUNT_TYPE, MAX_NUMBER_TYPES, embed_max_data[col] + 1
    )
    dict_vc_col[col] = kept

  0%|          | 0/11 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
# Step 1: replace `amount` with log(1 + amount).
log_amount_expr = (1 + pl.col('amount')).log()
transactions = transactions.with_columns(log_amount_expr.alias('amount'))

# Step 2: mean / std of the log-transformed column; both are pickled later on.
mean_val = transactions['amount'].mean()
std_val = transactions['amount'].std()

# Step 3: standardise the column with those statistics.
scaled_amount_expr = (pl.col('amount') - mean_val) / std_val
transactions = transactions.with_columns(scaled_amount_expr.alias('amount'))

In [None]:
# Clients that have a row with target_1 == -1 are taken as the test users.
is_test_row = pl.col('target_1') == -1
test_users = target_data.filter(is_test_row)['client_id'].unique().to_numpy()

# Every other client, sorted, is handed to the project helper `standart_split`.
train_id_set = set(target_data['client_id'].unique()).difference(test_users)
full_train_users = np.array(sorted(train_id_set))
split_list = standart_split(full_train_users)

# Sorted ids of all clients present in the target table (train and test).
all_users = sorted(target_data['client_id'].unique().to_list())

In [None]:
def _group_by_client(frame, value_cols, maintain_order=False):
    """Aggregate ``value_cols`` of ``frame`` into one row per ``client_id``.

    Returns a triple ``(rows, col_pos, row_pos)``:
      * ``rows``    - the grouped frame converted with ``to_numpy()``;
      * ``col_pos`` - column name -> column index inside ``rows``;
      * ``row_pos`` - client id -> row index inside ``rows``.
    """
    grouped = frame.group_by('client_id', maintain_order=maintain_order).agg(
        [pl.col(name) for name in value_cols]
    )
    col_pos = {name: idx for idx, name in enumerate(grouped.columns)}
    rows = grouped.to_numpy()
    row_pos = {client: idx for idx, client in enumerate(rows[:, col_pos['client_id']])}
    return rows, col_pos, row_pos


# Transactions and targets keep the input row order; dial and geo do not ask for it.
gp_transactions_train, trx_col_pos, gp_transactions_train_key = _group_by_client(
    transactions, embed_cols_train + ['event_time', 'amount'], maintain_order=True
)
gp_target, target_col_pos, gp_target_key = _group_by_client(
    target_data, TARGET_LIST + ['mon'], maintain_order=True
)
gp_dial, dial_col_pos, gp_dial_key = _group_by_client(dial, ['embedding', 'event_time'])
gp_geo, geo_col_pos, gp_geo_key = _group_by_client(geo, ['geohash_4', 'event_time'])

# Column-index maps, positionally: 0 = transactions, 1 = target, 2 = dial, 3 = geo.
dict_gp_cols = [trx_col_pos, target_col_pos, dial_col_pos, geo_col_pos]

gc.collect()

18

In [None]:
# One embedding table per categorical transaction column. The table size is the
# stored column maximum plus 3; every table projects to 32 dimensions.
embs_dict = {
    name: {'in': int(embed_max_data[name]) + 3, "out": 32}
    for name in embed_cols_train
}

# Transaction encoder: the embeddings above plus two numeric inputs passed as-is.
trx_encoder_params = {
    'embeddings_noise': 0.005,
    'numeric_values': dict.fromkeys(['amount', 'diff_event_time'], 'identity'),
    'embeddings': embs_dict,
}

# Target encoder: numeric `feat_<target>` inputs only, no embeddings.
tgt_encoder_params = {
    'embeddings_noise': 0.005,
    'numeric_values': {f'feat_{name}': 'identity' for name in TARGET_LIST},
    'embeddings': {},
}

# Geo encoder: one geohash embedding plus a numeric time feature.
geo_encoder_params = {
    'embeddings_noise': 0.005,
    'numeric_values': {'geo_time': 'identity'},
    'embeddings': {'geo_hash': {'in': embed_max_data['geohash_4'] + 3, "out": 32}},
}

In [None]:
# Persist the preprocessing state as a positional list:
# [embed_max_data, mean_val, std_val, dict_vc_col]. The order must not change,
# since anything that loads this file has to index the list the same way.
# `pickle` is already imported in the notebook's import cell.
preprocessing_state = [embed_max_data, mean_val, std_val, dict_vc_col]
with open('saved_data.pickle', 'wb') as out_file:
    pickle.dump(preprocessing_state, out_file)

In [2]:
# Cross-validated training: one model per fold, mixed-precision training with
# gradient clipping and a OneCycleLR schedule, one checkpoint per fold.
gc.collect()
torch.cuda.empty_cache()

loss_fct = nn.BCEWithLogitsLoss()
# A single GradScaler is shared by all folds, so its scale carries over.
scaler = torch.cuda.amp.GradScaler()
clip_grad_norm = 1  # max gradient norm; a value <= 0 disables clipping

lr = 1e-3
weight_decay = 1e-2
epochs = 15

device = 'cuda'

batch_size = 1024
params_train = {'batch_size': batch_size, 'shuffle': True, 'drop_last': True, 'num_workers': 4}
params_val = {'batch_size': batch_size, 'shuffle': False, 'drop_last': False, 'num_workers': 4}

for FOLD in range(5):

    train_users = split_list[FOLD][0]
    val_users = split_list[FOLD][1]

    # The training dataset is built over all users and is given the validation
    # and test users through `val_users`.
    # NOTE(review): presumably SberDataset keeps these users' targets out of
    # training - confirm in utils.SberDataset.
    drop_val_users = set(val_users) | set(test_users)

    sber_dataset = SberDataset(gp_transactions_train, gp_transactions_train_key, gp_target, gp_target_key,
                               gp_dial, gp_dial_key, gp_geo, gp_geo_key,
                               all_users, dict_gp_cols, val_users=drop_val_users,
                               embed_cols_train=embed_cols_train, TARGET_LIST=TARGET_LIST)
    collate_fn = Collate(TARGET_LIST, embed_cols_train)
    sber_dataloader = DataLoader(sber_dataset, collate_fn=collate_fn, **params_train)

    sber_dataset_val = SberDataset(gp_transactions_train, gp_transactions_train_key, gp_target, gp_target_key,
                                   gp_dial, gp_dial_key, gp_geo, gp_geo_key,
                                   val_users, dict_gp_cols, val=True,
                                   embed_cols_train=embed_cols_train, TARGET_LIST=TARGET_LIST)
    collate_fn = Collate(TARGET_LIST, embed_cols_train)
    sber_dataloader_val = DataLoader(sber_dataset_val, collate_fn=collate_fn, **params_val)

    model = SberModel(trx_encoder_params, tgt_encoder_params, geo_encoder_params, 4).to(device).train()
    # NOTE(review): this AdamW is the one imported from `transformers` at the top
    # of the notebook; that implementation is deprecated upstream in favour of
    # torch.optim.AdamW - confirm before upgrading transformers.
    optimizer = AdamW(params=model.parameters(), lr=lr, weight_decay=weight_decay)

    num_train_steps = int(len(sber_dataloader) * epochs)
    print(num_train_steps)

    # pct_start=0. removes the warm-up phase: the schedule only anneals from max_lr.
    scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=lr, final_div_factor=1e2,
                                                    pct_start=0.,
                                                    steps_per_epoch=len(sber_dataloader), epochs=epochs)

    for epoch in range(epochs):
        # Re-enter train mode once per epoch (validation runs at the end of each
        # epoch and may have switched the model to eval mode).
        model.train()
        average_loss = 0
        tk0 = tqdm(enumerate(sber_dataloader), total=len(sber_dataloader))
        for batch_number, vals in tk0:
            # Batch layout produced by Collate: four model inputs followed by the target.
            emb_feats = vals[0].to(device)
            emb_target = vals[1].to(device)
            emb_dial = vals[2].to(device)
            emb_geo = vals[3].to(device)
            target = vals[4].to(device)
            optimizer.zero_grad()

            with torch.cuda.amp.autocast():
                # Call the module itself rather than .forward() so that hooks run.
                pred = model(emb_feats, emb_target, emb_dial, emb_geo)
                loss = loss_fct(pred, target)

            scaler.scale(loss).backward()
            if clip_grad_norm > 0:
                # Gradients must be unscaled before their norm is clipped.
                scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(model.parameters(), clip_grad_norm)
            scaler.step(optimizer)
            scale_before = scaler.get_scale()
            scaler.update()
            # update() lowers the scale only when the optimizer step was skipped
            # (inf/NaN gradients) and raises it after a run of good steps. The
            # scheduler must advance on every step that was actually taken, so
            # skip it only when the scale went down (the previous `==` check
            # also skipped it whenever the scale grew).
            if scaler.get_scale() >= scale_before:
                scheduler.step()

            average_loss += loss.item()
            lr_now = scheduler.get_last_lr()
            tk0.set_postfix(loss=average_loss / (batch_number + 1), stage="train", epoch=epoch, lr=lr_now)

        val_step(sber_dataloader_val, model, device)

    # The weights of the last epoch are saved, independent of the validation result.
    torch.save({'state_dict': model.state_dict()}, f'sber_big_new_{FOLD}.pt')