In [47]:
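# Colab bootstrap (disabled). When enabled and no 'Hackathon' entry exists in the
# working directory, it mounts Google Drive, unpacks Hackathon.zip and installs
# the dependencies listed below.
# import os
# if 'Hackathon' not in os.listdir():
#     from google.colab import drive
#     drive.mount('/content/drive')
#     !7z x drive/MyDrive/Hackathon.zip
#     !pip install pytorch-lifestream
#     !pip install polars==0.20.31
#     !pip install pytorch_lightning==1.9.0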

In [1]:
import os
import polars as pl
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import gc
from datetime import datetime
import pickle
import zipfile
from transformers import AdamW, AutoConfig, AutoModel, AutoTokenizer, get_cosine_schedule_with_warmup

from functools import partial
from ptls.frames.supervised import SeqToTargetDataset, SequenceToTarget
from ptls.frames import PtlsDataModule
from ptls.preprocessing import PandasDataPreprocessor
from ptls.data_load.utils import collate_feature_dict
from ptls.data_load.padded_batch import PaddedBatch
from ptls.nn import TrxEncoder, RnnSeqEncoder, Head
import torch
import torchmetrics
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from sklearn.model_selection import KFold
from  sklearn.metrics import roc_auc_score, f1_score

In [51]:
# pd.__version__, np.__version__  ->  ('2.0.3', '1.25.2')

In [2]:
# from utils import read_parquet, read_geo_parquet, read_data
from utils import read_parquet, read_geo_parquet, read_data, SberDataset, Collate, val_step, SberModel, standart_split

In [3]:
# --- Input locations ---------------------------------------------------------
# Test-side parquet files.
TEST_TARGET_PATH = '/content/Hackathon/test_target_b.parquet'
TEST_TXN_PATH = '/content/Hackathon/trx_test.parquet'
TEST_DIAL_PATH = '/content/Hackathon/dial_test.parquet'
TEST_GEO_PATH = '/content/Hackathon/geo_test.parquet'

# All four train-side paths are set to the same placeholder string.
# (The train files themselves are train_target / trx_train / dial_train /
# geo_train .parquet under /content/Hackathon/.)
# NOTE(review): how read_data treats 'empty_folder' is not visible here — confirm.
TRAIN_TARGET_PATH = TRAIN_TXN_PATH = TRAIN_DIAL_PATH = TRAIN_GEO_PATH = 'empty_folder'

# --- Column groups -----------------------------------------------------------
# Target columns: target_1 .. target_4.
TARGET_LIST = [f'target_{idx}' for idx in range(1, 5)]

# Transaction columns that get an embedding table.
EMBED_LIST = [
    'event_type', 'event_subtype', 'currency',
    'src_type11', 'src_type12',
    'dst_type11', 'dst_type12',
    'src_type21', 'src_type22',
    'src_type31', 'src_type32',
]

# Alias (same list object), used by the grouping / dataset / encoder cells.
embed_cols_train = EMBED_LIST

In [4]:
# Load all inputs through the project-local `read_data` helper (imported from utils).
# Paths are passed as train/test pairs in the order target, transactions, geo, dialogs,
# followed by the embedding-column list and the target-column list.
(
    transactions,
    target_data,
    embed_max_data,
    dict_client_ids,
    dial,
    geo,
) = read_data(
    TRAIN_TARGET_PATH, TEST_TARGET_PATH,
    TRAIN_TXN_PATH, TEST_TXN_PATH,
    TRAIN_GEO_PATH, TEST_GEO_PATH,
    TRAIN_DIAL_PATH, TEST_DIAL_PATH,
    EMBED_LIST,
    TARGET_LIST,
)

  0%|          | 0/11 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

In [16]:
import pickle

# Restore four objects stored as one pickled tuple in 'saved_data.pickle'.
# `embed_max_data` is rebound here, replacing the value returned by read_data.
# NOTE(review): presumably these are statistics saved by the training run — confirm.
# SECURITY: pickle.load executes arbitrary code from the file; only load files you produced.
with open('saved_data.pickle', 'rb') as saved_file:
    saved_state = pickle.load(saved_file)
embed_max_data, mean_val, std_val, dict_vc_col = saved_state

In [6]:
def _replace_unlisted(frame, column):
    """Return `frame` with out-of-list values of `column` replaced by a fallback id.

    Values contained in ``dict_vc_col[column]`` are kept unchanged; every other
    value becomes ``embed_max_data[column] + 1``.
    """
    allowed_values = dict_vc_col[column]
    fallback_id = embed_max_data[column] + 1
    keep_or_fallback = (
        pl.when(pl.col(column).is_in(allowed_values))
        .then(pl.col(column))
        .otherwise(fallback_id)
    )
    return frame.with_columns(keep_or_fallback)


# Categorical transaction columns.
for col in tqdm(EMBED_LIST):
    transactions = _replace_unlisted(transactions, col)

# Geo hash column.
for col in tqdm(['geohash_4']):
    geo = _replace_unlisted(geo, col)

  0%|          | 0/11 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

In [7]:
# Two-step rescaling of 'amount': natural log of (1 + amount), then
# standardisation with the mean_val / std_val loaded from the pickle.
# NOTE: this cell rewrites 'amount' in place, so running it twice applies the
# transform twice.
log_amount = (1 + pl.col('amount')).log().alias('amount')
standardised_amount = ((pl.col('amount') - mean_val) / std_val).alias('amount')

transactions = transactions.with_columns(log_amount).with_columns(standardised_amount)

In [9]:
# Unique client ids of the rows whose target_1 equals -1.
# NOTE(review): -1 is presumably the marker read_data assigns to test rows — confirm.
is_test_row = pl.col('target_1') == -1
test_users = target_data.filter(is_test_row).get_column('client_id').unique().to_numpy()

In [12]:
def _group_by_client(frame, agg_cols, maintain_order=False):
    """Collapse `frame` to one row per client_id and build its lookup tables.

    Returns a 3-tuple:
      * the grouped frame converted to a numpy array (each aggregated column
        holds the per-client list of values),
      * a dict mapping column name -> column position in that array,
      * a dict mapping client_id -> row position in that array.
    """
    grouped = frame.group_by('client_id', maintain_order=maintain_order).agg(
        [pl.col(name) for name in agg_cols]
    )
    column_pos = {name: pos for pos, name in enumerate(grouped.columns)}
    rows = grouped.to_numpy()
    row_pos = {client: pos for pos, client in enumerate(rows[:, column_pos['client_id']])}
    return rows, column_pos, row_pos


# Transactions and targets keep first-appearance client order; dialogs and geo do not
# request it (their rows are always reached through the *_key dictionaries).
gp_transactions_train, trx_cols, gp_transactions_train_key = _group_by_client(
    transactions, embed_cols_train + ['event_time', 'amount'], maintain_order=True
)
gp_target, target_cols, gp_target_key = _group_by_client(
    target_data, TARGET_LIST + ['mon'], maintain_order=True
)
gp_dial, dial_cols, gp_dial_key = _group_by_client(dial, ['embedding', 'event_time'])
gp_geo, geo_cols, gp_geo_key = _group_by_client(geo, ['geohash_4', 'event_time'])

# Column-position maps in fixed order: [transactions, target, dial, geo].
dict_gp_cols = [trx_cols, target_cols, dial_cols, geo_cols]

gc.collect()

0

In [21]:
# Embedding table spec per categorical transaction column:
# input size is int(embed_max_data[column]) + 3, output width is 32.
embs_dict = {
    name: {'in': int(embed_max_data[name]) + 3, "out": 32}
    for name in embed_cols_train
}

# Parameter dicts handed to SberModel in the inference cell.
# NOTE(review): the key names match ptls TrxEncoder arguments; how SberModel
# consumes them is not visible here — confirm in utils.
trx_encoder_params = {
    'embeddings_noise': 0.005,
    'numeric_values': dict.fromkeys(['amount', 'diff_event_time'], 'identity'),
    'embeddings': embs_dict,
}

tgt_encoder_params = {
    'embeddings_noise': 0.005,
    'numeric_values': {f'feat_{name}': 'identity' for name in TARGET_LIST},
    'embeddings': {},
}

# The geo table uses a +4 offset and no int() cast, unlike the +3 / int() used for the
# transaction columns; both are kept as-is.
# NOTE(review): the sizes presumably must match the saved checkpoints — confirm before changing.
geo_encoder_params = {
    'embeddings_noise': 0.005,
    'numeric_values': {'geo_time': 'identity'},
    'embeddings': {'geo_hash': {'in': embed_max_data['geohash_4'] + 4, "out": 32}},
}

In [23]:
# --- Test-time inference over the 5 fold checkpoints -------------------------
# Produces `list_preds`: one array of sigmoid outputs per fold, rows in the
# order of `test_users` (the loader does not shuffle or drop batches).
batch_size = 1024
params_val = {'batch_size': batch_size, 'shuffle': False, 'drop_last': False, 'num_workers': 4}

# Use the GPU when one is visible, otherwise fall back to CPU so the cell (and
# the checkpoint loading below) does not crash on a CPU-only machine.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
use_amp = device == 'cuda'  # CUDA autocast is only meaningful on a GPU

# Keep the Dataset and the DataLoader under different names; `sber_dataset_test`
# still ends up bound to the DataLoader, as before.
test_dataset = SberDataset(gp_transactions_train, gp_transactions_train_key, gp_target, gp_target_key, gp_dial, gp_dial_key, gp_geo, gp_geo_key,
        test_users, dict_gp_cols, val = True, embed_cols_train = embed_cols_train, TARGET_LIST = TARGET_LIST)
collate_fn = Collate(TARGET_LIST, embed_cols_train)
sber_dataset_test = DataLoader(test_dataset, collate_fn = collate_fn, **params_val)

list_preds = []
for FOLD in range(5):
    # Fresh model per fold, put in eval mode, then filled with that fold's weights.
    model = SberModel(trx_encoder_params, tgt_encoder_params, geo_encoder_params, 4).to(device).eval()
    # map_location follows `device`, so checkpoints saved on GPU also load on CPU.
    model_dict = torch.load(f"sber_big_new_{FOLD}.pt", map_location=device)
    model.load_state_dict(model_dict['state_dict'])

    preds = []
    with torch.no_grad():
        for vals in tqdm(sber_dataset_test, total=len(sber_dataset_test)):
            # Only the four model inputs are moved to the device; vals[4] (the
            # target slot of the batch) is not needed for prediction.
            emb_feats = vals[0].to(device)
            emb_target = vals[1].to(device)
            emb_dial = vals[2].to(device)
            emb_geo = vals[3].to(device)

            with torch.cuda.amp.autocast(enabled=use_amp):
                ans = model(emb_feats, emb_target, emb_dial, emb_geo)
            # Under autocast the output may be half precision; cast to float32
            # before the sigmoid so probabilities are not quantised to fp16
            # ahead of the fold averaging.
            preds.append(torch.sigmoid(ans.float()).detach().cpu().numpy())

    list_preds.append(np.concatenate(preds, 0))

  0%|          | 0/138 [00:00<?, ?it/s]

  0%|          | 0/138 [00:00<?, ?it/s]

  0%|          | 0/138 [00:00<?, ?it/s]

  0%|          | 0/138 [00:00<?, ?it/s]

  0%|          | 0/138 [00:00<?, ?it/s]

In [24]:
# Swap keys and values of dict_client_ids so mapped ids can be turned back into
# the ids stored as its keys.
rev_client_id = {mapped: key for key, mapped in dict_client_ids.items()}

# Element-wise mean of the per-fold prediction arrays (averaged over the fold axis).
preds_ = np.mean(list_preds, axis=0)

In [43]:
import zipfile

In [45]:
# --- Build and package the submission ----------------------------------------
# One row per test client, one column per target, taken from the fold-averaged
# predictions (column i of preds_ goes to TARGET_LIST[i]).
pred = pd.DataFrame()
pred['client_id'] = test_users
for i, x in enumerate(TARGET_LIST):
    pred[x] = preds_[:, i]
# Translate client ids back through the inverted dict_client_ids mapping.
pred['client_id'] = pred['client_id'].map(rev_client_id)

# Align to the client_id rows (and their order) of the sample submission.
ss = pd.read_csv('drive/MyDrive/sample_submission_sber.csv')
ss1 = ss[['client_id']].merge(pred, on='client_id', how='left')

# A left merge silently leaves NaN where a sample-submission client has no
# prediction; report it instead of failing so the file is still written.
n_missing = int(ss1[TARGET_LIST].isna().any(axis=1).sum())
if n_missing:
    print(f'WARNING: {n_missing} submission row(s) have missing predictions')

ss1.to_csv('submission.csv', index=False)

# Close the archive explicitly: the zip's final records are only written on
# close(), which must not be left to garbage collection.
with zipfile.ZipFile('pred_sber_final_final.zip', mode='w') as archive:
    archive.write("submission.csv")