In [1]:
import sys
import os

import pandas as pd
import numpy as np
import torch
from glob import glob
from torch.utils.data import DataLoader

from ptls.data_load import IterableChain
from ptls.data_load.iterable_processing.iterable_seq_len_limit import ISeqLenLimit
from ptls.data_load.iterable_processing.category_size_clip import CategorySizeClip
from ptls.data_load.iterable_processing.to_torch_tensor import ToTorch
from ptls.data_load.iterable_processing.target_move import TargetMove
from ptls.data_load.datasets.parquet_dataset import ParquetDataset, ParquetFiles
from ptls.data_load.iterable_processing.feature_filter import FeatureFilter

from functools import partial
from ptls.nn import TrxEncoder, RnnSeqEncoder
from ptls.frames.coles import CoLESModule
from ptls.data_load.iterable_processing import SeqLenFilter
from ptls.data_load.iterable_processing import FeatureBinScaler
from ptls.frames.coles import ColesIterableDataset
from ptls.frames.coles.split_strategy import SampleSlices
from ptls.frames import PtlsDataModule
from ptls.data_load.datasets import inference_data_loader
from sklearn.model_selection import train_test_split

import pytorch_lightning as pl

import logging
import pickle

from itertools import groupby
from functools import reduce
from operator import iadd

from collections import defaultdict

from ptls.data_load.feature_dict import FeatureDict
from ptls.frames.coles.split_strategy import AbsSplit

from functools import partial
from ptls.nn import TrxEncoder
from ptls.nn.seq_encoder.rnn_encoder import RnnEncoder
from ptls.frames.coles.coles_module import CoLESModule
from ptls.frames.inference_module import InferenceModule

import warnings
warnings.filterwarnings("ignore")

from ptls.data_load.utils import collate_feature_dict
from tqdm.auto import tqdm
import lightgbm as ltb
import json

In [2]:
from ptls.frames.bert import  MlmDataset, MlmIterableDataset
from ptls.frames.tabformer.tabformer_dataset import TabformerDataset,  TabformerIterableDataset
from ptls.nn import TabFormerFeatureEncoder, TransformerEncoder
from ptls.frames.tabformer.tabformer_module import TabformerPretrainModule

# Data preprocessing

In [3]:
# Per-feature category caps, handed to CategorySizeClip in the processing chain.
# Every geohash feature uses the same cap, one below the embedding table
# size ('in': 5000) configured for the model.
dataset_conf = dict(
    category_max_size={
        feature: 4999
        for feature in ('geohash_4', 'geohash_5', 'geohash_6')
    },
)

In [4]:
# Input parquet files. `valid_data_path` points at the test-split file: it backs
# the `valid` ParquetDataset and, later, the `test` dataset used for inference.
train_data_path = 'geo_train_prepr.parquet'
valid_data_path = 'geo_test_prepr.parquet'

In [5]:
# Processing chain applied, in this order, to every record read for pre-training.
process = IterableChain(
    SeqLenFilter(min_seq_len=32),
    ISeqLenLimit(max_seq_len=4096),
    # The identifier and the label columns are dropped before pre-training.
    FeatureFilter(
        drop_feature_names=[
            'client_id',
            'target_1',
            'target_2',
            'target_3',
            'target_4',
        ],
    ),
    CategorySizeClip(dataset_conf['category_max_size']),
    ToTorch(),
)

In [6]:
# Both datasets go through the same `process` chain; shuffle_files=True is passed
# only for the training dataset.
# NOTE(review): `train` is rebound to a differently processed dataset in the
# inference section, so re-running earlier cells afterwards will pick that one up.
train = ParquetDataset([train_data_path], post_processing=process, shuffle_files=True)
valid = ParquetDataset([valid_data_path], post_processing=process)

In [7]:
# Wrap the parquet datasets for pre-training (min_len=80, max_len=300 on both sides).
train_ds = TabformerIterableDataset(
    data=train,
    min_len=80,
    max_len=300
)

# Validation reads the `valid` dataset. It previously re-used `train`, which made
# the validation metrics a measurement on the training data and left `valid` unused.
# NOTE(review): training uses TabformerIterableDataset while validation uses
# MlmIterableDataset -- confirm the mix of classes is intentional.
valid_ds = MlmIterableDataset(
    data=valid,
    min_len=80,
    max_len=300
)

In [8]:
# NOTE: despite its name, `train_dl` is a PtlsDataModule carrying both the
# training and the validation data; it is what gets passed to `trainer.fit`.
train_dl = PtlsDataModule(
    # training side
    train_data=train_ds,
    train_batch_size=256,
    train_num_workers=16,
    # validation side
    valid_data=valid_ds,
    valid_batch_size=256,
    valid_num_workers=16,
)

# Model

In [9]:
# Three geohash features, each embedded from a 5000-entry table into 24 dimensions.
trx_encoder_params = {
    'embeddings_noise': 0.003,
    'embeddings': {
        feature: {'in': 5000, 'out': 24}
        for feature in ('geohash_4', 'geohash_5', 'geohash_6')
    },
}

# Positional arguments (3, 24) mirror the embedding config above (3 features, 24 dims).
feature_encoder = TabFormerFeatureEncoder(3, 24)

# input_size = number of features * embedding size.
seq_encoder = TransformerEncoder(
    input_size=3 * 24,
    n_heads=2,
    n_layers=2,
    use_positional_encoding=False,
)

model = TabformerPretrainModule(
    trx_encoder=TrxEncoder(**trx_encoder_params),
    seq_encoder=seq_encoder,
    feature_encoder=feature_encoder,
    mask_prob=0.2,
    total_steps=50000,
)

# Train

In [None]:
import numpy as np
import torch
import pytorch_lightning as pl

import logging

# Trainer for the pre-training run.
# NOTE(review): the `gpus=` argument only exists in older Lightning releases --
# confirm the installed version before upgrading.
trainer = pl.Trainer(
    max_epochs=15,
    limit_val_batches=5000,
    gpus=[0],
    enable_progress_bar=False,
    gradient_clip_val=0.5,
    logger=pl.loggers.TensorBoardLogger(
        save_dir='./logdir', name='baseline_result_tabformer'
    ),
    callbacks=[
        pl.callbacks.LearningRateMonitor(logging_interval='step'),
        # A checkpoint every 5000 training steps; save_top_k=-1 keeps all of them.
        pl.callbacks.ModelCheckpoint(every_n_train_steps=5000, save_top_k=-1),
    ],
)

In [None]:
# Run pre-training. `train_dl` is the PtlsDataModule built above, which supplies
# both the training and the validation data.
trainer.fit(model, train_dl)

In [13]:
# Persist the trained sequence encoder as a whole module object.
# NOTE(review): torch.save does not create '../models/'; the directory must exist.
# NOTE(review): this cell reads `model.seq_encoder`, so it has to run before the
# next cell rebinds `model`.
torch.save(model.seq_encoder, '../models/geo_baseline_tabformer.pt')

In [15]:
# Keep only the sequence encoder for inference. The isinstance guard makes this
# cell safe to re-run: once `model` has been swapped for the encoder it is left
# untouched instead of being unwrapped a second time.
if isinstance(model, TabformerPretrainModule):
    model = model.seq_encoder

# Inference

In [16]:
from ptls.data_load.iterable_processing_dataset import IterableProcessingDataset
from datetime import datetime
from ptls.data_load.padded_batch import PaddedBatch

In [17]:
class GetSplit(IterableProcessingDataset):
    """Expand every source record into one sample per month of ``year``.

    For each month in ``[start_month, end_month]`` a shallow copy of the record
    is yielded in which:

    * every key starting with ``'target'`` is replaced by element ``month - 1``
      of its original value, converted with ``.tolist()``;
    * every other feature except ``col_id`` is filtered with the boolean mask
      ``record[col_time] < start of the following month``;
    * ``'_month=<month>'`` is appended to the ``col_id`` value.

    Parameters
    ----------
    start_month, end_month : int
        First and last month (1-12, inclusive) to generate samples for.
    year : int
        Calendar year the months belong to.
    col_id : str
        Key of the record identifier.
    col_time : str
        Key of the event timestamps compared against the month boundary.

    Notes
    -----
    The month boundary is built from a naive ``datetime``, so ``timestamp()``
    interprets it in the machine's local time zone.
    NOTE(review): confirm ``col_time`` holds POSIX timestamps in the same convention.
    """

    def __init__(
        self,
        start_month,
        end_month,
        year=2022,
        col_id='client_id',
        col_time='event_time'
    ):
        super().__init__()
        self.start_month = start_month
        self.end_month = end_month
        self._year = year
        self._col_id = col_id
        self._col_time = col_time

    def _month_end_timestamp(self, month):
        """Return the timestamp of the first instant after ``month`` of ``self._year``."""
        if month == 12:
            # December rolls over into January of the next year.
            boundary = datetime(self._year + 1, 1, 1)
        else:
            boundary = datetime(self._year, month + 1, 1)
        return boundary.timestamp()

    def __iter__(self):
        for rec in self._src:
            # Records may arrive as (features, ...) tuples; only the features are used.
            source = rec[0] if type(rec) is tuple else rec

            for month in range(self.start_month, self.end_month + 1):
                # Shallow copy: each month's sample gets its own dict.
                features = source.copy()

                # Computed before the loop below replaces the time column itself.
                mask = features[self._col_time] < self._month_end_timestamp(month)

                # Only existing keys are reassigned, so iterating while writing is safe.
                for key, value in features.items():
                    if key.startswith('target'):
                        features[key] = value[month - 1].tolist()
                    elif key != self._col_id:
                        features[key] = value[mask]

                features[self._col_id] += '_month=' + str(month)

                yield features
                
def collate_feature_dict_with_target(
    batch,
    col_id='client_id',
    targets=False,
    target_names=('target_1', 'target_2', 'target_3', 'target_4'),
):
    """Collate feature dicts while carrying ids (and optionally targets) alongside.

    The id column, and the target columns when ``targets`` is true, are taken out
    of every sample before the remaining features go to ``collate_feature_dict``.
    The input dicts are left unmodified.

    Parameters
    ----------
    batch : list of dict
        Samples; each must contain ``col_id`` (and every name in
        ``target_names`` when ``targets`` is true).
    col_id : str
        Key of the record identifier.
    targets : bool
        Whether to extract and return the target values.
    target_names : sequence of str
        Keys of the target columns, in output order.

    Returns
    -------
    tuple
        ``(padded_batch, batch_ids)`` or, when ``targets`` is true,
        ``(padded_batch, batch_ids, target_cols)`` where ``target_cols`` holds one
        list of target values per sample.

    Raises
    ------
    KeyError
        If a sample lacks ``col_id`` or a requested target key.
    """
    excluded = {col_id}
    if targets:
        excluded.update(target_names)

    batch_ids = [sample[col_id] for sample in batch]
    target_cols = (
        [[sample[name] for name in target_names] for sample in batch]
        if targets else []
    )

    # Build filtered copies instead of deleting keys from the caller's dicts.
    features = [
        {key: value for key, value in sample.items() if key not in excluded}
        for sample in batch
    ]

    padded_batch = collate_feature_dict(features)
    if targets:
        return padded_batch, batch_ids, target_cols
    return padded_batch, batch_ids


class InferenceModuleMultimodal(pl.LightningModule):
    """Run ``model`` on collated batches and return ids, optional targets and outputs.

    Parameters
    ----------
    model
        Callable applied to the collated feature batch.
    pandas_output : bool
        When true, ``forward`` returns a ``pandas.DataFrame`` (see ``to_pandas``);
        otherwise the raw output dict.
    drop_seq_features : bool
        Stored for interface compatibility; not consulted anywhere in this class.
    model_out_name : str
        Key (and column prefix) under which the model output is reported.
    """

    def __init__(self, model, pandas_output=True, drop_seq_features=True, model_out_name='out'):
        super().__init__()

        self.model = model
        self.pandas_output = pandas_output
        self.drop_seq_features = drop_seq_features
        self.model_out_name = model_out_name

    def forward(self, x):
        """Apply the model to one batch.

        ``x`` is either ``(batch, batch_ids)`` or ``(batch, batch_ids, target_cols)``,
        where ``target_cols`` holds one list of target values per sample.
        """
        has_targets = len(x) == 3
        if has_targets:
            batch, batch_ids, target_cols = x
        else:
            batch, batch_ids = x

        out = self.model(batch)

        x_out = {'client_id': batch_ids}
        if has_targets:
            target_cols = torch.tensor(target_cols)
            # One output column per target position: target_1, target_2, ...
            for idx in range(target_cols.shape[1]):
                x_out[f'target_{idx + 1}'] = target_cols[:, idx]
        x_out[self.model_out_name] = out

        if self.pandas_output:
            return self.to_pandas(x_out)
        return x_out

    @staticmethod
    def to_pandas(x):
        """Convert an output dict into a single DataFrame.

        Lists and 1-D arrays become plain columns, 2-D arrays are expanded into
        ``<key>_0000``, ``<key>_0001``, ... columns, and anything of higher rank is
        replaced by a column of ``None``. Tensors are detached and moved to CPU first.
        """
        scalar_features = {}
        matrix_features = {}

        for k, v in x.items():
            if isinstance(v, torch.Tensor):
                v = v.detach().cpu().numpy()

            if isinstance(v, list) or len(v.shape) == 1:
                scalar_features[k] = v
            elif len(v.shape) == 2:
                # Keep the converted array: the raw value may be a tensor or an
                # ndarray, and only the former has `.cpu()`.
                matrix_features[k] = v
            else:
                scalar_features[k] = None

        dataframes = [pd.DataFrame(scalar_features)]
        for col, v in matrix_features.items():
            dataframes.append(
                pd.DataFrame(v, columns=[f'{col}_{i:04d}' for i in range(v.shape[1])])
            )

        return pd.concat(dataframes, axis=1)

In [18]:
# Inference-time processing for the training file: GetSplit turns every record
# into one sample per month (1-12); the id and target columns are kept.
train_process = IterableChain(
    ISeqLenLimit(max_seq_len=512),
    FeatureFilter(
        keep_feature_names=['client_id', 'target_1', 'target_2', 'target_3', 'target_4'],
    ),
    GetSplit(start_month=1, end_month=12),
    ToTorch(),
)

# Inference-time processing for the test file: no monthly split; the target
# names are listed for dropping and only the id is kept alongside the features.
test_process = IterableChain(
    ISeqLenLimit(max_seq_len=512),
    FeatureFilter(
        keep_feature_names=['client_id'],
        drop_feature_names=['target_1', 'target_2', 'target_3', 'target_4'],
    ),
    ToTorch(),
)

# NOTE: `train` is rebound here and now refers to the inference dataset, not the
# pre-training one created earlier in the notebook.
train = ParquetDataset([train_data_path], post_processing=train_process)
test = ParquetDataset([valid_data_path], post_processing=test_process)

In [19]:
# Inference loaders. The training loader also returns the targets (targets=True);
# the test loader returns only the collated batch and the ids.
inference_train_dl = DataLoader(
    dataset=train,
    batch_size=32,
    num_workers=16,
    shuffle=False,
    collate_fn=partial(collate_feature_dict_with_target, targets=True),
)

inference_test_dl = DataLoader(
    dataset=test,
    batch_size=32,
    num_workers=16,
    shuffle=False,
    collate_fn=collate_feature_dict_with_target,
)

In [20]:
# Wrap the encoder bound to `model` for inference; outputs are returned as
# DataFrames with the model output reported under the 'emb' name.
inf_module = InferenceModuleMultimodal(
        model=model,
        pandas_output=True,
        drop_seq_features=True,
        model_out_name='emb',
    )

In [None]:
# Fresh Trainer used for `trainer.predict` below; this rebinds the `trainer`
# name that was used for pre-training.
trainer = pl.Trainer(gpus=[0], max_epochs=-1)

In [None]:
# Predict over the test loader (one DataFrame per batch), stack the per-batch
# frames and write them to parquet for the downstream step.
inf_test_embeddings = pd.concat(
        trainer.predict(inf_module, inference_test_dl)
    )
inf_test_embeddings.to_parquet("geo_baseline_tabformer_test.parquet", index=False, engine="pyarrow", compression="snappy")

In [23]:
# Drop the in-memory test embeddings; they were written to parquet in the previous cell.
del inf_test_embeddings

In [None]:
# Predict over the training loader (one DataFrame per batch, including the target
# columns), stack the per-batch frames and write them to parquet.
inf_train_embeddings = pd.concat(
        trainer.predict(inf_module, inference_train_dl)
    )
inf_train_embeddings.to_parquet("geo_baseline_tabformer_train.parquet", index=False, engine="pyarrow", compression="snappy")

In [None]:
# Drop the in-memory training embeddings; they were written to parquet in the previous cell.
del inf_train_embeddings

# Downstream

In [None]:
class Downstream:
    """Fit one LightGBM classifier per target on train embeddings and score the test set.

    Parameters
    ----------
    train_path : str
        Parquet file containing ``col_id``, every column in ``targets`` and the
        feature columns.
    test_path : str
        Parquet file containing ``col_id`` and the feature columns.
    params : dict
        Keyword arguments forwarded to ``ltb.LGBMClassifier``.
    result_path : str
        Destination of the CSV written by ``run``.
    col_id : str
        Name of the identifier column.
    targets : sequence of str
        Names of the target columns; one classifier is fitted per name.
    """

    def __init__(
        self,
        train_path,
        test_path,
        params,
        result_path,
        col_id='client_id',
        targets=(
            'target_1',
            'target_2',
            'target_3',
            'target_4'
        )
    ):
        self.train_path = train_path
        self.test_path = test_path

        self.col_id = col_id
        self.all_targets = targets
        self.params = params
        self.result_path = result_path
        # Columns that must never be used as model features.
        self.drop_feat = list(self.all_targets) + [self.col_id]

    def fit(self):
        """Train a classifier per target and return them as ``{target_name: classifier}``."""
        train_embeddings = pd.read_parquet(self.train_path)
        X_train = train_embeddings.drop(columns=self.drop_feat)

        clfs = dict()
        for col_target in self.all_targets:
            clf = ltb.LGBMClassifier(**self.params)
            y_train = train_embeddings[col_target]
            clf.fit(X_train, y_train)
            print(f'Model fitted, target: {col_target}')
            clfs[col_target] = clf
        return clfs

    def get_scores(
        self,
        clfs
    ):
        """Score the test embeddings with the fitted classifiers.

        Test rows are de-duplicated on ``self.col_id`` before scoring. The result
        has the id column plus, for every target, the predicted probability of
        class 1.
        """
        # De-duplicate on the configured id column, not a hard-coded 'client_id'.
        test_embeddings_curr = pd.read_parquet(self.test_path).drop_duplicates(self.col_id)
        X_test = test_embeddings_curr.drop(columns=[self.col_id])

        # Built from the id Series so the frame keeps the de-duplicated row index.
        scores = pd.DataFrame({self.col_id: test_embeddings_curr[self.col_id]})

        for col_target in self.all_targets:
            scores[col_target] = clfs[col_target].predict_proba(X_test)[:, 1]

        return scores

    def run(self):
        """Fit, score, write the scores to ``self.result_path`` as CSV and return them."""
        clfs = self.fit()
        scores = self.get_scores(clfs)

        # NOTE(review): the DataFrame index is written as the first CSV column;
        # confirm the evaluation script expects that layout.
        scores.to_csv(self.result_path)

        return scores

In [None]:
# LightGBM settings shared by every per-target classifier.
params = dict(
    n_estimators=500,
    boosting_type="gbdt",
    objective="binary",
    subsample=0.5,
    subsample_freq=1,
    learning_rate=0.02,
    feature_fraction=0.75,
    max_depth=6,
    lambda_l1=1,
    lambda_l2=1,
    min_data_in_leaf=50,
    random_state=42,
    n_jobs=8,
)

# Train on the exported train embeddings, score the test embeddings and write the CSV.
dw = Downstream(
    train_path="geo_baseline_tabformer_train.parquet",
    test_path="geo_baseline_tabformer_test.parquet",
    params=params,
    result_path='baseline_tabformer_geo.csv',
)

scores = dw.run()
scores

In [None]:
# Run the evaluation script on the predictions CSV written above, passing the public
# and private reference parquet files and the two result-file paths.
! python ../scripts/evaluate.py --ref_df_public public_target.parquet --ref_df_private private_target.parquet --pred_df baseline_tabformer_geo.csv --public_result_path public_tabformer_geo_score.txt --private_result_path private_tabformer_geo_score.txt