In [15]:
import pandas as pd
import numpy as np
from ptls.data_load.iterable_processing_dataset import IterableProcessingDataset
from ptls.data_load import IterableChain
from datetime import datetime
from ptls.data_load.datasets.parquet_dataset import ParquetDataset, ParquetFiles
from ptls.data_load.iterable_processing.feature_filter import FeatureFilter
from ptls.data_load.iterable_processing.to_torch_tensor import ToTorch
import torch
from functools import partial
from torch.utils.data import DataLoader
from ptls.data_load.padded_batch import PaddedBatch
from ptls.data_load.utils import collate_feature_dict
from tqdm import tqdm

In [2]:
# Parquet files that the train / test datasets are built from further down.
# Note: `valid_data_path` points at the *test* parquet and is used to build `test`.
train_data_path = 'dial_train_prepr.parquet'
valid_data_path = 'dial_test_prepr.parquet'

# Last embedding

## Useful functions

In [3]:
class GetSplit(IterableProcessingDataset):
    """Expand every source record into one sample per month.

    For each month in ``[start_month, end_month]`` a shallow copy of the
    record is yielded in which:

    * every feature whose name starts with ``'target'`` is replaced by the
      element at position ``month - 1`` of the original value
      (converted with ``.tolist()``);
    * ``'embedding'`` (when present) is replaced by the last embedding whose
      ``col_time`` value is strictly before the first day of the following
      month, or by a zero vector of size ``embedding_dim`` if there is none;
    * ``col_id`` gets the suffix ``'_month=<month>'``.

    All other features (including ``col_time`` itself) are passed through
    unchanged, i.e. they are not truncated to the month boundary.

    Parameters
    ----------
    start_month, end_month : int
        Inclusive range of calendar months, each in ``1..12``.
    year : int
        Calendar year the months belong to.
    col_id : str
        Name of the id feature; its value must support ``+= str``.
    col_time : str
        Name of the event-time feature. It is compared with
        ``datetime.timestamp()`` values, so it is presumably a POSIX
        timestamp in seconds -- TODO confirm against the preprocessing.
    embedding_dim : int
        Size of the zero vector used when no embedding precedes the boundary.

    Raises
    ------
    ValueError
        If ``start_month`` or ``end_month`` is outside ``1..12``.
    """

    def __init__(
        self,
        start_month,
        end_month,
        year=2022,
        col_id='client_id',
        col_time='event_time',
        embedding_dim=768,
    ):
        super().__init__()
        for name, month in (('start_month', start_month), ('end_month', end_month)):
            # Month 0 would silently select target index -1, months > 12 would
            # only fail later inside datetime(); reject both up front.
            if not 1 <= month <= 12:
                raise ValueError(f'{name} must be in 1..12, got {month!r}')
        self.start_month = start_month
        self.end_month = end_month
        self._year = year
        self._col_id = col_id
        self._col_time = col_time
        self._embedding_dim = embedding_dim

    def _month_end_timestamp(self, month):
        """Return the timestamp of the first day of the month after ``month``."""
        if month == 12:
            boundary = datetime(self._year + 1, 1, 1)
        else:
            boundary = datetime(self._year, month + 1, 1)
        # NOTE(review): a naive datetime is interpreted in the machine's local
        # timezone by .timestamp(); confirm this matches how col_time was built.
        return boundary.timestamp()

    def __iter__(self):
        months = range(self.start_month, self.end_month + 1)
        # Boundaries depend only on the configuration: compute them once.
        boundaries = {month: self._month_end_timestamp(month) for month in months}

        for rec in self._src:
            record = rec[0] if type(rec) is tuple else rec
            target_keys = [key for key in record if key.startswith('target')]

            # Convert the embedding sequence once per record instead of once
            # per month; boolean indexing below always returns a new tensor.
            embeddings = None
            if months and 'embedding' in record:
                embeddings = torch.tensor(record['embedding'].tolist())

            for month in months:
                features = record.copy()
                mask = features[self._col_time] < boundaries[month]

                for key in target_keys:
                    features[key] = record[key][month - 1].tolist()

                if embeddings is not None:
                    visible = embeddings[mask]
                    if len(visible) > 0:
                        features['embedding'] = visible[-1]
                    else:
                        features['embedding'] = torch.zeros(self._embedding_dim)

                features[self._col_id] += '_month=' + str(month)

                yield features

def collate_feature_dict_with_target(
    batch,
    col_id='client_id',
    targets=False,
    target_names=('target_1', 'target_2', 'target_3', 'target_4'),
):
    """Collate samples while carrying ids (and optionally targets) alongside.

    The id column, and the target columns when ``targets`` is true, are taken
    out of every sample before the remaining features are passed to
    ``collate_feature_dict``. The input samples are left untouched: filtered
    copies are collated instead of deleting keys from the caller's dicts.

    Parameters
    ----------
    batch : list of dict
        Samples, each a feature dict containing ``col_id`` (and every name in
        ``target_names`` when ``targets`` is true).
    col_id : str
        Name of the id feature.
    targets : bool
        Whether to extract the target columns as well.
    target_names : sequence of str
        Names of the target features, in the order they should be returned.

    Returns
    -------
    tuple
        ``(padded_batch, batch_ids)`` or, when ``targets`` is true,
        ``(padded_batch, batch_ids, target_cols)`` where ``target_cols`` is a
        list with one ``[target values...]`` list per sample.

    Raises
    ------
    KeyError
        If a sample lacks ``col_id`` or a requested target feature.
    """
    excluded = {col_id}
    if targets:
        excluded.update(target_names)

    batch_ids = [sample[col_id] for sample in batch]
    target_cols = []
    if targets:
        target_cols = [[sample[name] for name in target_names] for sample in batch]

    features_only = [
        {key: value for key, value in sample.items() if key not in excluded}
        for sample in batch
    ]
    padded_batch = collate_feature_dict(features_only)

    if targets:
        return padded_batch, batch_ids, target_cols
    return padded_batch, batch_ids

def to_pandas(x):
    """Convert a dict of per-sample features into one ``pandas.DataFrame``.

    Values are handled by dimensionality:

    * lists and 1-D arrays/tensors become a single column named after the key;
    * 2-D arrays/tensors are expanded into one column per second-axis index,
      named ``'<key>_0000'``, ``'<key>_0001'``, ...;
    * anything with another number of dimensions becomes a column of ``None``.

    Torch tensors are detached and moved to CPU before conversion; numpy
    arrays are used as they are (both for 1-D and 2-D values).

    Parameters
    ----------
    x : dict
        Mapping from feature name to a list, numpy array or torch tensor.
        All values are expected to have the same first-axis length.

    Returns
    -------
    pandas.DataFrame
        Scalar columns first, followed by the expanded 2-D features.
    """
    scalar_features = {}
    expanded_features = {}

    for name, value in x.items():
        if isinstance(value, torch.Tensor):
            # detach() so tensors that require grad can be converted as well.
            value = value.detach().cpu().numpy()

        if isinstance(value, list) or len(value.shape) == 1:
            scalar_features[name] = value
        elif len(value.shape) == 2:
            # Keep the already-converted array: the value may have been a
            # numpy array in the first place, which has no .cpu().
            expanded_features[name] = value
        else:
            scalar_features[name] = None

    dataframes = [pd.DataFrame(scalar_features)]
    for name, matrix in expanded_features.items():
        columns = [f'{name}_{i:04d}' for i in range(matrix.shape[1])]
        dataframes.append(pd.DataFrame(matrix, columns=columns))

    return pd.concat(dataframes, axis=1)

def _targets_as_matrix(target_rows):
    """Stack per-sample target lists into an array whose first axis is the batch."""
    matrix = np.asarray(target_rows)
    # Only singleton axes *after* the (batch, target) axes are dropped. A blanket
    # np.squeeze would also drop a batch axis of length 1 and break `[:, i]`.
    extra_axes = tuple(ax for ax in range(2, matrix.ndim) if matrix.shape[ax] == 1)
    return np.squeeze(matrix, axis=extra_axes) if extra_axes else matrix


def get_dataset(dl, target=True):
    """Run through a data loader and collect its batches into one DataFrame.

    Each batch is expected to be ``(padded_batch, batch_ids)`` or, when
    ``target`` is true, ``(padded_batch, batch_ids, target_cols)`` with
    ``target_cols`` holding four target values per sample. The
    ``'embedding'`` entry of ``padded_batch.payload`` is stored under
    ``'embs'``; targets are stored as ``'target_1'`` .. ``'target_4'``.

    Parameters
    ----------
    dl : iterable
        Data loader yielding batches in the layout described above.
    target : bool
        Whether batches carry target columns.

    Returns
    -------
    pandas.DataFrame
        Per-batch frames produced by ``to_pandas`` concatenated row-wise
        (the per-batch index is kept, so index values repeat across batches).
    """
    frames = []
    for batch in tqdm(dl):
        x_out = {'client_id': batch[1]}
        if target:
            target_cols = _targets_as_matrix(batch[2])
            for i in range(1, 5):
                x_out[f'target_{i}'] = target_cols[:, i - 1]
        x_out['embs'] = batch[0].payload['embedding']
        frames.append(to_pandas(x_out))
    return pd.concat(frames, axis=0)

In [4]:
class ToTorchTmpLast(IterableProcessingDataset):
    """Replace the ``'embedding'`` sequence of every record by its last element.

    Each source record is shallow-copied. When it has an ``'embedding'``
    feature, that value is converted to a tensor via ``.tolist()`` and reduced
    to its last row; an empty sequence is replaced by a zero vector of size
    ``embedding_dim``. Records without ``'embedding'`` pass through unchanged.

    Parameters
    ----------
    embedding_dim : int
        Size of the zero vector used for records with no embeddings.
    """

    def __init__(
        self,
        embedding_dim=768,
    ):
        super().__init__()
        self._embedding_dim = embedding_dim

    def __iter__(self):
        for rec in self._src:
            features = rec[0] if type(rec) is tuple else rec
            features = features.copy()

            if 'embedding' in features:
                embeddings = torch.tensor(features['embedding'].tolist())
                # Index -1 covers both the single-row and the multi-row case.
                if len(embeddings) > 0:
                    features['embedding'] = embeddings[-1]
                else:
                    features['embedding'] = torch.zeros(self._embedding_dim)

            yield features

In [None]:
# Train post-processing chain: FeatureFilter is told to keep the id and the four
# target columns, GetSplit expands each record into one sample per month for
# months 1..12 (default year), and ToTorch runs last.
train_process = IterableChain(
            FeatureFilter(keep_feature_names=['client_id', 'target_1', 'target_2', 'target_3', 'target_4']),
            GetSplit(start_month=1, end_month=12),
            ToTorch(),
)

# Test post-processing chain: FeatureFilter keeps the id and drops the four
# target columns, ToTorchTmpLast reduces 'embedding' to its last element
# (no monthly split here), and ToTorch runs last.
test_process = IterableChain(
            FeatureFilter(keep_feature_names=['client_id'], drop_feature_names=['target_1', 'target_2', 'target_3', 'target_4']),
            ToTorchTmpLast(),
            ToTorch()
)


# Each dataset reads a single parquet file and applies its chain as post-processing.
train = ParquetDataset([train_data_path], post_processing=train_process)
test = ParquetDataset([valid_data_path], post_processing=test_process)

In [7]:
# Train loader: the collate function is bound with targets=True, so each batch
# is (padded_batch, batch_ids, target_cols). Loaded in the main process.
inference_train_dl = DataLoader(
        dataset=train,
        collate_fn=partial(collate_feature_dict_with_target, targets=True),
        shuffle=False,
        num_workers=0,
        batch_size=512,
    )

# Test loader: default targets=False, so each batch is (padded_batch, batch_ids).
# NOTE(review): `test` is built from a single parquet file but 8 workers are
# requested here (train uses 0) -- confirm how ParquetDataset distributes one
# file across workers.
inference_test_dl = DataLoader(
        dataset=test,
        collate_fn=collate_feature_dict_with_target,
        shuffle=False,
        num_workers=8,
        batch_size=512,
    )

In [None]:
# Materialise the train loader (with targets) into a DataFrame and write it to
# a snappy-compressed parquet file without the index.
train_dataset = get_dataset(inference_train_dl)
train_dataset.to_parquet("dial_last_baseline_train.parquet", index=False, engine="pyarrow", compression="snappy")
# Drop the reference once saved (presumably to release memory before the next cell).
del train_dataset

In [None]:
# Same export for the test loader; target=False because its batches carry no targets.
test_dataset = get_dataset(inference_test_dl, target=False)
test_dataset.to_parquet("dial_last_baseline_test.parquet", index=False, engine="pyarrow", compression="snappy")
# Drop the reference once saved (presumably to release memory).
del test_dataset