In [28]:
import pandas as pd
import numpy as np
import random
from ptls.nn import TrxEncoder, RnnSeqEncoder
from ptls.frames.coles import CoLESModule
from tqdm.auto import tqdm, trange
from copy import deepcopy
from ptls.data_load.datasets import MemoryMapDataset
from ptls.data_load.iterable_processing import SeqLenFilter
from ptls.frames.coles import ColesDataset
from catboost import CatBoostClassifier, Pool, cv
from ptls.preprocessing import PandasDataPreprocessor
from functools import partial
import pytorch_lightning as pl
from ptls.data_load.datasets import inference_data_loader
from ptls.frames.coles.split_strategy import SampleSlices
from ptls.frames import PtlsDataModule

In [29]:
import torch
import pytorch_lightning as pl
from functools import partial
from ptls.nn import TrxEncoder, RnnSeqEncoder
from ptls.frames.coles import CoLESModule
from ptls.frames.coles.losses import SoftmaxLoss
from ptls.data_load.datasets import MemoryMapDataset
from ptls.data_load.iterable_processing import SeqLenFilter
from ptls.frames.coles import ColesDataset
from ptls.frames.coles.split_strategy import SampleSlices
from ptls.frames import PtlsDataModule
from ptls.frames.inference_module import InferenceModule
from ptls.data_load.utils import collate_feature_dict

In [30]:
data = pd.read_csv('dodohack/Data Secrets First Cup/orders.csv',parse_dates=['SaleDate'])

In [31]:
def mounth_count_day(x):
    months = [31,28,31,30,31,30,31,31,30,31,30,31]
    return sum(months[:x])

def create_time_features(df,time_col):
    df['month'] = df[time_col].dt.month
    df['day'] = df[time_col].dt.day
    df['hour'] = df[time_col].dt.hour
    df['year'] = df[time_col].dt.year
    df['weekofyear'] = df[time_col].apply(lambda x: x.weekofyear)
    df['dayofweek'] = df[time_col].dt.dayofweek
    df['dayofyear'] = df[time_col].dt.dayofyear
    df['absolute_time'] = (df[time_col] - df[time_col].min()).dt.days
    
    df['all_day_time'] = df[time_col].apply(lambda x: x.hour * 3600 + x.minute * 60 + x.second)
    df['all_week_time'] = df[time_col].apply(lambda x: x.dayofweek * 24 + x.hour)
    
    df['cl_early_morning'] = ((df['hour'] > 4) & (df['hour'] <= 8)).astype('int16')
    df['cl_is_weekend'] = (df['dayofweek'] > 4).astype('int16')
    return df

In [32]:
data = create_time_features(data,'SaleDate')

In [33]:
data.columns

Index(['OrderUUId', 'addressId', 'deliverySectorId', 'ClientUUId', 'Date',
       'SaleDate', 'UnitUUId', 'NewClient', 'ClientOrderNumber', 'ProductUUId',
       'CategoryId', 'ProductTotalPrice', 'MenuPrice', 'OrderState',
       'OrderPaymentType', 'OrderTotalPrice', 'OrderType', 'apply_promo',
       'month', 'day', 'hour', 'year', 'weekofyear', 'dayofweek', 'dayofyear',
       'absolute_time', 'all_day_time', 'all_week_time', 'cl_early_morning',
       'cl_is_weekend'],
      dtype='object')

In [34]:
data['CategoryId'].nunique()

7

In [35]:
preprocessor = PandasDataPreprocessor(
    col_id='ClientUUId',
    col_event_time='SaleDate',
    event_time_transformation='dt_to_timestamp',
    cols_category=['addressId','deliverySectorId','UnitUUId',
                   'ProductUUId','CategoryId',
                   'OrderType','OrderState','OrderPaymentType'],
    
    cols_numerical=['month','hour','dayofweek','absolute_time',
                    'NewClient','ClientOrderNumber','ProductTotalPrice',
                    'MenuPrice','OrderTotalPrice','apply_promo'],
)

In [36]:
data['SaleDate'] = data['SaleDate'].map(lambda x:x.strftime('%Y-%m-%d %X'))
data_proc = preprocessor.fit_transform(data)

In [45]:
data['OrderPaymentType'].nunique()

3

In [46]:
trx_encoder = TrxEncoder(
    embeddings_noise=0.003,
    numeric_values={
        'month': 'identity',
        'hour':'identity',
        'dayofweek':'identity',
        'absolute_time':'identity',
        'NewClient':'identity',
        'ClientOrderNumber':'identity',
        'ProductTotalPrice':'identity',
        'MenuPrice':'identity',
        'OrderTotalPrice':'identity',
        'apply_promo':'identity',
        
    },
    embeddings={
        'addressId': {'in': 2706, 'out': 384},
        'deliverySectorId': {'in': 1259, 'out': 256},
        'UnitUUId': {'in': 784, 'out': 256},
        'ProductUUId': {'in': 1259, 'out': 128},
        'CategoryId': {'in': 7, 'out': 4},
        'OrderType': {'in': 3, 'out': 2},
        'OrderState': {'in': 2, 'out': 2},
        'OrderPaymentType': {'in': 3, 'out': 2},
    }
)

# Sequence Encoder
seq_encoder = RnnSeqEncoder(
    trx_encoder=trx_encoder,
    hidden_size=128,  # Dimension of the generated embeddings
    type='gru',
)

# CoLES Module
coles_module = CoLESModule(
    seq_encoder=seq_encoder,
    optimizer_partial=partial(torch.optim.AdamW, lr=0.001),
    lr_scheduler_partial=partial(torch.optim.lr_scheduler.StepLR, step_size=5, gamma=0.9)
)

In [47]:
train_dl = PtlsDataModule(
    train_data=ColesDataset(
        MemoryMapDataset(
            data=data_proc,
            i_filters=[
                SeqLenFilter(min_seq_len=5),
            ],
        ),
        splitter=SampleSlices(
            split_count=5,
            cnt_min=5,
            cnt_max=200,
        ),
    ),
    train_num_workers=8,
    train_batch_size=128,
)

In [48]:
trainer = pl.Trainer(max_epochs=12, devices=1)

trainer.fit(coles_module, train_dl)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/usr/local/lib/python3.11/dist-packages/pytorch_lightning/trainer/configuration_validator.py:70: You defined a `validation_step` but have no `val_dataloader`. Skipping val loop.
You are using a CUDA device ('NVIDIA RTX A6000') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
2024-07-20 01:54:57.354084: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-20 01:54:57.354143: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to

Training: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=12` reached.


In [49]:
inference_module = InferenceModule(
        model=seq_encoder,
        pandas_output=True,
        drop_seq_features=True,
        model_out_name=f'emb'
)

In [50]:
inference_dataset_train = MemoryMapDataset(
    data=data_proc,
)

inference_dl_train = torch.utils.data.DataLoader(
    dataset=inference_dataset_train,
    collate_fn=collate_feature_dict,
    shuffle=False,
    batch_size=128,
    num_workers=8,
)

In [51]:
predict = pl.Trainer(devices=1).predict(inference_module, inference_dl_train)
all_feats = pd.concat(predict, axis=0).set_index('ClientUUId')

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: |          | 0/? [00:00<?, ?it/s]

In [52]:
all_feats.to_parquet('order_embeds.parquet')

In [53]:
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split, TimeSeriesSplit, StratifiedKFold, StratifiedGroupKFold
from sklearn.metrics import roc_auc_score
from sklearn.base import BaseEstimator

In [54]:
all_feats = all_feats.T.drop_duplicates().T
all_feats['ClientUUId'] = all_feats.index
all_feats.index = range(len(all_feats))

In [55]:
train_data = pd.read_csv('dodohack/Data Secrets First Cup/train_target.csv',parse_dates=['LocalBeginDate','LocalEndDate'])
train_data = train_data.merge(all_feats)

In [56]:
params = {
    'iterations':500,
    'learning_rate':0.01,
    'loss_function':'CrossEntropy',
    'max_depth':4,
    'eval_metric':'AUC',
    'task_type':'GPU',
    'random_seed':56
}

drop_cols = [
    'LocalBeginDate',
    'LocalEndDate',
    'ClientUUId',
]

cat_cols = [
    'Id',
    'OrderType',
]

label_col = 'apply_promo'
num_fold = 5
n_fold_test = 10
test_c_stop = 1
num_repits = 1

In [57]:
class CatBoostKfoldWraper(BaseEstimator):
    def __init__(self,num_folds,num_repits,params,random_state=56):
        self.models = []
        self.params = params
        self.random_state = random_state
        self.num_folds = num_folds
        
    def fit(self,train_data,cat_features=None,drop_cols=None,label_col=None,verbose=False):
        scores = []
        
        for i in trange(num_repits):
            kfold = StratifiedGroupKFold(self.num_folds,random_state=self.random_state+i,shuffle=True)
            for train_index, test_index in (kfold.split(train_data,train_data[label_col],groups=train_data['ClientUUId'])):
                train_df = train_data.iloc[train_index]
                test_df = train_data.iloc[test_index]

                train_pool = Pool(
                    train_df.drop([label_col]+drop_cols,axis=1),
                    label = train_df[label_col],
                    cat_features = cat_features
                )

                eval_pool = Pool(
                    test_df.drop([label_col]+drop_cols,axis=1),
                    label = test_df[label_col],
                    cat_features = cat_features
                )

                cbm = CatBoostClassifier(**self.params)
                cbm.fit(train_pool,eval_set=eval_pool,verbose=verbose)

                score = roc_auc_score(test_df[label_col],cbm.predict_proba(eval_pool)[:,1])
                scores += [score]
                self.models += [cbm]
        #print(f"Total Score {np.mean(scores)}")
            
    def predict(self,test_data,drop_cols=None,cat_features=None):
        test_pool = Pool(
            test_data.drop(drop_cols,axis=1),
            cat_features=cat_features
        )
        preds = np.mean([model.predict_proba(test_pool)[:,1] for model in self.models],axis=0)
        return preds
    
    def get_feature_importance(self):
        imp_0 = self.models[0].get_feature_importance(prettified=True).set_index('Feature Id')
        for i in range(1,len(self.models)):
            imp_0 += self.models[i].get_feature_importance(prettified=True).set_index('Feature Id')
        return (imp_0 / len(self.models)).sort_values(by='Importances')[::-1]

In [58]:
def model_builder(train_data):
    model = CatBoostKfoldWraper(num_fold,num_repits,params)
    
    model.fit(
        train_data,
        cat_features=cat_cols,
        drop_cols=drop_cols,
        label_col=label_col,
        verbose=500
    )
    
    return model

def model_predicter(model,test_data):
    return model.predict(test_data,drop_cols=drop_cols,cat_features=cat_cols)

In [59]:
class TestKFoldWrapper():
    def __init__(self,num_folds=10,top_c=1,random_state=56):
        self.top_c = top_c
        self.kfold = StratifiedGroupKFold(num_folds,random_state=random_state,shuffle=True)
    
    def run_experiments(self,model_builder,model_predicter,train_data,label_col=None):
        self.models = []
        self.scores = []
        c = 0
        
        for train_index, test_index in tqdm(self.kfold.split(train_data,train_data[label_col],groups=train_data['ClientUUId'])):
            train_df = train_data.iloc[train_index]
            test_df = train_data.iloc[test_index]
            
            model = model_builder(train_df)
            score = roc_auc_score(test_df[label_col],model_predicter(model,test_df))
            self.scores.append(score)
            self.models.append(model)
            c += 1
            if c >= self.top_c:
                break
        
        print(f"Total Score {np.mean(self.scores)}")

In [60]:
evaluator = TestKFoldWrapper(n_fold_test,test_c_stop)

evaluator.run_experiments(
    model_builder=model_builder,
    model_predicter=model_predicter,
    train_data=train_data,
    label_col=label_col
)

0it [00:00, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5399282	best: 0.5399282 (0)	total: 17.4ms	remaining: 8.67s
499:	test: 0.6548592	best: 0.6560553 (470)	total: 8s	remaining: 0us
bestTest = 0.6560552716
bestIteration = 470
Shrink model to first 471 iterations.


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5492847	best: 0.5492847 (0)	total: 12.3ms	remaining: 6.12s
499:	test: 0.6151896	best: 0.6166139 (440)	total: 7.88s	remaining: 0us
bestTest = 0.6166139245
bestIteration = 440
Shrink model to first 441 iterations.


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5372133	best: 0.5372133 (0)	total: 12.3ms	remaining: 6.12s
499:	test: 0.5678112	best: 0.5723202 (85)	total: 7.67s	remaining: 0us
bestTest = 0.5723201632
bestIteration = 85
Shrink model to first 86 iterations.


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5401148	best: 0.5401148 (0)	total: 13ms	remaining: 6.47s
499:	test: 0.5730336	best: 0.5887601 (55)	total: 8.27s	remaining: 0us
bestTest = 0.588760078
bestIteration = 55
Shrink model to first 56 iterations.


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5536158	best: 0.5536158 (0)	total: 13.3ms	remaining: 6.62s
499:	test: 0.5577962	best: 0.5894220 (40)	total: 7.98s	remaining: 0us
bestTest = 0.5894219875
bestIteration = 40
Shrink model to first 41 iterations.
Total Score 0.6415881809787627
