In [66]:
import pandas as pd
import numpy as np
import random
from ptls.nn import TrxEncoder, RnnSeqEncoder
from ptls.frames.coles import CoLESModule
from tqdm.auto import tqdm, trange
from copy import deepcopy
from ptls.data_load.datasets import MemoryMapDataset
from ptls.data_load.iterable_processing import SeqLenFilter
from ptls.frames.coles import ColesDataset
from catboost import CatBoostClassifier, Pool, cv
from ptls.preprocessing import PandasDataPreprocessor
from functools import partial
import pytorch_lightning as pl
from ptls.data_load.datasets import inference_data_loader
from ptls.frames.coles.split_strategy import SampleSlices
from ptls.frames import PtlsDataModule

In [67]:
import torch
import pytorch_lightning as pl
from functools import partial
from ptls.nn import TrxEncoder, RnnSeqEncoder
from ptls.frames.coles import CoLESModule
from ptls.frames.coles.losses import SoftmaxLoss
from ptls.data_load.datasets import MemoryMapDataset
from ptls.data_load.iterable_processing import SeqLenFilter
from ptls.frames.coles import ColesDataset
from ptls.frames.coles.split_strategy import SampleSlices
from ptls.frames import PtlsDataModule
from ptls.frames.inference_module import InferenceModule
from ptls.data_load.utils import collate_feature_dict

In [68]:
# Raw mobile event log; the 'Timestamp' column is parsed as dates while reading.
data = pd.read_csv(
    './dodohack/Data Secrets First Cup/mobile_events.csv',
    parse_dates=['Timestamp'],
)

In [69]:
def mounth_count_day(x):
    """Return the total number of days in the first ``x`` months of the year.

    Month lengths are those of a non-leap year (February is fixed at 28 days).
    ``x`` is used as a slice bound on the 12-entry month table, so ``0`` gives
    ``0`` and any value of 12 or more gives 365.
    """
    days_per_month = (31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31)
    elapsed_days = 0
    for month_length in days_per_month[:x]:
        elapsed_days += month_length
    return elapsed_days

def create_time_features(df,time_col):
    """Add calendar and time-of-day feature columns derived from ``df[time_col]``.

    The new columns are written onto ``df`` itself (the frame is modified in
    place) and the same object is returned for convenience.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the event rows.
    time_col : str
        Name of a datetime column in ``df`` (the ``.dt`` accessor is used on it).

    Returns
    -------
    pandas.DataFrame
        ``df`` with these columns added: ``month``, ``day``, ``hour``, ``year``,
        ``weekofyear`` (ISO week), ``dayofweek``, ``dayofyear``,
        ``absolute_time`` (whole days since the earliest value in the column),
        ``all_day_time`` (seconds since midnight), ``all_week_time`` (hours
        since Monday 00:00), ``cl_early_morning`` (1 when 4 < hour <= 8) and
        ``cl_is_weekend`` (1 when dayofweek > 4).
    """
    ts = df[time_col]
    parts = ts.dt

    df['month'] = parts.month
    df['day'] = parts.day
    df['hour'] = parts.hour
    df['year'] = parts.year

    # Vectorised ISO week instead of a per-row Timestamp.weekofyear lookup.
    # isocalendar() yields a nullable UInt32 column; convert to a plain numpy
    # dtype (float only when missing timestamps force NaN) so the column has
    # the same dtype the row-wise version produced.
    iso_week = parts.isocalendar().week
    df['weekofyear'] = iso_week.astype('float64' if iso_week.isna().any() else 'int64')

    df['dayofweek'] = parts.dayofweek
    df['dayofyear'] = parts.dayofyear
    df['absolute_time'] = (ts - ts.min()).dt.days

    # Same arithmetic as before, computed column-wise rather than row by row.
    df['all_day_time'] = parts.hour * 3600 + parts.minute * 60 + parts.second
    df['all_week_time'] = parts.dayofweek * 24 + parts.hour

    df['cl_early_morning'] = ((df['hour'] > 4) & (df['hour'] <= 8)).astype('int16')
    df['cl_is_weekend'] = (df['dayofweek'] > 4).astype('int16')
    return df

In [70]:
# Derive the calendar / time-of-day columns from the event timestamp.
data = create_time_features(df=data, time_col='Timestamp')

In [72]:
# Column roles handed to the ptls preprocessor.
categorical_event_cols = ['EventName', 'Platform']
numerical_event_cols = ['month', 'hour', 'dayofweek', 'absolute_time']

preprocessor = PandasDataPreprocessor(
    col_id='ClientUUId',
    col_event_time='Timestamp',
    event_time_transformation='dt_to_timestamp',
    cols_category=categorical_event_cols,
    cols_numerical=numerical_event_cols,
)

In [73]:
# Render the event time as 'YYYY-MM-DD HH:MM:SS' text before preprocessing.
# - pd.to_datetime() makes the cell safe to re-run: on a second execution the
#   column already holds strings, which are parsed back instead of failing on
#   a missing .strftime attribute.
# - The explicit %H:%M:%S replaces %X, whose output depends on the process locale.
# - .dt.strftime formats the whole column at once instead of one lambda call per row.
data['Timestamp'] = pd.to_datetime(data['Timestamp']).dt.strftime('%Y-%m-%d %H:%M:%S')
data_proc = preprocessor.fit_transform(data)

In [75]:
# Number of distinct, non-missing Platform values.
len(data['Platform'].dropna().unique())

2

In [76]:
# Seed python / numpy / torch before any weights are created so the encoder
# initialisation (and, when cells are run top to bottom, the training that
# follows) is repeatable. 56 matches the seed used for the CatBoost stage.
pl.seed_everything(56, workers=True)

# Embedding table sizes are taken from the fitted preprocessor instead of being
# typed in by hand. The previous 'Platform': {'in': 2} left only two slots for
# two distinct platforms.
# NOTE(review): assumes ptls encodes categories from 1 upward with 0 reserved
# for padding, and that TrxEncoder clips out-of-range ids by default; under
# those assumptions both platforms shared one embedding row. Confirm against
# the installed ptls version.
category_sizes = preprocessor.get_category_dictionary_sizes()

trx_encoder = TrxEncoder(
    embeddings_noise=0.003,
    numeric_values={
        'month': 'identity',
        'hour':'identity',
        'dayofweek':'identity',
        'absolute_time':'identity'
    },
    embeddings={
        'EventName': {'in': category_sizes['EventName'], 'out': 8},
        'Platform': {'in': category_sizes['Platform'], 'out': 2}
    }
)

# Sequence Encoder
seq_encoder = RnnSeqEncoder(
    trx_encoder=trx_encoder,
    hidden_size=32,  # Dimension of the generated embeddings
    type='gru',
)

# CoLES Module
coles_module = CoLESModule(
    seq_encoder=seq_encoder,
    optimizer_partial=partial(torch.optim.AdamW, lr=0.001),
    lr_scheduler_partial=partial(torch.optim.lr_scheduler.StepLR, step_size=5, gamma=0.9)
)

In [78]:
# Source records for CoLES training, passed through a SeqLenFilter with
# min_seq_len=5 (presumably drops clients with shorter histories — verify).
coles_source = MemoryMapDataset(
    data=data_proc,
    i_filters=[SeqLenFilter(min_seq_len=5)],
)

# Sub-sequence sampler configuration used by ColesDataset.
slice_sampler = SampleSlices(split_count=5, cnt_min=5, cnt_max=200)

train_dl = PtlsDataModule(
    train_data=ColesDataset(coles_source, splitter=slice_sampler),
    train_num_workers=4,
    train_batch_size=128,
)

In [80]:
# Single-device trainer with a fixed budget of 5 epochs.
trainer = pl.Trainer(devices=1, max_epochs=5)
trainer.fit(model=coles_module, datamodule=train_dl)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/usr/local/lib/python3.11/dist-packages/pytorch_lightning/trainer/configuration_validator.py:70: You defined a `validation_step` but have no `val_dataloader`. Skipping val loop.
You are using a CUDA device ('NVIDIA RTX A6000') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
Missing logger folder: /notebooks/lightning_logs
2024-07-20 01:10:49.820963: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-20 01:10:49.821027: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607

Training: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=5` reached.


In [81]:
# Wrap the trained sequence encoder for prediction; results come back as
# pandas objects (pandas_output=True) with output columns named from 'emb'.
inference_module = InferenceModule(
    model=seq_encoder,
    model_out_name='emb',
    pandas_output=True,
    drop_seq_features=True,
)

In [82]:
# Inference runs over every preprocessed record (no length filter here).
inference_dataset_train = MemoryMapDataset(data=data_proc)

# Unshuffled loader for inference.
inference_dl_train = torch.utils.data.DataLoader(
    inference_dataset_train,
    batch_size=128,
    shuffle=False,
    num_workers=8,
    collate_fn=collate_feature_dict,
)

In [88]:
# Run the encoder over all clients; `predict` is a list of per-batch frames.
embedding_trainer = pl.Trainer(devices=1)
predict = embedding_trainer.predict(inference_module, inference_dl_train)

# Stack the batches row-wise and key the result by client id.
all_feats = pd.concat(predict).set_index('ClientUUId')

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: |          | 0/? [00:00<?, ?it/s]

In [86]:
# Persist the client embeddings to a parquet file.
all_feats.to_parquet(path='mobile_embeds.parquet')

In [106]:
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split, TimeSeriesSplit, StratifiedKFold, StratifiedGroupKFold
from sklearn.metrics import roc_auc_score
from sklearn.base import BaseEstimator

In [94]:
# Re-run guard: after this cell has executed once, 'ClientUUId' is a regular
# column and the index is 0..n-1. Running the original statements again would
# overwrite 'ClientUUId' with those integers. Restoring the id as the index
# first makes the cell produce the same result on every execution.
if 'ClientUUId' in all_feats.columns:
    all_feats = all_feats.set_index('ClientUUId')

# Drop columns whose values exactly duplicate an earlier column.
all_feats = all_feats.T.drop_duplicates().T

# Expose the client id as a column (needed as the merge key with the target
# table) and switch to a plain positional index.
all_feats['ClientUUId'] = all_feats.index
all_feats.index = range(len(all_feats))

In [95]:
# Target table; both date columns are parsed while reading.
train_data = pd.read_csv(
    'dodohack/Data Secrets First Cup/train_target.csv',
    parse_dates=['LocalBeginDate','LocalEndDate'],
)

# Attach client embeddings. The join key is spelled out so the merge cannot
# silently switch to a multi-column key if the two frames ever share another
# column name. how='inner' keeps the previous default: target rows whose
# client has no embedding are dropped.
train_data = train_data.merge(all_feats, on='ClientUUId', how='inner')

In [108]:
# Keyword arguments forwarded to CatBoostClassifier for every fold model.
params = dict(
    iterations=500,
    learning_rate=0.01,
    loss_function='CrossEntropy',
    max_depth=4,
    eval_metric='AUC',
    task_type='CPU',
    random_seed=56,
)

# Columns removed from the feature matrix before building CatBoost pools.
drop_cols = ['LocalBeginDate', 'LocalEndDate', 'ClientUUId']

# Columns passed to CatBoost as categorical features.
cat_cols = ['Id', 'OrderType']

# Target column.
label_col = 'apply_promo'

# num_fold: folds inside CatBoostKfoldWraper; n_fold_test: folds of the outer
# TestKFoldWrapper split.
num_fold, n_fold_test = 5, 10
# Number of outer folds evaluated before TestKFoldWrapper stops.
test_c_stop = 1
# Number of inner K-fold repetitions.
num_repits = 1

In [109]:
class CatBoostKfoldWraper(BaseEstimator):
    """Average of CatBoost classifiers trained on repeated stratified group K-folds.

    Each repetition builds a ``StratifiedGroupKFold`` split (grouped by the
    ``'ClientUUId'`` column of the training frame), fits one
    ``CatBoostClassifier`` per fold with the held-out fold as ``eval_set``, and
    keeps every fitted model. ``predict`` averages the positive-class
    probabilities of all kept models.

    Parameters
    ----------
    num_folds : int
        Number of folds per repetition.
    num_repits : int
        Number of repetitions; repetition ``i`` uses ``random_state + i``.
    params : dict
        Keyword arguments forwarded to ``CatBoostClassifier``.
    random_state : int, default 56
        Base seed for the fold splitter.

    Attributes
    ----------
    models : list
        Fitted fold models, reset by each ``fit`` call.
    scores : list
        ROC AUC of each fold model on its own eval fold.
    label_col : str or None
        Label column seen by the last ``fit`` call.
    """
    def __init__(self,num_folds,num_repits,params,random_state=56):
        self.models = []
        self.scores = []
        self.params = params
        self.random_state = random_state
        self.num_folds = num_folds
        # Stored so fit() no longer depends on a notebook-level global, and so
        # BaseEstimator.get_params()/repr can find every constructor argument.
        self.num_repits = num_repits
        self.label_col = None

    def fit(self,train_data,cat_features=None,drop_cols=None,label_col=None,verbose=False):
        """Train ``num_repits * num_folds`` CatBoost models.

        Parameters
        ----------
        train_data : pandas.DataFrame
            Must contain ``label_col`` and a ``'ClientUUId'`` column (used as
            the group key for the split).
        cat_features : list, optional
            Categorical feature names forwarded to ``Pool``.
        drop_cols : list, optional
            Extra columns excluded from the features; ``None`` means none.
        label_col : str
            Name of the target column.
        verbose : bool or int
            Forwarded to ``CatBoostClassifier.fit``.

        Returns
        -------
        self
        """
        excluded_cols = [label_col] + list(drop_cols or [])

        # A fresh fit replaces earlier models instead of appending to them.
        self.models = []
        self.scores = []
        self.label_col = label_col

        for i in trange(self.num_repits):
            kfold = StratifiedGroupKFold(self.num_folds,random_state=self.random_state+i,shuffle=True)
            fold_indices = kfold.split(train_data,train_data[label_col],groups=train_data['ClientUUId'])
            for train_index, test_index in fold_indices:
                train_df = train_data.iloc[train_index]
                test_df = train_data.iloc[test_index]

                train_pool = Pool(
                    train_df.drop(columns=excluded_cols),
                    label = train_df[label_col],
                    cat_features = cat_features
                )

                eval_pool = Pool(
                    test_df.drop(columns=excluded_cols),
                    label = test_df[label_col],
                    cat_features = cat_features
                )

                cbm = CatBoostClassifier(**self.params)
                cbm.fit(train_pool,eval_set=eval_pool,verbose=verbose)

                # NOTE(review): this fold is also the eval_set used during
                # fitting, so these scores are likely optimistic — confirm.
                score = roc_auc_score(test_df[label_col],cbm.predict_proba(eval_pool)[:,1])
                self.scores.append(score)
                self.models.append(cbm)
        return self

    def predict(self,test_data,drop_cols=None,cat_features=None):
        """Return the mean positive-class probability over all fold models.

        Parameters
        ----------
        test_data : pandas.DataFrame
            Rows to score. If it still contains the label column seen in
            ``fit``, that column is removed so it is never fed to the models
            as a feature.
        drop_cols : list, optional
            Columns excluded from the features; ``None`` means none.
        cat_features : list, optional
            Categorical feature names forwarded to ``Pool``.

        Raises
        ------
        RuntimeError
            If called before ``fit`` has produced any model.
        """
        if not self.models:
            raise RuntimeError("CatBoostKfoldWraper.predict called before fit")

        features = test_data.drop(columns=list(drop_cols or []))
        if self.label_col is not None and self.label_col in features.columns:
            features = features.drop(columns=[self.label_col])

        test_pool = Pool(
            features,
            cat_features=cat_features
        )
        preds = np.mean([model.predict_proba(test_pool)[:,1] for model in self.models],axis=0)
        return preds

    def get_feature_importance(self):
        """Return feature importances averaged over all fold models, largest first."""
        per_model = [
            model.get_feature_importance(prettified=True).set_index('Feature Id')
            for model in self.models
        ]
        total = per_model[0]
        for importance in per_model[1:]:
            # DataFrame addition aligns on the 'Feature Id' index.
            total = total + importance
        return (total / len(self.models)).sort_values(by='Importances')[::-1]

In [110]:
def model_builder(train_data):
    """Fit a ``CatBoostKfoldWraper`` on ``train_data`` and return it.

    Fold counts, CatBoost parameters and column roles come from the
    notebook-level ``num_fold``, ``num_repits``, ``params``, ``cat_cols``,
    ``drop_cols`` and ``label_col``.
    """
    fit_options = dict(
        cat_features=cat_cols,
        drop_cols=drop_cols,
        label_col=label_col,
        verbose=500,
    )
    ensemble = CatBoostKfoldWraper(num_fold, num_repits, params)
    ensemble.fit(train_data, **fit_options)
    return ensemble

def model_predicter(model,test_data):
    """Score ``test_data`` with ``model`` using the notebook-level column roles."""
    predictions = model.predict(
        test_data,
        cat_features=cat_cols,
        drop_cols=drop_cols,
    )
    return predictions

In [111]:
class TestKFoldWrapper():
    """Outer evaluation loop over a stratified, client-grouped K-fold split.

    Only the first ``top_c`` folds of the split are evaluated.
    """
    def __init__(self,num_folds=10,top_c=1,random_state=56):
        self.top_c = top_c
        self.kfold = StratifiedGroupKFold(
            n_splits=num_folds,
            shuffle=True,
            random_state=random_state,
        )

    def run_experiments(self,model_builder,model_predicter,train_data,label_col=None):
        """Build and score a model per fold, then print the mean ROC AUC.

        ``model_builder(train_df)`` must return a fitted model and
        ``model_predicter(model, test_df)`` its scores for ``test_df``. Fitted
        models and per-fold AUCs are kept in ``self.models`` / ``self.scores``.
        Rows are grouped by the ``'ClientUUId'`` column of ``train_data``.
        """
        self.models, self.scores = [], []

        targets = train_data[label_col]
        client_groups = train_data['ClientUUId']
        fold_iter = tqdm(self.kfold.split(train_data, targets, groups=client_groups))

        for fold_no, (fit_idx, holdout_idx) in enumerate(fold_iter, start=1):
            fit_part = train_data.iloc[fit_idx]
            holdout_part = train_data.iloc[holdout_idx]

            fitted = model_builder(fit_part)
            holdout_auc = roc_auc_score(holdout_part[label_col], model_predicter(fitted, holdout_part))
            self.scores.append(holdout_auc)
            self.models.append(fitted)

            # Stop once the requested number of folds has been evaluated.
            if fold_no >= self.top_c:
                break

        print(f"Total Score {np.mean(self.scores)}")

In [112]:
# Outer evaluation: n_fold_test-way grouped split, stopping after test_c_stop folds.
evaluator = TestKFoldWrapper(num_folds=n_fold_test, top_c=test_c_stop)
evaluator.run_experiments(model_builder, model_predicter, train_data, label_col=label_col)

0it [00:00, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

0:	test: 0.5206929	best: 0.5206929 (0)	total: 11.8ms	remaining: 5.9s
499:	test: 0.6963325	best: 0.6963566 (497)	total: 4.13s	remaining: 0us

bestTest = 0.6963565619
bestIteration = 497

Shrink model to first 498 iterations.
0:	test: 0.6349606	best: 0.6349606 (0)	total: 6.62ms	remaining: 3.3s
499:	test: 0.6880271	best: 0.6894087 (216)	total: 4.09s	remaining: 0us

bestTest = 0.6894087379
bestIteration = 216

Shrink model to first 217 iterations.
0:	test: 0.5726506	best: 0.5726506 (0)	total: 7.08ms	remaining: 3.53s
499:	test: 0.6889754	best: 0.6938371 (124)	total: 4.04s	remaining: 0us

bestTest = 0.6938371317
bestIteration = 124

Shrink model to first 125 iterations.
0:	test: 0.5377161	best: 0.5377161 (0)	total: 8.34ms	remaining: 4.16s
499:	test: 0.6529211	best: 0.6634131 (252)	total: 3.91s	remaining: 0us

bestTest = 0.6634130948
bestIteration = 252

Shrink model to first 253 iterations.
0:	test: 0.5507150	best: 0.5507150 (0)	total: 7.06ms	remaining: 3.52s
499:	test: 0.6803995	best: 0.681