In [None]:
# Environment setup: install pytorch-lifestream (ptls) straight from GitHub and
# constrain torch / pytorch-lightning / torchvision to pre-2.0-era releases.
# NOTE(review): `@main` tracks a moving branch, so a re-run may pull different
# ptls code; consider pinning a tag or commit for reproducibility.
!pip install git+https://github.com/dllllb/pytorch-lifestream.git@main
!pip install -U 'torch<2'
!pip install -U 'pytorch-lightning<2'
!pip install -U "torchvision<0.15.1"
# NOTE(review): duckdb is installed without a version pin and is not imported
# in the visible cells — confirm it is still needed.
!pip install duckdb

In [1]:
import pandas as pd
import numpy as np
from ptls.nn import TrxEncoder, RnnSeqEncoder
from ptls.frames.coles import CoLESModule
from tqdm.auto import tqdm, trange
from ptls.data_load.datasets import MemoryMapDataset
from ptls.data_load.iterable_processing import SeqLenFilter
from ptls.frames.coles import ColesDataset
from catboost import CatBoostClassifier, Pool, cv
from ptls.preprocessing.pandas.pandas_preprocessor import PandasDataPreprocessor
from functools import partial
import random
import pytorch_lightning as pl
from ptls.data_load.datasets import inference_data_loader
from ptls.frames.coles.split_strategy import SampleSlices
from ptls.frames import PtlsDataModule
import torch

In [3]:
# Load the raw training table; `Date` is read as plain strings at this point.
train_data = pd.read_csv('/kaggle/input/ioai-contest-1/train.csv')

# Drop every row whose date string contains '02-29' (leap days).
# NOTE(review): presumably done so that each year has the same set of calendar
# days — confirm the intent.
is_leap_day = train_data['Date'].str.contains('02-29', regex=False)

# `.copy()` makes the filtered frame independent of the original, so the column
# assignment below (and in later cells) does not raise SettingWithCopyWarning.
train_data = train_data[~is_leap_day].copy()
train_data['Date'] = pd.to_datetime(train_data['Date'])

In [4]:
def create_time_features(df, base_year=2019):
    """Add calendar-derived feature columns to ``df`` and return it.

    The frame is modified in place (new columns are assigned on ``df`` itself)
    and the same object is returned, so callers may use either the return
    value or the original reference.

    Args:
        df: DataFrame with a datetime64 ``Date`` column.
        base_year: Year used as the origin for the ``all_year_time`` and
            ``all_date`` counters. Defaults to 2019.

    Returns:
        The same ``df`` with these columns added:
        ``month``, ``day``, ``year``, ``dayofweek`` (Monday=0),
        ``all_year_time``, ``cl_is_weekend`` (int16 flag), ``all_date``.
    """
    dates = df['Date'].dt

    df['month'] = dates.month
    df['day'] = dates.day
    df['year'] = dates.year
    df['dayofweek'] = dates.dayofweek

    # Approximate day counter since `base_year`: years count as 365 days and
    # months as 30 days, so the value is not strictly increasing across month
    # boundaries (e.g. Jan 31 and Feb 1 both map to 61 within a year).
    # Computed column-wise instead of with a per-row `apply`.
    df['all_year_time'] = (df['year'] - base_year) * 365 + df['month'] * 30 + df['day']

    # Saturday (5) and Sunday (6) are flagged as weekend.
    df['cl_is_weekend'] = (df['dayofweek'] > 4).astype('int16')

    # Zero-based month index counted from January of `base_year`.
    df['all_date'] = (df['year'] - base_year) * 12 + (df['month'] - 1)
    return df

# Add the calendar feature columns (month, day, year, dayofweek, all_year_time,
# cl_is_weekend, all_date) to the full training table.
train_data = create_time_features(train_data)

In [5]:
# Build the sequence identifier "<Company_ID>|<Product_ID>|<all_date>": every
# (company, product, month-index) combination becomes one sequence key.
company_part = train_data['Company_ID'].astype(str)
product_part = train_data['Product_ID'].astype(str)
month_part = train_data['all_date'].astype(str)
train_data['all_index'] = company_part + '|' + product_part + '|' + month_part

In [6]:
# Impute missing values column by column, selecting columns by name substring.
for name in train_data.columns:
    if 'num' in name:
        # Numeric feature: replace NaN with the column median.
        numeric = train_data[name]
        train_data[name] = numeric.fillna(numeric.median())
    if 'cat' in name:
        # Categorical feature: replace NaN with a fresh code, one above the
        # largest value currently present in the column.
        categorical = train_data[name]
        train_data[name] = categorical.fillna(categorical.max() + 1)

In [7]:
# Training subset: rows whose month index (`all_date`) is below 54, ordered by
# company, then product, then date.
train_df = (
    train_data
    .loc[train_data['all_date'] < 54]
    .sort_values(by=['Company_ID', 'Product_ID', 'Date'])
)

In [10]:
# Convert the datetime `Date` column to its string representation.
# NOTE(review): presumably required because the preprocessor below is configured
# with event_time_transformation="none" — confirm against the ptls docs.
train_df['Date'] = train_df['Date'].astype(str)

In [11]:
# Feature columns are picked by naming convention: names containing 'cat' are
# categorical, names containing 'num' are numerical. The calendar features
# built earlier are appended to the numerical set.
categorical_columns = [name for name in train_data.columns if 'cat' in name]
numerical_columns = [name for name in train_data.columns if 'num' in name]
numerical_columns = numerical_columns + ['month', 'day', 'dayofweek', 'year', 'all_year_time']

# One sequence per `all_index` value, with `Date` as the event time, passed
# through with no transformation.
preprocessor = PandasDataPreprocessor(
    col_id="all_index",
    col_event_time="Date",
    event_time_transformation="none",
    cols_category=categorical_columns,
    cols_numerical=numerical_columns,
    return_records=True,
)

Creating Dask Server
Link Dask Server - http://172.19.2.2:46155/status


Perhaps you already have a cluster running?
Hosting the HTTP server on port 46155 instead


In [None]:
# Fit the preprocessor on the training subset and transform it in one step.
# NOTE(review): with return_records=True the result is presumably a list of
# per-sequence records keyed by `all_index` — confirm against the ptls docs.
train_data_proc = preprocessor.fit_transform(train_df)

In [None]:
# Numerical inputs: every 'num' column plus the calendar features, all mapped
# with the 'identity' setting.
numeric_feature_names = [name for name in train_data.columns if 'num' in name]
numeric_feature_names = numeric_feature_names + ['month', 'day', 'dayofweek', 'year', 'all_year_time']

# Categorical inputs as (in, out) pairs.
# NOTE(review): 'in' is presumably the embedding vocabulary size and 'out' the
# embedding width; the sizes are hard-coded — confirm they cover the fitted
# category counts.
embedding_dims = {
    'cat_0': (4, 4),
    'cat_1': (9500, 384),
    'cat_2': (3, 2),
    'cat_3': (4, 4),
    'cat_4': (3, 2),
    'cat_5': (5, 4),
    'cat_6': (5, 4),
    'cat_7': (6, 4),
    'cat_8': (16, 12),
    'cat_9': (106, 32),
    'cat_10': (6, 4),
    'cat_11': (23, 12),
    'cat_12': (9, 6),
    'cat_13': (85, 32),
    'cat_14': (18, 8),
}

# Transaction encoder: one embedding per categorical column, numerical columns
# passed with 'identity', and embeddings_noise set to 0.003.
trx_encoder = TrxEncoder(
    embeddings={
        name: {'in': n_in, 'out': n_out}
        for name, (n_in, n_out) in embedding_dims.items()
    },
    numeric_values=dict.fromkeys(numeric_feature_names, 'identity'),
    embeddings_noise=0.003,
)

# Sequence encoder: a GRU-type RNN stacked on top of the transaction encoder.
seq_encoder = RnnSeqEncoder(
    type='gru',
    hidden_size=384,  # dimensionality of the produced sequence embedding
    trx_encoder=trx_encoder,
)

# CoLES module: wraps the sequence encoder together with its optimisation setup.
# The optimizer and scheduler are supplied as partials so the module can finish
# constructing them itself.
adamw_factory = partial(torch.optim.AdamW, lr=0.001)
# StepLR configured with step_size=5 and gamma=0.9.
step_lr_factory = partial(torch.optim.lr_scheduler.StepLR, step_size=5, gamma=0.9)

coles_module = CoLESModule(
    seq_encoder=seq_encoder,
    optimizer_partial=adamw_factory,
    lr_scheduler_partial=step_lr_factory,
)

In [None]:
# Wrap the preprocessed records in a dataset with a sequence-length filter.
# NOTE(review): SeqLenFilter(min_seq_len=5) presumably drops sequences with
# fewer than 5 events — confirm.
filtered_sequences = MemoryMapDataset(
    data=train_data_proc,
    i_filters=[SeqLenFilter(min_seq_len=5)],
)

# Splitter used by the CoLES dataset: split_count=5, slice length bounds
# cnt_min=5 and cnt_max=200.
slice_sampler = SampleSlices(split_count=5, cnt_min=5, cnt_max=200)

# Training data module: batches of 32 loaded by 4 workers.
train_dl = PtlsDataModule(
    train_data=ColesDataset(filtered_sequences, splitter=slice_sampler),
    train_batch_size=32,
    train_num_workers=4,
)

In [None]:
# Train the CoLES module for 5 epochs on a single GPU.
# `accelerator='gpu', devices=1` replaces `gpus=1`, which is deprecated in
# PyTorch Lightning 1.7+ and removed in 2.0; the selected hardware is the same.
trainer = pl.Trainer(max_epochs=5, accelerator='gpu', devices=1)

trainer.fit(coles_module, train_dl)