In [1]:
from ptls.data_load.datasets import ParquetDataset, ParquetFiles

In [2]:
import os
import pandas as pd
import numpy as np
import torch
from functools import partial
import pytorch_lightning as pl
import warnings
warnings.filterwarnings("ignore")

from torch.utils.data import DataLoader

from ptls.data_load.datasets import MemoryMapDataset
from ptls.data_load.iterable_processing.iterable_seq_len_limit import ISeqLenLimit
from ptls.data_load.iterable_processing.to_torch_tensor import ToTorch
from ptls.data_load.iterable_processing.feature_filter import FeatureFilter
from ptls.nn import TrxEncoder, RnnSeqEncoder
from ptls.frames.coles import CoLESModule
from ptls.data_load.iterable_processing import SeqLenFilter
from ptls.frames.coles import ColesIterableDataset
from ptls.frames.coles.split_strategy import SampleSlices
from ptls.frames import PtlsDataModule
from ptls.preprocessing import PandasDataPreprocessor
from ptls.data_load.utils import collate_feature_dict
from ptls.data_load.iterable_processing_dataset import IterableProcessingDataset

from tqdm.auto import tqdm
import lightgbm as ltb

In [3]:
# Read the preprocessed train and test parquet files into pandas DataFrames
processed_train = pd.DataFrame(ParquetDataset(ParquetFiles('./Data/train_geo_preprocessor.parquet'))) # load the preprocessed train and test data
processed_test = pd.DataFrame(ParquetDataset(ParquetFiles('./Data/test_geo_preprocessor.parquet')))

In [4]:
def _to_memory_dataset(frame):
    """Wrap a preprocessed DataFrame in a MemoryMapDataset.

    The same filter chain is used for the train and the test frame; a fresh
    set of filter objects is created on every call, so the two datasets do
    not share filter instances.
    """
    pipeline = [
        # client id and target columns are dropped from the records
        FeatureFilter(drop_feature_names=['client_id', 'target_1', 'target_2', 'target_3', 'target_4']),
        SeqLenFilter(min_seq_len=22),
        ISeqLenLimit(max_seq_len=2530),
        ToTorch(),
    ]
    return MemoryMapDataset(data=frame.to_dict("records"), i_filters=pipeline)


train = _to_memory_dataset(processed_train)
test = _to_memory_dataset(processed_test)

In [5]:
def _coles_dataset(source):
    """Wrap `source` in a ColesIterableDataset with the notebook's SampleSlices settings.

    A new SampleSlices splitter is built per call, so the train and the
    validation dataset each own their splitter.
    """
    slicer = SampleSlices(split_count=5, cnt_min=32, cnt_max=180)
    return ColesIterableDataset(data=source, splitter=slicer)


train_ds = _coles_dataset(train)
# the held-out `test` dataset is used for validation during training
valid_ds = _coles_dataset(test)

In [6]:
# Data module that feeds the CoLES train / validation datasets to the trainer.
# NOTE: despite its name, `train_dl` holds a PtlsDataModule, not a DataLoader;
# the name is rebound to an inference loader in the inference section.
train_dl = PtlsDataModule(
    train_data=train_ds,
    train_num_workers=3,
    train_batch_size=64,
    valid_data=valid_ds,
    valid_num_workers=3,
    valid_batch_size=64
)


In [7]:
# Keyword arguments for TrxEncoder: one embedding table per geohash feature,
# each described by an 'in' size and an 'out' size of 5.
# The three 'in' sizes sum to ~36.3M rows; at 5 values per row that matches
# the ~181 M parameters reported for the sequence encoder in the model summary.
# NOTE(review): the 'in' size is largest for geohash_4 and smallest for
# geohash_6. If the suffix is the geohash precision one would expect the
# opposite ordering -- confirm the sizes are not swapped.
trx_encoder_params = dict(
    embeddings_noise=0.003,
    embeddings={
        "geohash_4": {'in': 32087037, "out": 5},
        "geohash_5": {'in': 3820722, "out": 5},
        'geohash_6': {'in': 421733, 'out': 5},
      }
)


In [8]:
# Sequence encoder: a TrxEncoder built from trx_encoder_params, wrapped in an
# RnnSeqEncoder of type 'gru' with hidden_size=32 (the embeddings produced in
# the inference section have 32 columns).
seq_encoder = RnnSeqEncoder(
    trx_encoder=TrxEncoder(**trx_encoder_params),
    hidden_size=32,
    type='gru')

In [9]:
# CoLES module around the sequence encoder.
# Optimizer and scheduler are passed as partials: Adam with lr=0.001 and a
# StepLR schedule with step_size=3 and gamma=0.9.
model = CoLESModule(
    seq_encoder=seq_encoder,
    optimizer_partial=partial(torch.optim.Adam, lr=0.001),
    lr_scheduler_partial=partial(torch.optim.lr_scheduler.StepLR, step_size=3, gamma=0.9)
)

In [10]:
# # Path to the file with the model weights
# model_weights_path = 'mlm-geo_trx_v55.pt'

# # Load the model weights
# model.load_state_dict(torch.load(model_weights_path))

# # # Load the saved state of the model parameters

# # model_state_dict = torch.load("model_new1.pt")

# # # Load the parameters into the model
# # model.load_state_dict(model_state_dict)


In [11]:
# Trainer for the CoLES pre-training run: 10 epochs, gradient clipping at 0.5,
# TensorBoard logs under ./logdir/baseline_result.
trainer = pl.Trainer(
    max_epochs=10,
    limit_val_batches=5000,
  
    enable_progress_bar=True,
    gradient_clip_val=0.5,
    logger=pl.loggers.TensorBoardLogger(
        save_dir='./logdir',
        name='baseline_result'
    ),
    callbacks=[
        # log the learning rate at every step
        pl.callbacks.LearningRateMonitor(logging_interval='step'),
        # checkpoint every 5000 training steps; save_top_k=-1 keeps all of them
        pl.callbacks.ModelCheckpoint(every_n_train_steps=5000, save_top_k=-1),
    ]
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [12]:
# Run training; `train_dl` is the PtlsDataModule holding both train and validation data
trainer.fit(model, train_dl)

You are using a CUDA device ('NVIDIA GeForce RTX 3060 Ti') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name               | Type            | Params
-------------------------------------------------------
0 | _loss              | ContrastiveLoss | 0     
1 | _seq_encoder       | RnnSeqEncoder   | 181 M 
2 | _validation_metric | BatchRecallTopK | 0     
3 | _head              | Head            | 0     
-------------------------------------------------------
181 M     Trainable params
0         Non-trainable params
181 M     Total params
726.609   Total estimated model params size (MB)


Sanity Checking: |                                        | 0/? [00:00<?, ?it/s]

Training: |                                               | 0/? [00:00<?, ?it/s]

Validation: |                                             | 0/? [00:00<?, ?it/s]

Validation: |                                             | 0/? [00:00<?, ?it/s]

Validation: |                                             | 0/? [00:00<?, ?it/s]

Validation: |                                             | 0/? [00:00<?, ?it/s]

Validation: |                                             | 0/? [00:00<?, ?it/s]

Validation: |                                             | 0/? [00:00<?, ?it/s]

Validation: |                                             | 0/? [00:00<?, ?it/s]

Validation: |                                             | 0/? [00:00<?, ?it/s]

Validation: |                                             | 0/? [00:00<?, ?it/s]

Validation: |                                             | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=10` reached.


In [13]:
# Persist only the parameters (state dict) of the trained module
torch.save(obj=model.state_dict(), f="mlm-emb_geo_vX1.pt")

# Inference

In [14]:
import torch
import pandas as pd
from datetime import datetime
from dateutil.relativedelta import relativedelta
import numpy as np
from ptls.data_load.datasets import inference_data_loader

In [15]:
# NOTE: `processed_train` is rebound here; from this point on it holds the
# frame read from df_geo_preprocessor.parquet, not the training split.
processed_train = pd.DataFrame(ParquetDataset(ParquetFiles('./Data/df_geo_preprocessor.parquet'))) # load the preprocessed train+test data
id_client = pd.DataFrame(ParquetDataset(ParquetFiles('./Data/result_sample_Client_Month_df_12_06_2024.parquet'))) # load the list of client ids and the report dates for which predictions are required

In [16]:
# Sanity check: (rows, columns) of the event frame and of the client/report-date frame
processed_train.shape, id_client.shape

((789051, 5), (747847, 3))

In [17]:
# Keep only the events that fall into the calendar month preceding `month`
def filter_by_month(timestamps, data_list, month, tz=None):
    """Select the events whose timestamp lies in the month before `month`.

    Args:
        timestamps: sequence of POSIX timestamps (seconds), one per event.
        data_list: list of sequences aligned with `timestamps`; element ``i``
            of every sequence belongs to event ``i``.
        month: date-like object exposing ``.year`` and ``.month``. Events are
            kept when they fall into the calendar month *before* this one.
        tz: optional ``datetime.tzinfo`` used to interpret the timestamps.
            The default ``None`` keeps the historical behaviour of using the
            machine's local timezone, which makes month boundaries depend on
            where the notebook runs; pass e.g. ``timezone.utc`` to pin it.

    Returns:
        ``(filtered_timestamps, filtered_data_list)``: a tensor with the kept
        timestamps and a list with one tensor per entry of `data_list`,
        holding the values of the kept events in their original order. When
        nothing is kept the tensors are empty (``torch.tensor([])``).
    """
    # Previous calendar month via plain year/month arithmetic; only the year
    # and month of the shifted date were ever used.
    if month.month == 1:
        target_year, target_month = month.year - 1, 12
    else:
        target_year, target_month = month.year, month.month - 1

    kept_positions = []
    kept_timestamps = []
    for position, ts in enumerate(timestamps):
        moment = datetime.fromtimestamp(ts, tz)
        if moment.year == target_year and moment.month == target_month:
            kept_positions.append(position)
            kept_timestamps.append(ts)

    filtered_timestamps = torch.tensor(kept_timestamps)
    # Reuse the positions found above instead of appending per event and column
    filtered_data_list = [
        torch.tensor([values[position] for position in kept_positions])
        for values in data_list
    ]

    return filtered_timestamps, filtered_data_list

# Apply the per-row month filter to every row of a DataFrame
def filter_dataframe_by_month(df, timestamp_column, data_columns, month):
    """Filter the event sequences stored in each row of `df` by month.

    For every row, the timestamp sequence in `timestamp_column` and the
    sequences in `data_columns` are passed to ``filter_by_month`` and the
    cells are overwritten with the returned tensors.

    Args:
        df: DataFrame whose cells in the given columns hold per-event sequences.
        timestamp_column: name of the column holding the timestamp sequences.
        data_columns: names of the columns holding the aligned data sequences.
        month: report month forwarded unchanged to ``filter_by_month``.

    Returns:
        The same `df` object; it is modified in place.
    """
    # NOTE(review): cells are written back by index label via df.at, so the
    # index labels are assumed to be unique -- confirm for new callers.
    # iterrows() is a generator without a length, so tqdm needs an explicit
    # total to render a real progress bar instead of a bare "0it" counter.
    for index, row in tqdm(df.iterrows(), total=len(df)):
        timestamps = row[timestamp_column]
        data_list = [row[col] for col in data_columns]
        filtered_timestamps, filtered_data_list = filter_by_month(timestamps, data_list, month)

        df.at[index, timestamp_column] = filtered_timestamps
        for col, filtered_data in zip(data_columns, filtered_data_list):
            df.at[index, col] = filtered_data

    return df

# Define the timestamp column and the data columns
timestamp_column = 'event_time'

data_columns = [ 
        "geohash_4",
        "geohash_5",
        "geohash_6"]

In [18]:
# Build the inference frame: for every report month take the clients requested
# for that month and keep only their events from the preceding calendar month.
monthly_frames = []
for mon in tqdm(id_client.report_next_end.unique()):
    month_clients = id_client.loc[id_client['report_next_end'] == mon, 'client_id']
    # Explicit copy: the selection gets a new column and is rewritten in place
    # by filter_dataframe_by_month, so it must not be a slice of processed_train.
    date = processed_train[processed_train['client_id'].isin(month_clients)].copy()
    date['report_next_end'] = mon
    date = filter_dataframe_by_month(date, timestamp_column, data_columns, mon)
    # Empty Python lists are turned into NaN so that dropna() removes those rows
    # (empty tensors are handled separately in the next cell).
    # applymap is kept for compatibility with older pandas releases.
    date = date.applymap(lambda x: np.nan if isinstance(x, list) and len(x) == 0 else x)
    monthly_frames.append(date.dropna())

# Concatenate once instead of growing the frame inside the loop
df = pd.concat(monthly_frames) if monthly_frames else pd.DataFrame()
    

  0%|          | 0/13 [00:00<?, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

In [19]:
# Helper that detects an empty tensor such as tensor([])
def contains_empty_tensor(x):
    """Return True when `x` is a torch.Tensor with zero elements, else False."""
    if not isinstance(x, torch.Tensor):
        return False
    return x.numel() == 0
# Replace empty tensors with None so that dropna() below treats them as missing
df = df.applymap(lambda x: None if contains_empty_tensor(x) else x)

# Drop the rows that contain missing values
df = df.dropna()
df.shape    

(396434, 6)

In [20]:
# Inference dataset built from the month-filtered frame.
# NOTE: `test` is rebound here; it no longer refers to the validation dataset
# used during training. Only 'client_id' is dropped from the records.
test = MemoryMapDataset(
    data=df.to_dict("records"),
    i_filters=[
        FeatureFilter(drop_feature_names=['client_id',]),
        ToTorch(),
    ]
)

In [21]:
# Fresh trainer, used below only for trainer.predict (replaces the training trainer).
# NOTE(review): max_epochs=-1 presumably has no effect on predict -- confirm it is needed.
trainer = pl.Trainer( max_epochs=-1)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [22]:
# NOTE(review): the trainer logs around this cell still report a CUDA device in
# use, so this explicit move to CPU may be overridden by the trainer -- confirm.
model.to('cpu')
# `train_dl` is rebound: from here on it is the inference loader over `test`
train_dl = inference_data_loader(test, num_workers=0, batch_size=64)
# Stack the per-batch predictions into a single 2-D tensor of embeddings
train_embeds = torch.vstack(trainer.predict(model, train_dl))
train_embeds.shape

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: |                                             | 0/? [00:00<?, ?it/s]

torch.Size([396434, 32])

In [23]:
# Result frame: the identifying columns of the inference frame, with the same
# row order and index; copied so later column assignments do not touch `df`.
train_df = df[['client_id', 'report_next_end']].copy()

In [24]:
# One embedding vector (a row of the prediction matrix) per record; the list is
# assigned positionally, so it relies on the predictions being in the row order of `df`.
train_df['emb_trx'] = list(train_embeds.numpy())
train_df

Unnamed: 0,client_id,report_next_end,emb_trx
3,004c7779a0ecbcd972a24627f32dcbbd4d63c610a9ee0a...,2022-11-30,"[0.7512722, 0.034746684, 0.1970293, -0.4776746..."
20,01703640603e9dfa59cd76cc233d086ecc34d3ece32c56...,2022-11-30,"[0.24660093, 0.14926656, -0.16413787, -0.21441..."
23,0191f3d77d4c8f28ad676fe2e1fda9e3c7a9dc0351f0d0...,2022-11-30,"[0.48362908, 0.40308118, -0.42224756, -0.41774..."
29,01df36c22b9fd7e8ad8d7b1c396b2b1367a2606e5b6255...,2022-11-30,"[0.9418559, -0.2446612, 0.3373396, -0.65160483..."
40,0292402a3043ce0dc6335c9461a7aa68fbc88d8a49e63d...,2022-11-30,"[0.35298893, 0.23598681, -0.33515975, -0.28535..."
...,...,...,...
788887,f556afdbb6a63a7bedab03ec5abbd3e04980ec76c60e8e...,2022-12-30,"[0.32593557, 0.1657233, -0.2606573, -0.2983085..."
788918,f7811e306436328798b252bb698a0affc552304c3272fe...,2022-12-30,"[0.7641082, 0.6429504, -0.75637424, -0.7103421..."
788942,f8ebd0078c17acad1d9bfc9bc87c890f742529e447c90d...,2022-12-30,"[0.8075682, 0.2910181, 0.4616567, -0.66443074,..."
789023,fe3f567e6cb1803ca215ebe293b650ab6ccf25561cbafa...,2022-12-30,"[0.8575122, 0.3442208, -0.8776071, -0.5205566,..."


In [26]:
# Export client_id, report_next_end and the embedding column to parquet via pyarrow
train_df.to_parquet(f'./Data/geo_emb_select_1_v2.parquet', engine='pyarrow')