In [None]:
import sys

! {sys.executable} -m pip install pytorch-lifestream
! {sys.executable} -m pip install catboost
! {sys.executable} -m pip install torchmetrics

## Prepare your data

- Use `PySpark` in local or cluster mode for big datasets and `Pandas` for small ones.
- Split data into required parts (train, valid, test, ...).
- Use `ptls.preprocessing` for simple data preparation.
- Transform features to a compatible format using `PySpark` or `Pandas` functions.
You can also use `ptls.data_load.preprocessing` for common data transformation patterns.
- Split sequences into the `ptls-data` format with `ptls.data_load.split_tools`. Save the prepared data in `Parquet` format or
keep it in memory (`Pickle` also works).
- Use one of the available `ptls.data_load.datasets` to define input for the models.

In [None]:
import torch

import numpy as np
import pandas as pd
import torchmetrics
import pytorch_lightning as pl

from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from functools import partial
from ptls.frames import PtlsDataModule
from ptls.nn import TrxEncoder, RnnSeqEncoder, Head
from ptls.data_load.datasets import MemoryMapDataset
from ptls.preprocessing import PandasDataPreprocessor
from ptls.frames.supervised import SeqToTargetDataset, SequenceToTarget
from ptls.data_load.utils import collate_feature_dict
from ptls.frames.inference_module import InferenceModule
from ptls.frames.coles import CoLESModule
from ptls.frames.coles.multimodal_dataset import MultiModalIterableDataset
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping


In [None]:
from functools import partial
from datetime import timedelta

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
import catboost

import torch
import pytorch_lightning as pl
from torch.utils.data.dataloader import DataLoader
from pytorch_lightning.loggers import TensorBoardLogger

from sklearn.model_selection import train_test_split

from ptls.nn import TrxEncoder
from ptls.nn.seq_encoder.rnn_encoder import RnnEncoder
from ptls.frames import PtlsDataModule
from ptls.frames.coles import CoLESModule
from ptls.frames.coles.split_strategy import SampleSlices
from ptls.frames.coles.multimodal_dataset import MultiModalDataset
from ptls.frames.coles.multimodal_dataset import MultiModalIterableDataset
from ptls.frames.coles.multimodal_dataset import MultiModalSortTimeSeqEncoderContainer
from ptls.frames.coles.multimodal_inference_dataset import MultiModalInferenceDataset
from ptls.frames.coles.multimodal_inference_dataset import MultiModalInferenceIterableDataset
from ptls.frames.inference_module import InferenceModuleMultimodal
from ptls.data_load.iterable_processing import SeqLenFilter
from ptls.data_load import IterableProcessingDataset
from ptls.data_load.utils import collate_feature_dict
from ptls.data_load.datasets import MemoryMapDataset
from ptls.preprocessing import PandasDataPreprocessor

In [None]:
df_target = pd.read_csv(
    "https://huggingface.co/datasets/dllllb/age-group-prediction/resolve/main/train_target.csv?download=true"
)
df_target

In [None]:
df_trx = pd.read_csv(
    "https://huggingface.co/datasets/dllllb/age-group-prediction/resolve/main/transactions_train.csv.gz?download=true",
    compression="gzip"
)
df_trx

In [None]:
len(df_target)

In [None]:
len(df_trx)

In [None]:
sourceA = df_trx[["client_id", "trans_date", "small_group"]]
sourceB = df_trx[["client_id", "trans_date", "amount_rur"]]

In [None]:
# Randomly remove a fixed number of rows from each source.
# A seeded generator is used so that the same rows are removed on every
# Restart & Run All (the rest of the notebook already pins random_state=42).
drop_rng = np.random.default_rng(42)
sourceA_drop_indices = drop_rng.choice(sourceA.index.to_numpy(), 150000, replace=False)
sourceB_drop_indices = drop_rng.choice(sourceB.index.to_numpy(), 450000, replace=False)

# NOTE: this cell rebinds sourceA/sourceB, so re-running it drops rows again.
sourceA = sourceA.drop(sourceA_drop_indices).reset_index(drop=True)
sourceB = sourceB.drop(sourceB_drop_indices).reset_index(drop=True)

In [None]:
# Scale the raw `trans_date` values by 3600 with a vectorised multiply
# (same result as the element-wise lambda, without a Python call per row).
# NOTE: not idempotent -- re-running this cell multiplies the column again.
sourceA["trans_date"] = sourceA["trans_date"] * 3600
sourceB["trans_date"] = sourceB["trans_date"] * 3600

In [None]:
# Arguments that are identical for both sources: same id column, same
# event-time column (passed through with transformation "none"), and
# return_records=False.
_shared_preprocessor_kwargs = dict(
    col_id="client_id",
    col_event_time="trans_date",
    event_time_transformation="none",
    return_records=False,
)

# Source A carries the categorical feature, source B the numerical one.
sourceA_preprocessor = PandasDataPreprocessor(
    cols_category=["small_group"],
    **_shared_preprocessor_kwargs,
)
sourceB_preprocessor = PandasDataPreprocessor(
    cols_numerical=["amount_rur"],
    **_shared_preprocessor_kwargs,
)

In [None]:
processed_sourceA = sourceA_preprocessor.fit_transform(sourceA)
processed_sourceB = sourceB_preprocessor.fit_transform(sourceB)

In [None]:
# Prefix every column except the join key with the source name.
# NOTE: re-running this cell prefixes the already-prefixed names again.
sourceA_renamed_columns = []
for column_name in map(str, processed_sourceA.columns):
    if column_name == "client_id":
        sourceA_renamed_columns.append(column_name)
    else:
        sourceA_renamed_columns.append("sourceA_" + column_name)
processed_sourceA.columns = sourceA_renamed_columns

In [None]:
# Prefix every column except the join key with the source name.
# NOTE: re-running this cell prefixes the already-prefixed names again.
sourceB_renamed_columns = []
for column_name in map(str, processed_sourceB.columns):
    if column_name == "client_id":
        sourceB_renamed_columns.append(column_name)
    else:
        sourceB_renamed_columns.append("sourceB_" + column_name)
processed_sourceB.columns = sourceB_renamed_columns

In [None]:
joined_data = processed_sourceA.merge(processed_sourceB, how="outer", on="client_id")

In [None]:
joined_data

In [None]:
# After the outer merge, clients missing from one source have NaN in that
# source's columns; replace those cells with an empty tensor.
# `pd.isna` returns an element-wise array for array-like cells, and the truth
# value of such an array is ambiguous, so only scalar cells are tested.
joined_data = joined_data.applymap(
    lambda x: torch.tensor([]) if (pd.api.types.is_scalar(x) and pd.isna(x)) else x
)

In [None]:
joined_data

In [None]:
train_df, test_df = train_test_split(joined_data,
                                     test_size=0.4,
                                     random_state=42)
train_df, valid_df = train_test_split(train_df,
                                      test_size=0.1,
                                      random_state=42)

In [None]:
train_df = train_df.reset_index(drop=True)
valid_df = valid_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

In [None]:
train_dict = train_df.to_dict("records")
valid_dict = valid_df.to_dict("records")
test_dict = test_df.to_dict("records")

In [None]:
source_features = {
    "sourceA": {
        "categorical": ["small_group"],
        "numeric": [],
    },
    "sourceB": {
        "categorical": [],
        "numeric": ["amount_rur"],
    },
}

In [None]:
inf_test_data = MultiModalInferenceIterableDataset(
    data = test_dict,
    source_features = source_features,
    col_id = "client_id",
    col_time = "trans_date",
    source_names = ("sourceA", "sourceB")
)

In [None]:
inf_test_loader = DataLoader(
    dataset = inf_test_data,
    collate_fn = partial(inf_test_data.collate_fn, col_id="client_id"),
    shuffle = False,
    num_workers = 0,
    batch_size = 8
)

In [None]:
!git clone https://github.com/google-research/google-research.git

In [None]:
import sys
sys.path.append("google-research/graph_embedding/metrics")

In [None]:
from metrics import (rankme,
        coherence,
        pseudo_condition_number,
        alpha_req,
        stable_rank,
        ne_sum,
        self_clustering)

In [None]:
!pip install git+https://github.com/simonzhang00/ripser-plusplus.git

In [None]:
import ripserplusplus as rpp
def ripser_metric(embeddings):
    """Compute a persistence-based summary of an embedding cloud with ripser++.

    The embeddings are passed to ``rpp.run`` as a point cloud and the entry
    stored under key ``0`` of the result is reduced to a single number: the
    sum of ``death - birth`` over its (birth, death) pairs.

    Args:
        embeddings: Point cloud; anything that is not already a
            ``numpy.ndarray`` is converted with ``np.array``.

    Returns:
        tuple: ``(persistence_sum, elapsed_time)`` where ``elapsed_time`` is
        the wall-clock duration of the whole call in seconds.
    """
    # Imported locally so the function does not depend on a later notebook
    # cell having executed ``from time import time``.
    from time import time

    start_time = time()

    if not isinstance(embeddings, np.ndarray):
        embeddings = np.array(embeddings)

    diagrams = rpp.run("--format point-cloud", embeddings)

    # Persistence of a pair is death - birth (the previous birth - death gave
    # the negated total). Pairs with a non-finite death are skipped so a
    # never-dying component cannot turn the whole sum into infinity.
    persistence_sum = sum(
        death - birth
        for birth, death in diagrams[0]
        if death > birth and np.isfinite(death)
    )

    elapsed_time = time() - start_time

    return persistence_sum, elapsed_time

In [None]:
def create_datasets(train_dict, valid_dict, params, source_features):
    """Build the data module used to train one hyperparameter configuration.

    Args:
        train_dict: Training records passed as ``data`` to the train dataset.
        valid_dict: Validation records passed as ``data`` to the valid dataset.
        params: Mapping that must provide ``split_count``, ``cnt_min``,
            ``cnt_max`` (for the ``SampleSlices`` splitter) and ``batch_size``
            (train batch size).
        source_features: Per-source feature description forwarded unchanged
            to both datasets.

    Returns:
        PtlsDataModule: wraps the train and validation datasets; only the
        train batch size and ``train_num_workers=0`` are set explicitly.
    """
    splitter = SampleSlices(
        split_count=params["split_count"],
        cnt_min=params["cnt_min"],
        cnt_max=params["cnt_max"],
    )

    def _build_dataset(records):
        # Train and validation datasets differ only in their records; they
        # share the same splitter instance and column/source configuration.
        return MultiModalIterableDataset(
            data=records,
            splitter=splitter,
            source_features=source_features,
            col_id="client_id",
            col_time="trans_date",
            source_names=("sourceA", "sourceB"),
        )

    train_data = _build_dataset(train_dict)
    valid_data = _build_dataset(valid_dict)

    return PtlsDataModule(
        train_data=train_data,
        train_batch_size=params["batch_size"],
        train_num_workers=0,
        valid_data=valid_data,
    )

In [None]:
def compute_metrics(model, pl_trainer, inf_test_loader, selected_metrics=None, n_samples=10, sample_fraction=1/20):
    """Embed the test set and score the embeddings on repeated subsamples.

    The model is wrapped in ``InferenceModuleMultimodal`` and run through
    ``pl_trainer.predict``; the per-batch frames are concatenated and every
    column except ``client_id`` is converted to a float32 matrix. That matrix
    is subsampled ``n_samples`` times without replacement (seed ``42 + i``);
    one SVD is computed per subsample and handed to each metric as ``u``/``s``.
    The ``"ripser"`` metric is called with the subsample only and reports its
    own timing; every other metric is timed here.

    Args:
        model: Model to evaluate; switched to eval mode.
        pl_trainer: Trainer whose ``predict`` produces the embedding frames.
        inf_test_loader: Data loader passed to ``predict``.
        selected_metrics: Metric names to compute; ``None`` selects all known
            metrics. Names that are not known are skipped and end up with
            empty value lists.
        n_samples: Number of subsamples to draw.
        sample_fraction: Fraction of rows per subsample (at least one row).

    Returns:
        tuple: ``(averaged_metrics, averaged_times, std_metrics, std_times,
        inf_test_embeddings)`` -- four dicts keyed by metric name plus the
        concatenated embeddings frame.
    """
    import gc
    from sklearn.utils import resample
    from time import time

    model.eval()
    wrapper = InferenceModuleMultimodal(
        model=model,
        pandas_output=True,
        drop_seq_features=True,
        model_out_name="emb",
        col_id="client_id",
    )
    wrapper.model.is_reduce_sequence = True

    predicted_frames = pl_trainer.predict(wrapper, inf_test_loader)
    inf_test_embeddings = pd.concat(predicted_frames, axis=0)
    embedding_matrix = inf_test_embeddings.drop(columns=["client_id"]).to_numpy(dtype=np.float32)
    rows_per_sample = max(1, int(sample_fraction * embedding_matrix.shape[0]))

    # Name -> callable lookup for every supported metric.
    registry = dict(
        rankme=rankme,
        coherence=coherence,
        pseudo_condition_number=pseudo_condition_number,
        alpha_req=alpha_req,
        stable_rank=stable_rank,
        ne_sum=ne_sum,
        self_clustering=self_clustering,
        ripser=ripser_metric,
    )
    if selected_metrics is None:
        selected_metrics = list(registry)

    values = {name: [] for name in selected_metrics}
    durations = {name: [] for name in selected_metrics}

    # Unknown names keep their (empty) entries above but are never evaluated.
    runnable = [name for name in selected_metrics if name in registry]

    for draw in range(n_samples):
        sample = resample(embedding_matrix, n_samples=rows_per_sample, replace=False, random_state=42 + draw)
        # One decomposition per subsample, shared by every metric.
        u, s, _ = np.linalg.svd(sample, compute_uv=True, full_matrices=False)

        for name in runnable:
            try:
                metric_fn = registry[name]
                if name == "ripser":
                    # ripser_metric measures its own elapsed time.
                    val, elapsed = metric_fn(sample)
                else:
                    started = time()
                    val = metric_fn(sample, u=u, s=s)
                    elapsed = time() - started
            except Exception as e:
                # Best effort: a failing metric is reported and skipped for this draw.
                print(f"⚠️ Failed to compute {name} on sample {draw}: {e}")
            else:
                values[name].append(val)
                durations[name].append(elapsed)

        gc.collect()

    averaged_metrics = {name: np.mean(vals) for name, vals in values.items()}
    std_metrics = {name: np.std(vals) for name, vals in values.items()}

    averaged_times = {name: np.mean(secs) for name, secs in durations.items()}
    std_times = {name: np.std(secs) for name, secs in durations.items()}

    print("\n📊 Средние значения метрик и время вычисления:")
    for name, mean_value in averaged_metrics.items():
        mean_time = averaged_times.get(name, None)
        print(f"🧠 {name:30s} = {mean_value:.4f} | ⏱ {mean_time:.4f} сек")

    return averaged_metrics, averaged_times, std_metrics, std_times, inf_test_embeddings


In [None]:
import catboost


def evaluate_model(model, pl_trainer, checkpoint=None, selected_metrics=None, topk=5):
    """Score a model by embedding-quality metrics and downstream accuracy.

    Embedding metrics come from ``compute_metrics`` run on the module-level
    ``inf_test_loader``. The returned embeddings are joined with the
    module-level ``df_target`` on ``client_id``; a CatBoost classifier is then
    fitted on 70% of the joined rows to predict ``bins`` and scored on the
    remaining 30%.

    Args:
        model: Model to evaluate; switched to eval mode.
        pl_trainer: Trainer forwarded to ``compute_metrics``.
        checkpoint: Accepted for the caller's convenience; not used here.
        selected_metrics: Forwarded to ``compute_metrics``.
        topk: Accepted but not used here.

    Returns:
        tuple: ``(metrics, times, std_metrics, std_times, accuracy)``.
    """
    model.eval()
    embedding_report = compute_metrics(model, pl_trainer, inf_test_loader, selected_metrics)
    metrics, times, std_metrics, std_times, inf_test_embeddings = embedding_report

    # Attach the target to each embedded client and index the result by client.
    labelled = (
        inf_test_embeddings
        .merge(df_target.set_index("client_id"), how="inner", on="client_id")
        .set_index("client_id")
    )
    features = labelled.drop(columns=["bins"])
    labels = labelled["bins"]
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.3, random_state=42
    )

    classifier = catboost.CatBoostClassifier(iterations=150, random_seed=42, verbose=0)
    classifier.fit(X_train, y_train)
    accuracy = classifier.score(X_test, y_test)

    # Drop the fitted classifier explicitly before returning.
    del classifier

    return metrics, times, std_metrics, std_times, accuracy

In [None]:
# Baseline configuration: every run uses these values except for the single
# parameter being swept.
fixed_params = dict(
    batch_size=64,
    learning_rate=0.001,
    split_count=3,
    cnt_min=10,
    cnt_max=50,
    embedding_dim=16,
    category_embedding_dim=8,
    hidden_size=128,
)

# Candidate values for each parameter, swept one parameter at a time.
variable_params = dict(
    batch_size=[16, 32, 64, 128],
    learning_rate=[0.0001, 0.001, 0.05],
    split_count=[2, 3, 5],
    cnt_min=[5, 10, 20],
    cnt_max=[50, 80, 100],
    embedding_dim=[8, 16, 32],
    category_embedding_dim=[8, 16, 24],
    hidden_size=[64, 128, 256, 1024],
)

# One (swept parameter name, full configuration) pair per candidate value:
# the baseline with exactly one entry overridden.
all_hyperparameter_grids = [
    (swept_name, dict(fixed_params, **{swept_name: candidate}))
    for swept_name, candidates in variable_params.items()
    for candidate in candidates
]


In [None]:
metric_names = [
    "rankme", "coherence", "pseudo_condition_number",
    "alpha_req", "stable_rank", "ne_sum", "self_clustering", "ripser"
]

In [None]:
category_embedding_dims = {
    "small_group": (150, fixed_params["category_embedding_dim"]),
}

In [None]:
import os

In [None]:
checkpoints_path = "checkpoints"
os.makedirs(checkpoints_path, exist_ok=True)

In [None]:
splitter = SampleSlices(split_count=5, cnt_min=25, cnt_max=50)

In [None]:
class CustomLogger(pl.Callback):
    """Callback that prints epoch losses and tracks early-stopping progress.

    Attributes:
        early_stopping_epoch: ``None`` until an epoch ends with the trainer's
            early-stopping callback reporting ``wait_count == 0``; afterwards
            the index of the most recent such epoch.
    """

    def __init__(self):
        super().__init__()
        self.early_stopping_epoch = None

    def on_train_start(self, trainer, pl_module):
        # One instance is shared by every trainer in the sweep; without this
        # reset a run would report the epoch recorded by the previous run.
        self.early_stopping_epoch = None

    def on_train_epoch_end(self, trainer, pl_module):
        train_loss = trainer.callback_metrics.get("train_loss", None)
        val_loss = trainer.callback_metrics.get("val_loss", None)

        # Printed only when both metrics were logged under exactly these names.
        if train_loss is not None and val_loss is not None:
            print(f"Epoch {trainer.current_epoch}: Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")

        # wait_count == 0 means the early-stopping patience counter is at zero
        # at the end of this epoch; remember the latest epoch where that held.
        if trainer.early_stopping_callback is not None and trainer.early_stopping_callback.wait_count == 0:
            self.early_stopping_epoch = trainer.current_epoch


# Single CustomLogger instance; the tuning loop passes this same object to
# every trainer it creates.
custom_logger = CustomLogger()
# NOTE: the tuning loop rebinds `early_stopping_callback` to a new
# EarlyStopping that monitors "loss" for each run, so this "val_loss"
# instance is not the one handed to the trainers there.
early_stopping_callback = EarlyStopping(
    monitor="val_loss",
    patience=5,
    mode="min",
    verbose=True
)

In [None]:
! rm -rf /kaggle/working/checkpoints

In [None]:
! rm /kaggle/working/age_tr_params_tun_full.csv

In [None]:
# Upper bound on training epochs per configuration and the results file.
num_epochs = 30
output_csv = "age_tr_params_tun_full.csv"

# Metric names for which value/std/time/std-time columns are written.
metric_keys = [
    "rankme", "coherence", "pseudo_condition_number",
    "alpha_req", "stable_rank", "ne_sum", "self_clustering", "ripser"
]

# CSV schema: hyperparameters, run bookkeeping, then four columns per metric.
# "hidden_size" is already one of the fixed_params keys, so the combined list
# is de-duplicated (order preserved) to avoid writing that column twice.
columns = list(dict.fromkeys(
    list(fixed_params.keys()) +
    ["checkpoint", "epoch_num", "accuracy", "early_stop_epoch", "hidden_size"] +
    [f"metric_{k}" for k in metric_keys] +
    [f"std_metric_{k}" for k in metric_keys] +
    [f"time_{k}" for k in metric_keys] +
    [f"std_time_{k}" for k in metric_keys]
))

In [None]:
from time import time
import os
import gc
import torch
import pandas as pd
import glob
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
from functools import partial

# Hyperparameter sweep: for every configuration, train a CoLES model, evaluate
# each saved per-epoch checkpoint, append one CSV row per checkpoint, then
# delete that configuration's checkpoints.
cur_time = time()

# The number of distinct raw `small_group` values is the same for every run,
# so compute it once instead of re-scanning `sourceA` per configuration.
small_group_cardinality = len(np.unique(sourceA['small_group']))

for param in all_hyperparameter_grids:

    print(f'All params are frozen except {param[0]}')
    params = param[1]

    train_loader = create_datasets(train_dict, valid_dict, params, source_features)

    # NOTE(review): params["embedding_dim"] and params["category_embedding_dim"]
    # are never read below -- the embedding width (32) and the projection size
    # (64) are hard-coded, so grid entries that vary only those two keys build
    # identical architectures. Confirm which encoder arguments they should drive.
    sourceA_encoder_params = dict(
        embeddings_noise=0.003,
        linear_projection_size=64,
        embeddings={
            "small_group": {"in": small_group_cardinality, "out": 32}
        },
    )

    sourceB_encoder_params = dict(
        embeddings_noise=0.003,
        linear_projection_size=64,
        numeric_values={"amount_rur": "identity"},
    )

    sourceA_encoder = TrxEncoder(**sourceA_encoder_params)
    sourceB_encoder = TrxEncoder(**sourceB_encoder_params)

    seq_encoder = MultiModalSortTimeSeqEncoderContainer(
        trx_encoders={
            "sourceA": sourceA_encoder,
            "sourceB": sourceB_encoder,
        },
        input_size=64,
        hidden_size=params["hidden_size"],
        seq_encoder_cls=RnnEncoder,
        type="gru",
    )

    model = CoLESModule(
        seq_encoder=seq_encoder,
        optimizer_partial=partial(torch.optim.Adam, lr=params["learning_rate"]),
        lr_scheduler_partial=partial(torch.optim.lr_scheduler.StepLR, step_size=10, gamma=0.5),
    )

    early_stopping_callback = EarlyStopping(
        monitor="loss",
        patience=5,
        mode="min",
        verbose=True
    )

    # Built once so the checkpoint filename and the glob pattern used to find
    # the files afterwards cannot drift apart.
    checkpoint_prefix = (
        f"model_{params['batch_size']}_{params['learning_rate']}_{params['split_count']}"
        f"_{params['cnt_min']}_{params['cnt_max']}_{params['hidden_size']}"
    )

    # save_top_k=-1 with every_n_epochs=1: keep a checkpoint for every epoch.
    checkpoint_callback = ModelCheckpoint(
        dirpath=checkpoints_path,
        filename=checkpoint_prefix + "{epoch:02d}",
        save_top_k=-1,
        every_n_epochs=1,
    )

    pl_trainer = pl.Trainer(
        callbacks=[checkpoint_callback, early_stopping_callback, custom_logger],
        default_root_dir=checkpoints_path,
        check_val_every_n_epoch=1,
        max_epochs= num_epochs,
        accelerator="gpu",
        devices=1,
        enable_progress_bar=True,
        precision=16
    )

    # `custom_logger` is shared by every trainer; clear what the previous run
    # recorded so this run cannot inherit a stale epoch.
    custom_logger.early_stopping_epoch = None

    model.train()
    pl_trainer.fit(model, train_loader)

    # Fall back to num_epochs only when nothing was recorded; `or` would also
    # discard a legitimately recorded epoch 0.
    early_stop_epoch = getattr(custom_logger, "early_stopping_epoch", None)
    if early_stop_epoch is None:
        early_stop_epoch = num_epochs

    checkpoint_files = glob.glob(f"{checkpoints_path}/{checkpoint_prefix}*.ckpt")
    # Zero-padded epoch numbers make lexicographic order equal epoch order.
    checkpoint_files.sort()
    print(f"Elapsed time: {time() - cur_time:.2f} seconds")

    print(f'Early stop is {early_stop_epoch}')

    for i, checkpoint in enumerate(checkpoint_files):
        print(f"Processing checkpoint number {i}")
        model = CoLESModule.load_from_checkpoint(checkpoint, seq_encoder=seq_encoder)

        metrics, times, std_metrics, std_times, accuracy = evaluate_model(model, pl_trainer, checkpoint)

        metrics_flattened = {f"metric_{k}": round(v, 4) for k, v in metrics.items()}
        std_metrics_flattened = {f"std_metric_{k}": round(v, 4) for k, v in std_metrics.items()}
        times_flattened = {f"time_{k}": round(v, 4) for k, v in times.items()}
        std_times_flattened = {f"std_time_{k}": round(v, 4) for k, v in std_times.items()}

        new_result = {
            **params,
            "checkpoint": checkpoint,
            "epoch_num": int(i),
            "accuracy": accuracy,
            "early_stop_epoch": int(early_stop_epoch),
            **metrics_flattened,
            **std_metrics_flattened,
            **times_flattened,
            **std_times_flattened,
        }

        results = pd.DataFrame([new_result], columns=columns)
        print('----------')
        print(results["early_stop_epoch"])

        # Write the header once, then append one row per checkpoint so partial
        # results survive an interrupted sweep.
        if not os.path.exists(output_csv):
            pd.DataFrame(columns=columns).to_csv(output_csv, mode="w", index=False, header=True)

        results.to_csv(output_csv, mode="a", header=False, index=False)

        del metrics, accuracy, new_result
        torch.cuda.empty_cache()
        gc.collect()

    print(f"Removing checkpoints for parameters: {params}")
    for checkpoint in checkpoint_files:
        os.remove(checkpoint)

    del model
    del train_loader
    torch.cuda.empty_cache()
    gc.collect()

print("Optimization complete!")