In [1]:

from fim.data.dataloaders import DataLoaderFactory
import matplotlib.pyplot as plt
from pprint import pprint

In [2]:
# Directory that contains the synthetic Hawkes dataset; the "train" and "val"
# sub-folders below it are the two splits used by this notebook. Point this
# single constant at another copy of the data instead of editing every path.
HAWKES_DATA_DIR = (
    "/home/berghaus/FoundationModels/FIM/data/synthetic_data/hawkes/"
    "1k_5_st_hawkes_mixed_2000_paths_250_events"
)

dataset_config = {
    "name": "HawkesDataLoader",
    # Every split maps to a 1-tuple of directories (note the trailing commas).
    "path_collections": {
        "train": (f"{HAWKES_DATA_DIR}/train",),
        "validation": (f"{HAWKES_DATA_DIR}/val",),
    },
    # Batching / sampling options. Their exact semantics are defined by the
    # loader implementation, which is not part of this notebook.
    "loader_kwargs": {
        "batch_size": 1,
        "num_workers": 8,
        "test_batch_size": 1,
        "variable_num_of_paths": True,
        "min_path_count": 100,
        "max_path_count": 1000,
        "max_number_of_minibatch_sizes": 10,
        "variable_sequence_lens": True,
        "min_sequence_len": 10,
        "max_sequence_len": 250,
        "num_kernel_evaluation_points": 10,
        "is_bulk_model": False,
    },
    # Maps the field name used in each sample to the file it is loaded from.
    "dataset_kwargs": {
        "files_to_load": {
            "base_intensities": "base_intensities.pt",
            "event_times": "event_times.pt",
            "event_types": "event_types.pt",
            "kernel_evaluations": "kernel_evaluations.pt",
            "kernel_grids": "kernel_grids.pt",
        },
    },
}

In [None]:
from torch import Tensor
import torch


def normalize_obs_grid(obs_grid: Tensor, seq_lengths: Tensor) -> Tensor:
    """Divide every batch element of ``obs_grid`` by its largest final observation time.

    For each (batch, path) pair the entry at index ``seq_lengths - 1`` along the
    third axis is looked up. The maximum of these entries over all paths of a
    batch element is that element's normalization constant, and the whole
    batch element is divided by it.

    Args:
        obs_grid: 4-D tensor; the constant is broadcast via ``view(-1, 1, 1, 1)``.
            Presumably laid out as [batch, path, event, 1] -- TODO confirm.
        seq_lengths: Integer tensor used as an index together with the
            [batch, path] index grids, so it must be broadcastable against
            that shape. A length of 0 produces index -1, i.e. the last slot.

    Returns:
        A tensor with the same shape as ``obs_grid`` holding the rescaled values.
        No guard is applied for a normalization constant of 0.
    """
    num_batches, num_paths = obs_grid.size(0), obs_grid.size(1)
    device = obs_grid.device

    # Index grids that pair every batch index with every path index.
    batch_indices = torch.arange(num_batches, device=device).view(-1, 1).expand(-1, num_paths)
    path_indices = torch.arange(num_paths, device=device).view(1, -1).expand(num_batches, -1)

    # Value stored at the last valid position of every path.
    max_times = obs_grid[batch_indices, path_indices, seq_lengths - 1]

    # One constant per batch element: the largest final time over all its paths.
    norm_constants = max_times.amax(dim=[1, 2])
    return obs_grid / norm_constants.view(-1, 1, 1, 1)

In [4]:
# Build the data loader: the top-level keys of `dataset_config` are passed
# to the factory as keyword arguments.
dataloader = DataLoaderFactory.create(**dataset_config)

In [5]:
import torch

# Number of training batches to pull from the loader.
num_samples = 10

data = []
# zip() with the range first stops after exactly `num_samples` batches and
# never asks the loader for a batch that would only be thrown away.
for _, sample in zip(range(num_samples), dataloader.train_it):
    event_times = normalize_obs_grid(sample["event_times"], sample["seq_lengths"])

    # Gaps between consecutive entries along dim 2.
    gaps = event_times[:, :, 1:] - event_times[:, :, :-1]
    # Add a delta time of 0 for the first event. The zero slice is shaped from
    # `event_times` (not from `gaps`) so it exists even if there is only one slot.
    first_delta = torch.zeros_like(event_times[:, :, :1])

    sample["event_times"] = event_times
    sample["delta_times"] = torch.cat([first_delta, gaps], dim=2)
    data.append(sample)

In [6]:
# Show which fields the first collected sample carries.
data[0].keys()

dict_keys(['base_intensities', 'event_times', 'event_types', 'kernel_evaluations', 'kernel_grids', 'seq_lengths', 'delta_times'])

In [7]:
import numpy as np
from datasets import Dataset


def convert_to_easytpp_format(sample, dim_process: int = 1):
    """Convert the first batch element of ``sample`` into a Hugging Face ``Dataset``.

    Only index 0 of the leading axis is used. Every path of that batch element
    becomes one row, truncated to its own sequence length.

    Args:
        sample: Mapping with the tensors "event_times", "delta_times",
            "event_types" (indexed as [batch, path, event, 0]) and
            "seq_lengths" (indexed as [batch, path]).
        dim_process: Value written to the "dim_process" field of every row.
            Defaults to 1, the value that used to be hard-coded.
            NOTE(review): this field presumably denotes the number of event
            types in EasyTPP; the data used in this notebook appears to
            contain several marks, so callers probably want to pass the real
            count -- confirm.

    Returns:
        A ``Dataset`` with one row per path and the fields "time_since_start",
        "time_since_last_event", "type_event", "seq_idx", "seq_len" and
        "dim_process".
    """
    # Drop the leading batch axis; only the first batch element is converted.
    event_times = sample["event_times"][0]
    delta_times = sample["delta_times"][0]
    event_types = sample["event_types"][0]
    seq_lengths = sample["seq_lengths"][0]

    easytpp_data = []
    for seq_idx in range(event_times.shape[0]):
        seq_len = seq_lengths[seq_idx].item()

        # Tensor.tolist() yields plain Python scalars directly; no detour via
        # NumPy is needed. Slicing to `seq_len` keeps only the valid events.
        easytpp_data.append(
            {
                "time_since_start": event_times[seq_idx, :seq_len, 0].tolist(),
                "time_since_last_event": delta_times[seq_idx, :seq_len, 0].tolist(),
                "type_event": event_types[seq_idx, :seq_len, 0].tolist(),
                "seq_idx": seq_idx,
                "seq_len": seq_len,
                "dim_process": dim_process,
            }
        )

    return Dataset.from_list(easytpp_data)

In [8]:
# Turn every collected sample into its own EasyTPP-style dataset.
converted_datasets = list(map(convert_to_easytpp_format, data))

# Wrap each converted dataset in a dict that exposes it under the "test" split.
# NOTE: this rebinds `data` from the raw samples to the wrapped datasets, so
# the cell cannot be re-run without re-running the sample collection first.
data = [{"test": split_dataset} for split_dataset in converted_datasets]

In [None]:
def print_stats(dataset, split='test'):
    """Pretty-print summary statistics for one split of an EasyTPP-style dataset.

    Args:
        dataset: Mapping from split name to an iterable of sequence records.
            Each record provides 'time_since_last_event' and 'type_event';
            'time_since_start' is used when present.
        split: Key of the split to summarize.

    Prints (via ``pprint``) a dict with the number of sequences, min/max/avg
    sequence length, min/max/avg of the inter-event times (reported under the
    ``*_event_time`` keys), the largest 'time_since_start' value
    (``max_total_time``) and the number of distinct marks. Statistics that are
    undefined for an empty split are reported as ``None`` instead of raising.
    """
    seq_lengths = []
    delta_times = []
    start_times = []
    marks = set()

    # Single pass over the split; every record is read only once.
    for seq in dataset[split]:
        deltas = seq['time_since_last_event']
        seq_lengths.append(len(deltas))
        delta_times.extend(deltas)
        # .get keeps records without absolute times working.
        start_times.extend(seq.get('time_since_start', ()))
        marks.update(seq['type_event'])

    def _mean(values):
        # Average of a list, or None when there is nothing to average.
        return sum(values) / len(values) if values else None

    stats = {
        "num_sequences": len(seq_lengths),
        "max_sequence_length": max(seq_lengths, default=None),
        "min_sequence_length": min(seq_lengths, default=None),
        "avg_sequence_length": _mean(seq_lengths),
        "max_event_time": max(delta_times, default=None),
        "min_event_time": min(delta_times, default=None),
        "avg_event_time": _mean(delta_times),
        "max_total_time": max(start_times, default=None),
        "num_marks": len(marks),
    }
    pprint(stats)

In [10]:
print_stats(data[0])

{'avg_event_time': 0.014049674406222562,
 'avg_sequence_length': 34.63,
 'max_event_time': 0.16662979125976562,
 'max_sequence_length': 85,
 'max_total_time': 1.0,
 'min_event_time': 0.0,
 'min_sequence_length': 20,
 'num_marks': 5,
 'num_sequences': 100}
