In [2]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from pdb import set_trace

def split_into_chunks(df: pd.DataFrame, chunk_size: int=1, timestamp_col: str='timestamp') -> list[pd.DataFrame]:
    if chunk_size < 1:
        raise ValueError("chunk_size must be at least 1")
    unique_ts, ts_indices = np.unique(df[timestamp_col].values, return_index=True)
    unique_ts = unique_ts[np.argsort(ts_indices)] # Maintain original order, to preserve time relations

    chunk_indices = np.arange(len(unique_ts)) // chunk_size
    chunk_mapper = pd.Series(chunk_indices, index=unique_ts)
    
    chunk_ids = df[timestamp_col].map(chunk_mapper).values # Map original ts to chunk ids
    
    sort_idx = np.argsort(chunk_ids) # Find sorting indexes to preserve chunk_ids's shape
    sorted_chunk_ids = chunk_ids[sort_idx] # Reorder chunk_ids according to sorting indexes
    split_points = np.where(np.diff(sorted_chunk_ids))[0] + 1 # Detect the change between chunk_ids, where new chunk starts and split there
    chunks = np.split(df.iloc[sort_idx], split_points) # Split the dataframe into chunks by sort_idx
    
    return [chunk.sort_index() for chunk in chunks] # Restore original order within chunks

def generate_test_data(
    num_rows=100,
    num_columns=3,
    start_timestamp="2025-01-01 00:00:00",
    max_duplicates=3,
    time_frequency=10
):
    base_ts = [pd.Timestamp(start_timestamp) + timedelta(seconds=np.random.randint(1, time_frequency))
              for i in range(num_rows)]

    timestamps = []
    for ts in base_ts:
        timestamps.extend([ts] * np.random.randint(1, max_duplicates + 1))

    timestamps = np.random.choice(timestamps, size=num_rows)

    data = np.random.rand(num_rows, num_columns)
    columns = [f'col_{i}' for i in range(num_columns)]
    
    df = pd.DataFrame(data, columns=columns)
    df.insert(0, 'timestamp', timestamps)
    
    return df

In [7]:
test_df = generate_test_data(
    num_rows=200,
    num_columns=5,
    start_timestamp="2025-01-01 00:00:00",
    max_duplicates=5,
    time_frequency=10
)

print("Generated DataFrame:")
display(test_df)
print("\nTimestamps distribution between chunks:")
print(test_df['timestamp'].value_counts().sort_index())

chunks = split_into_chunks(test_df, 2)

print(f"\nNumber of chunks created: {len(chunks)}")
_ = [print(f"\nChunk {i} | Timestamps: {chunk.timestamp.unique()} | Rows: {len(chunk)}") for i, chunk in enumerate(chunks)]
for i in range(len(chunks)):
    print(f"\nHead of chunk {i}")
    print(chunks[i].sample(5))

Generated DataFrame:


Unnamed: 0,timestamp,col_0,col_1,col_2,col_3,col_4
0,2025-01-01 00:00:05,0.711485,0.557581,0.164389,0.473505,0.607735
1,2025-01-01 00:00:04,0.683196,0.833589,0.349409,0.882958,0.639881
2,2025-01-01 00:00:07,0.121159,0.013314,0.557835,0.122895,0.266255
3,2025-01-01 00:00:05,0.206433,0.955176,0.208173,0.640285,0.334138
4,2025-01-01 00:00:06,0.181188,0.977880,0.281319,0.749194,0.050339
...,...,...,...,...,...,...
195,2025-01-01 00:00:03,0.768990,0.675231,0.219112,0.212412,0.652473
196,2025-01-01 00:00:04,0.838449,0.428258,0.214649,0.973914,0.072284
197,2025-01-01 00:00:02,0.233248,0.839193,0.487631,0.117524,0.787221
198,2025-01-01 00:00:05,0.701294,0.830080,0.443356,0.538522,0.626774



Timestamps distribution between chunks:
timestamp
2025-01-01 00:00:01    24
2025-01-01 00:00:02    28
2025-01-01 00:00:03    27
2025-01-01 00:00:04    20
2025-01-01 00:00:05    16
2025-01-01 00:00:06    19
2025-01-01 00:00:07    27
2025-01-01 00:00:08    23
2025-01-01 00:00:09    16
Name: count, dtype: int64

Number of chunks created: 5

Chunk 0 | Timestamps: <DatetimeArray>
['2025-01-01 00:00:05', '2025-01-01 00:00:04']
Length: 2, dtype: datetime64[ns] | Rows: 36

Chunk 1 | Timestamps: <DatetimeArray>
['2025-01-01 00:00:07', '2025-01-01 00:00:06']
Length: 2, dtype: datetime64[ns] | Rows: 46

Chunk 2 | Timestamps: <DatetimeArray>
['2025-01-01 00:00:01', '2025-01-01 00:00:08']
Length: 2, dtype: datetime64[ns] | Rows: 47

Chunk 3 | Timestamps: <DatetimeArray>
['2025-01-01 00:00:03', '2025-01-01 00:00:02']
Length: 2, dtype: datetime64[ns] | Rows: 55

Chunk 4 | Timestamps: <DatetimeArray>
['2025-01-01 00:00:09']
Length: 1, dtype: datetime64[ns] | Rows: 16

Head of chunk 0
              ti

  return bound(*args, **kwds)
