In [None]:
# Install dependencies if needed (uncomment the line below to run it).
# `%pip` installs into the environment of the running kernel.
# %pip install polars numpy pandas

In [1]:
import polars as pl
import numpy as np
from datetime import datetime, timedelta
from pathlib import Path

# EventFlow imports
# Prepend the `src/` directory that sits next to the parent of the current
# working directory to the import path, so `eventflow` can be imported without
# being installed.
# NOTE(review): this presumably expects the notebook to be launched from a
# folder one level below the repository root — confirm.
import sys
sys.path.insert(0, str(Path.cwd().parent / "src"))

from eventflow.core.adapters import (
    TableAdapter, TableAdapterConfig,
    SequenceAdapter, SequenceAdapterConfig,
    RasterAdapter, RasterAdapterConfig,
    GraphAdapter, GraphAdapterConfig,
    StreamAdapter, StreamAdapterConfig,
)

# Only reached when every adapter import above succeeded.
print("EventFlow adapters loaded successfully!")

EventFlow adapters loaded successfully!


## 1. Load Sample Chicago Crime Data

We'll generate a synthetic sample dataset that mimics Chicago crime records, with spatial and temporal features. No real data is loaded.

In [2]:
# Build a synthetic stand-in for Chicago crime records.
# Every random draw below comes from the single global NumPy stream seeded
# here, so the ORDER of the np.random calls determines the generated values.
np.random.seed(42)
n_records = 1000

# Approximate bounding box of Chicago, in degrees
lat_min, lat_max = 41.64, 42.02
lon_min, lon_max = -87.94, -87.52

# Categories sampled for the `primary_type` column
crime_types = ["THEFT", "BATTERY", "ASSAULT", "BURGLARY", "ROBBERY", "NARCOTICS"]

# Event times: whole-hour offsets from the base date within a 30-day window
base_date = datetime(2024, 1, 1)
hours_in_window = 24 * 30
dates = [
    base_date + timedelta(hours=np.random.randint(0, hours_in_window))
    for _ in range(n_records)
]

# Draw each column in turn (same sequence as the column order in the frame)
case_ids = [f"JE{100000 + idx}" for idx in range(n_records)]
latitudes = np.random.uniform(lat_min, lat_max, n_records)
longitudes = np.random.uniform(lon_min, lon_max, n_records)
primary_types = np.random.choice(crime_types, n_records)
beats = np.random.randint(100, 2500, n_records)
districts = np.random.randint(1, 26, n_records)
domestic_flags = np.random.choice([True, False], n_records, p=[0.15, 0.85])
arrest_flags = np.random.choice([True, False], n_records, p=[0.25, 0.75])

# Assemble the event-level frame
crime_df = pl.DataFrame({
    "case_id": case_ids,
    "timestamp": dates,
    "latitude": latitudes,
    "longitude": longitudes,
    "primary_type": primary_types,
    "beat": beats,
    "district": districts,
    "domestic": domestic_flags,
    "arrest": arrest_flags,
})

print(f"Generated {len(crime_df)} crime records")
crime_df.head(10)

Generated 1000 crime records


case_id,timestamp,latitude,longitude,primary_type,beat,district,domestic,arrest
str,datetime[μs],f64,f64,str,i32,i32,bool,bool
"""JE100000""",2024-01-05 06:00:00,41.992051,-87.828537,"""ROBBERY""",2107,17,False,False
"""JE100001""",2024-01-19 03:00:00,41.914207,-87.865338,"""NARCOTICS""",1463,10,True,True
"""JE100002""",2024-01-12 06:00:00,41.658276,-87.64952,"""ASSAULT""",2487,1,False,True
"""JE100003""",2024-01-05 10:00:00,41.936976,-87.680418,"""BURGLARY""",303,3,False,True
"""JE100004""",2024-01-03 23:00:00,41.954618,-87.67406,"""ASSAULT""",260,8,False,False
"""JE100005""",2024-01-30 04:00:00,41.925191,-87.843864,"""ROBBERY""",491,8,False,False
"""JE100006""",2024-01-01 20:00:00,41.943824,-87.842289,"""THEFT""",868,20,False,False
"""JE100007""",2024-01-26 14:00:00,41.95355,-87.847263,"""ROBBERY""",1193,22,True,True
"""JE100008""",2024-01-06 01:00:00,41.710834,-87.867024,"""BATTERY""",504,15,False,False
"""JE100009""",2024-01-20 10:00:00,41.729563,-87.664596,"""BATTERY""",707,24,False,False


## 2. Create Aggregated Event Data

Aggregate crimes by spatial grid cell and time window to create event counts suitable for modeling.

In [3]:
# Create spatial grid bins (10x10 grid over Chicago)
n_lat_bins, n_lon_bins = 10, 10

# Map each coordinate to a bin index in [0, n_bins - 1].
# The index is clipped because a point lying exactly on the upper bound (or
# outside the approximate bounding box) would otherwise produce a bin of
# n_bins or a negative bin. That would make `cell_id` collide with a
# neighbouring row or fall outside the (n_lat_bins, n_lon_bins) grid.
crime_df = crime_df.with_columns([
    ((pl.col("latitude") - lat_min) / (lat_max - lat_min) * n_lat_bins)
    .floor()
    .cast(pl.Int32)
    .clip(0, n_lat_bins - 1)
    .alias("lat_bin"),
    ((pl.col("longitude") - lon_min) / (lon_max - lon_min) * n_lon_bins)
    .floor()
    .cast(pl.Int32)
    .clip(0, n_lon_bins - 1)
    .alias("lon_bin"),
    pl.col("timestamp").dt.date().alias("date"),
    pl.col("timestamp").dt.hour().alias("hour"),
])

# Create spatial cell ID (row-major index: lat_bin selects the row, lon_bin the column)
crime_df = crime_df.with_columns(
    (pl.col("lat_bin") * n_lon_bins + pl.col("lon_bin")).alias("cell_id")
)

# Aggregate by cell and date.
# Only cell-days with at least one event produce a row; cell-days with no
# events are absent from `daily_counts` rather than present with a zero count.
daily_counts = crime_df.group_by(["cell_id", "date", "lat_bin", "lon_bin"]).agg([
    pl.len().alias("event_count"),
    pl.col("arrest").sum().alias("arrest_count"),
    pl.col("domestic").sum().alias("domestic_count"),
    pl.col("latitude").mean().alias("centroid_lat"),
    pl.col("longitude").mean().alias("centroid_lon"),
]).sort(["date", "cell_id"])

print(f"Aggregated to {len(daily_counts)} cell-day observations")
daily_counts.head(10)

Aggregated to 858 cell-day observations


cell_id,date,lat_bin,lon_bin,event_count,arrest_count,domestic_count,centroid_lat,centroid_lon
i32,date,i32,i32,u32,u32,u32,f64,f64
8,2024-01-01,0,8,1,1,0,41.673646,-87.577965
10,2024-01-01,1,0,1,0,0,41.692895,-87.926803
20,2024-01-01,2,0,1,0,0,41.735961,-87.937182
21,2024-01-01,2,1,1,0,0,41.722849,-87.896262
24,2024-01-01,2,4,1,0,0,41.72837,-87.748934
29,2024-01-01,2,9,1,0,0,41.735953,-87.534178
30,2024-01-01,3,0,1,0,1,41.770208,-87.902955
32,2024-01-01,3,2,1,0,1,41.769614,-87.818213
33,2024-01-01,3,3,1,0,1,41.76647,-87.80413
36,2024-01-01,3,6,1,1,0,41.776477,-87.648149


## 3. TableAdapter — For GLM/Poisson Regression

Convert to tabular format suitable for count regression models with offset for exposure.

In [4]:
# Derive the extra columns needed for the tabular (count-regression) view.
# NOTE(review): `exposure` is a constant 1.0 and is passed as the offset column.
# Whether TableAdapter expects the offset on the raw or the log scale is not
# visible here — confirm before using a non-uniform exposure.
day_of_week_expr = pl.col("date").dt.weekday().alias("day_of_week")
exposure_expr = pl.lit(1.0).alias("exposure")  # uniform exposure for this demo
table_df = daily_counts.with_columns([day_of_week_expr, exposure_expr])

# Describe which columns play which role.
# NOTE(review): arrest_count and domestic_count are computed from the same
# events as the target, so they are only illustrative predictors.
table_feature_cols = ["lat_bin", "lon_bin", "day_of_week", "arrest_count", "domestic_count"]
table_config = TableAdapterConfig(
    target_col="event_count",
    feature_cols=table_feature_cols,
    offset_col="exposure",
    include_intercept=True,
)

# Run the conversion
table_adapter = TableAdapter(table_config)
table_output = table_adapter.convert(table_df)

for label, value in (
    ("Table shape", table_output.data.shape),
    ("Feature names", table_output.feature_names),
    ("Target column", table_output.target),
):
    print(f"{label}: {value}")

# Pull out the design matrix and target for sklearn-style estimators
X, y = table_output.get_X_y()
y_shape = "None" if y is None else y.shape
print(f"\nX shape: {X.shape}, y shape: {y_shape}")
print(f"X[:5]:\n{X[:5]}")

2025-12-01 02:49:09,144 - eventflow.core.adapters.table - INFO - Converting EventFrame to table format
2025-12-01 02:49:09,147 - eventflow.core.adapters.table - INFO - Created table with 858 rows, 6 features


Table shape: (858, 8)
Feature names: ['_intercept', 'lat_bin', 'lon_bin', 'day_of_week', 'arrest_count', 'domestic_count']
Target column: event_count

X shape: (858, 6), y shape: (858,)
X[:5]:
[[1. 0. 8. 1. 1. 0.]
 [1. 1. 0. 1. 0. 0.]
 [1. 2. 0. 1. 0. 0.]
 [1. 2. 1. 1. 0. 0.]
 [1. 2. 4. 1. 0. 0.]]


## 4. SequenceAdapter — For RNN/Transformer Models

Convert time series per location into padded sequences with attention masks.

In [5]:
# One time series per grid cell: keep only the columns the adapter needs and
# order rows by cell, then by date.
sequence_data = (
    daily_counts
    .select(["cell_id", "date", "event_count"])
    .sort(["cell_id", "date"])
)

print(f"Sequence data rows: {len(sequence_data)}")

# SequenceAdapter settings, applied to the aggregated data directly
seq_config = SequenceAdapterConfig(
    spatial_col="cell_id",
    timestamp_col="date",
    feature_cols=["event_count"],
    sequence_length=30,  # at most 30 days per sequence
    padding_value=0.0,
    padding_side="right",
)

# Convert to padded sequences plus masks
seq_adapter = SequenceAdapter(seq_config)
seq_output = seq_adapter.convert(sequence_data)

# Summarise the result and peek at the first sequence
first_sequence_head = seq_output.sequences[0, :10]
first_mask_head = seq_output.masks[0, :10]

print(f"\nSequence tensor shape: {seq_output.sequences.shape}")
print(f"Attention mask shape: {seq_output.masks.shape}")
print(f"Sequence lengths (first 10): {seq_output.lengths[:10]}")
print(f"\nFirst sequence (first 10 values): {first_sequence_head}")
print(f"First mask (first 10 values): {first_mask_head}")

2025-12-01 02:49:18,792 - eventflow.core.adapters.sequence - INFO - Converting EventFrame to sequence format
2025-12-01 02:49:18,812 - eventflow.core.adapters.sequence - INFO - Created 100 sequences with max_length=30, n_features=1


Sequence data rows: 858

Sequence tensor shape: (100, 30, 1)
Attention mask shape: (100, 30)
Sequence lengths (first 10): [10  9  8  9 10  4  8  6 11  8]

First sequence (first 10 values): [[1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]]
First mask (first 10 values): [ True  True  True  True  True  True  True  True  True  True]


## 5. RasterAdapter — For CNN Models

Convert spatial-temporal data into 4D arrays (time × channels × height × width) for convolutional networks.

In [6]:
# RasterAdapter settings: one channel per feature column, laid out on the
# same (n_lat_bins, n_lon_bins) grid that was used to build `cell_id`.
raster_channels = ["event_count", "arrest_count"]
raster_config = RasterAdapterConfig(
    grid_col="cell_id",
    timestamp_col="date",
    feature_cols=raster_channels,
    grid_shape=(n_lat_bins, n_lon_bins),
    channel_first=True,  # channels before the spatial axes (PyTorch-style)
    fill_value=0.0,
)

# Rasterise the daily cell counts
raster_adapter = RasterAdapter(raster_config)
raster_output = raster_adapter.convert(daily_counts)

# Report the result and show the first channel of the first timestep
first_frame = raster_output.raster[0, 0]

print(f"Raster shape: {raster_output.raster.shape}")
print("Shape interpretation: (time_steps, channels, height, width)")
print(f"Timestamps: {len(raster_output.timestamps)} unique dates")
print(f"Channels: {raster_output.channel_names}")
print(f"\nFirst timestep, first channel (event_count) - 10x10 grid:\n{first_frame}")

2025-12-01 02:49:47,499 - eventflow.core.adapters.raster - INFO - Converting EventFrame to raster format
2025-12-01 02:49:47,505 - eventflow.core.adapters.raster - INFO - Created raster with shape (30, 2, 10, 10): 30 timesteps, 2 channels, 10x10 grid


Raster shape: (30, 2, 10, 10)
Shape interpretation: (time_steps, channels, height, width)
Timestamps: 30 unique dates
Channels: ['event_count', 'arrest_count']

First timestep, first channel (event_count) - 10x10 grid:
[[0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 1. 0. 0. 1. 0. 0. 0. 0. 1.]
 [1. 0. 1. 1. 0. 0. 1. 0. 0. 1.]
 [1. 0. 0. 1. 0. 0. 1. 0. 0. 1.]
 [2. 1. 0. 1. 1. 0. 0. 0. 2. 0.]
 [0. 0. 0. 0. 0. 1. 1. 1. 1. 0.]
 [0. 0. 1. 0. 0. 1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 2. 0. 1. 0. 0.]
 [0. 2. 2. 1. 0. 0. 0. 1. 0. 0.]]


## 6. GraphAdapter — For GNN Models

Convert spatial relationships into graph structure with node features and adjacency.

In [None]:
# Prepare graph data: nodes are cells, edges connect neighbors.
# Aggregate features per cell (across all dates).
#
# `daily_counts` holds one row per cell-day with a per-day centroid, so the
# cell-level position must weight each day by its number of events. A plain
# mean of the daily centroids would give a one-event day the same influence
# as a many-event day and would not equal the centroid of the cell's events.
# `event_count` is at least 1 on every row, so the divisor is never zero.
event_weight = pl.col("event_count")

node_features_df = daily_counts.group_by("cell_id").agg([
    event_weight.sum().alias("total_events"),
    # Mean over days that had at least one event (zero-event days have no row)
    event_weight.mean().alias("avg_events"),
    pl.col("arrest_count").sum().alias("total_arrests"),
    ((pl.col("centroid_lat") * event_weight).sum() / event_weight.sum()).alias("lat"),
    ((pl.col("centroid_lon") * event_weight).sum() / event_weight.sum()).alias("lon"),
    pl.col("lat_bin").first().alias("lat_bin"),
    pl.col("lon_bin").first().alias("lon_bin"),
]).sort("cell_id")

print(f"Nodes (cells): {len(node_features_df)}")

# Configure GraphAdapter
# NOTE(review): the threshold is presumably in the same units as lat/lon
# (degrees), which would make 0.05 roughly 5 km — confirm against the adapter.
graph_config = GraphAdapterConfig(
    node_col="cell_id",
    feature_cols=["total_events", "avg_events", "total_arrests", "lat", "lon"],
    adjacency_type="spatial",
    spatial_threshold=0.05,  # connect nodes within ~5km
    include_self_loops=True,
)

graph_adapter = GraphAdapter(graph_config)
graph_output = graph_adapter.convert(node_features_df)

print(f"\nNode features shape: {graph_output.node_features.shape}")
print(f"Edge index shape: {graph_output.edge_index.shape}")
print(f"Number of edges: {graph_output.edge_index.shape[1]}")
print(f"Adjacency matrix shape: {graph_output.adjacency.shape}")
print(f"\nFirst 5 edges (src → dst): {list(zip(graph_output.edge_index[0, :5], graph_output.edge_index[1, :5]))}")

## 7. StreamAdapter — For Neural ODE Models

Convert to continuous-time event streams with inter-event times for temporal point processes.

In [None]:
# The stream view works on individual events, not on the daily aggregates.
# `cell_id` is available on `crime_df` because the aggregation cell added it.
stream_columns = [
    "case_id",
    "timestamp",
    "primary_type",
    "cell_id",
    "latitude",
    "longitude",
]
stream_df = crime_df.select(stream_columns).sort("timestamp")

print(f"Events for streaming: {len(stream_df)}")

# StreamAdapter settings
stream_config = StreamAdapterConfig(
    timestamp_col="timestamp",
    event_type_col="primary_type",
    state_cols=["latitude", "longitude"],
    time_scale="normalize",  # normalize timestamps
    time_origin="first",  # relative to first event
)

# Convert the ordered events into a continuous-time stream
stream_adapter = StreamAdapter(stream_config)
stream_output = stream_adapter.convert(stream_df)

# Shapes of each output array
print(f"\nTimestamps shape: {stream_output.timestamps.shape}")
print(f"States shape: {stream_output.states.shape}")
print(f"Inter-event times shape: {stream_output.inter_times.shape}")
print(f"Event types shape: {stream_output.event_types.shape}")

# Distinct event types and a preview of the time arrays
distinct_event_types = np.unique(stream_output.event_types)
print(f"Unique event types: {distinct_event_types}")
print(f"\nFirst 10 normalized timestamps: {stream_output.timestamps[:10]}")
print(f"First 10 inter-event times: {stream_output.inter_times[:10]}")

## Summary

This demo showed how EventFlow transforms raw crime data into five different ML-ready formats:

| Adapter | Output | Use Case |
|---------|--------|----------|
| **TableAdapter** | DataFrame with X, y | GLM, XGBoost, Random Forest |
| **SequenceAdapter** | Padded tensors + masks | LSTM, Transformer |
| **RasterAdapter** | 4D arrays (T, C, H, W) | CNN, ConvLSTM |
| **GraphAdapter** | Node features + edges | GCN, GAT, GraphSAGE |
| **StreamAdapter** | Continuous timestamps | Neural ODE, TPP |

All adapters support serialization to Parquet, NumPy, and PyTorch formats for seamless integration with ML pipelines.