In [3]:
import polars as pl
import datashader as ds
import datashader.transfer_functions as tf
from datetime import timedelta, datetime
import matplotlib.pyplot as plt
from threading import Lock
import numpy as np
import warnings

pl.Config.set_engine_affinity("streaming")
pl.Config.set_streaming_chunk_size(200000)

polars.config.Config

In [2]:
import os

os.environ["SHELL"] = "/app/bin/host-spawn"
os.environ["PATH"] = (
    "/var/home/caleb/Projects/stream-plotting/.venv/bin:/home/caleb/.local/share/zinit/plugins/starship---starship:/home/caleb/.local/share/zinit/polaris/bin:/home/linuxbrew/.linuxbrew/bin:/home/linuxbrew/.linuxbrew/sbin:/home/caleb/.cargo/bin:/usr/local/bin:/usr/bin:/bin:/usr/local/sbin:/usr/sbin:/sbin"
)
os.environ["POLARS_VISUALIZE_PHYSICAL_PLAN"] = (
    "/home/caleb/Projects/stream-plotting/a.dot"
)

In [28]:
all_lf = []
for i in range(1, 1000):
    lf = pl.LazyFrame()
    lf = lf.with_columns(
        pl.datetime_range(
            datetime(
                2001, 1, (((i - 1) // 60) // 24) + 1, ((i - 1) // 60) % 24, (i - 1) % 60
            ),
            datetime(2001, 1, ((i // 60) // 24) + 1, (i // 60) % 24, i % 60),
            "123ns",
            eager=False,
        )
        .sample(100_000_000)
        .sort()
        .alias("time")
    )
    lf = lf.with_columns(
        (pl.int_range(100000).sample(100_000_000, with_replacement=True) / 10000).alias(
            "x"
        ),
        (pl.int_range(100000).sample(100_000_000, with_replacement=True) / 10000).alias(
            "y"
        ),
        (pl.int_range(10).sample(100_000_000, with_replacement=True)).alias(
            "group"
        ),
        (pl.int_range(100000).sample(100_000_000, with_replacement=True) / 10000).alias(
            "colour"
        ),
    )
    lf.sink_parquet(f"test/test{i}.parquet")

KeyboardInterrupt: 

In [4]:
# pl.scan_parquet("test/").head().collect()

In [4]:
lf = pl.scan_parquet("test/*")

In [None]:
def plot_with_datashader(
    plots_dict: dict,
    df: pl.DataFrame,
    x: str,
    y: str,
    c: str,
    period_ns: int,
    plot_width=800,
    plot_height=600,
    out_dir=".",
    prefix="plot",
) -> pl.DataFrame:
    """
    Plots a Polars DataFrame using Datashader and returns the same DataFrame.
    Calculates plot index using time and metadata, and uses it in the output filename.

    Additional Parameters:
    - start_time (int): Epoch timestamp in nanoseconds of the full data start.
    - period_ns (int): Period in nanoseconds for splitting.
    - out_dir (str): Output directory for saving plots.
    - prefix (str): Prefix for filenames.
    """
    # Determine plot index from min x (timestamp)
    first_row = df[0]
    group_len = first_row["len"].item()
    plot_idx = first_row["group_idx"].item()
    x_min = first_row[f"{x}_min"].item()
    x_max = first_row[f"{x}_max"].item()
    y_min = first_row[f"{y}_min"].item()
    y_max = first_row[f"{y}_max"].item()
    pdf = df.to_pandas()
    # Create canvas and render
    cvs = ds.Canvas(
        plot_width=plot_width,
        plot_height=plot_height,
        x_range=(x_min, x_max),
        y_range=(y_min, y_max),
    )
    current_agg = cvs.points(pdf, x, y, ds.mean(c))
    lock_key = f"{plot_idx}_lock"
    count_key = f"{plot_idx}_count"

    # Initialize shared memory keys if not present
    with plots_dict.setdefault(f"{plot_idx}_lock", Lock()):
        existing = plots_dict.get(plot_idx)
        if existing is None:
            plots_dict[plot_idx] = current_agg.values.copy()
            plots_dict[count_key] = np.array(df.height)
        else:
            np.nanmean(
                np.dstack((existing, current_agg.values)),
                axis=2,
                out=plots_dict[plot_idx],
            )
            plots_dict[count_key] += np.array(df.height)

    if plots_dict[count_key] == np.array(group_len):
        total = np.nan_to_num(plots_dict[plot_idx], nan=0.0)
        current_agg.values = total
        # Save the image
        fig, ax = plt.subplots()
        fig.set_figheight(plot_height / 100)
        fig.set_figwidth(plot_width / 100)
        ax.imshow(tf.shade(current_agg).to_pil(), aspect="auto")
        ax.axis("off")
        filename = f"{out_dir}/{prefix}_{period_ns}_{plot_idx}.png"
        fig.savefig(filename, bbox_inches="tight", pad_inches=0)
        plt.close(fig)
        del plots_dict[count_key]
        del plots_dict[lock_key]
        del plots_dict[plot_idx]


def plot_lf(
    lf: pl.LazyFrame, period: timedelta, x: str, y: str, c: str, out_dir="plots"
):
    # Convert period to nanoseconds
    # period_ns = pl.duration_string_to_duration(period).cast(pl.Int64).max().item()

    # Add temporary epoch column for plotting
    epoch_col = f"{x}_epoch_tmp"
    lf = lf.set_sorted(x)
    lf = lf.with_columns(pl.col(x).dt.epoch("ns").alias(epoch_col)).with_columns(
        (
            (pl.col(epoch_col) - pl.col(epoch_col).min())
            // (period.total_seconds() * 1e9)
        )
        .alias("group_idx")
        .cast(pl.UInt32)
    )
    period_ns = int(period.total_seconds() * 1e9)
    plots = {}

    def custom_plot_fn(df: pl.DataFrame):
        for tmp_df in df.partition_by("group_idx"):
            plot_with_datashader(
                plots,
                tmp_df,
                x=epoch_col,
                y=y,
                c=c,
                period_ns=period_ns,
                out_dir=out_dir,
                prefix="plot",
            )
        return df

    # Run the dynamic grouping and plotting

    agg = (
        lf.select(epoch_col, "group_idx", y, c)
        .group_by("group_idx")
        .agg(
            pl.col(epoch_col).max().alias(f"{epoch_col}_max"),
            pl.col(epoch_col).min().alias(f"{epoch_col}_min"),
            pl.col(y).max().alias(f"{y}_max"),
            pl.col(y).min().alias(f"{y}_min"),
            pl.len().alias("len"),
        )
    )
    return (
        lf.select(epoch_col, "group_idx", y, c)
        .join(agg, on="group_idx")
        .map_batches(
            custom_plot_fn,
            predicate_pushdown=False,
            projection_pushdown=False,
            slice_pushdown=False,
            streamable=True,
        )
    )

In [8]:
with warnings.catch_warnings():
    warnings.simplefilter("ignore", category=RuntimeWarning)
    a = plot_lf(lf.head(100000000), timedelta(seconds=1), "time", "value", "colour")
    a.select(pl.col("value").first()).sink_parquet("out.parquet", engine="streaming")

In [1]:
import polars as pl
import numpy as np

groups = [1]  # single group for testing
grid_x, grid_y = 4, 4
interval = 10  # seconds per tile
n_tiles = grid_x * grid_y

n_points_per_shape = 100

all_data = []

for group in groups:
    for tile_idx in range(n_tiles):
        time_base = tile_idx * interval

        # Choose a shape type based on tile index for variety
        shape_type = tile_idx % 3  # 0=line, 1=diagonal, 2=circle

        if shape_type == 0:
            # Horizontal line at y=0.5
            x = np.linspace(0,1,n_points_per_shape)
            y = np.full_like(x, 0.5)
        elif shape_type == 1:
            # Diagonal line y=x
            x = np.linspace(0,1,n_points_per_shape)
            y = x.copy()
        else:
            # Circle centered at (0.5,0.5) radius 0.4
            theta = np.linspace(0, 2*np.pi, n_points_per_shape)
            x = 0.5 + 0.4 * np.cos(theta)
            y = 0.5 + 0.4 * np.sin(theta)

        # Create time values within this interval
        times = np.linspace(time_base, time_base + interval, n_points_per_shape)

        # Random colour values between 0 and 1 for demonstration
        colours = np.random.rand(n_points_per_shape)

        df_tile = pl.DataFrame({
            'group': np.full(n_points_per_shape, group),
            'time': times,
            'x': x,
            'y': y,
            'colour': colours
        })

        all_data.append(df_tile)

# Combine all tiles into a single dataframe
df_test = pl.concat(all_data)

print(df_test)


shape: (1_600, 5)
┌───────┬───────────┬──────────┬─────┬──────────┐
│ group ┆ time      ┆ x        ┆ y   ┆ colour   │
│ ---   ┆ ---       ┆ ---      ┆ --- ┆ ---      │
│ i64   ┆ f64       ┆ f64      ┆ f64 ┆ f64      │
╞═══════╪═══════════╪══════════╪═════╪══════════╡
│ 1     ┆ 0.0       ┆ 0.0      ┆ 0.5 ┆ 0.508041 │
│ 1     ┆ 0.10101   ┆ 0.010101 ┆ 0.5 ┆ 0.246014 │
│ 1     ┆ 0.20202   ┆ 0.020202 ┆ 0.5 ┆ 0.555914 │
│ 1     ┆ 0.30303   ┆ 0.030303 ┆ 0.5 ┆ 0.776178 │
│ 1     ┆ 0.40404   ┆ 0.040404 ┆ 0.5 ┆ 0.671155 │
│ …     ┆ …         ┆ …        ┆ …   ┆ …        │
│ 1     ┆ 159.59596 ┆ 0.959596 ┆ 0.5 ┆ 0.448691 │
│ 1     ┆ 159.69697 ┆ 0.969697 ┆ 0.5 ┆ 0.847193 │
│ 1     ┆ 159.79798 ┆ 0.979798 ┆ 0.5 ┆ 0.324033 │
│ 1     ┆ 159.89899 ┆ 0.989899 ┆ 0.5 ┆ 0.549718 │
│ 1     ┆ 160.0     ┆ 1.0      ┆ 0.5 ┆ 0.046398 │
└───────┴───────────┴──────────┴─────┴──────────┘


In [None]:
4*10*10
30*4*4

In [70]:
import polars as pl
import datashader as ds
import datashader.transfer_functions as tf
import numpy as np
import torch
import os
torch.set_num_threads(1)
# Parameters
tile_w, tile_h = 256, 256
grid_x, grid_y = 5, 5
interval = 100000000  # seconds per tile
batch_interval = interval * grid_x * grid_y

composite_w = tile_w * grid_x
composite_h = tile_h * grid_y

output_folder = "plots"
os.makedirs(output_folder, exist_ok=True)
# Create Datashader canvas covering the full composite size
cvs = ds.Canvas(plot_width=composite_w, plot_height=composite_h)
def render_composite(df):
    grp = df['group'][0]
    batch_time = df['batch_time_window'][0]
    # Convert to pandas for Datashader processing
    df_pd = df['x_canvas', 'y_canvas','colour'].to_pandas(use_threads=False, split_blocks=True)

    # Render points onto composite
    agg = cvs.points(df_pd, 'x_canvas', 'y_canvas', ds.mean('colour'))
    img = tf.set_background(tf.shade(agg), "white").to_pil()
    # Convert to tensor: [H, W, C] -> [C, H, W]
    tensor = torch.from_numpy(np.array(img)).permute(2,0,1).float().div(255)
    # tensor_tiles: [n_tiles, C, tile_h, tile_w]
    tensor_tiles = tensor.view(
        4,
        grid_y, tile_h,
        grid_x, tile_w
    ).permute(1,3,0,2,4).reshape(-1, 4, tile_h, tile_w)

    # Save tensor batch
    
    torch.save(tensor_tiles, f"{output_folder}/{grp}_{batch_time}.pt")

    # Return original DataFrame to Polars
    return pl.DataFrame({"tmp": [1]})

# Example Polars pipeline
df = lf.head(50_000_000)
# Assume df is your lazy Polars dataframe with 'group', 'time', 'x', 'y', 'colour'
# df = pl.scan_parquet("")
# Compute tile indices and shifted canvas coordinates
df = df.with_columns([
    ((pl.col('time').dt.epoch("ns") // interval).alias('time_window')),
])

# Compute batch time window index (covers entire composite)
df = df.with_columns([
    ((pl.col('time_window') // (grid_x * grid_y)) * (grid_x * grid_y)).alias('batch_time_window')
])

# Compute tile row and column within grid
df = df.with_columns([
    ((pl.col('time_window') % (grid_x * grid_y)) // grid_x).alias('tile_row'),
    ((pl.col('time_window') % (grid_x * grid_y)) % grid_x).alias('tile_col'),
])

# Compute per-tile x and y (normalize if needed to [0,1])
# Here assuming x,y are normalized already:
df = df.with_columns(
    (pl.col('x') - pl.col('x').min()) / pl.col('x').max(),
    (pl.col('y') - pl.col('y').min()) / pl.col('y').max()
)
df = df.with_columns([
    (pl.col('x') * tile_w + pl.col('tile_col') * tile_w).alias('x_canvas'),
    (pl.col('y') * tile_h + pl.col('tile_row') * tile_h).alias('y_canvas'),
])
result = df.select([
    "group",
    "batch_time_window",
    pl.col('x_canvas').cast(pl.Float32),
    pl.col('y_canvas').cast(pl.Float32),
    pl.col('colour').cast(pl.Float32)
]).group_by(['group', 'batch_time_window']).map_groups(render_composite, schema=pl.Schema({"tmp":pl.Int64}))
# # Group by group and batch_time_window to process each composite canvas
# result = df.select('group', 'batch_time_window', 'x_canvas', 'y_canvas','colour').group_by(['group', 'batch_time_window']).map_groups(render_composite, schema=pl.Schema({"tmp":pl.Int64}))


# # # Trigger execution (side effects will run here)
_ = result.collect()


In [34]:
tensor = torch.load(output_folder+"/6_9783072175.pt")

In [27]:
tensor.shape

torch.Size([25, 4, 256, 256])

In [35]:
import torchvision
import torchvision.transforms as T
transform = T.ToPILImage()
for x in range(10):
    img = transform(tensor[x])
    img.save(f'{x}.png', format='PNG')

In [None]:
lf = lf.head(100000000)
a = (
    plot_lf(lf, timedelta(seconds=30), "time", "value", "colour")
    .select(pl.col("value").first())
    .sink_parquet("out.tmp", engine="streaming", lazy=True)
)
b = (
    plot_lf(lf, timedelta(seconds=10), "time", "value", "colour")
    .select(pl.col("value").first())
    .sink_parquet("out.tmp", engine="streaming", lazy=True)
)
c = (
    plot_lf(lf, timedelta(seconds=2), "time", "value", "colour")
    .select(pl.col("value").first())
    .sink_parquet("out.tmp", engine="streaming", lazy=True)
)
d = (
    plot_lf(lf, timedelta(milliseconds=1000), "time", "value", "colour")
    .select(pl.col("value").first())
    .sink_parquet("out.tmp", engine="streaming", lazy=True)
)
pl.collect_all([a, b, c, d], engine="streaming")

In [None]:
print(pl.explain_all([a, b, c, d]))

In [64]:
odf = df.collect()

In [83]:
import cProfile
import pstats
import io

# Import your modules and define variables (replace ... as appropriate)
import torch
import polars as pl
import datashader as ds
import datashader.transfer_functions as tf
import numpy as np

# Define your function here (copy of render_composite)
def render_composite(df):
    grp = df['group'][0]
    batch_time = df['batch_time_window'][0]
    df_pd = df['x_canvas', 'y_canvas','colour'].to_pandas(use_threads=False, split_blocks=True)
    agg = cvs.points(df_pd, 'x_canvas', 'y_canvas', ds.mean('colour'))
    img = tf.shade(agg)
    img = tf.set_background(img, "white")
    img = img.to_pil()
    tensor = torch.from_numpy(np.array(img)).permute(2,0,1).float().div(255)
    tensor_tiles = tensor.view(
        4,
        grid_y, tile_h,
        grid_x, tile_w
    ).permute(1,3,0,2,4).reshape(-1, 4, tile_h, tile_w)
    torch.save(tensor_tiles, f"{output_folder}/{grp}_{batch_time}.pt")
    return pl.DataFrame({"tmp": [1]})

# Example: Prepare a dummy dataframe for testing (replace with real data)
df = odf

# Set up any needed globals the function expects
tile_w, tile_h, grid_x, grid_y = 256, 256, 5, 5
composite_w = tile_w * grid_x
composite_h = tile_h * grid_y
output_folder = "plots"
cvs = ds.Canvas(plot_width=composite_w, plot_height=composite_h)

# --- Profiling section ---
pr = cProfile.Profile()
pr.enable()

render_composite(df)

pr.disable()
s = io.StringIO()
ps = pstats.Stats(pr, stream=s).sort_stats('cumtime')
ps.print_stats(20)  # show top 20 functions by cumulative time
print(s.getvalue())

         6157 function calls (5908 primitive calls) in 1.140 seconds

   Ordered by: cumulative time
   List reduced from 696 to 20 due to restriction <20>

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000    0.584    0.584 /home/caleb/Projects/stream-plotting/.venv/lib/python3.12/site-packages/torch/serialization.py:908(save)
        1    0.012    0.012    0.584    0.584 /home/caleb/Projects/stream-plotting/.venv/lib/python3.12/site-packages/torch/serialization.py:797(__exit__)
        1    0.572    0.572    0.572    0.572 /home/caleb/Projects/stream-plotting/.venv/lib/python3.12/site-packages/torch/serialization.py:1141(_save)
        1    0.000    0.000    0.296    0.296 /home/caleb/Projects/stream-plotting/.venv/lib/python3.12/site-packages/datashader/core.py:183(points)
        1    0.000    0.000    0.296    0.296 /home/caleb/Projects/stream-plotting/.venv/lib/python3.12/site-packages/datashader/core.py:1326(bypixel)
      2/1   

In [89]:
from line_profiler import LineProfiler

lp = LineProfiler()
lp.add_function(render_composite)
lp_wrapper = lp(render_composite)
lp_wrapper(df.head(1000000))
lp.print_stats()

Timer unit: 1e-09 s

Total time: 0.182103 s
File: /tmp/ipykernel_880/86583721.py
Function: render_composite at line 13

Line #      Hits         Time  Per Hit   % Time  Line Contents
    13                                           def render_composite(df):
    14         1      23940.0  23940.0      0.0      grp = df['group'][0]
    15         1       4050.0   4050.0      0.0      batch_time = df['batch_time_window'][0]
    16         1    2160555.0    2e+06      1.2      df_pd = df['x_canvas', 'y_canvas','colour'].to_pandas(use_threads=False, split_blocks=True)
    17         1   14451085.0    1e+07      7.9      agg = cvs.points(df_pd, 'x_canvas', 'y_canvas', ds.mean('colour'))
    18         1  105250693.0    1e+08     57.8      img = tf.shade(agg)
    19         1   18399818.0    2e+07     10.1      img = tf.set_background(img, "white")
    20         1    3794805.0    4e+06      2.1      img = img.to_pil()
    21         1    3999133.0    4e+06      2.2      tensor = torch.from_n