In [1]:
# Install the notebook's third-party dependencies into the environment of the
# interpreter running this kernel: `sys.prefix` is that interpreter's install
# prefix, and IPython interpolates `{sys.prefix}` into the shell command.
# `--yes` answers conda's confirmation prompt so the cell runs unattended.
# NOTE(review): no versions are pinned, so a fresh environment may resolve to
# different marimo/polars releases than the ones these timings were taken with.
import sys
!conda install --yes --prefix {sys.prefix} marimo polars

Channels:
 - defaults
 - conda-forge
 - jetbrains
Platform: linux-64
Collecting package metadata (repodata.json): done
Solving environment: done

# All requested packages already installed.



In [2]:
import marimo as mo

import polars as pl
import numpy as np
import time

# defeat marimo's ban on `from lets_plot import *`
#
# Emulate real star-import semantics instead of copying `vars(module)`
# wholesale: honour `__all__` when the package defines it, otherwise take only
# public (non-underscore) names. A blanket copy would also overwrite this
# namespace's own dunders (`__name__`, `__doc__`, `__file__`, `__spec__`, ...)
# with the package's values.
module = __import__('lets_plot')
_public_names = getattr(module, '__all__', None)
if _public_names is None:
    _public_names = [name for name in vars(module) if not name.startswith('_')]
globals().update({name: getattr(module, name) for name in _public_names})

In [3]:
## create datasets of various sizes

# Fixed seed so that "Restart Kernel -> Run All" regenerates identical data and
# the timings further down are always taken on the same inputs.
SEED = 42
rng = np.random.default_rng(SEED)

sizes = [10, 100, 1_000, 10_000, 20_000, 50_000]

# Pool of group labels; each frame draws its 'y' column from it with replacement.
labels = pl.Series(['A', 'B', 'C'])

# One frame per requested size:
#   'x' - standard-normal samples (the variable whose ECDF is plotted later)
#   'y' - random A/B/C label (used later as the colour / grouping column)
df_list = []
for size in sizes:
    df = pl.DataFrame({
        'x': rng.standard_normal(size).tolist(),
        'y': labels.sample(size, with_replacement=True, seed=SEED),
    })
    df_list.append(df)

In [4]:
from lets_plot._type_utils import _standardize_value

In [5]:
# Display what lets-plot's `_standardize_value` helper returns for the smallest
# polars frame. Per the output below, the result is a plain dict mapping each
# column name to a Python list of that column's values.
# NOTE(review): the leading underscore marks this as a private lets_plot helper;
# presumably it may change between releases — confirm before relying on it.
_standardize_value(df_list[0])

{'x': [1.036332751679348,
  0.39809023900926965,
  -0.7035416159345949,
  -1.1917412949259274,
  1.4199940944569494,
  0.6840286974958839,
  -0.3658836405486371,
  -0.3727545099168241,
  1.1424909944469603,
  0.379091369761879],
 'y': ['A', 'C', 'B', 'C', 'A', 'C', 'A', 'A', 'A', 'B']}

In [6]:
# How long does it take to create the plot object and render it?
#
# Intervals are measured with time.perf_counter() rather than time.time():
# perf_counter is monotonic and uses the highest-resolution clock available,
# so short durations are not distorted by wall-clock adjustments.
# NOTE(review): the first iteration may also include one-time warm-up cost,
# so its "create object" figure is not directly comparable to later ones.
for data in df_list:
    rows = data.shape[0]

    # Phase 1: build the plot object only (ggplot spec + stat_ecdf layer)
    start = time.perf_counter()
    my_plot = ggplot(data, aes(x='x', color='y')) + stat_ecdf()
    midpoint = time.perf_counter()
    # Phase 2: hand the plot to marimo; this call is what is reported as "render"
    mo.vstack([my_plot])
    end = time.perf_counter()

    print(f"Size: {rows:>7,} rows | Took {midpoint - start:.4f} s to create object and {end - midpoint:.4f} s to render")

Size:      10 rows | Took 0.0161 s to create object and 0.0080 s to render
Size:     100 rows | Took 0.0001 s to create object and 0.0029 s to render
Size:   1,000 rows | Took 0.0001 s to create object and 0.0213 s to render
Size:  10,000 rows | Took 0.0001 s to create object and 0.8939 s to render
Size:  20,000 rows | Took 0.0001 s to create object and 3.3297 s to render
Size:  50,000 rows | Took 0.0001 s to create object and 19.9367 s to render


In [7]:
# create ecdf manually
#
# After sorting by ('y', 'x'), each row's ECDF value within its 'y' group is
# (1-based position inside the group) / (group size). The expression does not
# depend on the frame, so it is built once and reused for every dataset.
ecdf_prob_expr = (
    ((pl.int_range(pl.len()) + 1) / pl.len())
    .over('y')
    .alias('ecdf_prob')
)

# One ECDF frame per input frame, in the same order as df_list.
ecdf_results = [
    my_df.sort(['y', 'x']).with_columns(ecdf_prob_expr)
    for my_df in df_list
]

In [8]:
# Time plotting of manual ecdf with geom_step
#
# Intervals are measured with time.perf_counter() rather than time.time():
# perf_counter is monotonic and uses the highest-resolution clock available,
# so short durations are not distorted by wall-clock adjustments.
for edf in ecdf_results:
    erows = edf.shape[0]

    # Phase 1: build the plot object only.
    # note use of geom_step() on the precomputed 'ecdf_prob' column
    e_start = time.perf_counter()
    my_manual_plot = ggplot(edf, aes(x='x', y='ecdf_prob', color='y')) + geom_step()
    e_midpoint = time.perf_counter()
    # Phase 2: this is how to force rendering in marimo
    mo.vstack([my_manual_plot])
    e_end = time.perf_counter()

    print(f"Size: {erows:>7,} rows | Took {e_midpoint - e_start:.4f} s to create object and {e_end - e_midpoint:.4f} s to render")

Size:      10 rows | Took 0.0001 s to create object and 0.0026 s to render
Size:     100 rows | Took 0.0001 s to create object and 0.0027 s to render
Size:   1,000 rows | Took 0.0001 s to create object and 0.0121 s to render
Size:  10,000 rows | Took 0.0001 s to create object and 0.1154 s to render
Size:  20,000 rows | Took 0.0001 s to create object and 0.2273 s to render
Size:  50,000 rows | Took 0.0001 s to create object and 0.6046 s to render
