In [1]:
from pathlib import Path
import polars as pl
import altair as alt
from altair import Undefined

alt.themes.enable("carbong10")
IMG_DIR = Path("img")
IMG_DIR.mkdir(exist_ok=True)

## Research setup

Hypothesis: Polars in lazy is faster than non-lazy and using the GPU is faster than CPU. 

Hardware: Google Colab Pro with 12C, 12GB RAM and 1x A100 GPU with 40GB VRAM. 

Research questions:
- Is Polars lazy mode faster than non-lazy mode?
- Is using the GPU faster than using the CPU?
- Is there a combination of streaming and lazy mode that is faster than the others?
- What is the effect of streaming on the performance?


In [226]:
df = pl.concat(
    [
        pl.read_parquet("results_polars_gpu_100m.parquet").filter(
            ~((pl.col("func") == "polars_join") & pl.col("gpu"))
        ),
        pl.read_parquet("results_polars_gpu_1_10M.parquet").filter(
            ~((pl.col("func") == "polars_join") & pl.col("gpu"))
        ),
        pl.read_parquet("results_polars_gpu_join.parquet"),
    ]
)
df = df.filter(~(~pl.col("lazy") & pl.col("preload")))
df

func,gpu,streaming,lazy,limit,preload,duration
str,bool,bool,bool,i64,bool,f64
"""polars_filter""",true,false,true,14000000,true,0.448067
"""polars_filter""",true,false,true,34000000,true,1.065185
"""polars_filter""",true,false,true,54000000,true,1.666106
"""polars_filter""",true,false,true,74000000,true,2.281038
"""polars_filter""",true,false,true,94000000,true,2.934398
…,…,…,…,…,…,…
"""polars_join""",false,false,true,500000,false,0.768133
"""polars_join""",false,false,true,900000,false,2.669745
"""polars_join""",false,false,false,100000,false,0.029764
"""polars_join""",false,false,false,500000,false,0.707799


In [None]:
df = pl.concat(
    [
        pl.read_parquet("results_polars_1_100M.parquet"),
        pl.read_parquet("results_polars_gpu_join.parquet"),
    ]
)
df = df.filter(
    ~(~pl.col("lazy") & pl.col("preload")) & (pl.col("limit") != 900000)
).filter(~(pl.col("gpu") & ~pl.col("lazy")))
df

func,gpu,streaming,lazy,limit,preload,duration
str,bool,bool,bool,i64,bool,f64
"""polars_filter""",true,false,true,1000000,true,0.177867
"""polars_filter""",true,false,true,5000000,true,0.211995
"""polars_filter""",true,false,true,10000000,true,0.370296
"""polars_filter""",true,false,true,50000000,true,1.974447
"""polars_filter""",false,true,true,1000000,true,0.020249
…,…,…,…,…,…,…
"""polars_join""",false,true,true,500000,false,0.690454
"""polars_join""",false,false,true,100000,false,0.049945
"""polars_join""",false,false,true,500000,false,0.768133
"""polars_join""",false,false,false,100000,false,0.029764


In [2]:
df = pl.concat(
    [
        pl.read_parquet("results_polars_20241108.parquet"),
        pl.read_parquet("results_polars_<1M_20241108.parquet"),
    ]
)
df = (
    df.filter(~(~pl.col("lazy") & pl.col("preload")))
    .filter(~(pl.col("gpu") & ~pl.col("lazy")))
    .rename({"duration": "duration_raw"})
    .explode("duration_raw")
    .group_by("func", "limit", "gpu", "streaming", "lazy", "preload")
    .agg(
        pl.col("duration_raw")
        .clip(
            lower_bound=pl.mean("duration_raw") - 3 * pl.std("duration_raw"),
            upper_bound=pl.mean("duration_raw") + 3 * pl.std("duration_raw"),
        )
        .mean()
        .alias("duration"),
        pl.col("duration_raw"),
    )
)
df

func,limit,gpu,streaming,lazy,preload,duration,duration_raw
str,i64,bool,bool,bool,bool,f64,list[f64]
"""polars_join""",100000,false,false,false,false,0.035059,"[0.029715, 0.035681, … 0.036581]"
"""polars_filter""",1000000,false,true,true,false,0.151104,"[0.152431, 0.144599, … 0.141922]"
"""polars_filter""",100000,false,false,true,false,0.019485,"[0.020099, 0.019721, … 0.019263]"
"""polars_sort""",1000000,true,false,true,true,0.074385,"[0.156531, 0.059455, … 0.040045]"
"""polars_sort""",5000000,true,false,true,false,0.293563,"[0.279404, 0.297536, … 0.300995]"
…,…,…,…,…,…,…,…
"""polars_groupby""",100000,false,true,true,false,0.189428,"[0.191177, 0.194306, … 0.192873]"
"""polars_filter""",500000,true,false,true,true,0.013579,"[0.014496, 0.013184, … 0.01357]"
"""polars_filter""",5000000,false,false,true,false,0.097568,"[0.097959, 0.099946, … 0.096611]"
"""polars_groupby""",5000000,true,false,true,true,0.050037,"[0.051128, 0.048506, … 0.050467]"


In [3]:
with pl.Config(tbl_rows=1000):
    print(
        df.group_by("lazy", "preload", "gpu", "streaming")
        .agg(pl.len())
        .sort("lazy", "preload", "gpu", "streaming")
    )
## Dataset Size

shape: (7, 5)
┌───────┬─────────┬───────┬───────────┬─────┐
│ lazy  ┆ preload ┆ gpu   ┆ streaming ┆ len │
│ ---   ┆ ---     ┆ ---   ┆ ---       ┆ --- │
│ bool  ┆ bool    ┆ bool  ┆ bool      ┆ u32 │
╞═══════╪═════════╪═══════╪═══════════╪═════╡
│ false ┆ false   ┆ false ┆ false     ┆ 21  │
│ true  ┆ false   ┆ false ┆ false     ┆ 21  │
│ true  ┆ false   ┆ false ┆ true      ┆ 21  │
│ true  ┆ false   ┆ true  ┆ false     ┆ 21  │
│ true  ┆ true    ┆ false ┆ false     ┆ 21  │
│ true  ┆ true    ┆ false ┆ true      ┆ 21  │
│ true  ┆ true    ┆ true  ┆ false     ┆ 21  │
└───────┴─────────┴───────┴───────────┴─────┘


## Dataset Size

In [28]:
dataset_size = (
    df["limit"].unique().to_frame().with_columns(index=pl.col("limit").rank())
)

x = alt.Chart(dataset_size).encode(
    x=alt.X("index:O", axis=alt.Axis(labels=False, grid=False, title="")),
    y=alt.Y(
        "limit:Q",
        axis=alt.Axis(labelExpr='datum.value / 1E6 + "M"', tickCount=5),
        title="# of rows",
        scale=alt.Scale(type="log", domainMin=100_000, padding=8),
    ),
)
c = (
    (x.mark_point(filled=True) + x.mark_line())
    .properties(title="Dataset sizes")
    .properties(width=500)
)
c.save(IMG_DIR / "dataset_sizes.png", dpi=200)
c

## Is Polars lazy mode faster than non-lazy mode?


In [29]:
lazy_non_lazy = df.group_by("func", "lazy").agg(
    pl.mean("duration").alias("mean_duration"), pl.std("duration").alias("std_duration")
)
lazy_non_lazy

func,lazy,mean_duration,std_duration
str,bool,f64,f64
"""polars_sort""",False,1.659894,3.033752
"""polars_join""",False,1.587114,2.039549
"""polars_groupby""",False,0.87003,1.508193
"""polars_groupby""",True,0.425737,0.692869
"""polars_filter""",True,0.244767,0.510674
"""polars_sort""",True,2.981119,7.675807
"""polars_filter""",False,0.038743,0.055507
"""polars_join""",True,1.759301,2.236095


In [30]:
def plot_pretty_boxplot(
    df,
    field: str,
    title: str,
    filename: str,
    filter: str | None = None,
    legend_title: str | None = None,
    column: str | None = None,
    subtitle: str = "",
    height: int = 30,
    width: int = 300,
):
    bar = (
        alt.Chart(df)
        .mark_point(filled=True)
        .encode(
            y=alt.Y(f"{field}:N", axis=None),
            x=alt.X(
                "mean(duration)",
                # scale=alt.Scale(type="symlog"),
                title="Processing Time Mean and Standard Deviation (s)",
            ),
            color=alt.Color(
                f"{field}:N",
                legend=alt.Legend(orient="left"),
                title=legend_title if legend_title else field,
            ),
        )
    )

    error_bars = (
        alt.Chart(df)
        .mark_errorbar()
        .encode(
            y=alt.Y(f"{field}:N", axis=None),
            x=alt.X("stdev(duration)", title=""),
            color=alt.Color(f"{field}:N"),
        )
    )

    c = (
        (bar + error_bars)
        .properties(height=height, width=width)
        .facet(
            row=alt.Row(
                "func:N",
                header=alt.Header(
                    labelAngle=0,
                    labelAlign="right",
                    orient="right",
                    labelBaseline="alphabetic",
                    titleFontWeight="bold",
                ),
                title="Function",
            ),
            column=alt.Column(f"{column}:N", header=alt.Header(titleFontWeight="bold"))
            if column
            else Undefined,
        )
    ).properties(title=alt.TitleParams(title, subtitle=subtitle))
    if filter:
        c = c.transform_filter((f"datum.{filter}"))
    c.save(IMG_DIR / filename, dpi=200)
    return c


def normal_boxplot(
    df,
    field: str,
    title: str,
    filename: str,
    filter: str = None,
    legend_title: str = None,
    column: str = None,
    subtitle: str = "",
    height: int = 30,
    width: int = 300,
):
    bar = (
        alt.Chart(df)
        .mark_boxplot()
        .encode(
            y=alt.Y(f"{field}:N", axis=None),
            x=alt.X(
                "duration:Q",
                scale=alt.Scale(type="symlog"),
                title="Processing Time (s)",
            ),
            color=alt.Color(
                f"{field}:N",
                legend=alt.Legend(
                    orient="left",
                ),
                title=legend_title if legend_title else field,
            ),
        )
    )

    c = (
        bar.properties(height=height, width=width).facet(
            row=alt.Row(
                "func:N",
                header=alt.Header(
                    labelAngle=0,
                    labelAlign="right",
                    orient="right",
                    labelBaseline="alphabetic",
                    titleFontWeight="bold",
                ),
                title="Function",
            ),
            column=alt.Column(f"{column}:N", header=alt.Header(titleFontWeight="bold"))
            if column
            else Undefined,
        )
    ).properties(title=alt.TitleParams(title, subtitle=subtitle))
    if filter:
        c = c.transform_filter((f"datum.{filter}"))
    c.save(IMG_DIR / filename, dpi=200)
    return c

## Lazy vs non-lazy for functions (preloaded)

In [31]:
df.filter(
    ((pl.col("preload") & pl.col("lazy")) | ~pl.col("lazy"))
    & ~pl.col("gpu")
    & ~pl.col("streaming")
).group_by("func", "limit").len()

func,limit,len
str,i64,u32
"""polars_groupby""",100000,2
"""polars_groupby""",50000000,2
"""polars_filter""",5000000,2
"""polars_filter""",50000000,2
"""polars_filter""",1000000,2
…,…,…
"""polars_filter""",100000,2
"""polars_sort""",50000000,2
"""polars_filter""",10000000,2
"""polars_sort""",5000000,2


In [32]:
df.filter(
    ((pl.col("preload") & pl.col("lazy")) | ~pl.col("lazy"))
    & ~pl.col("gpu")
    & ~pl.col("streaming")
).filter((pl.col("func") == "polars_filter") & (pl.col("limit") == 50000000))

func,limit,gpu,streaming,lazy,preload,duration
str,i64,bool,bool,bool,bool,f64
"""polars_filter""",50000000,False,False,False,False,0.149607
"""polars_filter""",50000000,False,False,True,True,0.863818


In [None]:
plot_pretty_boxplot(
    df.filter(
        ((pl.col("preload") & pl.col("lazy")) | ~pl.col("lazy"))
        & ~pl.col("gpu")
        & ~pl.col("streaming")
    ),
    field="lazy",
    title="Processing time for different functions, lazy vs eager",
    subtitle="Data was preloaded into memory",
    filename="preloaded-lazy-vs-eager.png",
    # filter="preload",
    legend_title="Lazy mode",
    # column='gpu',
    height=50,
    width=400,
)

In [None]:
plot_pretty_boxplot(
    df.filter(
        ((~pl.col("preload") & pl.col("lazy")) | ~pl.col("lazy"))
        & ~pl.col("gpu")
        & ~pl.col("streaming")
    ),
    field="lazy",
    title="Processing time for different functions, lazy vs eager",
    subtitle="Data was not preloaded into memory",
    filename="non-preloaded-lazy-vs-eager.png",
    # filter="preload",
    legend_title="Lazy mode",
    # column='gpu',
    height=50,
    width=400,
)

## Lazy preloaded vs non-preloaded

In [21]:
df.filter(~pl.col("gpu") & ~pl.col("streaming")).filter(
    (pl.col("func") == "polars_filter") & pl.col("lazy")
).filter(pl.col("limit") == 50000000)

func,limit,gpu,streaming,lazy,preload,duration
str,i64,bool,bool,bool,bool,f64
"""polars_filter""",50000000,False,False,True,True,0.863818
"""polars_filter""",50000000,False,False,True,False,0.739366


In [54]:
c = (
    df.filter(~pl.col("gpu") & ~pl.col("streaming"))
    .filter((pl.col("func") == "polars_filter") & pl.col("lazy"))
    .plot.bar(
        x=alt.X("preload", axis=alt.Axis(labelAngle=45)),
        y="duration",
        column="limit",
        color="preload",
    )
    .properties(
        width=50,
        title=alt.Title(
            "Execution Speed for Lazy Filtering Operations",
            subtitle="Preloaded vs. non-preloaded data. ",
        ),
    )
)
c.save(IMG_DIR / "pretty_boxplot_preloaded_vs_non_preloaded_filtering.png", dpi=200)
c

In [None]:
plot_pretty_boxplot(
    df.filter(
        ~pl.col("gpu")
        & ~pl.col("streaming")
        & ((~pl.col("preload") & pl.col("lazy")) | (pl.col("preload") & pl.col("lazy")))
    ),
    field="preload",
    title="Processing time lazy mode preloaded vs non-preloaded",
    filename="lazy-preloaded-vs-non-preloaded.png",
    filter="lazy",
    legend_title="Preloaded in memory",
    height=50,
    width=400,
)

# GPU

In [78]:
plot_pretty_boxplot(
    (
        df.filter(~pl.col("streaming") & pl.col("lazy"))
        # .filter(
        #     ~((pl.col("func") == "polars_filter") & (pl.col("duration") > 2))
        # )
    ),
    field="gpu",
    title="Processing time GPU vs CPU",
    filename="gpu-accelerated-vs-non-accelerated.png",
    # filter="lazy",
    legend_title="GPU Enabled",
    # column="preload",
    height=50,
    width=400,
)

In [None]:
df.filter(~pl.col("streaming") & pl.col("lazy")).select(
    "gpu", "streaming", "lazy", "preload"
).unique()

gpu,streaming,lazy,preload
bool,bool,bool,bool
True,False,True,True
False,False,True,False
True,False,True,False
False,False,True,True


In [105]:
with pl.Config(float_precision=2) as cfg:
    print(
        df.filter(~pl.col("streaming") & pl.col("lazy"))
        .group_by("func", "gpu")
        .agg(pl.mean("duration"))
        .pivot(index=["func"], on="gpu")
        .with_columns((pl.col("false") / pl.col("true")).alias("Change"))
        .select("func", "Change")
    )

shape: (4, 2)
┌────────────────┬────────┐
│ func           ┆ Change │
│ ---            ┆ ---    │
│ str            ┆ f64    │
╞════════════════╪════════╡
│ polars_filter  ┆ 1.17   │
│ polars_join    ┆ 0.46   │
│ polars_sort    ┆ 2.50   │
│ polars_groupby ┆ 4.44   │
└────────────────┴────────┘


In [None]:
from tabular_titans.benchmark import read_polars_lazy


with pl.Config() as cfg:
    cfg.set_streaming_chunk_size(2_000_000)
    tmp = read_polars_lazy()
    tmp = tmp.join(tmp, on="product_id", how="left")
    print(tmp.explain(streaming=True))

STREAMING:
  LEFT JOIN:
  LEFT PLAN ON: [col("product_id")]
    Parquet SCAN [/Users/tn14gn/Developer/Sytac/polars-pyspark-pandas-comparison/data/chunk_0.parquet, ... 9 other sources]
    PROJECT */6 COLUMNS
  RIGHT PLAN ON: [col("product_id")]
    Parquet SCAN [/Users/tn14gn/Developer/Sytac/polars-pyspark-pandas-comparison/data/chunk_0.parquet, ... 9 other sources]
    PROJECT */6 COLUMNS
  END LEFT JOIN


In [None]:
c = (
    df.filter(~pl.col("streaming") & pl.col("lazy") & ~pl.col("preload"))
    .filter(~((pl.col("func") == "polars_filter") & (pl.col("duration") > 2)))
    .plot.bar(
        x=alt.X("gpu", axis=alt.Axis(labelAngle=45)),
        y="duration",
        column="limit",
        color="gpu",
    )
    .properties(
        width=50,
        title=alt.Title(
            "Execution Speed for Lazy Filtering Operations",
            subtitle="Preloaded vs. non-preloaded data. ",
        ),
    )
)
c.save(IMG_DIR / "pretty_boxplot_gpu_vs_nongpu_filtering.png", dpi=200)
c

## Lazy vs non-lazy boxplot

In [11]:
normal_boxplot(
    df.filter((pl.col("preload") & pl.col("lazy")) | ~pl.col("lazy")),
    field="lazy",
    title="Processing time for different functions, lazy vs non-lazy",
    subtitle="Lazy mode preloaded in memory. ",
    filename="preloaded-lazy-vs-non-lazy-boxplot.png",
    # filter="preload",
    legend_title="Lazy mode",
    # column='gpu',
)

## Streaming vs non-streaming

In [None]:
df.filter(~pl.col("gpu") & pl.col("lazy") & ~pl.col("preload")).select(
    "gpu", "streaming", "lazy", "preload"
).unique()

gpu,streaming,lazy,preload
bool,bool,bool,bool
False,False,True,False
False,True,True,False


In [None]:
chartyboi = plot_pretty_boxplot(
    df.filter(~pl.col("gpu") & pl.col("lazy") & ~pl.col("preload")),
    field="streaming",
    title="Processing time for Streaming vs. Non-streaming",
    filename="streaming-vs-non-streaming.png",
    # filter="gpu == false",
    legend_title="Streaming mode",
    width=400,
    height=50,
)
chartyboi

In [None]:
from IPython.display import display, Markdown

with pl.Config(float_precision=2) as cfg:
    # cfg.set_tbl_formatting("ASCII_MARKDOWN")
    print(
        df.filter(~pl.col("gpu") & pl.col("lazy") & ~pl.col("preload"))
        .group_by("func", "streaming")
        .agg(pl.mean("duration"))
        .pivot(index=["func"], on="streaming")
        .with_columns((pl.col("false") / pl.col("true")).alias("Change"))
        .select("func", "Change")
        .sort("func")
    )

shape: (4, 2)
┌────────────────┬────────┐
│ func           ┆ Change │
│ ---            ┆ ---    │
│ str            ┆ f64    │
╞════════════════╪════════╡
│ polars_filter  ┆ 0.26   │
│ polars_groupby ┆ 0.58   │
│ polars_join    ┆ 1.87   │
│ polars_sort    ┆ 0.21   │
└────────────────┴────────┘


In [None]:
result = df.join(df, on="product_id", how="left")

In [207]:
df.group_by("gpu", "streaming", "lazy", "preload").agg(
    pl.mean("duration").alias("mean_duration")
).sort("mean_duration")

gpu,streaming,lazy,preload,mean_duration
bool,bool,bool,bool,f64
True,False,True,True,1.874659
True,False,False,False,2.788744
False,False,True,True,2.791986
False,False,False,False,2.792723
False,True,False,False,2.804007
False,False,True,False,3.065482
True,False,True,False,3.09431
False,True,True,True,7.36728
False,True,True,False,8.937575


In [120]:
duration_per_function = (
    alt.Chart(df)
    .mark_bar()
    .encode(
        x=alt.X("func:N", title="Function", axis=alt.Axis(labelAngle=45)),
        y=alt.Y(
            "mean(duration):Q",
            title="Mean Duration (s)",
            scale=alt.Scale(type="symlog"),
        ),
        color="func:N",
        column="limit:O",
    )
)
duration_per_function.save(IMG_DIR / "duration_per_function.png", dpi=250)
duration_per_function

In [5]:
import phik
from dython.nominal import associations

Matplotlib is building the font cache; this may take a moment.


In [None]:
df.to_pandas().phik_matrix(interval_columns=["limit", "duration"])

TypeError: phik_matrix() got an unexpected keyword argument 'interval_columns'

In [8]:
df.schema

Schema([('func', String),
        ('gpu', Boolean),
        ('streaming', Boolean),
        ('lazy', Boolean),
        ('limit', Int64),
        ('preload', Boolean),
        ('duration', Float64)])

In [None]:
import pandas as pd

correlations: pd.DataFrame = associations(
    df.to_pandas(),
    numerical_columns=["duration", "limit"],
    filename=IMG_DIR / "correlation_matrix.png",
    plot=False,
)["corr"]

In [None]:
corr_melted = correlations.reset_index().melt(
    id_vars=["index"],
    value_vars=["func", "gpu", "streaming", "lazy", "limit", "preload", "duration"],
    value_name="correlation_coefficient",
)
corr_melted

Unnamed: 0,index,variable,correlation_coefficient
0,func,func,1.0
1,gpu,func,0.0
2,streaming,func,0.0
3,lazy,func,0.0
4,limit,func,0.219695
5,preload,func,0.0
6,duration,func,0.274957
7,func,gpu,0.0
8,gpu,gpu,1.0
9,streaming,gpu,0.427327


In [None]:
c = (
    alt.Chart(corr_melted)
    .mark_rect()
    .encode(
        x=alt.X("index:N", title="Column"),
        y=alt.Y("variable:N", title="Column"),
        color=alt.Color("correlation_coefficient:Q", title="Correlation Coefficient"),
    )
)
txt = (
    alt.Chart(corr_melted)
    .mark_text(baseline="middle")
    .encode(
        text=alt.Text("correlation_coefficient", format=".2f"),
        x=alt.X("index:N", title="Column"),
        y=alt.Y("variable:N", title="Column"),
        color=alt.condition(
            alt.datum.correlation_coefficient < 0.5,
            alt.value("black"),
            alt.value("white"),
        ),
    )
)
out = (c + txt).properties(height=500, width=500).configure_axis(grid=False)
out.save(IMG_DIR / "correlation_matrix.png")
out

We can see that lazy mode is slower than non-lazy mode for most operations, which makes since given that in lazy mode the data is not loaded into memory.

# Streaming chunk sizes

In [4]:
chunkies = pl.read_parquet("streaming_batch_size_evaluation.parquet")

chunkies = (
    chunkies.rename({"duration": "duration_raw"})
    .filter(pl.col("streaming"))
    .explode("duration_raw")
    .with_columns(
        pl.col("duration_raw").clip(
            lower_bound=pl.mean("duration_raw") - 3 * pl.std("duration_raw"),
            upper_bound=pl.mean("duration_raw") + 3 * pl.std("duration_raw"),
        )
    )
    # .group_by(
    #     "func", "limit", "gpu", "streaming", "lazy", "preload", "streaming_chunk_size"
    # )
    # .agg(
    #     pl.col("duration_raw")
    #     .clip(
    #         lower_bound=pl.mean("duration_raw") - 3 * pl.std("duration_raw"),
    #         upper_bound=pl.mean("duration_raw") + 3 * pl.std("duration_raw"),
    #     )
    #     .mean()
    #     .alias("duration"),
    #     pl.col("duration_raw"),
    # )
).drop("limit", "gpu", "lazy", "preload")
chunkies

func,streaming,streaming_chunk_size,duration_raw
str,bool,i64,f64
"""polars_filter""",true,10000,1.423108
"""polars_filter""",true,10000,0.591395
"""polars_filter""",true,10000,0.619112
"""polars_filter""",true,10000,0.589842
"""polars_filter""",true,10000,0.614421
…,…,…,…
"""polars_groupby""",true,10000000,0.955065
"""polars_groupby""",true,10000000,0.956192
"""polars_groupby""",true,10000000,0.956059
"""polars_groupby""",true,10000000,0.957065


In [None]:
c = (
    alt.Chart(chunkies)
    .mark_boxplot()
    .encode(
        # scale=alt.Scale(type="symlog"),
        x=alt.X(
            "streaming_chunk_size:O",
        ).axis(labelAngle=-45),
        y=alt.Y("duration_raw:Q").scale(type="symlog"),
        color="func",
        column="func",
        # facet=alt.Facet("func:O"),
    )
    .properties(width=200, title="Processing time for different streaming chunk sizes")
)
c.save(IMG_DIR / "streaming_chunk_size.png", dpi=250)
c

In [None]:
chartyboi = plot_pretty_boxplot(
    chunkies,
    field="streaming",
    title="Processing time for Streaming vs. Non-streaming",
    filename="streaming-vs-non-streaming.png",
    # filter="gpu == false",
    legend_title="Streaming mode",
    width=400,
    height=50,
)
chartyboi

In [23]:
%%timeit
from tabular_titans.benchmark import polars_groupby, read_polars_lazy


with pl.Config() as cfg:
    cfg.set_streaming_chunk_size(2_000_000)
    tmp = read_polars_lazy(limit=50_000_000)
    tmp = polars_groupby(tmp, streaming=True)

1.02 s ± 28.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [22]:
%%timeit
from tabular_titans.benchmark import polars_groupby, read_polars_lazy


with pl.Config() as cfg:
    cfg.set_streaming_chunk_size(10_000)
    tmp = read_polars_lazy(limit=1_000_000)
    tmp = polars_groupby(tmp, streaming=True)

108 ms ± 928 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [21]:
%%timeit
from tabular_titans.benchmark import polars_groupby, read_polars_lazy


with pl.Config() as cfg:
    # cfg.set_streaming_chunk_size()
    tmp = read_polars_lazy(limit=1_000_000)
    tmp = polars_groupby(tmp, streaming=True)

110 ms ± 1.34 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
