In [37]:
from pathlib import Path
import polars as pl
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import altair as alt
import seaborn as sns
from altair import datum

alt.themes.enable("carbong10")
IMG_DIR = Path("img")
IMG_DIR.mkdir(exist_ok=True)

## Research setup

Hypothesis: Polars in lazy is faster than non-lazy and using the GPU is faster than CPU. 

Hardware: Google Colab Pro with 12C, 12GB RAM and 1x A100 GPU with 40GB VRAM. 

Research questions:
- Is Polars lazy mode faster than non-lazy mode?
- Is using the GPU faster than using the CPU?
- Is there a combination of limit and lazy mode that is faster than the others?
- What is the effect of streaming on the performance?


In [2]:
df = pl.concat(
    [
        pl.read_parquet("results_polars_gpu_100m.parquet"),
        pl.read_parquet("results_polars_gpu_1_10M.parquet"),
    ]
)
df

func,gpu,streaming,lazy,limit,preload,duration
str,bool,bool,bool,i64,bool,f64
"""polars_filter""",true,false,true,14000000,true,0.448067
"""polars_filter""",true,false,true,34000000,true,1.065185
"""polars_filter""",true,false,true,54000000,true,1.666106
"""polars_filter""",true,false,true,74000000,true,2.281038
"""polars_filter""",true,false,true,94000000,true,2.934398
…,…,…,…,…,…,…
"""polars_join""",false,false,true,5000000,false,3.315305
"""polars_join""",false,false,true,10000000,false,3.406222
"""polars_join""",false,false,false,1000000,false,3.213408
"""polars_join""",false,false,false,5000000,false,3.510228


## Is Polars lazy mode faster than non-lazy mode?


In [3]:
lazy_non_lazy = df.group_by("func", "lazy").agg(
    pl.mean("duration").alias("mean_duration"), pl.std("duration").alias("std_duration")
)
lazy_non_lazy

func,lazy,mean_duration,std_duration
str,bool,f64,f64
"""polars_filter""",False,0.103041,0.087791
"""polars_groupby""",False,2.733357,2.49811
"""polars_join""",False,3.294767,0.107538
"""polars_join""",True,3.158403,1.059952
"""polars_groupby""",True,1.997471,1.983891
"""polars_filter""",True,0.852915,1.193456
"""polars_sort""",True,11.64046,17.106739
"""polars_sort""",False,5.747333,5.447984


In [25]:
df

func,gpu,streaming,lazy,limit,preload,duration
str,bool,bool,bool,i64,bool,f64
"""polars_filter""",true,false,true,14000000,true,0.448067
"""polars_filter""",true,false,true,34000000,true,1.065185
"""polars_filter""",true,false,true,54000000,true,1.666106
"""polars_filter""",true,false,true,74000000,true,2.281038
"""polars_filter""",true,false,true,94000000,true,2.934398
…,…,…,…,…,…,…
"""polars_join""",false,false,true,5000000,false,3.315305
"""polars_join""",false,false,true,10000000,false,3.406222
"""polars_join""",false,false,false,1000000,false,3.213408
"""polars_join""",false,false,false,5000000,false,3.510228


In [107]:
from altair import Undefined


def plot_pretty_boxplot(
    df,
    field: str,
    title: str,
    filename: str,
    filter: str = None,
    legend_title: str = None,
    column: str = None,
    subtitle: str = "",
):
    bar = (
        alt.Chart(df)
        .mark_point(filled=True)
        .encode(
            y=alt.Y(f"{field}:N", axis=None),
            x=alt.X(
                "mean(duration)",
                scale=alt.Scale(type="symlog"),
                title="Processing Time Mean and Standard Deviation (s)",
            ),
            color=alt.Color(
                f"{field}:N",
                legend=alt.Legend(orient="left"),
                title=legend_title if legend_title else field,
            ),
        )
    )

    error_bars = (
        alt.Chart(df)
        .mark_errorbar()
        .encode(
            y=alt.Y(f"{field}:N", axis=None),
            x=alt.X("stdev(duration)", scale=alt.Scale(type="symlog"), title=""),
            color=alt.Color(f"{field}:N"),
        )
    )

    c = (
        (bar + error_bars).facet(
            row=alt.Row(
                "func:N",
                header=alt.Header(
                    labelAngle=0,
                    labelAlign="right",
                    orient="right",
                    labelBaseline="alphabetic",
                    titleFontWeight="bold",
                ),
                title="Function",
            ),
            column=alt.Column(f"{column}:N", header=alt.Header(titleFontWeight="bold"))
            if column
            else Undefined,
        )
    ).properties(title=alt.TitleParams(title, subtitle=subtitle))
    if filter:
        c = c.transform_filter((f"datum.{filter}"))
    c.save(IMG_DIR / filename, dpi=200)
    return c


def normal_boxplot(
    df,
    field: str,
    title: str,
    filename: str,
    filter: str = None,
    legend_title: str = None,
    column: str = None,
    subtitle: str = "",
):
    bar = (
        alt.Chart(df)
        .mark_boxplot()
        .encode(
            y=alt.Y(f"{field}:N", axis=None),
            x=alt.X(
                "duration:Q",
                scale=alt.Scale(type="symlog"),
                title="Processing Time (s)",
            ),
            color=alt.Color(
                f"{field}:N",
                legend=alt.Legend(
                    orient="left",
                ),
                title=legend_title if legend_title else field,

            ),
        )
    )

    c = (
        (bar).facet(
            row=alt.Row(
                "func:N",
                header=alt.Header(
                    labelAngle=0,
                    labelAlign="right",
                    orient="right",
                    labelBaseline="alphabetic",
                ),
                title="Function",
            ),
            column=alt.Column(f"{column}:N", header=alt.Header(titleFontWeight="bold"))
            if column
            else Undefined,
        )
    ).properties(title=alt.TitleParams(title, subtitle=subtitle))
    if filter:
        c = c.transform_filter((f"datum.{filter}"))
    c.save(IMG_DIR / filename, dpi=200)
    return c

In [109]:
normal_boxplot(
    df,
    field="lazy",
    title="Processing time for different functions, lazy vs non-lazy",
    filename="preloaded-lazy-vs-non-lazy.png",
    filter="preload",
    legend_title="Lazy mode",
    # column='gpu',
)

## Lazy vs non-lazy for functions (preloaded)

In [101]:
plot_pretty_boxplot(
    df,
    field="lazy",
    title="Processing time for different functions, lazy vs non-lazy",
    filename="preloaded-lazy-vs-non-lazy.png",
    filter="preload",
    legend_title="Lazy mode",
    # column='gpu',
)

## Lazy preloaded vs non-preloaded

In [68]:
plot_pretty_boxplot(
    df,
    field="preload",
    title="Processing time lazy mode preloaded vs non-preloaded",
    filename="lazy-preloaded-vs-non-preloaded.png",
    filter="lazy",
    legend_title="Preloaded in memory",
)

In [97]:
plot_pretty_boxplot(
    df,
    field="gpu",
    title="Processing time GPU vs non-GPU",
    filename="gpu-accelerated-vs-non-accelerated.png",
    filter="lazy",
    legend_title="GPU Enabled",
    column="preload",
)

In [46]:
dataset_size = (
    df["limit"].unique().to_frame().with_columns(index=pl.col("limit").rank())
)

x = alt.Chart(dataset_size).encode(
    x=alt.X("index:O", axis=alt.Axis(labels=False, grid=False, title="")),
    y=alt.Y(
        "limit:Q",
        axis=alt.Axis(tickCount=5, labelExpr='datum.value / 1E6 + "M"'),
        title="# of rows",
    ),
)
c = (x.mark_point(filled=True) + x.mark_line()).properties(title="Dataset sizes")
c.save(IMG_DIR / "dataset_sizes.png", dpi=200)
c

In [14]:
bar = (
    alt.Chart(df)
    .mark_boxplot(
        # color="black",
        # filled=True
    )
    .encode(
        y=alt.Y("lazy:N", axis=None),  # , axis=alt.Axis(orient="right")),
        x=alt.X(
            "duration:Q",
            scale=alt.Scale(type="symlog"),
            title="Processing Time (s) - error bars are ",
        ),
        color=alt.Color(
            "lazy:N",
            legend=alt.Legend(
                orient="left",
            ),
        ),
    )
)

(
    (bar).facet(
        row=alt.Row(
            "func:N",
            header=alt.Header(
                # labelOrient="left",
                labelAngle=0,
                labelAlign="right",
                # labelAnchor="start",
                orient="right",
                # labelOrient="right",
                labelBaseline="alphabetic",
                # labelPadding=-10,
            ),
            title="Function",
        ),
    )
).properties(title="test")
# .configure_rule(color='black')

In [45]:
alt.themes

ThemeRegistry(active='urbaninstitute', registered=['carbong10', 'carbong100', 'carbong90', 'carbonwhite', 'dark', 'default', 'excel', 'fivethirtyeight', 'ggplot2', 'googlecharts', 'latimes', 'none', 'opaque', 'powerbi', 'quartz', 'urbaninstitute', 'vox'])

We can see that lazy mode is slower than non-lazy mode for most operations, which makes since given that in lazy mode the data is not loaded into memory.