In [3]:
from pathlib import Path
import polars as pl
import altair as alt
from altair import Undefined

# Folder that receives every exported figure; created if it does not exist yet.
IMG_DIR = Path("img")
IMG_DIR.mkdir(exist_ok=True)

# Render all charts below with the "carbong10" Altair theme.
alt.themes.enable("carbong10")

## Research setup

Hypothesis: Polars in lazy mode is faster than in non-lazy (eager) mode, and using the GPU is faster than using the CPU.

Hardware: Google Colab Pro with 12C, 12GB RAM and 1x A100 GPU with 40GB VRAM. 

Research questions:
- Is Polars lazy mode faster than non-lazy mode?
- Is using the GPU faster than using the CPU?
- Is there a combination of streaming and lazy mode that is faster than the others?
- What is the effect of streaming on the performance?


In [226]:
# NOTE(review): `df` is reassigned by the following cell, so this load only
# matters if that cell is skipped.
# GPU join rows are removed from the two general result files; presumably the
# dedicated join results file supplies them instead — confirm.
is_gpu_join = (pl.col("func") == "polars_join") & pl.col("gpu")
result_frames = [
    pl.read_parquet("results_polars_gpu_100m.parquet").filter(~is_gpu_join),
    pl.read_parquet("results_polars_gpu_1_10M.parquet").filter(~is_gpu_join),
    pl.read_parquet("results_polars_gpu_join.parquet"),
]
# Keep a row unless it combines non-lazy mode with preloading.
df = pl.concat(result_frames).filter(pl.col("lazy") | ~pl.col("preload"))
df

func,gpu,streaming,lazy,limit,preload,duration
str,bool,bool,bool,i64,bool,f64
"""polars_filter""",true,false,true,14000000,true,0.448067
"""polars_filter""",true,false,true,34000000,true,1.065185
"""polars_filter""",true,false,true,54000000,true,1.666106
"""polars_filter""",true,false,true,74000000,true,2.281038
"""polars_filter""",true,false,true,94000000,true,2.934398
…,…,…,…,…,…,…
"""polars_join""",false,false,true,500000,false,0.768133
"""polars_join""",false,false,true,900000,false,2.669745
"""polars_join""",false,false,false,100000,false,0.029764
"""polars_join""",false,false,false,500000,false,0.707799


In [4]:
# Stack the general benchmark results and the dedicated join results.
all_results = pl.concat(
    [
        pl.read_parquet("results_polars_1_100M.parquet"),
        pl.read_parquet("results_polars_gpu_join.parquet"),
    ]
)
# Keep a row unless it combines non-lazy mode with preloading, and leave out
# the 900000-row dataset size.
df = all_results.filter(
    pl.col("lazy") | ~pl.col("preload"),
    pl.col("limit") != 900000,
)
df

func,gpu,streaming,lazy,limit,preload,duration
str,bool,bool,bool,i64,bool,f64
"""polars_filter""",true,false,true,1000000,true,0.177867
"""polars_filter""",true,false,true,5000000,true,0.211995
"""polars_filter""",true,false,true,10000000,true,0.370296
"""polars_filter""",true,false,true,50000000,true,1.974447
"""polars_filter""",false,true,true,1000000,true,0.020249
…,…,…,…,…,…,…
"""polars_join""",false,true,true,500000,false,0.690454
"""polars_join""",false,false,true,100000,false,0.049945
"""polars_join""",false,false,true,500000,false,0.768133
"""polars_join""",false,false,false,100000,false,0.029764


In [5]:
# Number of benchmark rows recorded for each dataset size.
df.group_by("limit").agg(pl.len().alias("len")).sort("limit")

limit,len
i64,u32
100000,32
500000,32
1000000,32
5000000,24
10000000,24
50000000,24


In [6]:
# Count the benchmark rows for every combination of execution settings.
config_columns = ["lazy", "preload", "gpu", "streaming"]
rows_per_config = df.group_by(config_columns).agg(pl.len()).sort(config_columns)
with pl.Config(tbl_rows=1000):
    print(rows_per_config)
## Dataset Size

shape: (8, 5)
┌───────┬─────────┬───────┬───────────┬─────┐
│ lazy  ┆ preload ┆ gpu   ┆ streaming ┆ len │
│ ---   ┆ ---     ┆ ---   ┆ ---       ┆ --- │
│ bool  ┆ bool    ┆ bool  ┆ bool      ┆ u32 │
╞═══════╪═════════╪═══════╪═══════════╪═════╡
│ false ┆ false   ┆ false ┆ false     ┆ 21  │
│ false ┆ false   ┆ true  ┆ false     ┆ 21  │
│ true  ┆ false   ┆ false ┆ false     ┆ 21  │
│ true  ┆ false   ┆ false ┆ true      ┆ 21  │
│ true  ┆ false   ┆ true  ┆ false     ┆ 21  │
│ true  ┆ true    ┆ false ┆ false     ┆ 21  │
│ true  ┆ true    ┆ false ┆ true      ┆ 21  │
│ true  ┆ true    ┆ true  ┆ false     ┆ 21  │
└───────┴─────────┴───────┴───────────┴─────┘


## Dataset Size

In [7]:
# One row per distinct dataset size; the rank of each size is used as an
# ordinal x position so the points are spread evenly along the axis.
dataset_size = (
    df["limit"].unique().to_frame().with_columns(index=pl.col("limit").rank())
)

x = alt.Chart(dataset_size).encode(
    x=alt.X("index:O", axis=alt.Axis(labels=False, grid=False, title="")),
    y=alt.Y(
        "limit:Q",
        # Tick labels are shown in millions of rows.
        axis=alt.Axis(labelExpr='datum.value / 1E6 + "M"', tickCount=5),
        title="# of rows",
        scale=alt.Scale(type="log", domainMin=100_000, padding=8),
    ),
)
c = (x.mark_point(filled=True) + x.mark_line()).properties(
    title="Dataset sizes", width=500
)
# `ppi` is the resolution keyword of Altair's PNG export; the previous `dpi`
# keyword is not one of the documented save options.
c.save(IMG_DIR / "dataset_sizes.png", ppi=200)
c

## Is Polars lazy mode faster than non-lazy mode?


In [8]:
# Mean and standard deviation of the run time per function and lazy setting.
lazy_non_lazy = df.group_by("func", "lazy").agg(
    mean_duration=pl.col("duration").mean(),
    std_duration=pl.col("duration").std(),
)
lazy_non_lazy

func,lazy,mean_duration,std_duration
str,bool,f64,f64
"""polars_filter""",True,0.381035,0.745502
"""polars_groupby""",True,0.710139,1.07794
"""polars_sort""",True,3.697009,8.682266
"""polars_join""",False,1.367813,1.5966
"""polars_filter""",False,0.036606,0.047083
"""polars_join""",True,1.371692,1.515419
"""polars_groupby""",False,0.873396,1.465716
"""polars_sort""",False,1.806888,3.161741


In [20]:
def plot_pretty_boxplot(
    df,
    field: str,
    title: str,
    filename: str,
    filter: str = None,
    legend_title: str = None,
    column: str = None,
    subtitle: str = "",
    height: int = 30,
    width: int = 300,
):
    """Plot mean duration with standard-deviation error bars, one row per function.

    For every value of ``field`` a filled point marks the mean of ``duration``
    and an error bar shows its standard deviation. The layered chart is faceted
    into one row per ``func`` value (and optionally one column per ``column``
    value), saved as a PNG under ``IMG_DIR`` and returned.

    Args:
        df: Data passed to ``alt.Chart``; the chart reads the ``duration`` and
            ``func`` fields plus the fields named by ``field`` and ``column``.
        field: Nominal field that is compared (y position and colour).
        title: Chart title.
        filename: Name of the PNG file written inside ``IMG_DIR``.
        filter: Optional expression fragment; it is prefixed with ``datum.``
            and applied as a filter transform (e.g. ``"lazy"`` or
            ``"gpu == false"``).
        legend_title: Legend title; falls back to ``field`` when not given.
        column: Optional nominal field used to facet into columns.
        subtitle: Chart subtitle.
        height: Height of each facet panel.
        width: Width of each facet panel.

    Returns:
        The faceted Altair chart.
    """
    mean_points = (
        alt.Chart(df)
        .mark_point(filled=True)
        .encode(
            y=alt.Y(f"{field}:N", axis=None),
            x=alt.X(
                "mean(duration)",
                scale=alt.Scale(type="symlog"),
                title="Processing Time Mean and Standard Deviation (s)",
            ),
            color=alt.Color(
                f"{field}:N",
                legend=alt.Legend(orient="left"),
                title=legend_title if legend_title else field,
            ),
        )
    )

    # The spread is requested with `extent="stdev"` on the raw `duration`
    # values, which is the documented way to draw standard-deviation error
    # bars; an aggregate on the x channel does not select the extent.
    error_bars = (
        alt.Chart(df)
        .mark_errorbar(extent="stdev")
        .encode(
            y=alt.Y(f"{field}:N", axis=None),
            x=alt.X("duration:Q", scale=alt.Scale(type="symlog"), title=""),
            color=alt.Color(f"{field}:N"),
        )
    )

    # Panel size has to be set on the layered chart before faceting.
    layered = (mean_points + error_bars).properties(height=height, width=width)
    c = layered.facet(
        row=alt.Row(
            "func:N",
            header=alt.Header(
                labelAngle=0,
                labelAlign="right",
                orient="right",
                labelBaseline="alphabetic",
                titleFontWeight="bold",
            ),
            title="Function",
        ),
        column=alt.Column(f"{column}:N", header=alt.Header(titleFontWeight="bold"))
        if column
        else Undefined,
    ).properties(title=alt.TitleParams(title, subtitle=subtitle))
    if filter:
        c = c.transform_filter(f"datum.{filter}")
    # `ppi` is the resolution keyword of Altair's PNG export; the previous
    # `dpi` keyword is not one of the documented save options.
    c.save(IMG_DIR / filename, ppi=200)
    return c


def normal_boxplot(
    df,
    field: str,
    title: str,
    filename: str,
    filter: str = None,
    legend_title: str = None,
    column: str = None,
    subtitle: str = "",
    height: int = 500,
    width: int = 800,
):
    """Plot box plots of ``duration``, one row per function.

    For every value of ``field`` a box plot of the raw ``duration`` values is
    drawn. The chart is faceted into one row per ``func`` value (and optionally
    one column per ``column`` value), saved as a PNG under ``IMG_DIR`` and
    returned.

    Args:
        df: Data passed to ``alt.Chart``; the chart reads the ``duration`` and
            ``func`` fields plus the fields named by ``field`` and ``column``.
        field: Nominal field that is compared (y position and colour).
        title: Chart title.
        filename: Name of the PNG file written inside ``IMG_DIR``.
        filter: Optional expression fragment; it is prefixed with ``datum.``
            and applied as a filter transform.
        legend_title: Legend title; falls back to ``field`` when not given.
        column: Optional nominal field used to facet into columns.
        subtitle: Chart subtitle.
        height: Height of each facet panel. Pass a smaller value when many
            functions are shown.
        width: Width of each facet panel.

    Returns:
        The faceted Altair chart.
    """
    boxes = (
        alt.Chart(df)
        .mark_boxplot()
        .encode(
            y=alt.Y(f"{field}:N", axis=None),
            x=alt.X(
                "duration:Q",
                scale=alt.Scale(type="symlog"),
                title="Processing Time (s)",
            ),
            color=alt.Color(
                f"{field}:N",
                legend=alt.Legend(
                    orient="left",
                ),
                title=legend_title if legend_title else field,
            ),
        )
        # A facet chart has no width/height of its own, so the size is set on
        # the inner chart (per panel), as plot_pretty_boxplot does.
        .properties(width=width, height=height)
    )

    c = boxes.facet(
        row=alt.Row(
            "func:N",
            header=alt.Header(
                labelAngle=0,
                labelAlign="right",
                orient="right",
                labelBaseline="alphabetic",
                titleFontWeight="bold",
            ),
            title="Function",
        ),
        column=alt.Column(f"{column}:N", header=alt.Header(titleFontWeight="bold"))
        if column
        else Undefined,
    ).properties(title=alt.TitleParams(title, subtitle=subtitle))
    if filter:
        c = c.transform_filter(f"datum.{filter}")
    # `ppi` is the resolution keyword of Altair's PNG export; the previous
    # `dpi` keyword is not one of the documented save options.
    c.save(IMG_DIR / filename, ppi=200)
    return c

## Lazy vs non-lazy for functions (preloaded)

In [23]:
# Compare all non-lazy runs with the lazy runs that were preloaded.
non_lazy_or_preloaded_lazy = ~pl.col("lazy") | (pl.col("lazy") & pl.col("preload"))
plot_pretty_boxplot(
    df.filter(non_lazy_or_preloaded_lazy),
    field="lazy",
    legend_title="Lazy mode",
    title="Processing time for different functions, lazy vs non-lazy",
    filename="preloaded-lazy-vs-non-lazy.png",
    width=500,
    height=50,
)

## Lazy preloaded vs non-preloaded

In [24]:
# Within the lazy runs only (filter="lazy"), compare preloaded against
# non-preloaded data.
plot_pretty_boxplot(
    df,
    field="preload",
    legend_title="Preloaded in memory",
    filter="lazy",
    title="Processing time lazy mode preloaded vs non-preloaded",
    filename="lazy-preloaded-vs-non-preloaded.png",
    width=500,
    height=50,
)

In [27]:
# Within the lazy runs only (filter="lazy"), compare GPU against CPU, with one
# facet column per preload setting.
plot_pretty_boxplot(
    df,
    field="gpu",
    legend_title="GPU Enabled",
    column="preload",
    filter="lazy",
    title="Processing time GPU vs CPU",
    filename="gpu-accelerated-vs-non-accelerated.png",
    width=400,
    height=50,
)

In [18]:
# All lazy GPU runs of the filter benchmark, ordered by dataset size.
gpu_lazy_filter_runs = df.filter(
    pl.col("gpu"),
    pl.col("lazy"),
    pl.col("func") == "polars_filter",
).sort("limit", "preload")
# tbl_rows=-1 prints every row instead of a truncated table.
with pl.Config(tbl_rows=-1):
    display(gpu_lazy_filter_runs)

func,gpu,streaming,lazy,limit,preload,duration
str,bool,bool,bool,i64,bool,f64
"""polars_filter""",True,False,True,100000,False,0.020704
"""polars_filter""",True,False,True,100000,True,2.993594
"""polars_filter""",True,False,True,500000,False,0.033198
"""polars_filter""",True,False,True,500000,True,0.027348
"""polars_filter""",True,False,True,900000,False,0.037227
"""polars_filter""",True,False,True,900000,True,0.039958
"""polars_filter""",True,False,True,1000000,False,0.047043
"""polars_filter""",True,False,True,1000000,True,0.177867
"""polars_filter""",True,False,True,5000000,False,0.098694
"""polars_filter""",True,False,True,5000000,True,0.211995


## Lazy vs non-lazy boxplot

In [11]:
# Box-plot view of the comparison between non-lazy runs and preloaded lazy runs.
non_lazy_or_preloaded_lazy = ~pl.col("lazy") | (pl.col("lazy") & pl.col("preload"))
normal_boxplot(
    df.filter(non_lazy_or_preloaded_lazy),
    field="lazy",
    legend_title="Lazy mode",
    title="Processing time for different functions, lazy vs non-lazy",
    subtitle="Lazy mode preloaded in memory. ",
    filename="preloaded-lazy-vs-non-lazy-boxplot.png",
)

## Streaming vs non-streaming

In [206]:
# Compare streaming against non-streaming on the non-GPU runs only
# (filter="gpu == false").
plot_pretty_boxplot(
    df,
    field="streaming",
    legend_title="Streaming mode",
    filter="gpu == false",
    title="Processing time for Streaming vs. Non-streaming",
    filename="streaming-vs-non-streaming.png",
)

In [207]:
# Mean run time for every combination of execution settings, fastest first.
(
    df.group_by("gpu", "streaming", "lazy", "preload")
    .agg(mean_duration=pl.col("duration").mean())
    .sort("mean_duration")
)

gpu,streaming,lazy,preload,mean_duration
bool,bool,bool,bool,f64
True,False,True,True,1.874659
True,False,False,False,2.788744
False,False,True,True,2.791986
False,False,False,False,2.792723
False,True,False,False,2.804007
False,False,True,False,3.065482
True,False,True,False,3.09431
False,True,True,True,7.36728
False,True,True,False,8.937575


In [25]:
# Mean duration per function, with one facet column per dataset size.
mean_duration_axis = alt.Y(
    "mean(duration):Q",
    title="Mean Duration (s)",
    scale=alt.Scale(type="symlog"),
)
function_axis = alt.X("func:N", title="Function", axis=alt.Axis(labelAngle=45))
alt.Chart(df).mark_bar().encode(
    x=function_axis,
    y=mean_duration_axis,
    color=alt.Color("func:N"),
    column=alt.Column("limit:O"),
)

In [43]:
import phik
from dython.nominal import associations

In [29]:
# phik works on pandas frames; `limit` and `duration` are declared as interval
# (numeric) columns.
results_pandas = df.to_pandas()
results_pandas.phik_matrix(interval_columns=["limit", "duration"])

interval columns not set, guessing: ['limit', 'duration']


Unnamed: 0,func,gpu,streaming,lazy,limit,preload,duration
func,1.0,0.0,0.0,0.0,0.132893,0.0,0.375772
gpu,0.0,1.0,0.622004,0.173414,0.0,0.0,0.0
streaming,0.0,0.622004,1.0,0.466351,0.0,0.173414,0.287687
lazy,0.0,0.173414,0.466351,1.0,0.0,0.622004,0.0
limit,0.132893,0.0,0.0,0.0,1.0,0.0,0.372662
preload,0.0,0.0,0.173414,0.622004,0.0,1.0,0.0
duration,0.375772,0.0,0.287687,0.0,0.372662,0.0,1.0


In [50]:
import pandas as pd
# Association matrix across all columns; only the "corr" entry of the result
# is kept.
association_result = associations(
    df.to_pandas(),
    numerical_columns=["duration", "limit"],
    filename=IMG_DIR / "correlation_matrix.png",
    plot=False,
)
correlations: pd.DataFrame = association_result["corr"]

In [55]:
# Reshape the square matrix to long form: one row per (index, variable) pair.
correlation_columns = [
    "func",
    "gpu",
    "streaming",
    "lazy",
    "limit",
    "preload",
    "duration",
]
corr_melted = correlations.reset_index().melt(
    id_vars=["index"],
    value_vars=correlation_columns,
    value_name="correlation_coefficient",
)
corr_melted

Unnamed: 0,index,variable,correlation_coefficient
0,func,func,1.0
1,gpu,func,0.0
2,streaming,func,0.0
3,lazy,func,0.0
4,limit,func,0.219695
5,preload,func,0.0
6,duration,func,0.274957
7,func,gpu,0.0
8,gpu,gpu,1.0
9,streaming,gpu,0.427327


In [69]:

# Heatmap layer: one coloured cell per pair of columns.
c = (
    alt.Chart(corr_melted)
    .mark_rect()
    .encode(
        x=alt.X("index:N", title="Column"),
        y=alt.Y("variable:N", title="Column"),
        color=alt.Color("correlation_coefficient:Q", title="Correlation Coefficient"),
    )
)

# Label layer: the coefficient is printed in each cell, black below 0.5 and
# white otherwise.
label_color = alt.condition(
    alt.datum.correlation_coefficient < 0.5,
    alt.value("black"),
    alt.value("white"),
)
txt = (
    alt.Chart(corr_melted)
    .mark_text(baseline="middle")
    .encode(
        x=alt.X("index:N", title="Column"),
        y=alt.Y("variable:N", title="Column"),
        text=alt.Text("correlation_coefficient", format=".2f"),
        color=label_color,
    )
)

out = (c + txt).properties(height=500, width=500).configure_axis(grid=False)
out.save(IMG_DIR / "correlation_matrix.png")
out

: 

We can see that lazy mode is slower than non-lazy mode for most operations, which makes sense given that in lazy mode the data is not loaded into memory.