# Analysis of Polars GPU Performance on 100M Dataset

In [2]:
import polars as pl
import matplotlib.pyplot as plt
import seaborn as sns
import altair as alt

# Set plot styles
# plt.style.use('seaborn')
# sns.set_palette("viridis")

In [3]:
# Load the benchmark results and scale `duration` by 1000 so that timings
# read as milliseconds (assumes the parquet stores seconds -- TODO confirm).
df = pl.read_parquet("results_polars_gpu_100M.parquet").with_columns(
    pl.col("duration") * 1000
)

# Quick look at the frame: first rows, shape, column names and dtypes.
print(df.head())
print(f"\nDataset shape: {df.shape}")
print(f"\nColumn names: {df.columns}")
print("\nData types:", df.dtypes, sep="\n")

shape: (5, 7)
┌───────────────┬──────┬───────────┬──────┬──────────┬─────────┬─────────────┐
│ func          ┆ gpu  ┆ streaming ┆ lazy ┆ limit    ┆ preload ┆ duration    │
│ ---           ┆ ---  ┆ ---       ┆ ---  ┆ ---      ┆ ---     ┆ ---         │
│ str           ┆ bool ┆ bool      ┆ bool ┆ i64      ┆ bool    ┆ f64         │
╞═══════════════╪══════╪═══════════╪══════╪══════════╪═════════╪═════════════╡
│ polars_filter ┆ true ┆ false     ┆ true ┆ 14000000 ┆ true    ┆ 448.066711  │
│ polars_filter ┆ true ┆ false     ┆ true ┆ 34000000 ┆ true    ┆ 1065.184832 │
│ polars_filter ┆ true ┆ false     ┆ true ┆ 54000000 ┆ true    ┆ 1666.105747 │
│ polars_filter ┆ true ┆ false     ┆ true ┆ 74000000 ┆ true    ┆ 2281.038046 │
│ polars_filter ┆ true ┆ false     ┆ true ┆ 94000000 ┆ true    ┆ 2934.397697 │
└───────────────┴──────┴───────────┴──────┴──────────┴─────────┴─────────────┘

Dataset shape: (240, 7)

Column names: ['func', 'gpu', 'streaming', 'lazy', 'limit', 'preload', 'duration']

Data ty

## Exploratory Data Analysis

In [4]:
# Summary statistics for every column of the benchmark frame
summary_stats = df.describe()
print(summary_stats)

shape: (9, 8)
┌────────────┬───────────────┬──────────┬───────────┬───────┬──────────┬─────────┬──────────────┐
│ statistic  ┆ func          ┆ gpu      ┆ streaming ┆ lazy  ┆ limit    ┆ preload ┆ duration     │
│ ---        ┆ ---           ┆ ---      ┆ ---       ┆ ---   ┆ ---      ┆ ---     ┆ ---          │
│ str        ┆ str           ┆ f64      ┆ f64       ┆ f64   ┆ f64      ┆ f64     ┆ f64          │
╞════════════╪═══════════════╪══════════╪═══════════╪═══════╪══════════╪═════════╪══════════════╡
│ count      ┆ 240           ┆ 240.0    ┆ 240.0     ┆ 240.0 ┆ 240.0    ┆ 240.0   ┆ 240.0        │
│ null_count ┆ 0             ┆ 0.0      ┆ 0.0       ┆ 0.0   ┆ 0.0      ┆ 0.0     ┆ 0.0          │
│ mean       ┆ null          ┆ 0.333333 ┆ 0.333333  ┆ 0.5   ┆ 5.4e7    ┆ 0.5     ┆ 5173.244514  │
│ std        ┆ null          ┆ null     ┆ null      ┆ null  ┆ 2.8343e7 ┆ null    ┆ 8792.654451  │
│ min        ┆ polars_filter ┆ 0.0      ┆ 0.0       ┆ 0.0   ┆ 1.4e7    ┆ 0.0     ┆ 44.833183    │
│ 25% 

## Performance Analysis

In [5]:
# Compare GPU vs CPU performance: mean duration per function, drawn as one
# facet column per value of the `gpu` flag.
gpu_vs_cpu = (
    alt.Chart(df)
    .mark_bar()
    .encode(
        # The x axis itself is hidden; the colour legend identifies each function.
        x=alt.X("func:N", title="Function", axis=None),
        y=alt.Y("mean(duration):Q", title="Mean Duration (ms)"),
        # Colour encodes `func`, so the legend must be titled "Function"
        # (it was previously mislabelled "GPU").
        color=alt.Color("func:N", title="Function"),
        # The facet header is the channel that actually carries the GPU flag.
        column=alt.Column("gpu:N", title="GPU"),
    )
    .properties(title="GPU vs CPU Performance Comparison")
)

gpu_vs_cpu

In [6]:
# Effect of lazy execution: duration box plots split by the `lazy` flag,
# with one facet column per benchmarked function.
lazy_boxes = alt.Chart(df).mark_boxplot()

lazy_effect = lazy_boxes.encode(
    x=alt.X("lazy:N", title="Lazy Execution"),
    y=alt.Y("duration:Q", title="Duration (ms)"),
    color=alt.Color("lazy:N"),
    column=alt.Column("func:N", title="Function"),
).properties(title="Effect of Lazy Execution on Performance")

lazy_effect

In [7]:
# Effect of streaming: duration box plots split by the `streaming` flag,
# with one facet column per benchmarked function.
streaming_boxes = alt.Chart(df).mark_boxplot()

streaming_effect = streaming_boxes.encode(
    x=alt.X("streaming:N", title="Streaming"),
    y=alt.Y("duration:Q", title="Duration (ms)"),
    color=alt.Color("streaming:N"),
    column=alt.Column("func:N", title="Function"),
).properties(title="Effect of Streaming on Performance")

streaming_effect

## Insights and Conclusions

1. GPU vs CPU Performance:
   - [Insert observations about GPU vs CPU performance]

2. Effect of Lazy Execution:
   - [Insert observations about the impact of lazy execution]

3. Impact of Streaming:
   - [Insert observations about the effect of streaming]

4. Function-specific Performance:
   - [Insert observations about how different functions perform]

5. Overall Performance Considerations:
   - [Insert general conclusions and recommendations based on the analysis]

**TODO:** The bracketed placeholders above have not been filled in yet. Replace each one with concrete observations after running the notebook end to end and analyzing the results.