In [None]:
import polars as pl

# Greedy Evaluation
Take a good look at the following transformation taking place on the Pokémon dataset. You might have noticed that this code is perfectly functional, but not yet optimized. 

### Exercise 3
Optimize the code in the second function and run some timings. See if you can improve performance by yourself.

In [None]:
def some_transformation() -> pl.DataFrame:
    """Run the baseline, not-yet-optimized eager transformation.

    Reads ``data/Pokemon.csv``, adds the mean and median ``Attack`` per
    ``Type1``, keeps rows with ``HP`` above 70 and ``Generation`` below 3,
    narrows the frame to six columns, and finally keeps only rows whose
    per-type median attack is above 60.

    Returns:
        The resulting eager ``pl.DataFrame``.
    """
    pokemon = pl.read_csv('data/Pokemon.csv')

    # Window aggregates: each statistic is evaluated per "Type1" group.
    attack = pl.col("Attack")
    with_stats = pokemon.with_columns(
        attack.mean().over("Type1").alias("Avg. Attack/Type"),
        attack.median().over("Type1").alias("Med. Attack/Type"),
    )

    # The step order below is intentionally left as-is: this function is the
    # reference that the optimized variant is timed against.
    hp_and_generation = (pl.col("HP") > 70) & (pl.col("Generation") < 3)
    filtered = with_stats.filter(hp_and_generation)
    narrowed = filtered.select(
        pl.col("Name", "Type1", "HP", "Avg. Attack/Type", "Med. Attack/Type", "Generation")
    )
    return narrowed.filter(pl.col("Med. Attack/Type") > 60)

def some_transformation_optimized() -> pl.DataFrame:
    """Produce the same result as ``some_transformation`` with less work.

    Two hand-applied optimizations compared to the baseline:

    * Only the five columns the pipeline actually uses are parsed from the
      CSV file, instead of every column.
    * The three row predicates are combined into a single ``filter`` pass
      instead of two separate passes.

    Returns:
        A ``pl.DataFrame`` with the columns ``Name``, ``Type1``, ``HP``,
        ``Avg. Attack/Type``, ``Med. Attack/Type`` and ``Generation``.
    """
    df = pl.read_csv(
        'data/Pokemon.csv',
        # Projection at read time: skip parsing columns that are never used.
        columns=["Name", "Type1", "HP", "Attack", "Generation"],
    )
    # The window aggregates must be computed BEFORE any row is filtered out:
    # they are statistics over every row of a "Type1" group, so filtering on
    # HP / Generation first would change the averages and medians.
    df = df.with_columns(
        pl.col("Attack").mean().over("Type1").alias("Avg. Attack/Type"),
        pl.col("Attack").median().over("Type1").alias("Med. Attack/Type")
    ).filter(
        # A row is kept only when all three predicates hold, which matches
        # applying them in separate consecutive filters.
        (pl.col("HP") > 70) &
        (pl.col("Generation") < 3) &
        (pl.col("Med. Attack/Type") > 60)
    ).select(
        pl.col("Name", "Type1", "HP", "Avg. Attack/Type", "Med. Attack/Type", "Generation")
    )
    return df

In [None]:
# Time both variants with IPython's %timeit magic. The -o flag makes %timeit
# return its result object, whose .best and .average attributes are read by
# the plotting cell that follows.
t_time = %timeit -o some_transformation()
o_time = %timeit -o some_transformation_optimized()

In [None]:
import plotly.express as px

# Collect the timing summaries of both variants in a column-oriented mapping
# that plotly express can plot directly.
data = dict(
    sort=["unoptimized", "optimized"],
    best=[t_time.best, o_time.best],
    mean=[t_time.average, o_time.average],
)

# One bar chart per metric: the best run first, then the mean.
for metric in ("best", "mean"):
    fig = px.bar(data, x="sort", y=metric, color="sort")
    fig.show()


# Lazy Evaluation
This brings us to what is probably one of the coolest features of Polars: lazy DataFrame evaluation. With lazy evaluation, Polars first builds a query plan, drawing on 50+ years of RDBMS knowledge. This allows Polars to apply internal optimizations - such as reading in only the columns that are necessary - before the query is evaluated.

Switching to LazyFrames, as they are called, is easy. DataFrames can be converted by using the .lazy() function. Some I/O options also have a scan_ function (for example scan_csv) that gives you a LazyFrame directly. Finally, to evaluate the query, you call its .collect() function.

No exercise this time; just look at the timings. This dataset is probably too small to make a big difference, but for larger datasets the speedups can be very significant.

In [None]:
def some_transformation_eager() -> pl.DataFrame:
    """Eager pipeline used as the reference in the eager-vs-lazy comparison.

    Loads ``data/Pokemon.csv`` with ``pl.read_csv`` and applies every step
    immediately: per-``Type1`` mean and median of ``Attack``, a filter on
    ``HP`` and ``Generation``, a six-column selection, and a final filter on
    the per-type median attack.

    Returns:
        The resulting eager ``pl.DataFrame``.
    """
    # Describe the individual steps as expressions first ...
    avg_attack = pl.col("Attack").mean().over("Type1").alias("Avg. Attack/Type")
    med_attack = pl.col("Attack").median().over("Type1").alias("Med. Attack/Type")
    hp_and_generation = (pl.col("HP") > 70) & (pl.col("Generation") < 3)
    kept_columns = pl.col(
        "Name", "Type1", "HP", "Avg. Attack/Type", "Med. Attack/Type", "Generation"
    )
    median_above_60 = pl.col("Med. Attack/Type") > 60

    # ... then apply them one after another, in the original order.
    frame = pl.read_csv('data/Pokemon.csv')
    frame = frame.with_columns(avg_attack, med_attack)
    frame = frame.filter(hp_and_generation)
    frame = frame.select(kept_columns)
    frame = frame.filter(median_above_60)
    return frame


def some_transformation_lazy() -> pl.DataFrame:
    """Lazy counterpart of ``some_transformation_eager``.

    Builds the same sequence of steps on top of ``pl.scan_csv`` and only
    materializes the result at the very end through ``collect()``.

    Returns:
        The collected ``pl.DataFrame``.
    """
    attack = pl.col("Attack")

    plan = pl.scan_csv('data/Pokemon.csv')
    plan = plan.with_columns(
        attack.mean().over("Type1").alias("Avg. Attack/Type"),
        attack.median().over("Type1").alias("Med. Attack/Type"),
    )
    plan = plan.filter((pl.col("HP") > 70) & (pl.col("Generation") < 3))
    plan = plan.select(
        pl.col("Name", "Type1", "HP", "Avg. Attack/Type", "Med. Attack/Type", "Generation")
    )
    plan = plan.filter(pl.col("Med. Attack/Type") > 60)

    # collect() executes the query built above and returns a DataFrame.
    return plan.collect()

# Time the eager and the lazy pipeline with IPython's %timeit magic. The -o
# flag makes %timeit return its result object, whose .best and .average
# attributes are read when building the charts below.
e_time = %timeit -o some_transformation_eager()
l_time = %timeit -o some_transformation_lazy()

# Timing summaries of the eager and the lazy run, one list entry per variant.
data = {
    "sort": ["eager", "lazy"],
    "best": [e_time.best, l_time.best],
    "mean": [e_time.average, l_time.average],
}

# Draw the "best" chart first and the "mean" chart second.
for metric in ("best", "mean"):
    fig = px.bar(data, x="sort", y=metric, color="sort")
    fig.show()


To see that something is happening behind the scenes, we can print the query plans Polars has generated. As you can see, the order of the expressions has been adjusted, and not all columns are read in from the file.

In [None]:
# Build the same query as `some_transformation_lazy`, but keep it as an
# un-collected LazyFrame so that its query plans can be inspected.
q = (
    pl.scan_csv('data/Pokemon.csv')
    .with_columns(
        pl.col("Attack").mean().over("Type1").alias("Avg. Attack/Type"),
        pl.col("Attack").median().over("Type1").alias("Med. Attack/Type"),
    )
    .filter(
        (pl.col("HP") > 70) &
        (pl.col("Generation") < 3)
    )
    .select(
        pl.col("Name", "Type1", "HP", "Avg. Attack/Type", "Med. Attack/Type", "Generation")
    )
    .filter(
        pl.col("Med. Attack/Type") > 60
    )
)

# Print each plan under its own heading and with its own print() call.
# Passing both plans to a single print() would join them with the default
# " " separator, which puts a stray space in front of the second plan's first
# line and misaligns it.
print("Optimized plan:")
print(q.explain(optimized=True))
print('\n' * 3)
print("Unoptimized plan:")
print(q.explain(optimized=False))