## Lazy Dataframes

In [2]:
import polars as pl
import numpy as np
from datetime import datetime

print(pl.__version__)

1.16.0


In [26]:
import polars as pl
import numpy as np
from datetime import datetime, date, timedelta

# Create date range using timedelta
date_range = pl.date_range(
                            date(2021, 1, 1),
                            date(2023, 12, 31),
                            timedelta(days=1),
                            eager=True
                            ).alias("timestamp").to_list()

# Create a sample DataFrame
df = pl.DataFrame({
                    'id': range(1000),
                    'category': np.random.choice(['A', 'B', 'C'], 1000),
                    'value': np.random.normal(100, 15, 1000),
                    'timestamp': date_range[:1000]  # Take first 1000 dates to match other columns
                    })
df

id,category,value,timestamp
i64,str,f64,date
0,"""C""",89.468546,2021-01-01
1,"""A""",87.401196,2021-01-02
2,"""A""",103.949077,2021-01-03
3,"""A""",115.600454,2021-01-04
4,"""C""",102.574941,2021-01-05
…,…,…,…
995,"""B""",112.526763,2023-09-23
996,"""C""",76.877215,2023-09-24
997,"""A""",111.441719,2023-09-25
998,"""B""",99.221138,2023-09-26


In [27]:
# Convert to lazy DataFrame
lazy_df = df.lazy()
lazy_df

In [28]:
# Complex transformation pipeline
transformed_lazy = (
    lazy_df
    .with_columns([
        pl.col('value').rolling_mean(7).alias('rolling_avg'),
        pl.col('value').shift().alias('prev_value')
    ])
    .filter(pl.col('category').is_in(['A', 'B']))
    .group_by('category')
    .agg([
        pl.col('value').mean().alias('avg_value'),
        pl.col('value').std().alias('std_value'),
        pl.col('id').count().alias('count')
    ])
    .sort('category')
)

# Examine the optimization plan
print("Optimization Plan:")
print(transformed_lazy)


Optimization Plan:
naive plan: (run LazyFrame.explain(optimized=True) to see the optimized plan)

SORT BY [col("category")]
  AGGREGATE
  	[col("value").mean().alias("avg_value"), col("value").std().alias("std_value"), col("id").count().alias("count")] BY [col("category")] FROM
    FILTER col("category").is_in([Series]) FROM
       WITH_COLUMNS:
       [col("value").rolling_mean().alias("rolling_avg"), col("value").shift([dyn int: 1]).alias("prev_value")] 
        DF ["id", "category", "value", "timestamp"]; PROJECT */4 COLUMNS; SELECTION: None


In [29]:
filtered_lazy = (
                df.lazy()
                .filter(pl.col('value') > 100)
                .with_columns(pl.col('value').alias('high_value'))
                .select(['category', 'high_value'])
                )
filtered_lazy

In [30]:
# Example of lazy joins
other_df = pl.DataFrame({
    'category': ['A', 'B', 'C'],
    'category_name': ['Alpha', 'Beta', 'Gamma']
})

joined_lazy = (
    df.lazy()
    .join(
        other_df.lazy(),
        on='category',
        how='left'
    )
    .select([
        'id',
        'category_name',
        'value'
    ])
)
joined_lazy