## Lazy Dataframes

In [1]:
import polars as pl
import numpy as np
from datetime import datetime

print(pl.__version__)

0.19.0


In [2]:
data = {"a": [1, 2], "b": [3, 4]}
lf = pl.LazyFrame(data)
lf.collect()

a,b
i64,i64
1,3
2,4


In [3]:
import numpy as np
data = np.array([(1, 2), (3, 4)], dtype=np.int64)
lf = pl.LazyFrame(data, schema=["a", "b"], orient="col")
lf.collect()

a,b
i64,i64
1,3
2,4


In [4]:
import polars as pl
import numpy as np
from datetime import datetime, date, timedelta

# Create date range using timedelta
date_range = pl.date_range(
                            date(2021, 1, 1),
                            date(2023, 12, 31),
                            timedelta(days=1),
                            eager=True
                            ).alias("timestamp").to_list()

# Create a sample DataFrame
df = pl.DataFrame({
                    'id': range(1000),
                    'category': np.random.choice(['A', 'B', 'C'], 1000),
                    'value': np.random.normal(100, 15, 1000),
                    'timestamp': date_range[:1000]  # Take first 1000 dates to match other columns
                    })
df

id,category,value,timestamp
i64,str,f64,date
0,"""C""",106.461047,2021-01-01
1,"""C""",100.465115,2021-01-02
2,"""B""",90.49305,2021-01-03
3,"""C""",124.151213,2021-01-04
4,"""B""",100.626111,2021-01-05
…,…,…,…
995,"""B""",88.629508,2023-09-23
996,"""C""",95.764697,2023-09-24
997,"""B""",80.374095,2023-09-25
998,"""C""",96.18892,2023-09-26


In [5]:
# Convert to lazy DataFrame
lazy_df = df.lazy()
lazy_df

In [6]:
# Complex transformation pipeline
transformed_lazy = (
    lazy_df
    .with_columns([
        pl.col('value').rolling_mean(7).alias('rolling_avg'),
        pl.col('value').shift().alias('prev_value')
    ])
    .filter(pl.col('category').is_in(['A', 'B']))
    .group_by('category')
    .agg([
        pl.col('value').mean().alias('avg_value'),
        pl.col('value').std().alias('std_value'),
        pl.col('id').count().alias('count')
    ])
    .sort('category')
)

# Examine the optimization plan
print("Optimization Plan:")
print(transformed_lazy)


Optimization Plan:
naive plan: (run LazyFrame.explain(optimized=True) to see the optimized plan)

SORT BY [col("category")]
  AGGREGATE
  	[col("value").mean().alias("avg_value"), col("value").std().alias("std_value"), col("id").count().alias("count")] BY [col("category")] FROM
    FILTER col("category").is_in([Series]) FROM
       WITH_COLUMNS:
       [col("value").rolling_mean().alias("rolling_avg"), col("value").shift([dyn int: 1]).alias("prev_value")], [] 
        DF ["id", "category", "value", "timestamp"]; PROJECT */4 COLUMNS; SELECTION: None


In [7]:
filtered_lazy = (
                df.lazy()
                .filter(pl.col('value') > 100)
                .with_columns(pl.col('value').alias('high_value'))
                .select(['category', 'high_value'])
                )
filtered_lazy

In [8]:
# Example of lazy joins
other_df = pl.DataFrame({
    'category': ['A', 'B', 'C'],
    'category_name': ['Alpha', 'Beta', 'Gamma']
})

joined_lazy = (
    df.lazy()
    .join(
        other_df.lazy(),
        on='category',
        how='left'
    )
    .select([
        'id',
        'category_name',
        'value'
    ])
)
joined_lazy

  .join(


In [9]:
from datetime import date
import polars as pl

(pl.date_range(
            start=date(2024, 1, 1),
            end=date(2025, 1, 1),
            interval="1d",
            closed="left",  # Don't include `end`
            eager=True,
            ).to_frame("d")
            .group_by_dynamic("d", every="1mo")
            .agg(pl.len().alias("days_in_month"))
            .select(
                pl.col("d").dt.month().alias("month"),
                pl.col("days_in_month"),
            ))

month,days_in_month
i8,u32
1,31
2,29
3,31
4,30
5,31
…,…
8,31
9,30
10,31
11,30
