Polars documentation: https://docs.pola.rs/

In [1]:
import polars as pl

# Source CSV for the lazy-query example.
path = "./sample_data/california_housing_test.csv"

# scan_csv builds a lazy query; nothing is materialised until .collect().
q = (
    pl.scan_csv(path)
    .filter(pl.col("housing_median_age").gt(10))  # keep rows with median age > 10
    .group_by("households")
    .agg(pl.all().sum())  # sum every other column within each group
)

# Run the query and materialise the result as a DataFrame.
df = q.collect()

In [2]:
import polars as pl
from datetime import datetime


# Small demo frame with one column each of int, datetime, float and str data.
df = pl.DataFrame(
    {
        "integer": [1, 2, 3],
        "date": [datetime(2025, 1, day) for day in (1, 2, 3)],
        "float": [4.0, 5.0, 6.0],
        "string": ["a", "b", "c"],
    }
)

print(df)

shape: (3, 4)
┌─────────┬─────────────────────┬───────┬────────┐
│ integer ┆ date                ┆ float ┆ string │
│ ---     ┆ ---                 ┆ ---   ┆ ---    │
│ i64     ┆ datetime[μs]        ┆ f64   ┆ str    │
╞═════════╪═════════════════════╪═══════╪════════╡
│ 1       ┆ 2025-01-01 00:00:00 ┆ 4.0   ┆ a      │
│ 2       ┆ 2025-01-02 00:00:00 ┆ 5.0   ┆ b      │
│ 3       ┆ 2025-01-03 00:00:00 ┆ 6.0   ┆ c      │
└─────────┴─────────────────────┴───────┴────────┘


In [3]:
# Round-trip the frame through CSV. CSV stores no dtype information, so the
# "date" column is read back as str (see the dtype row in the output);
# pl.read_csv(..., try_parse_dates=True) would restore datetimes.
output_csv = "./sample_data/output.csv"
df.write_csv(output_csv)
df_csv = pl.read_csv(output_csv)
print(df_csv)

shape: (3, 4)
┌─────────┬────────────────────────────┬───────┬────────┐
│ integer ┆ date                       ┆ float ┆ string │
│ ---     ┆ ---                        ┆ ---   ┆ ---    │
│ i64     ┆ str                        ┆ f64   ┆ str    │
╞═════════╪════════════════════════════╪═══════╪════════╡
│ 1       ┆ 2025-01-01T00:00:00.000000 ┆ 4.0   ┆ a      │
│ 2       ┆ 2025-01-02T00:00:00.000000 ┆ 5.0   ┆ b      │
│ 3       ┆ 2025-01-03T00:00:00.000000 ┆ 6.0   ┆ c      │
└─────────┴────────────────────────────┴───────┴────────┘


In [4]:
# Select every column (pl.all() is the wildcard selector).
df.select(pl.all())

integer,date,float,string
i64,datetime[μs],f64,str
1,2025-01-01 00:00:00,4.0,"""a"""
2,2025-01-02 00:00:00,5.0,"""b"""
3,2025-01-03 00:00:00,6.0,"""c"""


In [5]:
# Select two columns by name.
df.select("integer", "date")

integer,date
i64,datetime[μs]
1,2025-01-01 00:00:00
2,2025-01-02 00:00:00
3,2025-01-03 00:00:00


In [6]:
# Keep rows dated 2025-01-02 through 2025-01-03; both bounds are inclusive.
df.filter(
    pl.col("date").is_between(
        datetime(2025, 1, 2),
        datetime(2025, 1, 3),
        closed="both",
    )
)

integer,date,float,string
i64,datetime[μs],f64,str
2,2025-01-02 00:00:00,5.0,"""b"""
3,2025-01-03 00:00:00,6.0,"""c"""


In [7]:
# Both conditions must hold: float above 3.0 AND date exactly 2025-01-03.
df.filter(
    (pl.col("float") > 3.0) & (pl.col("date") == datetime(2025, 1, 3))
)

integer,date,float,string
i64,datetime[μs],f64,str
3,2025-01-03 00:00:00,6.0,"""c"""


In [8]:
# The cell evaluates to a 2-tuple of frames, so it is rendered as a tuple repr.
(
    # Column total of "float", repeated on every row.
    df.with_columns(sum_float=pl.col("float").sum()),
    # Each "string" value with "e" appended.
    df.with_columns(string_e=pl.col("string") + "e"),
)


(shape: (3, 5)
 ┌─────────┬─────────────────────┬───────┬────────┬───────────┐
 │ integer ┆ date                ┆ float ┆ string ┆ sum_float │
 │ ---     ┆ ---                 ┆ ---   ┆ ---    ┆ ---       │
 │ i64     ┆ datetime[μs]        ┆ f64   ┆ str    ┆ f64       │
 ╞═════════╪═════════════════════╪═══════╪════════╪═══════════╡
 │ 1       ┆ 2025-01-01 00:00:00 ┆ 4.0   ┆ a      ┆ 15.0      │
 │ 2       ┆ 2025-01-02 00:00:00 ┆ 5.0   ┆ b      ┆ 15.0      │
 │ 3       ┆ 2025-01-03 00:00:00 ┆ 6.0   ┆ c      ┆ 15.0      │
 └─────────┴─────────────────────┴───────┴────────┴───────────┘,
 shape: (3, 5)
 ┌─────────┬─────────────────────┬───────┬────────┬──────────┐
 │ integer ┆ date                ┆ float ┆ string ┆ string_e │
 │ ---     ┆ ---                 ┆ ---   ┆ ---    ┆ ---      │
 │ i64     ┆ datetime[μs]        ┆ f64   ┆ str    ┆ str      │
 ╞═════════╪═════════════════════╪═══════╪════════╪══════════╡
 │ 1       ┆ 2025-01-01 00:00:00 ┆ 4.0   ┆ a      ┆ ae       │
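 │ 2       ┆ 2025-01-02 00:00:00 ┆ 5.0   ┆ b      ┆ be       │
 │ 3       ┆ 2025-01-03 00:00:00 ┆ 6.0   ┆ c      ┆ ce       │
 └─────────┴─────────────────────┴───────┴────────┴──────────┘)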

In [9]:
# Eight rows: x counts 0..7 and y is a group label (3 x A, 2 x B, 1 x C, 2 x X).
df2 = pl.DataFrame(
    {
        "x": range(8),
        "y": ["A"] * 3 + ["B"] * 2 + ["C"] + ["X"] * 2,
    }
)

In [10]:
# Per-group count and sum, keeping groups in first-seen order. Inside agg the
# wildcard covers the non-key columns, which for df2 is only "x".
df2.group_by("y", maintain_order=True).agg(
    count=pl.all().count(),
    sum=pl.all().sum(),
)

y,count,sum
str,u32,i64
"""A""",3,3
"""B""",2,7
"""C""",1,5
"""X""",2,13


In [12]:
# Rebind df to a frame with short column names (a: int, b: datetime,
# c: float, d: str) for the expression examples that follow.
df = pl.DataFrame(
    {
        "a": [1, 2, 3],
        "b": [datetime(2025, 1, day) for day in range(1, 4)],
        "c": [4.0, 5.0, 6.0],
        "d": ["a", "b", "c"],
    }
)

In [13]:
# Add the product column "a * b", then drop "c" and "d".
# NOTE(review): "b" is a datetime column, so the product is an i64 that looks
# like a multiple of the microsecond epoch timestamp (see the output) — probably
# not a meaningful quantity, and other polars versions may reject the
# operation; TODO confirm the intended operand.
df_x = (
    df.with_columns((pl.col("a") * pl.col("b")).alias("a * b"))
    .select(pl.exclude(["c", "d"]))
)

print(df_x)

shape: (3, 3)
┌─────┬─────────────────────┬──────────────────┐
│ a   ┆ b                   ┆ a * b            │
│ --- ┆ ---                 ┆ ---              │
│ i64 ┆ datetime[μs]        ┆ i64              │
╞═════╪═════════════════════╪══════════════════╡
│ 1   ┆ 2025-01-01 00:00:00 ┆ 1735689600000000 │
│ 2   ┆ 2025-01-02 00:00:00 ┆ 3471552000000000 │
│ 3   ┆ 2025-01-03 00:00:00 ┆ 5207587200000000 │
└─────┴─────────────────────┴──────────────────┘


In [15]:
# Same product column as df_x, but only "d" is dropped, so "c" is kept.
# NOTE(review): "b" is a datetime column; the i64 product appears to be computed
# on its microsecond timestamps (see the output) — TODO confirm this is intended.
df_y = (
    df.with_columns((pl.col("a") * pl.col("b")).alias("a * b"))
    .select(pl.exclude("d"))
)
print(df_y)

shape: (3, 4)
┌─────┬─────────────────────┬─────┬──────────────────┐
│ a   ┆ b                   ┆ c   ┆ a * b            │
│ --- ┆ ---                 ┆ --- ┆ ---              │
│ i64 ┆ datetime[μs]        ┆ f64 ┆ i64              │
╞═════╪═════════════════════╪═════╪══════════════════╡
│ 1   ┆ 2025-01-01 00:00:00 ┆ 4.0 ┆ 1735689600000000 │
│ 2   ┆ 2025-01-02 00:00:00 ┆ 5.0 ┆ 3471552000000000 │
│ 3   ┆ 2025-01-03 00:00:00 ┆ 6.0 ┆ 5207587200000000 │
└─────┴─────────────────────┴─────┴──────────────────┘


In [17]:
import numpy as np

# Seeded generator so the random "b" column is identical on every
# Restart & Run All (the unseeded np.random.rand gave new values each run).
rng = np.random.default_rng(42)

df = pl.DataFrame(
    {
        "a": range(8),
        "b": rng.random(8),  # 8 uniform floats in [0, 1)
        # Mixes ordinary floats, NaN and a missing value (None -> null).
        "d": [1, 2.0, float("nan"), float("nan"), 0, -5, -42, None],
    }
)
df2 = pl.DataFrame(
    {
        "x": range(8),
        "y": ["A", "A", "A", "B", "B", "C", "X", "X"],
    }
)
# Join on df.a == df2.x with the default strategy (inner); the right-hand key
# column "x" does not appear in the result.
joined = df.join(df2, left_on="a", right_on="x")
print(joined)

shape: (8, 4)
┌─────┬──────────┬───────┬─────┐
│ a   ┆ b        ┆ d     ┆ y   │
│ --- ┆ ---      ┆ ---   ┆ --- │
│ i64 ┆ f64      ┆ f64   ┆ str │
╞═════╪══════════╪═══════╪═════╡
│ 0   ┆ 0.059247 ┆ 1.0   ┆ A   │
│ 1   ┆ 0.182831 ┆ 2.0   ┆ A   │
│ 2   ┆ 0.433984 ┆ NaN   ┆ A   │
│ 3   ┆ 0.951915 ┆ NaN   ┆ B   │
│ 4   ┆ 0.087812 ┆ 0.0   ┆ B   │
│ 5   ┆ 0.901299 ┆ -5.0  ┆ C   │
│ 6   ┆ 0.775381 ┆ -42.0 ┆ X   │
│ 7   ┆ 0.533215 ┆ null  ┆ X   │
└─────┴──────────┴───────┴─────┘


In [18]:
# Column-wise concatenation: append df2's columns to the right of df's
# (both frames have 8 rows).
stacked = df.hstack(df2)
print(stacked)

shape: (8, 5)
┌─────┬──────────┬───────┬─────┬─────┐
│ a   ┆ b        ┆ d     ┆ x   ┆ y   │
│ --- ┆ ---      ┆ ---   ┆ --- ┆ --- │
│ i64 ┆ f64      ┆ f64   ┆ i64 ┆ str │
╞═════╪══════════╪═══════╪═════╪═════╡
│ 0   ┆ 0.059247 ┆ 1.0   ┆ 0   ┆ A   │
│ 1   ┆ 0.182831 ┆ 2.0   ┆ 1   ┆ A   │
│ 2   ┆ 0.433984 ┆ NaN   ┆ 2   ┆ A   │
│ 3   ┆ 0.951915 ┆ NaN   ┆ 3   ┆ B   │
│ 4   ┆ 0.087812 ┆ 0.0   ┆ 4   ┆ B   │
│ 5   ┆ 0.901299 ┆ -5.0  ┆ 5   ┆ C   │
│ 6   ┆ 0.775381 ┆ -42.0 ┆ 6   ┆ X   │
│ 7   ┆ 0.533215 ┆ null  ┆ 7   ┆ X   │
└─────┴──────────┴───────┴─────┴─────┘
