# test_lazy.py -- tests for the polars lazy API (forked from pola-rs/polars)
from polars import DataFrame, Series
from polars.lazy import *
from polars.datatypes import *
import polars as pl
import pytest
def test_lazy():
    """Check basic lazy queries: a literal column, a projection and when/then/otherwise."""
    df = DataFrame({"a": [1, 2, 3], "b": [1.0, 2.0, 3.0]})

    # Add a literal column, then project two of the resulting columns.
    ldf = df.lazy().with_column(lit(1).alias("foo")).select([col("a"), col("foo")])
    out = ldf.collect()
    print(out)
    assert out.columns == ["a", "foo"]
    assert out.shape == (3, 2)

    # Conditional expression: 10 where a > 2, otherwise 1 -> [1, 1, 10].
    new = (
        df.lazy()
        .with_column(
            when(col("a").gt(lit(2))).then(lit(10)).otherwise(lit(1)).alias("new")
        )
        .collect()
    )
    assert new["new"].sum() == 12
def test_apply():
    """Map a Python callable over column "a" and verify the doubled values."""
    df = DataFrame({"a": [1, 2, 3], "b": [1.0, 2.0, 3.0]})
    new = df.lazy().with_column(col("a").map(lambda s: s * 2).alias("foo")).collect()
    # a * 2 -> [2, 4, 6]; previously the result was computed but never checked.
    assert new["foo"].sum() == 12
def test_add_eager_column():
    """An eager Series wrapped in ``pl.lit`` can be attached through a lazy query."""
    frame = DataFrame({"a": [1, 2, 3], "b": [1.0, 2.0, 3.0]})
    eager_column = pl.Series("c", [1, 2, 3])

    result = frame.lazy().with_column(pl.lit(eager_column)).collect()

    assert result["c"].sum() == 6
def test_set_null():
    """Rows where a > 1 become null in "foo"; the remaining row gets 100."""
    frame = DataFrame({"a": [1, 2, 3], "b": [1.0, 2.0, 3.0]})
    null_or_hundred = when(col("a") > 1).then(lit(None)).otherwise(100).alias("foo")

    collected = frame.lazy().with_column(null_or_hundred).collect()
    foo = collected["foo"]

    assert foo[0] == 100
    assert foo[1] is None
    assert foo[2] is None
def test_agg():
    """A frame-level ``min`` on a lazy frame collects to one row with both columns."""
    frame = DataFrame({"a": [1, 2, 3], "b": [1.0, 2.0, 3.0]})
    minimum = frame.lazy().min().collect()
    assert minimum.shape == (1, 2)
def test_fold():
    """``pl.sum`` over a list of column names yields a row-wise "sum" column."""
    frame = DataFrame({"a": [1, 2, 3], "b": [1.0, 2.0, 3.0]})
    query = frame.lazy().select(pl.sum(["a", "b"]))

    result = query.collect()

    expected = Series("sum", [2, 4, 6])
    assert result["sum"].series_equal(expected)
def test_or():
    """Filtering with two predicates combined by ``|`` keeps rows matching either."""
    frame = DataFrame({"a": [1, 2, 3], "b": [1.0, 2.0, 3.0]})
    either = (pl.col("a") == 1) | (pl.col("b") > 2)

    result = frame.lazy().filter(either).collect()

    # Row 0 matches a == 1 and row 2 matches b > 2.
    n_rows = result.shape[0]
    assert n_rows == 2
def test_groupby_apply():
    """An identity function applied per group reproduces the input once re-sorted."""
    frame = DataFrame({"a": [1, 1, 3], "b": [1.0, 2.0, 3.0]})
    query = frame.lazy().groupby("a").apply(lambda group: group)

    # Sort by "b" before comparing against the original frame.
    round_tripped = query.collect().sort("b")
    assert round_tripped.frame_equal(frame)
def test_binary_function():
    """``map_binary`` applies a two-argument function to a pair of columns."""
    df = DataFrame({"a": [1, 2, 3], "b": [1.0, 2.0, 3.0]})
    out = (
        df.lazy()
        .with_column(map_binary(col("a"), col("b"), lambda a, b: a + b))
        .collect()
    )
    # Compare the materialised values: `assert series == series` would assert
    # the truthiness of a boolean Series instead of element-wise equality.
    expected = out["a"] + out["b"]
    assert out["binary_function"].to_list() == expected.to_list()
def test_filter_str():
    """Filter on a boolean column and select the last remaining row."""
    # NOTE(review): the original comment said "use a str instead of a column
    # expr", but the filter below is given a column expression -- confirm
    # whether a plain-string predicate was meant to be covered here.
    df = pl.DataFrame(
        {
            "time": ["11:11:00", "11:12:00", "11:13:00", "11:14:00"],
            "bools": [True, False, True, False],
        }
    )
    q = df.lazy()
    # last row based on a filter; collect so the query actually executes
    # (previously the lazy query was built and then discarded).
    result = q.filter(pl.col("bools")).select(pl.last("*")).collect()
    # One row (the last one passing the filter) across both input columns.
    assert result.shape == (1, 2)
def test_apply_custom_function():
    """Group lengths from a Python ``apply`` callback are checked alongside ``pl.count``."""
    frame = pl.DataFrame(
        {
            "A": [1, 2, 3, 4, 5],
            "fruits": ["banana", "banana", "apple", "apple", "banana"],
            "B": [5, 4, 3, 2, 1],
            "cars": ["beetle", "audi", "beetle", "beetle", "beetle"],
        }
    )

    # Both custom columns compute the group length through a Python callback;
    # the third aggregation uses the built-in count.
    aggregations = [
        pl.col("cars").apply(lambda grp: grp.len()).alias("custom_1"),
        pl.col("cars").apply(lambda grp: grp.len()).alias("custom_2"),
        pl.count("cars"),
    ]
    query = frame.lazy().groupby("fruits").agg(aggregations)
    result = query.sort("custom_1", reverse=True).collect()

    expected = pl.DataFrame(
        {
            "fruits": ["banana", "apple"],
            "custom_1": [3, 2],
            "custom_2": [3, 2],
            "cars_count": [3, 2],
        }
    )
    # Cast the expected count column to UInt32 before comparing frames.
    expected["cars_count"] = expected["cars_count"].cast(pl.UInt32)

    assert result.frame_equal(expected)
def test_groupby():
    """Aggregate the mean of a column containing a null, grouped by a string key."""
    df = pl.DataFrame({"a": [1.0, None, 3.0, 4.0], "groups": ["a", "a", "b", "b"]})
    out = df.lazy().groupby("groups").agg(pl.mean("a")).collect()
    # Two distinct groups, each with the key column plus one aggregate column.
    # Previously the result was computed but never checked.
    assert out.shape == (2, 2)
def test_shift_and_fill():
    """``shift_and_fill`` leaves no nulls, via both the expression and the frame method."""
    frame = pl.DataFrame({"a": [1, 2, 3, 4, 5], "b": [1, 2, 3, 4, 5]})

    # Expression form: shift "a" by -2 and fill with the mean of "b".
    shifted_expr = col("a").shift_and_fill(-2, col("b").mean())
    via_expr = frame.lazy().with_column(shifted_expr).collect()
    assert via_expr["a"].null_count() == 0

    # Frame-method form: shift by 2 and fill with the std of "b".
    via_frame = frame.lazy().shift_and_fill(2, col("b").std()).collect()
    assert via_frame["a"].null_count() == 0
def test_arange():
    """Filter by comparing a column against a lazy ``arange`` expression."""
    lazy_frame = pl.DataFrame({"a": [1, 1, 1]}).lazy()
    predicate = pl.lazy.col("a") >= pl.lazy.arange(0, 3)

    result = lazy_frame.filter(predicate).collect()

    # Two of the three rows are expected to pass the comparison.
    expected = pl.DataFrame({"a": [1, 1]})
    assert result.frame_equal(expected)