In [2]:
import numexpr
import numpy as np
import pandas as pd

In [5]:
# the power of the pydata stack, is built on the ability of numpy and pandas to push basic operations into lower level compiled code via an intuitive higher level syntax
# examples are vectorized broadcasted operation in numpy and grouping type operations in pandas. while these abstractions are efficient, and effective for many common use cases, they often rely on the creation of temporary intermediate objects which can use time and memory resources
# pandas incluedes some methods that allow you to directly access C-speed oeprations without costly allocations
# these methods are called eval() and query() and rely on the numexpr package
# we have seen previously that numpy and pandas support fast vectorized operations for example when adding the elements of two arrays
rng = np.random.default_rng(42)
x = rng.random(1_000_000)
y = rng.random(1_000_000)
%timeit x + y

1.84 ms ± 81.4 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [6]:
# The vectorized addition above is much faster than doing the same addition via a Python loop or comprehension:
# here every element-wise `xi + yi` is evaluated by the generator expression before np.fromiter collects it.
%timeit np.fromiter((xi + yi for xi, yi in zip(x, y)), dtype=x.dtype, count=len(x))

191 ms ± 8.12 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [10]:
# The vectorized abstraction can become less efficient when computing compound expressions.
# Consider the following expression:
mask = (x > 0.5) & (y < 0.5)
# Because NumPy evaluates each subexpression separately, this is roughly equivalent to:
tmp1, tmp2 = x > 0.5, y < 0.5
mask = tmp1 & tmp2
# Every intermediate step is explicitly allocated in memory. If the x and y arrays are very large, this can
# lead to significant memory and computational overhead. The numexpr library gives you the ability to compute
# this type of compound expression element by element, without the need to allocate full intermediate arrays.
# It accepts a string giving the NumPy-style expression you'd like to compute.
mask_numexpr = numexpr.evaluate("(x > 0.5) & (y < 0.5)")
# The benefit is that numexpr evaluates the expression in a way that avoids temporary arrays where possible,
# which can be more efficient than NumPy, especially for long sequences of computations on large arrays.
# The pandas eval() and query() tools are conceptually similar and are essentially pandas-specific wrappers
# of numexpr functionality.
# Check that both approaches agree on every element.
(mask == mask_numexpr).all()

True

In [11]:
# pandas.eval() for efficient operations
# The eval function in pandas uses string expressions to efficiently compute operations on DataFrames.
# Build four independent DataFrames of uniform random floats, each of shape (nrows, ncols).
nrows = 100_000
ncols = 100
df1, df2, df3, df4 = [
    pd.DataFrame(rng.random(size=(nrows, ncols))) for _ in range(4)
]

In [12]:
# To compute the sum of all four DataFrames using the typical pandas approach, we can just write the sum.
# This is the baseline timing that the pd.eval version is compared against.
%timeit df1 + df2 + df3 + df4

66.7 ms ± 576 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [13]:
# The same result can be computed via pd.eval by passing the expression as a string.
%timeit pd.eval('df1 + df2 + df3 + df4')

31.9 ms ± 1.09 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [14]:
# In the timings recorded above, the pd.eval version of this expression runs in roughly half the time
# of the plain pandas sum (about 32 ms versus about 67 ms) and uses much less memory in the process.
# Confirm that both approaches produce the same values.
np.allclose(
    df1 + df2 + df3 + df4,
    pd.eval("df1 + df2 + df3 + df4"),
)

True

In [15]:
# pd.eval supports a wide range of operations.
# To demonstrate them, rebind df1-df4 (and add df5) to small integer DataFrames:
# 100 rows x 3 columns of integers drawn from [0, 1000).
df1, df2, df3, df4, df5 = [
    pd.DataFrame(rng.integers(low=0, high=1000, size=(100, 3))) for _ in range(5)
]

In [16]:
# Arithmetic operators
# pd.eval() supports all arithmetic operators.
# The explicit parentheses below spell out Python's normal operator precedence for the same expression.
result1 = ((-df1 * df2) / (df3 + df4)) - df5
result2 = pd.eval("-df1 * df2 / (df3 + df4) - df5")
np.allclose(result1, result2)

True

In [18]:
# Comparison operators
# pd.eval() supports all comparison operators, including chained expressions.
result1 = (df1 < df2) & (df2 <= df3) & (df3 != df4)
result2 = pd.eval("df1 < df2 <= df3 != df4")
# Both results are boolean DataFrames, so check exact element-wise equality
# instead of a floating-point tolerance comparison.
np.array_equal(result1, result2)

True

In [19]:
# Bitwise operators
# pd.eval() supports the & and | bitwise operators.
result1 = (df1 < 0.5) & (df2 < 0.5) | (df3 < df4)
result2 = pd.eval("(df1 < 0.5) & (df2 < 0.5) | (df3 < df4)")
# Both results are boolean DataFrames, so check exact element-wise equality
# instead of a floating-point tolerance comparison.
np.array_equal(result1, result2)

True

In [20]:
# In addition, pd.eval supports the literal `and` and `or` keywords in Boolean expressions.
result3 = pd.eval("(df1 < 0.5) and (df2 < 0.5) or (df3 < df4)")
# result1 is the boolean DataFrame computed with & and | in the previous cell; booleans are
# compared exactly rather than with a floating-point tolerance.
np.array_equal(result1, result3)

True

In [21]:
# Object attributes and indices
# pd.eval supports access to object attributes via the obj.attr syntax, and indexes via the obj[index] syntax.
result1 = df2.T[0] + df3.iloc[1]
result2 = pd.eval("df2.T[0] + df3.iloc[1]")
np.allclose(result1, result2)
# Other operations, such as function calls, conditional statements, loops, and other more involved constructs,
# are currently not implemented in pd.eval. If you need to evaluate these more complicated expressions,
# you can use the Numexpr library itself.

True

In [22]:
# DataFrame.eval() for column-wise operations
# Just as pandas has a top-level pd.eval function, DataFrame objects have an eval method that works in
# similar ways. The benefit of the eval method is that columns can be referred to by name.
# Build a 1,000-row frame of random floats with columns A, B, and C to demonstrate this.
df = pd.DataFrame(data=rng.random(size=(1_000, 3)), columns=list("ABC"))
df.head()

Unnamed: 0,A,B,C
0,0.850888,0.966709,0.95869
1,0.820126,0.385686,0.061402
2,0.059729,0.831768,0.652259
3,0.244774,0.140322,0.041711
4,0.818205,0.753384,0.578851


In [23]:
# Using pd.eval, we can compute expressions with the three columns like this.
# Note that the string has to spell out the DataFrame name (df.A, df.B, df.C) for every column.
result1 = (df["A"] + df["B"]) / (df["C"] - 1)
result2 = pd.eval("(df.A + df.B) / (df.C - 1)")
np.allclose(result1, result2)

True

In [24]:
# The DataFrame.eval method allows much more succinct evaluation of expressions with the columns.
result3 = df.eval("(A + B) / (C - 1)")
np.allclose(result1, result3)
# Here we treat column names as variables within the evaluated expression, and the result matches
# the one computed with explicit column indexing (result1 from the previous cell).

True

In [25]:
# Assignment in DataFrame.eval()
# In addition to the options just discussed, DataFrame.eval also allows assignment to any column.
# Let's use the DataFrame from before, which has columns A, B, and C.
df.head()

Unnamed: 0,A,B,C
0,0.850888,0.966709,0.95869
1,0.820126,0.385686,0.061402
2,0.059729,0.831768,0.652259
3,0.244774,0.140322,0.041711
4,0.818205,0.753384,0.578851


In [27]:
# We can use df.eval to create a new column D and assign to it a value computed from the other columns.
# inplace=True makes eval modify df itself, so df has a D column from this cell onward.
df.eval("D = (A + B) / C", inplace=True)
df.head()

Unnamed: 0,A,B,C,D
0,0.850888,0.966709,0.95869,1.895916
1,0.820126,0.385686,0.061402,19.638139
2,0.059729,0.831768,0.652259,1.366782
3,0.244774,0.140322,0.041711,9.23237
4,0.818205,0.753384,0.578851,2.715013


In [28]:
# In the same way, any existing column can be modified.
# This overwrites the D column created in the previous cell with a new expression.
df.eval("D = (A - B) / C", inplace=True)
df.head()

Unnamed: 0,A,B,C,D
0,0.850888,0.966709,0.95869,-0.120812
1,0.820126,0.385686,0.061402,7.075399
2,0.059729,0.831768,0.652259,-1.183638
3,0.244774,0.140322,0.041711,2.504142
4,0.818205,0.753384,0.578851,0.111982


In [29]:
# Local variables in DataFrame.eval()
# The DataFrame.eval method supports an additional syntax that lets it work with local Python variables.
# Note: despite its name, column_mean holds one value per row (the mean taken across that row's columns).
column_mean = df.mean(axis=1)
result1 = df.A + column_mean
result2 = df.eval("A + @column_mean")
np.allclose(result1, result2)
# The @ character here marks a variable name rather than a column name, and lets you efficiently evaluate
# expressions involving the two "namespaces": the namespace of columns and the namespace of Python objects.
# The @ character is only supported by the DataFrame.eval method, not by the pandas.eval function,
# because the pandas.eval function only has access to the one (Python) namespace.

True

In [30]:
# The DataFrame.query() method
# The DataFrame has another method based on evaluated strings, called query. Consider the following
# filtering operation, written first with ordinary boolean masking and then with pd.eval.
result1 = df[(df.A < 0.5) & (df.B < 0.5)]
result2 = pd.eval("df[(df.A < 0.5) & (df.B < 0.5)]")
np.allclose(result1, result2)

True

In [31]:
# As with the example used in our discussion of DataFrame.eval, the filter above is an expression involving
# columns of the DataFrame. However, it cannot be expressed using the DataFrame.eval syntax!
# Instead, for this type of filtering operation, you can use the query method.
result2 = df.query("A < 0.5 and B < 0.5")
np.allclose(result1, result2)

True

In [32]:
# In addition to being a more efficient computation, compared to the masking expression the query form
# is also much easier to read and understand at a glance.
# The query method also accepts the @ prefix to refer to local Python variables, as shown here.
Cmean = df.C.mean()
result1 = df.loc[(df["A"] < Cmean) & (df["B"] < Cmean)]
result2 = df.query("A < @Cmean and B < @Cmean")
np.allclose(result1, result2)

True

In [34]:
# When to use these functions
# There are two considerations: computation time and memory use. Memory use is the most predictable aspect.
# As already mentioned, every compound expression involving NumPy arrays or pandas DataFrames will result
# in the implicit creation of temporary arrays. For example, this:
filtered = df[(df.A < 0.5) & (df.B < 0.5)]
# is roughly equivalent to this:
# (distinct names are used so that the NumPy arrays `x`, `tmp1`, and `tmp2` defined in earlier cells
# are not overwritten, which would break those cells if they were re-run afterwards)
mask_a = df.A < 0.5
mask_b = df.B < 0.5
mask_ab = mask_a & mask_b
filtered = df[mask_ab]
# If the size of the temporary objects is significant compared to your available system memory
# (typically several gigabytes), then it's a good idea to use an eval or query expression.
# On the performance side, eval can be faster even when you are not maxing out your system memory.
# The issue is how your temporary objects compare to the size of the L1 or L2 CPU cache on your system
# (on the order of tens of kilobytes up to a few megabytes, depending on the CPU); if they are much bigger,
# eval can avoid some potentially slow movement of values between the different memory caches.
# The benefit of eval/query is mainly in the saved memory, and the cleaner syntax that they sometimes offer.
# You can check the approximate size of your array in bytes using this
# (to_numpy() is the accessor the pandas documentation recommends over .values):
df.to_numpy().nbytes

32000