In [3]:
import numpy as np
# Seeded generator so both input arrays are reproducible from run to run.
rng = np.random.RandomState(seed=42)
# Two arrays of one million random samples each; x is drawn first, then y.
x, y = rng.rand(10**6), rng.rand(10**6)
# Baseline: time the plain vectorized element-wise addition of the two arrays.
%timeit x+y

100 loops, best of 3: 4.19 ms per loop


In [6]:
# Compound boolean mask: true where x > 0.5 and y < 0.5 (element-wise).
# Same result as (x > 0.5) & (y < 0.5) on these comparison outputs.
mask = np.logical_and(x > 0.5, y < 0.5)

In [4]:
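# However, plain vectorized operations become less efficient on compound expressions,
# because every intermediate step has to explicitly allocate memory for its result.
# The Numexpr library can evaluate such compound expressions without allocating
# full-sized arrays for the intermediate results.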

In [7]:
import numexpr
# Evaluate the same compound mask from its string form with Numexpr.
mask_numexpr = numexpr.evaluate('(x>0.5)&(y<0.5)')
# Both masks are boolean, so compare them for exact element-wise equality
# instead of using np.allclose, which is a floating-point tolerance test.
np.array_equal(mask, mask_numexpr)

True

In [8]:
# High-performance evaluation with pandas.eval()
import pandas as pd

nrows, ncols = 100000, 100
# Re-seed so the four frames below are reproducible.
rng = np.random.RandomState(42)
# Four random (nrows x ncols) DataFrames, drawn from rng in order df1..df4.
df1, df2, df3, df4 = [pd.DataFrame(rng.rand(nrows, ncols)) for _ in range(4)]

In [9]:
# Plain pandas approach: time the sum of the four DataFrames using ordinary operators.
%timeit df1+df2+df3+df4

10 loops, best of 3: 115 ms per loop


In [10]:
# pd.eval() with a string expression (implemented on top of Numexpr): time the same sum.
%timeit pd.eval('df1+df2+df3+df4')

10 loops, best of 3: 47.8 ms per loop


In [11]:
# Confirm that the operator-based sum and the pd.eval() sum give the same result.
np.allclose(
    df1 + df2 + df3 + df4,
    pd.eval('df1+df2+df3+df4'),
)

True

In [12]:
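# Operations supported by pd.eval():
# 1. Arithmetic operators
# 2. Comparison operators
# 3. Bitwise operators
# 4. Object attributes and indexing
# Not supported: function calls, conditional statements, loops, and other more complex constructs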

In [13]:
# Example: a chained comparison expression.
# Four small integer frames (100 x 3, values drawn from [0, 1000)); these rebind df1..df4.
df1, df2, df3, df4 = (pd.DataFrame(rng.randint(0, 1000, (100, 3))) for _ in range(4))
# Spelled out with explicit pairwise comparisons combined by &.
result1 = (df1 < df2) & (df2 <= df3) & (df3 != df4)
# The same condition written as a single chained comparison inside pd.eval().
result2 = pd.eval('df1<df2<=df3!=df4')
# Both results are boolean, so check exact equality rather than using the
# floating-point tolerance test np.allclose.
np.array_equal(result1, result2)

True

In [14]:
# Column-wise operations with DataFrame.eval(): start from a random
# 1000-row frame with columns A, B and C.
df = pd.DataFrame(rng.rand(1000, 3), columns=['A', 'B', 'C'])
df.head()

Unnamed: 0,A,B,C
0,0.067497,0.023679,0.800495
1,0.247672,0.930917,0.107492
2,0.892389,0.401882,0.012611
3,0.819101,0.246383,0.875091
4,0.620237,0.204526,0.14966


In [15]:
# Reference result computed with ordinary column arithmetic.
result1 = (df['A'] + df['B']) / (df['C'] - 1)
# Same expression through pd.eval(), addressing the columns as attributes of df.
result2 = pd.eval("(df.A+df.B)/(df.C-1)")
np.allclose(result1, result2)

True

In [16]:
# DataFrame.eval() gives a more concise expression: column names can be
# used directly as variables.
result3 = df.eval('(A+B)/(C-1)')
np.allclose(result1, result3)

True

In [17]:
# DataFrame.eval() can also create a new column; inplace=True adds D to df itself.
df.eval('D=(A+B)/C', inplace=True)
df.head()

Unnamed: 0,A,B,C,D
0,0.067497,0.023679,0.800495,0.113899
1,0.247672,0.930917,0.107492,10.964414
2,0.892389,0.401882,0.012611,102.630345
3,0.819101,0.246383,0.875091,1.21757
4,0.620237,0.204526,0.14966,5.510902


In [18]:
# Assigning to an existing column name modifies that column (here D is overwritten).
df.eval('D=(A-B)/C', inplace=True)
df.head()

Unnamed: 0,A,B,C,D
0,0.067497,0.023679,0.800495,0.054739
1,0.247672,0.930917,0.107492,-6.356229
2,0.892389,0.401882,0.012611,38.895206
3,0.819101,0.246383,0.875091,0.654467
4,0.620237,0.204526,0.14966,2.777701


In [20]:
# DataFrame.eval() can also reference local Python variables via the @ prefix.
column_mean = df.mean(axis=1)  # mean along axis 1, i.e. one value per row
result1 = df['A'] + column_mean
result2 = df.eval('A+@column_mean')
np.allclose(result1, result2)

True

In [23]:
# For filtering operations, the query() method can be used.
# Boolean-mask version of the filter:
result1 = df[(df['A'] < 0.5) & (df['B'] < 0.5)]
# Equivalent filter written as a query string:
result2 = df.query('A<0.5 and B<0.5')
np.allclose(result1, result2)

True

In [24]:
# query() likewise supports local variables through the @ prefix.
Cmean = df['C'].mean()
# Boolean-mask version using the local threshold:
result1 = df[(df['A'] < Cmean) & (df['B'] < Cmean)]
# Equivalent query string referencing the local variable:
result2 = df.query('A<@Cmean and B<@Cmean')
np.allclose(result1, result2)

True