# High performance

```python
mask = (x > 0.5) & (y < 0.5)
```
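Example: `df = df[mask]` keeps only the rows where the mask is true.

Evaluated the normal way, every sub-expression first allocates a temporary array, so the mask expression above is roughly equivalent to:

```python
tmp1 = (x > 0.5)
tmp2 = (y < 0.5)
mask = tmp1 & tmp2
```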

Using `eval()` evaluates the whole expression element-wise in a single step with numexpr, without allocating these intermediate arrays.

`eval()` can be slower than normal pandas expressions on small data. Rule of thumb: use `eval()` when the DataFrame has more than about 10,000 rows; otherwise use normal pandas expressions.


In [8]:
import numpy as np
import pandas as pd
# Four large frames of standard-normal floats, used as operands in the timing cells below.
# A seeded Generator makes the data identical on every Restart & Run All.
# Each frame holds nrows * ncols = 10**8 float64 values (~0.8 GB), so this cell
# needs several GB of free memory.
nrows, ncols = 1_000_000, 100
rng = np.random.default_rng(42)
df1, df2, df3, df4 = (
    pd.DataFrame(rng.standard_normal((nrows, ncols))) for _ in range(4)
)
df1.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0.459446,1.014531,0.764515,0.242376,0.231385,1.103638,0.101644,-1.61222,-0.031451,-1.223215,...,0.40449,0.331771,1.617663,-0.161625,-1.087892,2.203068,-1.585936,0.471002,1.655557,0.398145
1,-0.035714,0.403343,-0.536187,-0.886616,0.467239,-2.261013,-0.217419,-0.866536,-2.728936,0.853474,...,0.00853,-0.589993,-0.406172,0.512117,1.247251,-0.974371,-1.5116,1.328161,-0.36201,0.405338
2,0.544321,0.691795,0.434663,-1.804151,0.326256,1.267419,1.305482,-0.221931,0.844772,-1.029458,...,-0.263139,1.107772,0.588988,1.095538,1.267949,-0.052844,-1.07392,1.921171,-1.533587,-1.589141
3,0.932197,0.660127,-1.461895,-1.226833,-1.388603,-0.804966,0.313495,1.108552,-0.416784,-0.488294,...,0.63823,0.879701,-0.326495,-0.15146,0.201802,-0.50194,-0.41795,0.393855,0.301893,0.261797
4,0.537444,0.88229,0.896521,0.414694,0.736074,-1.07162,0.022882,0.629204,0.117552,0.370913,...,0.017766,0.206424,1.816617,1.15507,-2.14774,-1.651092,1.225523,-0.117041,-1.30831,-0.902151


In [9]:
# Baseline: plain pandas arithmetic on the four frames.
%timeit df1 + df2 + df3 + df4 # evaluated directly as a pandas expression

1.19 s ± 29.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [10]:
# Same sum, but handed to pd.eval() as a string expression for comparison with the baseline.
%timeit pd.eval("df1 + df2 + df3 + df4")

502 ms ± 15.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [13]:
# Sanity check: the pd.eval() result must be identical to the plain pandas sum.
sum_eval = pd.eval("df1 + df2 + df3 + df4")
plain = df1 + df2 + df3 + df4
plain.equals(sum_eval)

True

In [14]:
# Simulate six throws of three six-sided dice and add the total per throw.
# np.random.randint excludes the upper bound, so high=7 is required to get faces 1-6
# (with high=6 a die could never show a 6).
# DataFrame.eval() with an assignment returns a new frame containing the extra column,
# so the result is chained instead of mutating with inplace=True.
rolls = pd.DataFrame(
    np.random.randint(1, 7, size=(6, 3)),
    columns=["Die1", "Die2", "Die3"],
).eval("Sum = Die1 + Die2 + Die3")
rolls

Unnamed: 0,Die1,Die2,Die3,Sum
0,1,5,1,7
1,5,4,3,12
2,3,5,1,9
3,5,3,3,11
4,5,3,4,12
5,3,3,3,9


In [17]:
# Python variables can be referenced inside an eval() expression with the `@` prefix.
high = 10  # a throw wins when its Sum is strictly greater than this threshold
# Rebind `rolls` to the frame returned by eval() rather than mutating it with
# inplace=True, so the frame displayed by the earlier cell is not changed behind its back.
# NOTE(review): the saved output of this cell shows a `High` column that no visible
# cell creates -- presumably left over from a deleted cell; re-run from a fresh kernel.
rolls = rolls.eval("Winner = Sum > @high")
rolls

Unnamed: 0,Die1,Die2,Die3,Sum,High,Winner
0,1,5,1,7,False,False
1,5,4,3,12,True,True
2,3,5,1,9,False,False
3,5,3,3,11,True,True
4,5,3,4,12,True,True
5,3,3,3,9,False,False


In [18]:
# Boolean-mask filtering: keep the throws whose Sum does not exceed the threshold.
rolls.loc[rolls["Sum"] <= high]

Unnamed: 0,Die1,Die2,Die3,Sum,High,Winner
0,1,5,1,7,False,False
2,3,5,1,9,False,False
5,3,3,3,9,False,False


# Query

In [19]:
# Filter rows with a query() string; `@high` refers to the Python variable `high`.
rolls.query("Sum <= @high")

Unnamed: 0,Die1,Die2,Die3,Sum,High,Winner
0,1,5,1,7,False,False
2,3,5,1,9,False,False
5,3,3,3,9,False,False


In [32]:
# Load the athlete events CSV; the path is relative to the notebook's working directory.
# NOTE(review): the name `os` would shadow the standard-library `os` module if that is
# ever imported in this notebook -- consider a descriptive name; the cells below
# reference `os`, so they would have to be renamed together.
os = pd.read_csv("../Data/athlete_events.csv")
os.head()

Unnamed: 0,ID,Name,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal
0,1,A Dijiang,M,24.0,180.0,80.0,China,CHN,1992 Summer,1992,Summer,Barcelona,Basketball,Basketball Men's Basketball,
1,2,A Lamusi,M,23.0,170.0,60.0,China,CHN,2012 Summer,2012,Summer,London,Judo,Judo Men's Extra-Lightweight,
2,3,Gunnar Nielsen Aaby,M,24.0,,,Denmark,DEN,1920 Summer,1920,Summer,Antwerpen,Football,Football Men's Football,
3,4,Edgar Lindenau Aabye,M,34.0,,,Denmark/Sweden,DEN,1900 Summer,1900,Summer,Paris,Tug-Of-War,Tug-Of-War Men's Tug-Of-War,Gold
4,5,Christine Jacoba Aaftink,F,21.0,185.0,82.0,Netherlands,NED,1988 Winter,1988,Winter,Calgary,Speed Skating,Speed Skating Women's 500 metres,


In [34]:
# Single string-equality filter: boolean mask (first line) vs. query() (second line).
%timeit os[os["NOC"] == "SWE"]
%timeit os.query("NOC == 'SWE'")

23.2 ms ± 2.74 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
16.3 ms ± 2.51 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [37]:
%timeit os[os["Height"] > 180]
%timeit os.query ("Height > 180")

14.6 ms ± 505 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
18.4 ms ± 571 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [39]:
%timeit os[(os["Sex"] == "F") & (os ["Height"] > 180) & (os["NOC"] == "SWE")]
%timeit os.query ("Sex == 'F' & Height > 180 & NOC == 'SWE'")

40.9 ms ± 6.56 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
17.3 ms ± 750 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
