In [2]:
import pandas as pd
import numpy as np

In [3]:
def avg2(x, y):
    """Return the arithmetic mean of ``x`` and ``y``.

    Only ``+`` and ``/`` are used, so any operands supporting them
    (plain numbers, pandas Series, NumPy arrays) are averaged element-wise.
    """
    total = x + y
    return total / 2

In [4]:
avg2(5, 6.0)

5.5

In [5]:
assert avg2(5, 6.0) == 5.5

In [6]:
assert avg2(5, 6.0) == 5.8

AssertionError: 

In [7]:
def my_sq(x):
    """Return ``x`` raised to the second power."""
    return pow(x, 2)

In [8]:
my_sq(4)

16

## We can create a DataFrame

In [9]:
# Small two-column demo frame: three integer rows in columns 'a' and 'b'.
df = pd.DataFrame(
    {
        "a": [10, 20, 30],
        "b": [20, 30, 40],
    }
)

In [10]:
df

Unnamed: 0,a,b
0,10,20
1,20,30
2,30,40


## DataFrames support broadcasting

In [11]:
df['a'] ** 2

0    100
1    400
2    900
Name: a, dtype: int64

In [12]:
my_sq

<function __main__.my_sq(x)>

In [13]:
df['a'].apply(my_sq)

0    100
1    400
2    900
Name: a, dtype: int64

In [14]:
def my_exp(x, e):
    """Return ``x`` raised to the power ``e``."""
    return pow(x, e)

In [15]:
my_exp(2, 10)

1024

In [16]:
df['a'].apply(my_exp, e=4)  # extra keyword arguments given to apply() (its **kwargs) are forwarded to my_exp(), so each value is raised to e=4

0     10000
1    160000
2    810000
Name: a, dtype: int64

In [17]:
def print_me(x):
    """Print ``x`` to standard output and return ``None``.

    Handy for inspecting what ``apply`` hands to the callback.
    """
    print(x)
    return None

In [18]:
df.apply(print_me) # entire columns are passed in to print_me() in one shot

0    10
1    20
2    30
Name: a, dtype: int64
0    20
1    30
2    40
Name: b, dtype: int64


a    None
b    None
dtype: object

In [19]:
def avg3(x, y, z):
    """Return the arithmetic mean of the three arguments ``x``, ``y`` and ``z``."""
    total = x + y + z
    return total / 3

In [20]:
df.apply(avg3) # raises TypeError: apply() passes one column at a time to avg3(), but avg3() requires three arguments

TypeError: avg3() missing 2 required positional arguments: 'y' and 'z'

In [21]:
def avg_3_apply(col):
    """Return the mean of all values in ``col``, computed with ``np.mean``.

    Written to be handed to ``DataFrame.apply``, which calls it once per
    column with that column as ``col``.
    """
    column_mean = np.mean(col)
    return column_mean

In [22]:
df.apply(avg_3_apply)

a    20.0
b    30.0
dtype: float64

In [23]:
def avg_3_apply(col):
    """Return the mean of the first three values of ``col``, taken by position.

    Parameters
    ----------
    col : pandas.Series or any sequence indexable with 0, 1, 2
        The column handed over by ``DataFrame.apply`` (one call per column).

    Returns
    -------
    The sum of the first three values divided by 3.
    """
    # On a Series, col[0] is a *label* lookup and raises KeyError whenever the
    # index is not 0..n-1. Go through .iloc when it exists so access is
    # positional; plain lists/arrays have no .iloc and are indexed directly.
    positional = getattr(col, "iloc", col)
    x = positional[0]
    y = positional[1]
    z = positional[2]
    return (x + y + z) / 3

In [24]:
df.apply(avg_3_apply)

a    20.0
b    30.0
dtype: float64

In [25]:
df.mean()

a    20.0
b    30.0
dtype: float64

In [26]:
df.apply(np.mean)

a    20.0
b    30.0
dtype: float64

In [27]:
df['a'] + df['b']

0    30
1    50
2    70
dtype: int64

In [28]:
def avg_2_mod(x, y):
    """Return the mean of ``x`` and ``y``, or NaN when ``x`` equals 20.

    This is a scalar function: the ``if`` needs ``x == 20`` to be a single
    truth value, so passing a whole pandas Series raises ``ValueError``.
    """
    if x == 20:
        # np.nan is the canonical spelling; the np.NaN / np.NAN aliases
        # were removed in NumPy 2.0.
        return np.nan
    return (x + y) / 2

In [29]:
df

Unnamed: 0,a,b
0,10,20
1,20,30
2,30,40


In [30]:
# we want to supply two columns of the dataframe to avg_2_mod()
# this fails: `x == 20` on a Series gives a Series of booleans, which `if` cannot reduce to a single truth value
avg_2_mod(df['a'], df['b'])

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

#### But our function avg_2_mod() is not vectorised: it does not understand that its parameters are pandas Series (or lists) rather than single values
#### NumPy provides np.vectorize(), which takes a function and returns a vectorised version of it

In [31]:
avg_2_mod_vec = np.vectorize(avg_2_mod)

In [32]:
avg_2_mod_vec(df['a'], df['b'])

array([15., nan, 35.])

### Another way of vectorising a function is to use a decorator
### The advantage of using a decorator is that we don't have to use another variable name for the vectorised function

In [33]:
@np.vectorize
def avg_2_mod(x, y):
    """Return the mean of ``x`` and ``y``, or NaN when ``x`` equals 20.

    The body is written for scalars; the ``np.vectorize`` decorator applies
    it element by element, so array-like arguments (including pandas Series)
    are accepted and a NumPy array is returned.
    """
    if x == 20:
        # np.nan is the canonical spelling; the np.NaN / np.NAN aliases
        # were removed in NumPy 2.0.
        return np.nan
    return (x + y) / 2

In [34]:
avg_2_mod(df['a'], df['b'])

array([15., nan, 35.])

### Another way of doing it is to use numba's vectorize()

In [35]:
import numba

In [38]:
@numba.vectorize
def avg_2_mod_numba(x, y):
    """Return the mean of ``x`` and ``y``, or NaN when ``x`` equals 20.

    The scalar body is compiled by ``numba.vectorize`` so it can be applied
    element-wise; call it with NumPy arrays (e.g. ``series.values``), since
    numba cannot determine a type for a pandas Series.
    """
    if x == 20:
        # np.nan is the canonical spelling; the np.NaN / np.NAN aliases
        # were removed in NumPy 2.0.
        return np.nan
    return (x + y) / 2

In [40]:
avg_2_mod_numba(df['a'], df['b'])   # fails: numba cannot determine a type for a pandas Series; it only understands NumPy objects

ValueError: Cannot determine Numba type of <class 'pandas.core.series.Series'>

In [46]:
avg_2_mod_numba(df['a'].values, df['b'].values)

array([15., nan, 35.])

### We can time an operation in a Jupyter notebook with the %%timeit cell magic

In [43]:
%%timeit
avg2(df['a'], df['b'])

136 µs ± 505 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [44]:
%%timeit
avg_2_mod(df['a'], df['b'])

35.4 µs ± 1.04 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [45]:
%%timeit
avg_2_mod_vec(df['a'], df['b'])

35.7 µs ± 225 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [47]:
%%timeit
avg_2_mod_numba(df['a'].values, df['b'].values)

6.97 µs ± 208 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)
