# Mod11 Ufuncs in Pandas

## Ufuncs: Index Preservation

In [1]:
import pandas as pd
import numpy as np

Review NumPy ufunc

In [3]:
# Small integer array used to review how NumPy ufuncs behave.
arr = np.arange(5)
arr

array([0, 1, 2, 3, 4])

In [4]:
# np.exp() is the exponential function: here it computes e**0, e**1, e**2, e**3, e**4
# element-wise (e is the constant 2.71828...).
np.exp(arr)

array([ 1.        ,  2.71828183,  7.3890561 , 20.08553692, 54.59815003])

In [5]:
# Seeded generator so the random draws below are repeatable.
rng = np.random.RandomState(42)

# Four random integers in [0, 10), wrapped in a Series with the default integer index.
ser = pd.Series(rng.randint(0, 10, 4))
ser

0    6
1    3
2    7
3    4
dtype: int32

In [6]:
# Applying a NumPy ufunc to a Series returns a Series that keeps the same index.
np.exp(ser)

0     403.428793
1      20.085537
2    1096.633158
3      54.598150
dtype: float64

In [22]:
# 3x4 frame of random integers in [0, 10) with columns A-D.
df = pd.DataFrame(rng.randint(0, 10, (3, 4)), columns=list('ABCD'))
df

Unnamed: 0,A,B,C,D
0,7,7,2,5
1,4,1,7,5
2,1,4,0,9


In [23]:
# The ufunc is applied element-wise; row and column labels are preserved.
np.exp(df)

Unnamed: 0,A,B,C,D
0,1096.633158,1096.633158,7.389056,148.413159
1,54.59815,2.718282,1096.633158,148.413159
2,2.718282,54.59815,1.0,8103.083928


In [24]:
# Works on a single column as well; the result keeps the column's name and index.
np.exp(df['A'])

0    1096.633158
1      54.598150
2       2.718282
Name: A, dtype: float64

In [26]:
# Arithmetic and ufuncs can be combined: scale every value by pi/4, then take the sine.
np.sin(df * np.pi / 4)

Unnamed: 0,A,B,C,D
0,-0.7071068,-0.7071068,1.0,-0.707107
1,1.224647e-16,0.7071068,-0.707107,-0.707107
2,0.7071068,1.224647e-16,0.0,0.707107


## UFuncs: Index Alignment

### Index alignment in Series

In [27]:
# Two Series keyed by state name; their indexes overlap only partially.
area = pd.Series(
    [1723337, 695662, 423967],
    index=['Alaska', 'Texas', 'California'],
    name='area',
)
population = pd.Series(
    [38332521, 26448193, 19651127],
    index=['California', 'Texas', 'New York'],
    name='population',
)

In [28]:
# Show both Series in one text output, separated by a dashed rule.
print(area, "-"*50, population, sep="\n")

Alaska        1723337
Texas          695662
California     423967
Name: area, dtype: int64
--------------------------------------------------
California    38332521
Texas         26448193
New York      19651127
Name: population, dtype: int64


In [29]:
population / area                  # labels present in only one Series produce NaN (Not a Number)

Alaska              NaN
California    90.413926
New York            NaN
Texas         38.018740
dtype: float64

In [15]:
# Two short Series whose integer indexes overlap only on labels 1 and 2.
A = pd.Series({0: 2, 1: 4, 2: 6})
B = pd.Series({1: 1, 2: 3, 3: 5})

In [16]:
# Show both Series in one text output, separated by a dashed rule.
print(A, "-"*50, B, sep="\n")

0    2
1    4
2    6
dtype: int64
--------------------------------------------------
1    1
2    3
3    5
dtype: int64


In [17]:
A + B                              # labels present in only one Series produce NaN (Not a Number)

0    NaN
1    5.0
2    9.0
3    NaN
dtype: float64

The `add()` method allows explicit specification of the fill value for any elements in ``A`` or ``B`` that might be missing:

In [18]:
A.add(B, fill_value=0)             # fill_value stands in for the missing entries, so no NaN is produced here

0    2.0
1    5.0
2    9.0
3    5.0
dtype: float64

### Index alignment in DataFrame

In [2]:
# Re-seed so both frames are repeatable.
rng = np.random.RandomState(42)

# A is 2x2 with columns A, B; B is 3x3 with columns B, A, C (different shape and column order).
A = pd.DataFrame(rng.randint(0, 20, (2, 2)), columns=['A', 'B'])
B = pd.DataFrame(rng.randint(0, 10, (3, 3)), columns=['B', 'A', 'C'])

display(A)
display(B)

Unnamed: 0,A,B
0,6,19
1,14,10


Unnamed: 0,B,A,C
0,7,4,6
1,9,2,6
2,7,4,3


In [3]:
# Row and column labels are both aligned; positions missing from either frame become NaN.
A + B

Unnamed: 0,A,B,C
0,10.0,26.0,
1,16.0,19.0,
2,,,


In [4]:
# Ways to handle NaN: 1. pass fill_value when operating on two Series/DataFrames
#                     2. call dropna() on the result  3. call fillna() on the result
A.add(B,fill_value=0)

Unnamed: 0,A,B,C
0,10.0,26.0,6.0
1,16.0,19.0,6.0
2,4.0,7.0,3.0


Alternatively, we can fill with the mean of all values in ``A`` (computed by first stacking the rows of ``A``):

In [5]:
A.mean()            # mean of each column

A    10.0
B    14.5
dtype: float64

<details>
    <summary><b>DataFrame.stack() 說明圖</b></summary>
    <img src='./img/df_stack.png'>
</details>

In [40]:
# stack() moves the column labels into an inner index level, giving one Series of all values.
A.stack()

0  A     6
   B    19
1  A    14
   B    10
dtype: int32

In [41]:
# Mean over every value in A (a single scalar, not one mean per column).
fill = A.stack().mean()
fill

12.25

In [42]:
fill = A.stack().mean()            # A.mean() alone gives per-column means, so stack() first to average over all values
A.add(B, fill_value=fill)

Unnamed: 0,A,B,C
0,10.0,26.0,18.25
1,16.0,19.0,18.25
2,16.25,19.25,15.25


## Lab

<b>有兩個 Series 如下，求兩個 Series 的和，遇到 NaN 以零代替</b>

In [6]:
# Lab inputs: two float Series whose indexes overlap only on 'a', 'c' and 'e'.
s1 = pd.Series(data=[8.6, -3.2, 6.1, 2.4], index=list('acde'))
s2 = pd.Series(data=[-4.2, 7.3, -2.7, 5, 1.8], index=list('acefg'))

In [44]:
# Inspect the first lab Series.
s1

a    8.6
c   -3.2
d    6.1
e    2.4
dtype: float64

In [45]:
# Inspect the second lab Series.
s2

a   -4.2
c    7.3
e   -2.7
f    5.0
g    1.8
dtype: float64

In [8]:
s1.add(s2, fill_value=0)              # fill missing entries with 0 first, then add

a    4.4
c    4.1
d    6.1
e   -0.3
f    5.0
g    1.8
dtype: float64

In [7]:
s1.add(s2).fillna(0)                  # add first, then replace the resulting NaN with 0

a    4.4
c    4.1
d    0.0
e   -0.3
f    0.0
g    0.0
dtype: float64

<b>有兩個 DataFrame 如下，求兩個 DataFrame 的乘積，遇到 NaN 以 1 代替</b>

In [9]:
# Fix the global NumPy seed so the lab frames are repeatable.
np.random.seed(61)

# df1: 3x4 random integers in [0, 10), columns a-d.
df1 = pd.DataFrame(
    np.random.randint(10, size=12).reshape((3, 4)),
    columns=['a', 'b', 'c', 'd'],
)

# df2: 4x5 random integers in [0, 20), columns a-e.
df2 = pd.DataFrame(
    np.random.randint(20, size=20).reshape((4, 5)),
    columns=['a', 'b', 'c', 'd', 'e'],
)

In [5]:
# Inspect the first lab DataFrame.
df1

Unnamed: 0,a,b,c,d
0,3,2,2,0
1,2,1,7,5
2,3,7,0,9


In [6]:
# Inspect the second lab DataFrame.
df2

Unnamed: 0,a,b,c,d,e
0,10,19,15,4,2
1,19,10,5,14,8
2,2,13,6,3,2
3,15,13,14,15,9


In [7]:
# Element-wise product after label alignment; row 3 and column e exist only in df2, so they come out NaN.
df1*df2

Unnamed: 0,a,b,c,d,e
0,30.0,38.0,30.0,0.0,
1,38.0,10.0,35.0,70.0,
2,6.0,91.0,0.0,27.0,
3,,,,,


In [10]:
df1.mul(df2, fill_value = 1)            # fill missing entries with 1 first, then multiply

Unnamed: 0,a,b,c,d,e
0,30.0,38.0,30.0,0.0,2.0
1,38.0,10.0,35.0,70.0,8.0
2,6.0,91.0,0.0,27.0,2.0
3,15.0,13.0,14.0,15.0,9.0


In [11]:
df1.mul(df2).fillna(1)                  # multiply first, then replace the resulting NaN with 1

Unnamed: 0,a,b,c,d,e
0,30.0,38.0,30.0,0.0,1.0
1,38.0,10.0,35.0,70.0,1.0
2,6.0,91.0,0.0,27.0,1.0
3,1.0,1.0,1.0,1.0,1.0
