# Mod12 Object Operations in Pandas

## Ufuncs: Operations Between DataFrame and Series

In [1]:
import pandas as pd
import numpy as np

### Pandas Operation

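In Pandas, arithmetic between a DataFrame and a Series likewise operates row-wise by default: the Series index is aligned with the DataFrame's columns, and the Series is then broadcast down the rows: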

In [2]:
# Fixed seed so the example matrix is identical on every run.
rng = np.random.RandomState(42)
# 3x4 matrix of integers drawn from [0, 10).
A = rng.randint(0, 10, size=(3, 4))
A

array([[6, 3, 7, 4],
       [6, 9, 2, 6],
       [7, 4, 3, 7]])

In [3]:
# Wrap the matrix in a DataFrame with one labelled column per matrix column.
df = pd.DataFrame(data=A, columns=['Q', 'R', 'S', 'T'])
df

Unnamed: 0,Q,R,S,T
0,6,3,7,4
1,6,9,2,6
2,7,4,3,7


In [14]:
# First row of df as a Series; its index is the DataFrame's column labels.
df.iloc[0]

Q    6
R    3
S    7
T    4
Name: 0, dtype: int32

In [6]:
# The Series index (Q, R, S, T) matches the DataFrame columns, so the row is
# broadcast down every row of df before the subtraction is applied.
# Where labels do not match, only the matching labels are computed and the
# unmatched positions come out as NaN.
df - df.iloc[0]

Unnamed: 0,Q,R,S,T
0,0,0,0,0
1,0,6,-5,2
2,1,1,-4,3


## Pandas Objects Operations

In [4]:
# Letter labels, reused below as a Series index and as DataFrame columns.
ind_char = pd.Index(list('abcde'))
ind_char

Index(['a', 'b', 'c', 'd', 'e'], dtype='object')

In [5]:
# Integer labels 0..4, reused below as the row index.
ind_num = pd.RangeIndex(start=0, stop=5, step=1)
ind_num

RangeIndex(start=0, stop=5, step=1)

In [6]:
# Values 10..14 labelled a..e.
s1 = pd.Series(data=range(10, 15), index=ind_char)
s1

a    10
b    11
c    12
d    13
e    14
dtype: int64

In [7]:
# Even values 30..38 labelled a..e.
s2 = pd.Series(data=range(30, 40, 2), index=ind_char)
s2

a    30
b    32
c    34
d    36
e    38
dtype: int64

In [8]:
# 5x5 frame filled with the constant 100: rows 0..4, columns a..e.
df1 = pd.DataFrame(
    data=100,
    index=ind_num,
    columns=ind_char,
)
df1

Unnamed: 0,a,b,c,d,e
0,100,100,100,100,100
1,100,100,100,100,100
2,100,100,100,100,100
3,100,100,100,100,100
4,100,100,100,100,100


In [9]:
# 5x5 frame holding 0..24 in row-major order: rows 0..4, columns a..e.
df2 = pd.DataFrame(
    data=np.arange(25).reshape((5, 5)),
    index=ind_num,
    columns=ind_char,
)
df2

Unnamed: 0,a,b,c,d,e
0,0,1,2,3,4
1,5,6,7,8,9
2,10,11,12,13,14
3,15,16,17,18,19
4,20,21,22,23,24


### Series and Series Operation

Operations between two Series align on index labels, not on position.

In [9]:
# Element-wise sum of two Series that share the same index labels (a..e).
s1 + s2

a    40
b    43
c    46
d    49
e    52
dtype: int64

In [14]:
# sample() draws rows at random; frac is the fraction to draw (0-1),
# so frac=1 returns every row in shuffled order.
# NOTE: no random_state is passed, so the order differs between runs.
s2.sample(frac=1)

c    34
a    30
d    36
e    38
b    32
dtype: int64

In [11]:
# Even with the order shuffled, addition still aligns automatically on the
# index labels, so the result equals s1 + s2.
s1 + s2.sample(frac=1)

a    40
b    43
c    46
d    49
e    52
dtype: int64

### DataFrame and DataFrame Operation

In [12]:
# Element-wise sum; both frames share the same row index and column labels.
df1 + df2

Unnamed: 0,a,b,c,d,e
0,100,101,102,103,104
1,105,106,107,108,109
2,110,111,112,113,114
3,115,116,117,118,119
4,120,121,122,123,124


In [13]:
# Shuffle both the rows (axis=0) and the columns (axis=1).
# NOTE: no random_state is passed, so the order differs between runs.
df2.sample(frac=1, axis=0).sample(frac=1, axis=1)

Unnamed: 0,a,c,d,b,e
2,10,12,13,11,14
4,20,22,23,21,24
1,5,7,8,6,9
3,15,17,18,16,19
0,0,2,3,1,4


In [14]:
# Even with rows and columns shuffled, addition still aligns automatically on
# the row and column labels, so the result equals df1 + df2.
df1 + df2.sample(frac=1, axis=0).sample(frac=1, axis=1)

Unnamed: 0,a,b,c,d,e
0,100,101,102,103,104
1,105,106,107,108,109
2,110,111,112,113,114
3,115,116,117,118,119
4,120,121,122,123,124


### DataFrame and Series Operation

In [15]:
# Compare the labels of s1 with those of df1: keys() gives the Series index
# and the DataFrame columns respectively.
s1.keys(), df1.keys()

(Index(['a', 'b', 'c', 'd', 'e'], dtype='object'),
 Index(['a', 'b', 'c', 'd', 'e'], dtype='object'))

In [27]:
# The Series index matches the DataFrame columns, so s1 is broadcast across
# every row and then added.
df1+s1

Unnamed: 0,a,b,c,d,e
0,110,111,112,113,114
1,110,111,112,113,114
2,110,111,112,113,114
3,110,111,112,113,114
4,110,111,112,113,114


In [16]:
# Values 50, 42, ..., 18 labelled with the integer row labels 0..4.
s3 = pd.Series(data=range(50, 10, -8), index=ind_num)
s3

0    50
1    42
2    34
3    26
4    18
dtype: int64

In [29]:
# The Series index (0..4) does not match the DataFrame columns (a..e), so the
# result is the union of both label sets and every value is NaN (Not a Number).
df1 + s3

Unnamed: 0,a,b,c,d,e,0,1,2,3,4
0,,,,,,,,,,
1,,,,,,,,,,
2,,,,,,,,,,
3,,,,,,,,,,
4,,,,,,,,,,


One workaround is to transpose `df1` so that its column labels match the index of `s3`:

In [61]:
# Transpose first so the column labels (0..4) line up with the index of s3,
# then add.
df1.T + s3

Unnamed: 0,0,1,2,3,4
a,150,142,134,126,118
b,150,142,134,126,118
c,150,142,134,126,118
d,150,142,134,126,118
e,150,142,134,126,118


In [62]:
# Transpose back after the addition to restore the original orientation.
(df1.T + s3).T

Unnamed: 0,a,b,c,d,e
0,150,150,150,150,150
1,142,142,142,142,142
2,134,134,134,134,134
3,126,126,126,126,126
4,118,118,118,118,118


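A better way to resolve the problem is to call the `add` method with `axis=0`, which aligns the Series with the row index instead of the columns: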
<details>
    <summary>Axis in DataFrame</summary>
    <img src='./img/2D_axis_1.jpg'>
</details>

In [31]:
# axis=0 aligns s3 with the row index of df1 (instead of its columns), so each
# row gets its own value from s3 added; no transposition is needed.
df1.add(s3, axis=0)

Unnamed: 0,a,b,c,d,e
0,150,150,150,150,150
1,142,142,142,142,142
2,134,134,134,134,134
3,126,126,126,126,126
4,118,118,118,118,118


## Lab

<b>有一個 DataFrame df 如下，試著正確的運算 df - df['Z'] 的結果</b>

Hint: to operate column-wise, use the method form of the operator (e.g. `sub`) and specify the `axis` keyword:

In [15]:
# Seed NumPy's global generator so the lab data is identical on every run.
np.random.seed(62)

# 3x4 matrix of integers drawn from [0, 10), wrapped in a labelled DataFrame.
A = np.random.randint(0, 10, size=(3, 4))
df = pd.DataFrame(data=A, columns=['W', 'X', 'Y', 'Z'])
df

Unnamed: 0,W,X,Y,Z
0,2,8,1,9
1,3,5,1,4
2,5,1,9,8


In [16]:
# Column Z as a Series; its index is the DataFrame's row labels (0..2).
df['Z']

0    9
1    4
2    8
Name: Z, dtype: int32

In [18]:
# axis=0 aligns df['Z'] with the row index, so each row has its own Z value
# subtracted from every column (the Z column itself becomes 0).
df.sub(df['Z'], axis=0)

Unnamed: 0,W,X,Y,Z
0,-7,-1,-8,0
1,-1,1,-3,0
2,-3,-7,1,0
