In [1]:
import pandas as pd
import numpy as np

### 5.2.5 객체간 산술 연산

#### Series

In [2]:
s1 = pd.Series([7.3, -2.5, 3.4, 1.5], index = ['a', 'c', 'd', 'e'])

In [3]:
s2 = pd.Series([-2.1, 3.6, -1.5, 4, 3.1], index = ['a', 'c', 'e', 'f', 'g'])

In [4]:
s1

a    7.3
c   -2.5
d    3.4
e    1.5
dtype: float64

In [5]:
s2

a   -2.1
c    3.6
e   -1.5
f    4.0
g    3.1
dtype: float64

In [6]:
s1 + s2

a    5.2
c    1.1
d    NaN
e    0.0
f    NaN
g    NaN
dtype: float64

- s1, s2에 있는 모든 index를 포함
- s1과 s2의 공통 index에 대해서는 계산됨
- 공통 index가 아닌 경우 NaN으로 표시

=> **DataFrame에도 똑같이적용!**

#### DataFrame

In [7]:
df1 = pd.DataFrame(np.arange(9.).reshape((3, 3)), columns = list('bcd'), index = ['Ohio', 'Texas', 'Colorado'])

In [8]:
df2 = pd.DataFrame(np.arange(12.).reshape((4, 3)), columns = list('bde'), index = ['Utah', 'Ohio', 'Texas', 'Oregon'])

In [9]:
df3 = pd.DataFrame(np.arange(12.).reshape((4, 3)), columns = list('손지우'), index = ['Utah', 'Ohio', 'Texas', 'Oregon'])
df3

Unnamed: 0,손,지,우
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [10]:
df1

Unnamed: 0,b,c,d
Ohio,0.0,1.0,2.0
Texas,3.0,4.0,5.0
Colorado,6.0,7.0,8.0


In [11]:
df2

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [12]:
df1 + df2

Unnamed: 0,b,c,d,e
Colorado,,,,
Ohio,3.0,,6.0,
Oregon,,,,
Texas,9.0,,12.0,
Utah,,,,


Series와 마찬가지로 index와 column에 적용

In [13]:
df1 = pd.DataFrame({'A' : [1, 2]})
df2 = pd.DataFrame({'B' : [3, 4]})

In [14]:
df1

Unnamed: 0,A
0,1
1,2


In [15]:
df2

Unnamed: 0,B
0,3
1,4


In [16]:
df1 - df2

Unnamed: 0,A,B
0,,
1,,


In [17]:
df1 = pd.DataFrame(np.arange(12.).reshape((3, 4)), columns = list('abcd'))

In [18]:
df1

Unnamed: 0,a,b,c,d
0,0.0,1.0,2.0,3.0
1,4.0,5.0,6.0,7.0
2,8.0,9.0,10.0,11.0


In [19]:
df2 = pd.DataFrame(np.arange(20.).reshape((5, 4)), columns = list('bcde'))

In [20]:
df2

Unnamed: 0,b,c,d,e
0,0.0,1.0,2.0,3.0
1,4.0,5.0,6.0,7.0
2,8.0,9.0,10.0,11.0
3,12.0,13.0,14.0,15.0
4,16.0,17.0,18.0,19.0


In [21]:
df2.loc[1, 'b'] = np.nan

In [22]:
df2

Unnamed: 0,b,c,d,e
0,0.0,1.0,2.0,3.0
1,,5.0,6.0,7.0
2,8.0,9.0,10.0,11.0
3,12.0,13.0,14.0,15.0
4,16.0,17.0,18.0,19.0


In [23]:
df1 + df2

Unnamed: 0,a,b,c,d,e
0,,1.0,3.0,5.0,
1,,,11.0,13.0,
2,,17.0,19.0,21.0,
3,,,,,
4,,,,,


겹치지 않는 부분 NA

In [24]:
#fill_value를 통해서 df1 or df2중 하나만 없으면 0으로 채워줌 if 둘다 없으면 Nan으로 채워짐
df1 + df2
df1.add(df2, fill_value = 0) 

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,3.0,5.0,3.0
1,4.0,5.0,11.0,13.0,7.0
2,8.0,17.0,19.0,21.0,11.0
3,,12.0,13.0,14.0,15.0
4,,16.0,17.0,18.0,19.0


**r 메서드 역수 취해 더하기**

In [25]:
df1

Unnamed: 0,a,b,c,d
0,0.0,1.0,2.0,3.0
1,4.0,5.0,6.0,7.0
2,8.0,9.0,10.0,11.0


In [26]:
df1.div(2) df1/2

Unnamed: 0,a,b,c,d
0,0.0,0.5,1.0,1.5
1,2.0,2.5,3.0,3.5
2,4.0,4.5,5.0,5.5


In [27]:
df1.rdiv(2) #2/df1

Unnamed: 0,a,b,c,d
0,inf,2.0,1.0,0.666667
1,0.5,0.4,0.333333,0.285714
2,0.25,0.222222,0.2,0.181818


In [86]:
df1.sub(2)

Unnamed: 0,a,b,c,d
0,-2.0,-1.0,0.0,1.0
1,2.0,3.0,4.0,5.0
2,6.0,7.0,8.0,9.0


In [28]:
df1.rsub(2) # 2- df1

Unnamed: 0,a,b,c,d
0,2.0,1.0,0.0,-1.0
1,-2.0,-3.0,-4.0,-5.0
2,-6.0,-7.0,-8.0,-9.0


reindex

In [88]:
df1.reindex(columns = df2.columns, fill_value = 0)

Unnamed: 0,b,c,d,e
0,1.0,2.0,3.0,0
1,5.0,6.0,7.0,0
2,9.0,10.0,11.0,0


#### frame과 series간 브로드 캐스팅

In [30]:
frame = pd.DataFrame(np.arange(12.).reshape((4, 3)), columns = list('bde'), index = ['Utah', 'Ohio', 'Texas', 'Oregon'])

In [31]:
series = frame.iloc[0]

In [32]:
frame

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [33]:
series

b    0.0
d    1.0
e    2.0
Name: Utah, dtype: float64

In [34]:
frame - series #브로드 캐스팅

Unnamed: 0,b,d,e
Utah,0.0,0.0,0.0
Ohio,3.0,3.0,3.0
Texas,6.0,6.0,6.0
Oregon,9.0,9.0,9.0


- 기본적으로 Series의 index를 각 column을 맞추고 모든 row에 대해 계산해준다. 

브로드캐스팅 => axis = 1 기준

In [35]:
series2 = frame['b']

In [36]:
series2

Utah      0.0
Ohio      3.0
Texas     6.0
Oregon    9.0
Name: b, dtype: float64

In [37]:
frame - series2

Unnamed: 0,Ohio,Oregon,Texas,Utah,b,d,e
Utah,,,,,,,
Ohio,,,,,,,
Texas,,,,,,,
Oregon,,,,,,,


만약 Sereis의 index를 각 row에 맞추고 각 column에 대한 계산을 하고 싶다!

=> 산술 연산메서드 사용

In [38]:
frame.sub(series2, axis = 'index') #Series의 index를 daraframe의 각 index에 맞춘다

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,0.0,1.0,2.0
Texas,0.0,1.0,2.0
Oregon,0.0,1.0,2.0


axis : 연산을 적용할 축의 번호 => 숫자로 사용가능

In [39]:
frame.sub(series2, axis = 0)

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,0.0,1.0,2.0
Texas,0.0,1.0,2.0
Oregon,0.0,1.0,2.0


In [40]:
frame.sub(series2, axis = 1)

Unnamed: 0,Ohio,Oregon,Texas,Utah,b,d,e
Utah,,,,,,,
Ohio,,,,,,,
Texas,,,,,,,
Oregon,,,,,,,


#### 5.2.6 함수 적용과 매핑

In [41]:
frame = pd.DataFrame(np.random.randn(4, 3), index = ['Utah', 'Ohio', 'Texas', 'Oregon'], columns = list('bde'))

##### Numpy의 유니버셜 함수 적용

In [42]:
frame

Unnamed: 0,b,d,e
Utah,-1.736979,-0.922993,0.520305
Ohio,-0.44512,0.256149,-1.243236
Texas,-0.804748,-0.873878,0.449246
Oregon,0.921296,-0.230322,-0.812496


In [43]:
np.abs(frame)

Unnamed: 0,b,d,e
Utah,1.736979,0.922993,0.520305
Ohio,0.44512,0.256149,1.243236
Texas,0.804748,0.873878,0.449246
Oregon,0.921296,0.230322,0.812496


In [44]:
np.sum(frame) #defualt axis = 0

b   -2.065551
d   -1.771044
e   -1.086181
dtype: float64

##### Apply를 통한 함수 적용

In [45]:
f = lambda x: x.max() - x.min() 

x는 1차원 배열 
* axis = 0 => column이 같은 / 각 row => x가 됨.
* axis =1 => 각 row가 같은 / 각 column => x가 됨.

In [46]:
frame

Unnamed: 0,b,d,e
Utah,-1.736979,-0.922993,0.520305
Ohio,-0.44512,0.256149,-1.243236
Texas,-0.804748,-0.873878,0.449246
Oregon,0.921296,-0.230322,-0.812496


In [47]:
frame.apply(f)

b    2.658275
d    1.179143
e    1.763541
dtype: float64

In [48]:
frame.apply(f, axis = 'columns') # 아니면 axis = 1

Utah      2.257284
Ohio      1.499385
Texas     1.323125
Oregon    1.733792
dtype: float64

In [49]:
def f(x):
    return pd.Series([x.min(), x.max()], index = ['min', 'max']) # return 형이 Series

In [50]:
frame.apply(f)

Unnamed: 0,b,d,e
min,-1.736979,-0.922993,-1.243236
max,0.921296,0.256149,0.520305


In [51]:
format = lambda x: '%.2f' %x # 형식을 갖춘 문자열을 만들어주는 연산자: %
frame.applymap(format) #시리즈 각 원소에 적용할 함수를 지정하기위한 map메서드를 가지고 있다.

Unnamed: 0,b,d,e
Utah,-1.74,-0.92,0.52
Ohio,-0.45,0.26,-1.24
Texas,-0.8,-0.87,0.45
Oregon,0.92,-0.23,-0.81


데이터 타입 확인

In [52]:
change_frame = frame.applymap(format)

In [53]:
change_frame

Unnamed: 0,b,d,e
Utah,-1.74,-0.92,0.52
Ohio,-0.45,0.26,-1.24
Texas,-0.8,-0.87,0.45
Oregon,0.92,-0.23,-0.81


In [54]:
frame.dtypes

b    float64
d    float64
e    float64
dtype: object

In [55]:
change_frame.dtypes

b    object
d    object
e    object
dtype: object

applymap인 이유
- Series는 각 원소에 적용할 함수를 지정하기 위한 map 메소드를 가지고 있으므로

In [56]:
frame['e'].map(format)

Utah       0.52
Ohio      -1.24
Texas      0.45
Oregon    -0.81
Name: e, dtype: object

In [57]:
frame

Unnamed: 0,b,d,e
Utah,-1.736979,-0.922993,0.520305
Ohio,-0.44512,0.256149,-1.243236
Texas,-0.804748,-0.873878,0.449246
Oregon,0.921296,-0.230322,-0.812496


#### 5.2.7 정렬과 순위

In [58]:
obj = pd.Series(range(4), index = ['d', 'a', 'b', 'c'])

In [59]:
obj.sort_index() #index를 알파벳 기준으로 정렬

a    1
b    2
c    3
d    0
dtype: int64

In [60]:
assert id(obj) == id(obj.sort_index()) #새로운 객체를 return 하기 때문에

AssertionError: 

Dataframe은 row나 column 하나의 축을 기준으로 정렬가능함

In [61]:
frame = pd.DataFrame(np.arange(12).reshape((3, 4)), index = ['three', 'one', 'four'], columns = list('dabc'))

In [62]:
frame

Unnamed: 0,d,a,b,c
three,0,1,2,3
one,4,5,6,7
four,8,9,10,11


In [63]:
frame.sort_index()

Unnamed: 0,d,a,b,c
four,8,9,10,11
one,4,5,6,7
three,0,1,2,3


In [64]:
frame.sort_index(axis = 1)

Unnamed: 0,a,b,c,d
three,1,2,3,0
one,5,6,7,4
four,9,10,11,8


In [65]:
frame.sort_index(axis = 1, ascending = False)

Unnamed: 0,d,c,b,a
three,0,3,2,1
one,4,7,6,5
four,8,11,10,9


sort_values : 값에 따라 정렬

In [66]:
obj = pd.Series([4, 7, -3, 2])
obj.sort_values()

2   -3
3    2
0    4
1    7
dtype: int64

In [67]:
obj = pd.Series([4, 7, -3, 2, np.nan])
obj.sort_values()

2   -3.0
3    2.0
0    4.0
1    7.0
4    NaN
dtype: float64

In [68]:
obj.sort_values(ascending = False)

1    7.0
0    4.0
3    2.0
2   -3.0
4    NaN
dtype: float64

nan은 무조건 가장 뒤에 정렬

In [90]:
frame = pd.DataFrame({'a' : [4, 7, -3, 2], 'b' : [0, 1, 0, 1]}) #책 오타

In [91]:
frame

Unnamed: 0,a,b
0,4,0
1,7,1
2,-3,0
3,2,1


In [92]:
frame.sort_values(by = 'b')

Unnamed: 0,a,b
0,4,0
2,-3,0
1,7,1
3,2,1


column 여러개 정렬

In [71]:
frame.sort_values(by = ['a', 'b'])

Unnamed: 0,a,b
2,-3,0
3,2,1
0,4,0
1,7,1


순위 매기기(rank)

- 작은 순서가 1부터 시작 
- 겹치는 경우 평균 순위 => default가 average

In [95]:
obj

0    7
1   -5
2    7
3    4
4    2
5    0
6    4
dtype: int64

In [94]:
obj = pd.Series([7, -5, 7, 4, 2, 0, 4])
obj.rank()

0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64

In [73]:
obj.rank(method = 'first')

0    6.0
1    1.0
2    7.0
3    4.0
4    3.0
5    2.0
6    5.0
dtype: float64

데이터가 나타난 순서대로 순위를 매김

ex) 첫번째 7과 세번째 7에 대해서 첫번째 나온것이 세번째 나온것보다 더 빠른 순위

In [96]:
obj.rank(ascending = False ,method = 'max')

0    2.0
1    7.0
2    2.0
3    4.0
4    5.0
5    6.0
6    4.0
dtype: float64

동률인 경우 그룹내 높은 순위

ex) 0번쨰 2번째 7에 대해서 각 7이 1등 2등이므로 그중 2등으로 값을 주겟다!

Axis기준으로 순위 정하기

In [75]:
frame = pd.DataFrame({'b' : [4.3, 7, -3, 2], 'a' : [0, 1, 0, 1], 'c' : [-2, 5, 8, -2.5]})

In [85]:
frame

Unnamed: 0,b,a,c
0,4.3,0,-2.0
1,7.0,1,5.0
2,-3.0,0,8.0
3,2.0,1,-2.5


In [76]:
frame.rank(axis = 'columns')

Unnamed: 0,b,a,c
0,3.0,2.0,1.0
1,3.0,1.0,2.0
2,1.0,2.0,3.0
3,3.0,2.0,1.0


### 5.2.8 중복index

In [77]:
obj = pd.Series(range(5), index = ['a', 'a', 'b', 'b', 'c'])

In [78]:
obj.index.is_unique

False

중복된 index => series로

In [79]:
obj['a']

a    0
a    1
dtype: int64

중복된 index 

In [80]:
df = pd.DataFrame(np.random.randn(4, 3), index = ['a', 'a', 'b', 'b'])

In [81]:
df.loc['b']

Unnamed: 0,0,1,2
b,1.257123,0.001944,-2.194547
b,-2.429467,1.320653,-0.601182


column도 마찬가지!

In [82]:
df = pd.DataFrame(np.random.randn(3, 4), columns = ['a', 'a', 'b', 'b'])

In [83]:
df

Unnamed: 0,a,a.1,b,b.1
0,-1.018089,0.073702,-1.024931,-0.499433
1,-0.330453,1.099487,0.519535,-1.19032
2,-0.499915,0.994674,0.778697,-0.926325


In [84]:
df['a']

Unnamed: 0,a,a.1
0,-1.018089,0.073702
1,-0.330453,1.099487
2,-0.499915,0.994674
