In [1]:
import numpy as np
import pandas as pd

In [2]:
pd.__version__

'2.0.1'

In [3]:
arr = np.arange(100,105)
arr

array([100, 101, 102, 103, 104])

In [4]:
s = pd.Series(arr)
s

0    100
1    101
2    102
3    103
4    104
dtype: int32

In [5]:
s = pd.Series(arr, dtype='int32')
s

0    100
1    101
2    102
3    103
4    104
dtype: int32

In [6]:
s = pd.Series(['부장','차장','대리','사원','인턴'])
s

0    부장
1    차장
2    대리
3    사원
4    인턴
dtype: object

In [7]:
s = pd.Series([91,2.5,'스포츠','sports',4,5.16])
s

0        91
1       2.5
2       스포츠
3    sports
4         4
5      5.16
dtype: object

In [8]:
s.index = list('abcdef')
print(s.index)
s

Index(['a', 'b', 'c', 'd', 'e', 'f'], dtype='object')


a        91
b       2.5
c       스포츠
d    sports
e         4
f      5.16
dtype: object

In [9]:
# values 는 Series 데이터 값만 numpy array형식으로 가져옴
s.values

array([91, 2.5, '스포츠', 'sports', 4, 5.16], dtype=object)

In [10]:
s.ndim

1

In [11]:
s.shape

(6,)

In [12]:
s = pd.Series(['선화','강호',np.nan, '소정','우영'])
s

0     선화
1     강호
2    NaN
3     소정
4     우영
dtype: object

In [13]:
s1 = pd.Series(np.arange(50,55), dtype = 'float32')
s1

0    50.0
1    51.0
2    52.0
3    53.0
4    54.0
dtype: float32

In [14]:
s2 = pd.Series(['apple',np.nan, 'banana','kiwi','gubong'], index = list('가나다라마'))
s2

가     apple
나       NaN
다    banana
라      kiwi
마    gubong
dtype: object

In [15]:
s = pd.Series(['손흥민','김연아','박세리','박찬호','김연경'], index = ['a','b','c','d','e'])
s

a    손흥민
b    김연아
c    박세리
d    박찬호
e    김연경
dtype: object

In [16]:
s[[True,True,False,False,True]]

a    손흥민
b    김연아
e    김연경
dtype: object

In [17]:
s = pd.Series([29,99,np.nan,11,56], index = ['a','b','c','d','e'])
s

a    29.0
b    99.0
c     NaN
d    11.0
e    56.0
dtype: float64

In [18]:
s>50

a    False
b     True
c    False
d    False
e     True
dtype: bool

In [19]:
s[s>50]

b    99.0
e    56.0
dtype: float64

In [20]:
s.isnull()
# 또는
# s.isna()

a    False
b    False
c     True
d    False
e    False
dtype: bool

In [21]:
# 결측치 찾기
s[s.isnull()]

c   NaN
dtype: float64

In [22]:
# 결측치 개수
s.isnull().sum()

1

In [23]:
s.notnull()

a     True
b     True
c    False
d     True
e     True
dtype: bool

In [24]:
s[s.notnull()]

a    29.0
b    99.0
d    11.0
e    56.0
dtype: float64

# Dataframe

In [25]:
pd.DataFrame([[1,2,3],
             [4,5,6],
             [7,8,9]])

Unnamed: 0,0,1,2
0,1,2,3
1,4,5,6
2,7,8,9


In [26]:
pd.DataFrame([[1,2,3],
             [4,5,6],
             [7,8,9]], columns=['가','나','다'])

Unnamed: 0,가,나,다
0,1,2,3
1,4,5,6
2,7,8,9


In [27]:
data = {
    'name' : ['kim','lee','park'],
    'age' : [24,27,34],
    'children' : [2,1,3]
}
df = pd.DataFrame(data)
df

Unnamed: 0,name,age,children
0,kim,24,2
1,lee,27,1
2,park,34,3


In [28]:
print(df.index)
print(df.columns)
print(df.values)
print(df.dtypes)

RangeIndex(start=0, stop=3, step=1)
Index(['name', 'age', 'children'], dtype='object')
[['kim' 24 2]
 ['lee' 27 1]
 ['park' 34 3]]
name        object
age          int64
children     int64
dtype: object


In [29]:
df.T

Unnamed: 0,0,1,2
name,kim,lee,park
age,24,27,34
children,2,1,3


In [30]:
df.index = list('abc')
df

Unnamed: 0,name,age,children
a,kim,24,2
b,lee,27,1
c,park,34,3


In [31]:
print(df['name'])
print(type(df['name']))

a     kim
b     lee
c    park
Name: name, dtype: object
<class 'pandas.core.series.Series'>


In [32]:
df[['name','children']]

Unnamed: 0,name,children
a,kim,2
b,lee,1
c,park,3


In [33]:
df.rename({'name': '이름'}, axis = 1, inplace= True)
df

Unnamed: 0,이름,age,children
a,kim,24,2
b,lee,27,1
c,park,34,3


In [34]:
data = {
    'food': ['KFC', 'McDonald', 'SchoolFood'], 
    'price': [1000, 2000, 2500], 
    'rating': [4.5, 3.9, 4.2]
}

df = pd.DataFrame(data)
df

Unnamed: 0,food,price,rating
0,KFC,1000,4.5
1,McDonald,2000,3.9
2,SchoolFood,2500,4.2


In [35]:
df[['food', 'rating']]

Unnamed: 0,food,rating
0,KFC,4.5
1,McDonald,3.9
2,SchoolFood,4.2


In [36]:
df.rename(columns={'food': 'place'}, inplace=True)
df

Unnamed: 0,place,price,rating
0,KFC,1000,4.5
1,McDonald,2000,3.9
2,SchoolFood,2500,4.2


In [37]:
from pandas import Series, DataFrame

In [38]:
s1 = Series([7.3, -2.5, 3.4, 1.5], index=['a', 'c', 'd', 'e'])
s1

a    7.3
c   -2.5
d    3.4
e    1.5
dtype: float64

In [39]:
s2 = Series([-2.1, 3.6, -1.5, 4, 3.1], index=['a', 'c', 'e', 'f', 'g'])
s2

a   -2.1
c    3.6
e   -1.5
f    4.0
g    3.1
dtype: float64

In [40]:
# 더하기. 겹치는 색인이 없다면 데이터는 NA 값이 됨.
s1 + s2

a    5.2
c    1.1
d    NaN
e    0.0
f    NaN
g    NaN
dtype: float64

In [41]:
df1 = DataFrame(np.arange(9.).reshape((3, 3)), columns=list('bcd'),
                index=['Ohio', 'Texas', 'Colorado'])
df1

Unnamed: 0,b,c,d
Ohio,0.0,1.0,2.0
Texas,3.0,4.0,5.0
Colorado,6.0,7.0,8.0


In [42]:
df2 = DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'),
                index=['Utah', 'Ohio', 'Texas', 'Oregon'])
df2

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [43]:
df1 + df2 #shape이 다를 경우, 공통으로 들어가 있는 부분만 합연산이 되고, 그렇지 않으면 NaN으로 처리됨

Unnamed: 0,b,c,d,e
Colorado,,,,
Ohio,3.0,,6.0,
Oregon,,,,
Texas,9.0,,12.0,
Utah,,,,


In [44]:
# 메서드도 있다.
df1.add(df2)

Unnamed: 0,b,c,d,e
Colorado,,,,
Ohio,3.0,,6.0,
Oregon,,,,
Texas,9.0,,12.0,
Utah,,,,


In [45]:
frame = DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'),
                  index=['Utah', 'Ohio', 'Texas', 'Oregon'])
frame

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [46]:
series = frame.loc["Utah"]
series

b    0.0
d    1.0
e    2.0
Name: Utah, dtype: float64

In [47]:
frame - series

Unnamed: 0,b,d,e
Utah,0.0,0.0,0.0
Ohio,3.0,3.0,3.0
Texas,6.0,6.0,6.0
Oregon,9.0,9.0,9.0


In [48]:
series2 = Series(range(3), index=['b', 'e', 'f'])
series2

b    0
e    1
f    2
dtype: int64

In [49]:
frame + series2

Unnamed: 0,b,d,e,f
Utah,0.0,,3.0,
Ohio,3.0,,6.0,
Texas,6.0,,9.0,
Oregon,9.0,,12.0,


In [50]:
# 함수 적용과 매핑
frame = DataFrame(np.random.randn(4, 3), columns=list('bde'),  #randn() 표준정규분포 난수, 
                  index=['Utah', 'Ohio', 'Texas', 'Oregon'])
frame

Unnamed: 0,b,d,e
Utah,0.587832,-0.862606,1.411133
Ohio,1.045656,-0.04438,-0.680913
Texas,0.025367,1.792629,1.398341
Oregon,1.129559,0.507098,-0.258145


In [51]:
# NumPy의 유니버셜 함수(배열의 각 원소에 적용되는 메서드) 적용 가능
np.abs(frame)

Unnamed: 0,b,d,e
Utah,0.587832,0.862606,1.411133
Ohio,1.045656,0.04438,0.680913
Texas,0.025367,1.792629,1.398341
Oregon,1.129559,0.507098,0.258145


In [52]:
# apply를 통해 각 로우나 칼럼의 1차원 배열에 함수를 적용
f = lambda x: x.max() - x.min()

print(frame.apply(f)) #default, axis=0 행방향, 즉 각 열에 대해 적용
print(frame.apply(f, axis=1)) # 열방향, 즉 각 행에 대해

b    1.104192
d    2.655235
e    2.092046
dtype: float64
Utah      2.273739
Ohio      1.726569
Texas     1.767263
Oregon    1.387704
dtype: float64


In [53]:
# apply 메서드에 전달된 함수는 스칼라 값을 반환할 필요 없으며, 
# Series 또는 여러 값을 반환해도 된다.
def f(x):
    return Series([x.min(), x.max()], index=['min', 'max'])

frame.apply(f)

Unnamed: 0,b,d,e
min,0.025367,-0.862606,-0.680913
max,1.129559,1.792629,1.411133


In [54]:
# 각 원소에 함수가 적용되게 하고 싶으면 applymap
format = lambda x: '%.2f' % x

frame.applymap(format)

Unnamed: 0,b,d,e
Utah,0.59,-0.86,1.41
Ohio,1.05,-0.04,-0.68
Texas,0.03,1.79,1.4
Oregon,1.13,0.51,-0.26


In [55]:
obj = Series(range(4), index=['d', 'a', 'b', 'c'])
obj

d    0
a    1
b    2
c    3
dtype: int64

In [56]:
obj.sort_index()

a    1
b    2
c    3
d    0
dtype: int64

In [57]:
frame = DataFrame(np.arange(8).reshape((2, 4)), index=['three', 'one'],
                  columns=['d', 'a', 'b', 'c'])
frame

Unnamed: 0,d,a,b,c
three,0,1,2,3
one,4,5,6,7


In [58]:
frame.sort_index()

Unnamed: 0,d,a,b,c
one,4,5,6,7
three,0,1,2,3


In [59]:
frame.sort_index(axis=1)

Unnamed: 0,a,b,c,d
three,1,2,3,0
one,5,6,7,4


In [60]:
frame.sort_index(axis=1, ascending=False)

Unnamed: 0,d,c,b,a
three,0,3,2,1
one,4,7,6,5


In [61]:
# 순위 매기기 rank
obj = Series([7, -5, 7, 4, 2, 0, 4])
obj 

0    7
1   -5
2    7
3    4
4    2
5    0
6    4
dtype: int64

In [62]:
obj.rank()

0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64

In [63]:
obj.rank(method='first') # 같은 값일때 먼저 나온 것에 순위를 먼저 부여

0    6.0
1    1.0
2    7.0
3    4.0
4    3.0
5    2.0
6    5.0
dtype: float64

### 동점을 가진 데이터의 순위를 정하는 방법
average(평균): 그룹의 평균 순위 부여 (예: 두 명이 공동 1등이라면 둘 다 1.5등으로 처리)

min(최솟값): 그룹에서 가장 낮은 순위 부여 (예: 두 명이 공동 1등이라면 둘 다 1등으로 처리)

max(최댓값): 그룹에서 가장 낮은 순위 부여 (예: 두 명이 공동 1등이라면 둘 다 2등으로 처리)

first(첫 번째): 그룹에서 표시되는 순서대로 순위 부여 (예: 두 명이 공동 1등이라면 순서가 빠른 사람을 1등으로 처리)

dense(밀도): min과 동일함. 다만 순위는 항상 1씩 증가

In [64]:
obj.rank(ascending=False, method='max')

0    2.0
1    7.0
2    2.0
3    4.0
4    5.0
5    6.0
6    4.0
dtype: float64

In [65]:
frame = DataFrame({'b': [4.3, 7, -3, 2],
                   'a':[0, 1, 0, 1],
                   'c':[-2, 5, 8, -2.5]})
frame

Unnamed: 0,b,a,c
0,4.3,0,-2.0
1,7.0,1,5.0
2,-3.0,0,8.0
3,2.0,1,-2.5


In [66]:
frame.rank()

Unnamed: 0,b,a,c
0,3.0,1.5,2.0
1,4.0,3.5,3.0
2,1.0,1.5,4.0
3,2.0,3.5,1.0


In [67]:
frame.rank(axis=1)

Unnamed: 0,b,a,c
0,3.0,2.0,1.0
1,3.0,1.0,2.0
2,1.0,2.0,3.0
3,3.0,2.0,1.0


In [68]:
# 중복색인
# 색인 값이 유일하면 좋지만(pandas의 많은 함수들이 이런 가정하에 입력을 받는다), 강제사항은 아니다.
obj = Series(range(5), index=['a', 'a', 'b', 'b', 'c'])
obj

a    0
a    1
b    2
b    3
c    4
dtype: int64

In [69]:
df = DataFrame(np.random.randn(4, 3), index=['a', 'a', 'b', 'b'])
df

Unnamed: 0,0,1,2
a,-0.198477,0.686774,-1.150857
a,-1.094344,1.345912,-0.780949
b,1.065559,-0.223104,1.367888
b,0.159589,-0.326692,-1.071094


In [70]:
import numpy as np
df = DataFrame([[1.4, np.nan], [7.1, -4.5],
                [np.nan, np.nan], [0.75, -1.3]],
               index=['a', 'b', 'c', 'd'],
               columns=['one', 'two'])
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [71]:
df.describe()

Unnamed: 0,one,two
count,3.0,2.0
mean,3.083333,-2.9
std,3.493685,2.262742
min,0.75,-4.5
25%,1.075,-3.7
50%,1.4,-2.9
75%,4.25,-2.1
max,7.1,-1.3


In [72]:
df.sum()

one    9.25
two   -5.80
dtype: float64

In [73]:
df.sum(axis=1)

a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

In [74]:
df.sum(axis=1, skipna = False)

a     NaN
b    2.60
c     NaN
d   -0.55
dtype: float64

In [75]:
df.idxmax()  # 행의 방향으로...각열별 최대값을 갖는 인덱스

one    b
two    d
dtype: object

In [76]:
# 수치 데이터가 아니면 빈도 관련 기술요약치를 반환
obj = Series(['a', 'a', 'b', 'c'] * 4)
obj

0     a
1     a
2     b
3     c
4     a
5     a
6     b
7     c
8     a
9     a
10    b
11    c
12    a
13    a
14    b
15    c
dtype: object

In [77]:
obj.describe()

count     16
unique     3
top        a
freq       8
dtype: object

In [78]:
# 유일 값
from pandas import Series, DataFrame
import pandas as pd
obj = Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])
obj

0    c
1    a
2    d
3    a
4    a
5    b
6    b
7    c
8    c
dtype: object

In [79]:
uniques = obj.unique()
uniques

array(['c', 'a', 'd', 'b'], dtype=object)

In [80]:
# 카운트
obj.value_counts()

c    3
a    3
b    2
d    1
Name: count, dtype: int64

In [81]:
pd.value_counts(obj.values, sort=True)

c    3
a    3
b    2
d    1
Name: count, dtype: int64

In [82]:
# 멤버십
obj

0    c
1    a
2    d
3    a
4    a
5    b
6    b
7    c
8    c
dtype: object

In [83]:
mask = obj.isin(['b', 'c'])
mask

0     True
1    False
2    False
3    False
4    False
5     True
6     True
7     True
8     True
dtype: bool

In [84]:
# 결측치찾기
string_data = Series(['aardvark', 'artichoke', np.nan, 'avocado'])
string_data

0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object

In [85]:
string_data.isnull()

0    False
1    False
2     True
3    False
dtype: bool

In [86]:
string_data[0] = None
string_data

0         None
1    artichoke
2          NaN
3      avocado
dtype: object

In [87]:
string_data.isnull()

0     True
1    False
2     True
3    False
dtype: bool

In [88]:
# 결측치 제거
from numpy import nan as NA

data = Series([1, NA, 3.5, NA, 7])
data


0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64

In [89]:
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [90]:
data = DataFrame([[1., 6.5, 3.], [1., NA, NA],
                  [NA, NA, NA], [NA, 9, 2]])
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,9.0,2.0


In [91]:
# 결측치가 들어간 행 전체삭제
cleaned = data.dropna()
cleaned

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [92]:
# 모든 값이 NA인 로우만 제외
data.dropna(how='all')# 카운트
obj.value_counts()

c    3
a    3
b    2
d    1
Name: count, dtype: int64

In [93]:
# 몇개 이상 값이 들어있는가를 기준으로
data.dropna(thresh=2)

# 2개 이상 결측치를 제거

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
3,,9.0,2.0


In [94]:
# 결측치 채우기 : fillna
df = DataFrame(np.random.randn(7, 3))
df.loc[:4, 1] = NA; df.loc[:2, 2] = NA
df

Unnamed: 0,0,1,2
0,1.029891,,
1,0.706446,,
2,-0.78338,,
3,-0.882686,,2.311601
4,0.931358,,0.713479
5,2.362058,0.039827,0.134858
6,0.034472,0.002271,0.795787


In [95]:
df.fillna(0)

Unnamed: 0,0,1,2
0,1.029891,0.0,0.0
1,0.706446,0.0,0.0
2,-0.78338,0.0,0.0
3,-0.882686,0.0,2.311601
4,0.931358,0.0,0.713479
5,2.362058,0.039827,0.134858
6,0.034472,0.002271,0.795787


In [96]:
# key는 컬럼 색인을 뜻함
df.fillna({1: 0.5, 2: -1})

Unnamed: 0,0,1,2
0,1.029891,0.5,-1.0
1,0.706446,0.5,-1.0
2,-0.78338,0.5,-1.0
3,-0.882686,0.5,2.311601
4,0.931358,0.5,0.713479
5,2.362058,0.039827,0.134858
6,0.034472,0.002271,0.795787


In [97]:
# 보간법 가능
df = DataFrame(np.random.randn(6, 3))
df.loc[2:, 1] = NA
df.loc[4:, 2] = NA
df

Unnamed: 0,0,1,2
0,-2.220505,-1.651933,-0.429081
1,0.91206,-0.048332,-0.149949
2,0.001807,,1.295748
3,-0.223388,,-2.459196
4,-0.455859,,
5,0.085139,,


In [98]:
df.fillna(method='ffill')

Unnamed: 0,0,1,2
0,-2.220505,-1.651933,-0.429081
1,0.91206,-0.048332,-0.149949
2,0.001807,-0.048332,1.295748
3,-0.223388,-0.048332,-2.459196
4,-0.455859,-0.048332,-2.459196
5,0.085139,-0.048332,-2.459196


In [99]:
# 함수도 넘길 수 있다.
print(df.mean(axis = 0)) # 각 열의 평균값으로 대체
df.fillna(df.mean())

0   -0.316791
1   -0.850132
2   -0.435619
dtype: float64


Unnamed: 0,0,1,2
0,-2.220505,-1.651933,-0.429081
1,0.91206,-0.048332,-0.149949
2,0.001807,-0.850132,1.295748
3,-0.223388,-0.850132,-2.459196
4,-0.455859,-0.850132,-0.435619
5,0.085139,-0.850132,-0.435619


In [100]:
# 계층적 색인
data = Series(np.random.randn(10),
              index = [['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'd', 'd'],
                       [1, 2, 3, 1, 2, 3, 1, 2, 2, 3]])
data

a  1   -0.843575
   2   -0.869825
   3    0.478418
b  1   -0.619083
   2   -0.672219
   3   -0.187074
c  1   -0.364353
   2    0.172796
d  2    0.007455
   3   -0.451774
dtype: float64

In [101]:
data.index

MultiIndex([('a', 1),
            ('a', 2),
            ('a', 3),
            ('b', 1),
            ('b', 2),
            ('b', 3),
            ('c', 1),
            ('c', 2),
            ('d', 2),
            ('d', 3)],
           )

In [102]:
# 계층 색인 풀기 - wide 형 (행의 인덱스가 컬럼으로 옮겨감)
data.unstack()

Unnamed: 0,1,2,3
a,-0.843575,-0.869825,0.478418
b,-0.619083,-0.672219,-0.187074
c,-0.364353,0.172796,
d,,0.007455,-0.451774


In [103]:
# 데이터프레임 로우,칼럼 색인을 계층 색인으로 묶어내기 - long 형 
# 컬럼의 인덱스가 행의 인덱스로 옮겨감
# ( 원상복귀 )
data.unstack().stack()

a  1   -0.843575
   2   -0.869825
   3    0.478418
b  1   -0.619083
   2   -0.672219
   3   -0.187074
c  1   -0.364353
   2    0.172796
d  2    0.007455
   3   -0.451774
dtype: float64

In [104]:
# 2중 행렬
frame = DataFrame(np.arange(16).reshape((4, 4)),
                  index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
                  columns=[['Ohio', 'Ohio', 'Colorado','Colorado'],
                           ['Green', 'Red', 'Green','Red']])
frame

Unnamed: 0_level_0,Unnamed: 1_level_0,Ohio,Ohio,Colorado,Colorado
Unnamed: 0_level_1,Unnamed: 1_level_1,Green,Red,Green,Red
a,1,0,1,2,3
a,2,4,5,6,7
b,1,8,9,10,11
b,2,12,13,14,15


In [105]:
# 계층 색인에 이름 주기
frame.index.names = ['key1', 'key2']
frame

Unnamed: 0_level_0,Unnamed: 1_level_0,Ohio,Ohio,Colorado,Colorado
Unnamed: 0_level_1,Unnamed: 1_level_1,Green,Red,Green,Red
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
a,1,0,1,2,3
a,2,4,5,6,7
b,1,8,9,10,11
b,2,12,13,14,15


In [106]:
# 계층 열에 이름 주기
frame.columns.names = ['state', 'color']
frame

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado,Colorado
Unnamed: 0_level_1,color,Green,Red,Green,Red
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
a,1,0,1,2,3
a,2,4,5,6,7
b,1,8,9,10,11
b,2,12,13,14,15


In [107]:
frame['Ohio']

Unnamed: 0_level_0,color,Green,Red
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,0,1
a,2,4,5
b,1,8,9
b,2,12,13


In [108]:
frame['Ohio'].loc['a']

color,Green,Red
key2,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0,1
2,4,5


In [109]:
frame.groupby(level='key2').sum()

state,Ohio,Ohio,Colorado,Colorado
color,Green,Red,Green,Red
key2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,8,10,12,14
2,16,18,20,22


In [110]:
frame.groupby(level='color', axis=1).sum()

Unnamed: 0_level_0,color,Green,Red
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,2,4
a,2,10,12
b,1,18,20
b,2,26,28


In [111]:
# 행과 열의 교환
frame = DataFrame({'a': range(7),
                   'b': range(7, 0, -1),
                   'c': ['one', 'one', 'one', 'two', 'two', 'two', 'two'],
                   'd': [0, 1, 2, 0, 1, 2, 3]})
frame

Unnamed: 0,a,b,c,d
0,0,7,one,0
1,1,6,one,1
2,2,5,one,2
3,3,4,two,0
4,4,3,two,1
5,5,2,two,2
6,6,1,two,3


In [112]:
# 열을 행으로 옮길 수 있다.
frame2 = frame.set_index(['c', 'd'])
frame2

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1
one,0,0,7
one,1,1,6
one,2,2,5
two,0,3,4
two,1,4,3
two,2,5,2
two,3,6,1


In [113]:
#로우 색인 -> 컬럼 색인으로
frame2.reset_index()

Unnamed: 0,c,d,a,b
0,one,0,0,7
1,one,1,1,6
2,one,2,2,5
3,two,0,3,4
4,two,1,4,3
5,two,2,5,2
6,two,3,6,1


In [114]:
# 내부조인 inner join
df1 = DataFrame({'key' : ['b','b','a','c','a','a','b'],
                'data1': range(7)})
df1

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,a,5
6,b,6


In [115]:
df2 = DataFrame({'key' : ['a','b','d'],
                'data2': range(3)})
df2

Unnamed: 0,key,data2
0,a,0
1,b,1
2,d,2


In [116]:
# 암묵적으로 겹치는 컬럼을 키로 사용해서 합침
pd.merge(df1,df2) #교집합을 반환

Unnamed: 0,key,data1,data2
0,b,0,1
1,b,1,1
2,b,6,1
3,a,2,0
4,a,4,0
5,a,5,0


In [117]:
# 명시적으로 키를 설정할 수 있다.
pd.merge(df1,df2, on='key')

Unnamed: 0,key,data1,data2
0,b,0,1
1,b,1,1
2,b,6,1
3,a,2,0
4,a,4,0
5,a,5,0


In [118]:
# 공통되는 칼럼이 없으면 정해줄수 있다.

df3 = DataFrame({'lkey' : ['b','b','a','c','a','a','b'],
                'data1': range(7)})
df3

Unnamed: 0,lkey,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,a,5
6,b,6


In [119]:
df4 = DataFrame({'rkey' : ['a','b','d'],
                'data2': range(3)})
df4

Unnamed: 0,rkey,data2
0,a,0
1,b,1
2,d,2


In [120]:
# 따로 정해줌
pd.merge(df3, df4, left_on='lkey', right_on='rkey')

Unnamed: 0,lkey,data1,rkey,data2
0,b,0,b,1
1,b,1,b,1
2,b,6,b,1
3,a,2,a,0
4,a,4,a,0
5,a,5,a,0


In [121]:
# 완전 외부조인(outer)
df1

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,a,5
6,b,6


In [122]:
df2

Unnamed: 0,key,data2
0,a,0
1,b,1
2,d,2


In [123]:
pd.merge(df1, df2, how='outer') #합집합의 개념|

Unnamed: 0,key,data1,data2
0,b,0.0,1.0
1,b,1.0,1.0
2,b,6.0,1.0
3,a,2.0,0.0
4,a,4.0,0.0
5,a,5.0,0.0
6,c,3.0,
7,d,,2.0


In [124]:
# 왼쪽 우선 외부조인(left)
df1 = DataFrame({'key' : ['b','b','a','c','a','b'],
                'data1': range(6)})
df2 = DataFrame({'key' : ['a','b','a', 'b', 'd'],
                'data2': range(5)})

In [125]:
df1

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,b,5


In [126]:
df2

Unnamed: 0,key,data2
0,a,0
1,b,1
2,a,2
3,b,3
4,d,4


In [127]:
# 왼쪽 우선 외부조인은 왼쪽의 모든 로우를 포함하는 결과를 반환한다
pd.merge(df1, df2, on='key', how='left')

Unnamed: 0,key,data1,data2
0,b,0,1.0
1,b,0,3.0
2,b,1,1.0
3,b,1,3.0
4,a,2,0.0
5,a,2,2.0
6,c,3,
7,a,4,0.0
8,a,4,2.0
9,b,5,1.0


In [128]:
# 오른쪽 우선 외부조인(right)
# 오른쪽 우선 외부조인은 오른쪽의 모든 로우를 포함하는 결과를 반환한다
pd.merge(df1, df2, on='key', how='right')

Unnamed: 0,key,data1,data2
0,a,2.0,0
1,a,4.0,0
2,b,0.0,1
3,b,1.0,1
4,b,5.0,1
5,a,2.0,2
6,a,4.0,2
7,b,0.0,3
8,b,1.0,3
9,b,5.0,3


In [129]:
# 색인 merge
left1 = DataFrame({'key' : ['a', 'b', 'a', 'a', 'b', 'c'],
                           'value' : range(6)})
left1

Unnamed: 0,key,value
0,a,0
1,b,1
2,a,2
3,a,3
4,b,4
5,c,5


In [130]:
right1 = DataFrame({'group_val' : [3.5, 7]}, index=['a', 'b'])
right1

Unnamed: 0,group_val
a,3.5
b,7.0


In [131]:
# 기본은 교집합
pd.merge(left1, right1, left_on='key', right_index=True)

Unnamed: 0,key,value,group_val
0,a,0,3.5
2,a,2,3.5
3,a,3,3.5
1,b,1,7.0
4,b,4,7.0


In [132]:
#  외부조인
pd.merge(left1, right1, left_on='key', right_index=True, how='outer')

Unnamed: 0,key,value,group_val
0,a,0,3.5
2,a,2,3.5
3,a,3,3.5
1,b,1,7.0
4,b,4,7.0
5,c,5,


In [133]:
# concat
# 하나의 축을 따라 객체를 이어붙인다.

# numpy의 concatenate 함수
arr = np.arange(12).reshape((3, 4))
arr

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

In [134]:
np.concatenate([arr, arr], axis=1) # axis=1 열의 방향으로

array([[ 0,  1,  2,  3,  0,  1,  2,  3],
       [ 4,  5,  6,  7,  4,  5,  6,  7],
       [ 8,  9, 10, 11,  8,  9, 10, 11]])

In [135]:
# pandas의 concatenate 함수

# Series 객체
s1 = Series([0, 1], index=['a', 'b'])
s2 = Series([2, 3, 4], index=['c', 'd', 'e'])
s3 = Series([5, 6], index=['f', 'g'])

print(s1,'\n',s2,'\n',s3)

a    0
b    1
dtype: int64 
 c    2
d    3
e    4
dtype: int64 
 f    5
g    6
dtype: int64


In [136]:
# 합치기
pd.concat([s1, s2, s3])

a    0
b    1
c    2
d    3
e    4
f    5
g    6
dtype: int64

In [137]:
# 축 바꿔서
pd.concat([s1, s2, s3], axis=1)

Unnamed: 0,0,1,2
a,0.0,,
b,1.0,,
c,,2.0,
d,,3.0,
e,,4.0,
f,,,5.0
g,,,6.0


In [138]:
s4 = pd.concat([s1*5, s3])
s4

a    0
b    5
f    5
g    6
dtype: int64

In [139]:
# DataFrame도 비슷하게 작동한다

df1 = DataFrame(np.arange(6).reshape(3, 2), index = ['a', 'b', 'c'],
                        columns = ['one', 'two'])

df2 = DataFrame(5 + np.arange(4).reshape(2, 2), index = ['a',  'c'],
                        columns = ['three', 'four'])

In [140]:
print(df1)
print(df2)

   one  two
a    0    1
b    2    3
c    4    5
   three  four
a      5     6
c      7     8


In [141]:
pd.concat([df1, df2])

Unnamed: 0,one,two,three,four
a,0.0,1.0,,
b,2.0,3.0,,
c,4.0,5.0,,
a,,,5.0,6.0
c,,,7.0,8.0


In [142]:
pd.concat([df1, df2], axis=1)

Unnamed: 0,one,two,three,four
a,0,1,5.0,6.0
b,2,3,,
c,4,5,7.0,8.0


In [143]:
pd.concat([df1, df2], axis=1, keys=['level1', 'level2'])

Unnamed: 0_level_0,level1,level1,level2,level2
Unnamed: 0_level_1,one,two,three,four
a,0,1,5.0,6.0
b,2,3,,
c,4,5,7.0,8.0


In [144]:
# combine_first
# 두 객체를 포개서 한 객체에서 누락된 데이터를 다른 객체에 있는 값으로 채울 수 있게 한다.

a = Series([np.nan, 2.5, np.nan, 3.5, 4.5, np.nan], index=['f', 'e', 'd', 'c', 'b', 'a'])
a

f    NaN
e    2.5
d    NaN
c    3.5
b    4.5
a    NaN
dtype: float64

In [145]:
b = Series(np.arange(len(a), dtype=np.float64), index=['f', 'e', 'd', 'c', 'b', 'a'])
b[-1] = np.nan
b

f    0.0
e    1.0
d    2.0
c    3.0
b    4.0
a    NaN
dtype: float64

In [146]:
# 두 자료의 색인이 겹친다. 이러면(색인이 일부겹치거나 완전히 같거나) 머지나 이어붙이기로는 불가능
# 이때 조건절로 합침
np.where(pd.isnull(a), b, a)
# a의 값이 결측치인 경우 b로 채우고, 아닌 경우 a로 채우기

array([0. , 2.5, 2. , 3.5, 4.5, nan])

In [147]:
print(b[:-2])
print(a[2:])

f    0.0
e    1.0
d    2.0
c    3.0
dtype: float64
d    NaN
c    3.5
b    4.5
a    NaN
dtype: float64


In [148]:
# 위의 것과 동일하다.
b[:-2].combine_first(a[2:])
# b[:-2] 부터 채우고 나머지 a[2:]로 채우기

a    NaN
b    4.5
c    3.0
d    2.0
e    1.0
f    0.0
dtype: float64

In [149]:
a[2:]

d    NaN
c    3.5
b    4.5
a    NaN
dtype: float64

In [150]:
b[:-2]

f    0.0
e    1.0
d    2.0
c    3.0
dtype: float64

In [151]:
# DataFrame은 컬럼에 대해 위와 같이 동작
df1 = DataFrame({'a': [1., np.nan, 5., np.nan],
                            'b': [np.nan, 2., np.nan, 6.],
                            'c': range(2, 18, 4)})
df1

Unnamed: 0,a,b,c
0,1.0,,2
1,,2.0,6
2,5.0,,10
3,,6.0,14


In [152]:
df2 = DataFrame({'a': [5., 4., np.nan, 3., 7],
                            'b': [np.nan, 3., 4., 6., 8.]})
df2

Unnamed: 0,a,b
0,5.0,
1,4.0,3.0
2,,4.0
3,3.0,6.0
4,7.0,8.0


In [153]:
df1.combine_first(df2)

Unnamed: 0,a,b,c
0,1.0,,2.0
1,4.0,2.0,6.0
2,5.0,4.0,10.0
3,3.0,6.0,14.0
4,7.0,8.0,


## 재형성과 피벗
표 형식의 데이터를 재배치하는 다양한 기본 연산이 존재하는데

이런 연산을 재형성reshaping 또는 피벗이라고 한다.
- 계층적 색인으로 재형성
- 피버팅


계층적 색인으로 재형성
- stack : 데이터의 컬럼을 로우로 피벗 또는 회전
- unstack : 로우를 칼럼으로 피벗시킨다

In [154]:
data = DataFrame(np.arange(6).reshape((2, 3)),
                         index=pd.Index(['Ohio', 'Colorado'], name='state'),
                        columns = pd.Index(['one', 'two', 'three'], name='number'))
data

number,one,two,three
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ohio,0,1,2
Colorado,3,4,5


In [155]:
# long 포맷으로 변형
result = data.stack()
result

state     number
Ohio      one       0
          two       1
          three     2
Colorado  one       3
          two       4
          three     5
dtype: int32

In [156]:
# wide 포맷으로 변형
result.unstack()

number,one,two,three
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ohio,0,1,2
Colorado,3,4,5


In [157]:
# 피버팅
ldata = DataFrame({'date': np.random.rand(9),
                          'item':['a','a','c', 'b', 'a', 'b', 'c', 'a', 'a'],
                          'value': np.arange(9)})       
ldata

Unnamed: 0,date,item,value
0,0.582303,a,0
1,0.552886,a,1
2,0.354999,c,2
3,0.347413,b,3
4,0.06849,a,4
5,0.165812,b,5
6,0.374906,c,6
7,0.102793,a,7
8,0.701019,a,8


In [158]:
pivoted = ldata.pivot(index = 'date', columns= 'item', values= 'value')
pivoted.head()

item,a,b,c
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.06849,4.0,,
0.102793,7.0,,
0.165812,,5.0,
0.347413,,3.0,
0.354999,,,2.0


In [159]:
# 데이터 변형
# 중복제거
data = DataFrame({'k1': ['one']*3 + ['two']*4,
                          'k2': [1, 1, 2, 3, 3, 4, 4]})
data

Unnamed: 0,k1,k2
0,one,1
1,one,1
2,one,2
3,two,3
4,two,3
5,two,4
6,two,4


In [160]:
# 중복찾기
data.duplicated()

0    False
1     True
2    False
3    False
4     True
5    False
6     True
dtype: bool

In [161]:
# 중복제거
data.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
2,one,2
3,two,3
5,two,4


In [162]:
# 함수 매핑
data = DataFrame({'food' : ['bacon', 'pulled pork', 'bacon', 'Pastrami', 'corned beef', 'Bacon', 'pastrami', 'honey ham', 'nova lox'],
                          'ounces' : [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})
data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,Pastrami,6.0
4,corned beef,7.5
5,Bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [163]:
meat_to_animal = {
    'bacon' : 'pig',
    'pulled pork' : 'pig',
    'pastrami' : 'cow',
    'corned beef' : 'cow',
    'honey ham' : 'pig',
    'nova lox' : 'salmon'
}

In [164]:
# 육류의 이름을 소문자로 바꾸고, 해당 동물로 매핑
data['animal'] = data['food'].map(str.lower).map(meat_to_animal)
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,Pastrami,6.0,cow
4,corned beef,7.5,cow
5,Bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


In [165]:
# 함수 형태로 넘길수도 있다.
data['food'].map(lambda x: meat_to_animal[x.lower()])

0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object

In [166]:
# 값 치환
data = pd.Series([1., -999., 2., -999., -1000., 3.])
data

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64

In [167]:
data.replace(-999, np.nan, inplace=True)
data

0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64

In [168]:
# 특이값 찾아내고 치환하기
np.random.seed(12345)

data = DataFrame(np.random.randn(1000, 4))
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.067684,0.067924,0.025598,-0.002298
std,0.998035,0.992106,1.006835,0.996794
min,-3.428254,-3.548824,-3.184377,-3.745356
25%,-0.77489,-0.591841,-0.641675,-0.644144
50%,-0.116401,0.101143,0.002073,-0.013611
75%,0.616366,0.780282,0.680391,0.654328
max,3.366626,2.653656,3.260383,3.927528


In [169]:
# 위 자료에서 한 컬럼에서 절대값이 3을 초과하는 값 찾기
col = data[3]

col.head()

0   -0.555730
1    0.281746
2   -1.296221
3    0.886429
4   -0.438570
Name: 3, dtype: float64

In [170]:
col[np.abs(col) > 3]

97     3.927528
305   -3.399312
400   -3.745356
Name: 3, dtype: float64

In [172]:
data[(np.abs(data)>3).any(axis=1)] #데이터프레임에서 각 행에 절대값이 3을 넘어가는 것이 하나라도 있는 경우 출력


Unnamed: 0,0,1,2,3
5,-0.539741,0.476985,3.248944,-1.021228
97,-0.774363,0.552936,0.106061,3.927528
102,-0.655054,-0.56523,3.176873,0.959533
305,-2.315555,0.457246,-0.025907,-3.399312
324,0.050188,1.951312,3.260383,0.963301
400,0.146326,0.508391,-0.196713,-3.745356
499,-0.293333,-0.242459,-3.05699,1.918403
523,-3.428254,-0.296336,-0.439938,-0.867165
586,0.275144,1.179227,-3.184377,1.369891
808,-0.362528,-3.548824,1.553205,-2.186301


In [173]:
# 데이터 그룹 연산
# 그룹 연산 = 분리-적용-결합

df = DataFrame({'key1' : ['a', 'a', 'b', 'b', 'a'],
                'key2' : ['one', 'two', 'one', 'two', 'one'],
                'data1' : np.random.randn(5),
                'data2' : np.random.randn(5)})

df

Unnamed: 0,key1,key2,data1,data2
0,a,one,1.150765,1.199915
1,a,two,-0.997174,-0.451814
2,b,one,0.046486,-0.155385
3,b,two,-0.610441,-0.153514
4,a,one,-0.394982,0.011194


In [174]:
# 데이터를 key1으로 묶고 각 그룹에서 data1의 평균을 구하는 방법
grouped = df['data1'].groupby(df['key1'])

# grouped 변수는 Groupby 객체
grouped

<pandas.core.groupby.generic.SeriesGroupBy object at 0x000001E67A5EFA10>

In [175]:
# 평균 구하기
grouped.mean()

key1
a   -0.080463
b   -0.281977
Name: data1, dtype: float64

In [176]:
# 색인 여러개 넘기기
means = df['data1'].groupby([df['key1'],df['key2']]).mean()

means

key1  key2
a     one     0.377892
      two    -0.997174
b     one     0.046486
      two    -0.610441
Name: data1, dtype: float64