### DataFrame 계층적 인덱싱 이해하기


In [1]:
import pandas as pd
import numpy as np

In [2]:
# Ten standard-normal draws labelled by a two-level index:
# outer level is a letter (a-d), inner level is an integer (1-3).
s = pd.Series(
    np.random.randn(10),
    index=pd.MultiIndex.from_arrays([
        list("aaabbbccdd"),
        [1, 2, 3, 1, 2, 3, 1, 2, 2, 3],
    ]),
)

In [3]:
s

a  1   -0.305995
   2    0.271531
   3    1.529962
b  1   -0.933480
   2   -0.491206
   3   -0.745088
c  1   -0.292596
   2   -1.798789
d  2    1.755286
   3   -0.906242
dtype: float64

In [4]:
 s.index

MultiIndex(levels=[[u'a', u'b', u'c', u'd'], [1, 2, 3]],
           labels=[[0, 0, 0, 1, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 1, 2, 0, 1, 1, 2]])

In [5]:
s["b"]

1   -0.933480
2   -0.491206
3   -0.745088
dtype: float64

In [6]:
s["b":"c"]

b  1   -0.933480
   2   -0.491206
   3   -0.745088
c  1   -0.292596
   2   -1.798789
dtype: float64

In [7]:
# b인덱스의 3번값
s[("b",3)]

-0.74508836488167074

In [8]:
s[:,2]

a    0.271531
b   -0.491206
c   -1.798789
d    1.755286
dtype: float64

In [9]:
# 4x3 table of the integers 0..11 whose rows and columns both carry
# two-level labels. Level names are intentionally left unset here.
df = pd.DataFrame(
    np.arange(12).reshape(4, 3),
    index=pd.MultiIndex.from_arrays([list("aabb"), [1, 2, 1, 2]]),
    columns=pd.MultiIndex.from_arrays([
        ["Seoul", "Seoul", "Busan"],
        ["Green", "Red", "Green"],
    ]),
)

In [10]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Seoul,Seoul,Busan
Unnamed: 0_level_1,Unnamed: 1_level_1,Green,Red,Green
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [11]:
df.index.names = ["key1", "key2"]

In [12]:
df.columns.names = ["city", "color"]

In [13]:
df

Unnamed: 0_level_0,city,Seoul,Seoul,Busan
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [14]:
df["Seoul"]

Unnamed: 0_level_0,color,Green,Red
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,0,1
a,2,3,4
b,1,6,7
b,2,9,10


In [15]:
df[("Seoul","Green")]

key1  key2
a     1       0
      2       3
b     1       6
      2       9
Name: (Seoul, Green), dtype: int64

In [16]:
df.loc["a"]

city,Seoul,Seoul,Busan
color,Green,Red,Green
key2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,0,1,2
2,3,4,5


In [17]:
df.loc[("a",1)]

city   color
Seoul  Green    0
       Red      1
Busan  Green    2
Name: (a, 1), dtype: int64

In [18]:
df.loc["b",("Seoul","Red")]

key2
1     7
2    10
Name: (Seoul, Red), dtype: int64

In [19]:
df.loc[("b",2),"Busan"]

color
Green    11
Name: (b, 2), dtype: int64

In [20]:
df.loc[("b",1),("Seoul","Green")]

6

In [21]:
# level은 인덱스 계층을 뜻함. 행 방향(axis=0)으로 최상위 인덱스(level=0) 기준 오름차순 정렬
df.sort_index(axis=0, level=0)

Unnamed: 0_level_0,city,Seoul,Seoul,Busan
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [22]:
# key2를 기준으로 오름차순 정렬
df.sort_index(axis=0, level=1)

Unnamed: 0_level_0,city,Seoul,Seoul,Busan
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
b,1,6,7,8
a,2,3,4,5
b,2,9,10,11


In [23]:
df.sort_index(axis=0, level="key2")

Unnamed: 0_level_0,city,Seoul,Seoul,Busan
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
b,1,6,7,8
a,2,3,4,5
b,2,9,10,11


In [24]:
# 열 방향 정렬
df.sort_index(axis=1, level=0)

Unnamed: 0_level_0,city,Busan,Seoul,Seoul
Unnamed: 0_level_1,color,Green,Green,Red
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,2,0,1
a,2,5,3,4
b,1,8,6,7
b,2,11,9,10


In [25]:
df.sort_index(axis=1, level=1)

Unnamed: 0_level_0,city,Busan,Seoul,Seoul
Unnamed: 0_level_1,color,Green,Green,Red
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,2,0,1
a,2,5,3,4
b,1,8,6,7
b,2,11,9,10


In [26]:
# 값 기준 정렬
df.sort_values(by=("Busan","Green"))

Unnamed: 0_level_0,city,Seoul,Seoul,Busan
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [27]:
# Apply an aggregate along a hierarchical index: sum the rows that share the
# same outermost row-index value (level 0). Grouping by level is used because
# the `level=` argument of DataFrame.sum was deprecated in pandas 1.3 and
# removed in pandas 2.0; groupby gives the same result on old and new versions.
df.groupby(level=0).sum()

city,Seoul,Seoul,Busan
color,Green,Red,Green
key1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
a,3,5,7
b,15,17,19


In [28]:
# Sum the rows that share the same second-level row-index value (level 1).
# groupby replaces DataFrame.sum(level=...), which pandas 2.0 removed.
df.groupby(level=1).sum()

city,Seoul,Seoul,Busan
color,Green,Red,Green
key2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,6,8,10
2,12,14,16


In [29]:
df

Unnamed: 0_level_0,city,Seoul,Seoul,Busan
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [30]:
# Average the columns that share the same "color" column-level label.
# DataFrame.mean(level=...) was removed in pandas 2.0 and groupby(axis=1) is
# deprecated, so group the transposed frame by level and transpose back.
df.T.groupby(level="color").mean().T

Unnamed: 0_level_0,color,Green,Red
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,1,1
a,2,4,4
b,1,7,7
b,2,10,10


In [31]:
# DataFrame 내 컬럼-인덱스 간 변환

# Flat seven-row frame: "a" counts up, "b" counts down, "c" is a group label
# and "d" is a running position within each group.
df2 = pd.DataFrame(
    {
        "a": range(7),
        "b": range(7, 0, -1),
        "c": ["one"] * 3 + ["two"] * 4,
        "d": [0, 1, 2] + [0, 1, 2, 3],
    }
)

In [32]:
df2

Unnamed: 0,a,b,c,d
0,0,7,one,0
1,1,6,one,1
2,2,5,one,2
3,3,4,two,0
4,4,3,two,1
5,5,2,two,2
6,6,1,two,3


In [33]:
df3 = df2.set_index(["c","d"])

In [34]:
df3

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1
one,0,0,7
one,1,1,6
one,2,2,5
two,0,3,4
two,1,4,3
two,2,5,2
two,3,6,1


In [35]:
# 기존 열 유지하려면
df2.set_index(["c","d"], drop = False)

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c,d
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
one,0,0,7,one,0
one,1,1,6,one,1
one,2,2,5,one,2
two,0,3,4,two,0
two,1,4,3,two,1
two,2,5,2,two,2
two,3,6,1,two,3


In [36]:
df3

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1
one,0,0,7
one,1,1,6
one,2,2,5
two,0,3,4
two,1,4,3
two,2,5,2
two,3,6,1


In [37]:
# 기존 인덱스를 컬럼으로 옮기고 인덱스를 정수로 대체
df3.reset_index()

Unnamed: 0,c,d,a,b
0,one,0,0,7
1,one,1,1,6
2,one,2,2,5
3,two,0,3,4
4,two,1,4,3
5,two,2,5,2
6,two,3,6,1


In [38]:
# Reshaping DataFrame

# 2x3 table of the integers 0..5; the row axis is named "city" and the
# column axis is named "number".
df4 = pd.DataFrame(
    np.arange(6).reshape(2, 3),
    index=pd.Index(["Seoul", "Busan"], name="city"),
    columns=pd.Index(["one", "two", "three"], name="number"),
)

In [39]:
df4

number,one,two,three
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Seoul,0,1,2
Busan,3,4,5


In [40]:
# 최하위 컬럼이 최하위 인덱스로 붙음
df5 = df4.stack()

In [41]:
df5

city   number
Seoul  one       0
       two       1
       three     2
Busan  one       3
       two       4
       three     5
dtype: int64

In [42]:
# 최하위 인덱스를 최하위 컬럼으로 이동
df5.unstack()

number,one,two,three
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Seoul,0,1,2
Busan,3,4,5


In [43]:
df5.unstack(level=0)

city,Seoul,Busan
number,Unnamed: 1_level_1,Unnamed: 2_level_1
one,0,3
two,1,4
three,2,5


In [44]:
df5.unstack(level="city")

city,Seoul,Busan
number,Unnamed: 1_level_1,Unnamed: 2_level_1
one,0,3
two,1,4
three,2,5


In [45]:
# Two integer Series with partially overlapping labels, stacked end to end;
# the mapping keys become the outer level of the resulting two-level index.
s1 = pd.Series(range(4), index=list("abcd"))
s2 = pd.Series(range(4, 7), index=list("cde"))
s3 = pd.concat({"one": s1, "two": s2})

In [46]:
s3

one  a    0
     b    1
     c    2
     d    3
two  c    4
     d    5
     e    6
dtype: int64

In [47]:
s3.unstack()

Unnamed: 0,a,b,c,d,e
one,0.0,1.0,2.0,3.0,
two,,,4.0,5.0,6.0


In [48]:
# Two-column frame built from df5 and df5 shifted by 5; the explicit column
# list pins the left/right order, and the column axis is named "side".
df6 = pd.DataFrame(
    {"left": df5, "right": df5 + 5},
    columns=["left", "right"],
).rename_axis("side", axis=1)

In [49]:
df6

Unnamed: 0_level_0,side,left,right
city,number,Unnamed: 2_level_1,Unnamed: 3_level_1
Seoul,one,0,5
Seoul,two,1,6
Seoul,three,2,7
Busan,one,3,8
Busan,two,4,9
Busan,three,5,10


In [50]:
df6.unstack()

side,left,left,left,right,right,right
number,one,two,three,one,two,three
city,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Seoul,0,1,2,5,6,7
Busan,3,4,5,8,9,10


In [51]:
df6.unstack(level="city")

side,left,left,right,right
city,Seoul,Busan,Seoul,Busan
number,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
one,0,3,5,8
two,1,4,6,9
three,2,5,7,10


In [52]:
df6.unstack(level="city").stack(level="side")

Unnamed: 0_level_0,city,Busan,Seoul
number,side,Unnamed: 2_level_1,Unnamed: 3_level_1
one,left,3,0
one,right,8,5
two,left,4,1
two,right,9,6
three,left,5,2
three,right,10,7
