In [1]:
import pandas as pd
import numpy as np

In [2]:
# Draw the random columns from a dedicated, seeded generator so the frame is
# identical on every Restart & Run All (unseeded np.random.randn produced
# different data on each run). data1 is drawn before data2.
df_rng = np.random.default_rng(42)
df = pd.DataFrame({'key1' : ['a', 'a', 'b', 'b', 'a'],
                   'key2' : ['one', 'two', 'one', 'two', 'one'],
                   'data1': df_rng.standard_normal(5),
                   'data2': df_rng.standard_normal(5)})

In [3]:
# Display the frame: two label columns (key1, key2) and two numeric columns.
df

Unnamed: 0,data1,data2,key1,key2
0,2.485941,1.489988,a,one
1,-0.18165,0.413749,a,two
2,0.242835,0.096225,b,one
3,2.088335,0.113333,b,two
4,-0.011802,0.123153,a,one


In [4]:
# Group the data1 values by the label held in key1; the key is passed as a
# Series rather than as a column name.
grouped = (
    df["data1"]
    .groupby(by=df["key1"])
)

In [5]:
# Evaluating the grouping on its own displays the SeriesGroupBy object,
# not any computed values.
grouped

<pandas.core.groupby.SeriesGroupBy object at 0x10aa562d0>

In [6]:
# Mean of data1 within each key1 group.
grouped.mean()

key1
a    0.764163
b    1.165585
Name: data1, dtype: float64

In [None]:
# split - apply - combine
# Split the data into groups, apply a function to each group, then combine the results for display.

In [8]:
# Group by several keys at once: the mean of data1 for every
# (key1, key2) combination.
means = (
    df["data1"]
    .groupby(by=[df["key1"], df["key2"]])
    .mean()
)

In [9]:
# Display the result: a Series indexed by the two levels key1 and key2.
means

key1  key2
a     one     1.237070
      two    -0.181650
b     one     0.242835
      two     2.088335
Name: data1, dtype: float64

In [10]:
# Pivot the inner index level (key2) into columns, giving a key1 x key2 table.
means.unstack()

key2,one,two
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,1.23707,-0.18165
b,0.242835,2.088335


In [11]:
# Mean of every numeric column within each key1 group.
# numeric_only=True is required on pandas >= 2.0: the string column key2 is
# no longer dropped silently and would make mean() raise a TypeError.
df.groupby("key1").mean(numeric_only=True)

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.764163,0.67563
b,1.165585,0.104779


In [12]:
# Count the non-missing values of every other column within each key1 group.
df.groupby("key1").count()

Unnamed: 0_level_0,data1,data2,key2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
a,3,3,3
b,2,2,2


In [13]:
# Mean of data1 and data2 for each (key1, key2) combination.
df.groupby(["key1","key2"]).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,1.23707,0.80657
a,two,-0.18165,0.413749
b,one,0.242835,0.096225
b,two,2.088335,0.113333


In [14]:
# Number of non-missing data1 / data2 values for each (key1, key2) combination.
df.groupby(["key1","key2"]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,2,2
a,two,1,1
b,one,1,1
b,two,1,1


In [15]:
# Select a single column after grouping to get the statistic for that column
# only: the mean of data2 for each (key1, key2) combination.
df.groupby(["key1","key2"])["data2"].mean()

key1  key2
a     one     0.806570
      two     0.413749
b     one     0.096225
      two     0.113333
Name: data2, dtype: float64

In [16]:
# Without an aggregation the cell value is the DataFrameGroupBy object itself.
df.groupby("key1")

<pandas.core.groupby.DataFrameGroupBy object at 0x10aa9d4d0>

In [17]:
# Inspect what groupby produces before any aggregation: iterating yields one
# (group key, sub-frame) pair per distinct key1 value.
for name, group in df.groupby("key1") :
    print(name)
    print(group)

a
      data1     data2 key1 key2
0  2.485941  1.489988    a  one
1 -0.181650  0.413749    a  two
4 -0.011802  0.123153    a  one
b
      data1     data2 key1 key2
2  0.242835  0.096225    b  one
3  2.088335  0.113333    b  two


In [18]:
# With several grouping keys each group's key is a tuple, unpacked here
# into k1 and k2.
for (k1,k2), group in df.groupby(["key1","key2"]):
    print(k1,k2)
    print(group)

('a', 'one')
      data1     data2 key1 key2
0  2.485941  1.489988    a  one
4 -0.011802  0.123153    a  one
('a', 'two')
     data1     data2 key1 key2
1 -0.18165  0.413749    a  two
('b', 'one')
      data1     data2 key1 key2
2  0.242835  0.096225    b  one
('b', 'two')
      data1     data2 key1 key2
3  2.088335  0.113333    b  two


In [19]:
# Collect the groups into a dictionary keyed by key1 value, each entry
# holding that group's sub-frame.
pieces = {label: frame for label, frame in df.groupby("key1")}

In [20]:
# Display the dictionary of per-group frames.
pieces

{'a':       data1     data2 key1 key2
 0  2.485941  1.489988    a  one
 1 -0.181650  0.413749    a  two
 4 -0.011802  0.123153    a  one, 'b':       data1     data2 key1 key2
 2  0.242835  0.096225    b  one
 3  2.088335  0.113333    b  two}

In [22]:
# Look up the sub-frame stored for group 'b'.
pieces["b"]

Unnamed: 0,data1,data2,key1,key2
2,0.242835,0.096225,b,one
3,2.088335,0.113333,b,two


In [23]:
# Draw df2 from a dedicated, seeded generator so its values are identical on
# every Restart & Run All (unseeded np.random.randn changed them each run).
df2_rng = np.random.default_rng(7)
df2 = pd.DataFrame(df2_rng.standard_normal((5, 5)),
                   columns=['a', 'b', 'c', 'd', 'e'],
                   index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'])
# Column label -> colour, used below as the grouping key for df2's columns.
# 'f' does not match any column of df2.
map_dict = {'a': 'red', 'b': 'red', 'c': 'blue',
            'd': 'blue', 'e': 'red', 'f' : 'orange'}

In [24]:
# Display the 5 x 5 frame of random values (rows are people, columns a-e).
df2

Unnamed: 0,a,b,c,d,e
Joe,0.21495,-0.271475,-0.471796,0.693545,0.746226
Steve,0.891774,-0.523395,0.899849,-0.675586,1.024771
Wes,-1.186672,0.969701,-0.277094,1.006338,0.957432
Jim,0.007533,0.887679,0.687531,-1.094692,-0.967065
Travis,0.684402,1.847187,-0.778429,-0.546525,2.035673


In [26]:
# Sum the columns of df2 by the colour each column label maps to in map_dict.
# groupby(..., axis=1) is deprecated since pandas 2.1, so group the rows of
# the transposed frame and transpose the result back. Mapping keys that match
# no column (here 'f') are simply unused.
df2.T.groupby(map_dict).sum().T

Unnamed: 0,blue,red
Joe,0.221749,0.6897
Steve,0.224262,1.39315
Wes,0.729245,0.740462
Jim,-0.407161,-0.071853
Travis,-1.324953,4.567262


In [27]:
# Wrap the label -> colour mapping in a Series indexed by column label.
map_s = pd.Series(map_dict)

In [28]:
# Count how many columns of df2 fall under each colour, per row, using the
# Series form of the mapping as the grouping key.
# groupby(..., axis=1) is deprecated since pandas 2.1, so group the rows of
# the transposed frame and transpose the result back.
df2.T.groupby(map_s).count().T

Unnamed: 0,blue,red
Joe,2,3
Steve,2,3
Wes,2,3
Jim,2,3
Travis,2,3


In [29]:
# Show df again before grouping the whole frame.
df

Unnamed: 0,data1,data2,key1,key2
0,2.485941,1.489988,a,one
1,-0.18165,0.413749,a,two
2,0.242835,0.096225,b,one
3,2.088335,0.113333,b,two
4,-0.011802,0.123153,a,one


In [31]:
# Group the whole frame by key1. NOTE: this rebinds `grouped`, which an
# earlier cell bound to the grouping of the data1 column only.
grouped = df.groupby("key1")

In [32]:
def peak_to_peak(arr):
    """Return the spread of ``arr``: its maximum minus its minimum.

    ``arr`` can be any object that provides ``max()`` and ``min()`` methods
    whose results support subtraction.
    """
    highest = arr.max()
    lowest = arr.min()
    return highest - lowest

In [34]:
# Apply the custom aggregation to each key1 group.
# Only the numeric columns are selected: grouped also carries the string
# column key2, for which max() - min() is undefined, and recent pandas raises
# on it rather than silently dropping the column.
grouped[["data1", "data2"]].agg(peak_to_peak)

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,2.667591,1.366835
b,1.8455,0.017108


In [35]:
# Built-in statistics can be requested by passing the function name as a string.
# Only the numeric columns are selected: recent pandas raises on the string
# column key2 rather than silently dropping it.
grouped[["data1", "data2"]].agg("std")

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,1.49352,0.720067
b,1.304966,0.012097


In [36]:
# Show the basic summary statistics computed for each group.
grouped.describe()

Unnamed: 0_level_0,data1,data1,data1,data1,data1,data1,data1,data1,data2,data2,data2,data2,data2,data2,data2,data2
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
key1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
a,3.0,0.764163,1.49352,-0.18165,-0.096726,-0.011802,1.23707,2.485941,3.0,0.67563,0.720067,0.123153,0.268451,0.413749,0.951869,1.489988
b,2.0,1.165585,1.304966,0.242835,0.70421,1.165585,1.62696,2.088335,2.0,0.104779,0.012097,0.096225,0.100502,0.104779,0.109056,0.113333
