# 10.1 GroupBy 메카닉

In [1]:
import numpy as np
import pandas as pd
PREVIOUS_MAX_ROWS = pd.options.display.max_rows
pd.options.display.max_rows = 20
np.random.seed(12345)
import matplotlib.pyplot as plt
plt.rc('figure', figsize=(10, 6))
np.set_printoptions(precision=4, suppress=True)

In [2]:
df = pd.DataFrame({
    'key1' : ['a', 'a', 'b', 'b', 'a'],
    'key2' : ['one', 'two', 'one', 'two', 'one'],
    'data1' : np.random.randn(5),
    'data2' : np.random.randn(5)
})

df

Unnamed: 0,key1,key2,data1,data2
0,a,one,-0.204708,1.393406
1,a,two,0.478943,0.092908
2,b,one,-0.519439,0.281746
3,b,two,-0.55573,0.769023
4,a,one,1.965781,1.246435


### 각 그룹에서 data1의 평균 구하기

![img](./img/groupby.jfif)

In [3]:
# Groupby - Single Group Single Column
grouped = df['data1'].groupby(df['key1'])
grouped

<pandas.core.groupby.generic.SeriesGroupBy object at 0x0000017F3C31FF40>

컬럼 이름으로 색인 했을 경우 **SeriesGroupBy**가 나오는 것을 확인할 수 있다.  
이에 대한 추가적인 내용은 뒤에서 다룰 예정이다.

집단별 크기는 grouped.size(), 집단별 합계는 grouped.sum(), 집단별 평균은 grouped.mean() 을 사용한다.

In [4]:
# 크기
grouped.size()

key1
a    3
b    2
Name: data1, dtype: int64

In [5]:
# 합계
grouped.sum()

key1
a    2.240016
b   -1.075169
Name: data1, dtype: float64

In [6]:
# 평균
grouped.mean()

key1
a    0.746672
b   -0.537585
Name: data1, dtype: float64

#### 방법 1)  일부 컬럼 전달(속도상 유리)

In [7]:
df['data1'].groupby(df['key1']).mean() 

key1
a    0.746672
b   -0.537585
Name: data1, dtype: float64

#### 방법 2) 전체(연산 가능한 모든 컬럼) 전달 후 색인

In [8]:
df.groupby(['key1'])['data1'].mean() 

key1
a    0.746672
b   -0.537585
Name: data1, dtype: float64

위에서 처럼 바로 Apply 함수를 적용하는 것도 가능하다.  
또한 1번처럼 사용해도 되고 2번처럼 사용할 수 있다.  
1번과 2번은 다음과 같이 해석 할 수 있다. 
<br>
1. df['data1'] 컬럼을 df['key1'] 그룹을 기준으로 집계한다.  
2. 전체 컬럼을 df의 ['key1'] 그룹을 기준으로 집계한 후 data1을 추출한다.   

In [10]:
# Groupby - Multiple Columns
means = df['data1'].groupby([df['key1'], df['key2']]).mean() # 1
means

key1  key2
a     one     0.880536
      two     0.478943
b     one    -0.519439
      two    -0.555730
Name: data1, dtype: float64

위의 예처럼 여러 개의 범주형 변수 key 값을 가지고 그룹별 집계를 수행할 수도 있다.  하지만 여기서도 마찬가지로 df가 많이 쓰여서 가독성이 떨어진다.  
그래서 다음과 같이 코드를 깔끔하게 변형할 수 있다.

In [11]:
# 1번 코드의 비해 가독성이 더 좋다.
df.groupby(['key1', 'key2'])['data1'].mean()

key1  key2
a     one     0.880536
      two     0.478943
b     one    -0.519439
      two    -0.555730
Name: data1, dtype: float64

In [12]:
type(means)

pandas.core.series.Series

집계 결과가 Series 형태로 표현되는 것을 확인할 수 있다.  
이때 unstack() 함수를 사용하면 집계 결과를 가로, 세로 축으로 좀더 보기 좋게 표현 할 수 있다.

In [13]:
means.unstack()

key2,one,two
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.880536,0.478943
b,-0.519439,-0.55573


In [14]:
type(means.unstack())

pandas.core.frame.DataFrame

data1에 괄호를 하나 더 추가해도 DataFrame 형식으로 나오는 것을 확인 할 수 있다.

In [15]:
# df.groupby(['key1', 'key2'])['data1'].mean()
df.groupby(['key1', 'key2'])[['data1']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1
key1,key2,Unnamed: 2_level_1
a,one,0.880536
a,two,0.478943
b,one,-0.519439
b,two,-0.55573


이는 groupby 객체를 컬럼 이름으로 색인 했을 때와 컬럼 이름이 담긴 배열로 색인 했을 때의 차이가 있다.  
https://steadiness-193.tistory.com/123 자세한 내용은 이 링크에 담겨져 있다.

In [16]:
states = np.array(['Ohio', 'California', 'California', 'Ohio', 'Ohio'])
years = np.array([2005, 2005, 2006, 2005, 2006])
df['data1'].groupby([states, years]).mean()

California  2005    0.478943
            2006   -0.519439
Ohio        2005   -0.380219
            2006    1.965781
Name: data1, dtype: float64

states와 years는 df에 없는 컬럼이지만, 추가적인 key로 활용이 가능하다.

In [18]:
group = df.groupby('key1')
group

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000017F3C35EDC0>

컬럼 이름이 담긴 배열로 색인한 경우 **DataFrameGroupBy**가 나온다.  
여기서 컬럼 이름이 담긴 배열로 인식하는 이유는 명시적으로 'data1'을 색인하지 않았기 때문에 ['data1', 'data2']가 담긴 배열로 자동 지정이 되기 때문이다.

In [19]:
group.mean()

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.746672,0.910916
b,-0.537585,0.525384


결과도 마찬가지로 데이터프레임 형식으로 나오게 된다.

In [20]:
df.groupby(['key1', 'key2']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,0.880536,1.31992
a,two,0.478943,0.092908
b,one,-0.519439,0.281746
b,two,-0.55573,0.769023


In [25]:
df.groupby(['key1', 'key2']).size()

key1  key2
a     one     2
      two     1
b     one     1
      two     1
dtype: int64

In [26]:
type(df.groupby(['key1', 'key2']).size())

pandas.core.series.Series

그런데 size는 컬럼이 1개 밖에 표시가 안되므로 Series 형식으로 나오는 것 같다.

### 그룹별 반복 작업

In [54]:
for name, group in df.groupby('key1'):
    print(name)
    print(group)
    print('-'*30)

a
  key1 key2     data1     data2
0    a  one -0.204708  1.393406
1    a  two  0.478943  0.092908
4    a  one  1.965781  1.246435
------------------------------
b
  key1 key2     data1     data2
2    b  one -0.519439  0.281746
3    b  two -0.555730  0.769023
------------------------------


enumerate랑 비슷해보인다.  
초기 값은 해당 key로 묶여있는 name이 출력된다.

In [55]:
for (k1, k2), group in df.groupby(['key1', 'key2']):
    print((k1, k2))
    print(group)
    print('-'*30)

('a', 'one')
  key1 key2     data1     data2
0    a  one -0.204708  1.393406
4    a  one  1.965781  1.246435
------------------------------
('a', 'two')
  key1 key2     data1     data2
1    a  two  0.478943  0.092908
------------------------------
('b', 'one')
  key1 key2     data1     data2
2    b  one -0.519439  0.281746
------------------------------
('b', 'two')
  key1 key2    data1     data2
3    b  two -0.55573  0.769023
------------------------------


In [57]:
pieces = dict(list(df.groupby('key1')))
pieces['b']

Unnamed: 0,key1,key2,data1,data2
2,b,one,-0.519439,0.281746
3,b,two,-0.55573,0.769023


dictionary 형태로도 사용이 가능하다

In [58]:
pieces

{'a':   key1 key2     data1     data2
 0    a  one -0.204708  1.393406
 1    a  two  0.478943  0.092908
 4    a  one  1.965781  1.246435,
 'b':   key1 key2     data1     data2
 2    b  one -0.519439  0.281746
 3    b  two -0.555730  0.769023}

In [60]:
df.dtypes
grouped = df.groupby(df.dtypes, axis=1)

In [10]:
for dtype, group in grouped:
    print(dtype)
    print(group)

float64
      data1     data2
0 -0.204708  1.393406
1  0.478943  0.092908
2 -0.519439  0.281746
3 -0.555730  0.769023
4  1.965781  1.246435
object
  key1 key2
0    a  one
1    a  two
2    b  one
3    b  two
4    a  one


dtype으로 설정했더니 타입별로 group이 나뉘게 된다!!  
이 기능은 어디에 쓰는게 좋을까?  
이때 axis = 1로 설정해서 column 별로 나뉘게 했다.  

In [62]:
test = pd.DataFrame({
    'key1' : ['a', 'a', 'b', 'b', 'a'],
    'key2' : ['one', 'two', 'one', 'two', 'one'],
    'data1' : np.random.randn(5),
    'data2' : np.random.randn(5),
    'int1' : [1, 2, 3, 4, 5]
})
test

Unnamed: 0,key1,key2,data1,data2,int1
0,a,one,-0.919262,0.86258,1
1,a,two,-1.549106,-0.010032,2
2,b,one,0.022185,0.050009,3
3,b,two,0.758363,0.670216,4
4,a,one,-0.660524,0.852965,5


int 형식도 추가해보았다. 

In [63]:
test.dtypes
test_grouped = test.groupby(test.dtypes, axis=1)
for dtype, group in test_grouped:
    print(dtype)
    print(group)

int64
   int1
0     1
1     2
2     3
3     4
4     5
float64
      data1     data2
0 -0.919262  0.862580
1 -1.549106 -0.010032
2  0.022185  0.050009
3  0.758363  0.670216
4 -0.660524  0.852965
object
  key1 key2
0    a  one
1    a  two
2    b  one
3    b  two
4    a  one


int, float64, object로 나뉘는 것으로 확인된다

### Selecting a Column or Subset of Columns

위에서 했던 얘기인데 미리 말해버렸다.

In [28]:
df.groupby(['key1', 'key2'])[['data2']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data2
key1,key2,Unnamed: 2_level_1
a,one,1.31992
a,two,0.092908
b,one,0.281746
b,two,0.769023


In [29]:
s_grouped = df.groupby(['key1', 'key2'])['data2']
s_grouped
s_grouped.mean()

key1  key2
a     one     1.319920
      two     0.092908
b     one     0.281746
      two     0.769023
Name: data2, dtype: float64

### Grouping with Dicts and Series

In [30]:
people = pd.DataFrame(np.random.randn(5, 5),
                      columns=['a', 'b', 'c', 'd', 'e'],
                      index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'])
people.iloc[2:3, [1, 2]] = np.nan # Add a few NA values
people

Unnamed: 0,a,b,c,d,e
Joe,1.007189,-1.296221,0.274992,0.228913,1.352917
Steve,0.886429,-2.001637,-0.371843,1.669025,-0.43857
Wes,-0.539741,,,-1.021228,-0.577087
Jim,0.124121,0.302614,0.523772,0.00094,1.34381
Travis,-0.713544,-0.831154,-2.370232,-1.860761,-0.860757


In [71]:
mapping = {'a': 'red', 'b': 'red', 'c': 'blue',
           'd': 'blue', 'e': 'red', 'f' : 'orange'}

In [72]:
by_column = people.groupby(mapping, axis=1)
by_column.sum()

Unnamed: 0,blue,red
Joe,0.503905,1.063885
Steve,1.297183,-1.553778
Wes,-1.021228,-1.116829
Jim,0.524712,1.770545
Travis,-4.230992,-2.405455


groupby는 생각보다 다양하게 활용이 되는 것 같다.  
mapping을 사용하면 mapping을 통해서 원하는 컬럼끼리 묶어줄 수 있다.  
일일히 수작업으로 하기가 귀찮겠지만, 특정 컬럼끼리 그룹핑하기에 유용할 것 같다.

In [33]:
map_series = pd.Series(mapping)
map_series
people.groupby(map_series, axis=1).count()

Unnamed: 0,blue,red
Joe,2,3
Steve,2,3
Wes,1,2
Jim,2,3
Travis,2,3


### Grouping with Functions

위에서 groupby에 파라미터 dtype, mapping을 넣어서 사용해봤다.  
이번에는 함수를 넣어서 사용해볼 예정이다.

In [73]:
people

Unnamed: 0,a,b,c,d,e
Joe,1.007189,-1.296221,0.274992,0.228913,1.352917
Steve,0.886429,-2.001637,-0.371843,1.669025,-0.43857
Wes,-0.539741,,,-1.021228,-0.577087
Jim,0.124121,0.302614,0.523772,0.00094,1.34381
Travis,-0.713544,-0.831154,-2.370232,-1.860761,-0.860757


In [34]:
people.groupby(len).sum()

Unnamed: 0,a,b,c,d,e
3,0.591569,-0.993608,0.798764,-0.791374,2.119639
5,0.886429,-2.001637,-0.371843,1.669025,-0.43857
6,-0.713544,-0.831154,-2.370232,-1.860761,-0.860757


3, 5, 6은 string의 길이인 것 같다.  
그렇다면 string의 앞글자로 groupby 할 수도 있을 것이다.  

In [75]:
# 도우미 함수 정의
def prefix(x):
    return x[0]

people.groupby(prefix).sum()

Unnamed: 0,a,b,c,d,e
J,1.131311,-0.993608,0.798764,0.229853,2.696727
S,0.886429,-2.001637,-0.371843,1.669025,-0.43857
T,-0.713544,-0.831154,-2.370232,-1.860761,-0.860757
W,-0.539741,0.0,0.0,-1.021228,-0.577087


J의 경우에만 합쳐진 모습을 볼 수 있다.

In [35]:
#ind_list = ['Joe', 'Steve', 'Wes', 'Jim', 'Travis']
key_list = ['one', 'one', 'one', 'two', 'two']
people.groupby([len, key_list]).min()

Unnamed: 0,Unnamed: 1,a,b,c,d,e
3,one,-0.539741,-1.296221,0.274992,-1.021228,-0.577087
3,two,0.124121,0.302614,0.523772,0.00094,1.34381
5,one,0.886429,-2.001637,-0.371843,1.669025,-0.43857
6,two,-0.713544,-0.831154,-2.370232,-1.860761,-0.860757


key_list라는 mapping 리스트를 사용해서 추가적인 key를 만들어주었다.

### Grouping by Index Levels

In [36]:
columns = pd.MultiIndex.from_arrays([['US', 'US', 'US', 'JP', 'JP'],
                                    [1, 3, 5, 1, 3]],
                                    names=['cty', 'tenor'])

hier_df = pd.DataFrame(np.random.randn(4, 5), columns=columns)
hier_df

cty,US,US,US,JP,JP
tenor,1,3,5,1,3
0,0.560145,-1.265934,0.119827,-1.063512,0.332883
1,-2.359419,-0.199543,-1.541996,-0.970736,-1.30703
2,0.28635,0.377984,-0.753887,0.331286,1.349742
3,0.069877,0.246674,-0.011862,1.004812,1.327195


In [37]:
hier_df.groupby(level='cty', axis=1).count()

cty,JP,US
0,2,3
1,2,3
2,2,3
3,2,3


### Data Aggregation

In [82]:
df
grouped = df.groupby('key1')
grouped['data1'].quantile(0.9)

key1
a    1.668413
b   -0.523068
Name: data1, dtype: float64

**Quantile**이란 주어진 데이터를 동등한 크기로 분할하는 지점을 말합니다.   
https://eunsukimme.github.io/data%20science/2019/11/01/Quantile/  
https://stats.stackexchange.com/questions/470193/representing-quantile-like-quartile-in-form-of-normal-distribution-curve


In [39]:
def peak_to_peak(arr):
    return arr.max() - arr.min()
grouped.agg(peak_to_peak)

  results[key] = self.aggregate(func)


Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,2.170488,1.300498
b,0.036292,0.487276


min, max의 차이를 도우미 함수를 사용하여 나타낼 수도 있다.

In [40]:
grouped.describe()

Unnamed: 0_level_0,data1,data1,data1,data1,data1,data1,data1,data1,data2,data2,data2,data2,data2,data2,data2,data2
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
key1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
a,3.0,0.746672,1.109736,-0.204708,0.137118,0.478943,1.222362,1.965781,3.0,0.910916,0.712217,0.092908,0.669671,1.246435,1.31992,1.393406
b,2.0,-0.537585,0.025662,-0.55573,-0.546657,-0.537585,-0.528512,-0.519439,2.0,0.525384,0.344556,0.281746,0.403565,0.525384,0.647203,0.769023


In [83]:
grouped['data1'].quantile(0.75)

key1
a    1.222362
b   -0.528512
Name: data1, dtype: float64

확인 결과 정답인 걸로

### Column-Wise and Multiple Function Application

In [84]:
tips = pd.read_csv('examples/tips.csv')
# Add tip percentage of total bill
tips['tip_pct'] = tips['tip'] / tips['total_bill']
tips.head()

Unnamed: 0,total_bill,tip,smoker,day,time,size,tip_pct
0,16.99,1.01,No,Sun,Dinner,2,0.059447
1,10.34,1.66,No,Sun,Dinner,3,0.160542
2,21.01,3.5,No,Sun,Dinner,3,0.166587
3,23.68,3.31,No,Sun,Dinner,2,0.13978
4,24.59,3.61,No,Sun,Dinner,4,0.146808


In [43]:
grouped = tips.groupby(['day', 'smoker'])

In [44]:
grouped_pct = grouped['tip_pct']
grouped_pct.agg('mean')

day   smoker
Fri   No        0.151650
      Yes       0.174783
Sat   No        0.158048
      Yes       0.147906
Sun   No        0.160113
      Yes       0.187250
Thur  No        0.160298
      Yes       0.163863
Name: tip_pct, dtype: float64

In [45]:
grouped_pct.agg(['mean', 'std', peak_to_peak])

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,std,peak_to_peak
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Fri,No,0.15165,0.028123,0.067349
Fri,Yes,0.174783,0.051293,0.159925
Sat,No,0.158048,0.039767,0.235193
Sat,Yes,0.147906,0.061375,0.290095
Sun,No,0.160113,0.042347,0.193226
Sun,Yes,0.18725,0.154134,0.644685
Thur,No,0.160298,0.038774,0.19335
Thur,Yes,0.163863,0.039389,0.15124


여러개 함수도 사용가능합니다!!

In [88]:
grouped_pct.agg([('foo', 'mean'), ('bar', np.std), ('test', peak_to_peak)])

Unnamed: 0_level_0,Unnamed: 1_level_0,foo,bar,test
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Fri,No,0.15165,0.028123,0.067349
Fri,Yes,0.174783,0.051293,0.159925
Sat,No,0.158048,0.039767,0.235193
Sat,Yes,0.147906,0.061375,0.290095
Sun,No,0.160113,0.042347,0.193226
Sun,Yes,0.18725,0.154134,0.644685
Thur,No,0.160298,0.038774,0.19335
Thur,Yes,0.163863,0.039389,0.15124


column 이름을 바꾸는 것도 가능한듯!!

In [47]:
functions = ['count', 'mean', 'max']
result = grouped['tip_pct', 'total_bill'].agg(functions)
result

  result = grouped['tip_pct', 'total_bill'].agg(functions)


Unnamed: 0_level_0,Unnamed: 1_level_0,tip_pct,tip_pct,tip_pct,total_bill,total_bill,total_bill
Unnamed: 0_level_1,Unnamed: 1_level_1,count,mean,max,count,mean,max
day,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
Fri,No,4,0.15165,0.187735,4,18.42,22.75
Fri,Yes,15,0.174783,0.26348,15,16.813333,40.17
Sat,No,45,0.158048,0.29199,45,19.661778,48.33
Sat,Yes,42,0.147906,0.325733,42,21.276667,50.81
Sun,No,57,0.160113,0.252672,57,20.506667,48.17
Sun,Yes,19,0.18725,0.710345,19,24.12,45.35
Thur,No,45,0.160298,0.266312,45,17.113111,41.19
Thur,Yes,17,0.163863,0.241255,17,19.190588,43.11


각 컬럼에도 묶어서 적용 가능!!