## 그룹연산

### 1) 그룹객체 만들기

In [1]:
import pandas as pd
import seaborn as sns

df = sns.load_dataset("titanic")

df

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


In [3]:
# 컬럼이 너무 많으니까 좀 줄여본다.
df = df.loc[:, ['age', 'sex', 'class', 'fare', 'survived']]

df.head(10)

Unnamed: 0,age,sex,class,fare,survived
0,22.0,male,Third,7.25,0
1,38.0,female,First,71.2833,1
2,26.0,female,Third,7.925,1
3,35.0,female,First,53.1,1
4,35.0,male,Third,8.05,0
5,,male,Third,8.4583,0
6,54.0,male,First,51.8625,0
7,2.0,male,Third,21.075,0
8,27.0,female,Third,11.1333,1
9,14.0,female,Second,30.0708,1


In [7]:
# class 열을 기준으로 분할
# class 열의 고유값 확인
print(df['class'].value_counts())


# df.groupby(기준이 되는 열)
groups = df.groupby('class', observed = True)
print(groups) # 그룹객체의 참조값(주소값)

for group in groups:
    print(type(group)) # <class 'tuple'> 그룹 객체의 요소는 튜플 형태
    print(group) # class 열 기준으로 쪼개진 데이터 프레임



class
Third     491
First     216
Second    184
Name: count, dtype: int64
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x00000201B581D6D0>
<class 'tuple'>
('First',       age     sex  class     fare  survived
1    38.0  female  First  71.2833         1
3    35.0  female  First  53.1000         1
6    54.0    male  First  51.8625         0
11   58.0  female  First  26.5500         1
23   28.0    male  First  35.5000         1
..    ...     ...    ...      ...       ...
871  47.0  female  First  52.5542         1
872  33.0    male  First   5.0000         0
879  56.0  female  First  83.1583         1
887  19.0  female  First  30.0000         1
889  26.0    male  First  30.0000         1

[216 rows x 5 columns])
<class 'tuple'>
('Second',       age     sex   class     fare  survived
9    14.0  female  Second  30.0708         1
15   55.0  female  Second  16.0000         1
17    NaN    male  Second  13.0000         1
20   35.0    male  Second  26.0000         0
21   34.0    male  

In [9]:
# 튜플의 특징: 그룹에서 키 값만 분리할 수 있다.

for key, group in groups:
    print(f"클래스: {key}")
    print(f"{key}클래스의 승객 수: {len(group)} 명")
    print(group.head(), "\n")

클래스: First
First클래스의 승객 수: 216 명
     age     sex  class     fare  survived
1   38.0  female  First  71.2833         1
3   35.0  female  First  53.1000         1
6   54.0    male  First  51.8625         0
11  58.0  female  First  26.5500         1
23  28.0    male  First  35.5000         1 

클래스: Second
Second클래스의 승객 수: 184 명
     age     sex   class     fare  survived
9   14.0  female  Second  30.0708         1
15  55.0  female  Second  16.0000         1
17   NaN    male  Second  13.0000         1
20  35.0    male  Second  26.0000         0
21  34.0    male  Second  13.0000         1 

클래스: Third
Third클래스의 승객 수: 491 명
    age     sex  class     fare  survived
0  22.0    male  Third   7.2500         0
2  26.0  female  Third   7.9250         1
4  35.0    male  Third   8.0500         0
5   NaN    male  Third   8.4583         0
7   2.0    male  Third  21.0750         0 



In [10]:
# 그룹별로 연산 메소드 적용
# 클래스 별로 나이, 요금의 평균 값 확인
avg = groups[['age', 'fare']].mean()
print(avg)

              age       fare
class                       
First   38.233441  84.154687
Second  29.877630  20.662183
Third   25.140620  13.675550


In [13]:
# 그룹객체에서 특정 그룹 선택
# 그룹객체.get_group(key 값)

# Second 클래스만 선택
group_second = groups.get_group('Second')
group_second.head()

Unnamed: 0,age,sex,class,fare,survived
9,14.0,female,Second,30.0708,1
15,55.0,female,Second,16.0,1
17,,male,Second,13.0,1
20,35.0,male,Second,26.0,0
21,34.0,male,Second,13.0,1


In [16]:
# 여러 열을 기준으로 그룹화
# df.groupby([열 리스트])
df

# class 열과 sex 열을 기준으로 분학
group_two = df.groupby(['class', 'sex'])

for key, group in group_two:
    print(f"key: {key}, key 의 타입: {type(key)}") # key가 튜플 형태로 반환됨
    print(f"승객 수: {len(group)}")
    print(group.head(), "\n")
    

key: ('First', 'female'), key 의 타입: <class 'tuple'>
승객 수: 94
     age     sex  class      fare  survived
1   38.0  female  First   71.2833         1
3   35.0  female  First   53.1000         1
11  58.0  female  First   26.5500         1
31   NaN  female  First  146.5208         1
52  49.0  female  First   76.7292         1 

key: ('First', 'male'), key 의 타입: <class 'tuple'>
승객 수: 122
     age   sex  class      fare  survived
6   54.0  male  First   51.8625         0
23  28.0  male  First   35.5000         1
27  19.0  male  First  263.0000         0
30  40.0  male  First   27.7208         0
34  28.0  male  First   82.1708         0 

key: ('Second', 'female'), key 의 타입: <class 'tuple'>
승객 수: 76
     age     sex   class     fare  survived
9   14.0  female  Second  30.0708         1
15  55.0  female  Second  16.0000         1
41  27.0  female  Second  21.0000         0
43   3.0  female  Second  41.5792         1
53  29.0  female  Second  26.0000         1 

key: ('Second', 'male'), key 의 

  group_two = df.groupby(['class', 'sex'])


In [20]:
# 여러 열을 기준으로 그룹 연산 메소드 적용

avg_two = group_two[['age', 'fare']].mean()
print(avg_two.index, "\n")
print(avg_two, "\n") # 행 인덱스(멀티 인덱스)
# 클래스와 나이별로 평균값 출력됨

MultiIndex([( 'First', 'female'),
            ( 'First',   'male'),
            ('Second', 'female'),
            ('Second',   'male'),
            ( 'Third', 'female'),
            ( 'Third',   'male')],
           names=['class', 'sex']) 

                     age        fare
class  sex                          
First  female  34.611765  106.125798
       male    41.281386   67.226127
Second female  28.722973   21.970121
       male    30.740707   19.741782
Third  female  21.750000   16.118810
       male    26.507589   12.661633 



In [25]:
# group_two 객체에서 개별 그룹 선택

result = group_two.get_group(('Third', 'male')) # key를 튜플로 전달
print(result.head()) # 남성이면서 Third 클래스인 승객 데이터 추출

     age   sex  class     fare  survived
0   22.0  male  Third   7.2500         0
4   35.0  male  Third   8.0500         0
5    NaN  male  Third   8.4583         0
7    2.0  male  Third  21.0750         0
12  20.0  male  Third   8.0500         0


## 2) 그룹연산 메소드

In [30]:
# class 열을 기준으로 그룹화

groups = df.groupby('class')

# 각 그룹에 대한 age, fare 열의 표준편차를 집계
# .std(): 표준편차
std_all = groups[['age', 'fare']].std()
print(std_all, "\n")
print(type(std_all)) # 데이터 프레임 형태

              age       fare
class                       
First   14.802856  78.380373
Second  14.001077  13.417399
Third   12.495398  11.778142 

<class 'pandas.core.frame.DataFrame'>


  groups = df.groupby('class')


In [31]:
# 그룹 객체에 사용자 정의 함수 적용
# 사용자 정의 함수 정의

def max_min(col):
    return col.max() - col.min() # 최대값 - 최소값


agg_maxmin = groups[['age', 'fare']].agg(max_min) # 그룹마다 함수 적용
agg_maxmin


Unnamed: 0_level_0,age,fare
class,Unnamed: 1_level_1,Unnamed: 2_level_1
First,79.08,512.3292
Second,69.33,73.5
Third,73.58,69.55


In [33]:
# 람다식 활용
agg_maxmin = groups[['age', 'fare']].agg(lambda x : x.max() - x.min())
# 람다식의 x에는 각 그룹의 데이터 프레임을 전달받는다.

agg_maxmin

Unnamed: 0_level_0,age,fare
class,Unnamed: 1_level_1,Unnamed: 2_level_1
First,79.08,512.3292
Second,69.33,73.5
Third,73.58,69.55


In [41]:
# 여러 함수를 각 열에 동일하게 적용하여 집계
# 그룹.agg([함수1, 함수2, .....])

agg_all = groups[['age', 'fare']].agg(['min', 'max'])

# print(agg_all, "\n")

# 각 열마다 다른 함수를 적용하여 집계
# fare 열에 2개의 함수(min, max)를 age열에는 mean 함수 적용
agg_sep = groups[['age', 'fare']].agg({"fare":['min', 'max'], "age":'mean'})

agg_sep

Unnamed: 0_level_0,fare,fare,age
Unnamed: 0_level_1,min,max,mean
class,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
First,0.0,512.3292,38.233441
Second,0.0,73.5,29.87763
Third,0.0,69.55,25.14062


In [45]:
# transform()

age_maxmin = groups['age'].transform(lambda x: x.max() - x.min())
print(age_maxmin, "\n")


# 그룹별로 집계하지 않고 본래 데이터 프레임(분할하기 전)의 행과 열을 기준으로 결과를 반환
df['age_maxmin'] = age_maxmin
print(df)

0      73.58
1      79.08
2      73.58
3      79.08
4      73.58
       ...  
886    69.33
887    79.08
888    73.58
889    79.08
890    73.58
Name: age, Length: 891, dtype: float64 

      age     sex   class     fare  survived  age_maxmin
0    22.0    male   Third   7.2500         0       73.58
1    38.0  female   First  71.2833         1       79.08
2    26.0  female   Third   7.9250         1       73.58
3    35.0  female   First  53.1000         1       79.08
4    35.0    male   Third   8.0500         0       73.58
..    ...     ...     ...      ...       ...         ...
886  27.0    male  Second  13.0000         0       69.33
887  19.0  female   First  30.0000         1       79.08
888   NaN  female   Third  23.4500         0       73.58
889  26.0    male   First  30.0000         1       79.08
890  32.0    male   Third   7.7500         0       73.58

[891 rows x 6 columns]


In [None]:
# 위 셀 내용  수정
# transform 대신 agg()와 merge()
df1 = df.copy()

groups1 = groups

agg_maxmin = groups1['age'].agg(lambda x : x.max() - x.min())
agg_maxmin.rename("age_maxmin", inplace = True)
# .rename : 시리즈의 name 변경
print(agg_maxmin, type(agg_maxmin))

# class열 기준으로 병합
df_t = df1.merge(agg_maxmin, on = 'class')

df_t

In [64]:
# transform 대신 agg() 와 merge()
df1 = df.copy()
# df1 = df1.loc[:, ['age', 'sex', 'class', 'fare', 'survived']]
groups1 = groups

agg_maxmin = groups1['age'].agg(lambda x : x.max() - x.min())
print(agg_maxmin, type(agg_maxmin))

class
First     79.08
Second    69.33
Third     73.58
Name: age, dtype: float64 <class 'pandas.core.series.Series'>


In [65]:
# 이름 바꾼다.
agg_maxmin.rename('age_maxmin', inplace = True)
# .rename: 시리즈의 name 변경
print(agg_maxmin, type(agg_maxmin))

df1

class
First     79.08
Second    69.33
Third     73.58
Name: age_maxmin, dtype: float64 <class 'pandas.core.series.Series'>


Unnamed: 0,age,sex,class,fare,survived,age_maxmin
0,22.0,male,Third,7.2500,0,73.58
1,38.0,female,First,71.2833,1,79.08
2,26.0,female,Third,7.9250,1,73.58
3,35.0,female,First,53.1000,1,79.08
4,35.0,male,Third,8.0500,0,73.58
...,...,...,...,...,...,...
886,27.0,male,Second,13.0000,0,69.33
887,19.0,female,First,30.0000,1,79.08
888,,female,Third,23.4500,0,73.58
889,26.0,male,First,30.0000,1,79.08


In [67]:
# class 열 기준으로 병합
df_t = df1.merge(agg_maxmin, on=['class', 'age_maxmin'])
df_t

Unnamed: 0,age,sex,class,fare,survived,age_maxmin
0,22.0,male,Third,7.2500,0,73.58
1,26.0,female,Third,7.9250,1,73.58
2,35.0,male,Third,8.0500,0,73.58
3,,male,Third,8.4583,0,73.58
4,2.0,male,Third,21.0750,0,73.58
...,...,...,...,...,...,...
886,27.0,female,Second,13.8583,1,69.33
887,28.0,female,Second,24.0000,1,69.33
888,25.0,female,Second,26.0000,1,69.33
889,28.0,male,Second,10.5000,0,69.33


In [70]:
# class 별로 그룹화된 그룹 중 데이터가 200개 이상인 그룹만 추출
# class 별로 데이터 개수 확인
print(df['class'].value_counts(), "\n")

group_filter = groups.filter(lambda x : len(x) >= 200)
print(group_filter.head())

print(group_filter['class'].value_counts()) # 데이터 개수가 200개 이상인 First, Third 클래스만 추출

class
Third     491
First     216
Second    184
Name: count, dtype: int64 

    age     sex  class     fare  survived  age_maxmin
0  22.0    male  Third   7.2500         0       73.58
1  38.0  female  First  71.2833         1       79.08
2  26.0  female  Third   7.9250         1       73.58
3  35.0  female  First  53.1000         1       79.08
4  35.0    male  Third   8.0500         0       73.58
class
Third     491
First     216
Second      0
Name: count, dtype: int64


In [73]:
# 그룹마다 나이의 평균이 30보다 작은 그룹만 추출

print(groups['age'].mean()) # 그룹마다 나이의 평균 확인
print()

group_filter = groups.filter(lambda x : x['age'].mean() < 30)
print(group_filter.head(10))

print(group_filter['class'].value_counts())

class
First     38.233441
Second    29.877630
Third     25.140620
Name: age, dtype: float64

     age     sex   class     fare  survived  age_maxmin
0   22.0    male   Third   7.2500         0       73.58
2   26.0  female   Third   7.9250         1       73.58
4   35.0    male   Third   8.0500         0       73.58
5    NaN    male   Third   8.4583         0       73.58
7    2.0    male   Third  21.0750         0       73.58
8   27.0  female   Third  11.1333         1       73.58
9   14.0  female  Second  30.0708         1       69.33
10   4.0  female   Third  16.7000         1       73.58
12  20.0    male   Third   8.0500         0       73.58
13  39.0    male   Third  31.2750         0       73.58
class
Third     491
Second    184
First       0
Name: count, dtype: int64


In [75]:
import pandas as pd
import seaborn as sns

df = sns.load_dataset('diamonds')

# price : 가격(기준 : 미국 달러)
# carat : 다이아몬드의 무게를 의미
# cut : 컷팅 품질(Fair, Good, Very Good, Premium, Ideal 순으로 좋음)
# color : 다이아몬드 색깔(J부터 시작해서 D까지, D로 갈수록 좋다.)
# clarity : 다이아몬드의 선명도 (I1(최악) ~ IF(최고)로 구성)
# x : 다이아몬드의 길이(mm)
# y : 다이아몬드의 넓이(mm)
# z : 다이아몬드의 깊이(mm)
# depth : 전체 깊이 퍼센티지를 의미 (z / mean(x, y))
# table : 가장 넓은 부분 기준으로 다이아몬드의 상단부분의 넓이

df

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.20,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75
...,...,...,...,...,...,...,...,...,...,...
53935,0.72,Ideal,D,SI1,60.8,57.0,2757,5.75,5.76,3.50
53936,0.72,Good,D,SI1,63.1,55.0,2757,5.69,5.75,3.61
53937,0.70,Very Good,D,SI1,62.8,60.0,2757,5.66,5.68,3.56
53938,0.86,Premium,H,SI2,61.0,58.0,2757,6.15,6.12,3.74


In [77]:
# 1) 다이아몬드 데이터 프레임에서 컷팅 품질별(cut) 로 그룹화한 후 그룹별로
# 가격의 평균을 구하기

'''
ex 
# class 열을 기준으로 그룹화

groups = df.groupby('class')

# 각 그룹에 대한 age, fare 열의 표준편차를 집계
# .std(): 표준편차
std_all = groups[['age', 'fare']].std()
print(std_all, "\n")
print(type(std_all)) # 데이터 프레임 형태
'''
diamond_groups = df.groupby('cut')
diamond_groups

mean_group = diamond_groups[['price']].mean()
print(mean_group)
print(type(mean_group))



                 price
cut                   
Ideal      3457.541970
Premium    4584.257704
Very Good  3981.759891
Good       3928.864452
Fair       4358.757764
<class 'pandas.core.frame.DataFrame'>


  diamond_groups = df.groupby('cut')


In [97]:
# 1) 다른 풀이
print(df['cut'].value_counts(), "\n")

groups = df.groupby('cut')

''' 확인용
for key, group in groups:
    print(f"cut: {key}")
    print(group.head(), "\n")
'''

result1 = groups['price'].mean().round(2) # round(2) : 소수점 2째자리까지 반올림
print(result1)

cut
Ideal        21551
Premium      13791
Very Good    12082
Good          4906
Fair          1610
Name: count, dtype: int64 

cut
Ideal        3457.54
Premium      4584.26
Very Good    3981.76
Good         3928.86
Fair         4358.76
Name: price, dtype: float64


  groups = df.groupby('cut')


In [99]:
# 2) 컷팅 등급이 'Good' 인 그룹의 다이아몬드 무개(carot) 합계 구하기

# 그룹 선택
good = groups.get_group('Good')

print(good.head(), '\n')


# 가격열에서 합계 구하기
print(f"Good 등급의 무게 합계: {good['carat'].sum()}")


    carat   cut color clarity  depth  table  price     x     y     z
2    0.23  Good     E     VS1   56.9   65.0    327  4.05  4.07  2.31
4    0.31  Good     J     SI2   63.3   58.0    335  4.34  4.35  2.75
10   0.30  Good     J     SI1   64.0   55.0    339  4.25  4.28  2.73
17   0.30  Good     J     SI1   63.4   54.0    351  4.23  4.29  2.70
18   0.30  Good     J     SI1   63.8   56.0    351  4.23  4.26  2.71 

Good 등급의 무게 합계: 4166.1


In [109]:
# 3) 각 그룹마다 최대 가격과 가격의 중간값(median)의 차이를 구하고
# 그룹마다의 다이아몬드 무게의 최소값과 평균값을 구하기
'''
ex
# 여러 함수를 각 열에 동일하게 적용하여 집계
# 그룹.agg([함수1, 함수2, .....])

agg_all = groups[['age', 'fare']].agg(['min', 'max'])

# print(agg_all, "\n")

# 각 열마다 다른 함수를 적용하여 집계
# fare 열에 2개의 함수(min, max)를 age열에는 mean 함수 적용
agg_sep = groups[['age', 'fare']].agg({"fare":['min', 'max'], "age":'mean'})

'''

def max_median(col):
    return col.max() - col.median() # 최대값 - 중간값

agg_group = diamond_groups[['price', 'carat']].agg({'price':max_median, 'carat': ['min', 'mean']}) # 사용자 정의함수는 따옴표('') 쓰지 않는다.
agg_group

Unnamed: 0_level_0,Unnamed: 1_level_0,price,carat,carat
Unnamed: 0_level_1,Unnamed: 1_level_1,max_median,min,mean
cut,color,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Ideal,D,17117.0,0.2,0.565766
Ideal,E,17292.0,0.2,0.578401
Ideal,F,17005.0,0.23,0.655829
Ideal,G,16948.5,0.23,0.700715
Ideal,H,16482.0,0.23,0.799525
Ideal,I,16120.0,0.23,0.913029
Ideal,J,14412.0,0.23,1.063594
Premium,D,16566.0,0.2,0.721547
Premium,E,16549.0,0.2,0.717745
Premium,F,15950.0,0.2,0.827036


In [106]:
# 3) 다른 풀이 lambda 사용
agg_group = diamond_groups[['price', 'carat']].agg({'price' : lambda x : x.max() - x.median(), 'carat' : ['min', 'mean']})
agg_group

Unnamed: 0_level_0,Unnamed: 1_level_0,price,carat,carat
Unnamed: 0_level_1,Unnamed: 1_level_1,<lambda>,min,mean
cut,color,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Ideal,D,17117.0,0.2,0.565766
Ideal,E,17292.0,0.2,0.578401
Ideal,F,17005.0,0.23,0.655829
Ideal,G,16948.5,0.23,0.700715
Ideal,H,16482.0,0.23,0.799525
Ideal,I,16120.0,0.23,0.913029
Ideal,J,14412.0,0.23,1.063594
Premium,D,16566.0,0.2,0.721547
Premium,E,16549.0,0.2,0.717745
Premium,F,15950.0,0.2,0.827036


In [110]:
# 4) 다이아몬드 데이터를 컷팅 등급과 색깔로 그룹화한 후 그룹마다의 
# 가격의 중간값이 3000 달러보다 많은 그룹들만 출력
cut_color_groups = diamond_groups = df.groupby(['cut', 'color'])
cut_color_groups

'''
ex
group_filter = groups.filter(lambda x : x['age'].mean() < 30)
'''
group_filter = cut_color_groups.filter(lambda x : x['price'].median() > 3000)

print(group_filter.head())

   carat        cut color clarity  depth  table  price     x     y     z
3   0.29    Premium     I     VS2   62.4   58.0    334  4.20  4.23  2.63
4   0.31       Good     J     SI2   63.3   58.0    335  4.34  4.35  2.75
5   0.24  Very Good     J    VVS2   62.8   57.0    336  3.94  3.96  2.48
6   0.24  Very Good     I    VVS1   62.3   57.0    336  3.95  3.98  2.47
7   0.26  Very Good     H     SI1   61.9   55.0    337  4.07  4.11  2.53


  cut_color_groups = diamond_groups = df.groupby(['cut', 'color'])
