# groupby 메서드로 평균값 구하기

In [1]:
import pandas as pd
df = pd.read_csv('../data/gapminder.tsv', sep='\t')

In [2]:
# Group the rows by year, then average the lifeExp column of each group.
life_exp_by_year = df.groupby('year')['lifeExp']
avg_life_exp_by_year = life_exp_by_year.mean()
print(avg_life_exp_by_year)

year
1952    49.057620
1957    51.507401
1962    53.609249
1967    55.678290
1972    57.647386
1977    59.570157
1982    61.533197
1987    63.212613
1992    64.160338
1997    65.014676
2002    65.694923
2007    67.007423
Name: lifeExp, dtype: float64


# 분할-반영-결합 과정 살펴보기

In [3]:
# 분할: 나눌 기준을 따로 추출한다.
years = df.year.unique()
print(years)

[1952 1957 1962 1967 1972 1977 1982 1987 1992 1997 2002 2007]


In [6]:
# Apply
# 1. Pull out the rows of each year with a boolean mask.
y1952 = df[df['year'] == 1952]
print(y1952.head())

y1957 = df[df['year'] == 1957]
print(y1957.head())

y1962 = df[df['year'] == 1962]
print(y1962.head())

y1967 = df[df['year'] == 1967]
print(y1967.head())


# 2. Average the lifeExp column of each yearly subset.
y1952_mean = y1952['lifeExp'].mean()
print(y1952_mean)

y1957_mean = y1957['lifeExp'].mean()
print(y1957_mean)

y1962_mean = y1962['lifeExp'].mean()
print(y1962_mean)

y1967_mean = y1967['lifeExp'].mean()
print(y1967_mean)

        country continent  year  lifeExp       pop    gdpPercap
0   Afghanistan      Asia  1952   28.801   8425333   779.445314
12      Albania    Europe  1952   55.230   1282697  1601.056136
24      Algeria    Africa  1952   43.077   9279525  2449.008185
36       Angola    Africa  1952   30.015   4232095  3520.610273
48    Argentina  Americas  1952   62.485  17876956  5911.315053
        country continent  year  lifeExp       pop    gdpPercap
1   Afghanistan      Asia  1957   30.332   9240934   820.853030
13      Albania    Europe  1957   59.280   1476505  1942.284244
25      Algeria    Africa  1957   45.685  10270856  3013.976023
37       Angola    Africa  1957   31.999   4561361  3827.940465
49    Argentina  Americas  1957   64.399  19610538  6856.856212
        country continent  year  lifeExp       pop    gdpPercap
2   Afghanistan      Asia  1962   31.997  10267083   853.100710
14      Albania    Europe  1962   64.820   1728137  2312.888958
26      Algeria    Africa  1962   48.303

In [7]:
# 결합 : 연도별로 계산한 lifeExp의 평균값 구하기
df2 = pd.DataFrame({'year': [1952, 1957, 1962, 1967], '': [y1952_mean, y1957_mean, y1962_mean, y1967_mean]})
print(df2)

   year           
0  1952  49.057620
1  1957  51.507401
2  1962  53.609249
3  1967  55.678290


# 평균값을 구하는 사용자 함수와 groupby 메서드

In [8]:
# agg 메서드: 사용자함수와 groupby메소드 조합

In [9]:
# 열의 평균값 구하기
def my_mean(values):
    """Return the arithmetic mean of ``values``.

    Parameters
    ----------
    values : sized iterable of numbers
        Anything that supports ``len()`` and iteration, for example the
        values of one group handed over by ``groupby(...).agg(my_mean)``.

    Returns
    -------
    The sum of the elements divided by ``len(values)``.

    Raises
    ------
    ZeroDivisionError
        If ``values`` is an empty sequence of plain Python numbers.
    """
    # Use the built-in sum() instead of a local accumulator named ``sum``,
    # which would shadow the built-in inside this function.
    return sum(values) / len(values)

In [10]:
# Hand the user-defined my_mean to agg so it is applied to each year's lifeExp.
yearly_life_exp = df.groupby('year')['lifeExp']
agg_my_mean = yearly_life_exp.agg(my_mean)
print(agg_my_mean)

year
1952    49.057620
1957    51.507401
1962    53.609249
1967    55.678290
1972    57.647386
1977    59.570157
1982    61.533197
1987    63.212613
1992    64.160338
1997    65.014676
2002    65.694923
2007    67.007423
Name: lifeExp, dtype: float64


# 두 개의 인잣값을 받아 처리하는 사용자 함수와 groupby 메서드

In [12]:
def my_mean_diff(values, diff_value):
    """Return the mean of ``values`` minus ``diff_value``.

    Parameters
    ----------
    values : sized iterable of numbers
        The data to average (first argument), e.g. one group's column
        values when used through ``groupby(...).agg``.
    diff_value : number
        The value subtracted from the computed mean (second argument).

    Returns
    -------
    ``sum(values) / len(values) - diff_value``.

    Raises
    ------
    ZeroDivisionError
        If ``values`` is an empty sequence of plain Python numbers.
    """
    # Built-in sum() replaces the hand-written loop whose accumulator was
    # named ``sum`` and therefore shadowed the built-in.
    mean = sum(values) / len(values)
    return mean - diff_value

In [14]:
# Mean life expectancy over the whole data set.
global_mean = df['lifeExp'].mean()
print(global_mean)  # 59.47443936619713

# Extra keyword arguments given to agg are forwarded to my_mean_diff.
life_exp_per_year = df.groupby('year')['lifeExp']
agg_mean_diff = life_exp_per_year.agg(my_mean_diff, diff_value=global_mean)
print(agg_mean_diff)

# year
# 1952   -10.416820
# 1957    -7.967038
# 1962    -5.865190
# 1967    -3.796150

59.47443936619713
year
1952   -10.416820
1957    -7.967038
1962    -5.865190
1967    -3.796150
1972    -1.827053
1977     0.095718
1982     2.058758
1987     3.738173
1992     4.685899
1997     5.540237
2002     6.220483
2007     7.532983
Name: lifeExp, dtype: float64


# 집계 메서드를 리스트, 딕셔너리에 담아 전달하기

In [15]:
import numpy as np
# Several aggregations can be passed to agg as a list.
# 'mean' and 'std' are given by name so pandas uses its own group-wise
# implementations (std is the sample standard deviation shown in the output);
# newer pandas versions warn when np.mean / np.std callables are passed here.
# NOTE: np.count_nonzero counts the non-zero entries, which matches the row
# count only as long as no lifeExp value is 0.
gdf = df.groupby('year').lifeExp.agg([np.count_nonzero, 'mean', 'std'])
print(gdf)

#       count_nonzero       mean        std
# year                                     
# 1952          142.0  49.057620  12.225956
# 1957          142.0  51.507401  12.231286

      count_nonzero       mean        std
year                                     
1952          142.0  49.057620  12.225956
1957          142.0  51.507401  12.231286
1962          142.0  53.609249  12.097245
1967          142.0  55.678290  11.718858
1972          142.0  57.647386  11.381953
1977          142.0  59.570157  11.227229
1982          142.0  61.533197  10.770618
1987          142.0  63.212613  10.556285
1992          142.0  64.160338  11.227380
1997          142.0  65.014676  11.559439
2002          142.0  65.694923  12.279823
2007          142.0  67.007423  12.073021


In [17]:
# Pass a dict to agg to apply a different aggregation to each column.
# The GDP column is named 'gdpPercap'; the misspelt key 'gdpPer_cap' used
# here before made this cell fail with KeyError: 'gdpPer_cap'.
gdf_dict = df.groupby('year').agg({'lifeExp': 'mean', 'pop': 'median', 'gdpPercap': 'median'})
print(gdf_dict)

# 표준 점수 계산하기

In [18]:
def my_zscore(x):
    """Return the standard score of every element of ``x``.

    Each value has ``x.mean()`` subtracted and is then divided by
    ``x.std()``; ``x`` must therefore provide both methods and support
    element-wise arithmetic.
    """
    center = x.mean()
    spread = x.std()
    deviations = x - center
    return deviations / spread

In [20]:
# transform returns one value per input row (not one per group), so each
# lifeExp is standardized against the rows of its own year.
life_exp_grouped = df.groupby('year')['lifeExp']
transform_z = life_exp_grouped.transform(my_zscore)
print(transform_z.head())

# 0   -1.656854
# 1   -1.731249
# 2   -1.786543
# 3   -1.848157
# 4   -1.894173
# Name: lifeExp, dtype: float64

0   -1.656854
1   -1.731249
2   -1.786543
3   -1.848157
4   -1.894173
Name: lifeExp, dtype: float64


In [21]:
print(df.shape)  # (1704, 6)
print(transform_z.shape)  # (1704,)

(1704, 6)
(1704,)


# 누락값을 평균값으로 처리하기

In [22]:
import seaborn as sns
import numpy as np

np.random.seed(42)

In [24]:
# Draw 10 random rows from the tips data set and blank out total_bill in
# 4 randomly chosen rows of that sample.
tips_10 = sns.load_dataset('tips').sample(10)
# np.nan is the supported spelling; the np.NaN alias was removed in NumPy 2.0.
tips_10.loc[np.random.permutation(tips_10.index)[:4], 'total_bill'] = np.nan
print(tips_10)

#      total_bill   tip     sex smoker   day    time  size
# 243       18.78  3.00  Female     No  Thur  Dinner     2
# 58        11.24  1.76    Male    Yes   Sat  Dinner     2

     total_bill   tip     sex smoker   day    time  size
243       18.78  3.00  Female     No  Thur  Dinner     2
58        11.24  1.76    Male    Yes   Sat  Dinner     2
227         NaN  3.00    Male     No   Sat  Dinner     4
137       14.15  2.00  Female     No  Thur   Lunch     2
173       31.85  3.18    Male    Yes   Sun  Dinner     2
77          NaN  4.00    Male     No  Thur   Lunch     4
192       28.44  2.56    Male    Yes  Thur   Lunch     2
213       13.27  2.50  Female    Yes   Sat  Dinner     2
10          NaN  1.71    Male     No   Sun  Dinner     2
231         NaN  3.00    Male    Yes   Sat  Dinner     3


In [25]:
# 여성과 남성을 구분해 total_bill 열의 평균값을 구해야함. 남녀 비율이 다르기 때문
count_sex = tips_10.groupby('sex').count()
print(count_sex)

#         total_bill  tip  smoker  day  time  size
# sex                                             
# Male             3    7       7    7     7     7
# Female           3    3       3    3     3     3

        total_bill  tip  smoker  day  time  size
sex                                             
Male             3    7       7    7     7     7
Female           3    3       3    3     3     3


In [26]:
# 성별을 구분해 total_bill 열의 데이터를 받아 평균값을 구하는 함수
def fill_na_mean(x):
    """Return ``x`` with its missing values replaced by the mean of ``x``.

    ``x.mean()`` and ``x.fillna()`` are called on the argument, so it has to
    provide both (e.g. a pandas Series holding one group's values).
    """
    return x.fillna(x.mean())

In [27]:
# Fill each missing total_bill with the mean of the rows sharing its sex,
# so male and female gaps are filled from their own group's average.
bills_by_sex = tips_10.groupby('sex')['total_bill']
total_bill_group_mean = bills_by_sex.transform(fill_na_mean)
tips_10['fill_total_bill'] = total_bill_group_mean
print(tips_10)

#      total_bill   tip     sex smoker   day    time  size  fill_total_bill
# 243       18.78  3.00  Female     No  Thur  Dinner     2        18.780000
# 58        11.24  1.76    Male    Yes   Sat  Dinner     2        11.240000
# 227         NaN  3.00    Male     No   Sat  Dinner     4        23.843333
# 137       14.15  2.00  Female     No  Thur   Lunch     2        14.150000

     total_bill   tip     sex smoker   day    time  size  fill_total_bill
243       18.78  3.00  Female     No  Thur  Dinner     2        18.780000
58        11.24  1.76    Male    Yes   Sat  Dinner     2        11.240000
227         NaN  3.00    Male     No   Sat  Dinner     4        23.843333
137       14.15  2.00  Female     No  Thur   Lunch     2        14.150000
173       31.85  3.18    Male    Yes   Sun  Dinner     2        31.850000
77          NaN  4.00    Male     No  Thur   Lunch     4        23.843333
192       28.44  2.56    Male    Yes  Thur   Lunch     2        28.440000
213       13.27  2.50  Female    Yes   Sat  Dinner     2        13.270000
10          NaN  1.71    Male     No   Sun  Dinner     2        23.843333
231         NaN  3.00    Male    Yes   Sat  Dinner     3        23.843333


# 데이터 필터링 사용하기 ─ filter 메서드

In [28]:
tips = sns.load_dataset('tips')
print(tips.shape)  # (244, 7)

(244, 7)


In [29]:
print(tips['size'].value_counts())

# 2    156
# 3     38
# 4     37
# 5      5
# 6      4
# 1      4
# Name: size, dtype: int64

2    156
3     38
4     37
5      5
6      4
1      4
Name: size, dtype: int64


In [30]:
# Keep only the rows whose party size occurs at least 30 times.
size_groups = tips.groupby('size')
tips_filtered = size_groups.filter(lambda party: party['size'].count() >= 30)
print(tips_filtered.shape)  # (231, 7)

(231, 7)


In [31]:
# 주문이 매우 적은 테이블 제거
print(tips_filtered['size'].value_counts())

# 2    156
# 3     38
# 4     37
# Name: size, dtype: int64

2    156
3     38
4     37
Name: size, dtype: int64


# 그룹 오브젝트 저장하여 살펴보기

In [32]:
tips_10 = sns.load_dataset('tips').sample(10, random_state=42)
print(tips_10)

#      total_bill   tip     sex smoker   day    time  size
# 24        19.82  3.18    Male     No   Sat  Dinner     2
# 6          8.77  2.00    Male     No   Sun  Dinner     2
# 153       24.55  2.00    Male     No   Sun  Dinner     4
# 211       25.89  5.16    Male    Yes   Sat  Dinner     4
# 198       13.00  2.00  Female    Yes  Thur   Lunch     2
# 176       17.89  2.00    Male    Yes   Sun  Dinner     2

     total_bill   tip     sex smoker   day    time  size
24        19.82  3.18    Male     No   Sat  Dinner     2
6          8.77  2.00    Male     No   Sun  Dinner     2
153       24.55  2.00    Male     No   Sun  Dinner     4
211       25.89  5.16    Male    Yes   Sat  Dinner     4
198       13.00  2.00  Female    Yes  Thur   Lunch     2
176       17.89  2.00    Male    Yes   Sun  Dinner     2
192       28.44  2.56    Male    Yes  Thur   Lunch     2
124       12.48  2.52  Female     No  Thur   Lunch     2
9         14.78  3.23    Male     No   Sun  Dinner     2
101       15.38  3.00  Female    Yes   Fri  Dinner     2


In [33]:
grouped = tips_10.groupby('sex')
print(grouped)  # <pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000024AF05A8AC8>

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000024AF05A8AC8>


In [34]:
# 그룹 오브젝트에 포함된 그룹을 보려면 groups속성을 출력해야함.
print(grouped.groups)

# {'Male': Int64Index([24, 6, 153, 211, 176, 192, 9], dtype='int64'), 'Female': Int64Index([198, 124, 101], dtype='int64')}

{'Male': Int64Index([24, 6, 153, 211, 176, 192, 9], dtype='int64'), 'Female': Int64Index([198, 124, 101], dtype='int64')}


# 그룹 오브젝트의 평균 구하기

In [36]:
# The group object also contains columns that cannot be averaged
# (smoker, day, time).  numeric_only=True limits the mean to the numeric
# columns; without it pandas 2.0+ raises TypeError instead of silently
# dropping the non-numeric columns.
avg = grouped.mean(numeric_only=True)
print(avg)

#         total_bill       tip      size
# sex                                   
# Male         20.02  2.875714  2.571429
# Female       13.62  2.506667  2.000000

        total_bill       tip      size
sex                                   
Male         20.02  2.875714  2.571429
Female       13.62  2.506667  2.000000


In [37]:
# tips_10 has seven columns, yet the group mean printed above contains only
# total_bill, tip and size: the non-numeric columns were left out of the
# aggregation (this is done by pandas, not by Python itself).
# NOTE(review): newer pandas releases reportedly need numeric_only=True for
# this to happen - confirm against the installed version.
print(tips_10.columns)

# Index(['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size'], dtype='object')

Index(['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size'], dtype='object')


# 그룹 오브젝트에서 데이터 추출하고 반복하기

In [38]:
# 오브젝트에서 특정 데이터만 추출하려면 get_group 메서드 사용
female = grouped.get_group('Female')
print(female)

#      total_bill   tip     sex smoker   day    time  size
# 198       13.00  2.00  Female    Yes  Thur   Lunch     2
# 124       12.48  2.52  Female     No  Thur   Lunch     2
# 101       15.38  3.00  Female    Yes   Fri  Dinner     2

     total_bill   tip     sex smoker   day    time  size
198       13.00  2.00  Female    Yes  Thur   Lunch     2
124       12.48  2.52  Female     No  Thur   Lunch     2
101       15.38  3.00  Female    Yes   Fri  Dinner     2


In [39]:
# 성별 그룹의 데이터를 반복문을 이용해 출력
for sex_group in grouped:
    print(sex_group)
    
# ('Female',      total_bill   tip     sex smoker   day    time  size
# 198       13.00  2.00  Female    Yes  Thur   Lunch     2
# 124       12.48  2.52  Female     No  Thur   Lunch     2
# 101       15.38  3.00  Female    Yes   Fri  Dinner     2)

# tuple로 나타나며 tuple[0]: 성별, tuple[1]: 내용

('Male',      total_bill   tip   sex smoker   day    time  size
24        19.82  3.18  Male     No   Sat  Dinner     2
6          8.77  2.00  Male     No   Sun  Dinner     2
153       24.55  2.00  Male     No   Sun  Dinner     4
211       25.89  5.16  Male    Yes   Sat  Dinner     4
176       17.89  2.00  Male    Yes   Sun  Dinner     2
192       28.44  2.56  Male    Yes  Thur   Lunch     2
9         14.78  3.23  Male     No   Sun  Dinner     2)
('Female',      total_bill   tip     sex smoker   day    time  size
198       13.00  2.00  Female    Yes  Thur   Lunch     2
124       12.48  2.52  Female     No  Thur   Lunch     2
101       15.38  3.00  Female    Yes   Fri  Dinner     2)


In [40]:
for sex_group in grouped:
    print(sex_group[0])
#             Male
#             Female

Male
Female


# 그룹 오브젝트 계산하고 살펴보기

In [41]:
# Group by two keys at once; the result is indexed by (sex, time).
bill_sex_time = tips_10.groupby(['sex', 'time'])
# numeric_only=True skips the remaining non-numeric columns (smoker, day),
# which would make mean() raise TypeError on pandas 2.0+.
group_avg = bill_sex_time.mean(numeric_only=True)
print(group_avg)

#                total_bill       tip      size
# sex    time                                  
# Male   Lunch    28.440000  2.560000  2.000000
#        Dinner   18.616667  2.928333  2.666667
# Female Lunch    12.740000  2.260000  2.000000
#        Dinner   15.380000  3.000000  2.000000

               total_bill       tip      size
sex    time                                  
Male   Lunch    28.440000  2.560000  2.000000
       Dinner   18.616667  2.928333  2.666667
Female Lunch    12.740000  2.260000  2.000000
       Dinner   15.380000  3.000000  2.000000


In [42]:
# group_avg is the result of calling mean() on the group object, and it is
# a regular DataFrame rather than a group object.
print(type(group_avg))  # <class 'pandas.core.frame.DataFrame'>
print(group_avg.columns)  # Index(['total_bill', 'tip', 'size'], dtype='object')

<class 'pandas.core.frame.DataFrame'>
Index(['total_bill', 'tip', 'size'], dtype='object')


In [43]:
# group_avg의 인덱스 출력
print(group_avg.index)

# MultiIndex(levels=[['Male', 'Female'], ['Lunch', 'Dinner']],
#            codes=[[0, 0, 1, 1], [0, 1, 0, 1]],
#            names=['sex', 'time'])

MultiIndex(levels=[['Male', 'Female'], ['Lunch', 'Dinner']],
           codes=[[0, 0, 1, 1], [0, 1, 0, 1]],
           names=['sex', 'time'])


In [45]:
# reset_index() turns the (sex, time) index levels back into ordinary
# columns and gives the frame a fresh 0..n-1 index.
# numeric_only=True keeps mean() from failing on the non-numeric columns
# (smoker, day) under pandas 2.0+.
group_method = tips_10.groupby(['sex', 'time']).mean(numeric_only=True).reset_index()
print(group_method)

#      sex    time  total_bill       tip      size
# 0    Male   Lunch   28.440000  2.560000  2.000000
# 1    Male  Dinner   18.616667  2.928333  2.666667
# 2  Female   Lunch   12.740000  2.260000  2.000000
# 3  Female  Dinner   15.380000  3.000000  2.000000

      sex    time  total_bill       tip      size
0    Male   Lunch   28.440000  2.560000  2.000000
1    Male  Dinner   18.616667  2.928333  2.666667
2  Female   Lunch   12.740000  2.260000  2.000000
3  Female  Dinner   15.380000  3.000000  2.000000


In [None]:
# Instead of reset_index(), pass as_index=False to groupby: the group keys
# then stay ordinary columns, giving the same frame as group_method.
# numeric_only=True keeps mean() from failing on the non-numeric columns.
group_param = tips_10.groupby(['sex', 'time'], as_index=False).mean(numeric_only=True)