### 그룹화

In [15]:
#필요한 패키지들 가져오기 
import numpy as np #수치연산, 선형대수, ndarray라는 자료 구조를 가진 패키지
import pandas as pd #Series, DF 자료구조를 가진 패키지
import seaborn as sns #샘플데이터와 시각화

# 1. Load the Titanic sample dataset through seaborn.
titanic = sns.load_dataset('titanic')
titanic

# 2. Group the rows by the 'class' column.
group = titanic.groupby('class')
group

for key, data in group:
    print(key)
    print(data.head(2))  # first two rows of the First, Second and Third groups in turn


# 3. Fetch only the rows belonging to the 'Third' class group.
third = group.get_group('Third')
third

# Average the numeric columns only. The frame also holds text and categorical
# columns (sex, embarked, who, ...), and pandas 2.x raises TypeError on them
# instead of silently dropping them as older releases did.
print(group.mean(numeric_only=True))
print("")

# 4. Group by two columns and aggregate.
# Rows are split by the keys in the order listed, so decide on the layout of
# the result you want before choosing the key order.
group = titanic.groupby(['class', 'sex'])
print(group.describe())



First
   survived  pclass     sex   age  sibsp  parch     fare embarked  class  \
1         1       1  female  38.0      1      0  71.2833        C  First   
3         1       1  female  35.0      1      0  53.1000        S  First   

     who  adult_male deck  embark_town alive  alone  
1  woman       False    C    Cherbourg   yes  False  
3  woman       False    C  Southampton   yes  False  
Second
    survived  pclass     sex   age  sibsp  parch     fare embarked   class  \
9          1       2  female  14.0      1      0  30.0708        C  Second   
15         1       2  female  55.0      0      0  16.0000        S  Second   

      who  adult_male deck  embark_town alive  alone  
9   child       False  NaN    Cherbourg   yes  False  
15  woman       False  NaN  Southampton   yes   True  
Third
   survived  pclass     sex   age  sibsp  parch   fare embarked  class    who  \
0         0       3    male  22.0      1      0  7.250        S  Third    man   
2         1       3  female 

### 그룹화 후 원하는 함수를 적용하기

In [20]:
def func(x):
    """Return the spread of ``x``: its maximum minus its minimum.

    ``x`` only needs to provide ``max()`` and ``min()`` methods whose results
    support subtraction (for example a pandas Series or a NumPy array).
    """
    largest = x.max()
    smallest = x.min()
    return largest - smallest

# Group by the scalar key rather than a one-element list: with a list, recent
# pandas versions yield tuple keys such as ('First',) when iterating the groups.
gr = titanic.groupby('class')

# func subtracts min from max, which text and categorical columns do not
# support, so apply it to the numeric columns only. These are the columns
# shown in the recorded output; older pandas dropped the rest silently,
# while pandas 2.x raises TypeError.
range_cols = list(titanic.select_dtypes(include='number').columns)
print(gr[range_cols].agg(func))

# To apply several functions at once, pass their names to agg as a list,
# e.g. ['max', 'min']. 'mean' is undefined for text columns, so restrict the
# aggregation to numeric and boolean columns.
stat_cols = list(titanic.select_dtypes(include=['number', 'bool']).columns)
print(gr[stat_cols].agg(['max', 'mean', 'min']))

        survived  pclass    age  sibsp  parch      fare
class                                                  
First          1       0  79.08      3      4  512.3292
Second         1       0  69.33      3      3   73.5000
Third          1       0  73.58      8      6   69.5500
       survived               pclass            age                  sibsp  \
            max      mean min    max mean min   max       mean   min   max   
class                                                                        
First         1  0.629630   0      1    1   1  80.0  38.233441  0.92     3   
Second        1  0.472826   0      2    2   2  70.0  29.877630  0.67     3   
Third         1  0.242363   0      3    3   3  74.0  25.140620  0.42     8   

        ... parch      fare                 adult_male                  alone  \
        ...   min       max       mean  min        max      mean    min   max   
class   ...                                                                     
First   

### 필터링 : filter()

In [36]:
# Print how many rows each group contains.
for key, data in gr:
    print(key, data.shape[0])
print("")

# Predicate used to discard groups that have fewer than 200 rows.
def count200(x):
    """Return True when ``x`` holds at least 200 items, otherwise False."""
    return not len(x) < 200

# filter() keeps the rows of every group for which the predicate returns True;
# the other groups are dropped.
gr_filter = gr.filter(count200)
gr_filter

# Because the predicate is a one-liner, it is often written inline as a lambda.
gr_filter = gr.filter(lambda members: len(members) >= 200)
gr_filter

# Next, drop the groups whose mean 'age' is below 30.
# Print each group's mean age first.
for key, data in gr:
    print(key, data['age'].mean())
print("")

gr_filter2 = gr.filter(lambda members: members['age'].mean() >= 30)
gr_filter2

First 216
Second 184
Third 491

First 38.233440860215055
Second 29.87763005780347
Third 25.14061971830986



Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
6,0,1,male,54.0,0,0,51.8625,S,First,man,True,E,Southampton,no,True
11,1,1,female,58.0,0,0,26.5500,S,First,woman,False,C,Southampton,yes,True
23,1,1,male,28.0,0,0,35.5000,S,First,man,True,A,Southampton,yes,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
871,1,1,female,47.0,1,1,52.5542,S,First,woman,False,D,Southampton,yes,False
872,0,1,male,33.0,0,0,5.0000,S,First,man,True,B,Southampton,no,True
879,1,1,female,56.0,0,1,83.1583,C,First,woman,False,C,Cherbourg,yes,False
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True


### 멀티 인덱스

In [55]:
gr = titanic.groupby(['class', 'sex', 'survived'])
# Average the numeric columns only: the frame still contains text columns, and
# pandas 2.x raises TypeError on them instead of silently dropping them.
gdf = gr.mean(numeric_only=True)
gdf

# 1. How do we reach a specific part of the multi-index result? Use loc.
gdf.loc['Third']
# Only the males in Third class. The row key is passed as a tuple together with
# an explicit column selector so it cannot be read as a (row, column) pair.
gdf.loc[('Third', 'male'), :]

# loc selects from the outermost index level inward, so it cannot directly pick
# the males across every class. Use the xs indexer with the level name instead.

gdf.xs('male', level='sex')

Unnamed: 0_level_0,Unnamed: 1_level_0,pclass,age,sibsp,parch,fare,adult_male,alone
class,survived,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
First,0,1.0,44.581967,0.272727,0.25974,62.89491,1.0,0.649351
First,1,1.0,36.248,0.377778,0.311111,74.63732,0.933333,0.555556
Second,0,2.0,33.369048,0.307692,0.142857,19.488965,1.0,0.714286
Second,1,2.0,16.022,0.529412,0.647059,21.0951,0.470588,0.411765
Third,0,3.0,27.255814,0.523333,0.213333,12.204469,0.936667,0.773333
Third,1,3.0,22.274211,0.340426,0.297872,15.579696,0.808511,0.680851
