# Data Aggregation and Group Operations

In [1]:
import numpy as np
import pandas as pd
# Remember the current display limit so the last cell of the notebook can restore it.
PREVIOUS_MAX_ROWS = pd.options.display.max_rows
pd.options.display.max_rows = 20
# Seed NumPy's global random generator so the random examples are repeatable.
np.random.seed(12345)
import matplotlib.pyplot as plt
plt.rc('figure', figsize=(10, 6))
# Print NumPy arrays with 4 decimal places and without scientific notation.
np.set_printoptions(precision=4, suppress=True)

### Column-Wise and Multiple Function Application

In [48]:
# Load the tipping dataset and add the tip as a fraction of the total bill.
tips = pd.read_csv('examples/tips.csv')
tips['tip_pct'] = tips['tip'] / tips['total_bill']

## Apply: General split-apply-combine

In [50]:
def top(df, n=5, column='tip_pct'):
    """Return the ``n`` rows of ``df`` with the largest values in ``column``.

    The frame is sorted in ascending order of ``column`` and its last ``n``
    rows are returned, so the row with the largest value comes last.

    Parameters
    ----------
    df : DataFrame to select rows from.
    n : number of rows to keep (default 5).
    column : label of the column to rank by (default 'tip_pct').

    Returns
    -------
    DataFrame with at most ``n`` rows, ordered by ascending ``column``.
    """
    # tail(n) rather than [-n:]: the slice [-0:] is the same as [0:] and would
    # return every row for n=0, whereas tail(0) returns an empty frame.
    return df.sort_values(by=column).tail(n)
top(tips, n=6)

# sort_values sorts in ascending order, so the largest values are the last rows of the sorted frame.

Unnamed: 0,total_bill,tip,smoker,day,time,size,tip_pct
109,14.31,4.0,Yes,Sat,Dinner,2,0.279525
183,23.17,6.5,Yes,Sun,Dinner,4,0.280535
232,11.61,3.39,No,Sat,Dinner,2,0.29199
67,3.07,1.0,Yes,Sat,Dinner,1,0.325733
178,9.6,4.0,Yes,Sun,Dinner,2,0.416667
172,7.25,5.15,Yes,Sun,Dinner,2,0.710345


In [54]:
# Apply top() to each smoker group once, report its shape, then display it.
top_by_smoker = tips.groupby('smoker').apply(top)
print(top_by_smoker.shape)
top_by_smoker

# The frame is split on the 'smoker' column and top() is applied to each piece.
# The combined result has a hierarchical index.

(10, 7)


Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,tip,smoker,day,time,size,tip_pct
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
No,88,24.71,5.85,No,Thur,Lunch,2,0.236746
No,185,20.69,5.0,No,Sun,Dinner,5,0.241663
No,51,10.29,2.6,No,Sun,Dinner,2,0.252672
No,149,7.51,2.0,No,Thur,Lunch,2,0.266312
No,232,11.61,3.39,No,Sat,Dinner,2,0.29199
Yes,109,14.31,4.0,Yes,Sat,Dinner,2,0.279525
Yes,183,23.17,6.5,Yes,Sun,Dinner,4,0.280535
Yes,67,3.07,1.0,Yes,Sat,Dinner,1,0.325733
Yes,178,9.6,4.0,Yes,Sun,Dinner,2,0.416667
Yes,172,7.25,5.15,Yes,Sun,Dinner,2,0.710345


In [55]:
tips.groupby(['smoker', 'day']).apply(top, n=1, column='total_bill')

# Arguments for the applied function are passed to apply() after the function itself.

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,total_bill,tip,smoker,day,time,size,tip_pct
smoker,day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
No,Fri,94,22.75,3.25,No,Fri,Dinner,2,0.142857
No,Sat,212,48.33,9.0,No,Sat,Dinner,4,0.18622
No,Sun,156,48.17,5.0,No,Sun,Dinner,6,0.103799
No,Thur,142,41.19,5.0,No,Thur,Lunch,5,0.121389
Yes,Fri,95,40.17,4.73,Yes,Fri,Dinner,4,0.11775
Yes,Sat,170,50.81,10.0,Yes,Sat,Dinner,3,0.196812
Yes,Sun,182,45.35,3.5,Yes,Sun,Dinner,3,0.077178
Yes,Thur,197,43.11,5.0,Yes,Thur,Lunch,4,0.115982


In [57]:
result = tips.groupby('smoker')['tip_pct'].describe()
result

# describe() can also be called on a GroupBy object.

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
No,151.0,0.159328,0.03991,0.056797,0.136906,0.155625,0.185014,0.29199
Yes,93.0,0.163196,0.085119,0.035638,0.106771,0.153846,0.195059,0.710345


In [58]:
result.unstack('smoker')

# Unstack the 'smoker' index level. It is the only index level, so the result
# is a Series indexed by (statistic, smoker) rather than a DataFrame.

       smoker
count  No        151.000000
       Yes        93.000000
mean   No          0.159328
       Yes         0.163196
std    No          0.039910
       Yes         0.085119
min    No          0.056797
       Yes         0.035638
25%    No          0.136906
       Yes         0.106771
50%    No          0.155625
       Yes         0.153846
75%    No          0.185014
       Yes         0.195059
max    No          0.291990
       Yes         0.710345
dtype: float64

```python
f = lambda x: x.describe()
grouped.apply(f)
```

이것이 가능한 이유는, GroupBy 객체에서 `describe` 같은 pandas 메서드 `some_func`를 호출하면 내부적으로 `grouped.apply(some_func)`와 같이 동작하기 때문입니다. 즉 `grouped.describe()`는 위 코드의 축약형입니다.

### Suppressing the Group Keys

In [59]:
# Same per-group top(), but without adding the group key to the result index.
top_no_keys = tips.groupby('smoker', group_keys=False).apply(top)
print(top_no_keys.smoker)
top_no_keys

# With group_keys=False there is no hierarchical index; the original row labels are kept.

88      No
185     No
51      No
149     No
232     No
109    Yes
183    Yes
67     Yes
178    Yes
172    Yes
Name: smoker, dtype: object


Unnamed: 0,total_bill,tip,smoker,day,time,size,tip_pct
88,24.71,5.85,No,Thur,Lunch,2,0.236746
185,20.69,5.0,No,Sun,Dinner,5,0.241663
51,10.29,2.6,No,Sun,Dinner,2,0.252672
149,7.51,2.0,No,Thur,Lunch,2,0.266312
232,11.61,3.39,No,Sat,Dinner,2,0.29199
109,14.31,4.0,Yes,Sat,Dinner,2,0.279525
183,23.17,6.5,Yes,Sun,Dinner,4,0.280535
67,3.07,1.0,Yes,Sat,Dinner,1,0.325733
178,9.6,4.0,Yes,Sun,Dinner,2,0.416667
172,7.25,5.15,Yes,Sun,Dinner,2,0.710345


### Quantile and Bucket Analysis

In [14]:
# Two columns of 1000 standard-normal draws each (data1 is drawn first).
frame = pd.DataFrame({col: np.random.randn(1000) for col in ('data1', 'data2')})
# Cut data1 into 4 equal-width buckets.
quartiles = pd.cut(frame['data1'], bins=4)
quartiles[:10]

# 1000 points are drawn from the standard normal distribution and split into 4 equal-width bins.
# The result of cut is a Series of category dtype.

0    (-1.23, 0.489]
1    (-1.23, 0.489]
2    (-1.23, 0.489]
3    (-1.23, 0.489]
4    (0.489, 2.208]
5    (0.489, 2.208]
6    (-1.23, 0.489]
7    (-1.23, 0.489]
8    (0.489, 2.208]
9    (0.489, 2.208]
Name: data1, dtype: category
Categories (4, interval[float64]): [(-2.956, -1.23] < (-1.23, 0.489] < (0.489, 2.208] < (2.208, 3.928]]

In [70]:
def get_stats(group):
    """Summarise one group as a dict of min, max, count and mean.

    Each statistic is obtained by calling the method of the same name on
    ``group``. A constant entry ``'hj': 1`` is appended as well.
    NOTE(review): 'hj' looks like a leftover placeholder; confirm before removing,
    since the displayed outputs include it.
    """
    stats = {name: getattr(group, name)()
             for name in ('min', 'max', 'count', 'mean')}
    stats['hj'] = 1
    return stats
# Group data2 by the equal-width buckets computed from data1.
grouped = frame.data2.groupby(quartiles)
print(type(grouped))
quartile_stats = grouped.apply(get_stats)
print(quartile_stats)
quartile_stats.unstack()  # move the innermost index level into the columns

# data2 is grouped by the equal-width bins of data1 and summarised per group.
# Because get_stats returns a dict, its keys become an inner index level of the result.

<class 'pandas.core.groupby.generic.SeriesGroupBy'>
data1                 
(-2.956, -1.23]  min       -3.399312
                 max        1.670835
                 count     95.000000
                 mean      -0.060603
                 hj         1.000000
(-1.23, 0.489]   min       -2.989741
                 max        3.260383
                 count    595.000000
                 mean      -0.010543
                 hj         1.000000
(0.489, 2.208]   min       -3.745356
                 max        2.954439
                 count    299.000000
                 mean       0.092054
                 hj         1.000000
(2.208, 3.928]   min       -1.929776
                 max        1.765640
                 count     11.000000
                 mean       0.030607
                 hj         1.000000
Name: data2, dtype: float64


Unnamed: 0_level_0,min,max,count,mean,hj
data1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
"(-2.956, -1.23]",-3.399312,1.670835,95.0,-0.060603,1.0
"(-1.23, 0.489]",-2.989741,3.260383,595.0,-0.010543,1.0
"(0.489, 2.208]",-3.745356,2.954439,299.0,0.092054,1.0
"(2.208, 3.928]",-1.929776,1.76564,11.0,0.030607,1.0


In [74]:
# Return quantile numbers
grouping = pd.qcut(frame['data1'], q=10, labels=False)
grouped = frame['data2'].groupby(grouping)
grouped.apply(get_stats).unstack()

# Use qcut to split so that every bucket holds the same number of points.
# With labels=None the buckets are labelled with their intervals instead of numbers.

Unnamed: 0_level_0,min,max,count,mean,hj
data1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,-3.399312,1.670835,100.0,-0.06993,1.0
1,-1.801179,2.628441,100.0,0.043954,1.0
2,-2.925113,2.527939,100.0,-0.090777,1.0
3,-2.315555,3.260383,100.0,0.080316,1.0
4,-2.041696,2.074345,100.0,-0.126442,1.0
5,-2.989741,2.18481,100.0,0.009929,1.0
6,-2.084231,2.458842,100.0,-0.040484,1.0
7,-3.05699,2.954439,100.0,0.049913,1.0
8,-3.745356,2.735527,100.0,0.108133,1.0
9,-2.064111,2.37702,100.0,0.193693,1.0


### Example: Filling Missing Values with Group-Specific Values

## 그룹별로 결측치 채우기

In [76]:
s = pd.Series(np.random.randn(6))
s.iloc[::2] = np.nan  # blank out every other value (positions 0, 2 and 4)
s

0         NaN
1    1.135212
2         NaN
3   -0.478629
4         NaN
5    0.736416
dtype: float64

In [78]:
s.fillna(s.mean())

# The basic way to fill missing values. mean() ignores NaN by default.

0    0.464333
1    1.135212
2    0.464333
3   -0.478629
4    0.464333
5    0.736416
dtype: float64

In [83]:
states = [
    'Ohio', 'New York', 'Vermont', 'Florida',   # labelled East below
    'Oregon', 'Nevada', 'California', 'Idaho',  # labelled West below
]
# One region label per state: four 'East' entries followed by four 'West' entries.
group_key = [region for region in ('East', 'West') for _ in range(4)]
group_key

['East', 'East', 'East', 'East', 'West', 'West', 'West', 'West']

In [86]:
# One random value per state, labelled with the state name.
data = pd.Series(np.random.randn(8), index=states)
data

Ohio          0.303393
New York     -0.109817
Vermont       0.454248
Florida       2.522396
Oregon       -0.458230
Nevada       -0.732740
California   -0.597688
Idaho         0.051690
dtype: float64

In [84]:
# Overwrite three states with NaN (this modifies `data` in place).
data[['Vermont', 'Nevada', 'Idaho']] = np.nan
print(data)
data.groupby(group_key).mean()

# One East state and two West states are set to missing, then the mean of each group is computed.

Ohio          0.086532
New York      0.835479
Vermont            NaN
Florida      -1.543934
Oregon        0.149815
Nevada             NaN
California    0.515861
Idaho              NaN
dtype: float64


East   -0.207308
West    0.332838
dtype: float64

In [20]:
def fill_mean(g):
    """Replace the missing values of one group with that group's own mean."""
    return g.fillna(g.mean())


data.groupby(group_key).apply(fill_mean)

# The function is applied to each group, so missing values are filled with the East and West means respectively.

Ohio         -1.613474
New York     -0.573966
Vermont      -0.309965
Florida       1.257544
Oregon       -1.065343
Nevada        0.307396
California    1.680135
Idaho         0.307396
dtype: float64

In [85]:
fill_values = {'East': 0.5, 'West': -1}


def fill_func(g):
    """Fill one group's missing values with the constant configured for that group."""
    # The group key is read from the group's `name` attribute.
    return g.fillna(fill_values[g.name])


data.groupby(group_key).apply(fill_func)

# Each group carries its key as `name`, so the fill value can be looked up from it.

Ohio          0.086532
New York      0.835479
Vermont       0.500000
Florida      -1.543934
Oregon        0.149815
Nevada       -1.000000
California    0.515861
Idaho        -1.000000
dtype: float64

### Example: Random Sampling and Permutation

In [22]:
# Hearts, Spades, Clubs, Diamonds
suits = ['H', 'S', 'C', 'D']
# Card values for one suit (1-10, then 10 for each of the three face cards), repeated for 4 suits.
card_val = (list(range(1, 11)) + [10] * 3) * 4
# Rank labels for one suit, in the same order as the values; face cards are listed as J, K, Q.
base_names = ['A'] + list(range(2, 11)) + ['J', 'K', 'Q']
# Build the labels suit by suit ([AH, 2H, ..., QH], then spades, ...). Iterating over
# `suits` instead of a second hard-coded list keeps the labels in step with it.
cards = [str(rank) + suit for suit in suits for rank in base_names]

deck = pd.Series(card_val, index=cards)

# Builds a deck of playing cards: a Series of card values indexed by card label.

In [23]:
# First thirteen entries of the deck.
deck[:13]

AH      1
2H      2
3H      3
4H      4
5H      5
6H      6
7H      7
8H      8
9H      9
10H    10
JH     10
KH     10
QH     10
dtype: int64

In [24]:
def draw(deck, n=5):
    """Return ``n`` randomly sampled entries of ``deck`` (5 by default)."""
    hand = deck.sample(n=n)
    return hand
draw(deck)

# Cards are drawn with Series.sample().

9C    9
4S    4
9H    9
4H    4
8H    8
dtype: int64

In [25]:
def get_suit(card):
    """Return the suit of a card label, i.e. its last character."""
    return card[-1]


deck.groupby(get_suit).apply(draw, n=2)

# Group the cards by the suit returned from get_suit,
# then sample from each group.

C  8C      8
   JC     10
D  8D      8
   10D    10
H  6H      6
   8H      8
S  7S      7
   QS     10
dtype: int64

In [26]:
deck.groupby(get_suit, group_keys=False).apply(draw, n=2)

# With group_keys=False the result keeps the original card labels as its index.

QC    10
8C     8
KD    10
JD    10
9H     9
4H     4
2S     2
QS    10
dtype: int64

### Example: Group Weighted Average and Correlation

In [27]:
# Eight rows: four in category 'a', four in category 'b', with random data and weights.
df = pd.DataFrame({
    'category': list('aaaabbbb'),
    'data': np.random.randn(8),
    'weights': np.random.rand(8),
})
df

Unnamed: 0,category,data,weights
0,a,1.911078,0.321406
1,a,-0.284044,0.882984
2,a,0.412438,0.462226
3,a,0.319356,0.543428
4,b,-0.011886,0.170341
5,b,1.508348,0.936113
6,b,-1.177038,0.586625
7,b,-0.086293,0.82634


In [28]:
def get_wavg(g):
    """Average of a group's 'data' column, weighted by its 'weights' column."""
    return np.average(g['data'], weights=g['weights'])


grouped = df.groupby('category')
grouped.apply(get_wavg)

# Weighted average of data per category, using the weights column as weights.

category
a    0.329229
b    0.257271
dtype: float64

In [29]:
# The first CSV column becomes the index and is parsed as dates.
close_px = pd.read_csv('examples/stock_px_2.csv', parse_dates=True,
                       index_col=0)
close_px.info()
close_px[-4:]

# Loads the closing-price data.

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 2214 entries, 2003-01-02 to 2011-10-14
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   AAPL    2214 non-null   float64
 1   MSFT    2214 non-null   float64
 2   XOM     2214 non-null   float64
 3   SPX     2214 non-null   float64
dtypes: float64(4)
memory usage: 86.5 KB


Unnamed: 0,AAPL,MSFT,XOM,SPX
2011-10-11,400.29,27.0,76.27,1195.54
2011-10-12,402.19,26.96,77.16,1207.25
2011-10-13,408.43,27.18,76.37,1203.66
2011-10-14,422.0,27.27,78.11,1224.58


In [30]:
def spx_corr(x):
    """Correlate every column of ``x`` with its 'SPX' column."""
    benchmark = x['SPX']
    return x.corrwith(benchmark)

In [87]:
rets = close_px.pct_change().dropna()
rets[-4:]

# pct_change() computes the rate of change between consecutive rows.
# Despite the name, the values are fractions, not percentages.
# `rets` is short for "returns".

Unnamed: 0,AAPL,MSFT,XOM,SPX
2011-10-11,0.029526,0.002227,-0.000131,0.000544
2011-10-12,0.004747,-0.001481,0.011669,0.009795
2011-10-13,0.015515,0.00816,-0.010238,-0.002974
2011-10-14,0.033225,0.003311,0.022784,0.01738


In [32]:
def get_year(x):
    """Return the ``year`` attribute of an index label."""
    return x.year


by_year = rets.groupby(get_year)
by_year.apply(spx_corr)

# Correlation between each ticker's returns and the SPX returns, per year.

Unnamed: 0,AAPL,MSFT,XOM,SPX
2003,0.541124,0.745174,0.661265,1.0
2004,0.374283,0.588531,0.557742,1.0
2005,0.46754,0.562374,0.63101,1.0
2006,0.428267,0.406126,0.518514,1.0
2007,0.508118,0.65877,0.786264,1.0
2008,0.681434,0.804626,0.828303,1.0
2009,0.707103,0.654902,0.797921,1.0
2010,0.710105,0.730118,0.839057,1.0
2011,0.691931,0.800996,0.859975,1.0


In [33]:
by_year.apply(lambda g: g['AAPL'].corr(g['MSFT']))

# Correlation between the AAPL and MSFT returns, per year.

2003    0.480868
2004    0.259024
2005    0.300093
2006    0.161735
2007    0.417738
2008    0.611901
2009    0.432738
2010    0.571946
2011    0.581987
dtype: float64

### Example: Group-Wise Linear Regression

In [34]:
import statsmodels.api as sm
def regress(data, yvar, xvars):
    """Run an ordinary least squares regression of ``yvar`` on ``xvars``.

    Parameters
    ----------
    data : DataFrame holding the dependent and explanatory columns.
    yvar : label of the dependent-variable column.
    xvars : list of labels of the explanatory columns.

    Returns
    -------
    The ``params`` of the fitted statsmodels OLS result: one entry per
    column in ``xvars`` plus one for 'intercept'.
    """
    Y = data[yvar]
    # Copy before adding a column: data[xvars] is a selection from the caller's
    # frame, and assigning into it directly raises SettingWithCopyWarning.
    X = data[xvars].copy()
    X['intercept'] = 1.  # OLS does not add a constant term on its own, so add one
    result = sm.OLS(Y, X).fit()
    return result.params

In [35]:
# Regress the AAPL returns on the SPX returns separately for each year.
by_year.apply(regress, 'AAPL', ['SPX'])

Unnamed: 0,SPX,intercept
2003,1.195406,0.00071
2004,1.363463,0.004201
2005,1.766415,0.003246
2006,1.645496,8e-05
2007,1.198761,0.003438
2008,0.968016,-0.00111
2009,0.879103,0.002954
2010,1.052608,0.001261
2011,0.806605,0.001514


## Pivot Tables and Cross-Tabulation

## 피벗 테이블

데이터를 하나 이상의 키로 수집해서 어떤 키는 row, 어떤 키는 column에 나열하여 데이터를 정리합니다.

In [36]:
tips.pivot_table(index=['day', 'smoker'])

# pivot_table aggregates with the mean by default, computed within each group defined by the index keys.

Unnamed: 0_level_0,Unnamed: 1_level_0,size,tip,tip_pct,total_bill
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Fri,No,2.25,2.8125,0.15165,18.42
Fri,Yes,2.066667,2.714,0.174783,16.813333
Sat,No,2.555556,3.102889,0.158048,19.661778
Sat,Yes,2.47619,2.875476,0.147906,21.276667
Sun,No,2.929825,3.167895,0.160113,20.506667
Sun,Yes,2.578947,3.516842,0.18725,24.12
Thur,No,2.488889,2.673778,0.160298,17.113111
Thur,Yes,2.352941,3.03,0.163863,19.190588


In [37]:
tips.pivot_table(['tip_pct', 'size'], index=['time', 'day'],
                 columns='smoker')

# Arguments are given in the order: values to aggregate, index keys, column keys.

Unnamed: 0_level_0,Unnamed: 1_level_0,size,size,tip_pct,tip_pct
Unnamed: 0_level_1,smoker,No,Yes,No,Yes
time,day,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Dinner,Fri,2.0,2.222222,0.139622,0.165347
Dinner,Sat,2.555556,2.47619,0.158048,0.147906
Dinner,Sun,2.929825,2.578947,0.160113,0.18725
Dinner,Thur,2.0,,0.159744,
Lunch,Fri,3.0,1.833333,0.187735,0.188937
Lunch,Thur,2.5,2.352941,0.160311,0.163863


In [38]:
tips.pivot_table(['tip_pct', 'size'], index=['time', 'day'],
                 columns='smoker', margins=True)

# margins=True adds an 'All' row and 'All' columns holding the aggregate over all rows / all columns.

Unnamed: 0_level_0,Unnamed: 1_level_0,size,size,size,tip_pct,tip_pct,tip_pct
Unnamed: 0_level_1,smoker,No,Yes,All,No,Yes,All
time,day,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
Dinner,Fri,2.0,2.222222,2.166667,0.139622,0.165347,0.158916
Dinner,Sat,2.555556,2.47619,2.517241,0.158048,0.147906,0.153152
Dinner,Sun,2.929825,2.578947,2.842105,0.160113,0.18725,0.166897
Dinner,Thur,2.0,,2.0,0.159744,,0.159744
Lunch,Fri,3.0,1.833333,2.0,0.187735,0.188937,0.188765
Lunch,Thur,2.5,2.352941,2.459016,0.160311,0.163863,0.161301
All,,2.668874,2.408602,2.569672,0.159328,0.163196,0.160803


In [88]:
tips.pivot_table(['tip_pct', 'size'], index=['time', 'day'],
                 columns='smoker', margins=True, margins_name='Everyone')

# margins_name sets the label used for the margin row and columns.

Unnamed: 0_level_0,Unnamed: 1_level_0,size,size,size,tip_pct,tip_pct,tip_pct
Unnamed: 0_level_1,smoker,No,Yes,Everyone,No,Yes,Everyone
time,day,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
Dinner,Fri,2.0,2.222222,2.166667,0.139622,0.165347,0.158916
Dinner,Sat,2.555556,2.47619,2.517241,0.158048,0.147906,0.153152
Dinner,Sun,2.929825,2.578947,2.842105,0.160113,0.18725,0.166897
Dinner,Thur,2.0,,2.0,0.159744,,0.159744
Lunch,Fri,3.0,1.833333,2.0,0.187735,0.188937,0.188765
Lunch,Thur,2.5,2.352941,2.459016,0.160311,0.163863,0.161301
Everyone,,2.668874,2.408602,2.569672,0.159328,0.163196,0.160803


In [39]:
tips.pivot_table('tip_pct', index=['time', 'smoker'], columns='day',
                 aggfunc=len, margins=True)

# Passing a different aggfunc changes the aggregation; here len counts the rows in each cell.

Unnamed: 0_level_0,day,Fri,Sat,Sun,Thur,All
time,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Dinner,No,3.0,45.0,57.0,1.0,106.0
Dinner,Yes,9.0,42.0,19.0,,70.0
Lunch,No,1.0,,,44.0,45.0
Lunch,Yes,6.0,,,17.0,23.0
All,,19.0,87.0,76.0,62.0,244.0


In [40]:
tips.pivot_table('tip_pct', index=['time', 'size', 'smoker'],
                 columns='day', aggfunc='mean', fill_value=0)

# fill_value supplies the value used for empty combinations (here 0).

Unnamed: 0_level_0,Unnamed: 1_level_0,day,Fri,Sat,Sun,Thur
time,size,smoker,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Dinner,1,No,0.000000,0.137931,0.000000,0.000000
Dinner,1,Yes,0.000000,0.325733,0.000000,0.000000
Dinner,2,No,0.139622,0.162705,0.168859,0.159744
Dinner,2,Yes,0.171297,0.148668,0.207893,0.000000
Dinner,3,No,0.000000,0.154661,0.152663,0.000000
...,...,...,...,...,...,...
Lunch,3,Yes,0.000000,0.000000,0.000000,0.204952
Lunch,4,No,0.000000,0.000000,0.000000,0.138919
Lunch,4,Yes,0.000000,0.000000,0.000000,0.155410
Lunch,5,No,0.000000,0.000000,0.000000,0.121389


### Cross-Tabulations: Crosstab

In [89]:
from io import StringIO

# Whitespace-delimited sample table kept in a string instead of a file on disk.
raw_table = """\
Sample  Nationality  Handedness
1   USA  Right-handed
2   Japan    Left-handed
3   USA  Right-handed
4   Japan    Right-handed
5   Japan    Left-handed
6   Japan    Right-handed
7   USA  Right-handed
8   USA  Left-handed
9   Japan    Right-handed
10  USA  Right-handed"""
# io.StringIO lets the string be read as if it were a file.
# The separator is a raw string: '\s' in a normal string literal is an invalid
# escape sequence and triggers a warning on recent Python versions.
data = pd.read_table(StringIO(raw_table), sep=r'\s+')

# Using a string like a file (io.StringIO):
# https://wikidocs.net/122776

In [90]:
# Show the parsed table.
data

Unnamed: 0,Sample,Nationality,Handedness
0,1,USA,Right-handed
1,2,Japan,Left-handed
2,3,USA,Right-handed
3,4,Japan,Right-handed
4,5,Japan,Left-handed
5,6,Japan,Right-handed
6,7,USA,Right-handed
7,8,USA,Left-handed
8,9,Japan,Right-handed
9,10,USA,Right-handed


In [43]:
pd.crosstab(data.Nationality, data.Handedness, margins=True)

# Counts the data points in each combination. Arguments are given as rows, then columns.

Handedness,Left-handed,Right-handed,All
Nationality,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Japan,2,3,5
USA,1,4,5
All,3,7,10


In [44]:
pd.crosstab([tips.time, tips.day], tips.smoker, margins=True)

# Same idea with two row keys: counts per (time, day) split by smoker, with totals.

Unnamed: 0_level_0,smoker,No,Yes,All
time,day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Dinner,Fri,3,9,12
Dinner,Sat,45,42,87
Dinner,Sun,57,19,76
Dinner,Thur,1,0,1
Lunch,Fri,1,6,7
Lunch,Thur,44,17,61
All,,151,93,244


In [45]:
# Restore the display limit saved in PREVIOUS_MAX_ROWS.
pd.options.display.max_rows = PREVIOUS_MAX_ROWS

## Conclusion