# Data Aggregation and Group Operations

In [1]:
import pandas as pd
import numpy as np

## GroupBy mechanics

In [8]:
nrows = 10
df = pd.DataFrame({'company' : np.random.choice(list('ab'), nrows),
                  'data1' : np.random.randn(nrows) * 50 + 100,
                  'city' : np.random.choice(list('MP'), nrows),
                  'income' : np.random.randn(nrows) * 30000 + 50000 })
df

Unnamed: 0,company,data1,city,income
0,a,148.300038,P,65670.905071
1,b,89.662323,M,-10450.256993
2,a,55.600361,M,78985.252627
3,a,78.278191,P,17303.70431
4,a,112.505657,P,15087.119461
5,b,124.488859,P,64915.470065
6,a,85.665046,M,69404.493051
7,b,41.532992,M,18435.031493
8,a,103.322088,M,-18031.199879
9,a,173.747477,P,-7809.941479


In [11]:
grouped = df.groupby('company')
grouped

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7fdccb1d8c50>

In [12]:
grouped.sum()

Unnamed: 0_level_0,data1,income
company,Unnamed: 1_level_1,Unnamed: 2_level_1
a,757.418858,220610.333162
b,255.684175,72900.244565


In [13]:
grouped.mean()

Unnamed: 0_level_0,data1,income
company,Unnamed: 1_level_1,Unnamed: 2_level_1
a,108.202694,31515.76188
b,85.228058,24300.081522


In [14]:
grouped.min()

Unnamed: 0_level_0,data1,city,income
company,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
a,55.600361,M,-18031.199879
b,41.532992,M,-10450.256993


### Iterating over groups

In [24]:
for name, group in df.groupby('company'):
    print(name)

a
b


In [25]:
for name, group in df.groupby('company'):
    print(group)

  company       data1 city        income
0       a  148.300038    P  65670.905071
2       a   55.600361    M  78985.252627
3       a   78.278191    P  17303.704310
4       a  112.505657    P  15087.119461
6       a   85.665046    M  69404.493051
8       a  103.322088    M -18031.199879
9       a  173.747477    P  -7809.941479
  company       data1 city        income
1       b   89.662323    M -10450.256993
5       b  124.488859    P  64915.470065
7       b   41.532992    M  18435.031493


In [26]:
dict(list(df.groupby('company')))
# tenemos todos los datos separados en dataframes

{'a':   company       data1 city        income
 0       a  148.300038    P  65670.905071
 2       a   55.600361    M  78985.252627
 3       a   78.278191    P  17303.704310
 4       a  112.505657    P  15087.119461
 6       a   85.665046    M  69404.493051
 8       a  103.322088    M -18031.199879
 9       a  173.747477    P  -7809.941479,
 'b':   company       data1 city        income
 1       b   89.662323    M -10450.256993
 5       b  124.488859    P  64915.470065
 7       b   41.532992    M  18435.031493}

In [28]:
all_data = dict(list(df.groupby('company')))
all_data['a']

Unnamed: 0,company,data1,city,income
0,a,148.300038,P,65670.905071
2,a,55.600361,M,78985.252627
3,a,78.278191,P,17303.70431
4,a,112.505657,P,15087.119461
6,a,85.665046,M,69404.493051
8,a,103.322088,M,-18031.199879
9,a,173.747477,P,-7809.941479


### Selecting a column or subset of columns

In [15]:
df.groupby(['company','city']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,income
company,city,Unnamed: 2_level_1,Unnamed: 3_level_1
a,M,81.529165,43452.8486
a,P,128.207841,22562.946841
b,M,65.597658,3992.38725
b,P,124.488859,64915.470065


In [16]:
df.groupby(['company','city']).mean()['income']
# con este perdemos tiempo en calcular todas las medias

company  city
a        M       43452.848600
         P       22562.946841
b        M        3992.387250
         P       64915.470065
Name: income, dtype: float64

In [18]:
means = df.groupby(['company','city'])['income'].mean()
# company y city son indices, es un multi index

In [19]:
means

company  city
a        M       43452.848600
         P       22562.946841
b        M        3992.387250
         P       64915.470065
Name: income, dtype: float64

In [20]:
means['a','P']

22562.946840739576

In [22]:
means.index

MultiIndex([('a', 'M'),
            ('a', 'P'),
            ('b', 'M'),
            ('b', 'P')],
           names=['company', 'city'])

In [23]:
means.reset_index()

Unnamed: 0,company,city,income
0,a,M,43452.8486
1,a,P,22562.946841
2,b,M,3992.38725
3,b,P,64915.470065


In [31]:
df.groupby('company')[['data1','city']].max()

Unnamed: 0_level_0,data1,city
company,Unnamed: 1_level_1,Unnamed: 2_level_1
a,173.747477,P
b,124.488859,P


## Data aggregation

In [32]:
df.groupby('city').sum()

Unnamed: 0_level_0,data1,income
city,Unnamed: 1_level_1,Unnamed: 2_level_1
M,375.78281,138343.320299
P,637.320222,155167.257428


In [33]:
df.groupby('city').median()

Unnamed: 0_level_0,data1,income
city,Unnamed: 1_level_1,Unnamed: 2_level_1
M,85.665046,18435.031493
P,124.488859,17303.70431


In [35]:
df.groupby('city').quantile(.9)

TypeError: 'quantile' cannot be performed against 'object' dtypes!

In [37]:
stats = df.groupby('city').describe()
stats

Unnamed: 0_level_0,data1,data1,data1,data1,data1,data1,data1,data1,income,income,income,income,income,income,income,income
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
city,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
M,5.0,75.156562,25.627776,41.532992,55.600361,85.665046,89.662323,103.322088,5.0,27668.66406,44727.120856,-18031.199879,-10450.256993,18435.031493,69404.493051,78985.252627
P,5.0,127.464044,36.152967,78.278191,112.505657,124.488859,148.300038,173.747477,5.0,31033.451486,32784.695413,-7809.941479,15087.119461,17303.70431,64915.470065,65670.905071


In [38]:
stats['data1']

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
M,5.0,75.156562,25.627776,41.532992,55.600361,85.665046,89.662323,103.322088
P,5.0,127.464044,36.152967,78.278191,112.505657,124.488859,148.300038,173.747477


In [39]:
stats['data1','mean']

city
M     75.156562
P    127.464044
Name: (data1, mean), dtype: float64

In [40]:
stats.columns

MultiIndex([( 'data1', 'count'),
            ( 'data1',  'mean'),
            ( 'data1',   'std'),
            ( 'data1',   'min'),
            ( 'data1',   '25%'),
            ( 'data1',   '50%'),
            ( 'data1',   '75%'),
            ( 'data1',   'max'),
            ('income', 'count'),
            ('income',  'mean'),
            ('income',   'std'),
            ('income',   'min'),
            ('income',   '25%'),
            ('income',   '50%'),
            ('income',   '75%'),
            ('income',   'max')],
           )

In [41]:
!wget https://raw.githubusercontent.com/wesm/pydata-book/1st-edition/ch08/tips.csv
# con el ! se manda al terminal

--2019-11-22 21:20:29--  https://raw.githubusercontent.com/wesm/pydata-book/1st-edition/ch08/tips.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.132.133
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.132.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7943 (7.8K) [text/plain]
Saving to: ‘tips.csv’


2019-11-22 21:20:29 (11.7 MB/s) - ‘tips.csv’ saved [7943/7943]



In [43]:
tips = pd.read_csv('tips.csv')
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [45]:
tips['tip_pct'] = tips['tip'] / tips['total_bill']
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_pct
0,16.99,1.01,Female,No,Sun,Dinner,2,0.059447
1,10.34,1.66,Male,No,Sun,Dinner,3,0.160542
2,21.01,3.5,Male,No,Sun,Dinner,3,0.166587
3,23.68,3.31,Male,No,Sun,Dinner,2,0.13978
4,24.59,3.61,Female,No,Sun,Dinner,4,0.146808


### Column-wise and multiple function application

In [46]:
tips.groupby('sex').mean()

Unnamed: 0_level_0,total_bill,tip,size,tip_pct
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Female,18.056897,2.833448,2.45977,0.166491
Male,20.744076,3.089618,2.630573,0.157651


In [47]:
tips.groupby('sex').std()

Unnamed: 0_level_0,total_bill,tip,size,tip_pct
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Female,8.009209,1.159495,0.937644,0.053632
Male,9.246469,1.489102,0.955997,0.064778


In [48]:
tips.groupby('sex').agg(['mean','std'])

Unnamed: 0_level_0,total_bill,total_bill,tip,tip,size,size,tip_pct,tip_pct
Unnamed: 0_level_1,mean,std,mean,std,mean,std,mean,std
sex,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Female,18.056897,8.009209,2.833448,1.159495,2.45977,0.937644,0.166491,0.053632
Male,20.744076,9.246469,3.089618,1.489102,2.630573,0.955997,0.157651,0.064778


In [49]:
tips.groupby(['sex', 'smoker']).agg(['mean','std', 'count'])

Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,total_bill,total_bill,tip,tip,tip,size,size,size,tip_pct,tip_pct,tip_pct
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,count,mean,std,count,mean,std,count,mean,std,count
sex,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
Female,No,18.105185,7.286455,54,2.773519,1.128425,54,2.592593,1.073146,54,0.156921,0.036421,54
Female,Yes,17.977879,9.189751,33,2.931515,1.219916,33,2.242424,0.613917,33,0.18215,0.071595,33
Male,No,19.791237,8.726566,97,3.113402,1.489559,97,2.71134,0.989094,97,0.160669,0.041849,97
Male,Yes,22.2845,9.911845,60,3.051167,1.50012,60,2.5,0.89253,60,0.152771,0.090588,60


In [51]:
tips.groupby(['sex', 'smoker']).agg([np.mean, np.std, np.count_nonzero])

Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,total_bill,total_bill,tip,tip,tip,size,size,size,tip_pct,tip_pct,tip_pct
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,count_nonzero,mean,std,count_nonzero,mean,std,count_nonzero,mean,std,count_nonzero
sex,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
Female,No,18.105185,7.286455,54.0,2.773519,1.128425,54.0,2.592593,1.073146,54,0.156921,0.036421,54.0
Female,Yes,17.977879,9.189751,33.0,2.931515,1.219916,33.0,2.242424,0.613917,33,0.18215,0.071595,33.0
Male,No,19.791237,8.726566,97.0,3.113402,1.489559,97.0,2.71134,0.989094,97,0.160669,0.041849,97.0
Male,Yes,22.2845,9.911845,60.0,3.051167,1.50012,60.0,2.5,0.89253,60,0.152771,0.090588,60.0


In [52]:
def minimax(series):
    return series.max() - series.min()
tips.groupby(['sex', 'smoker']).agg([np.mean, np.std, minimax])

Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,total_bill,total_bill,tip,tip,tip,size,size,size,tip_pct,tip_pct,tip_pct
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,minimax,mean,std,minimax,mean,std,minimax,mean,std,minimax
sex,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
Female,No,18.105185,7.286455,28.58,2.773519,1.128425,4.2,2.592593,1.073146,5,0.156921,0.036421,0.195876
Female,Yes,17.977879,9.189751,41.23,2.931515,1.219916,5.5,2.242424,0.613917,3,0.18215,0.071595,0.360233
Male,No,19.791237,8.726566,40.82,3.113402,1.489559,7.75,2.71134,0.989094,4,0.160669,0.041849,0.220186
Male,Yes,22.2845,9.911845,43.56,3.051167,1.50012,9.0,2.5,0.89253,4,0.152771,0.090588,0.674707


In [54]:
tips.groupby(['sex', 'smoker']).agg({'total_bill' : [np.sum, np.mean],
                                     'tip' : [minimax, np.std]})

Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,total_bill,tip,tip
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,mean,minimax,std
sex,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Female,No,977.68,18.105185,4.2,1.128425
Female,Yes,593.27,17.977879,5.5,1.219916
Male,No,1919.75,19.791237,7.75,1.489559
Male,Yes,1337.07,22.2845,9.0,1.50012


## Group-wise operations and transformations

In [55]:
tips

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_pct
0,16.99,1.01,Female,No,Sun,Dinner,2,0.059447
1,10.34,1.66,Male,No,Sun,Dinner,3,0.160542
2,21.01,3.50,Male,No,Sun,Dinner,3,0.166587
3,23.68,3.31,Male,No,Sun,Dinner,2,0.139780
4,24.59,3.61,Female,No,Sun,Dinner,4,0.146808
...,...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3,0.203927
240,27.18,2.00,Female,Yes,Sat,Dinner,2,0.073584
241,22.67,2.00,Male,Yes,Sat,Dinner,2,0.088222
242,17.82,1.75,Male,No,Sat,Dinner,2,0.098204


In [59]:
tips_by_smoker = tips.groupby('smoker')['tip_pct'].agg([np.mean, np.std])
tips_by_smoker 

Unnamed: 0_level_0,mean,std
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1
No,0.159328,0.03991
Yes,0.163196,0.085119


In [61]:
merged = tips.merge(tips_by_smoker, left_on='smoker', right_index=True)
merged.head()
# cuando hacemos un groupby el criterio que aplicamos queda como el indice
******

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_pct,mean,std
0,16.99,1.01,Female,No,Sun,Dinner,2,0.059447,0.159328,0.03991
1,10.34,1.66,Male,No,Sun,Dinner,3,0.160542,0.159328,0.03991
2,21.01,3.5,Male,No,Sun,Dinner,3,0.166587,0.159328,0.03991
3,23.68,3.31,Male,No,Sun,Dinner,2,0.13978,0.159328,0.03991
4,24.59,3.61,Female,No,Sun,Dinner,4,0.146808,0.159328,0.03991


### Apply: General split-apply-combine

In [63]:
def top(df, n=5, col='tip_pct'):
    return df.sort_values(by=col, ascending=False).head(n)
top(merged)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_pct,mean,std
172,7.25,5.15,Male,Yes,Sun,Dinner,2,0.710345,0.163196,0.085119
178,9.6,4.0,Female,Yes,Sun,Dinner,2,0.416667,0.163196,0.085119
67,3.07,1.0,Female,Yes,Sat,Dinner,1,0.325733,0.163196,0.085119
232,11.61,3.39,Male,No,Sat,Dinner,2,0.29199,0.159328,0.03991
183,23.17,6.5,Male,Yes,Sun,Dinner,4,0.280535,0.163196,0.085119


In [64]:
merged.groupby('sex').apply(top)
# saca el top 5 de registro de cada grupo, top 5 de chicas y top 5 de chicos

Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,tip,sex,smoker,day,time,size,tip_pct,mean,std
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Female,178,9.6,4.0,Female,Yes,Sun,Dinner,2,0.416667,0.163196,0.085119
Female,67,3.07,1.0,Female,Yes,Sat,Dinner,1,0.325733,0.163196,0.085119
Female,109,14.31,4.0,Female,Yes,Sat,Dinner,2,0.279525,0.163196,0.085119
Female,93,16.32,4.3,Female,Yes,Fri,Dinner,2,0.26348,0.163196,0.085119
Female,221,13.42,3.48,Female,Yes,Fri,Lunch,2,0.259314,0.163196,0.085119
Male,172,7.25,5.15,Male,Yes,Sun,Dinner,2,0.710345,0.163196,0.085119
Male,232,11.61,3.39,Male,No,Sat,Dinner,2,0.29199,0.159328,0.03991
Male,183,23.17,6.5,Male,Yes,Sun,Dinner,4,0.280535,0.163196,0.085119
Male,149,7.51,2.0,Male,No,Thur,Lunch,2,0.266312,0.159328,0.03991
Male,181,23.33,5.65,Male,Yes,Sun,Dinner,2,0.242177,0.163196,0.085119


#### Suppressing the group keys

In [65]:
tips.groupby(['sex', 'smoker'], group_keys=False).apply(top)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_pct
51,10.29,2.6,Female,No,Sun,Dinner,2,0.252672
139,13.16,2.75,Female,No,Thur,Lunch,2,0.208967
18,16.97,3.5,Female,No,Sun,Dinner,3,0.206246
14,14.83,3.02,Female,No,Sun,Dinner,2,0.203641
115,17.31,3.5,Female,No,Sun,Dinner,2,0.202195
178,9.6,4.0,Female,Yes,Sun,Dinner,2,0.416667
67,3.07,1.0,Female,Yes,Sat,Dinner,1,0.325733
109,14.31,4.0,Female,Yes,Sat,Dinner,2,0.279525
93,16.32,4.3,Female,Yes,Fri,Dinner,2,0.26348
221,13.42,3.48,Female,Yes,Fri,Lunch,2,0.259314


### Quantile and bucket analysis

In [68]:
pd.cut(merged['total_bill'], 5)
# no le decimos las fronteras, sino los trozos, nos lo va a cortar en 5

0      (12.618, 22.166]
1       (3.022, 12.618]
2      (12.618, 22.166]
3      (22.166, 31.714]
4      (22.166, 31.714]
             ...       
234    (12.618, 22.166]
236     (3.022, 12.618]
237    (31.714, 41.262]
240    (22.166, 31.714]
241    (22.166, 31.714]
Name: total_bill, Length: 244, dtype: category
Categories (5, interval[float64]): [(3.022, 12.618] < (12.618, 22.166] < (22.166, 31.714] < (31.714, 41.262] < (41.262, 50.81]]

In [73]:
merged.groupby(pd.cut(merged['total_bill'], 5))['tip_pct'].agg([np.mean, np.std])

Unnamed: 0_level_0,mean,std
total_bill,Unnamed: 1_level_1,Unnamed: 2_level_1
"(3.022, 12.618]",0.190854,0.094609
"(12.618, 22.166]",0.163942,0.041264
"(22.166, 31.714]",0.143799,0.051131
"(31.714, 41.262]",0.12153,0.042792
"(41.262, 50.81]",0.125121,0.05265


In [74]:
merged['size'] > 2

0      False
1       True
2       True
3      False
4       True
       ...  
234    False
236    False
237    False
240    False
241    False
Name: size, Length: 244, dtype: bool

In [76]:
merged.groupby(merged['size'] > 2)['tip_pct'].agg([np.mean, np.std])

Unnamed: 0_level_0,mean,std
size,Unnamed: 1_level_1,Unnamed: 2_level_1
False,0.167009,0.067404
True,0.148982,0.04473


### Example: Filling missing values with group-specific values

## Pivot tables and Cross-tabulation

In [77]:
tips

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_pct
0,16.99,1.01,Female,No,Sun,Dinner,2,0.059447
1,10.34,1.66,Male,No,Sun,Dinner,3,0.160542
2,21.01,3.50,Male,No,Sun,Dinner,3,0.166587
3,23.68,3.31,Male,No,Sun,Dinner,2,0.139780
4,24.59,3.61,Female,No,Sun,Dinner,4,0.146808
...,...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3,0.203927
240,27.18,2.00,Female,Yes,Sat,Dinner,2,0.073584
241,22.67,2.00,Male,Yes,Sat,Dinner,2,0.088222
242,17.82,1.75,Male,No,Sat,Dinner,2,0.098204


In [80]:
tips.pivot_table(index='size', columns='smoker', values='tip_pct')

smoker,No,Yes
size,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.159829,0.274755
2,0.164996,0.166706
3,0.149671,0.157543
4,0.147604,0.142036
5,0.178415,0.086116
6,0.156229,


In [81]:
tips.pivot_table(index='size', columns=['smoker','time'], values='tip_pct')
# se agregan por la media por defecto

smoker,No,No,Yes,Yes
time,Dinner,Lunch,Dinner,Lunch
size,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,0.137931,0.181728,0.325733,0.223776
2,0.164383,0.166005,0.167246,0.165266
3,0.153705,0.118742,0.148061,0.204952
4,0.148737,0.138919,0.139064,0.15541
5,0.206928,0.121389,0.086116,
6,0.103799,0.173706,,


In [82]:
tips.pivot_table(index='size', columns=['smoker','time'], values='total_bill', aggfunc='sum')

smoker,No,No,Yes,Yes
time,Dinner,Lunch,Dinner,Lunch
size,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,7.25,10.07,3.07,8.58
2,899.48,481.33,892.3,292.78
3,488.81,57.44,303.12,35.18
4,635.89,86.11,273.06,63.64
5,50.54,41.19,58.61,
6,48.17,91.15,,
