# 10.1 GroupBy Mechanics

split-apply-combine

In [1]:
import pandas as pd
import numpy as np

df = pd.DataFrame({'key1':['a','a','b','b','a'],
                   'key2':['one','two','one','two','one'],
                   'data1':np.random.randn(5),
                   'data2':np.random.randn(5)})
df

Unnamed: 0,key1,key2,data1,data2
0,a,one,-0.634556,0.565912
1,a,two,0.592006,0.621098
2,b,one,1.284634,0.689309
3,b,two,0.156312,1.378085
4,a,one,-0.011333,-0.466078


In [2]:
grouped = df['data1'].groupby(df['key1'])
grouped

<pandas.core.groupby.generic.SeriesGroupBy object at 0x118b04f10>

In [3]:
grouped.mean()

key1
a   -0.017961
b    0.720473
Name: data1, dtype: float64

In [5]:
means = df['data1'].groupby([df['key1'], df['key2']]).mean()
means

key1  key2
a     one    -0.322945
      two     0.592006
b     one     1.284634
      two     0.156312
Name: data1, dtype: float64

In [6]:
means.unstack()

key2,one,two
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,-0.322945,0.592006
b,1.284634,0.156312


In [7]:
states = np.array(['Ohio','California','California','Ohio','Ohio'])
years = np.array([2005,2005,2006,2005,2006])

In [8]:
states

array(['Ohio', 'California', 'California', 'Ohio', 'Ohio'], dtype='<U10')

In [9]:
years

array([2005, 2005, 2006, 2005, 2006])

In [10]:
df['data1'].groupby([states, years]).mean()

California  2005    0.592006
            2006    1.284634
Ohio        2005   -0.239122
            2006   -0.011333
Name: data1, dtype: float64

In [11]:
df.groupby('key1').mean()

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,-0.017961,0.240311
b,0.720473,1.033697


In [12]:
df.groupby(df['key1']).mean()

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,-0.017961,0.240311
b,0.720473,1.033697


In [13]:
df.groupby(['key1', 'key2']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,-0.322945,0.049917
a,two,0.592006,0.621098
b,one,1.284634,0.689309
b,two,0.156312,1.378085


In [14]:
df.groupby([df['key1'], df['key2']]).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,-0.322945,0.049917
a,two,0.592006,0.621098
b,one,1.284634,0.689309
b,two,0.156312,1.378085


In [15]:
df.groupby(['key1','key2']).size()

key1  key2
a     one     2
      two     1
b     one     1
      two     1
dtype: int64

## 10.1.1 Iterating Over Groups

In [16]:
for name, group in df.groupby('key1'):
    print(name)
    print(group)

a
  key1 key2     data1     data2
0    a  one -0.634556  0.565912
1    a  two  0.592006  0.621098
4    a  one -0.011333 -0.466078
b
  key1 key2     data1     data2
2    b  one  1.284634  0.689309
3    b  two  0.156312  1.378085


In [17]:
for (k1,k2), group in df.groupby(['key1','key2']):
    print((k1,k2))
    print(group)

('a', 'one')
  key1 key2     data1     data2
0    a  one -0.634556  0.565912
4    a  one -0.011333 -0.466078
('a', 'two')
  key1 key2     data1     data2
1    a  two  0.592006  0.621098
('b', 'one')
  key1 key2     data1     data2
2    b  one  1.284634  0.689309
('b', 'two')
  key1 key2     data1     data2
3    b  two  0.156312  1.378085


In [18]:
pieces = dict(list(df.groupby('key1')))
pieces

{'a':   key1 key2     data1     data2
 0    a  one -0.634556  0.565912
 1    a  two  0.592006  0.621098
 4    a  one -0.011333 -0.466078,
 'b':   key1 key2     data1     data2
 2    b  one  1.284634  0.689309
 3    b  two  0.156312  1.378085}

In [19]:
pieces['a']

Unnamed: 0,key1,key2,data1,data2
0,a,one,-0.634556,0.565912
1,a,two,0.592006,0.621098
4,a,one,-0.011333,-0.466078


In [20]:
pieces['b']

Unnamed: 0,key1,key2,data1,data2
2,b,one,1.284634,0.689309
3,b,two,0.156312,1.378085


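By default, groupby groups on axis = 0 (the rows). Passing axis = 1 groups the columns instead, as the cells below do using the column dtypes.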

In [22]:
df.dtypes

key1      object
key2      object
data1    float64
data2    float64
dtype: object

In [23]:
grouped = df.groupby(df.dtypes, axis = 1)
grouped

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x118b50b50>

In [24]:
dict(list(grouped))

{dtype('float64'):       data1     data2
 0 -0.634556  0.565912
 1  0.592006  0.621098
 2  1.284634  0.689309
 3  0.156312  1.378085
 4 -0.011333 -0.466078,
 dtype('O'):   key1 key2
 0    a  one
 1    a  two
 2    b  one
 3    b  two
 4    a  one}

In [33]:
for dtype, group in grouped:
    print(dtype)
    print(group)

float64
      data1     data2
0 -0.634556  0.565912
1  0.592006  0.621098
2  1.284634  0.689309
3  0.156312  1.378085
4 -0.011333 -0.466078
object
  key1 key2
0    a  one
1    a  two
2    b  one
3    b  two
4    a  one


## 10.1.2 Selecting a Column or Subset of Columns

In [34]:
df

Unnamed: 0,key1,key2,data1,data2
0,a,one,-0.634556,0.565912
1,a,two,0.592006,0.621098
2,b,one,1.284634,0.689309
3,b,two,0.156312,1.378085
4,a,one,-0.011333,-0.466078


In [38]:
dict(list(df.groupby('key1')))

{'a':   key1 key2     data1     data2
 0    a  one -0.634556  0.565912
 1    a  two  0.592006  0.621098
 4    a  one -0.011333 -0.466078,
 'b':   key1 key2     data1     data2
 2    b  one  1.284634  0.689309
 3    b  two  0.156312  1.378085}

In [37]:
dict(list(df.groupby('key1')['data1']))

{'a': 0   -0.634556
 1    0.592006
 4   -0.011333
 Name: data1, dtype: float64,
 'b': 2    1.284634
 3    0.156312
 Name: data1, dtype: float64}

In [40]:
df.groupby('key1')['data1']

<pandas.core.groupby.generic.SeriesGroupBy object at 0x118b50b10>

In [41]:
df['data1'].groupby(df['key1'])

<pandas.core.groupby.generic.SeriesGroupBy object at 0x118bd5890>

In [42]:
df.groupby('key1')[['data2']]

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x118bd5b50>

In [43]:
df[['data2']].groupby(df['key1'])

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x118bf2c90>

In [45]:
df.groupby(['key1', 'key2'])[['data2']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data2
key1,key2,Unnamed: 2_level_1
a,one,0.049917
a,two,0.621098
b,one,0.689309
b,two,1.378085


In [46]:
df.groupby(['key1', 'key2']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,-0.322945,0.049917
a,two,0.592006,0.621098
b,one,1.284634,0.689309
b,two,0.156312,1.378085


In [47]:
df.groupby(['key1', 'key2'])['data2'].mean()

key1  key2
a     one     0.049917
      two     0.621098
b     one     0.689309
      two     1.378085
Name: data2, dtype: float64

In [48]:
s_grouped = df.groupby(['key1','key2'])['data2']
s_grouped

<pandas.core.groupby.generic.SeriesGroupBy object at 0x118bfe710>

In [49]:
s_grouped.mean()

key1  key2
a     one     0.049917
      two     0.621098
b     one     0.689309
      two     1.378085
Name: data2, dtype: float64

## 10.1.3 Grouping with Dicts and Series

In [50]:
people = pd.DataFrame(np.random.randn(5,5),
                      columns = ['a','b','c','d','e'],
                      index = ['Joe','Steve','Wes','Jim','Travis'])
people

Unnamed: 0,a,b,c,d,e
Joe,-1.169936,-0.449308,-0.743672,-2.491319,-0.250818
Steve,-1.203731,-0.452507,1.261291,0.867454,-2.257059
Wes,-0.992384,-1.685322,-0.342276,-0.487069,0.862144
Jim,-1.280992,0.13585,0.281353,-1.193667,0.696817
Travis,0.332474,0.283988,-1.201285,0.070342,1.666336


In [51]:
people.iloc[2:3,[1,2]] = np.nan
people

Unnamed: 0,a,b,c,d,e
Joe,-1.169936,-0.449308,-0.743672,-2.491319,-0.250818
Steve,-1.203731,-0.452507,1.261291,0.867454,-2.257059
Wes,-0.992384,,,-0.487069,0.862144
Jim,-1.280992,0.13585,0.281353,-1.193667,0.696817
Travis,0.332474,0.283988,-1.201285,0.070342,1.666336


In [52]:
mapping = {'a':'red','b':'red','c':'blue','d':'blue','e':'red','f':'orange'}

In [53]:
by_column = people.groupby(mapping, axis = 1)
by_column

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x118bd5190>

In [54]:
dict(list(by_column))

{'blue':                c         d
 Joe    -0.743672 -2.491319
 Steve   1.261291  0.867454
 Wes          NaN -0.487069
 Jim     0.281353 -1.193667
 Travis -1.201285  0.070342,
 'red':                a         b         e
 Joe    -1.169936 -0.449308 -0.250818
 Steve  -1.203731 -0.452507 -2.257059
 Wes    -0.992384       NaN  0.862144
 Jim    -1.280992  0.135850  0.696817
 Travis  0.332474  0.283988  1.666336}

In [55]:
by_column.sum()

Unnamed: 0,blue,red
Joe,-3.234991,-1.870063
Steve,2.128746,-3.913296
Wes,-0.487069,-0.13024
Jim,-0.912314,-0.448325
Travis,-1.130943,2.282798


In [56]:
mapping

{'a': 'red', 'b': 'red', 'c': 'blue', 'd': 'blue', 'e': 'red', 'f': 'orange'}

In [57]:
map_series = pd.Series(mapping)
map_series

a       red
b       red
c      blue
d      blue
e       red
f    orange
dtype: object

In [58]:
people.groupby(map_series, axis = 1).count()

Unnamed: 0,blue,red
Joe,2,3
Steve,2,3
Wes,1,2
Jim,2,3
Travis,2,3


## 10.1.4 Grouping with Functions

Any function passed as a group key will be called once per index value, with the return values being used as the group names

In [62]:
people

Unnamed: 0,a,b,c,d,e
Joe,-1.169936,-0.449308,-0.743672,-2.491319,-0.250818
Steve,-1.203731,-0.452507,1.261291,0.867454,-2.257059
Wes,-0.992384,,,-0.487069,0.862144
Jim,-1.280992,0.13585,0.281353,-1.193667,0.696817
Travis,0.332474,0.283988,-1.201285,0.070342,1.666336


In [59]:
people.groupby(len).sum()

Unnamed: 0,a,b,c,d,e
3,-3.443313,-0.313458,-0.462319,-4.172054,1.308143
5,-1.203731,-0.452507,1.261291,0.867454,-2.257059
6,0.332474,0.283988,-1.201285,0.070342,1.666336


In [60]:
people.groupby(len)

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x118bfef50>

In [61]:
dict(list(people.groupby(len)))

{3:             a         b         c         d         e
 Joe -1.169936 -0.449308 -0.743672 -2.491319 -0.250818
 Wes -0.992384       NaN       NaN -0.487069  0.862144
 Jim -1.280992  0.135850  0.281353 -1.193667  0.696817,
 5:               a         b         c         d         e
 Steve -1.203731 -0.452507  1.261291  0.867454 -2.257059,
 6:                a         b         c         d         e
 Travis  0.332474  0.283988 -1.201285  0.070342  1.666336}

In [63]:
key_list = ['one','one','one','two','two']
people.groupby([len, key_list]).min()

Unnamed: 0,Unnamed: 1,a,b,c,d,e
3,one,-1.169936,-0.449308,-0.743672,-2.491319,-0.250818
3,two,-1.280992,0.13585,0.281353,-1.193667,0.696817
5,one,-1.203731,-0.452507,1.261291,0.867454,-2.257059
6,two,0.332474,0.283988,-1.201285,0.070342,1.666336


## 10.1.5 Grouping by Index Levels

In [64]:
columns = pd.MultiIndex.from_arrays([['US','US','US','JP','JP'],
                                     [1,3,5,1,3]],
                                    names = ['cty','tenor'])
columns

MultiIndex([('US', 1),
            ('US', 3),
            ('US', 5),
            ('JP', 1),
            ('JP', 3)],
           names=['cty', 'tenor'])

In [65]:
hier_df = pd.DataFrame(np.random.randn(4,5), columns = columns)
hier_df

cty,US,US,US,JP,JP
tenor,1,3,5,1,3
0,0.123077,-1.338536,0.687923,-1.209345,-0.068366
1,-1.190118,0.072096,-1.201864,-0.675846,-1.262594
2,-0.014121,-0.867095,1.467005,1.491336,1.059492
3,0.774304,-0.717434,-0.367379,-0.154772,0.16897


In [67]:
hier_df.groupby(level = 'cty', axis = 1).count()

cty,JP,US
0,2,3
1,2,3
2,2,3
3,2,3


In [68]:
hier_df.groupby(level = 'tenor', axis = 1).count()

tenor,1,3,5
0,2,2,1
1,2,2,1
2,2,2,1
3,2,2,1


# 10.2 Data Aggregation

In [69]:
df

Unnamed: 0,key1,key2,data1,data2
0,a,one,-0.634556,0.565912
1,a,two,0.592006,0.621098
2,b,one,1.284634,0.689309
3,b,two,0.156312,1.378085
4,a,one,-0.011333,-0.466078


In [70]:
grouped = df.groupby('key1')
grouped

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x118d72a10>

In [71]:
dict(list(grouped))

{'a':   key1 key2     data1     data2
 0    a  one -0.634556  0.565912
 1    a  two  0.592006  0.621098
 4    a  one -0.011333 -0.466078,
 'b':   key1 key2     data1     data2
 2    b  one  1.284634  0.689309
 3    b  two  0.156312  1.378085}

In [72]:
grouped['data1'].quantile(0.9)

key1
a    0.471338
b    1.171802
Name: data1, dtype: float64

In [73]:
grouped['data1'].quantile(0.5)

key1
a   -0.011333
b    0.720473
Name: data1, dtype: float64

In [74]:
grouped['data1'].median()

key1
a   -0.011333
b    0.720473
Name: data1, dtype: float64

In [75]:
def peak_to_peak(arr):
    """Return the spread of ``arr``: its maximum minus its minimum.

    ``arr`` is any object exposing ``max()`` and ``min()`` methods, such as
    the per-group pieces that ``agg`` passes in.
    """
    highest = arr.max()
    lowest = arr.min()
    return highest - lowest

In [76]:
grouped.agg(peak_to_peak)

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,1.226563,1.087176
b,1.128322,0.688776


In [77]:
grouped.agg(min)

Unnamed: 0_level_0,key2,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,-0.634556,-0.466078
b,one,0.156312,0.689309


In [78]:
grouped.min()

Unnamed: 0_level_0,key2,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,-0.634556,-0.466078
b,one,0.156312,0.689309


In [83]:
grouped.describe()

Unnamed: 0_level_0,data1,data1,data1,data1,data1,data1,data1,data1,data2,data2,data2,data2,data2,data2,data2,data2
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
key1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
a,3.0,-0.017961,0.613308,-0.634556,-0.322945,-0.011333,0.290337,0.592006,3.0,0.240311,0.612372,-0.466078,0.049917,0.565912,0.593505,0.621098
b,2.0,0.720473,0.797844,0.156312,0.438392,0.720473,1.002553,1.284634,2.0,1.033697,0.487038,0.689309,0.861503,1.033697,1.205891,1.378085


In [82]:
grouped.describe().stack()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
a,count,3.0,3.0
a,mean,-0.017961,0.240311
a,std,0.613308,0.612372
a,min,-0.634556,-0.466078
a,25%,-0.322945,0.049917
a,50%,-0.011333,0.565912
a,75%,0.290337,0.593505
a,max,0.592006,0.621098
b,count,2.0,2.0
b,mean,0.720473,1.033697


## 10.2.1 Column-Wise and Multiple Function Application

In [84]:
# Load the restaurant tips dataset from CSV and preview its first rows.
# NOTE(review): this is an absolute, machine-specific path, so the cell only
# runs where that directory exists; consider a path relative to the notebook.
tips = pd.read_csv('/Users/boyuan/Desktop/OneDrive/Data Science/Python/Python for data analysis 2nd/examples/tips.csv')
tips.head()

Unnamed: 0,total_bill,tip,smoker,day,time,size
0,16.99,1.01,No,Sun,Dinner,2
1,10.34,1.66,No,Sun,Dinner,3
2,21.01,3.5,No,Sun,Dinner,3
3,23.68,3.31,No,Sun,Dinner,2
4,24.59,3.61,No,Sun,Dinner,4


In [85]:
tips['tip_pct'] = tips['tip'] / tips['total_bill']
tips[:6]

Unnamed: 0,total_bill,tip,smoker,day,time,size,tip_pct
0,16.99,1.01,No,Sun,Dinner,2,0.059447
1,10.34,1.66,No,Sun,Dinner,3,0.160542
2,21.01,3.5,No,Sun,Dinner,3,0.166587
3,23.68,3.31,No,Sun,Dinner,2,0.13978
4,24.59,3.61,No,Sun,Dinner,4,0.146808
5,25.29,4.71,No,Sun,Dinner,4,0.18624


In [86]:
grouped = tips.groupby(['day', 'smoker'])
grouped

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x118de4fd0>

In [87]:
grouped_pct = grouped['tip_pct']

In [88]:
grouped_pct.agg('mean')

day   smoker
Fri   No        0.151650
      Yes       0.174783
Sat   No        0.158048
      Yes       0.147906
Sun   No        0.160113
      Yes       0.187250
Thur  No        0.160298
      Yes       0.163863
Name: tip_pct, dtype: float64

In [90]:
grouped_pct.mean()

day   smoker
Fri   No        0.151650
      Yes       0.174783
Sat   No        0.158048
      Yes       0.147906
Sun   No        0.160113
      Yes       0.187250
Thur  No        0.160298
      Yes       0.163863
Name: tip_pct, dtype: float64

In [91]:
grouped_pct.agg(['mean', 'std', peak_to_peak])

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,std,peak_to_peak
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Fri,No,0.15165,0.028123,0.067349
Fri,Yes,0.174783,0.051293,0.159925
Sat,No,0.158048,0.039767,0.235193
Sat,Yes,0.147906,0.061375,0.290095
Sun,No,0.160113,0.042347,0.193226
Sun,Yes,0.18725,0.154134,0.644685
Thur,No,0.160298,0.038774,0.19335
Thur,Yes,0.163863,0.039389,0.15124


Pass a list of (name, function) tuples to control the names of the resulting columns.

In [92]:
grouped_pct.agg([('foo', 'mean'), ('bar', np.std)])

Unnamed: 0_level_0,Unnamed: 1_level_0,foo,bar
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1
Fri,No,0.15165,0.028123
Fri,Yes,0.174783,0.051293
Sat,No,0.158048,0.039767
Sat,Yes,0.147906,0.061375
Sun,No,0.160113,0.042347
Sun,Yes,0.18725,0.154134
Thur,No,0.160298,0.038774
Thur,Yes,0.163863,0.039389


In [97]:
# Apply the same list of aggregations to two columns at once; the result has
# hierarchical columns (selected column on the outer level, function name inside).
functions = ['count', 'mean', 'max']
# Select several columns with a list: indexing a GroupBy with a bare tuple of
# column names is deprecated and is rejected by newer pandas releases.
result = grouped[['tip_pct', 'total_bill']].agg(functions)
result

  


Unnamed: 0_level_0,Unnamed: 1_level_0,tip_pct,tip_pct,tip_pct,total_bill,total_bill,total_bill
Unnamed: 0_level_1,Unnamed: 1_level_1,count,mean,max,count,mean,max
day,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
Fri,No,4,0.15165,0.187735,4,18.42,22.75
Fri,Yes,15,0.174783,0.26348,15,16.813333,40.17
Sat,No,45,0.158048,0.29199,45,19.661778,48.33
Sat,Yes,42,0.147906,0.325733,42,21.276667,50.81
Sun,No,57,0.160113,0.252672,57,20.506667,48.17
Sun,Yes,19,0.18725,0.710345,19,24.12,45.35
Thur,No,45,0.160298,0.266312,45,17.113111,41.19
Thur,Yes,17,0.163863,0.241255,17,19.190588,43.11


In [98]:
result['tip_pct']

Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean,max
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Fri,No,4,0.15165,0.187735
Fri,Yes,15,0.174783,0.26348
Sat,No,45,0.158048,0.29199
Sat,Yes,42,0.147906,0.325733
Sun,No,57,0.160113,0.252672
Sun,Yes,19,0.18725,0.710345
Thur,No,45,0.160298,0.266312
Thur,Yes,17,0.163863,0.241255


In [101]:
# (name, function) pairs: each name becomes the label of the aggregated column.
ftuples = [('Durchschnitt','mean'),('Abweichung',np.var)]
# Select several columns with a list: indexing a GroupBy with a bare tuple of
# column names is deprecated and is rejected by newer pandas releases.
grouped[['tip_pct','total_bill']].agg(ftuples)

  


Unnamed: 0_level_0,Unnamed: 1_level_0,tip_pct,tip_pct,total_bill,total_bill
Unnamed: 0_level_1,Unnamed: 1_level_1,Durchschnitt,Abweichung,Durchschnitt,Abweichung
day,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Fri,No,0.15165,0.000791,18.42,25.596333
Fri,Yes,0.174783,0.002631,16.813333,82.562438
Sat,No,0.158048,0.001581,19.661778,79.908965
Sat,Yes,0.147906,0.003767,21.276667,101.387535
Sun,No,0.160113,0.001793,20.506667,66.09998
Sun,Yes,0.18725,0.023757,24.12,109.046044
Thur,No,0.160298,0.001503,17.113111,59.625081
Thur,Yes,0.163863,0.001551,19.190588,69.808518


Apply different functions to one or more of the columns by passing a dict that maps column names to functions (or lists of functions).

In [102]:
grouped.agg({'tip':np.max, 'size':'sum'})

Unnamed: 0_level_0,Unnamed: 1_level_0,tip,size
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1
Fri,No,3.5,9
Fri,Yes,4.73,31
Sat,No,9.0,115
Sat,Yes,10.0,104
Sun,No,6.0,167
Sun,Yes,6.5,49
Thur,No,6.7,112
Thur,Yes,5.0,40


In [103]:
grouped.agg({'tip_pct':['min','max','mean','std'], 'size':'sum'})

Unnamed: 0_level_0,Unnamed: 1_level_0,tip_pct,tip_pct,tip_pct,tip_pct,size
Unnamed: 0_level_1,Unnamed: 1_level_1,min,max,mean,std,sum
day,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Fri,No,0.120385,0.187735,0.15165,0.028123,9
Fri,Yes,0.103555,0.26348,0.174783,0.051293,31
Sat,No,0.056797,0.29199,0.158048,0.039767,115
Sat,Yes,0.035638,0.325733,0.147906,0.061375,104
Sun,No,0.059447,0.252672,0.160113,0.042347,167
Sun,Yes,0.06566,0.710345,0.18725,0.154134,49
Thur,No,0.072961,0.266312,0.160298,0.038774,112
Thur,Yes,0.090014,0.241255,0.163863,0.039389,40


## 10.2.2 Returning Aggregated Data Without Row Indexes 

In [106]:
tips.groupby(['day', 'smoker']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,tip,size,tip_pct
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Fri,No,18.42,2.8125,2.25,0.15165
Fri,Yes,16.813333,2.714,2.066667,0.174783
Sat,No,19.661778,3.102889,2.555556,0.158048
Sat,Yes,21.276667,2.875476,2.47619,0.147906
Sun,No,20.506667,3.167895,2.929825,0.160113
Sun,Yes,24.12,3.516842,2.578947,0.18725
Thur,No,17.113111,2.673778,2.488889,0.160298
Thur,Yes,19.190588,3.03,2.352941,0.163863


In [105]:
tips.groupby(['day', 'smoker'], as_index = False).mean()

Unnamed: 0,day,smoker,total_bill,tip,size,tip_pct
0,Fri,No,18.42,2.8125,2.25,0.15165
1,Fri,Yes,16.813333,2.714,2.066667,0.174783
2,Sat,No,19.661778,3.102889,2.555556,0.158048
3,Sat,Yes,21.276667,2.875476,2.47619,0.147906
4,Sun,No,20.506667,3.167895,2.929825,0.160113
5,Sun,Yes,24.12,3.516842,2.578947,0.18725
6,Thur,No,17.113111,2.673778,2.488889,0.160298
7,Thur,Yes,19.190588,3.03,2.352941,0.163863


# 10.3 Apply: General split-apply-combine

In [107]:
def top(df, n = 5, column = 'tip_pct'):
    """Return the ``n`` rows of ``df`` with the largest values in ``column``.

    Parameters
    ----------
    df : DataFrame to select rows from.
    n : number of rows to keep (default 5).
    column : name of the column to rank by (default ``'tip_pct'``).

    Returns
    -------
    DataFrame holding the last ``n`` rows of ``df`` after an ascending sort on
    ``column``, so the largest value is in the final row.
    """
    # tail(n) matches the slice [-n:] for every non-zero n, but returns an
    # empty frame for n == 0 where [-0:] would return every row.
    return df.sort_values(by = column).tail(n)

top(tips, n = 6)

Unnamed: 0,total_bill,tip,smoker,day,time,size,tip_pct
109,14.31,4.0,Yes,Sat,Dinner,2,0.279525
183,23.17,6.5,Yes,Sun,Dinner,4,0.280535
232,11.61,3.39,No,Sat,Dinner,2,0.29199
67,3.07,1.0,Yes,Sat,Dinner,1,0.325733
178,9.6,4.0,Yes,Sun,Dinner,2,0.416667
172,7.25,5.15,Yes,Sun,Dinner,2,0.710345


The top function is called on each row group from the DataFrame, and then the results are glued together using pandas.concat, labeling the pieces with the group names. The result therefore has a hierarchical index whose inner level contains index values from the original DataFrame

In [108]:
tips.groupby('smoker').apply(top)

Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,tip,smoker,day,time,size,tip_pct
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
No,88,24.71,5.85,No,Thur,Lunch,2,0.236746
No,185,20.69,5.0,No,Sun,Dinner,5,0.241663
No,51,10.29,2.6,No,Sun,Dinner,2,0.252672
No,149,7.51,2.0,No,Thur,Lunch,2,0.266312
No,232,11.61,3.39,No,Sat,Dinner,2,0.29199
Yes,109,14.31,4.0,Yes,Sat,Dinner,2,0.279525
Yes,183,23.17,6.5,Yes,Sun,Dinner,4,0.280535
Yes,67,3.07,1.0,Yes,Sat,Dinner,1,0.325733
Yes,178,9.6,4.0,Yes,Sun,Dinner,2,0.416667
Yes,172,7.25,5.15,Yes,Sun,Dinner,2,0.710345


In [109]:
tips.groupby(['smoker', 'day']).apply(top, n = 1, column = 'total_bill')

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,total_bill,tip,smoker,day,time,size,tip_pct
smoker,day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
No,Fri,94,22.75,3.25,No,Fri,Dinner,2,0.142857
No,Sat,212,48.33,9.0,No,Sat,Dinner,4,0.18622
No,Sun,156,48.17,5.0,No,Sun,Dinner,6,0.103799
No,Thur,142,41.19,5.0,No,Thur,Lunch,5,0.121389
Yes,Fri,95,40.17,4.73,Yes,Fri,Dinner,4,0.11775
Yes,Sat,170,50.81,10.0,Yes,Sat,Dinner,3,0.196812
Yes,Sun,182,45.35,3.5,Yes,Sun,Dinner,3,0.077178
Yes,Thur,197,43.11,5.0,Yes,Thur,Lunch,4,0.115982


In [110]:
result = tips.groupby('smoker')['tip_pct'].describe()
result

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
No,151.0,0.159328,0.03991,0.056797,0.136906,0.155625,0.185014,0.29199
Yes,93.0,0.163196,0.085119,0.035638,0.106771,0.153846,0.195059,0.710345


In [111]:
result.unstack('smoker')

       smoker
count  No        151.000000
       Yes        93.000000
mean   No          0.159328
       Yes         0.163196
std    No          0.039910
       Yes         0.085119
min    No          0.056797
       Yes         0.035638
25%    No          0.136906
       Yes         0.106771
50%    No          0.155625
       Yes         0.153846
75%    No          0.185014
       Yes         0.195059
max    No          0.291990
       Yes         0.710345
dtype: float64

In [112]:
f = lambda x: x.describe()
grouped.apply(f)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,total_bill,tip,size,tip_pct
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Fri,No,count,4.000000,4.000000,4.00,4.000000
Fri,No,mean,18.420000,2.812500,2.25,0.151650
Fri,No,std,5.059282,0.898494,0.50,0.028123
Fri,No,min,12.460000,1.500000,2.00,0.120385
Fri,No,25%,15.100000,2.625000,2.00,0.137239
...,...,...,...,...,...,...
Thur,Yes,min,10.340000,2.000000,2.00,0.090014
Thur,Yes,25%,13.510000,2.000000,2.00,0.148038
Thur,Yes,50%,16.470000,2.560000,2.00,0.153846
Thur,Yes,75%,19.810000,4.000000,2.00,0.194837


## 10.3.1 Suppressing the Group Keys

In [113]:
tips.groupby('smoker').apply(top)

Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,tip,smoker,day,time,size,tip_pct
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
No,88,24.71,5.85,No,Thur,Lunch,2,0.236746
No,185,20.69,5.0,No,Sun,Dinner,5,0.241663
No,51,10.29,2.6,No,Sun,Dinner,2,0.252672
No,149,7.51,2.0,No,Thur,Lunch,2,0.266312
No,232,11.61,3.39,No,Sat,Dinner,2,0.29199
Yes,109,14.31,4.0,Yes,Sat,Dinner,2,0.279525
Yes,183,23.17,6.5,Yes,Sun,Dinner,4,0.280535
Yes,67,3.07,1.0,Yes,Sat,Dinner,1,0.325733
Yes,178,9.6,4.0,Yes,Sun,Dinner,2,0.416667
Yes,172,7.25,5.15,Yes,Sun,Dinner,2,0.710345


In [114]:
tips.groupby('smoker', group_keys = False).apply(top)

Unnamed: 0,total_bill,tip,smoker,day,time,size,tip_pct
88,24.71,5.85,No,Thur,Lunch,2,0.236746
185,20.69,5.0,No,Sun,Dinner,5,0.241663
51,10.29,2.6,No,Sun,Dinner,2,0.252672
149,7.51,2.0,No,Thur,Lunch,2,0.266312
232,11.61,3.39,No,Sat,Dinner,2,0.29199
109,14.31,4.0,Yes,Sat,Dinner,2,0.279525
183,23.17,6.5,Yes,Sun,Dinner,4,0.280535
67,3.07,1.0,Yes,Sat,Dinner,1,0.325733
178,9.6,4.0,Yes,Sun,Dinner,2,0.416667
172,7.25,5.15,Yes,Sun,Dinner,2,0.710345


## 10.3.2 Quantile and Bucket Analysis

In [115]:
frame = pd.DataFrame({'data1':np.random.randn(1000),
                      'data2':np.random.randn(1000)})
frame

Unnamed: 0,data1,data2
0,-0.951337,0.931936
1,0.337340,-1.073341
2,-1.787752,0.405062
3,-0.685224,1.218815
4,0.504260,1.784226
...,...,...
995,0.646567,-0.317731
996,0.351880,-1.872836
997,0.615455,0.959999
998,-0.791114,-0.253857


In [116]:
quartiles = pd.cut(frame.data1,4)

In [117]:
quartiles[:10]

0    (-2.307, -0.569]
1     (-0.569, 1.169]
2    (-2.307, -0.569]
3    (-2.307, -0.569]
4     (-0.569, 1.169]
5     (-0.569, 1.169]
6     (-0.569, 1.169]
7      (1.169, 2.907]
8     (-0.569, 1.169]
9     (-0.569, 1.169]
Name: data1, dtype: category
Categories (4, interval[float64]): [(-4.052, -2.307] < (-2.307, -0.569] < (-0.569, 1.169] < (1.169, 2.907]]

In [118]:
quartiles.value_counts()

(-0.569, 1.169]     608
(-2.307, -0.569]    265
(1.169, 2.907]      114
(-4.052, -2.307]     13
Name: data1, dtype: int64

The Categorical object returned by cut can be passed directly to groupby

In [119]:
def get_stats(group):
    """Summarise ``group`` as a dict keyed by 'min', 'max', 'count' and 'mean'.

    Each value is the result of calling the method of the same name on
    ``group``; keys appear in the order listed above.
    """
    stat_names = ('min', 'max', 'count', 'mean')
    return {stat: getattr(group, stat)() for stat in stat_names}

grouped = frame.data2.groupby(quartiles)

In [120]:
grouped.apply(get_stats).unstack()

Unnamed: 0_level_0,min,max,count,mean
data1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"(-4.052, -2.307]",-1.148417,1.461113,13.0,0.392121
"(-2.307, -0.569]",-3.237576,2.789086,265.0,-0.055475
"(-0.569, 1.169]",-3.107295,2.809845,608.0,-0.017979
"(1.169, 2.907]",-2.317704,2.824818,114.0,0.074272


In [121]:
grouping = pd.qcut(frame.data1,10,labels = False)
grouped = frame.data2.groupby(grouping)
grouped.apply(get_stats).unstack()

Unnamed: 0_level_0,min,max,count,mean
data1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,-2.440624,1.80863,100.0,-0.107938
1,-3.237576,2.387552,100.0,-0.093973
2,-1.907374,2.789086,100.0,0.194056
3,-3.107295,2.359523,100.0,0.021092
4,-2.815032,2.809845,100.0,-0.13221
5,-1.963213,2.77406,100.0,0.084735
6,-2.036117,2.101721,100.0,0.004266
7,-2.197724,2.225705,100.0,-0.054352
8,-2.656754,2.824818,100.0,-0.067058
9,-2.317704,2.052427,100.0,0.030707


## 10.3.3 Example: Filling Missing Values with Group-Specific Values

In [122]:
s = pd.Series(np.random.randn(6))
s[::2] = np.nan
s

0         NaN
1   -0.408359
2         NaN
3    0.593828
4         NaN
5    1.586150
dtype: float64

In [123]:
s.fillna(s.mean())

0    0.590540
1   -0.408359
2    0.590540
3    0.593828
4    0.590540
5    1.586150
dtype: float64

In [124]:
states = ['Ohio','New York','Vermont','Florida','Oregon','Nevada','California','Idaho']
group_key = ['East'] * 4 + ['West'] * 4
data = pd.Series(np.random.randn(8), index = states)
data

Ohio         -1.034889
New York     -0.520513
Vermont       0.019657
Florida       0.641501
Oregon        0.611152
Nevada       -0.421957
California    0.340604
Idaho         0.763786
dtype: float64

In [125]:
data[['Vermont','Nevada','Idaho']] = np.nan
data

Ohio         -1.034889
New York     -0.520513
Vermont            NaN
Florida       0.641501
Oregon        0.611152
Nevada             NaN
California    0.340604
Idaho              NaN
dtype: float64

In [126]:
data.groupby(group_key).mean()

East   -0.304633
West    0.475878
dtype: float64

In [127]:
def fill_mean(g):
    """Return ``g`` with its missing values replaced by the mean of ``g``."""
    group_mean = g.mean()
    return g.fillna(group_mean)
data.groupby(group_key).apply(fill_mean)

Ohio         -1.034889
New York     -0.520513
Vermont      -0.304633
Florida       0.641501
Oregon        0.611152
Nevada        0.475878
California    0.340604
Idaho         0.475878
dtype: float64

In [128]:
# Fixed replacement value for each group, keyed by group name.
fill_values = {'East':0.5, 'West': -1}

def fill_func(g):
    """Fill missing values in ``g`` with the constant registered for its group.

    The group is identified by ``g.name``, which must be a key of
    ``fill_values``.
    """
    replacement = fill_values[g.name]
    return g.fillna(replacement)

data.groupby(group_key).apply(fill_func)

Ohio         -1.034889
New York     -0.520513
Vermont       0.500000
Florida       0.641501
Oregon        0.611152
Nevada       -1.000000
California    0.340604
Idaho        -1.000000
dtype: float64

## 10.3.4 Example: Random Sampling and Permutation

In [129]:
# Build a 52-card deck as a Series: index = card label (rank + suit letter),
# value = the card's point value.
suits = ['H','S','C','D']
# Per suit: ranks 1-10 count face value and the three face cards count 10 each;
# the pattern is repeated once per suit.
card_val = (list(range(1,11)) + [10] * 3) * len(suits)
base_names = ['A'] + list(range(2,11)) + ['J','K','Q']
cards = []
# Iterate over `suits` rather than a second hard-coded copy of the suit list,
# so labels and values cannot drift apart.
for suit in suits:
    cards.extend(str(num) + suit for num in base_names)

deck = pd.Series(card_val, index = cards)

In [130]:
deck

AH      1
2H      2
3H      3
4H      4
5H      5
6H      6
7H      7
8H      8
9H      9
10H    10
JH     10
KH     10
QH     10
AS      1
2S      2
3S      3
4S      4
5S      5
6S      6
7S      7
8S      8
9S      9
10S    10
JS     10
KS     10
QS     10
AC      1
2C      2
3C      3
4C      4
5C      5
6C      6
7C      7
8C      8
9C      9
10C    10
JC     10
KC     10
QC     10
AD      1
2D      2
3D      3
4D      4
5D      5
6D      6
7D      7
8D      8
9D      9
10D    10
JD     10
KD     10
QD     10
dtype: int64

In [131]:
deck[:13]

AH      1
2H      2
3H      3
4H      4
5H      5
6H      6
7H      7
8H      8
9H      9
10H    10
JH     10
KH     10
QH     10
dtype: int64

In [132]:
def draw(deck, n = 5, random_state = None):
    """Draw ``n`` cards from ``deck`` at random, without replacement.

    Parameters
    ----------
    deck : Series of cards to sample from.
    n : number of cards to draw (default 5).
    random_state : optional seed or random generator forwarded to
        ``Series.sample``; pass a fixed value to make the draw reproducible.
        The default ``None`` keeps the original unseeded behaviour.

    Returns
    -------
    Series containing the ``n`` sampled entries of ``deck``.
    """
    return deck.sample(n, random_state = random_state)

draw(deck)

6C      6
8C      8
4H      4
10H    10
5C      5
dtype: int64

In [133]:
def get_suit(card):
    """Return the suit of a card label, which is its last character."""
    return card[-1]
deck.groupby(get_suit).apply(draw, n = 2)

C  QC     10
   KC     10
D  4D      4
   8D      8
H  10H    10
   AH      1
S  10S    10
   KS     10
dtype: int64

In [134]:
deck.groupby(get_suit, group_keys = False).apply(draw, n = 2)

10C    10
7C      7
9D      9
7D      7
2H      2
3H      3
3S      3
AS      1
dtype: int64

## 10.3.5 Example: Group Weighted Average and Correlation

In [135]:
df = pd.DataFrame({'category':['a','a','a','a','b','b','b','b'],
                   'data':np.random.randn(8),
                   'weights':np.random.rand(8)})
df

Unnamed: 0,category,data,weights
0,a,0.011696,0.189876
1,a,-0.43664,0.796877
2,a,1.568597,0.858576
3,a,0.788607,0.443795
4,b,-0.297734,0.051795
5,b,0.203055,0.034829
6,b,-0.587603,0.671134
7,b,0.163557,0.362038


In [136]:
grouped = df.groupby('category')

In [137]:
def get_wavg(g):
    """Return the average of ``g['data']`` weighted by ``g['weights']``."""
    values = g['data']
    weights = g['weights']
    return np.average(values, weights = weights)
grouped.apply(get_wavg)

category
a    0.590188
b   -0.306748
dtype: float64

In [138]:
# Load the stock price table, using the first CSV column as the index and
# parsing it as dates, then print a structural summary of the frame.
# NOTE(review): this is an absolute, machine-specific path, so the cell only
# runs where that directory exists; consider a path relative to the notebook.
close_px = pd.read_csv('/Users/boyuan/Desktop/OneDrive/Data Science/Python/Python for data analysis 2nd/examples/stock_px_2.csv',
                       parse_dates = True,
                       index_col = 0)
close_px.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 2214 entries, 2003-01-02 to 2011-10-14
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   AAPL    2214 non-null   float64
 1   MSFT    2214 non-null   float64
 2   XOM     2214 non-null   float64
 3   SPX     2214 non-null   float64
dtypes: float64(4)
memory usage: 86.5 KB


In [139]:
close_px[-4:]

Unnamed: 0,AAPL,MSFT,XOM,SPX
2011-10-11,400.29,27.0,76.27,1195.54
2011-10-12,402.19,26.96,77.16,1207.25
2011-10-13,408.43,27.18,76.37,1203.66
2011-10-14,422.0,27.27,78.11,1224.58


In [140]:
def spx_corr(x):
    """Correlate every column of ``x`` with its ``'SPX'`` column."""
    benchmark = x['SPX']
    return x.corrwith(benchmark)
rets = close_px.pct_change().dropna()

In [141]:
def get_year(x):
    """Return the ``year`` attribute of ``x`` (used here as a group key)."""
    return x.year
by_year = rets.groupby(get_year)
by_year.apply(spx_corr)

Unnamed: 0,AAPL,MSFT,XOM,SPX
2003,0.541124,0.745174,0.661265,1.0
2004,0.374283,0.588531,0.557742,1.0
2005,0.46754,0.562374,0.63101,1.0
2006,0.428267,0.406126,0.518514,1.0
2007,0.508118,0.65877,0.786264,1.0
2008,0.681434,0.804626,0.828303,1.0
2009,0.707103,0.654902,0.797921,1.0
2010,0.710105,0.730118,0.839057,1.0
2011,0.691931,0.800996,0.859975,1.0


In [142]:
by_year.apply(lambda g: g['AAPL'].corr(g['MSFT']))

2003    0.480868
2004    0.259024
2005    0.300093
2006    0.161735
2007    0.417738
2008    0.611901
2009    0.432738
2010    0.571946
2011    0.581987
dtype: float64

## 10.3.6 Example: Group-Wise Linear Regression 

In [144]:
import statsmodels.api as sm

def regress(data, yvar, xvars):
    """Fit an OLS regression of ``yvar`` on ``xvars`` plus a constant term.

    Parameters
    ----------
    data : DataFrame
        Observations; must contain the ``yvar`` and ``xvars`` columns.
    yvar : str
        Name of the dependent-variable column.
    xvars : str or list of str
        Name(s) of the explanatory-variable column(s).

    Returns
    -------
    The ``params`` attribute of the fitted statsmodels OLS result: one
    coefficient per entry of ``xvars`` plus one for ``'intercept'``.
    """
    if isinstance(xvars, str):
        # A bare column name would select a Series; wrap it so X stays 2-D.
        xvars = [xvars]
    Y = data[yvar]
    # assign() builds a new frame, so the constant column is never written
    # into a selection of ``data`` (the in-place write this replaces
    # triggered SettingWithCopyWarning).
    X = data[xvars].assign(intercept=1.)
    result = sm.OLS(Y, X).fit()
    return result.params

In [145]:
# Extra arguments given to apply() are forwarded to regress for every yearly group.
by_year.apply(regress, yvar='AAPL', xvars=['SPX'])

Unnamed: 0,SPX,intercept
2003,1.195406,0.00071
2004,1.363463,0.004201
2005,1.766415,0.003246
2006,1.645496,8e-05
2007,1.198761,0.003438
2008,0.968016,-0.00111
2009,0.879103,0.002954
2010,1.052608,0.001261
2011,0.806605,0.001514


# 10.4 Pivot Tables and Cross-Tabulation

In [146]:
# Show the first five rows of the tips table.
tips.head(5)

Unnamed: 0,total_bill,tip,smoker,day,time,size,tip_pct
0,16.99,1.01,No,Sun,Dinner,2,0.059447
1,10.34,1.66,No,Sun,Dinner,3,0.160542
2,21.01,3.5,No,Sun,Dinner,3,0.166587
3,23.68,3.31,No,Sun,Dinner,2,0.13978
4,24.59,3.61,No,Sun,Dinner,4,0.146808


By default, `pivot_table` aggregates each group with the mean.

In [147]:
# Name the numeric columns explicitly: with no `values`, pivot_table tries to
# average every remaining column, and pandas 2.x raises on the string-valued
# 'time' column instead of silently dropping it.
tips.pivot_table(values=['size', 'tip', 'tip_pct', 'total_bill'],
                 index=['day', 'smoker'])

Unnamed: 0_level_0,Unnamed: 1_level_0,size,tip,tip_pct,total_bill
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Fri,No,2.25,2.8125,0.15165,18.42
Fri,Yes,2.066667,2.714,0.174783,16.813333
Sat,No,2.555556,3.102889,0.158048,19.661778
Sat,Yes,2.47619,2.875476,0.147906,21.276667
Sun,No,2.929825,3.167895,0.160113,20.506667
Sun,Yes,2.578947,3.516842,0.18725,24.12
Thur,No,2.488889,2.673778,0.160298,17.113111
Thur,Yes,2.352941,3.03,0.163863,19.190588


In [148]:
# Aggregate tip_pct and size for each (time, day) row, with one column per smoker value.
tips.pivot_table(
    values=['tip_pct', 'size'],
    index=['time', 'day'],
    columns='smoker',
)

Unnamed: 0_level_0,Unnamed: 1_level_0,size,size,tip_pct,tip_pct
Unnamed: 0_level_1,smoker,No,Yes,No,Yes
time,day,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Dinner,Fri,2.0,2.222222,0.139622,0.165347
Dinner,Sat,2.555556,2.47619,0.158048,0.147906
Dinner,Sun,2.929825,2.578947,0.160113,0.18725
Dinner,Thur,2.0,,0.159744,
Lunch,Fri,3.0,1.833333,0.187735,0.188937
Lunch,Thur,2.5,2.352941,0.160311,0.163863


In [149]:
# Same table as the previous cell; margins=True adds the 'All' row and columns.
tips.pivot_table(
    values=['tip_pct', 'size'],
    index=['time', 'day'],
    columns='smoker',
    margins=True,
)

Unnamed: 0_level_0,Unnamed: 1_level_0,size,size,size,tip_pct,tip_pct,tip_pct
Unnamed: 0_level_1,smoker,No,Yes,All,No,Yes,All
time,day,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
Dinner,Fri,2.0,2.222222,2.166667,0.139622,0.165347,0.158916
Dinner,Sat,2.555556,2.47619,2.517241,0.158048,0.147906,0.153152
Dinner,Sun,2.929825,2.578947,2.842105,0.160113,0.18725,0.166897
Dinner,Thur,2.0,,2.0,0.159744,,0.159744
Lunch,Fri,3.0,1.833333,2.0,0.187735,0.188937,0.188765
Lunch,Thur,2.5,2.352941,2.459016,0.160311,0.163863,0.161301
All,,2.668874,2.408602,2.569672,0.159328,0.163196,0.160803


In [150]:
# aggfunc=len reports the number of rows in each (time, smoker) x day cell.
tips.pivot_table(
    values='tip_pct',
    index=['time', 'smoker'],
    columns='day',
    aggfunc=len,
    margins=True,
)

Unnamed: 0_level_0,day,Fri,Sat,Sun,Thur,All
time,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Dinner,No,3.0,45.0,57.0,1.0,106.0
Dinner,Yes,9.0,42.0,19.0,,70.0
Lunch,No,1.0,,,44.0,45.0
Lunch,Yes,6.0,,,17.0,23.0
All,,19.0,87.0,76.0,62.0,244.0


In [151]:
# Mean tip_pct for each (time, smoker) x day cell, with 'All' margins.
tips.pivot_table(
    values='tip_pct',
    index=['time', 'smoker'],
    columns='day',
    aggfunc='mean',
    margins=True,
)

Unnamed: 0_level_0,day,Fri,Sat,Sun,Thur,All
time,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Dinner,No,0.139622,0.158048,0.160113,0.159744,0.158653
Dinner,Yes,0.165347,0.147906,0.18725,,0.160828
Lunch,No,0.187735,,,0.160311,0.16092
Lunch,Yes,0.188937,,,0.163863,0.170404
All,,0.169913,0.153152,0.166897,0.161276,0.160803


In [152]:
# fill_value=0 replaces the cells of empty combinations, which would otherwise be NaN.
tips.pivot_table(
    values='tip_pct',
    index=['time', 'size', 'smoker'],
    columns='day',
    aggfunc='mean',
    fill_value=0,
)

Unnamed: 0_level_0,Unnamed: 1_level_0,day,Fri,Sat,Sun,Thur
time,size,smoker,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Dinner,1,No,0.0,0.137931,0.0,0.0
Dinner,1,Yes,0.0,0.325733,0.0,0.0
Dinner,2,No,0.139622,0.162705,0.168859,0.159744
Dinner,2,Yes,0.171297,0.148668,0.207893,0.0
Dinner,3,No,0.0,0.154661,0.152663,0.0
Dinner,3,Yes,0.0,0.144995,0.15266,0.0
Dinner,4,No,0.0,0.150096,0.148143,0.0
Dinner,4,Yes,0.11775,0.124515,0.19337,0.0
Dinner,5,No,0.0,0.0,0.206928,0.0
Dinner,5,Yes,0.0,0.106572,0.06566,0.0


## 10.4.1 Cross-Tabulations: Crosstab

A cross-tabulation (or *crosstab*) is a special case of a pivot table that computes group frequencies.

In [153]:
# Ten rows, each with a sample id, a nationality and a handedness label.
data = pd.DataFrame({
    'Sample': list(range(1, 11)),
    'Nationality': ['USA', 'Japan', 'USA', 'Japan', 'Japan',
                    'Japan', 'USA', 'USA', 'Japan', 'USA'],
    'Handedness': ['Right', 'Left', 'Right', 'Right', 'Left',
                   'Right', 'Right', 'Left', 'Right', 'Right'],
})
data

Unnamed: 0,Sample,Nationality,Handedness
0,1,USA,Right
1,2,Japan,Left
2,3,USA,Right
3,4,Japan,Right
4,5,Japan,Left
5,6,Japan,Right
6,7,USA,Right
7,8,USA,Left
8,9,Japan,Right
9,10,USA,Right


In [154]:
# Frequency table with Nationality on the rows and Handedness on the columns;
# margins=True appends the 'All' totals.
pd.crosstab(
    index=data['Nationality'],
    columns=data['Handedness'],
    margins=True,
)

Handedness,Left,Right,All
Nationality,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Japan,2,3,5
USA,1,4,5
All,3,7,10


In [155]:
# Two row keys (time, day) against smoker on the columns, with 'All' totals.
pd.crosstab(
    index=[tips['time'], tips['day']],
    columns=tips['smoker'],
    margins=True,
)

Unnamed: 0_level_0,smoker,No,Yes,All
time,day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Dinner,Fri,3,9,12
Dinner,Sat,45,42,87
Dinner,Sun,57,19,76
Dinner,Thur,1,0,1
Lunch,Fri,1,6,7
Lunch,Thur,44,17,61
All,,151,93,244


# 10.5 Conclusion