##### GroupBy mechanics

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Toy frame for the GroupBy examples: two string key columns plus two columns
# of standard-normal draws.
df = pd.DataFrame({
    'key1': list('aabba'),
    'key2': ['one', 'two', 'one', 'two', 'one'],
    'data1': np.random.randn(5),
    'data2': np.random.randn(5),
})

In [3]:
# Echo the frame so its five rows and four columns are visible.
df

Unnamed: 0,key1,key2,data1,data2
0,a,one,0.227775,0.526935
1,a,two,-0.566261,1.408261
2,b,one,0.331405,0.298135
3,b,two,-0.708324,-0.966981
4,a,one,0.914062,-0.137577


In [4]:
# Group the rows by the values of key2; the key is passed as a Series here
# rather than as a column name.
grouped=df.groupby(df['key2'])

In [5]:
# Echo the GroupBy object itself; the output is only its object repr, not a table.
grouped

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7fc387955748>

In [6]:
# Per-group mean of the numeric columns. numeric_only=True is spelled out
# because df still carries string columns: recent pandas releases raise a
# TypeError on them instead of silently dropping them as older releases did.
grouped.mean(numeric_only=True)

Unnamed: 0_level_0,data1,data2
key2,Unnamed: 1_level_1,Unnamed: 2_level_1
one,0.491081,0.229164
two,-0.637293,0.22064


In [8]:
# Average data1 within every (key1, key2) combination; the grouping keys are
# the two key columns, each handed to groupby as a Series.
means = df['data1'].groupby([df[col] for col in ('key1', 'key2')]).mean()

In [9]:
# Echo the grouped means: a Series indexed by (key1, key2).
means

key1  key2
a     one     0.570919
      two    -0.566261
b     one     0.331405
      two    -0.708324
Name: data1, dtype: float64

In [10]:
# Grouping keys that live outside the frame: five state labels and five years.
states = np.array('Ohio California California Ohio Ohio'.split())
years = np.array((2005, 2005, 2006, 2005, 2006))

In [11]:
# Group data1 by the two external arrays (state first, then year) and average each group.
df['data1'].groupby([states,years]).mean()

California  2005   -0.566261
            2006    0.331405
Ohio        2005   -0.240275
            2006    0.914062
Name: data1, dtype: float64

In [13]:
# Mean of the numeric columns for each key1 value. key2 is a string column, so
# numeric_only=True is passed explicitly: recent pandas releases raise a
# TypeError rather than dropping non-numeric columns automatically.
df.groupby('key1').mean(numeric_only=True)

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.191859,0.599206
b,-0.18846,-0.334423


In [15]:
# Group on both key columns by name; the result is indexed by (key1, key2).
df.groupby(['key1','key2']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,0.570919,0.194679
a,two,-0.566261,1.408261
b,one,0.331405,0.298135
b,two,-0.708324,-0.966981


In [16]:
# Number of rows in each (key1, key2) group.
df.groupby(['key1','key2']).size()

key1  key2
a     one     2
      two     1
b     one     1
      two     1
dtype: int64

##### Iterating over Groups

The GroupBy object supports iteration, generating a sequence of 2-tuples containing
the group name along with the chunk of data. Consider the following:

In [20]:
# Each iteration yields a key1 label together with the rows that carry it.
for label, rows in df.groupby('key1'):
    print(label, rows, sep='\n')

a
  key1 key2     data1     data2
0    a  one  0.227775  0.526935
1    a  two -0.566261  1.408261
4    a  one  0.914062 -0.137577
b
  key1 key2     data1     data2
2    b  one  0.331405  0.298135
3    b  two -0.708324 -0.966981


In [21]:
# With two grouping keys, the group label arrives as a (key1, key2) tuple.
for pair, rows in df.groupby(['key1', 'key2']):
    first_key, second_key = pair
    print((first_key, second_key))
    print(rows)

('a', 'one')
  key1 key2     data1     data2
0    a  one  0.227775  0.526935
4    a  one  0.914062 -0.137577
('a', 'two')
  key1 key2     data1     data2
1    a  two -0.566261  1.408261
('b', 'one')
  key1 key2     data1     data2
2    b  one  0.331405  0.298135
('b', 'two')
  key1 key2     data1     data2
3    b  two -0.708324 -0.966981


In [23]:
# Materialise the groups as a plain dict keyed by the key1 label.
pieces = {label: rows for label, rows in df.groupby('key1')}

In [24]:
# Echo the dict: one DataFrame per key1 label.
pieces

{'a':   key1 key2     data1     data2
 0    a  one  0.227775  0.526935
 1    a  two -0.566261  1.408261
 4    a  one  0.914062 -0.137577, 'b':   key1 key2     data1     data2
 2    b  one  0.331405  0.298135
 3    b  two -0.708324 -0.966981}

In [25]:
# Rows belonging to the 'b' group.
pieces['b']

Unnamed: 0,key1,key2,data1,data2
2,b,one,0.331405,0.298135
3,b,two,-0.708324,-0.966981


In [26]:
# Rows belonging to the 'a' group.
pieces['a']

Unnamed: 0,key1,key2,data1,data2
0,a,one,0.227775,0.526935
1,a,two,-0.566261,1.408261
4,a,one,0.914062,-0.137577


In [27]:
# Per-column dtypes: the two key columns are object, the two data columns float64.
df.dtypes

key1      object
key2      object
data1    float64
data2    float64
dtype: object

In [30]:
# Group the columns (axis=1) by their dtype. This rebinds `grouped`, which
# earlier held the key2 grouping.
# NOTE(review): groupby(..., axis=1) is deprecated in recent pandas releases —
# confirm against the installed version before re-running.
grouped=df.groupby(df.dtypes,axis=1)

In [31]:
# Each iteration yields a dtype and the sub-frame of columns that have it.
for kind, columns_of_kind in grouped:
    print(kind, columns_of_kind, sep='\n')

float64
      data1     data2
0  0.227775  0.526935
1 -0.566261  1.408261
2  0.331405  0.298135
3 -0.708324 -0.966981
4  0.914062 -0.137577
object
  key1 key2
0    a  one
1    a  two
2    b  one
3    b  two
4    a  one


###### Select a column or set of columns

In [34]:
# Indexing the GroupBy with a column name restricts the aggregation to that
# column; the result matches the `means` Series computed earlier.
df.groupby(['key1','key2'])['data1'].mean()

key1  key2
a     one     0.570919
      two    -0.566261
b     one     0.331405
      two    -0.708324
Name: data1, dtype: float64

In [35]:
# 5x5 frame of standard-normal draws: one row per person, columns a through e.
people = pd.DataFrame(
    np.random.randn(5, 5),
    index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'],
    columns=list('abcde'),
)

In [36]:
# Echo the freshly built frame.
people

Unnamed: 0,a,b,c,d,e
Joe,-0.574632,0.466658,-0.884779,-0.683643,-1.34584
Steve,-1.572584,-0.399743,0.149115,-1.677352,-0.428607
Wes,-0.149978,-0.218541,0.730535,-0.279565,-0.696693
Jim,-1.731724,-1.233481,-2.361354,-0.306409,-0.14613
Travis,0.28961,-1.13648,1.655939,-0.975387,0.215306


In [37]:
# Overwrite Wes's b and c entries with NaN (in place).
people.loc['Wes', ['b', 'c']] = np.nan

In [38]:
# Echo the frame again to show the two NaN entries in the Wes row.
people

Unnamed: 0,a,b,c,d,e
Joe,-0.574632,0.466658,-0.884779,-0.683643,-1.34584
Steve,-1.572584,-0.399743,0.149115,-1.677352,-0.428607
Wes,-0.149978,,,-0.279565,-0.696693
Jim,-1.731724,-1.233481,-2.361354,-0.306409,-0.14613
Travis,0.28961,-1.13648,1.655939,-0.975387,0.215306


In [39]:
# Column label -> colour group. 'f' has no matching column in `people`.
mapping = dict(a='red', b='red', c='blue', d='blue', e='red', f='orange')

In [41]:
# Group the columns of `people` by the colour each column label maps to.
# NOTE(review): groupby(..., axis=1) is deprecated in recent pandas releases —
# confirm against the installed version before re-running.
by_column=people.groupby(mapping,axis=1)

In [43]:
# For every person, sum the columns mapped to each colour.
by_column.sum()

Unnamed: 0,blue,red
Joe,-1.568422,-1.453814
Steve,-1.528237,-2.400934
Wes,-0.279565,-0.846671
Jim,-2.667762,-3.111335
Travis,0.680553,-0.631563


In [44]:
# Count the non-null values per person within each colour group. The frame is
# transposed so the column labels become the grouped index, then transposed
# back; this avoids groupby(axis=1), which recent pandas releases deprecate.
people.T.groupby(mapping).count().T

Unnamed: 0,blue,red
Joe,2,3
Steve,2,3
Wes,1,2
Jim,2,3
Travis,2,3


###### Grouping with Functions

In [45]:
# Echo the frame whose index (first names) is grouped by length in the next cell.
people

Unnamed: 0,a,b,c,d,e
Joe,-0.574632,0.466658,-0.884779,-0.683643,-1.34584
Steve,-1.572584,-0.399743,0.149115,-1.677352,-0.428607
Wes,-0.149978,,,-0.279565,-0.696693
Jim,-1.731724,-1.233481,-2.361354,-0.306409,-0.14613
Travis,0.28961,-1.13648,1.655939,-0.975387,0.215306


In [48]:
# A function used as the key is called on each index label, so the rows are
# grouped by the length of the first name.
people.groupby(len).sum()

Unnamed: 0,a,b,c,d,e
3,-2.456334,-0.766823,-3.246132,-1.269616,-2.188663
5,-1.572584,-0.399743,0.149115,-1.677352,-0.428607
6,0.28961,-1.13648,1.655939,-0.975387,0.215306


Using Python functions is a more generic way of defining a group mapping compared
with a dict or Series. Any function passed as a group key will be called once per index
value, with the return values being used as the group names. More concretely,
consider the example DataFrame from the previous section, which has people’s first
names as index values. Suppose you wanted to group by the length of the names;
while you could compute an array of string lengths, it’s simpler to just pass the `len`
function, as the `people.groupby(len).sum()` cell above does.

In [49]:
# Mix a function key (length of each name) with a plain list key that has one
# entry per row of `people`.
key_list = ['one'] * 3 + ['two'] * 2
people.groupby([len, key_list]).min()

Unnamed: 0,Unnamed: 1,a,b,c,d,e
3,one,-0.574632,0.466658,-0.884779,-0.683643,-1.34584
3,two,-1.731724,-1.233481,-2.361354,-0.306409,-0.14613
5,one,-1.572584,-0.399743,0.149115,-1.677352,-0.428607
6,two,0.28961,-1.13648,1.655939,-0.975387,0.215306


###### Grouping by Index Levels

A final convenience for hierarchically indexed datasets is the ability to aggregate
using one of the levels of an axis index. Let’s look at an example:

In [50]:
# Two-level column index: country ('cty') on the outer level, tenor on the inner.
columns = pd.MultiIndex.from_arrays(
    [['US'] * 3 + ['JP'] * 2, [1, 3, 5, 1, 3]],
    names=['cty', 'tenor'],
)

In [51]:
# Echo the MultiIndex to show its levels and names.
columns

MultiIndex(levels=[['JP', 'US'], [1, 3, 5]],
           codes=[[1, 1, 1, 0, 0], [0, 1, 2, 0, 1]],
           names=['cty', 'tenor'])

In [52]:
# 4x5 frame of random normals whose columns carry the (cty, tenor) MultiIndex.
# Note that this rebinds `df`, replacing the key1/key2 frame used earlier.
df = pd.DataFrame(np.random.randn(4, 5), columns=columns)

In [53]:
# Echo the hierarchically indexed frame.
df

cty,US,US,US,JP,JP
tenor,1,3,5,1,3
0,1.218097,0.92853,-2.705662,-0.345343,0.059066
1,0.520827,-1.722669,0.999414,-0.656756,0.804873
2,0.347146,-1.444263,1.340986,-0.846785,0.656931
3,0.546165,1.192826,-0.279999,0.186863,0.921674


In [56]:
# Number of non-null values per row for each country. The column MultiIndex is
# moved onto the rows by transposing, grouped by its 'cty' level, and the
# result transposed back; this avoids groupby(axis=1), which recent pandas
# releases deprecate.
df.T.groupby(level='cty').count().T

cty,JP,US
0,2,3
1,2,3
2,2,3
3,2,3


In [59]:
# Number of non-null values per row for each tenor. The column MultiIndex is
# moved onto the rows by transposing, grouped by its 'tenor' level, and the
# result transposed back; this avoids groupby(axis=1), which recent pandas
# releases deprecate.
df.T.groupby(level='tenor').count().T

tenor,1,3,5
0,2,2,1
1,2,2,1
2,2,2,1
3,2,2,1


##### data aggregation

Aggregations refer to any data transformation that produces scalar values from
arrays.

###### Quantile and Bucket Analysis

As you may recall from Chapter 8, pandas has some tools, in particular `cut` and `qcut`,
for slicing data up into buckets with bins of your choosing or by sample quantiles.
Combining these functions with groupby makes it convenient to perform bucket or
quantile analysis on a dataset. Consider a simple random dataset and an equal-length
bucket categorization using `cut`:

In [62]:
# 100 rows with two columns of standard-normal draws.
frame = pd.DataFrame({
    'data1': np.random.randn(100),
    'data2': np.random.randn(100),
})

In [61]:
# Show only the first ten rows; the frame holds 100 and dumping all of them
# would bury the narrative.
frame.head(10)

Unnamed: 0,data1,data2
0,-0.62405,-0.022985
1,0.605823,-0.306043
2,1.744979,-0.621714
3,0.992178,0.909742
4,1.020949,-0.058452
5,1.516585,-0.40288
6,-1.700494,-0.574133
7,1.608333,0.260558
8,-1.185205,-0.295682
9,-1.533073,-0.194175


In [63]:
# Cut data1 into 4 equal-length buckets. Despite the variable name, these are
# not sample quantiles (that would be qcut).
quantiles=pd.cut(frame.data1,4)

In [66]:
# Bucket assigned to each of the first ten rows.
quantiles.iloc[:10]

0    (-2.386, -1.128]
1      (0.124, 1.376]
2      (0.124, 1.376]
3    (-2.386, -1.128]
4      (0.124, 1.376]
5     (-1.128, 0.124]
6     (-1.128, 0.124]
7     (-1.128, 0.124]
8      (0.124, 1.376]
9     (-1.128, 0.124]
Name: data1, dtype: category
Categories (4, interval[float64]): [(-2.386, -1.128] < (-1.128, 0.124] < (0.124, 1.376] < (1.376, 2.628]]

In [73]:
# Group data2 by the data1 bucket each row fell into (this rebinds `grouped`).
# observed=False is stated explicitly so every bucket is reported regardless of
# the pandas default for categorical keys, which differs between releases.
grouped = frame.data2.groupby(quantiles, observed=False)

In [74]:
# Echo the SeriesGroupBy object; the output is only its object repr.
grouped

<pandas.core.groupby.generic.SeriesGroupBy object at 0x7fc38649c2b0>

In [75]:
def get_stats(group):
    """Summarise one group as a dict.

    Calls the group's ``min``, ``max``, ``count`` and ``mean`` methods and
    returns their results under keys of the same names, in that order.
    """
    summary = {}
    for stat in ('min', 'max', 'count', 'mean'):
        summary[stat] = getattr(group, stat)()
    return summary

In [78]:
# Apply get_stats to every bucket, then pivot the statistic names into columns.
grouped.apply(get_stats).unstack()

Unnamed: 0_level_0,count,max,mean,min
data1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"(-2.386, -1.128]",11.0,1.952602,-0.372619,-3.052839
"(-1.128, 0.124]",47.0,2.788734,0.155906,-1.831046
"(0.124, 1.376]",32.0,1.571792,0.090622,-2.62131
"(1.376, 2.628]",10.0,0.481131,-0.446774,-1.403589


##### Random Sampling and Permutation

Suppose you wanted to draw a random sample (with or without replacement) from a
large dataset for Monte Carlo simulation purposes or some other application. There
are a number of ways to perform the “draws”; here we use the sample method for
Series.

In [79]:
# Suit codes, in order: Hearts, Spades, Clubs, Diamonds.
suits = list('HSCD')
# Thirteen values per suit (1 through 10, then three 10s), repeated four times.
card_val = [*range(1, 11), 10, 10, 10] * 4
# Thirteen card names per suit; the numbered cards stay as ints.
base_names = ['A', *range(2, 11), 'J', 'K', 'Q']
# Starts empty in this cell.
cards = []

In [84]:
# Echo the thirteen card names; note the mix of str and int entries.
base_names

['A', 2, 3, 4, 5, 6, 7, 8, 9, 10, 'J', 'K', 'Q']

##### pivot table and cross tabulation