In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.DataFrame({
    'key1': ['a','a','b','b','a'],
    'key2': ['one','two','one','two','one'],
    'data1': np.random.randn(5),
    'data2': np.random.randn(5)
})

In [3]:
df

Unnamed: 0,key1,key2,data1,data2
0,a,one,0.815376,-1.237316
1,a,two,0.528443,-1.388264
2,b,one,-0.056965,-2.472505
3,b,two,1.145122,-0.315639
4,a,one,-0.581135,0.356536


In [5]:
grouped = df['data1'].groupby(df['key1'])
grouped

<pandas.core.groupby.generic.SeriesGroupBy object at 0x000001AD27AAC808>

In [6]:
grouped.mean()

key1
a    0.254228
b    0.544079
Name: data1, dtype: float64

------

In [9]:
means = df['data1'].groupby([df['key1'],df['key2']]).mean()
means

key1  key2
a     one     0.117120
      two     0.528443
b     one    -0.056965
      two     1.145122
Name: data1, dtype: float64

In [10]:
means.unstack()

key2,one,two
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.11712,0.528443
b,-0.056965,1.145122


-------

In [22]:
states = np.array(['Ohio','California','California','Ohio','Ohio'])
years = np.array([2005,2005,2006,2005,2006])

In [23]:
df

Unnamed: 0,key1,key2,data1,data2
0,a,one,0.815376,-1.237316
1,a,two,0.528443,-1.388264
2,b,one,-0.056965,-2.472505
3,b,two,1.145122,-0.315639
4,a,one,-0.581135,0.356536


In [24]:
years

array([2005, 2005, 2006, 2005, 2006])

In [25]:
df['data1'].groupby([states,years]).mean()

California  2005    0.528443
            2006   -0.056965
Ohio        2005    0.980249
            2006   -0.581135
Name: data1, dtype: float64

-----

In [27]:
df.groupby('key1').mean() # Simply passing string column name

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.254228,-0.756348
b,0.544079,-1.394072


In [28]:
df.groupby(['key1','key2']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,0.11712,-0.44039
a,two,0.528443,-1.388264
b,one,-0.056965,-2.472505
b,two,1.145122,-0.315639


-------------------

#  Iterating over Group By

In [33]:
for key_name, group in df.groupby('key1'):
    print(key_name)
    print(group)

a
  key1 key2     data1     data2
0    a  one  0.815376 -1.237316
1    a  two  0.528443 -1.388264
4    a  one -0.581135  0.356536
b
  key1 key2     data1     data2
2    b  one -0.056965 -2.472505
3    b  two  1.145122 -0.315639


--------

In [34]:
## multiple keys

In [35]:
for (k1,k2), groups in df.groupby(['key1','key2']):
    print((k1,k2))
    print(group)

('a', 'one')
  key1 key2     data1     data2
2    b  one -0.056965 -2.472505
3    b  two  1.145122 -0.315639
('a', 'two')
  key1 key2     data1     data2
2    b  one -0.056965 -2.472505
3    b  two  1.145122 -0.315639
('b', 'one')
  key1 key2     data1     data2
2    b  one -0.056965 -2.472505
3    b  two  1.145122 -0.315639
('b', 'two')
  key1 key2     data1     data2
2    b  one -0.056965 -2.472505
3    b  two  1.145122 -0.315639


# Dict of Groups

In [36]:
df.groupby('key1')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001AD28340A08>

In [37]:
list(df.groupby('key1'))

[('a',   key1 key2     data1     data2
  0    a  one  0.815376 -1.237316
  1    a  two  0.528443 -1.388264
  4    a  one -0.581135  0.356536), ('b',   key1 key2     data1     data2
  2    b  one -0.056965 -2.472505
  3    b  two  1.145122 -0.315639)]

In [38]:
dict(list(df.groupby('key1')))

{'a':   key1 key2     data1     data2
 0    a  one  0.815376 -1.237316
 1    a  two  0.528443 -1.388264
 4    a  one -0.581135  0.356536, 'b':   key1 key2     data1     data2
 2    b  one -0.056965 -2.472505
 3    b  two  1.145122 -0.315639}

In [40]:
grops = dict(list(df.groupby('key1')))
grops['b']

Unnamed: 0,key1,key2,data1,data2
2,b,one,-0.056965,-2.472505
3,b,two,1.145122,-0.315639


# GroupBy Data types

In [47]:
df

Unnamed: 0,key1,key2,data1,data2
0,a,one,0.815376,-1.237316
1,a,two,0.528443,-1.388264
2,b,one,-0.056965,-2.472505
3,b,two,1.145122,-0.315639
4,a,one,-0.581135,0.356536


In [41]:
df.dtypes

key1      object
key2      object
data1    float64
data2    float64
dtype: object

In [48]:
gps = df.groupby(df.dtypes, axis=1)

In [49]:
for dtype,group in gps:
    print(dtype)
    print(group)

float64
      data1     data2
0  0.815376 -1.237316
1  0.528443 -1.388264
2 -0.056965 -2.472505
3  1.145122 -0.315639
4 -0.581135  0.356536
object
  key1 key2
0    a  one
1    a  two
2    b  one
3    b  two
4    a  one


-----------

# Selecting columns or subset of columns

In [61]:
temp1 = df.groupby('key1')['data1'].mean()
type(temp1)

pandas.core.series.Series

In [62]:
temp2 = df.groupby('key1')[['data1']].mean()
type(temp2)

pandas.core.frame.DataFrame

# Grouping Over Series and Dicts

In [63]:
people = pd.DataFrame(np.random.randn(5, 5),
                      columns=['a', 'b', 'c', 'd', 'e'],
                      index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'])
people.iloc[2:3, [1, 2]] = np.nan # Add a few NA values
people

Unnamed: 0,a,b,c,d,e
Joe,0.715689,-0.431442,-0.394445,0.526751,0.23128
Steve,-0.124661,-1.554729,-1.390018,0.334033,-1.269969
Wes,-0.107139,,,0.700422,-0.183158
Jim,-0.740819,0.381127,1.15069,-1.349295,-1.04692
Travis,1.185227,-0.183205,-0.176833,-1.891521,-0.616972


In [64]:
mapping = {'a': 'red', 'b': 'red', 'c': 'blue',
           'd': 'blue', 'e': 'red', 'f' : 'orange'}

In [78]:
by_column = people.groupby(mapping, axis=1)
by_column.mean()

Unnamed: 0,blue,red
Joe,0.066153,0.171842
Steve,-0.527993,-0.983119
Wes,0.700422,-0.145148
Jim,-0.099303,-0.468871
Travis,-1.034177,0.12835


-------