In [1]:
import numpy as np
import pandas as pd
from pandas import DataFrame, Series

In [13]:
df = DataFrame({'k1':['X','X','Y','Y','Z'],
                    'k2':['alpha','beta','alpha','beta','alpha'],
                    'dataset1':np.random.randn(5),
                    'dataset2':np.random.randn(5)})

In [14]:
df

Unnamed: 0,dataset1,dataset2,k1,k2
0,0.12063,0.403595,X,alpha
1,-0.4,0.07841,X,beta
2,0.380808,-0.877596,Y,alpha
3,2.003836,-0.894135,Y,beta
4,-1.37063,-0.635263,Z,alpha


In [15]:
# Grab the dataset1 column and group it by the k1 key
group1 = df['dataset1'].groupby(dframe['k1'])

In [16]:
group1

<pandas.core.groupby.SeriesGroupBy object at 0x11926f5f8>

In [17]:
# Perform average on group1
group1.mean()

k1
X   -0.139685
Y    1.192322
Z   -1.370630
Name: dataset1, dtype: float64

In [18]:
#We'll make some arrays for use as keys
cities = np.array(['NY','LA','LA','NY','NY'])
month = np.array(['Oct','Jun','JAN','FEB','Sep'])

In [19]:
# Use the data from dataset1, group the means by city and month
df['dataset1'].groupby([cities,month]).mean()

LA  JAN    0.380808
    Jun   -0.400000
NY  FEB    2.003836
    Oct    0.120630
    Sep   -1.370630
Name: dataset1, dtype: float64

In [20]:
# Pass column names as group keys
df.groupby('k1').mean()

Unnamed: 0_level_0,dataset1,dataset2
k1,Unnamed: 1_level_1,Unnamed: 2_level_1
X,-0.139685,0.241003
Y,1.192322,-0.885865
Z,-1.37063,-0.635263


In [22]:
# Or multiple column names
df.groupby(['k1','k2']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,dataset1,dataset2
k1,k2,Unnamed: 2_level_1,Unnamed: 3_level_1
X,alpha,0.12063,0.403595
X,beta,-0.4,0.07841
Y,alpha,0.380808,-0.877596
Y,beta,2.003836,-0.894135
Z,alpha,-1.37063,-0.635263


In [23]:
# Another useful groupby method is getting the group sizes
df.groupby(['k1']).size()

k1
X    2
Y    2
Z    1
dtype: int64

In [25]:
# We can also iterate over groups
for name,group in df.groupby('k1'):
    print ("This is the %s group" %name)
    print (group)
    print ('\n')

This is the X group
   dataset1  dataset2 k1     k2
0   0.12063  0.403595  X  alpha
1  -0.40000  0.078410  X   beta


This is the Y group
   dataset1  dataset2 k1     k2
2  0.380808 -0.877596  Y  alpha
3  2.003836 -0.894135  Y   beta


This is the Z group
   dataset1  dataset2 k1     k2
4  -1.37063 -0.635263  Z  alpha




In [26]:
# We can also iterate with multiple keys
for (k1,k2) , group in df.groupby(['k1','k2']):
    print ("Key1 = %s Key2 = %s" %(k1,k2))
    print (group)
    print ('\n')

Key1 = X Key2 = alpha
   dataset1  dataset2 k1     k2
0   0.12063  0.403595  X  alpha


Key1 = X Key2 = beta
   dataset1  dataset2 k1    k2
1      -0.4   0.07841  X  beta


Key1 = Y Key2 = alpha
   dataset1  dataset2 k1     k2
2  0.380808 -0.877596  Y  alpha


Key1 = Y Key2 = beta
   dataset1  dataset2 k1    k2
3  2.003836 -0.894135  Y  beta


Key1 = Z Key2 = alpha
   dataset1  dataset2 k1     k2
4  -1.37063 -0.635263  Z  alpha




In [27]:
# A possibly useful tactic is creating a dictionary of the data pieces 
group_dict = dict(list(df.groupby('k1')))

#Show the group with X
group_dict['X']

Unnamed: 0,dataset1,dataset2,k1,k2
0,0.12063,0.403595,X,alpha
1,-0.4,0.07841,X,beta


In [28]:
# We could have also chosen to do this with axis = 1
group_dict_axis1 = dict(list(dframe.groupby(dframe.dtypes,axis=1)))

In [29]:
group_dict_axis1

{dtype('float64'):    dataset1  dataset2
 0  0.878651 -0.033094
 1  0.289487  1.772775
 2 -0.692689 -0.469119
 3 -0.867727  1.562645
 4 -0.587436  0.993638, dtype('O'):   k1     k2
 0  X  alpha
 1  X   beta
 2  Y  alpha
 3  Y   beta
 4  Z  alpha}

In [30]:
# Group the dataset2 column with both sets of keys
dataset2_group = dframe.groupby(['k1','k2'])[['dataset2']]

dataset2_group.mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,dataset2
k1,k2,Unnamed: 2_level_1
X,alpha,-0.033094
X,beta,1.772775
Y,alpha,-0.469119
Y,beta,1.562645
Z,alpha,0.993638
