In [5]:
import numpy as np
import pandas as pd
from pandas import DataFrame, Series

In [6]:
#Let's make a dframe
dframe = DataFrame({'k1':['X','X','Y','Y','Z'],
                    'k2':['alpha','beta','alpha','beta','alpha'],
                    'dataset1':np.random.randn(5),
                    'dataset2':np.random.randn(5)})

#Show
dframe

Unnamed: 0,k1,k2,dataset1,dataset2
0,X,alpha,-0.904295,-1.24881
1,X,beta,-0.658636,-0.605309
2,Y,alpha,-0.339683,-0.54661
3,Y,beta,-0.646764,-0.841136
4,Z,alpha,-0.066098,-0.281069


In [7]:
#Now let's see how to use groupby

#Lets grab the dataset1 column and group it by the k1 key
group1 = dframe['dataset1'].groupby(dframe['k1'])

#Show the groupby object
group1

<pandas.core.groupby.generic.SeriesGroupBy object at 0x11ed7a040>

In [8]:
#Now we can perform operations on this particular group
group1.mean()

k1
X   -0.781466
Y   -0.493224
Z   -0.066098
Name: dataset1, dtype: float64

In [9]:
# We can use group keys that are series as well

#For example:

#We'll make some arrays for use as keys
cities = np.array(['NY','LA','LA','NY','NY'])
month = np.array(['JAN','FEB','JAN','FEB','JAN'])

#Now using the data from dataset1, group the means by city and month
dframe['dataset1'].groupby([cities,month]).mean()

LA  FEB   -0.658636
    JAN   -0.339683
NY  FEB   -0.646764
    JAN   -0.485196
Name: dataset1, dtype: float64

In [10]:
# let's see the original dframe again.
dframe

Unnamed: 0,k1,k2,dataset1,dataset2
0,X,alpha,-0.904295,-1.24881
1,X,beta,-0.658636,-0.605309
2,Y,alpha,-0.339683,-0.54661
3,Y,beta,-0.646764,-0.841136
4,Z,alpha,-0.066098,-0.281069


In [22]:
# WE can also pass column names as group keys
# dframe.groupby('k1').mean() # BP this does not work: TypeError: agg function failed [how->mean,dtype->object] (probably because it cannot do mean on k2 colum)
# dframe[['dataset1', 'dataset2', 'k1']].groupby('k1').mean() # BP you have to do this weird thing
dframe.groupby('k1')[['dataset1', 'dataset2']].mean()



Unnamed: 0_level_0,dataset1,dataset2
k1,Unnamed: 1_level_1,Unnamed: 2_level_1
X,-0.781466,-0.92706
Y,-0.493224,-0.693873
Z,-0.066098,-0.281069


In [12]:
# Or multiple column names
dframe.groupby(['k1','k2']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,dataset1,dataset2
k1,k2,Unnamed: 2_level_1,Unnamed: 3_level_1
X,alpha,-0.904295,-1.24881
X,beta,-0.658636,-0.605309
Y,alpha,-0.339683,-0.54661
Y,beta,-0.646764,-0.841136
Z,alpha,-0.066098,-0.281069


In [13]:
# Another useful groupby method is getting the group sizes
dframe.groupby(['k1']).size()

k1
X    2
Y    2
Z    1
dtype: int64

In [14]:
# We can also iterate over groups

#For example:
for name,group in dframe.groupby('k1'):
    print("This is the {} group".format(name))
    print(group)
    print("\n")

This is the X group
  k1     k2  dataset1  dataset2
0  X  alpha -0.904295 -1.248810
1  X   beta -0.658636 -0.605309


This is the Y group
  k1     k2  dataset1  dataset2
2  Y  alpha -0.339683 -0.546610
3  Y   beta -0.646764 -0.841136


This is the Z group
  k1     k2  dataset1  dataset2
4  Z  alpha -0.066098 -0.281069




In [15]:
# We can also iterate with multiple keys
for (k1,k2), group in dframe.groupby(['k1','k2']):
    print("Key1 = {} Key2 = {}".format(k1,k2))
    print(group)
    print('\n')

Key1 = X Key2 = alpha
  k1     k2  dataset1  dataset2
0  X  alpha -0.904295  -1.24881


Key1 = X Key2 = beta
  k1    k2  dataset1  dataset2
1  X  beta -0.658636 -0.605309


Key1 = Y Key2 = alpha
  k1     k2  dataset1  dataset2
2  Y  alpha -0.339683  -0.54661


Key1 = Y Key2 = beta
  k1    k2  dataset1  dataset2
3  Y  beta -0.646764 -0.841136


Key1 = Z Key2 = alpha
  k1     k2  dataset1  dataset2
4  Z  alpha -0.066098 -0.281069




In [16]:
# A possibly useful tactic is creating a dictionary of the data pieces
group_dict = dict(list(dframe.groupby('k1')))

#Show the group with X
group_dict['X']

Unnamed: 0,k1,k2,dataset1,dataset2
0,X,alpha,-0.904295,-1.24881
1,X,beta,-0.658636,-0.605309


In [17]:
# We could have also chosen to do this with axis = 1

# Let's creat a dictionary for dtypes of objects! -> separate the data into data types based on column (column types)
group_dict_axis1 = dict(list(dframe.groupby(dframe.dtypes,axis=1)))

#show
group_dict_axis1

  group_dict_axis1 = dict(list(dframe.groupby(dframe.dtypes,axis=1)))


{dtype('float64'):    dataset1  dataset2
 0 -0.904295 -1.248810
 1 -0.658636 -0.605309
 2 -0.339683 -0.546610
 3 -0.646764 -0.841136
 4 -0.066098 -0.281069,
 dtype('O'):   k1     k2
 0  X  alpha
 1  X   beta
 2  Y  alpha
 3  Y   beta
 4  Z  alpha}

In [18]:
# Next we'll learn how to use groupby with columns

In [19]:
# For example if we only wanted to group the dataset2 column with both sets of keys
dataset2_group = dframe.groupby(['k1','k2'])[['dataset2']]

dataset2_group.mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,dataset2
k1,k2,Unnamed: 2_level_1
X,alpha,-1.24881
X,beta,-0.605309
Y,alpha,-0.54661
Y,beta,-0.841136
Z,alpha,-0.281069


In [20]:
#Next we'll have a quick lesson on grouping with dictionaries and series!