In [1]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [2]:
# Build the demo frame used throughout the notebook: two text key columns
# (k1, k2) and two numeric columns of standard-normal draws.
# The draws come from a locally seeded generator so the values are identical
# on every Restart & Run All; the unseeded global np.random.randn produced
# different numbers on each run, so saved outputs could never be reproduced.
rng = np.random.default_rng(42)

dframe = DataFrame({'k1': ['X', 'X', 'Y', 'Y', 'Z'],
                    'k2': ['alpha', 'beta', 'alpha', 'beta', 'alpha'],
                    'dataset1': rng.standard_normal(5),
                    'dataset2': rng.standard_normal(5)})

dframe

Unnamed: 0,k1,k2,dataset1,dataset2
0,X,alpha,1.073015,-2.160746
1,X,beta,0.170608,0.954943
2,Y,alpha,-0.749514,0.074501
3,Y,beta,-0.7462,0.56389
4,Z,alpha,-0.755117,-0.987117


In [3]:
# Create a SeriesGroupBy: the values of column dataset1, bucketed by the
# labels found in column k1.
k1_labels = dframe['k1']
group1 = dframe['dataset1'].groupby(by=k1_labels)

In [4]:
# Displaying the object shows only the SeriesGroupBy repr; no aggregation has been applied yet.
group1

<pandas.core.groupby.generic.SeriesGroupBy object at 0x000001D8994A4FA0>

In [5]:
# Mean of dataset1 within each distinct k1 value.
group1.mean()

k1
X    0.621811
Y   -0.747857
Z   -0.755117
Name: dataset1, dtype: float64

In [6]:
# Two external label arrays (five entries each) used below as grouping keys
# that are not columns of the frame.
cities, month = (
    np.array(['NY', 'LA', 'LA', 'NY', 'NY']),
    np.array(['JAN', 'FEB', 'JAN', 'FEB', 'JAN']),
)

In [7]:
# Group dataset1 by the two external label arrays (matched to rows by
# position) and take the mean of each (city, month) bucket.
city_month_keys = [cities, month]
dframe['dataset1'].groupby(city_month_keys).mean()

LA  FEB    0.170608
    JAN   -0.749514
NY  FEB   -0.746200
    JAN    0.158949
Name: dataset1, dtype: float64

In [9]:
# Show the full frame again for reference before grouping on its own columns.
dframe

Unnamed: 0,k1,k2,dataset1,dataset2
0,X,alpha,1.073015,-2.160746
1,X,beta,0.170608,0.954943
2,Y,alpha,-0.749514,0.074501
3,Y,beta,-0.7462,0.56389
4,Z,alpha,-0.755117,-0.987117


In [8]:
# Group by k1 and take the mean of every numeric column for each distinct key value.
# numeric_only=True keeps the text column k2 out of the aggregation: pandas >= 2.0
# raises a TypeError when asked to average strings, whereas older versions
# silently dropped such columns. The result (dataset1, dataset2) is the same.
dframe.groupby('k1').mean(numeric_only=True)

Unnamed: 0_level_0,dataset1,dataset2
k1,Unnamed: 1_level_1,Unnamed: 2_level_1
X,0.621811,-0.602902
Y,-0.747857,0.319195
Z,-0.755117,-0.987117


In [10]:
# Grouping on several keys at once: one mean per (k1, k2) combination,
# e.g. (X, alpha), (X, beta), ...
dframe.groupby(by=['k1', 'k2']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,dataset1,dataset2
k1,k2,Unnamed: 2_level_1,Unnamed: 3_level_1
X,alpha,1.073015,-2.160746
X,beta,0.170608,0.954943
Y,alpha,-0.749514,0.074501
Y,beta,-0.7462,0.56389
Z,alpha,-0.755117,-0.987117


In [11]:
# Number of rows that fall into each k1 group.
dframe.groupby(by=['k1']).size()

k1
X    2
Y    2
Z    1
dtype: int64

In [15]:
# Iterating a GroupBy yields (group key, sub-frame) pairs; print each one.
k1_groups = dframe.groupby('k1')
for label, rows in k1_groups:
    print('This is the %s group' % label)
    print(rows)
    print('\n')

This is the X group
  k1     k2  dataset1  dataset2
0  X  alpha  1.073015 -2.160746
1  X   beta  0.170608  0.954943


This is the Y group
  k1     k2  dataset1  dataset2
2  Y  alpha -0.749514  0.074501
3  Y   beta -0.746200  0.563890


This is the Z group
  k1     k2  dataset1  dataset2
4  Z  alpha -0.755117 -0.987117




In [18]:
# With two grouping keys, each group key is a (k1, k2) pair; unpack it for the header line.
for group_key, rows in dframe.groupby(['k1', 'k2']):
    first_key, second_key = group_key
    print('Key1 = %s Key2 = %s' % (first_key, second_key))
    print(rows)
    print('\n')

Key1 = X Key2 = alpha
  k1     k2  dataset1  dataset2
0  X  alpha  1.073015 -2.160746


Key1 = X Key2 = beta
  k1    k2  dataset1  dataset2
1  X  beta  0.170608  0.954943


Key1 = Y Key2 = alpha
  k1     k2  dataset1  dataset2
2  Y  alpha -0.749514  0.074501


Key1 = Y Key2 = beta
  k1    k2  dataset1  dataset2
3  Y  beta   -0.7462   0.56389


Key1 = Z Key2 = alpha
  k1     k2  dataset1  dataset2
4  Z  alpha -0.755117 -0.987117




In [16]:
# Show the full frame again for comparison with the per-group printouts.
dframe

Unnamed: 0,k1,k2,dataset1,dataset2
0,X,alpha,1.073015,-2.160746
1,X,beta,0.170608,0.954943
2,Y,alpha,-0.749514,0.074501
3,Y,beta,-0.7462,0.56389
4,Z,alpha,-0.755117,-0.987117


In [19]:
# Materialise the groups as a plain dict: each distinct k1 value maps to the
# sub-frame holding the rows with that value.
group_dict = {label: rows for label, rows in dframe.groupby('k1')}

In [20]:
# Look up the sub-frame for the rows where k1 == 'X'.
group_dict['X']

Unnamed: 0,k1,k2,dataset1,dataset2
0,X,alpha,1.073015,-2.160746
1,X,beta,0.170608,0.954943


In [21]:
# Split the frame column-wise by dtype: each distinct column dtype maps to the
# sub-frame of columns having that dtype (useful for separating text columns
# from numeric ones).
# Columns are selected with a boolean mask over dframe.dtypes instead of
# groupby(..., axis=1), which is deprecated since pandas 2.1. The mask keeps
# every column's original dtype, unlike transposing the frame and grouping rows.
# Keys appear in the order their dtype is first seen among the columns.
column_dtypes = dframe.dtypes
group_dict_axis1 = {
    dtype: dframe.loc[:, column_dtypes == dtype]
    for dtype in column_dtypes.unique()
}

In [22]:
# Inspect the dict: one sub-frame per column dtype.
group_dict_axis1

{dtype('float64'):    dataset1  dataset2
 0  1.073015 -2.160746
 1  0.170608  0.954943
 2 -0.749514  0.074501
 3 -0.746200  0.563890
 4 -0.755117 -0.987117,
 dtype('O'):   k1     k2
 0  X  alpha
 1  X   beta
 2  Y  alpha
 3  Y   beta
 4  Z  alpha}

In [23]:
# Restrict a grouped aggregation to chosen columns: group on (k1, k2) first,
# then select dataset2. The double brackets select a list of columns, so the
# mean comes back as a DataFrame.
k1_k2_groups = dframe.groupby(['k1', 'k2'])
dataset2_group = k1_k2_groups[['dataset2']]
dataset2_group.mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,dataset2
k1,k2,Unnamed: 2_level_1
X,alpha,-2.160746
X,beta,0.954943
Y,alpha,0.074501
Y,beta,0.56389
Z,alpha,-0.987117


In [None]:
# TODO: This will be a useful tool later on -- come back to this.