In [1]:
import pandas as pd
import numpy as np

In [3]:
# Split-apply-combine

df = pd.DataFrame({'key1' : ['a', 'a', 'b', 'b', 'a'],
'key2' : ['one', 'two', 'one', 'two', 'one'],
'data1' : np.random.randn(5),
'data2' : np.random.randn(5)})
df

Unnamed: 0,key1,key2,data1,data2
0,a,one,0.611933,0.428965
1,a,two,-0.021511,0.189915
2,b,one,0.044566,0.168489
3,b,two,-0.28535,0.057889
4,a,one,0.659677,-0.11132


In [8]:
# group by key, means of data1 as per key1 groups
# use indi col series, takes list of key seriecols 
grouped = df["data1"].groupby(df["key1"]) # group by object
# each of group
grouped.mean()

key1
a    0.416700
b   -0.120392
Name: data1, dtype: float64

In [11]:
# compound group key
means = df["data1"].groupby([df["key1"], df["key2"]])
means.mean()
# heirarchial with key1(0) key2(1)

key1  key2
a     one     0.635805
      two    -0.021511
b     one     0.044566
      two    -0.285350
Name: data1, dtype: float64

In [13]:
# grouping on other array of len = nrows of data
states = np.array(['Ohio', 'California', 'California', 'Ohio', 'Ohio'])
years = np.array([2005, 2005, 2006, 2005, 2006])
df['data1'].groupby([states, years]).mean()



California  2005   -0.021511
            2006    0.044566
Ohio        2005    0.163291
            2006    0.659677
Name: data1, dtype: float64

In [20]:
# when groupin keys as cols then cols as arg to groupby on df instead of series
df.groupby(['key1' ,'key2']).mean() # all numeric cols aggregated

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,0.635805,0.158823
a,two,-0.021511,0.189915
b,one,0.044566,0.168489
b,two,-0.28535,0.057889


In [24]:
# df.groupby('key1').mean() # not working
# .size() - Count* Groupby for not null vals
df.groupby(["key1","key2"]).size()

key1  key2
a     one     2
      two     1
b     one     1
      two     1
dtype: int64

In [25]:
# iterating over groups (2tuple() elements)
for name,group in df.groupby("key1"):
    print(name) #grouping key
    print(group) #group partition/subset

a
  key1 key2     data1     data2
0    a  one  0.611933  0.428965
1    a  two -0.021511  0.189915
4    a  one  0.659677 -0.111320
b
  key1 key2     data1     data2
2    b  one  0.044566  0.168489
3    b  two -0.285350  0.057889


In [27]:
# compound key or use (k1,k2),group
for name,group in df.groupby(["key1", "key2"]):
    print(name)
    print(group)

('a', 'one')
  key1 key2     data1     data2
0    a  one  0.611933  0.428965
4    a  one  0.659677 -0.111320
('a', 'two')
  key1 key2     data1     data2
1    a  two -0.021511  0.189915
('b', 'one')
  key1 key2     data1     data2
2    b  one  0.044566  0.168489
('b', 'two')
  key1 key2    data1     data2
3    b  two -0.28535  0.057889


In [28]:
dict([['s','isbib'], ['d', 'fihjf']])
# to be used now

{'s': 'isbib', 'd': 'fihjf'}

In [33]:
# datapieces as one liners
pieces = dict(list(df.groupby(['key1', 'key2'])))
pieces

{('a',
  'one'):   key1 key2     data1     data2
 0    a  one  0.611933  0.428965
 4    a  one  0.659677 -0.111320,
 ('a',
  'two'):   key1 key2     data1     data2
 1    a  two -0.021511  0.189915,
 ('b',
  'one'):   key1 key2     data1     data2
 2    b  one  0.044566  0.168489,
 ('b',
  'two'):   key1 key2    data1     data2
 3    b  two -0.28535  0.057889}

In [35]:
# All this was axis=0 grouping
# on axis 1 cols can be grouped by like dtype
df.dtypes

key1      object
key2      object
data1    float64
data2    float64
dtype: object

In [36]:
for name,grp in df.groupby(df.dtypes, axis=1):
    print(name)
    print(grp)

float64
      data1     data2
0  0.611933  0.428965
1 -0.021511  0.189915
2  0.044566  0.168489
3 -0.285350  0.057889
4  0.659677 -0.111320
object
  key1 key2
0    a  one
1    a  two
2    b  one
3    b  two
4    a  one


  for name,grp in df.groupby(df.dtypes, axis=1):


In [39]:
# selectinng col or subset of cols, for larger sets than aggregating all cols, indexing operation
df.groupby('key1')['data1'] # like df['data1'].groupby('key1')

<pandas.core.groupby.generic.SeriesGroupBy object at 0x000001C259EA6C10>

In [42]:
# just for data2 means on key1,key2
# df.groupby(['key1','key2'])['data2'].mean() # series grpby
df.groupby(['key1','key2'])[['data2']].mean() # df grpby

#  object returned by this indexing operation is a grouped DataFrame if a list or  array is passed or a grouped Series if a singlecol key in []

Unnamed: 0_level_0,Unnamed: 1_level_0,data2
key1,key2,Unnamed: 2_level_1
a,one,0.158823
a,two,0.189915
b,one,0.168489
b,two,0.057889


In [43]:
s_grouped = df.groupby(['key1', 'key2'])['data2']
s_grouped
s_grouped.mean()


key1  key2
a     one     0.158823
      two     0.189915
b     one     0.168489
      two     0.057889
Name: data2, dtype: float64

### Grouping with Dict and Series

In [44]:
# grouping info other than array like in DIC mapping
people = pd.DataFrame(np.random.randn(5, 5),
             columns=['a', 'b', 'c', 'd', 'e'],
             index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'])
people.iloc[2:3, [1, 2]] = np.nan # Add a few NA values
people

Unnamed: 0,a,b,c,d,e
Joe,-0.737307,-0.745358,-0.074868,0.352757,-0.769376
Steve,0.005868,-0.615162,-0.460731,1.079361,0.164863
Wes,-0.510495,,,0.071137,0.830325
Jim,0.150623,-1.401807,-1.789204,2.159481,0.20857
Travis,0.674408,0.414671,-0.658702,0.891891,-0.111645


In [46]:
mapping = {
    "a" : "red",
    "b" : "red",
    "e" : "red",
    "c" : "blue",
    "d" : "blue",
    "f" : "orange",
}

people.groupby(mapping, axis=1).sum()
# TF
#  FutureWarning: DataFrame.groupby with axis=1 is deprecated. Do `frame.T.groupby(...)` without axis instead.
#   people.groupby(mapping, axis=1).sum()

  people.groupby(mapping, axis=1).sum()


Unnamed: 0,blue,red
Joe,0.277889,-2.252041
Steve,0.61863,-0.444431
Wes,0.071137,0.31983
Jim,0.370277,-1.042614
Travis,0.233189,0.977433


In [47]:
# similarly by series
map_series = pd.Series(mapping)
map_series

a       red
b       red
e       red
c      blue
d      blue
f    orange
dtype: object

In [52]:
# COUNT * GROUP BY on cols
people.groupby(map_series, axis=1).count()

  people.groupby(map_series, axis=1).count()


Unnamed: 0,blue,red
Joe,2,3
Steve,2,3
Wes,1,2
Jim,2,3
Travis,2,3


### Grouping with functions( funcs define mapping )

In [57]:
# func called per index and return val used as group name 
people.groupby(len).size() #grp size with size, count does all no. of val count

3    3
5    1
6    1
dtype: int64

In [58]:
# mix array(same size as axis) and func
#             joe    ste   wes     jim    trav
key_list = ['one', 'one', 'one', 'two', 'two']
people.groupby([len, key_list]).size() # to get feel

3  one    2
   two    1
5  one    1
6  two    1
dtype: int64

In [59]:
people.groupby([len, key_list]).min() #book sake

Unnamed: 0,Unnamed: 1,a,b,c,d,e
3,one,-0.737307,-0.745358,-0.074868,0.071137,-0.769376
3,two,0.150623,-1.401807,-1.789204,2.159481,0.20857
5,one,0.005868,-0.615162,-0.460731,1.079361,0.164863
6,two,0.674408,0.414671,-0.658702,0.891891,-0.111645


### Grouping by Index levels

In [61]:
# grp on one of levels of heirarchial index
columns = pd.MultiIndex.from_arrays([['US', 'US', 'US', 'JP' , 'JP'], [1,3,5,1,3]], names=["city" , "tenor"])

heir_df = pd.DataFrame(np.random.randn(4, 5), columns=columns)
heir_df

city,US,US,US,JP,JP
tenor,1,3,5,1,3
0,1.699378,-0.560714,-0.425634,-0.634587,-0.247882
1,-0.461214,-0.664006,-0.102153,0.355431,-0.635583
2,-0.119589,-1.330325,-1.394781,-0.647572,1.14039
3,-0.86736,2.169945,-2.216718,-1.021797,-0.521235


In [63]:
# pass level number or name to group on it
heir_df.groupby(level='city', axis=1).count()

  heir_df.groupby(level='city', axis=1).count()


city,JP,US
0,2,3
1,2,3
2,2,3
3,2,3


In [64]:
heir_df.groupby(level='city', axis=1).size()

  heir_df.groupby(level='city', axis=1).size()


city
JP    2
US    3
dtype: int64