# GroupBy: Split, Apply, Combine

In [6]:
#Simple aggregations can give you a flavor of your dataset, but often we would prefer
#to aggregate conditionally on some label or index: this is implemented in the so-
#called groupby operation. The name “group by” comes from a command in the SQL
#database language, but it is perhaps more illuminating to think of it in the terms first
#coined by Hadley Wickham of Rstats fame: split, apply, combine.

In [7]:
import numpy as np

In [8]:
import pandas as pd

In [14]:
# Demo frame: a repeating 'Key' label column and a 0..5 'Data' counter.
df = pd.DataFrame(
    {
        'Key': list('ABCABC'),
        'Data': range(6),
    },
    columns=['Key', 'Data'],
)

In [15]:
df

Unnamed: 0,Key,Data
0,A,0
1,B,1
2,C,2
3,A,3
4,B,4
5,C,5


In [16]:
# Split step only: this yields a DataFrameGroupBy object, not a result table.
df.groupby(by='Key')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7f1d478f1220>

In [17]:
#Notice that what is returned is not a set of DataFrames, but a DataFrameGroupBy
#object. This object is where the magic is: you can think of it as a special view of the
#DataFrame, which is poised to dig into the groups but does no actual computation
#until the aggregation is applied. This “lazy evaluation” approach means that common
#aggregates can be implemented very efficiently in a way that is almost transparent to
#the user.
#To produce a result, we can apply an aggregate to this DataFrameGroupBy object,
#which will perform the appropriate apply/combine steps to produce the desired
#result:

In [18]:
# Apply + combine: total of the remaining column(s) within each 'Key' group.
df.groupby(by='Key').sum()

Unnamed: 0_level_0,Data
Key,Unnamed: 1_level_1
A,3
B,5
C,7


# Column indexing.

In [19]:
#The GroupBy object supports column indexing in the same way as
#the DataFrame, and returns a modified GroupBy object. For example:

In [20]:
# Demo frame with four group labels, a 0..7 counter, and a column of random
# integers drawn from 0..9 (values differ from run to run).
df = pd.DataFrame(
    {
        'key': list('ABCDACDB'),
        'data1': range(8),
        'data2': np.random.randint(0, 10, 8),
    }
)

In [21]:
df

Unnamed: 0,key,data1,data2
0,A,0,4
1,B,1,4
2,C,2,6
3,D,3,1
4,A,4,5
5,C,5,6
6,D,6,3
7,B,7,4


In [22]:
# Select one column from the grouped frame, then total it per 'key'.
df.groupby(by='key')['data2'].sum()

key
A     9
B     8
C    12
D     4
Name: data2, dtype: int64

# Use the describe() method of DataFrames to perform a set of aggregations that describe each group in the data

In [25]:
# Summary statistics (count, mean, std, quartiles, ...) of 'data2' per 'key'.
df.groupby(by='key')['data2'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
A,2.0,4.5,0.707107,4.0,4.25,4.5,4.75,5.0
B,2.0,4.0,0.0,4.0,4.0,4.0,4.0,4.0
C,2.0,6.0,0.0,6.0,6.0,6.0,6.0,6.0
D,2.0,2.0,1.414214,1.0,1.5,2.0,2.5,3.0


# Aggregate, filter, transform, apply

In [26]:
#The preceding discussion focused on aggregation for the combine operation, but
#there are more options available. In particular, GroupBy objects have aggregate(),
#filter(), transform(), and apply() methods that efficiently implement a variety of
#useful operations before combining the grouped data.

In [27]:
# Frame used by the aggregate/filter/transform/apply examples: three group
# labels, a 0..5 counter, and random integers drawn from 0..9.
df = pd.DataFrame(
    {
        'key': list('ABCABC'),
        'data1': range(6),
        'data2': np.random.randint(0, 10, 6),
    },
    columns=['key', 'data1', 'data2'],
)

df

Unnamed: 0,key,data1,data2
0,A,0,7
1,B,1,1
2,C,2,7
3,A,3,6
4,B,4,5
5,C,5,4


# Aggregation.

In [28]:
#We’re now familiar with GroupBy aggregations with sum(), median(),
#and the like, but the aggregate() method allows for even more flexibility. It can take
#a string, a function, or a list thereof, and compute all the aggregates at once. Here is a
#quick example combining all these:

In [29]:
# Every aggregate is given by its string name: recent pandas releases emit a
# FutureWarning when NumPy/builtin callables such as np.median or max are
# passed here, and the names select the same groupby methods and produce the
# same column labels. Custom callables are still accepted in the list.
df.groupby('key').aggregate(['min', 'median', 'max'])

Unnamed: 0_level_0,data1,data1,data1,data2,data2,data2
Unnamed: 0_level_1,min,median,max,min,median,max
key,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
A,0,1.5,3,6,6.5,7
B,1,2.5,4,1,3.0,5
C,2,3.5,5,4,5.5,7


In [30]:
#Another useful pattern is to pass a dictionary mapping column names to operations
#to be applied on that column:

In [31]:
# Per-column aggregates: total of 'data1', standard deviation of 'data2'.
df.groupby(by='key').agg({'data1': 'sum', 'data2': 'std'})

Unnamed: 0_level_0,data1,data2
key,Unnamed: 1_level_1,Unnamed: 2_level_1
A,3,0.707107
B,5,2.828427
C,7,2.12132


# Filtering.

In [32]:
#A filtering operation allows you to drop data based on the group properties.
#For example, we might want to keep all groups in which the standard deviation is
#at least some critical value:

In [35]:
def filter_func(x):
    """Return True when the standard deviation of the group's 'data2' is at least 2.

    ``x`` is the DataFrame holding one group's rows.
    """
    spread = x['data2'].std()
    return spread >= 2

In [36]:
# Keep only the rows whose group passes filter_func.
df.groupby(by='key').filter(filter_func)

Unnamed: 0,key,data1,data2
1,B,1,1
2,C,2,7
4,B,4,5
5,C,5,4


# Transformation.

In [37]:
#While aggregation must return a reduced version of the data, transformation
#can return some transformed version of the full data to recombine. For
#such a transformation, the output is the same shape as the input. A common example
#is to center the data by subtracting the group-wise mean:

In [38]:
# Center each column within its group by subtracting the group mean.
df.groupby(by='key').transform(lambda col: col - col.mean())

Unnamed: 0,data1,data2
0,-1.5,0.5
1,-1.5,-2.0
2,-1.5,1.5
3,1.5,-0.5
4,1.5,2.0
5,1.5,-1.5


# The apply() method.

In [39]:
#The apply() method lets you apply an arbitrary function to the
#group results. The function should take a DataFrame, and return either a Pandas
#object (e.g., DataFrame, Series) or a scalar; the combine operation will be tailored to
#the type of output returned.

In [40]:
#For example, here is an apply() that normalizes the first column by the sum of the second

In [42]:
def norm_by_data2(x):
    """Return a copy of a group with 'data1' divided by the group's 'data2' total.

    Parameters
    ----------
    x : DataFrame
        Rows of one group; must contain 'data1' and 'data2' columns.

    Returns
    -------
    DataFrame
        A new frame with the same columns and index as ``x`` in which
        'data1' has been replaced by ``x['data1'] / x['data2'].sum()``.
        ``x`` itself is left unmodified.
    """
    # Build a new frame rather than assigning into ``x``: pandas does not
    # support functions passed to GroupBy.apply mutating the object they get.
    return x.assign(data1=x['data1'] / x['data2'].sum())

In [43]:
# group_keys=False keeps the original row index on the combined result. Since
# pandas 2.0 the default (group_keys=True) prepends the group label as an
# extra index level even when the applied function returns like-indexed frames.
print(df.groupby('key', group_keys=False).apply(norm_by_data2))

  key     data1  data2
0   A  0.000000      7
1   B  0.166667      1
2   C  0.181818      7
3   A  0.230769      6
4   B  0.666667      5
5   C  0.454545      4


# Thank You