# Data Aggregation and Group Operations

In [1]:
# Categorizing a data set and applying a function to each group, whether an aggregation or a transformation, is often a critical component of a data analysis workflow.
# After loading, merging, and preparing a data set, you may need to compute group statistics or possibly pivot tables for reporting or visualization purposes.
# pandas provides a flexible and high-performance groupby facility, enabling you to slice, dice, and summarize data sets in a natural way.

# GroupBy mechanics

In [2]:
import pandas as pd 
import numpy as np

# Draw the random columns from a locally seeded generator so that a fresh
# "Restart & Run All" rebuilds exactly the same example frame every time.
rng = np.random.default_rng(42)

# Two string key columns to group on, plus two float data columns to aggregate.
df = pd.DataFrame({'key1': ['a', 'a', 'b', 'b', 'a'],
                   'key2': ['one', 'two', 'one', 'two', 'one'],
                   'data1': rng.standard_normal(5),
                   'data2': rng.standard_normal(5)})

In [3]:
# Preview the first rows of the example frame (it only has five rows).
df.head()

Unnamed: 0,key1,key2,data1,data2
0,a,one,1.323361,-0.353553
1,a,two,-0.76319,2.18522
2,b,one,-0.049697,0.494983
3,b,two,0.977056,1.473394
4,a,one,-0.720139,-0.071264


In [6]:
# Suppose you want to compute the mean of the data1 column using the labels from key1.
# There are a number of ways to do this. One is to access data1 and call groupby
# with the key1 column (a Series):
grouped = df['data1'].groupby(df['key1']) # group data1 by key1
grouped # bare last expression: displays the repr of the GroupBy object

<pandas.core.groupby.generic.SeriesGroupBy object at 0x12fb02a50>

In [8]:
# This "grouped" variable is now a GroupBy object. It has not actually computed anything yet,
# except for some intermediate data about the group key df['key1'].
# The idea is that this object has all of the information needed to then apply some operation to each of the groups.
# For example, to compute group means we can call the GroupBy's mean method:
grouped.mean()

key1
a   -0.053323
b    0.463680
Name: data1, dtype: float64

In [10]:
# Passing several key arrays as a list gives a different result: the data is
# grouped on every combination of the keys.
group_keys = [df['key1'], df['key2']]
means = df['data1'].groupby(group_keys).mean()  # mean of data1 per (key1, key2) pair
means

key1  key2
a     one     0.301611
      two    -0.763190
b     one    -0.049697
      two     0.977056
Name: data1, dtype: float64

In [12]:
# Grouping on two keys gives the resulting Series a hierarchical index made of the
# unique key pairs observed. Pivot the inner level (key2) out into columns.
means.unstack(level='key2')

key2,one,two
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.301611,-0.76319
b,-0.049697,0.977056


In [14]:
# The group keys used so far were all Series, but any arrays of the right length
# work just as well. Build two plain NumPy arrays, one label per row of df.
states = np.array([
    'Ohio',
    'California',
    'California',
    'Ohio',
    'Ohio',
])
years = np.array([
    2005,
    2005,
    2006,
    2005,
    2006,
])

In [16]:
# Mean of data1 for every (state, year) combination; the plain arrays act as keys.
(
    df['data1']
    .groupby([states, years])
    .mean()
)

California  2005   -0.763190
            2006   -0.049697
Ohio        2005    1.150208
            2006   -0.720139
Name: data1, dtype: float64

In [20]:
# Frequently the grouping information is found in the same DataFrame as the data you want to work on.
# In that case, you can pass column names (whether those are strings, numbers, or other Python objects) as the group keys.
# key2 holds strings and cannot be averaged, so restrict the aggregation to the
# numeric columns explicitly instead of relying on it being dropped implicitly.
df.groupby('key1').mean(numeric_only=True)  # mean of data1 and data2 per key1 value

  df.groupby('key1').mean() # group df by key1 and compute mean


Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,-0.235384,0.006903
b,-0.148516,-0.131355


In [23]:
df.groupby(['key1', 'key2']).mean() 
# You may have noticed that in the first case, df.groupby('key1').mean(), there is no key2 column in the result.
# Because df['key2'] is not numeric data, it is said to be a nuisance column, which is therefore excluded from the result.
# By default, all of the numeric columns are aggregated, though it is possible to filter down to a subset, as you'll see soon.
# NOTE(review): this automatic exclusion appears to be version-dependent; newer pandas
# releases may require numeric_only=True instead. Confirm against the installed version.

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,0.195111,0.030563
a,two,-1.096372,-0.040417
b,one,-0.070876,0.416844
b,two,-0.226155,-0.679555


In [24]:
# Whatever the goal of a groupby, `size` is a generally useful GroupBy method:
# it returns a Series holding the number of rows in each group.
(
    df
    .groupby(['key1', 'key2'])
    .size()
)

key1  key2
a     one     2
      two     1
b     one     1
      two     1
dtype: int64

# Iterating Over Groups

In [25]:
# A GroupBy object is iterable: every step yields a 2-tuple holding the group
# name and the chunk of data belonging to that group.
for name, group in df.groupby('key1'):
    # One call, newline-separated: the name on its own line, then the chunk.
    print(name, group, sep='\n')

a
  key1 key2     data1     data2
0    a  one -0.249953 -0.323921
1    a  two -1.096372 -0.040417
4    a  one  0.640174  0.385047
b
  key1 key2     data1     data2
2    b  one -0.070876  0.416844
3    b  two -0.226155 -0.679555


In [26]:
# With multiple keys, the first element of each yielded tuple is itself a tuple
# of key values, which can be unpacked directly in the loop header.
for (k1, k2), group in df.groupby(['key1', 'key2']):
    # Key pair separated by one space on the first line, then the chunk.
    print(f'{k1} {k2}', group, sep='\n')

a one
  key1 key2     data1     data2
0    a  one -0.249953 -0.323921
4    a  one  0.640174  0.385047
a two
  key1 key2     data1     data2
1    a  two -1.096372 -0.040417
b one
  key1 key2     data1     data2
2    b  one -0.070876  0.416844
b two
  key1 key2     data1     data2
3    b  two -0.226155 -0.679555


In [27]:
# Of course you can do whatever you want with the pieces of data. A recipe you may
# find useful is computing a dict of the data pieces as a one-liner:
pieces = {name: chunk for name, chunk in df.groupby('key1')}  # key1 value -> its rows

In [28]:
pieces['b'] # display the chunk of rows stored under key1 == 'b'

Unnamed: 0,key1,key2,data1,data2
2,b,one,-0.070876,0.416844
3,b,two,-0.226155,-0.679555


In [29]:
# By default groupby groups on axis=0, but you can group on any of the other axes.
# For example, we could group the columns of our example df by dtype. First, inspect the dtypes:
df.dtypes 

key1      object
key2      object
data1    float64
data2    float64
dtype: object

In [30]:
# Group the columns (axis=1) of df by their dtype.
# NOTE: this rebinds `grouped`, which an earlier cell bound to the data1-by-key1 GroupBy.
# NOTE(review): `axis=1` in groupby is reportedly deprecated in recent pandas releases.
# Confirm against the installed pandas version before relying on this call.
grouped = df.groupby(df.dtypes, axis=1) 

In [31]:
# Collect the groups into a dict keyed by group name (here: one entry per dtype).
{dtype: columns for dtype, columns in grouped}

{dtype('float64'):       data1     data2
 0 -0.249953 -0.323921
 1 -1.096372 -0.040417
 2 -0.070876  0.416844
 3 -0.226155 -0.679555
 4  0.640174  0.385047,
 dtype('O'):   key1 key2
 0    a  one
 1    a  two
 2    b  one
 3    b  two
 4    a  one}

# Selecting a column or Subset of Columns

In [19]:
# Indexing a GroupBy object created from a DataFrame with a column name or an array of column names
# has the effect of selecting those columns for aggregation. This means that:
df.groupby('key1')['data1'] # group df by key1 and select data1 column
df.groupby('key1')[['data2']] # group df by key1 and select data2 column
# are syntactic sugar for: 
df['data1'].groupby(df['key1']) # group data1 by key1
df[['data2']].groupby(df['key1']) # group data2 by key1 
# Only the last expression of a cell is echoed, so a single GroupBy repr is displayed.

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x12fb13b50>

In [21]:
# Especially for large data sets, it may be desirable to aggregate only a few columns.
# Selecting with a list ([['data2']]) keeps the result a DataFrame; here we compute
# the mean of just the data2 column per (key1, key2) pair.
(
    df
    .groupby(['key1', 'key2'])[['data2']]
    .mean()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,data2
key1,key2,Unnamed: 2_level_1
a,one,-0.212408
a,two,2.18522
b,one,0.494983
b,two,1.473394


In [22]:
# The object returned by this indexing operation is a grouped DataFrame if a list or array is passed,
# and a grouped Series if just a single column name is passed as a scalar:
s_grouped = df.groupby(['key1', 'key2'])['data2'] 

In [23]:
s_grouped.mean() # mean of the data2 column per (key1, key2) pair, returned as a Series

key1  key2
a     one    -0.212408
      two     2.185220
b     one     0.494983
      two     1.473394
Name: data2, dtype: float64

# Grouping with Dicts and Series 

In [24]:
# Grouping information may exist in a form other than an array. Let's consider another example DataFrame.
# The values come from a locally seeded generator so that the frame is identical
# on every fresh run of the notebook.
people = pd.DataFrame(np.random.default_rng(7).standard_normal((5, 5)),
                      columns=['a', 'b', 'c', 'd', 'e'],
                      index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'])

In [25]:
# Add a few NA values: blank out columns b and c for the row labelled 'Wes'
# (the third row, second and third columns of the frame).
people.loc['Wes', ['b', 'c']] = np.nan

In [26]:
# Display the frame, including the NA values assigned in the previous cell.
people

Unnamed: 0,a,b,c,d,e
Joe,1.401812,-0.178154,1.538714,0.29612,0.368429
Steve,-0.165176,-0.515568,0.927953,-0.571039,0.310042
Wes,0.163607,,,-1.197289,0.70027
Jim,-1.50013,-0.788464,1.619117,-1.642483,-1.059542
Travis,1.587649,0.708425,0.305256,1.141281,0.578276


In [27]:
# Suppose there is a group correspondence for the columns, and we want to sum the
# columns together by group. Each key is a column label, each value its group.
mapping = {
    'a': 'red',
    'b': 'red',
    'c': 'blue',
    'd': 'blue',
    'e': 'red',
    'f': 'orange',
}

In [28]:
# You could easily construct an array from this dict to pass to groupby, but instead we can just pass the dict.
# NOTE(review): `axis=1` in groupby is reportedly deprecated in recent pandas releases.
# Confirm against the installed pandas version before relying on this call.
by_column = people.groupby(mapping, axis=1) # group the columns of people (axis=1) using mapping

In [29]:
by_column.sum() # sum the columns of people within each mapped group

Unnamed: 0,blue,red
Joe,1.834834,1.592088
Steve,0.356915,-0.370703
Wes,-1.197289,0.863878
Jim,-0.023367,-3.348135
Travis,1.446537,2.87435


In [30]:
# The same functionality holds for Series, which can be viewed as a fixed-size mapping.
# When Series were used as group keys in the examples above, pandas did in fact inspect each Series
# to ensure that its index is aligned with the axis it is grouping:
map_series = pd.Series(mapping) # create a Series from mapping 

In [31]:
# Display the Series built from mapping: column label -> group name.
map_series 

a       red
b       red
c      blue
d      blue
e       red
f    orange
dtype: object

In [32]:
# Count the non-NA values per mapped group for every row of people.
# Grouping columns through `axis=1` is deprecated in recent pandas releases, so
# transpose first (columns become rows), group those rows with map_series, count,
# and transpose back. The result keeps one row per person and one column per group.
people.T.groupby(map_series).count().T

Unnamed: 0,blue,red
Joe,2,3
Steve,2,3
Wes,1,2
Jim,2,3
Travis,2,3


# Grouping with Functions