In [1]:
import numpy as np
import pandas as pd
import seaborn as sns


class display(object):
    """Display HTML representation of multiple objects"""

    template = """<div style="float: left; padding: 10px;">
    <p style='font-family:"Courier New", Courier, monospace'>{0}</p>{1}
    </div>"""

    def __init__(self, *args):
        self.args = args

    def _repr_html_(self):
        return "\n".join(
            self.template.format(a, eval(a)._repr_html_()) for a in self.args
        )

    def __repr__(self):
        return "\n\n".join(a + "\n" + repr(eval(a)) for a in self.args)

Planets data

In [2]:
planets = sns.load_dataset("planets")
planets.shape

(1035, 6)

In [3]:
planets.head()

Unnamed: 0,method,number,orbital_period,mass,distance,year
0,Radial Velocity,1,269.3,7.1,77.4,2006
1,Radial Velocity,1,874.774,2.21,56.95,2008
2,Radial Velocity,1,763.0,2.6,19.84,2011
3,Radial Velocity,1,326.03,19.4,110.62,2007
4,Radial Velocity,1,516.22,10.5,119.47,2009


In [4]:
# Simple Aggregation in Pandas
rng = np.random.RandomState(42)
ser = pd.Series(rng.rand(5))
ser

0    0.374540
1    0.950714
2    0.731994
3    0.598658
4    0.156019
dtype: float64

In [5]:
ser.sum()

2.811925491708157

In [6]:
ser.mean()

0.5623850983416314

In [7]:
# for a DataFrame, by default the aggregates return results within each column:
df = pd.DataFrame({"A": rng.rand(5), "B": rng.rand(5)})
df

Unnamed: 0,A,B
0,0.155995,0.020584
1,0.058084,0.96991
2,0.866176,0.832443
3,0.601115,0.212339
4,0.708073,0.181825


In [8]:
df.mean()

A    0.477888
B    0.443420
dtype: float64

In [9]:
# by specifying the axis argument to the aggregate method, you can instead aggregate within each row:
df.mean(axis="columns")

0    0.088290
1    0.513997
2    0.849309
3    0.406727
4    0.444949
dtype: float64

In [10]:
# Pandas Series: Describe Method
# it computes several common aggregates for each column and returns the result we now drop rows with missing values
planets.dropna().describe()
# This method helps us understand the overall properties of a dataset. For example we see in the year column that although exoplanets were discovered as far back as 1989, half of all planets in the dataset were not dicovered until 2010 or after. This is largely thanks to the kepler mission, which is a space-based telescope specifically designed for finding planets around other stars

Unnamed: 0,number,orbital_period,mass,distance,year
count,498.0,498.0,498.0,498.0,498.0
mean,1.73494,835.778671,2.50932,52.068213,2007.37751
std,1.17572,1469.128259,3.636274,46.596041,4.167284
min,1.0,1.3283,0.0036,1.35,1989.0
25%,1.0,38.27225,0.2125,24.4975,2005.0
50%,1.0,357.0,1.245,39.94,2009.0
75%,2.0,999.6,2.8675,59.3325,2011.0
max,6.0,17337.5,25.0,354.0,2014.0


In [11]:
# to go deeper the next level is groupby operation which allows us to quickly and efficently compute aggergates on subsets of data
# groupby is conditional aggregation on some label or index. there is split, apply, combine
# split: involves breaking up and grouping a dataframe depending on the value of the specified key
# apply: involves computing some function, usually an aggregate, transformation or filtering, within the individual groups
# combine: merges the results of these operations into an output array
# intermediate splits do not need to be explicitly instantiated. groupby can often do this in a single pass over the data, updating the sum, mean, count, min, or other aggregate for each group along the way.
# the power of the groupby is that it abstracts away these steps: the user need not think about how the computation is done under the hood, but rather thinks about the operation as a whole
# for example we can start by creating the input dataframe
df = pd.DataFrame(
    {"key": ["A", "B", "C", "A", "B", "C"], "data": range(6)}, columns=["key", "data"]
)
df

Unnamed: 0,key,data
0,A,0
1,B,1
2,C,2
3,A,3
4,B,4
5,C,5


In [12]:
# the most basic split-apply-combine operation can be computed with the groupby() method of DataFrames, passing the name of the desired key column
df.groupby("key")

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x128f08910>

In [13]:
# what is returned is a dataframegroupby object it is a special view of the dataframe which is poised to dig into the groups but does no actual computation until the aggregation is applied
# this lazy evaluation approach means that common aggregates can be implimented efficently in a way that is almost transparent to the user.
# we can apply an aggregate to this dataframegroupby object, which will perform the appropriate apply/combine steps to produce the desired result
df.groupby("key").sum()
# the sum method is just one possibility almost any aggregation function or any valid dataframe operation can be used here

Unnamed: 0_level_0,data
key,Unnamed: 1_level_1
A,3
B,5
C,7


The GroupBy Object

In [14]:
# this is a flexible abstraction can be treated as simply as a collection of DataFrames.
# aggregate, filter, transform, and apply are the operations made available by this interface
# groupby object supports column indexing in the same way as the dataframe, and returns a modified groupby object
planets.groupby("method")

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x10d4984d0>

In [15]:
planets.groupby("method")["orbital_period"]

<pandas.core.groupby.generic.SeriesGroupBy object at 0x128626c10>

In [17]:
# we've selected a particular series group from the original dataframe groupby reference
# no computation is done until we call some aggregate on the object
planets.groupby("method")["orbital_period"].median()
# this gives an idea of the gneral scale of orbital periods that each method is sensitive to

method
Astrometry                         631.180000
Eclipse Timing Variations         4343.500000
Imaging                          27500.000000
Microlensing                      3300.000000
Orbital Brightness Modulation        0.342887
Pulsar Timing                       66.541900
Pulsation Timing Variations       1170.000000
Radial Velocity                    360.200000
Transit                              5.714932
Transit Timing Variations           57.011000
Name: orbital_period, dtype: float64

In [18]:
# the groupby object supports direct iteration over the groups, returning each group as a series or dataframe
for method, group in planets.groupby("method"):
    print("{0:30s} shape={1}".format(method, group.shape))
# this can be useful for manual inspection of groups for the sake of debugging but it is often faster to use apply

Astrometry                     shape=(2, 6)
Eclipse Timing Variations      shape=(9, 6)
Imaging                        shape=(38, 6)
Microlensing                   shape=(23, 6)
Orbital Brightness Modulation  shape=(3, 6)
Pulsar Timing                  shape=(5, 6)
Pulsation Timing Variations    shape=(1, 6)
Radial Velocity                shape=(553, 6)
Transit                        shape=(397, 6)
Transit Timing Variations      shape=(4, 6)


In [20]:
# dispatch methods
# any method not explicitly implemented by the groupby object will be passed through and called on the groups, whether they are dataframe or series objects for example the describe() method is equivalent to calling describe() on each group
planets.groupby("method")["year"].describe().unstack()
# this table helps us to better understand the data: the vast majority of planets until 2014 were discovered using the radial velocity method, and since then the transit method has taken over
# these dispatch methods are applied to each individual group, and the results are then combined within the groupby and returned. this means that they can be any valid dataframe or series method, and will return a dataframe or series result

       method                       
count  Astrometry                          2.0
       Eclipse Timing Variations           9.0
       Imaging                            38.0
       Microlensing                       23.0
       Orbital Brightness Modulation       3.0
                                         ...  
max    Pulsar Timing                    2011.0
       Pulsation Timing Variations      2007.0
       Radial Velocity                  2014.0
       Transit                          2014.0
       Transit Timing Variations        2014.0
Length: 80, dtype: float64

In [21]:
# Aggregate, filter, transform, and apply
rng = np.random.RandomState(0)
df = pd.DataFrame(
    {
        "key": ["A", "B", "C", "A", "B", "C"],
        "data1": range(6),
        "data2": rng.randint(0, 10, 6),
    },
    columns=["key", "data1", "data2"],
)
df

Unnamed: 0,key,data1,data2
0,A,0,5
1,B,1,0
2,C,2,3
3,A,3,3
4,B,4,7
5,C,5,9


In [22]:
# aggregation
# the aggregate method can take a string, a function, or a list thereof, and compute all the aggregates at once
df.groupby("key").aggregate(["min", np.median, max])

Unnamed: 0_level_0,data1,data1,data1,data2,data2,data2
Unnamed: 0_level_1,min,median,max,min,median,max
key,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
A,0,1.5,3,3,4.0,5
B,1,2.5,4,0,3.5,7
C,2,3.5,5,3,6.0,9


In [23]:
# another common pattern is to pass a dictionary mapping column names to operations to be applied on that column
df.groupby("key").aggregate({"data1": "min", "data2": "max"})

Unnamed: 0_level_0,data1,data2
key,Unnamed: 1_level_1,Unnamed: 2_level_1
A,0,5
B,1,7
C,2,9


In [24]:
# filtering
# a filtering operation allows you to drop data based on the group properties
# we might want to keep all groups in which the standard deviation is larger than some critical value
def filter_func(x):
    return x["data2"].std() > 4


display("df", "df.groupby('key').std()", "df.groupby('key').filter(filter_func)")
# the filter funtion should rewturn a boolean value specifying whether the group passes the filtering
# here becuase group A does not have a standard deviation greater than 4 it is dropped from the result

Unnamed: 0,key,data1,data2
0,A,0,5
1,B,1,0
2,C,2,3
3,A,3,3
4,B,4,7
5,C,5,9

Unnamed: 0_level_0,data1,data2
key,Unnamed: 1_level_1,Unnamed: 2_level_1
A,2.12132,1.414214
B,2.12132,4.949747
C,2.12132,4.242641

Unnamed: 0,key,data1,data2
1,B,1,0
2,C,2,3
4,B,4,7
5,C,5,9


In [25]:
# transformation
# while aggregation must return a reduced version of the data, transformation can return some transformed version of the full data to recombine. the output is the same shape as the input a common example is to center the data by subtracting the group-wise mean
def center(x):
    return x - x.mean()


df.groupby("key").transform(center)

Unnamed: 0,data1,data2
0,-1.5,1.0
1,-1.5,-3.5
2,-1.5,-3.0
3,1.5,-1.0
4,1.5,3.5
5,1.5,3.0


In [26]:
# the apply() method lets you apply an arbitrary function to the group results. the functiion should take a dataframe and returns either a pandas object or a scaler.
# here is an apply operation that normalizes the first column by the sum of the second
def norm_by_data2(x):
    # x is a dataframe of group values
    x["data1"] /= x["data2"].sum()
    return x


df.groupby("key").apply(norm_by_data2)
# the only criterion is that the function takes a dataframe and returns a pandas object or a scalar.

To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  df.groupby('key').apply(norm_by_data2)


Unnamed: 0,key,data1,data2
0,A,0.0,5
1,B,0.142857,0
2,C,0.166667,3
3,A,0.375,3
4,B,0.571429,7
5,C,0.416667,9


In [27]:
# specifying the split key
# in the example presented above we split the dataframe on a single column name this is just one of many options by whcih the groups can be defined
# the key can be any series or list with a length matching that of the dataframe
L = [0, 1, 0, 1, 2, 0]
df.groupby(L).sum()

  df.groupby(L).sum()


Unnamed: 0,data1,data2
0,7,17
1,4,3
2,4,7


In [28]:
# this means that there's another more verbose way of accomplishing the df.groupby('key') from before
df.groupby(df["key"]).sum()

Unnamed: 0_level_0,data1,data2
key,Unnamed: 1_level_1,Unnamed: 2_level_1
A,3,8
B,5,7
C,7,12


In [30]:
# a dictionary or series mapping index to group
df2 = df.set_index("key")
mapping = {"A": "vowel", "B": "consonant", "C": "consonant"}
display("df2", "df2.groupby(mapping).sum()")

Unnamed: 0_level_0,data1,data2
key,Unnamed: 1_level_1,Unnamed: 2_level_1
A,0,5
B,1,0
C,2,3
A,3,3
B,4,7
C,5,9

Unnamed: 0_level_0,data1,data2
key,Unnamed: 1_level_1,Unnamed: 2_level_1
consonant,12,19
vowel,3,8


In [31]:
# similar to mapping, you can pass any python function that will input the index value and output the group
df2.groupby(str.lower).mean()

Unnamed: 0_level_0,data1,data2
key,Unnamed: 1_level_1,Unnamed: 2_level_1
a,1.5,4.0
b,2.5,3.5
c,3.5,6.0


In [32]:
# further, any of the preceding key choices can be combined to group on a multi-index
df2.groupby([str.lower, mapping]).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key,key,Unnamed: 2_level_1,Unnamed: 3_level_1
a,vowel,1.5,4.0
b,consonant,2.5,3.5
c,consonant,3.5,6.0


In [33]:
# as an example of this, in a few lines we can put all these together and count discovered planets by method and by decade
decade = 10 * (planets["year"] // 10)
decade = decade.astype(str) + "s"
decade.name = "decade"
planets.groupby(["method", decade])["number"].sum().unstack().fillna(0)

decade,1980s,1990s,2000s,2010s
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Astrometry,0.0,0.0,0.0,2.0
Eclipse Timing Variations,0.0,0.0,5.0,10.0
Imaging,0.0,0.0,29.0,21.0
Microlensing,0.0,0.0,12.0,15.0
Orbital Brightness Modulation,0.0,0.0,0.0,5.0
Pulsar Timing,0.0,9.0,1.0,1.0
Pulsation Timing Variations,0.0,0.0,1.0,0.0
Radial Velocity,1.0,52.0,475.0,424.0
Transit,0.0,0.0,64.0,712.0
Transit Timing Variations,0.0,0.0,0.0,9.0


In [None]:
# this shows the power of combining many of the operations we've discussed this far