In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

### basic

In [2]:
# Load the seaborn "planets" example dataset, then preview its first rows and its shape.
planets = sns.load_dataset(name="planets")
print(planets.head(), planets.shape, sep="\n")

            method  number  orbital_period   mass  distance  year
0  Radial Velocity       1         269.300   7.10     77.40  2006
1  Radial Velocity       1         874.774   2.21     56.95  2008
2  Radial Velocity       1         763.000   2.60     19.84  2011
3  Radial Velocity       1         326.030  19.40    110.62  2007
4  Radial Velocity       1         516.220  10.50    119.47  2009
(1035, 6)


In [3]:
# Summary statistics computed only over rows without any missing value.
planets.dropna(how="any").describe()

Unnamed: 0,number,orbital_period,mass,distance,year
count,498.0,498.0,498.0,498.0,498.0
mean,1.73494,835.778671,2.50932,52.068213,2007.37751
std,1.17572,1469.128259,3.636274,46.596041,4.167284
min,1.0,1.3283,0.0036,1.35,1989.0
25%,1.0,38.27225,0.2125,24.4975,2005.0
50%,1.0,357.0,1.245,39.94,2009.0
75%,2.0,999.6,2.8675,59.3325,2011.0
max,6.0,17337.5,25.0,354.0,2014.0


In [4]:
# Sample standard deviation (ddof=1 is the pandas default, spelled out here).
planets["distance"].std(ddof=1)

733.1164929404421

In [5]:
# mean absolute deviation: sum of |distance - mean| divided by the non-null count
(
    (planets["distance"] - planets["distance"].mean()).abs().sum()
    / planets["distance"].count()
)

318.10088545240666

In [6]:
# root mean squared deviation with an (n - 1) denominator, i.e. the sample
# standard deviation computed by hand.
# The intermediate gets a real name: assigning to `_` would shadow IPython's
# "last output" variable for the rest of the session.
sample_variance = np.sum(
    (planets["distance"] - planets["distance"].mean()).pow(2)
) / (planets["distance"].count() - 1)
np.sqrt(sample_variance)

733.1164929404421

### split, apply, combine

In [7]:
# Small toy frame: three keys, each appearing twice, with data 0..5.
df = pd.DataFrame(
    {
        "key": list("ABCABC"),
        "data": range(6),
    },
    columns=["key", "data"],
)
print(df)

  key  data
0   A     0
1   B     1
2   C     2
3   A     3
4   B     4
5   C     5


In [8]:
# groupby returns a lazy DataFrameGroupBy object; nothing is computed yet
df.groupby(by="key")

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000002B4FA842C90>

In [9]:
# List the public attributes of the groupby object.
# Uses the builtin dir() instead of calling the __dir__ dunder directly.
for attr_name in dir(df.groupby("key")):
    if attr_name.startswith("_"):
        continue  # skip private and dunder names
    print(attr_name)

agg
aggregate
all
any
apply
bfill
boxplot
corr
corrwith
count
cov
cumcount
cummax
cummin
cumprod
cumsum
data
describe
diff
dtypes
ewm
expanding
ffill
fillna
filter
first
get_group
groups
head
hist
idxmax
idxmin
indices
key
last
max
mean
median
min
ndim
ngroup
ngroups
nth
nunique
ohlc
pct_change
pipe
plot
prod
quantile
rank
resample
rolling
sample
sem
shift
size
skew
std
sum
tail
take
transform
value_counts
var


In [10]:
# Sum of every remaining column within each key.
df.groupby(by="key").sum()

Unnamed: 0_level_0,data
key,Unnamed: 1_level_1
A,3
B,5
C,7


In [11]:
# column indexing: selecting one column of a groupby yields a SeriesGroupBy
planets.groupby(by="method")["orbital_period"]

<pandas.core.groupby.generic.SeriesGroupBy object at 0x000002B4FA8428A0>

In [12]:
# Number of non-null orbital periods per detection method.
planets.groupby(by="method")["orbital_period"].count()

method
Astrometry                         2
Eclipse Timing Variations          9
Imaging                           12
Microlensing                       7
Orbital Brightness Modulation      3
Pulsar Timing                      5
Pulsation Timing Variations        1
Radial Velocity                  553
Transit                          397
Transit Timing Variations          3
Name: orbital_period, dtype: int64

In [13]:
# Median orbital period per detection method.
planets.groupby(by="method")["orbital_period"].median()

method
Astrometry                         631.180000
Eclipse Timing Variations         4343.500000
Imaging                          27500.000000
Microlensing                      3300.000000
Orbital Brightness Modulation        0.342887
Pulsar Timing                       66.541900
Pulsation Timing Variations       1170.000000
Radial Velocity                    360.200000
Transit                              5.714932
Transit Timing Variations           57.011000
Name: orbital_period, dtype: float64

In [14]:
# iteration over groups: each step yields the group label and its sub-frame
for method, group in planets.groupby(by="method"):
    print(f"{method:30s} shape={group.shape}")

Astrometry                     shape=(2, 6)
Eclipse Timing Variations      shape=(9, 6)
Imaging                        shape=(38, 6)
Microlensing                   shape=(23, 6)
Orbital Brightness Modulation  shape=(3, 6)
Pulsar Timing                  shape=(5, 6)
Pulsation Timing Variations    shape=(1, 6)
Radial Velocity                shape=(553, 6)
Transit                        shape=(397, 6)
Transit Timing Variations      shape=(4, 6)


In [15]:
# dispatch methods: describe() is applied to each group's `year` values
planets.groupby(by="method")["year"].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Astrometry,2.0,2011.5,2.12132,2010.0,2010.75,2011.5,2012.25,2013.0
Eclipse Timing Variations,9.0,2010.0,1.414214,2008.0,2009.0,2010.0,2011.0,2012.0
Imaging,38.0,2009.131579,2.781901,2004.0,2008.0,2009.0,2011.0,2013.0
Microlensing,23.0,2009.782609,2.859697,2004.0,2008.0,2010.0,2012.0,2013.0
Orbital Brightness Modulation,3.0,2011.666667,1.154701,2011.0,2011.0,2011.0,2012.0,2013.0
Pulsar Timing,5.0,1998.4,8.38451,1992.0,1992.0,1994.0,2003.0,2011.0
Pulsation Timing Variations,1.0,2007.0,,2007.0,2007.0,2007.0,2007.0,2007.0
Radial Velocity,553.0,2007.518987,4.249052,1989.0,2005.0,2009.0,2011.0,2014.0
Transit,397.0,2011.236776,2.077867,2002.0,2010.0,2012.0,2013.0,2014.0
Transit Timing Variations,4.0,2012.5,1.290994,2011.0,2011.75,2012.5,2013.25,2014.0


In [16]:
# Same summary over the whole `year` column, ungrouped, for comparison.
planets.loc[:, "year"].describe()

count    1035.000000
mean     2009.070531
std         3.972567
min      1989.000000
25%      2007.000000
50%      2010.000000
75%      2012.000000
max      2014.000000
Name: year, dtype: float64

### aggregate, filter, transform, apply

In [17]:
# Seeded generator so the random `data2` column is reproducible.
rng = np.random.RandomState(0)
df = pd.DataFrame(
    {
        "key": list("ABC") * 2,
        "data1": range(6),
        "data2": rng.randint(low=0, high=10, size=6),
    },
    columns=["key", "data1", "data2"],
)
print(df)

  key  data1  data2
0   A      0      5
1   B      1      0
2   C      2      3
3   A      3      3
4   B      4      7
5   C      5      9


In [19]:
# aggregate: several reductions at once.
# Every reduction is named by string; passing np.median / builtin max makes
# pandas emit FutureWarnings, and the resulting column labels are the same.
df.groupby("key").aggregate(["min", "median", "max"])

  df.groupby("key").aggregate(["min", np.median, max])
  df.groupby("key").aggregate(["min", np.median, max])


Unnamed: 0_level_0,data1,data1,data1,data2,data2,data2
Unnamed: 0_level_1,min,median,max,min,median,max
key,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
A,0,1.5,3,3,4.0,5
B,1,2.5,4,0,3.5,7
C,2,3.5,5,3,6.0,9


In [20]:
# Per-column reductions given as a {column: reduction} mapping.
df.groupby(by="key").agg(
    {
        "data1": "min",
        "data2": "max",
    }
)

Unnamed: 0_level_0,data1,data2
key,Unnamed: 1_level_1,Unnamed: 2_level_1
A,0,5
B,1,7
C,2,9


In [21]:
# Named aggregation: output column `one_min` = min of `data1`, written as a
# plain (column, aggfunc) tuple.
df.groupby(by="key").agg(one_min=("data1", "min"))


Unnamed: 0_level_0,one_min
key,Unnamed: 1_level_1
A,0
B,1
C,2


In [15]:
# filtering
def filter_func(x):
    """Return True when the sample std of the group's ``data2`` column exceeds 4."""
    data2_spread = x["data2"].std()
    return data2_spread > 4

In [17]:
# Show the frame, then the per-key standard deviation of each column.
print(df)
print(df.groupby(by="key").std())

  key  data1  data2
0   A      0      5
1   B      1      0
2   C      2      3
3   A      3      3
4   B      4      7
5   C      5      9
       data1     data2
key                   
A    2.12132  1.414214
B    2.12132  4.949747
C    2.12132  4.242641


In [18]:
# Keep only the rows of groups for which filter_func returns True.
print(df.groupby(by="key").filter(func=filter_func))

  key  data1  data2
1   B      1      0
2   C      2      3
4   B      4      7
5   C      5      9


In [19]:
# transformation: centre each column on its own group mean
df.groupby(by="key").transform(lambda col: col.sub(col.mean()))

Unnamed: 0,data1,data2
0,-1.5,1.0
1,-1.5,-3.5
2,-1.5,-3.0
3,1.5,-1.0
4,1.5,3.5
5,1.5,3.0


In [20]:
# apply
def norm_by_data2(x):
    """Return a copy of ``x`` with ``data1`` divided by the sum of ``data2``.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame (typically one groupby group) holding ``data1`` and ``data2``.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns; ``data1`` is replaced by the
        normalised values. ``x`` itself is left untouched, because mutating
        the object handed over by ``groupby.apply`` is not supported by pandas.
    """
    return x.assign(data1=x["data1"] / x["data2"].sum())

# Show the input frame, then the result of applying norm_by_data2 per key.
print(df)
print(df.groupby(by="key").apply(norm_by_data2))

  key  data1  data2
0   A      0      5
1   B      1      0
2   C      2      3
3   A      3      3
4   B      4      7
5   C      5      9
      key     data1  data2
key                       
A   0   A  0.000000      5
    3   A  0.375000      3
B   1   B  0.142857      0
    4   B  0.571429      7
C   2   C  0.166667      3
    5   C  0.416667      9


  print(df); print(df.groupby("key").apply(norm_by_data2))


In [23]:
def norm_by_data2(x):
    """Return a copy of ``x`` with a ``res`` column: ``data1`` over the sum of ``data2``.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame (typically one groupby group) holding ``data1`` and ``data2``.

    Returns
    -------
    pandas.DataFrame
        New frame with every original column plus ``res``. ``x`` itself is
        left untouched, because mutating the object handed over by
        ``groupby.apply`` is not supported by pandas.
    """
    return x.assign(res=x["data1"] / x["data2"].sum())

# Apply norm_by_data2 to each key's sub-frame and combine the results.
df.groupby(by="key").apply(norm_by_data2)

  df.groupby("key").apply(norm_by_data2)


Unnamed: 0_level_0,Unnamed: 1_level_0,key,data1,data2,res
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
A,0,A,0,5,0.0
A,3,A,3,3,0.375
B,1,B,1,0,0.142857
B,4,B,4,7,0.571429
C,2,C,2,3,0.166667
C,5,C,5,9,0.416667


### grouping key specification

In [24]:
# grouping keys may be any list/array/series/index aligned with the rows
L = [0, 1, 0, 1, 2, 0]
print(df)
print(df.groupby(by=L).sum())

  key  data1  data2
0   A      0      5
1   B      1      0
2   C      2      3
3   A      3      3
4   B      4      7
5   C      5      9
   key  data1  data2
0  ACC      7     17
1   BA      4      3
2    B      4      7


In [25]:
# Grouping by the column passed as a Series rather than by its name.
df.groupby(by=df["key"]).sum()

Unnamed: 0_level_0,data1,data2
key,Unnamed: 1_level_1,Unnamed: 2_level_1
A,3,8
B,5,7
C,7,12


In [26]:
# a dict (or series) translating index labels into group labels
df2 = df.set_index(keys="key")
mapping = dict(A="vowel", B="consonant", C="consonant")
df2.groupby(by=mapping).sum()

Unnamed: 0_level_0,data1,data2
key,Unnamed: 1_level_1,Unnamed: 2_level_1
consonant,12,19
vowel,3,8


In [27]:
# any Python function: it is called on each index label to produce the group
df2.groupby(by=str.lower).mean()

Unnamed: 0_level_0,data1,data2
key,Unnamed: 1_level_1,Unnamed: 2_level_1
a,1.5,4.0
b,2.5,3.5
c,3.5,6.0


In [29]:
# a list of valid keys: combining them groups on a multi-level index
df2.groupby(by=[str.lower, mapping]).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key,key,Unnamed: 2_level_1,Unnamed: 3_level_1
a,vowel,1.5,4.0
b,consonant,2.5,3.5
c,consonant,3.5,6.0


In [31]:
# groupby example: display the planets frame that the next cell groups by
# method and decade
planets

Unnamed: 0,method,number,orbital_period,mass,distance,year
0,Radial Velocity,1,269.300000,7.10,77.40,2006
1,Radial Velocity,1,874.774000,2.21,56.95,2008
2,Radial Velocity,1,763.000000,2.60,19.84,2011
3,Radial Velocity,1,326.030000,19.40,110.62,2007
4,Radial Velocity,1,516.220000,10.50,119.47,2009
...,...,...,...,...,...,...
1030,Transit,1,3.941507,,172.00,2006
1031,Transit,1,2.615864,,148.00,2007
1032,Transit,1,3.191524,,174.00,2007
1033,Transit,1,4.125083,,293.00,2008


In [32]:
# Label each row with the decade of its `year` (e.g. 2006 -> "2000s"), then sum
# `number` per (method, decade) and pivot the decades into columns.
decade = ((10 * (planets["year"] // 10)).astype(str) + "s").rename("decade")
planets.groupby(by=["method", decade])["number"].sum().unstack().fillna(0)

decade,1980s,1990s,2000s,2010s
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Astrometry,0.0,0.0,0.0,2.0
Eclipse Timing Variations,0.0,0.0,5.0,10.0
Imaging,0.0,0.0,29.0,21.0
Microlensing,0.0,0.0,12.0,15.0
Orbital Brightness Modulation,0.0,0.0,0.0,5.0
Pulsar Timing,0.0,9.0,1.0,1.0
Pulsation Timing Variations,0.0,0.0,1.0,0.0
Radial Velocity,1.0,52.0,475.0,424.0
Transit,0.0,0.0,64.0,712.0
Transit Timing Variations,0.0,0.0,0.0,9.0


### ways to specify agg

In [None]:
# Load the titanic CSV from the parent directory; `df` is rebound to it.
df = pd.read_csv(filepath_or_buffer="../titanic.csv")

In [None]:
# Bucket rows into four quantile-based fare bins.
df["fare_bin"] = pd.qcut(df["fare"], q=4)

# Named aggregation per fare bin, written with plain (column, aggfunc) tuples.
df.groupby(["fare_bin"], dropna=False, observed=False).agg(
    n=("fare_bin", "count"),
    # shape of the slice of `df` made of the group's rows
    shape=("who", lambda who: df.loc[who.index, :].shape),
    survive_rate=("survived", "mean"),
    n_man=("who", lambda who: who.isin(["man"]).sum()),
    # percentage of "man" among the non-null `who` values; NaN for empty groups
    pct_man=(
        "who",
        lambda who: (
            who.isin(["man"]).sum() / who.count() * 100
            if who.count() > 0
            else np.nan
        ),
    ),
)

Unnamed: 0_level_0,n,shape,survive_rate,n_man,pct_man
fare_bin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
"(-0.001, 7.91]",223,"(223, 16)",0.197309,179,80.269058
"(7.91, 14.454]",224,"(224, 16)",0.303571,153,68.303571
"(14.454, 31.0]",222,"(222, 16)",0.454955,111,50.0
"(31.0, 512.329]",222,"(222, 16)",0.581081,94,42.342342
