In [1]:
import numpy as np
import pandas as pd

In [2]:
# Seed NumPy's legacy global generator so the random columns below (and every
# result displayed from them) are identical on each Restart & Run All.
np.random.seed(12345)

# Demo frame: two grouping keys that contain missing values (None in "key1",
# a missing entry in the nullable-integer "key2") plus two random data columns.
df = pd.DataFrame({"key1": ["a", "a", None, "b", "b", "a", None],
                   "key2": pd.Series([1, 2, 1, 2, 1, None, 1], dtype="Int64"),
                   "data1": np.random.standard_normal(7),
                   "data2": np.random.standard_normal(7)})

In [3]:
df

In [4]:
grouped = df["data1"].groupby(df["key1"])

In [5]:
grouped

In [6]:
grouped.mean()

In [7]:
means = df["data1"].groupby([df["key1"], df["key2"]]).mean()

In [8]:
means

In [9]:
means.unstack()

In [13]:
states = np.array(["OH", "CA", "CA", "OH", "OH", "CA", "OH"])

In [11]:
years = [2005, 2005, 2006, 2005, 2006, 2005, 2006]

In [14]:
df["data1"].groupby([states, years]).mean()

In [15]:
df["data1"]

In [16]:
df.groupby("key1").mean()

In [17]:
# "key1" holds strings, which cannot be averaged. pandas >= 2.0 raises a
# TypeError instead of silently dropping such columns, so restrict the mean
# to the numeric columns explicitly.
df.groupby("key2").mean(numeric_only=True)

In [18]:
df

In [21]:
df.groupby(["key1", "key2"]).mean()

In [22]:
df.groupby(["key1", "key2"]).size()

In [23]:
df.groupby("key1", dropna=False).size()

In [24]:
df.groupby(["key1", "key2"], dropna=False).size()

In [25]:
df.groupby("key1").count()

In [26]:
for name, group in df.groupby("key1"):
    print(name)
    print(group)

In [27]:
for (k1, k2), group in df.groupby(["key1", "key2"]):
    print((k1, k2))
    print(group)

In [28]:
pieces = {name: group for name, group in df.groupby("key1")}

In [29]:
pieces["b"]

In [30]:
grouped = df.groupby({"key1": "key", "key2": "key", "data1": "data", "data2": "data"}, axis="columns")

In [31]:
for group_key, group_values in grouped:
    print(group_key)
    print(group_values)

In [32]:
df.groupby("key1")["data1"]

In [33]:
df.groupby("key2")["data2"]

In [34]:
df["data1"].groupby(df["key1"])

In [35]:
df.groupby(["key1", "key2"])[["data2"]].mean()

In [36]:
s_grouped = df.groupby(["key1", "key2"])["data2"]

In [37]:
s_grouped

In [38]:
s_grouped.mean()

In [39]:
s_grouped.max()

In [40]:
people = pd.DataFrame(np.random.standard_normal((5, 5)),
                      columns=["a", "b", "c", "d", "e"],
                      index=["Joe", "Steve", "Wanda", "Jill", "Trey"])

In [41]:
people.iloc[2:3, [1, 2]] = np.nan

In [42]:
people

In [43]:
mapping = {"a": "red", "b": "red", "c": "blue", "d": "blue", "e": "red", "f": "orange"}

In [47]:
by_column = people.groupby(mapping, axis="columns")

In [45]:
# by_column = people.T.groupby(mapping)

In [49]:
by_column.sum()

In [50]:
map_series = pd.Series(mapping)

In [51]:
map_series

In [52]:
# Column-wise grouping without the deprecated axis="columns" argument:
# group the transposed frame by the Series, then transpose the counts back.
people.T.groupby(map_series).count().T

In [53]:
people

In [54]:
people.groupby(len).sum()

In [55]:
key_list = ["one", "one", "one", "two", "two"]

In [56]:
people.groupby([len, key_list]).min()

In [57]:
columns = pd.MultiIndex.from_arrays([["US", "US", "US", "JP", "JP"],
                                     [1, 3, 5, 1, 3]],
                                    names=["cty", "tenor"])

In [58]:
hier_df = pd.DataFrame(np.random.standard_normal((4, 5)), columns=columns)

In [59]:
hier_df

In [60]:
# Group by the "cty" level of the column MultiIndex. axis="columns" is
# deprecated in groupby, so operate on the transpose and flip the result back.
hier_df.T.groupby(level="cty").count().T

In [61]:
df

In [62]:
grouped = df.groupby("key1")

In [63]:
grouped["data1"].nsmallest(2)

In [64]:
def peak_to_peak(arr):
    """Return the spread of ``arr``: its maximum minus its minimum."""
    highest = arr.max()
    lowest = arr.min()
    return highest - lowest

In [65]:
grouped.agg(peak_to_peak)

In [66]:
grouped.describe()

In [67]:
tips = pd.read_csv("examples/tips.csv")

In [68]:
tips.head()

In [69]:
tips["tip_pct"] = tips["tip"] / tips["total_bill"]

In [70]:
tips.head()

In [71]:
grouped = tips.groupby(["day", "smoker"])

In [72]:
grouped_pct = grouped["tip_pct"]

In [73]:
grouped_pct.agg("mean")

In [74]:
grouped_pct.agg(["mean", "std", peak_to_peak])

In [75]:
# Name the aggregations by string. Passing np.std to agg is deprecated: pandas
# currently substitutes its own groupby std for it, and "std" keeps exactly
# that behavior without the FutureWarning.
grouped_pct.agg([("average", "mean"), ("stdev", "std")])

In [76]:
functions = ["count", "mean", "max"]

In [77]:
result = grouped[["tip_pct", "total_bill"]].agg(functions)

In [78]:
result

In [79]:
result["tip_pct"]

In [80]:
# (output column name, aggregation) pairs. The aggregations are given as
# strings because passing NumPy callables such as np.var to agg is deprecated.
ftuples = [("Average", "mean"), ("Variance", "var")]

In [81]:
grouped[["tip_pct", "total_bill"]].agg(ftuples)

In [82]:
# Per-column aggregations given by name; np.max as an agg function is
# deprecated in favor of the equivalent string "max".
grouped.agg({"tip": "max", "size": "sum"})

In [84]:
grouped.agg({"tip_pct": ["min", "max", "mean", "std"], "size": "sum"})

In [88]:
tips.groupby(["day", "smoker"], as_index=False).min()

In [89]:
def top(df, n=5, columns="tip_pct"):
    """Return the first ``n`` rows of ``df`` after sorting by ``columns`` in descending order."""
    ranked = df.sort_values(by=columns, ascending=False)
    return ranked[:n]

In [90]:
top(tips, n=6)

In [91]:
tips.groupby("smoker").apply(top)

In [92]:
tips.groupby(["smoker", "day"]).apply(top, n=1, columns="total_bill")

In [93]:
result = tips.groupby("smoker")["tip_pct"].describe()

In [94]:
result

In [95]:
result.unstack()

In [96]:
def f(group):
    """Return the summary produced by calling ``describe()`` on ``group``."""
    summary = group.describe()
    return summary

In [97]:
grouped.apply(f)

In [98]:
tips.groupby("smoker", group_keys=False).apply(top)

In [99]:
frame = pd.DataFrame({"data1": np.random.standard_normal(1000),
                      "data2": np.random.standard_normal(1000)})

In [100]:
frame.head()

In [101]:
quartiles = pd.cut(frame["data1"], 4)

In [102]:
quartiles.head(10)

In [103]:
def get_stats(group):
    """Build a frame with one column per statistic (min, max, count, mean) of ``group``."""
    stats = {}
    stats["min"] = group.min()
    stats["max"] = group.max()
    stats["count"] = group.count()
    stats["mean"] = group.mean()
    return pd.DataFrame(stats)

In [104]:
# quartiles comes from pd.cut, i.e. it is categorical. State observed=False
# (the current default) explicitly so the grouping does not emit a FutureWarning
# or change when the default changes; every bin is kept, occupied or not.
grouped = frame.groupby(quartiles, observed=False)

In [105]:
grouped.apply(get_stats)

In [106]:
grouped.agg(["min", "max", "count", "mean"])

In [107]:
quartiles_samp = pd.qcut(frame["data1"], 4, labels=False)

In [108]:
quartiles_samp.head()

In [109]:
grouped = frame.groupby(quartiles_samp)

In [110]:
grouped.apply(get_stats)

In [111]:
s = pd.Series(np.random.standard_normal(6))

In [112]:
s[::2] = np.nan

In [113]:
s

In [114]:
s.fillna(s.mean())

In [115]:
states = ["Ohio", "New York", "Vermont", "Florida", "Oregon", "Nevada", "California", "Idaho"]

In [116]:
group_key = ["East", "East", "East", "East", "West", "West", "West", "West"]

In [117]:
data = pd.Series(np.random.standard_normal(8), index=states)

In [118]:
data

In [119]:
data[["Vermont", "Nevada", "Idaho"]] = np.nan

In [120]:
data

In [121]:
data.groupby(group_key).size()

In [122]:
data.groupby(group_key).count()

In [123]:
data.groupby(group_key).mean()

In [124]:
def fill_mean(group):
    """Replace the missing values of ``group`` with the mean of ``group``."""
    group_average = group.mean()
    return group.fillna(group_average)

In [125]:
data.groupby(group_key).apply(fill_mean)

In [126]:
fill_values = {"East": 0.5, "West": -1}

In [127]:
def fill_func(group):
    """Fill missing values of ``group`` with the ``fill_values`` entry keyed by ``group.name``."""
    # fill_values is the module-level dict mapping a group label to its filler.
    replacement = fill_values[group.name]
    return group.fillna(replacement)

In [128]:
data.groupby(group_key).apply(fill_func)

In [129]:
group_key

In [130]:
["East", "West"] * 4

In [131]:
sorted(["East", "West"] * 4)

In [132]:
# Suit codes: Hearts (♥️), Spades (♠️), Clubs (♣️), Diamonds (♦️)
suits = ["H", "S", "C", "D"]
# Values for one suit are 1 through 10 followed by three 10s; repeated for the four suits.
card_val = ([*range(1, 11)] + [10] * 3) * 4
base_names = ["A", *range(2, 11), "J", "K", "Q"]
cards = []
for suit in suits:
    # Card label = rank followed by the suit letter, e.g. "AH", "10S".
    cards += [f"{num}{suit}" for num in base_names]

deck = pd.Series(card_val, index=cards)

In [139]:
card_val = (list(range(1, 11)) + [10] * 3) * 4

In [142]:
len(card_val)

In [143]:
card_val

In [133]:
deck.head(13)

In [134]:
def draw(deck, n=5):
    """Return ``n`` entries picked from ``deck`` by its ``sample`` method."""
    hand = deck.sample(n)
    return hand

In [135]:
draw(deck)

In [136]:
def get_suit(card):
    """Return the suit of ``card``, i.e. its final character."""
    suit = card[-1]
    return suit

In [137]:
deck.groupby(get_suit).apply(draw, n=2)

In [145]:
deck.groupby(get_suit, group_keys=False).apply(draw, n=2)

In [146]:
df = pd.DataFrame({"category": ["a", "a", "a", "a", "b", "b", "b", "b"],
                   "data": np.random.standard_normal(8),
                   "weights": np.random.uniform(size=8)})

In [147]:
df

In [148]:
grouped = df.groupby("category")

In [149]:
def get_wavg(group):
    """Return the average of ``group["data"]`` weighted by ``group["weights"]``."""
    values = group["data"]
    weights = group["weights"]
    return np.average(values, weights=weights)

In [150]:
grouped.apply(get_wavg)

In [151]:
close_px = pd.read_csv("examples/stock_px.csv", parse_dates=True,index_col=0)

In [152]:
close_px.info()

In [153]:
close_px.tail(4)

In [154]:
def spx_corr(group):
    """Correlate every column of ``group`` with its ``"SPX"`` column."""
    benchmark = group["SPX"]
    return group.corrwith(benchmark)

In [155]:
rets = close_px.pct_change().dropna()

In [156]:
def get_year(x):
    """Return the ``year`` attribute of ``x``."""
    year = x.year
    return year

In [157]:
by_year = rets.groupby(get_year)

In [158]:
by_year.apply(spx_corr)

In [159]:
def corr_aapl_msft(group):
    """Return the correlation between the ``"AAPL"`` and ``"MSFT"`` columns of ``group``."""
    aapl = group["AAPL"]
    msft = group["MSFT"]
    return aapl.corr(msft)

In [160]:
by_year.apply(corr_aapl_msft)

In [161]:
import statsmodels.api as sm

def regress(data, yvar=None, xvars=None):
    """Fit an OLS model of ``data[yvar]`` on ``data[xvars]`` and return the fitted parameters.

    The regressors are passed to ``sm.OLS`` exactly as selected; no constant
    column is added here.
    """
    response = data[yvar]
    predictors = data[xvars]
    fitted = sm.OLS(response, predictors).fit()
    return fitted.params

In [162]:
by_year.apply(regress, yvar="AAPL", xvars=["SPX"])

In [163]:
df = pd.DataFrame({"key": ["a", "b", "c"] * 4,
                   "value": np.arange(12.)})

In [164]:
df

In [165]:
g = df.groupby("key")["value"]

In [166]:
g.mean()

In [167]:
def get_mean(group):
    """Return the mean of ``group``."""
    average = group.mean()
    return average

In [168]:
g.transform(get_mean)

In [169]:
g.transform('mean')

In [170]:
def times_two(group):
    """Return ``group`` multiplied by two."""
    doubled = group * 2
    return doubled

In [171]:
g.transform(times_two)

In [172]:
def get_ranks(group):
    """Rank the values of ``group`` in descending order (largest value gets rank 1)."""
    ranks = group.rank(ascending=False)
    return ranks

In [173]:
g.transform(get_ranks)

In [174]:
def normalize(x):
    """Center ``x`` on its mean and divide by its standard deviation."""
    centered = x - x.mean()
    return centered / x.std()

In [175]:
g.transform(normalize)

In [176]:
g.apply(normalize)

In [178]:
g.transform("mean")

In [179]:
normalized = (df["value"] - g.transform("mean")) / g.transform("std")

In [180]:
normalized

In [181]:
tips.head()

In [182]:
# Pass the value columns explicitly: with the default mean aggregation,
# pandas >= 2.0 raises on non-numeric leftovers (e.g. the "time" column)
# instead of silently dropping them.
# NOTE(review): assumes these four are the numeric columns of tips; the CSV
# schema is not visible here, so confirm against examples/tips.csv.
tips.pivot_table(index=["day", "smoker"],
                 values=["size", "tip", "tip_pct", "total_bill"])

In [183]:
tips.pivot_table(index=["time", "day"], columns="smoker", values=["tip_pct", "size"])

In [184]:
tips.pivot_table(index=["time", "day"], columns="smoker", values=["tip_pct", "size"], margins=True)

In [187]:
tips.pivot_table(index=["time", "smoker"], columns="day",
                 values="tip_pct", aggfunc=len, margins=True)

In [188]:
tips.pivot_table(index=["time", "size", "smoker"], columns="day",
                 values="tip_pct", fill_value=0)

In [189]:
from io import StringIO

In [190]:
data = """Sample    Nationality Handedness
1   USA  Right-handed
2   Japan   Left-handed
3   USA Right-handed
4   Japan   Right-handed
5   Japan   Left-handed
6   Japan   Right-handed
7   USA Right-handed
8   USA Left-handed
9   Japan   Right-handed
10  USA Right-handed
"""

In [191]:
# Use a raw string for the separator regex: "\s" is not a valid Python string
# escape and newer interpreters warn about it, while r"\s+" passes the pattern
# for "one or more whitespace characters" through unchanged.
data = pd.read_table(StringIO(data), sep=r"\s+")

In [192]:
data

In [193]:
pd.crosstab(data["Nationality"], data["Handedness"], margins=True)

In [194]:
pd.crosstab([tips["time"], tips["day"]], tips["smoker"], margins=True)