In [38]:
import pandas as pd
import numpy as np

In [31]:
df = pd.DataFrame({'word':'a the a an the'.split(),
                   'tag': list('SSTTT'),
                   'count': [30, 20, 60, 5, 10]})

In [32]:
df.head()

Unnamed: 0,word,tag,count
0,a,S,30
1,the,S,20
2,a,T,60
3,an,T,5
4,the,T,10


In [20]:
df.to_dict()

{'word': {0: 'a', 1: 'the', 2: 'a', 3: 'an', 4: 'the'},
 'tag': {0: 'S', 1: 'S', 2: 'T', 3: 'T', 4: 'T'},
 'count': {0: 30, 1: 20, 2: 60, 3: 5, 4: 10}}

In [14]:
#for every "word", to find the "tag" that has the most "count"
#by default axis = 0
df.groupby(['word','tag'])[['word','tag','count']].sum().reset_index()

Unnamed: 0,word,tag,count
0,a,S,30
1,a,T,60
2,an,T,5
3,the,S,20
4,the,T,10


In [15]:

# to find the row ids for each word having the max count
df.groupby('word')[['count']].max().reset_index()

Unnamed: 0,word,count
0,a,60
1,an,5
2,the,20


In [16]:
df.groupby('word')[['count']].idxmax()


Unnamed: 0_level_0,count
word,Unnamed: 1_level_1
a,2
an,3
the,1


In [33]:
#in this case idxmax returns the index of the row having max('count') for each group of 'word'
df.groupby('word')[['count']].idxmax()

Unnamed: 0_level_0,count
word,Unnamed: 1_level_1
a,2
an,3
the,1


In [35]:
idx = df.groupby('word')['count'].idxmax()
df.loc[idx,['word','tag']]

Unnamed: 0,word,tag
2,a,T
3,an,T
1,the,S


In [36]:
#using apply
df.groupby('word').apply(lambda subdf: subdf['tag'][subdf['count'].idxmax()])

word
a      T
an     T
the    S
dtype: object

In [27]:
df['tag'][df['count'].idxmax()]

'T'

In [84]:
df = pd.DataFrame(
    [
        ("bird", "Falconiformes", 389.0),
        ("bird", "Psittaciformes", 24.0),
        ("mammal", "Carnivora", 80.2),
        ("mammal", "Primates", np.nan),
        ("mammal", "Carnivora", 58),
    ],
    index=["falcon", "parrot", "lion", "monkey", "leopard"],
    columns=("class", "order", "max_speed"),
)

df.head()

Unnamed: 0,class,order,max_speed
falcon,bird,Falconiformes,389.0
parrot,bird,Psittaciformes,24.0
lion,mammal,Carnivora,80.2
monkey,mammal,Primates,
leopard,mammal,Carnivora,58.0


In [57]:
df.head()

Unnamed: 0,class,order,max_speed
falcon,bird,Falconiformes,389.0
parrot,bird,Psittaciformes,24.0
lion,mammal,Carnivora,80.2
monkey,mammal,Primates,
leopard,mammal,Carnivora,58.0


In [58]:
df.groupby(["order","class"], axis="rows")[["order",'class']].count()

Unnamed: 0_level_0,Unnamed: 1_level_0,order,class
order,class,Unnamed: 2_level_1,Unnamed: 3_level_1
Carnivora,mammal,2,2
Falconiformes,bird,1,1
Primates,mammal,1,1
Psittaciformes,bird,1,1


In [85]:
df.groupby("order", axis="columns").groups

{}

In [101]:
df = pd.DataFrame(
    {
        "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
        "B": ["one", "one", "two", "three", "two", "two", "one", "three"],
        "C": np.random.randn(8),
        "D": np.random.randn(8),
    }
)


In [102]:
df.head()

Unnamed: 0,A,B,C,D
0,foo,one,1.290294,0.178889
1,bar,one,-1.209721,0.292738
2,foo,two,0.658686,-0.160605
3,bar,three,0.018864,-0.495938
4,foo,two,-0.096323,-1.607796


In [103]:
df.index.names

FrozenList([None])

In [104]:
df2 = df.set_index(["A", "B"])
df2.index.names

FrozenList(['A', 'B'])

In [105]:
df2.index.names.difference(['B'])

FrozenList(['A'])

In [108]:
df2.groupby(level=df2.index.names.difference(["B"]))['C'].max()

A
bar    0.018864
foo    1.290294
Name: C, dtype: float64

In [74]:
df2.groupby(level=df2.index.names.difference(["B"]))[['C']].sum()

Unnamed: 0_level_0,C
A,Unnamed: 1_level_1
bar,1.719782
foo,-1.443647


In [75]:
df2.groupby(level=df2.index.names.difference(["B"])).sum()

Unnamed: 0_level_0,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,1.719782,2.018919
foo,-1.443647,-1.179108


In [77]:
#groupby sorting
# by default it is sorted
df2 = pd.DataFrame({"X": ["B", "B", "A", "A"], "Y": [1, 2, 3, 4]})
df2

Unnamed: 0,X,Y
0,B,1
1,B,2
2,A,3
3,A,4


In [80]:
df2.groupby(["X"], sort=False).sum()

Unnamed: 0_level_0,Y
X,Unnamed: 1_level_1
B,3
A,7


In [82]:
df2.groupby(["X"], sort=False).sum().columns, df2.groupby(["X"], sort=False).sum().reset_index().columns

(Index(['Y'], dtype='object'), Index(['X', 'Y'], dtype='object'))

In [86]:
#GroupBy dropna
#By default NA values are excluded from group keys during the groupby operation. However, 
# in case you want to include NA values in group keys, you could pass dropna=False to achieve it.

df_list = [[1, 2, 3], [1, None, 4], [2, 1, 3], [1, 2, 2]]
df_dropna = pd.DataFrame(df_list, columns=["a", "b", "c"])
df_dropna

Unnamed: 0,a,b,c
0,1,2.0,3
1,1,,4
2,2,1.0,3
3,1,2.0,2


In [87]:
df_dropna.groupby(by=["b"], dropna=False).sum()

Unnamed: 0_level_0,a,c
b,Unnamed: 1_level_1,Unnamed: 2_level_1
1.0,2,3
2.0,2,5
,1,4


In [89]:
df = pd.DataFrame(
    {
        "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
        "B": ["one", "one", "two", "three", "two", "two", "one", "three"],
        "C": np.random.randn(8),
        "D": np.random.randn(8),
    }
)
df

Unnamed: 0,A,B,C,D
0,foo,one,1.598096,0.159969
1,bar,one,-0.009492,1.545673
2,foo,two,0.19807,-0.140125
3,bar,three,0.148645,-1.446921
4,foo,two,-0.93476,-0.027414
5,bar,two,0.161474,0.409921
6,foo,one,-0.358416,-0.957096
7,foo,three,0.929314,-0.016349


In [91]:
# The groups attribute is a dict whose keys are the computed unique groups and corresponding values being the axis 
# labels belonging to each group. In the above example we have:
df.groupby('B').groups


{'one': [0, 1, 6], 'three': [3, 7], 'two': [2, 4, 5]}

In [94]:
df.groupby(['A','B'], axis=0).groups

{('bar', 'one'): [1], ('bar', 'three'): [3], ('bar', 'two'): [5], ('foo', 'one'): [0, 6], ('foo', 'three'): [7], ('foo', 'two'): [2, 4]}

In [95]:
#GroupBy with MultiIndex
arrays = [
    ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"],
    ["one", "two", "one", "two", "one", "two", "one", "two"],
]
index = pd.MultiIndex.from_arrays(arrays, names=["first", "second"])
s = pd.Series(np.random.randn(8), index=index)
s

first  second
bar    one      -0.670520
       two       1.270915
baz    one      -0.158306
       two       0.100360
foo    one       0.426268
       two      -1.556313
qux    one      -1.007640
       two       1.148724
dtype: float64

In [98]:
s.groupby(level=[0,1]).sum()

first  second
bar    one      -0.670520
       two       1.270915
baz    one      -0.158306
       two       0.100360
foo    one       0.426268
       two      -1.556313
qux    one      -1.007640
       two       1.148724
dtype: float64