In [4]:
import pandas as pd
import numpy as np

In [142]:
df = pd.read_csv('data/sensors.csv')
# df.columns = df.columns.str.lower().str.replace(' ', '_')

df = df[df.host == 'Russia'].drop(
    columns=['home_xg', 
             'away_xg', 
             'attendance', 
             'officials', 
             'notes', 
             'host', 
             'year', 
             'home_manager',
             'away_manager',
             'score',
             'home_captain',
             'away_captain'])

In [143]:
df.head(5)

Unnamed: 0,interval,sensor,axis,reading
0,0,accel,Z,0.0
1,0,accel,Y,0.5
2,0,accel,X,1.0
3,1,accel,Z,0.1
4,1,accel,Y,0.4


In [144]:
df.shape

(24, 4)

In [145]:
df.columns

Index(['interval', 'sensor', 'axis', 'reading'], dtype='object')

# Pandas Grouping and Aggregating

`groupby()`
`pd.cut()`
`pd.Grouper()`
`agg()`

## Groupby

In [146]:
# the object represents an interim description of the grouping that will eventually be performed

grouped = df.groupby(['sensor'])
grouped

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7fad88020340>

In [147]:
# quantity of groups

grouped.ngroups

2

In [148]:
# the groups property returns a Python dictionary whose keys represent the names of each group
# the values in the dictionary are an array of the index labels contained within each respective group

grouped.groups

{'accel': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], 'orientation': [12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23]}

In [149]:
# accessing the result of a grouping

def print_groups(grouped_obj):
    for name, group in grouped_obj:
        print(name)
        print(group)
        
print_groups(grouped)

accel
    interval sensor axis  reading
0          0  accel    Z      0.0
1          0  accel    Y      0.5
2          0  accel    X      1.0
3          1  accel    Z      0.1
4          1  accel    Y      0.4
5          1  accel    X      0.9
6          2  accel    Z      0.2
7          2  accel    Y      0.3
8          2  accel    X      0.8
9          3  accel    Z      0.3
10         3  accel    Y      0.2
11         3  accel    X      0.7
orientation
    interval       sensor axis  reading
12         0  orientation    Z      0.0
13         0  orientation    Y      0.1
14         0  orientation    X      0.0
15         1  orientation    Z      0.0
16         1  orientation    Y      0.2
17         1  orientation    X      0.1
18         2  orientation    Z      0.0
19         2  orientation    Y      0.3
20         2  orientation    X      0.2
21         3  orientation    Z      0.0
22         3  orientation    Y      0.4
23         3  orientation    X      0.3


In [150]:
# a summary of the size of all the groups

grouped.size()

sensor
accel          12
orientation    12
dtype: int64

In [151]:
# the number of items in each column of every group

grouped.count()

Unnamed: 0_level_0,interval,axis,reading
sensor,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
accel,12,12,12
orientation,12,12,12


In [152]:
# get the first row in each group

grouped.first()

Unnamed: 0_level_0,interval,axis,reading
sensor,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
accel,0,Z,0.0
orientation,0,Z,0.0


In [153]:
# get the last row in each group

grouped.last()

Unnamed: 0_level_0,interval,axis,reading
sensor,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
accel,3,X,0.7
orientation,3,X,0.3


In [155]:
# get a specific group by name

grouped.get_group('accel')

Unnamed: 0,interval,sensor,axis,reading
0,0,accel,Z,0.0
1,0,accel,Y,0.5
2,0,accel,X,1.0
3,1,accel,Z,0.1
4,1,accel,Y,0.4
5,1,accel,X,0.9
6,2,accel,Z,0.2
7,2,accel,Y,0.3
8,2,accel,X,0.8
9,3,accel,Z,0.3


In [156]:
# return the specified number of items in each group

print(grouped.head(3))
print(grouped.tail(3))

    interval       sensor axis  reading
0          0        accel    Z      0.0
1          0        accel    Y      0.5
2          0        accel    X      1.0
12         0  orientation    Z      0.0
13         0  orientation    Y      0.1
14         0  orientation    X      0.0
    interval       sensor axis  reading
9          3        accel    Z      0.3
10         3        accel    Y      0.2
11         3        accel    X      0.7
21         3  orientation    Z      0.0
22         3  orientation    Y      0.4
23         3  orientation    X      0.3


In [157]:
# return the n-th item in each group

grouped.nth(2)

Unnamed: 0_level_0,interval,axis,reading
sensor,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
accel,0,X,1.0
orientation,0,X,0.0


In [158]:
# get the descriptive statistics

grouped.describe()

Unnamed: 0_level_0,interval,interval,interval,interval,interval,interval,interval,interval,reading,reading,reading,reading,reading,reading,reading,reading
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
sensor,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
accel,12.0,1.5,1.167748,0.0,0.75,1.5,2.25,3.0,12.0,0.45,0.32891,0.0,0.2,0.35,0.725,1.0
orientation,12.0,1.5,1.167748,0.0,0.75,1.5,2.25,3.0,12.0,0.133333,0.143548,0.0,0.0,0.1,0.225,0.4


In [159]:
# show unique values using for grouping

grouped.groups.keys()

dict_keys(['accel', 'orientation'])

### Grouping using multiple columns

In [160]:
mcg = df.groupby(['sensor', 'interval'])
print_groups(mcg)

('accel', 0)
   interval sensor axis  reading
0         0  accel    Z      0.0
1         0  accel    Y      0.5
2         0  accel    X      1.0
('accel', 1)
   interval sensor axis  reading
3         1  accel    Z      0.1
4         1  accel    Y      0.4
5         1  accel    X      0.9
('accel', 2)
   interval sensor axis  reading
6         2  accel    Z      0.2
7         2  accel    Y      0.3
8         2  accel    X      0.8
('accel', 3)
    interval sensor axis  reading
9          3  accel    Z      0.3
10         3  accel    Y      0.2
11         3  accel    X      0.7
('orientation', 0)
    interval       sensor axis  reading
12         0  orientation    Z      0.0
13         0  orientation    Y      0.1
14         0  orientation    X      0.0
('orientation', 1)
    interval       sensor axis  reading
15         1  orientation    Z      0.0
16         1  orientation    Y      0.2
17         1  orientation    X      0.1
('orientation', 2)
    interval       sensor axis  reading

In [161]:
mcg.groups.keys()

dict_keys([('accel', 0), ('accel', 1), ('accel', 2), ('accel', 3), ('orientation', 0), ('orientation', 1), ('orientation', 2), ('orientation', 3)])

### Grouping using index levels

In [173]:
mi = df.copy()
mi = mi.set_index(['sensor', 'axis'])
mi

Unnamed: 0_level_0,Unnamed: 1_level_0,interval,reading
sensor,axis,Unnamed: 2_level_1,Unnamed: 3_level_1
accel,Z,0,0.0
accel,Y,0,0.5
accel,X,0,1.0
accel,Z,1,0.1
accel,Y,1,0.4
accel,X,1,0.9
accel,Z,2,0.2
accel,Y,2,0.3
accel,X,2,0.8
accel,Z,3,0.3


In [174]:
# group by the first level of the index

print_groups(mi.groupby(level=0))

accel
             interval  reading
sensor axis                   
accel  Z            0      0.0
       Y            0      0.5
       X            0      1.0
       Z            1      0.1
       Y            1      0.4
       X            1      0.9
       Z            2      0.2
       Y            2      0.3
       X            2      0.8
       Z            3      0.3
       Y            3      0.2
       X            3      0.7
orientation
                  interval  reading
sensor      axis                   
orientation Z            0      0.0
            Y            0      0.1
            X            0      0.0
            Z            1      0.0
            Y            1      0.2
            X            1      0.1
            Z            2      0.0
            Y            2      0.3
            X            2      0.2
            Z            3      0.0
            Y            3      0.4
            X            3      0.3


In [175]:
# group by multiple levels of the index

print_groups(mi.groupby(level=['sensor', 'axis']))

('accel', 'X')
             interval  reading
sensor axis                   
accel  X            0      1.0
       X            1      0.9
       X            2      0.8
       X            3      0.7
('accel', 'Y')
             interval  reading
sensor axis                   
accel  Y            0      0.5
       Y            1      0.4
       Y            2      0.3
       Y            3      0.2
('accel', 'Z')
             interval  reading
sensor axis                   
accel  Z            0      0.0
       Z            1      0.1
       Z            2      0.2
       Z            3      0.3
('orientation', 'X')
                  interval  reading
sensor      axis                   
orientation X            0      0.0
            X            1      0.1
            X            2      0.2
            X            3      0.3
('orientation', 'Y')
                  interval  reading
sensor      axis                   
orientation Y            0      0.1
            Y            1     

## Applying aggregate functions, transforms, and filters

In [176]:
grouped2 = mi.groupby(level=['sensor', 'axis'])
grouped2.agg(np.mean)

Unnamed: 0_level_0,Unnamed: 1_level_0,interval,reading
sensor,axis,Unnamed: 2_level_1,Unnamed: 3_level_1
accel,X,1.5,0.85
accel,Y,1.5,0.35
accel,Z,1.5,0.15
orientation,X,1.5,0.15
orientation,Y,1.5,0.25
orientation,Z,1.5,0.0


In [177]:
# many aggregation functions are built in directly to the GroupBy object

mcg.mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,reading
sensor,interval,Unnamed: 2_level_1
accel,0,0.5
accel,1,0.466667
accel,2,0.433333
accel,3,0.4
orientation,0,0.033333
orientation,1,0.1
orientation,2,0.166667
orientation,3,0.233333


In [178]:
mcg.min()

Unnamed: 0_level_0,Unnamed: 1_level_0,axis,reading
sensor,interval,Unnamed: 2_level_1,Unnamed: 3_level_1
accel,0,X,0.0
accel,1,X,0.1
accel,2,X,0.2
accel,3,X,0.2
orientation,0,X,0.0
orientation,1,X,0.0
orientation,2,X,0.0
orientation,3,X,0.0


In [179]:
mcg.max()

Unnamed: 0_level_0,Unnamed: 1_level_0,axis,reading
sensor,interval,Unnamed: 2_level_1,Unnamed: 3_level_1
accel,0,Z,1.0
accel,1,Z,0.9
accel,2,Z,0.8
accel,3,Z,0.7
orientation,0,Z,0.1
orientation,1,Z,0.2
orientation,2,Z,0.3
orientation,3,Z,0.4


In [180]:
mcg.sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,reading
sensor,interval,Unnamed: 2_level_1
accel,0,1.5
accel,1,1.4
accel,2,1.3
accel,3,1.2
orientation,0,0.1
orientation,1,0.3
orientation,2,0.5
orientation,3,0.7


### Applying multiple aggregating functions

In [181]:
# several functions to all columns

grouped2#.agg([np.sum, np.std])

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7fadd85b5280>

In [182]:
# different functions to different columns

grouped2.agg({
    'interval': len,
    'reading': np.mean
})

Unnamed: 0_level_0,Unnamed: 1_level_0,interval,reading
sensor,axis,Unnamed: 2_level_1,Unnamed: 3_level_1
accel,X,4,0.85
accel,Y,4,0.35
accel,Z,4,0.15
orientation,X,4,0.15
orientation,Y,4,0.25
orientation,Z,4,0.0


In [183]:
# several functions to same column

grouped2.agg({'reading': ['min', 'mean', 'max']})

Unnamed: 0_level_0,Unnamed: 1_level_0,reading,reading,reading
Unnamed: 0_level_1,Unnamed: 1_level_1,min,mean,max
sensor,axis,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
accel,X,0.7,0.85,1.0
accel,Y,0.2,0.35,0.5
accel,Z,0.0,0.15,0.3
orientation,X,0.0,0.15,0.3
orientation,Y,0.1,0.25,0.4
orientation,Z,0.0,0.0,0.0


In [184]:
# different functions to different columns with castomized columns names

grouped2.agg(read_min=('reading', 'min'), int_min=('interval', 'min'))

Unnamed: 0_level_0,Unnamed: 1_level_0,read_min,int_min
sensor,axis,Unnamed: 2_level_1,Unnamed: 3_level_1
accel,X,0.7,0
accel,Y,0.2,0
accel,Z,0.0,0
orientation,X,0.0,0
orientation,Y,0.1,0
orientation,Z,0.0,0


### Applying lambda-fanctions

In [185]:
grouped2.agg(lambda x: x.to_list())

Unnamed: 0_level_0,Unnamed: 1_level_0,interval,reading
sensor,axis,Unnamed: 2_level_1,Unnamed: 3_level_1
accel,X,"[0, 1, 2, 3]","[1.0, 0.9, 0.8, 0.7]"
accel,Y,"[0, 1, 2, 3]","[0.5, 0.4, 0.3, 0.2]"
accel,Z,"[0, 1, 2, 3]","[0.0, 0.1, 0.2, 0.3]"
orientation,X,"[0, 1, 2, 3]","[0.0, 0.1, 0.2, 0.3]"
orientation,Y,"[0, 1, 2, 3]","[0.1, 0.2, 0.3, 0.4]"
orientation,Z,"[0, 1, 2, 3]","[0.0, 0.0, 0.0, 0.0]"
