# Simple Aggregation in pandas

In [1]:
import pandas as pd
import numpy as np

In [2]:
rng  = np.random.RandomState(42)
ser = pd.Series(rng.rand(5))
ser.sum() 
# ser.mean()

2.811925491708157

In [3]:
# Aggregation on a Dataframe returns results within each column

df = pd.DataFrame({'A': rng.rand(5),
                   'B': rng.rand(5)})
df.mean()

# Aggregating between each rows by specifying axis argument
df.mean(axis='columns')

0    0.088290
1    0.513997
2    0.849309
3    0.406727
4    0.444949
dtype: float64

# Loading in planet Dataset from sea born package

In [4]:
import seaborn as sns
planets = sns.load_dataset('planets')
planets.shape
planets.dropna().describe()
planets.groupby('method')['orbital_period'].median()

# Iteration over groups
for (method,group) in planets.groupby("method"):
    print("{0:30s} shape={1}" . format(method,group.shape))

Astrometry                     shape=(2, 6)
Eclipse Timing Variations      shape=(9, 6)
Imaging                        shape=(38, 6)
Microlensing                   shape=(23, 6)
Orbital Brightness Modulation  shape=(3, 6)
Pulsar Timing                  shape=(5, 6)
Pulsation Timing Variations    shape=(1, 6)
Radial Velocity                shape=(553, 6)
Transit                        shape=(397, 6)
Transit Timing Variations      shape=(4, 6)


# GroupBy: Split, Apply, Combine

In [5]:
# using groupby

df = pd.DataFrame({'key1': ['A','B','C','A','B','C'],
                   'key2': ['one', 'two', 'two','three', 'two', 'three'],
                   'data1': np.random.randn(6),
                   'data2': np.random.randn(6)})

# Perfroming the basic split-apply-combine operation using the groupby method

# Using groupby on a specific column data
grouped = df['data1'].groupby(df['key1'])
grouped.mean()

# Applying groupby on every column in a data set
df.groupby(['key1','key2']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
A,one,-1.144846,-1.389672
A,three,0.706842,0.394626
B,two,-0.996359,-1.598027
C,three,-0.998385,-1.290746
C,two,-0.691918,2.094209


In [6]:
first_50 = planets.head(50)
median_orbital = first_50.groupby('method')['orbital_period']
median_orbital.median()

method
Eclipse Timing Variations    4343.5
Imaging                      6000.0
Radial Velocity               417.9
Name: orbital_period, dtype: float64

In [7]:
planets.groupby(['method'])[['orbital_period']].sum()

Unnamed: 0_level_0,orbital_period
method,Unnamed: 1_level_1
Astrometry,1262.36
Eclipse Timing Variations,42764.8
Imaging,1418973.0
Microlensing,22075.0
Orbital Brightness Modulation,2.12792
Pulsar Timing,36715.11
Pulsation Timing Variations,1170.0
Radial Velocity,455315.1
Transit,8377.523
Transit Timing Variations,239.3505


# Iteration over groups

In [8]:
for (method,group) in planets.groupby('method'):
    print(f'{method.ljust(30)} {group.shape}')
    
planets.groupby('method')['orbital_period'].describe()

Astrometry                     (2, 6)
Eclipse Timing Variations      (9, 6)
Imaging                        (38, 6)
Microlensing                   (23, 6)
Orbital Brightness Modulation  (3, 6)
Pulsar Timing                  (5, 6)
Pulsation Timing Variations    (1, 6)
Radial Velocity                (553, 6)
Transit                        (397, 6)
Transit Timing Variations      (4, 6)


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Astrometry,2.0,631.18,544.217663,246.36,438.77,631.18,823.59,1016.0
Eclipse Timing Variations,9.0,4751.644444,2499.130945,1916.25,2900.0,4343.5,5767.0,10220.0
Imaging,12.0,118247.7375,213978.177277,4639.15,8343.9,27500.0,94250.0,730000.0
Microlensing,7.0,3153.571429,1113.166333,1825.0,2375.0,3300.0,3550.0,5100.0
Orbital Brightness Modulation,3.0,0.709307,0.725493,0.240104,0.291496,0.342887,0.943908,1.544929
Pulsar Timing,5.0,7343.021201,16313.265573,0.090706,25.262,66.5419,98.2114,36525.0
Pulsation Timing Variations,1.0,1170.0,,1170.0,1170.0,1170.0,1170.0,1170.0
Radial Velocity,553.0,823.35468,1454.92621,0.73654,38.021,360.2,982.0,17337.5
Transit,397.0,21.102073,46.185893,0.355,3.16063,5.714932,16.1457,331.60059
Transit Timing Variations,3.0,79.7835,71.599884,22.3395,39.67525,57.011,108.5055,160.0


In [9]:
df = pd.DataFrame({'key': ['A', 'B', 'C', 'A', 'B', 'C'],
                   'data1': range(6),
                   'data2': range(2,8)},
                  columns = ['key', 'data1', 'data2'])
df

Unnamed: 0,key,data1,data2
0,A,0,2
1,B,1,3
2,C,2,4
3,A,3,5
4,B,4,6
5,C,5,7


In [10]:
rng = np.random.RandomState(0)
df = pd.DataFrame({'key': ['A', 'B', 'C', 'A', 'B', 'C'],
                   'data1': range(6),
                   'data2': rng.randint(0, 10, 6)},
                   columns = ['key', 'data1', 'data2'])
df

Unnamed: 0,key,data1,data2
0,A,0,5
1,B,1,0
2,C,2,3
3,A,3,3
4,B,4,7
5,C,5,9


In [21]:
def filter_funct(x):
    return x['data2'].std() > 4
    

df.groupby('key').filter(filter_funct)

# Transforming the data by using the transform 
df_mean = df.groupby('key').transform(lambda x : x-x.mean())

L = [0, 1, 0, 1, 2, 0]
display(df,df.groupby(L).sum())

# Specifying split keys with a dictionary or series mapping index to group

df1 = df.set_index('key')
mapping = {'A': 'vowel', 'B': 'consonant', 'C':'consonant'}
df1.groupby(mapping).sum()

# Specifying split keys with python function..
# this passs in the index and output the group

df1.groupby(str.lower).mean()

# Specifying with a lsit of valid keys

df1.groupby([str.lower, mapping]).mean()

Unnamed: 0,key,data1,data2
0,A,0,5
1,B,1,0
2,C,2,3
3,A,3,3
4,B,4,7
5,C,5,9


Unnamed: 0,data1,data2
0,7,17
1,4,3
2,4,7


Unnamed: 0,Unnamed: 1,data1,data2
a,vowel,1.5,4.0
b,consonant,2.5,3.5
c,consonant,3.5,6.0


In [38]:
decade = (10*planets['year']//10)
decade = decade.astype('str') + 's'
decade.name = 'decade'
planets.groupby(['method', decade])['number'].sum().unstack().fillna(0)

decade,1989s,1992s,1994s,1995s,1996s,1997s,1998s,1999s,2000s,2001s,...,2005s,2006s,2007s,2008s,2009s,2010s,2011s,2012s,2013s,2014s
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Astrometry,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
Eclipse Timing Variations,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,4.0,1.0,4.0,5.0,1.0,0.0,0.0
Imaging,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,4.0,1.0,17.0,3.0,9.0,3.0,2.0,7.0,0.0
Microlensing,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,1.0,0.0,6.0,2.0,2.0,1.0,8.0,4.0,0.0
Orbital Brightness Modulation,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,1.0,0.0
Pulsar Timing,0.0,6.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
Pulsation Timing Variations,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Radial Velocity,1.0,0.0,0.0,1.0,15.0,1.0,11.0,24.0,27.0,15.0,...,61.0,33.0,47.0,76.0,105.0,92.0,176.0,70.0,65.0,21.0
Transit,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,5.0,16.0,17.0,20.0,85.0,162.0,175.0,197.0,93.0
Transit Timing Variations,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,2.0,3.0


In [35]:
planets.columns

Index(['method', 'number', 'orbital_period', 'mass', 'distance', 'year'], dtype='object')