# Simple Aggregation in pandas

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Seeded generator so that every run of the notebook draws the same values.
rng = np.random.RandomState(seed=42)
ser = pd.Series(data=rng.rand(5))
# Collapse the whole Series into a single number (ser.mean() works the same way).
ser.sum()

2.811925491708157

In [3]:
# On a DataFrame, an aggregation runs down each column
# (axis='index', which is the default), giving one value per column.
df = pd.DataFrame({'A': rng.rand(5), 'B': rng.rand(5)})
df.mean(axis='index')

# Choosing the other axis aggregates across the columns instead,
# giving one value per row.
df.mean(axis=1)

0    0.088290
1    0.513997
2    0.849309
3    0.406727
4    0.444949
dtype: float64

# Loading the planets dataset from the seaborn package

In [27]:
import seaborn as sns

# Fetch seaborn's 'planets' example dataset as a DataFrame.
planets = sns.load_dataset(name='planets')
planets.shape
planets.dropna().describe()

# Median of orbital_period within each group of the 'method' column.
planets.groupby(by='method')['orbital_period'].median()

method
Astrometry                         631.180000
Eclipse Timing Variations         4343.500000
Imaging                          27500.000000
Microlensing                      3300.000000
Orbital Brightness Modulation        0.342887
Pulsar Timing                       66.541900
Pulsation Timing Variations       1170.000000
Radial Velocity                    360.200000
Transit                              5.714932
Transit Timing Variations           57.011000
Name: orbital_period, dtype: float64

# GroupBy: Split, Apply, Combine

In [21]:
# using groupby

# Draw the sample data from a dedicated, seeded generator instead of the
# unseeded global np.random state: re-running this cell always rebuilds the
# same frame, so the grouped means below are reproducible.
demo_rng = np.random.RandomState(0)
df = pd.DataFrame({'key1': ['A','B','C','A','B','C'],
                   'key2': ['one', 'two', 'two','three', 'two', 'three'],
                   'data1': demo_rng.randn(6),
                   'data2': demo_rng.randn(6)})

# Performing the basic split-apply-combine operation using the groupby method

# Using groupby on a specific column: data1 is split by the values of key1
# and each group is averaged. Only the last expression of a cell is rendered,
# so this intermediate result is computed but not shown.
grouped = df['data1'].groupby(df['key1'])
grouped.mean()

# Applying groupby on every remaining column, keyed by the (key1, key2) pairs
df.groupby(['key1','key2']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
A,one,1.616311,0.757745
A,three,0.249814,0.333858
B,two,0.203872,-2.187102
C,three,-1.00964,-1.980117
C,two,-1.673084,-0.643046


In [25]:
# Keep only the first 50 rows of the planets frame.
first_50 = planets.iloc[:50]
# Note: median_orbital holds the grouped orbital_period selection itself;
# the per-method medians are only computed by the .median() call below.
median_orbital = first_50.groupby(by='method')['orbital_period']
median_orbital.median()

method
Eclipse Timing Variations    4343.5
Imaging                      6000.0
Radial Velocity               417.9
Name: orbital_period, dtype: float64