## How to Carry out Descriptive Statistics in Python
This Jupyter Notebook contains examples of descriptive statistics and shows how to compute them in Python. Note: this is the code accompanying the blog post (https://www.marsja.se/pandas-python-descriptive-statistics/).

In [1]:
import numpy as np
import pandas as pd
from scipy.stats import trim_mean, kurtosis
from scipy.stats.mstats import mode, gmean, hmean

### Simulate Data using Python:

In [2]:
# Simulated design: N ids, each appearing once in every combination of the
# two factors (len(P) levels of iv1 x len(Q) levels of iv2).
N = 20
P = ["noise", "quiet"]
Q = [1, 2, 3]

# Seed the generator so that "Restart Kernel -> Run All" draws the same sample
# every time. Outputs saved before the seed was added will change once on re-run.
SEED = 42
np.random.seed(SEED)

# Mean rt of each cell: one [noise, quiet] pair per level of Q.
values = [[998, 511], [1119, 620], [1300, 790]]

# Repeat every cell mean N times so mus lines up row-for-row with the frame
# below (N noise rows, then N quiet rows, for each level of Q in turn).
mus = np.concatenate([np.repeat(value, N) for value in values])

n_cells = len(P) * len(Q)

data = pd.DataFrame(data={
    # ids 0..N-1 repeated once per cell
    'id': list(range(N)) * n_cells,
    # N rows of each iv1 level, the whole pattern repeated for every iv2 level
    'iv1': np.concatenate([np.array([p] * N) for p in P] * len(Q)),
    # each iv2 level covers one full block of N * len(P) rows
    'iv2': np.concatenate([np.array([q] * (N * len(P))) for q in Q]),
    # normal noise (sd = 112) around the cell mean of each row
    'rt': np.random.normal(mus, scale=112.0, size=N * n_cells),
})

### Summary Statistics using Pandas:

In [3]:
# Summary statistics for every numeric column (id and iv2 are numeric codes, so
# they are summarised alongside rt). The quartiles are spelled out explicitly;
# they are the same ones describe() reports by default.
data.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,id,iv2,rt
count,120.0,120.0,120.0
mean,9.5,2.0,896.561499
std,5.790459,0.81992,316.153639
min,0.0,1.0,158.180417
25%,4.75,1.0,656.700515
50%,9.5,2.0,941.34293
75%,14.25,3.0,1114.698885
max,19.0,3.0,1658.175594


#### Grouped Descriptive Statistics:

In [4]:
# Split the rows by the two factors; this groupby object is reused by the
# cells that follow.
grouped_data = data.groupby(by=['iv1', 'iv2'])

# Describe rt within each group, then move the innermost index level (iv2)
# into the columns so that each iv1 level occupies a single row.
(
    grouped_data['rt']
    .describe()
    .unstack(level=-1)
)

Unnamed: 0_level_0,count,count,count,mean,mean,mean,std,std,std,min,...,25%,50%,50%,50%,75%,75%,75%,max,max,max
iv2,1,2,3,1,2,3,1,2,3,1,...,3,1,2,3,1,2,3,1,2,3
iv1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
noise,20.0,20.0,20.0,1024.735384,1100.092451,1354.486062,56.72255,89.007461,136.175297,937.829372,...,1262.47645,1020.079029,1097.572049,1369.197266,1070.702218,1188.625837,1448.767732,1118.973159,1214.972325,1658.175594
quiet,20.0,20.0,20.0,477.587892,644.544313,777.922896,145.842666,128.757001,102.03602,158.180417,...,681.289723,515.252994,661.832697,795.377012,568.324702,730.277749,837.037028,688.494784,892.713374,994.637414


#### Getting the Mean Values in Pandas:

In [5]:
# Mean rt of each (iv1, iv2) group; reset_index turns the group keys back
# into ordinary columns.
(
    grouped_data['rt']
    .mean()
    .reset_index()
)

Unnamed: 0,iv1,iv2,rt
0,noise,1,1024.735384
1,noise,2,1100.092451
2,noise,3,1354.486062
3,quiet,1,477.587892
4,quiet,2,644.544313
5,quiet,3,777.922896


In [6]:
# Same group means as the previous cell, this time obtained by handing
# NumPy's mean function to the groupby aggregation.
(
    grouped_data['rt']
    .agg(np.mean)
    .reset_index()
)

Unnamed: 0,iv1,iv2,rt
0,noise,1,1024.735384
1,noise,2,1100.092451
2,noise,3,1354.486062
3,quiet,1,477.587892
4,quiet,2,644.544313
5,quiet,3,777.922896


### Geometric & Harmonic Mean in Python

#### SciPy and Pandas Method:

In [7]:
# Geometric mean of rt within each group. axis=None is forwarded to SciPy's
# gmean, exactly as the keyword form of apply would do.
(
    grouped_data['rt']
    .apply(lambda rts: gmean(rts, axis=None))
    .reset_index()
)

Unnamed: 0,iv1,iv2,rt
0,noise,1,1023.251476
1,noise,2,1096.511041
2,noise,3,1348.021823
3,quiet,1,452.220016
4,quiet,2,632.008212
5,quiet,3,771.648688


#### Harmonic Mean using SciPy & Pandas:

In [8]:
# Harmonic mean of rt within each group. axis=None is forwarded to SciPy's
# hmean, exactly as the keyword form of apply would do.
(
    grouped_data['rt']
    .apply(lambda rts: hmean(rts, axis=None))
    .reset_index()
)

Unnamed: 0,iv1,iv2,rt
0,noise,1,1021.776254
1,noise,2,1092.758392
2,noise,3,1341.601111
3,quiet,1,420.907334
4,quiet,2,619.204583
5,quiet,3,765.477057


#### Trimmed Mean in Python

In [9]:
# Trimmed mean of rt per group: 0.1 is passed to trim_mean as the proportion
# to cut. The result keeps the (iv1, iv2) group index and is reused in the
# combined summary table further down.
trimmed_mean = grouped_data['rt'].apply(
    lambda rts: trim_mean(rts, 0.1)
)

# Show it with the group keys as regular columns.
trimmed_mean.reset_index()

  return np.mean(atmp[sl], axis=axis)


Unnamed: 0,iv1,iv2,rt
0,noise,1,1023.632981
1,noise,2,1109.011706
2,noise,3,1348.134663
3,quiet,1,483.176326
4,quiet,2,642.112128
5,quiet,3,771.078668


### Pandas Median

In [10]:
# Group medians computed by handing NumPy's median to the pandas aggregation.
# The pandas-only spelling would be:
#     grouped_data['rt'].median().reset_index()
(
    grouped_data['rt']
    .agg(np.median)
    .reset_index()
)

Unnamed: 0,iv1,iv2,rt
0,noise,1,1020.079029
1,noise,2,1097.572049
2,noise,3,1369.197266
3,quiet,1,515.252994
4,quiet,2,661.832697
5,quiet,3,795.377012


### Scipy Mode

In [11]:
# Mode of rt within each group via SciPy. Each entry of the resulting rt column
# holds the whole result returned by mode() (mode values together with their
# counts), not a single number.
(
    grouped_data['rt']
    .apply(lambda rts: mode(rts, axis=None))
    .reset_index()
)

Unnamed: 0,iv1,iv2,rt
0,noise,1,"([992.5056185086947], [1.0])"
1,noise,2,"([1200.2164002564496], [1.0])"
2,noise,3,"([1151.2709350345488], [1.0])"
3,quiet,1,"([655.8536461210358], [1.0])"
4,quiet,2,"([685.4831950406578], [1.0])"
5,quiet,3,"([994.6374144874794], [1.0])"


### Median, Standard Deviation, Mean, and Trimmed Mean in a Pandas Dataframe

In [12]:
# Median, standard deviation and mean of rt per (iv1, iv2) group, plus the
# trimmed mean computed in the earlier "Trimmed Mean" cell.
#
# The statistics are requested by name rather than as NumPy callables. pandas
# currently maps np.std to its own sample standard deviation (ddof=1), but
# newer versions call the function as given, and np.std defaults to ddof=0.
# The string 'std' keeps the ddof=1 result on every version.
descr = (
    grouped_data['rt']
    .aggregate(['median', 'std', 'mean'])
    # Align on the (iv1, iv2) group index rather than on row position, so the
    # trimmed means cannot end up attached to the wrong group.
    .join(trimmed_mean.rename('trimmed_mean'))
    .reset_index()
)
descr

Unnamed: 0,iv1,iv2,median,std,mean,trimmed_mean
0,noise,1,1020.079029,56.72255,1024.735384,1023.632981
1,noise,2,1097.572049,89.007461,1100.092451,1109.011706
2,noise,3,1369.197266,136.175297,1354.486062,1348.134663
3,quiet,1,515.252994,145.842666,477.587892,483.176326
4,quiet,2,661.832697,128.757001,644.544313,642.112128
5,quiet,3,795.377012,102.03602,777.922896,771.078668


### Measures of Variability in Python

### Pandas Standard deviation

In [13]:
# Sample standard deviation of rt per group (ddof=1 is the pandas default,
# written out here for clarity).
(
    grouped_data['rt']
    .std(ddof=1)
    .reset_index()
)

Unnamed: 0,iv1,iv2,rt
0,noise,1,56.72255
1,noise,2,89.007461
2,noise,3,136.175297
3,quiet,1,145.842666
4,quiet,2,128.757001
5,quiet,3,102.03602


### Interquartile Range

In [14]:
# Quartiles of rt per group, one column per requested quantile after unstack.
# The interquartile range itself is the 0.75 column minus the 0.25 column.
(
    grouped_data['rt']
    .quantile(q=[0.25, 0.5, 0.75])
    .unstack()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,0.25,0.5,0.75
iv1,iv2,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
noise,1,978.632734,1020.079029,1070.702218
noise,2,1053.911499,1097.572049,1188.625837
noise,3,1262.47645,1369.197266,1448.767732
quiet,1,373.880293,515.252994,568.324702
quiet,2,552.412112,661.832697,730.277749
quiet,3,681.289723,795.377012,837.037028


### Pandas Variance

In [15]:
# Sample variance of rt per group (ddof=1 is the pandas default, written out
# here for clarity).
(
    grouped_data['rt']
    .var(ddof=1)
    .reset_index()
)

Unnamed: 0,iv1,iv2,rt
0,noise,1,3217.44769
1,noise,2,7922.328117
2,noise,3,18543.711533
3,quiet,1,21270.083176
4,quiet,2,16578.365277
5,quiet,3,10411.349474
