# Python Statistics Fundamentals: How to Describe Your Data

In [None]:
import math
import statistics
import numpy as np
import scipy.stats
import pandas as pd

In [None]:
# Sample dataset, plus a copy with a single NaN inserted after the third item.
x = [8.0, 1, 2.5, 4, 28.0]
x_with_nan = x[:3] + [math.nan] + x[3:]
# The same two datasets as NumPy arrays ...
y = np.array(x)
y_with_nan = np.array(x_with_nan)
# ... and as pandas Series.
z = pd.Series(x)
z_with_nan = pd.Series(x_with_nan)

## <ins>Mean</ins>

In [None]:
# Plain-Python arithmetic mean: the sum of the items divided by their count.
total, count = sum(x), len(x)
mean_ = total / count
mean_

### Statistics

In [None]:
# Arithmetic mean using the standard library's statistics module.
mean_ = statistics.mean(x)
mean_

In [None]:
# A NaN in the input propagates: the result is nan rather than an error.
mean_ = statistics.mean(x_with_nan)
mean_

### NumPy

In [None]:
# NumPy exposes the mean as a module-level function taking the array.
mean_ = np.mean(y)  # Function
mean_

In [None]:
# The mean is also available as a method on the ndarray itself.
mean_ = y.mean()  # Method
mean_

In [None]:
# Both the function and the method return nan when the array contains a NaN.
# print() is needed for the first value: a cell displays only its last expression.
print(np.mean(y_with_nan))
y_with_nan.mean()

In [None]:
# np.nanmean() computes the mean while ignoring NaN entries.
np.nanmean(y_with_nan)  # Omit NaNs

### Pandas

In [None]:
# Series.mean() skips NaN values by default (skipna=True).
mean_ = z.mean()  # Omit NaNs by default
mean_

In [None]:
# The NaN is skipped, so this equals the mean of the NaN-free data.
z_with_nan.mean()

## <ins>Weighted Mean</ins>

The weighted mean is very handy when you need the mean of a dataset containing items that occur with given relative frequencies.

In [None]:
# Weighted mean of the values 2, 4 and 8 with weights 0.2, 0.5 and 0.3.
# The weights sum to 1, so no division by the total weight is needed.
0.2 * 2 + 0.5 * 4 + 0.3 * 8

In [None]:
# Weighted mean by position: multiply each value by the weight at the same
# index, then normalize by the total weight.
x = [8.0, 1, 2.5, 4, 28.0]
w = [0.1, 0.2, 0.3, 0.25, 0.15]
weighted_total = sum(weight * x[i] for i, weight in enumerate(w))
wmean = weighted_total / sum(w)
wmean

In [None]:
# Weighted mean again, pairing each value with its weight instead of indexing.
weighted_total = sum(x_ * w_ for x_, w_ in zip(x, w))
wmean = weighted_total / sum(w)
wmean

### NumPy

In [None]:
# Convert values and weights to arrays; np.average() applies the weights
# and normalizes by their sum.
y = np.array(x)
w = np.array(w)
wmean = np.average(y, weights=w)
wmean

## <ins>Harmonic Mean</ins>

The harmonic mean is the reciprocal of the mean of the reciprocals of all items in the dataset: 𝑛 / Σᵢ(1/𝑥ᵢ).

In [None]:
# Manual harmonic mean: the item count divided by the sum of the reciprocals.
reciprocal_total = sum(1 / item for item in x)
hmean = len(x) / reciprocal_total
hmean

### Statistics

In [None]:
# Standard-library implementation of the harmonic mean.
statistics.harmonic_mean(x)

In [None]:
# A NaN item propagates into the result (nan).
statistics.harmonic_mean(x_with_nan)

In [None]:
# If any item is zero, the harmonic mean is defined as 0.
statistics.harmonic_mean([1, 0, 2])

`statistics.harmonic_mean()` does not support negative values: it raises `statistics.StatisticsError` if any item is negative.

### SciPy

In [None]:
# SciPy's harmonic mean, applied to a NumPy array.
scipy.stats.hmean(y)

In [None]:
# The same function applied to a pandas Series.
scipy.stats.hmean(z)

## <ins>Geometric Mean</ins>

The geometric mean is the 𝑛-th root of the product of all 𝑛 elements 𝑥ᵢ in a dataset 𝑥.

In [None]:
# Manual geometric mean: multiply all items together, then take the n-th root.
product = 1
for value in x:
    product = product * value
gmean = product ** (1 / len(x))
gmean

### SciPy

In [None]:
# SciPy's geometric mean, applied to a NumPy array.
scipy.stats.gmean(y)

In [None]:
# The same function applied to a pandas Series.
scipy.stats.gmean(z)

## <ins>Median</ins>

In [None]:
# Manual median: sort once, then take the middle item (odd count) or the
# average of the two middle items (even count).
n = len(x)
x_ord = sorted(x)
index, is_odd = divmod(n, 2)
if is_odd:
    median_ = x_ord[index]
else:
    median_ = 0.5 * (x_ord[index - 1] + x_ord[index])
median_

### Statistics

In [None]:
# Odd number of items: the median is the middle value of the sorted data.
statistics.median(x)

In [None]:
# Dropping the last item leaves an even count, so the median is the average
# of the two middle values.
statistics.median(x[:-1])

In [None]:
# median_low() returns the lower of the two middle values instead of averaging.
statistics.median_low(x[:-1])

In [None]:
# median_high() returns the higher of the two middle values instead of averaging.
statistics.median_high(x[:-1])

In [None]:
# NOTE: NaN does not compare consistently with other numbers, so a median that
# relies on sorting data containing NaN should not be trusted.
statistics.median(x_with_nan)

In [None]:
# NOTE: the input contains NaN, so the sort order (and thus this result) is unreliable.
statistics.median_low(x_with_nan)

In [None]:
# NOTE: the input contains NaN, so the sort order (and thus this result) is unreliable.
statistics.median_high(x_with_nan)

### NumPy

In [None]:
# NumPy median of the full array (odd number of items).
np.median(y)

In [None]:
# Even number of items: NumPy averages the two middle values.
np.median(y[:-1])

In [None]:
# np.nanmedian() ignores NaN entries when computing the median.
np.nanmedian(y_with_nan)  # NumPy median with NaN

In [None]:
# Same with the last item dropped: four non-NaN values remain.
np.nanmedian(y_with_nan[:-1])  # NumPy median with NaN

### Pandas

In [None]:
# pandas median of the NaN-free Series.
z.median()

In [None]:
# Series.median() skips NaN values by default (skipna=True).
z_with_nan.median()

## <ins>Mode</ins>

In [None]:
# Manual mode: count every distinct value of u, then pick the one with the
# highest count (ties are broken in favour of the larger value).
u = [2, 3, 2, 8, 12]
v = [12, 15, 12, 15, 21, 15, 12]
counts = {item: u.count(item) for item in set(u)}
mode_ = max(counts, key=lambda item: (counts[item], item))
mode_

### Statistics

In [None]:
# statistics.mode() returns a single most common value.
statistics.mode(u)

In [None]:
# 2 occurs twice and nan only once, so the mode is 2.
statistics.mode([2, math.nan, 2])

In [None]:
# Here math.nan is the most frequent item (it occurs twice), so nan is
# reported as the mode: NaN is counted like any other value.
statistics.mode([2, math.nan, 0, math.nan, 5])

`statistics.multimode()` was introduced in Python 3.8; it returns a list of all the most frequent values instead of a single one.

### SciPy

In [None]:
# Rebind u and v as NumPy arrays for the SciPy examples.
u = np.array(u)
v = np.array(v)

In [None]:
# Returns a result object holding the mode and its number of occurrences.
scipy.stats.mode(u)

In [None]:
# v has two equally frequent values; scipy.stats.mode() reports only one of
# them (the smallest, per SciPy's documentation).
scipy.stats.mode(v)

In [None]:
# The modal value is available as the .mode attribute of the result.
scipy.stats.mode(v).mode

In [None]:
# The number of occurrences of the mode is available as .count.
scipy.stats.mode(v).count

### Pandas

In [None]:
# Rebind u and v as pandas Series, and make w a Series that contains a NaN.
u = pd.Series(u)
v = pd.Series(v)
w = pd.Series([2, 2, math.nan])

In [None]:
# Series.mode() returns a Series, because there can be more than one mode.
u.mode()

In [None]:
# v has two equally frequent values, so both are returned.
v.mode()

In [None]:
# NaN is ignored by default (dropna=True).
w.mode()

## <ins>Variance</ins>

In [None]:
# Manual sample variance: sum of squared deviations from the mean, divided by
# n - 1 (Bessel's correction).
n = len(x)
mean_ = sum(x) / n
squared_deviations = [(item - mean_) ** 2 for item in x]
var_ = sum(squared_deviations) / (n - 1)
var_

### Statistics

In [None]:
# Sample variance (n - 1 in the denominator) from the standard library.
statistics.variance(x)

In [None]:
# A NaN item propagates into the result (nan).
statistics.variance(x_with_nan)

### NumPy

In [None]:
# ddof=1 selects the sample variance (divide by n - 1); NumPy's default is
# ddof=0. The NaN in the array makes the result nan.
np.var(y_with_nan, ddof=1)

In [None]:
# Method form of the same computation; the result is nan as well.
y_with_nan.var(ddof=1)

In [None]:
# np.nanvar() ignores NaN entries when computing the variance.
np.nanvar(y_with_nan, ddof=1)

### Pandas

In [None]:
# ddof=1 is already the pandas default; it is spelled out here to match the
# NumPy calls.
z.var(ddof=1)

In [None]:
# Series.var() skips NaN values by default (skipna=True).
z_with_nan.var(ddof=1)

## <ins>Standard Deviation</ins>

In [None]:
# The sample standard deviation is the square root of the sample variance.
# Relies on var_ having been computed by the manual variance cell.
std_ = var_ ** 0.5
std_

### Statistics

In [None]:
# Sample standard deviation from the standard library.
statistics.stdev(x)

### NumPy

In [None]:
# ddof=1 selects the sample standard deviation; NumPy's default is ddof=0.
np.std(y, ddof=1)

In [None]:
# Method form of the same computation.
y.std(ddof=1)

In [None]:
# The NaN in the array makes the result nan.
np.std(y_with_nan, ddof=1)

In [None]:
# Method form; the result is nan as well.
y_with_nan.std(ddof=1)

In [None]:
# np.nanstd() ignores NaN entries when computing the standard deviation.
np.nanstd(y_with_nan, ddof=1)

### Pandas

In [None]:
# ddof=1 is already the pandas default; it is spelled out here to match the
# NumPy calls.
z.std(ddof=1)

In [None]:
# Series.std() skips NaN values by default (skipna=True).
z_with_nan.std(ddof=1)

## <ins>Skewness</ins>

Skewness measures the asymmetry of a data sample.

In [None]:
# Manual sample skewness: the sum of cubed deviations from the mean, scaled by
# n / ((n - 1) * (n - 2) * std**3), where std is the sample standard deviation.
x = [8.0, 1, 2.5, 4, 28.0]
n = len(x)
mean_ = sum(x) / n
deviations = [item - mean_ for item in x]
var_ = sum(dev**2 for dev in deviations) / (n - 1)
std_ = var_ ** 0.5
cubed_total = sum(dev**3 for dev in deviations)
skew_ = cubed_total * n / ((n - 1) * (n - 2) * std_**3)
skew_