## Calculating percentiles / quantiles

### Random examples from Stack Overflow

https://stackoverflow.com/questions/21844024/weighted-percentile-using-numpy

In [1]:
import numpy as np

In [24]:
# Eleven sample values (0..10), each carrying the same weight of 1.5
data = np.arange(11)
weights = np.full(data.size, 1.5)

In [18]:
quantile = 0.25

# Sort the values and reorder the weights to match
ind_sorted = np.argsort(data)
sorted_data, sorted_weights = data[ind_sorted], weights[ind_sorted]

# Running total of the sorted weights; each point is then placed at the
# midpoint of its own weight and normalised by the total weight
Sn = sorted_weights.cumsum()
Pn = (Sn - 0.5 * sorted_weights) / Sn[-1]

# Linearly interpolate the sorted values at the requested quantile
np.interp(quantile, Pn, sorted_data)

2.25

In [19]:
# Unweighted reference value from NumPy, for comparison with the weighted result above
np.quantile(data, q=0.25)

2.5

In [20]:
def weighted_percentile(a, q=np.array([75, 25]), w=None):
    """
    Calculates percentiles associated with a (possibly weighted) array

    Parameters
    ----------
    a : array-like
        The input values from which to calculate percentiles
        (intended for one-dimensional input)
    q : float or array-like
        The percentile(s) to calculate (0.0 - 100.0)
    w : array-like, optional
        The weights to assign to values of a.  Equal weighting (a weight
        of 1 per value) if None is specified

    Returns
    -------
    values : np.ndarray or NumPy scalar
        The values associated with the specified percentiles; a scalar
        is returned when q is a scalar.

    Raises
    ------
    ValueError
        If w is given and its shape differs from the shape of a.
    """
    # Coerce the inputs so plain lists/tuples work, as the docstring promises
    a = np.asarray(a)
    q = np.asarray(q) / 100.0
    w = np.ones(a.shape) if w is None else np.asarray(w)
    if w.shape != a.shape:
        raise ValueError("a and w must have the same shape")

    # Sort the values and carry the weights along in the same order
    idx = np.argsort(a)
    a_sort = a[idx]
    w_sort = w[idx]

    # Get the cumulative sum of weights
    ecdf = np.cumsum(w_sort)

    # Find the percentile index positions associated with the percentiles
    p = q * (w.sum() - 1)

    # Find the bounding indices (both low and high).  The upper index is
    # capped with np.minimum instead of masked in-place assignment so that a
    # scalar q (for which searchsorted returns a NumPy scalar) also works.
    idx_low = np.searchsorted(ecdf, p, side='right')
    idx_high = np.minimum(
        np.searchsorted(ecdf, p + 1, side='right'), ecdf.size - 1
    )

    # Interpolation weights come from the fractional part of the position
    weights_high = p - np.floor(p)
    weights_low = 1.0 - weights_high

    # Extract the low/high values and multiply by the corresponding weights
    x1 = np.take(a_sort, idx_low) * weights_low
    x2 = np.take(a_sort, idx_high) * weights_high

    # Return the weighted blend of the two bounding values
    return np.add(x1, x2)

In [25]:
# Evaluate the weighted sample at several percentiles (0-100 scale)
weighted_percentile(data, q=[10, 25, 50, 75, 90, 100], w=weights)

array([ 1.   ,  2.875,  5.   ,  7.625,  9.   , 10.   ])

In [26]:
def weighted_percentile2(data, weights, perc):
    """
    Weighted percentile by linear interpolation over a midpoint 'CDF'.

    data : NumPy array of sample values
    weights : NumPy array of weights, one per entry of data
    perc : percentile in [0-1]!
    """
    order = np.argsort(data)
    values_sorted, weights_sorted = data[order], weights[order]

    # Place each point at the midpoint of its own weight on the cumulative
    # scale and normalise by the total weight -- 'like' a CDF function
    running_total = np.cumsum(weights_sorted)
    cdf = (running_total - 0.5 * weights_sorted) / np.sum(weights_sorted)

    return np.interp(perc, cdf, values_sorted)

In [29]:
# perc is a fraction in [0, 1], so 0.1 requests the 10th percentile
weighted_percentile2(data, weights, perc=0.1)

0.6000000000000001

### statsmodels

In [58]:
from statsmodels.stats.weightstats import DescrStatsW

In [60]:
# Values 0..100, each with a constant weight of 1.5, queried at
# probabilities 0.00, 0.01, ..., 1.00
wq = DescrStatsW(data=np.arange(101), weights=np.full(101, 1.5))
wq.quantile(probs=np.arange(0, 1.01, 0.01), return_pandas=False)

array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
        13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
        26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
        39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
        52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
        65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
        78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
        91,  92,  93,  94,  95,  96,  97,  98,  99, 100])

The statsmodels source code, which shows in detail how the weighted quantiles are calculated, is available in [`weightstats.py`](https://github.com/statsmodels/statsmodels/blob/2d5df534aa1bd8396ffa147168094c975fcd832a/statsmodels/stats/weightstats.py#L228).