In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pymc3 as pm
import numpy.random as npr

%matplotlib inline

We're going to see how to use Bayesian Poisson estimation to model proportion data.

Let's say there are three bacteria species that characterize the gut, and we hypothesize that they are ever so slightly shifted from one another, but we don't know how (i.e. ignore the data-generating distribution below). Can we figure out the proportion parameters and their uncertainty?

Here, every patient is one sample, and we are recording the number of sequencing reads corresponding to some OTUs (bacteria). Each row is one sample (patient), and each column is one OTU (bacterial species).

In [None]:
def proportion(arr):
    """Scale ``arr`` so that its entries sum to one.

    Parameters
    ----------
    arr : object supporting ``.sum()`` and element-wise division
        For example a NumPy array or a pandas Series of read counts.

    Returns
    -------
    The result of dividing ``arr`` by the sum of its entries.
    """
    total = arr.sum()
    return arr / total

# Ground-truth composition used to simulate the healthy group: relative
# abundances 10 : 16 : 2 for the three species, normalised to sum to one.
healthy_proportions = np.array([10, 16, 2], dtype=float)
healthy_proportions /= healthy_proportions.sum()
healthy_proportions

In [None]:
# Ground-truth composition used to simulate the sick group: relative
# abundances 10 : 27 : 15 for the three species, normalised to sum to one.
sick_proportions = np.array([10, 27, 15], dtype=float)
sick_proportions /= sick_proportions.sum()
sick_proportions

In [None]:
# Number of simulated patients (rows) generated for each of the two groups.
n_data_points = 100

def make_healthy_multinomial(arr):
    """Simulate the per-species read counts of one healthy patient.

    ``arr`` is never read; the parameter exists only so that the function can
    serve as the callback of ``np.apply_along_axis``.  The total number of
    reads is drawn from a Poisson distribution with mean 50 and then split
    across the species according to the module-level ``healthy_proportions``.
    """
    return npr.multinomial(npr.poisson(lam=50), healthy_proportions)

def make_sick_multinomial(arr):
    """Simulate the per-species read counts of one sick patient.

    ``arr`` is never read; the parameter exists only so that the function can
    serve as the callback of ``np.apply_along_axis``.  The total number of
    reads is drawn from a Poisson distribution with mean 50 and then split
    across the species according to the module-level ``sick_proportions``.
    """
    return npr.multinomial(npr.poisson(lam=50), sick_proportions)

# Seed NumPy's global generator so that the simulated cohort is the same on
# every Restart & Run All instead of changing with each execution.
npr.seed(42)

# Generate healthy data.  The zero-filled array is only a placeholder: the
# generator ignores the row it receives, so ``apply_along_axis`` simply calls
# it once per patient and stacks the returned count vectors row by row.
healthy_reads = np.zeros((n_data_points, 3))
healthy_reads = np.apply_along_axis(make_healthy_multinomial, axis=1, arr=healthy_reads)

# Generate sick reads (same placeholder trick as above).
sick_reads = np.zeros((n_data_points, 3))
sick_reads = np.apply_along_axis(make_sick_multinomial, axis=1, arr=sick_reads)

In [None]:
# Wrap the simulated count matrices in labelled DataFrames: one row per
# patient, one column per bacterial species.
healthy_df = pd.DataFrame(healthy_reads, columns=['bacteria1', 'bacteria2', 'bacteria3'])
sick_df = pd.DataFrame(sick_reads, columns=['bacteria1', 'bacteria2', 'bacteria3'])

# Average read count per species in the healthy group.
healthy_df.mean()

In [None]:
# Average read count per species (column-wise mean) in the sick group.
sick_df.mean(axis=0)

The modelling strategy here uses the raw data. We start by modelling the counts of the healthy and the sick people's bacteria (e.g. the number of counts in sequencing reads), and deterministically computing the proportions (and their associated uncertainty).

$$ \mu_{healthy} \sim DiscreteUniform(0, 100) $$

$$ counts_{healthy} \sim Poisson(\mu_{healthy}) $$

$$ p_{healthy, i} = \frac{\mu_{healthy, i}}{\sum_{j} \mu_{healthy, j}} $$

In [None]:
with pm.Model() as poisson_model:
    # Healthy group: one Poisson rate per species with a flat discrete prior
    # on 0..100; the observed counts are the rows of healthy_df.
    healthy_mu = pm.DiscreteUniform('healthy_mu', 0, 100, shape=(3,))
    healthy_counts = pm.Poisson('healthy_counts', mu=healthy_mu, shape=(3,), observed=healthy_df)
    # The proportions are a deterministic transform of the rates.  The result is
    # bound to a *_rv name on purpose: assigning it to `healthy_proportions`
    # would overwrite the NumPy array that make_healthy_multinomial reads, so
    # re-running the data-generation cell would then fail.  The trace key
    # ('healthy_proportions') is unchanged.
    healthy_proportions_rv = pm.Deterministic('healthy_proportions', healthy_mu / healthy_mu.sum())

    # Sick group: same structure, fitted to the rows of sick_df.
    sick_mu = pm.DiscreteUniform('sick_mu', 0, 100, shape=(3,))
    sick_counts = pm.Poisson('sick_counts', mu=sick_mu, shape=(3,), observed=sick_df)
    # Same reasoning as above: do not clobber the `sick_proportions` array.
    sick_proportions_rv = pm.Deterministic('sick_proportions', sick_mu / sick_mu.sum())

    # Per-species difference between the two groups' proportions.
    diff_proportions = pm.Deterministic('diff_proportions', healthy_proportions_rv - sick_proportions_rv)

In [None]:
# Draw 10,000 posterior samples from poisson_model.  No step method is passed,
# so the choice of sampler is left to pm.sample's defaults.
with poisson_model:
    poisson_trace = pm.sample(draws=10000)

In [None]:
# Trace plot of poisson_trace with the first 2000 draws sliced off
# (presumably treated as burn-in).
pm.traceplot(poisson_trace[2000:])

In [None]:
# Forest plot restricted to the 'healthy_proportions' variable.
# NOTE(review): the full trace is passed here, whereas the trace plot for the
# same trace slices off the first 2000 draws - confirm this is intended.
pm.forestplot(poisson_trace, varnames=['healthy_proportions'])

What happens if we don't have the raw counts data, but only the proportions? Can we use Bayesian estimation to help us find the population proportions for healthy, and its associated uncertainty?

Firstly, we assume that `p1` and `p2` each come from a Uniform(0, 1) prior, and we derive `p3 = 1 - p1 - p2` so that the three proportions sum to 1 (note that `p3` itself is not forced to stay within [0, 1]). The likelihood function will be the Student's T distribution.

In [None]:
# Pool both groups (healthy rows first, then sick rows) and convert every
# patient's raw read counts into proportions that sum to one per row.
df = pd.concat([healthy_df, sick_df])
df_norm = df.apply(proportion, axis=1)

# Preview the normalised proportions that were just computed (previously the
# raw counts in `df` were displayed, which did not show the result).
df_norm.head(5)

In [None]:
# Student-T model fitted directly to per-patient proportions (no raw counts).
with pm.Model() as tdist_model:
    # Free proportions of the first two species, each with a flat prior on [0, 1].
    p1 = pm.Uniform('p1', 0, 1)
    p2 = pm.Uniform('p2', 0, 1)
    # The third proportion is derived so that the three always sum to one.
    # NOTE(review): p1 + p2 may exceed 1 under these priors, so p3 can be negative.
    p3 = pm.Deterministic('p3', 1 - p1 - p2)
    # Disabled alternative: one vector of three independent Uniform priors.
    # ps = pm.Uniform('ps', shape=(3,))
    # Per-species scale of the likelihood.
    std = pm.Exponential('std', lam=0.5, shape=(3,))
    # Per-species degrees of freedom.
    # NOTE(review): a Poisson(1) prior puts mass on nu = 0, while Student-T
    # degrees of freedom are normally required to be > 0 - confirm this prior.
    nu = pm.Poisson('nu', mu=1, shape=(3,))
    
    # Only the first n_data_points rows of df_norm are observed; df is built as
    # concat([healthy_df, sick_df]), so these are the healthy patients.
    like = pm.StudentT('like', mu=[p1, p2, p3], sd=std, nu=nu, observed=df_norm[0:n_data_points])

In [None]:
# Sample from the Student-T model.  The context must be `tdist_model`; the
# name `model` is never defined in this notebook, so the original
# `with model:` fails with a NameError on a fresh kernel.
with tdist_model:
    tdist_trace = pm.sample(draws=2000)

In [None]:
# Trace plot of tdist_trace with the first 200 draws sliced off
# (presumably treated as burn-in).
pm.traceplot(tdist_trace[200:])

In [None]:
# Posterior mean of each proportion.  tdist_model defines the variables
# 'p1', 'p2' and 'p3' (there is no 'proportions' variable in that model, so
# indexing the trace with that key raises a KeyError); collect the three means
# into one length-3 array instead.
np.array([tdist_trace[name].mean(axis=0) for name in ('p1', 'p2', 'p3')])

In [None]:
# Empirical reference: mean read count per species in the healthy group,
# rescaled so that the three values sum to one.
proportion(healthy_df.mean())

Here's a final formulation, using the Dirichlet/Multinomial distribution.

In [None]:
with pm.Model() as dirichlet_model:
    # Total number of reads of every healthy patient, computed once and reused
    # both as the observed sequencing depth and as each multinomial's `n`.
    reads_per_sample = healthy_reads.sum(axis=1)

    # Sequencing depth: Poisson counts with a wide half-normal prior on the rate.
    mu = pm.HalfNormal('mu', sd=100**2)
    n_seq_reads = pm.Poisson('n_seq_reads', mu=mu, observed=reads_per_sample)

    # Species proportions shared by all patients, with a flat Dirichlet prior.
    proportions = pm.Dirichlet('proportions', a=np.ones(3), shape=(3,))

    # One observed multinomial node per patient, all tied to `proportions`.
    for i in range(len(healthy_reads)):
        draws = pm.Multinomial(f'draws_{i}', n=reads_per_sample[i], p=proportions, observed=healthy_reads[i])

    dirichlet_trace = pm.sample(draws=2000)
    pm.traceplot(dirichlet_trace)

In [None]:
# Tabulate summary statistics for the variables recorded in dirichlet_trace.
pm.summary(dirichlet_trace)