In [None]:
%config InlineBackend.figure_format = "retina"
import numpy as np
import matplotlib.pyplot as plt

from scipy.integrate import quad
from scipy.special import comb
from scipy.optimize import minimize_scalar
from scipy import stats


plt.rcParams["font.size"] = 14

## Example: The Mexican peso

In [None]:
# Observed data for the coin example: number of heads and number of tosses.
N_HEADS = 3
N_TRIALS = 3

The problem at hand is a binomial distribution, for which the density is given by:

$$ p(y \vert N, \theta) = {N \choose y} \theta^y (1 - \theta)^{(N - y)}$$

* $y$: Number of successes
* $N$: Number of trials
* $\theta$: Success probability in a single trial

In [None]:
def binomial_likelihood(n_success, n_trials, p):
    """Binomial probability of `n_success` successes in `n_trials` trials.

    Evaluates ``C(n_trials, n_success) * p**n_success * (1 - p)**(n_trials - n_success)``
    where `p` is the success probability of a single trial. Only arithmetic
    operators are applied to `p`, so it may be a scalar or a NumPy array (the
    result then has the same shape).
    """
    n_failures = n_trials - n_success
    coefficient = comb(n_trials, n_success)  # combinatorial coefficient
    success_term = p ** n_success
    failure_term = (1 - p) ** n_failures
    return coefficient * success_term * failure_term

In [None]:
# Evenly spaced grid of 500 theta values on [0, 1], used to evaluate and plot the curves below.
theta_space = np.linspace(0, 1, num=500) 

In [None]:
# Likelihood of the observed tosses evaluated at every theta on the grid.
likelihood = binomial_likelihood(N_HEADS, N_TRIALS, theta_space)

In [None]:
# Likelihood as a function of theta, drawn with the explicit Axes interface.
fig, ax = plt.subplots()
ax.plot(theta_space, likelihood)
ax.set_xlabel("Theta")
ax.set_ylabel("Likelihood")
plt.show()

The maximum is clearly at $\theta = 1$, but let's just build some muscle memory and use scipy again anyway:

In [None]:
# To find the maximum-likelihood estimate (MLE) we *minimize* the negative likelihood.
# The "bounded" method restricts the search to theta in [0, 1].
minimize_scalar(
    lambda x: -binomial_likelihood(N_HEADS, N_TRIALS, x),
    bounds=(0, 1),
    method="bounded"
)

According to the likelihood, the coin is not fair since we would conclude that $\theta$ is equal to one, so the coin _always_ lands heads.

Let's now make use of our prior knowledge and Bayes theorem. First, we will need a probability distribution over $\theta$ that somehow captures what we know from experience about theta.

### Choosing a prior

* $\theta$ is continuous and bounded to the interval $[0, 1]$. The only distribution we know that satisfies this is the _beta_ distribution.
* Since we know that most coins are fairly symmetric (hence fair), we are going to choose a beta distribution with mean at 0.5
* But how concentrated around 0.5 do we want it to be? This is where our priors will probably differ. I have seen and played with many Mexican coins, so I have a pretty good knowledge of how fair they are. On the other hand, chances are that you haven't played with many Mexican pesos, so perhaps you want to remain open to wild possibilities.

Go to: https://distribution-explorer.github.io/continuous/beta.html, and play around with the parameters $\alpha$ and $\beta$ until you find a distribution that captures what you think is true about $\theta$. Hint: For it to have mean 0.5, you will need both parameters to be equal.

In my case something like a beta distribution with parameters (20, 20) seems reasonable, because I really doubt that the bias of the coin can be above 70% (or below 30%). Let's check what this beta distribution actually says.

In [None]:
# Rather than writing my own density function, I use the Beta distribution that comes with scipy.
# stats.beta(20, 20) fixes both shape parameters to 20; its pdf/ppf methods are used below.
prior = stats.beta(20, 20)

In [None]:
# Quick look at the prior density over the theta grid. Labels and plt.show()
# make the figure self-explanatory and keep the Line2D repr out of the output,
# matching the other plotting cells.
plt.plot(theta_space, prior.pdf(theta_space))
plt.xlabel("Theta")
plt.ylabel("Prior density")
plt.title("Prior distribution")
plt.show()

In [None]:
# The percent point function (ppf, the inverse of the CDF) is a readily available method that
# lets me find quantiles quickly, without having to do integrals myself.
# I want to know where 99% of the probability concentrates, so I look for the (0.005, 0.995) interval
low, high = prior.ppf([0.005, 0.995])
print(low, high)

So, with my prior, I'm saying there's a 1% chance that the bias is outside the interval [0.30, 0.70]. I'm happy with this assumption. If you're not, change it! Tweak the prior to whatever is reasonable to you. A great advantage of a Bayesian approach is that you have to quantify your knowledge explicitly. Every assumption you make can be directly discussed and challenged by others -- this is not a bug, it's a feature!

In [None]:
# The two shaded tails together hold only 1% of the prior probability
fig, ax = plt.subplots()
ax.plot(theta_space, prior.pdf(theta_space))
for left, right in ((0, low), (high, 1)):
    ax.axvspan(left, right, color='gray', alpha=0.3)
ax.set_xlabel("Theta")
ax.set_title("Prior distribution")
plt.show()

According to Bayes' theorem, the posterior is

$$ P(\theta \vert y) = \frac{P(y\vert \theta) P(\theta)}{P(y)} $$


* We have the **likelihood**, $P(Data \vert Parameters)$: this is `binomial_likelihood` with `n_trials = 3` and `n_success = 3`.
* We have the **prior**, $P(Parameters)$: in my case I chose a Beta(20, 20) distribution.
* What is $P(y)$? (Also known as the evidence)

We don't have it directly but we can compute it making use of the rules of probability:

$$P(y) =  \int P(y, \theta) d\theta = \int P(y\vert \theta) P(\theta) d \theta$$

In [None]:
# Let's calculate the integral

In [None]:
def integrand(theta):
    """Unnormalized posterior at `theta`: binomial likelihood times prior density.

    This is the function integrated over theta to obtain the evidence P(y).
    It reads the module-level `N_HEADS`, `N_TRIALS` and `prior`.
    """
    likelihood_value = binomial_likelihood(N_HEADS, N_TRIALS, theta)
    # Use the module-level `prior` rather than a second hard-coded Beta(20, 20):
    # if the prior is tweaked, the evidence follows automatically, and the
    # distribution object is not rebuilt on every evaluation made by quad.
    prior_density = prior.pdf(theta)
    return likelihood_value * prior_density

In [None]:
# theta can take values between 0 and 1, so those should be our integration limits.
# quad returns a pair: the value of the integral and an estimate of its absolute error.
evidence, numerical_error = quad(integrand, 0, 1)

In [None]:
print(evidence)

In [None]:
# The numerical error in the result is small (compared to the result), so we proceed happily :)
print(numerical_error)

# Finally let's define the posterior

In [None]:
def posterior(theta):
    """Posterior density of theta given the observed coin tosses.

    Computed with Bayes' theorem as likelihood * prior / evidence. It reads the
    module-level `N_HEADS`, `N_TRIALS`, `prior` and `evidence`. `theta` may be a
    scalar or a NumPy array.
    """
    likelihood_value = binomial_likelihood(N_HEADS, N_TRIALS, theta)
    prior_density = prior.pdf(theta)
    # Normalize with the `evidence` value already computed by quad instead of a
    # pasted literal: nothing is re-integrated on each call, and the result stays
    # correct if the data or the prior are changed and the cells are re-run.
    return likelihood_value * prior_density / evidence

In [None]:
# Likelihood, prior and posterior on the same theta grid.
plt.plot(theta_space, binomial_likelihood(N_HEADS, N_TRIALS, theta_space), label='Likelihood')
# Draw the prior from the same `prior` object defined earlier, so the curve
# cannot drift from the chosen prior if its parameters are changed.
plt.plot(theta_space, prior.pdf(theta_space), label='Prior')
plt.plot(theta_space, posterior(theta_space), label='Posterior')
plt.legend(bbox_to_anchor=[1, 0.5], loc='center left')
plt.xlabel("Theta")
plt.show()

The value of theta that maximises the posterior probability is very different from the one that maximised the likelihood. In this example, the prior is clearly the dominating term. This is not always the case: as we gather more data, the likelihood function often outweighs the prior.

In [None]:
# Maximum a posteriori (MAP) estimate: minimize the negative posterior over theta in [0, 1].
minimize_scalar(
    lambda theta: -posterior(theta),
    bounds=(0, 1),
    method="bounded"
)

In [None]:
# Side question: Did we really need to compute the evidence term? What happens to the shape of the function if we exclude it?

### Posterior Statistics

Now that we are happy with our model we want to compute some statistics which describe $\theta$'s distribution.

#### A. Expectation value

**Exercise:** Compute the expected value of $\theta$ under the posterior. Note that this is defined by

$$\mathbb{E}\left[\theta \vert y\right] = \int_0^{1} \theta\ p(\theta \vert y) d\theta $$

(Tip: you can do the integration by using ```quad```.)

In [None]:
# Posterior mean of theta. theta is a probability, so the posterior only has
# support on [0, 1]; integrate over exactly that range (the same limits used
# for the evidence) instead of an infinite interval.
mean = quad(
    lambda theta: theta * posterior(theta),
    0,
    1,
)[0]

print(mean)

#### B. Event Probabilities

The probability of $\theta \in [\theta_1, \theta_2]$ under the posterior can be computed by

$$ \int_{\theta_1}^{\theta_2} p(\theta \vert y) \mathrm{d} \theta.$$

##### So ... is the Mexican peso a fair coin?

What do we mean by fair? In practice we know that there's no such thing as a fair coin. Every coin will have some minor imperfections that make it not symmetric. We usually do not care about those imperfections since we have some "error tolerance". 

More technically: Being a continuous variable, there is 0% probability that $\theta$ is exactly 0.5. It doesn't matter if it is the maximum of the posterior or not! What we should care about is the probability _mass_ around a neighbourhood of 0.5.

$$\text{Probability of fair} = \int_{0.49}^{0.51} p(\theta \vert Data) d\theta $$


In [None]:
# Half-width of the interval around 0.5 within which we call the coin "fair" (0.49 to 0.51).
tolerance = 0.01

In [None]:
# Calculate the probability of the coin being "fair" within the tolerance
# complete the code
result, numerical_error = ...
print(result)

#### C. Quantiles

From the posterior distributions we can also compute quantiles.

For a given probability $P \in [0,1]$ the associated posterior quantile is defined by

$$ Q(P) = \sup \left\{ x \in [0, 1] : \int_0^{x} p(\theta \vert y)\, d\theta \le P \right\}$$


Let's use the formula above to compute the 5-th and 95-th percentile for our $\theta$ parameter.

In [None]:
def quantile_function(P):
    """Exercise stub: posterior quantile of theta for a probability `P` in [0, 1].

    Intended to return the value x up to which the posterior integrates to P,
    following the definition given in the text. Not implemented yet: the body
    is a placeholder and the function currently returns Ellipsis.
    """
    # Complete code
    ...
    return ...

In [None]:
print(quantile_function(0.05))
print(quantile_function(0.95))

# Try it yourself: Incubation period revisited

Now we try to answer: What is the _expected_ incubation period  of the virus?

In [None]:
# Observed incubation periods (presumably in days, matching the axis labels used below -- confirm).
ys = [9.3, 3.0, 0.7, 11.3, 3.3, 2.3, 14.9, 0.2, 29.5, 16.2]

In [None]:
# Grid of candidate beta values for plotting. It starts at 0.1 rather than 0
# because exponential_likelihood divides by beta.
beta_space = np.linspace(0.1, 50, num=500)

We already constructed the likelihood in the last notebook:

In [None]:
def exponential_likelihood(y, beta):
    """Likelihood of the observations `y` under an exponential model with scale `beta`.

    Computes ``exp(-sum(y) / beta) / beta**n`` where ``n`` is the number of
    observations, i.e. the product over observations of
    ``(1 / beta) * exp(-y_i / beta)``.

    Parameters
    ----------
    y : scalar or array-like
        Observation(s); a scalar is treated as a single observation.
    beta : scalar or array-like
        Scale parameter(s), expected to be positive. Array-like input is
        broadcast, giving one likelihood value per beta.

    Returns
    -------
    NumPy float for scalar `beta`, otherwise an ndarray shaped like `beta`.
    """
    # Coerce once (the original called np.atleast_1d twice) so that plain
    # Python lists work for both arguments.
    observations = np.atleast_1d(np.asarray(y, dtype=float))
    beta = np.asarray(beta, dtype=float)
    n_obs = observations.size
    total = observations.sum()  # ndarray sum instead of the builtin sum()
    numerator = np.exp(-(1 / beta) * total)
    denominator = beta ** n_obs
    return numerator / denominator

Let's turn this into a Bayesian inference problem by first defining a prior distribution over the incubation period.
Let's assume that the virus causes a _skin_ infection. If you don't have any knowledge of skin infections, here's some info scraped from a medical website:

#### Typical incubation periods of skin infections
| disease | incubation (days) |
| --- | --- |
| chickenpox | 10 to 21 |
| Fifth disease | 4 to 14 |
| Hand, foot and mouth disease | 3 to 6 |
| Impetigo | 2 to 5 |
| Lice | 7 |
| Measles | 8 to 12 |
| Roseola | 9 to 10 |
| Rubella | 14 to 21 |
| Scabies | 30 to 45 |
| Scarlet fever | 3 to 6 |
| Shingles | 14 to 16 |

Looking at the table, we see that a typical incubation period can be as low as 2 and as high as 45 days.

But in principle, it could be any positive number (and not necessarily an integer!), so when defining a prior we need to look for a _semi-bounded_ distribution (where the bound is zero), which can take continuous values.

The _gamma_ distribution fits these criteria.
Find the parameters of a _gamma_ distribution such that 99% of its mass is within those values.

In [None]:
# Change the value of the b parameter until the 99% interval is close to [2, 45]

a = 3.5     # Leave fixed (shape parameter passed to stats.gamma)
b = ...     # Choose a value (it enters below as scale = 1/b)

gamma_prior = stats.gamma(a, loc=0, scale=1/b)
gamma_prior.ppf([0.005, 0.995]) # find the 99% interval, is it close to [2, 45] ?

In [None]:
# Visualize your chosen prior over the beta grid
fig, ax = plt.subplots()
ax.plot(beta_space, gamma_prior.pdf(beta_space))
ax.set_title("Prior distribution")
ax.set_xlabel("Incubation period (days)")
ax.set_ylabel("Prior density")
plt.show()

# Compute the Evidence:

$$ p(y) = \int_{0}^{\infty} p(y \vert \beta)\, p(\beta)\, d\beta $$

In [None]:
# Calculate the integrand
def integrand(beta, ys):
    """Exercise stub: integrand of the evidence integral (likelihood times prior).

    `beta` is the integration variable and `ys` the observations passed on to
    `exponential_likelihood`. The prior and the return value are placeholders
    (`...`) still to be filled in.

    NOTE(review): this reuses the name `integrand` from the coin example
    earlier in the notebook and replaces that function once this cell runs.
    """
    # Complete the code
    likelihood = exponential_likelihood(ys, beta)
    a = 3.5
    b = ...
    prior = ...
    return ...

In [None]:
# Do the integral; quad returns (value, estimated absolute error), so the second value is the numerical error.
# If the numerical error is of the same order of magnitude as the result,
# increase the precision of the integration by choosing a smaller value for `epsabs`

quad(integrand, 0, np.inf, args=(ys,), epsabs=1e-15)

In [None]:
# complete the code
def incubation_posterior(beta, ys):
    """Exercise stub: posterior density of beta given the observations `ys`.

    Returns likelihood * prior / evidence; the likelihood, the prior and the
    evidence are placeholders (`...`) still to be filled in.
    """
    # Complete the code
    likelihood = ...
    a = 3.5
    b = ...
    prior = ...
    evidence = ... 
    return likelihood * prior / evidence

In [None]:
# Evaluate the posterior on the beta grid and plot it.
plt.title("Posterior distribution")
plt.plot(beta_space, incubation_posterior(beta_space, ys))
plt.xlabel("Incubation period (days)")
plt.ylabel("Posterior density")
plt.show()

#### Show prior, likelihood and posterior

In [None]:
# Plot together posterior, prior, and likelihood
# complete the code
plt.plot(beta_space, ..., label='Likelihood')
plt.plot(..., label='Prior')
plt.plot(..., label='Posterior')
plt.legend(bbox_to_anchor=[1, 0.5], loc='center left')
plt.xlabel("Beta")
plt.show()

#### What is the expected value?
By definition, it is
$$\mathbb{E}\left[\beta \vert y\right] = \int_0^{\infty} \beta\ p(\beta \vert y) d\beta $$

In [None]:
# Posterior mean of beta. quad returns (value, estimated absolute error).
quad(
    lambda beta: beta * incubation_posterior(beta, ys),
    0,
    np.inf,
)

#### What is the probability that the incubation period is more than 30 days?

In [None]:
# Give the answer as the expected value:
# Complete the code
quad(
    ...,
    ...,
    ...,
    epsabs=1e-15,
)

#### What is the probability that the incubation period is between 8 and 12 days?

In [None]:
# Write your own code