In [1]:
import numpy as np
import scipy as sp
import scipy.stats as st
import torch
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import matplotlib as mp
import matplotlib.pyplot as plt
# force inline plots
%matplotlib inline
plt.style.use('seaborn-deep')
import torch.nn as nn
import copy
import pandas as pd


In particle physics, the most important experiment is a counting experiment, represented by a Poisson probability model, where $N$ is the observed count and $\theta$ is the Poisson mean (in general a function of the parameter of interest and of nuisance parameters), hence the probability model is $P(N|\theta) = \text{Poisson}(N,\theta)$.
The most important parameter is the cross section $\sigma$, which is related to the mean event count by

$$\mu = \sigma \mathcal{L} + b$$


Suppose we have the measured background count $B$ and the background mean $b$; then $P(B|b) = L(b) =\frac{e^{-b}b^B}{B!}$. The posterior, using Bayes' rule, is $P(b|B) = \frac{P(B|b)P(b)}{P(B)}$, and if we assume a flat prior for $b$, the posterior is a Gamma distribution in $b$ (sometimes called an inverse Poisson): $P(b|B) = \frac{e^{-b}b^B}{B!} = \text{Gamma}(b;\, B+1,\, 1)$. The same argument holds if we include the signal and other parameters.

(more fully it is $\mu = \varepsilon \sigma \mathcal{L} +b$ where $\varepsilon = \prod_i \varepsilon_i$ is the product of all the efficiencies for the signal). Here the interesting parameter is only the cross section, whereas $\mathcal{L}, \ b$ are nuisance parameters. In a Bayesian context, one could eliminate the nuisance parameters by marginalization, i.e. by integrating the probability with respect to the nuisance parameters.

Therefore our probability model is 

$$P(\text{observed}|\text{parameters}) = P(N, \mathcal{L}_{\text{measured}}| \sigma, \mathcal{L}, b)$$

or  $P(N, \mathcal{L}_{\text{measured}}, B| \sigma, \mathcal{L}, b)$ if the background is also measured, where $N$ and $\mathcal{L}_\text{measured}$ are measured quantities and $\sigma, \mathcal{L}, b$ are parameters, of which $\mathcal{L},b$ are nuisance parameters.

$$ P(N, \mathcal{L}_{\text{measured}} | \sigma, \mathcal{L}, b) = L(\sigma, \mathcal{L}, b) = \frac{e^{-(\sigma \mathcal{L} +b)}(\sigma \mathcal{L} +b)^N}{N !} \ \text{Gamma}(\mathcal{L}_{\text{measured}}) $$



We want to construct a test such that the probability of making a type I error (rejecting the null hypothesis when it is true) is bounded, i.e. it cannot be larger than $\alpha$. This is done by defining a confidence region $R(D)$, where $D$ is the observed data, composed of all the values of $\theta$ that are not rejected by the test $\delta$:

$$ R(D) = \{ \theta_0 \in \Theta \mid \text{test } \delta_{\theta_0} \text{ does not reject the null hypothesis} \}$$

We want to collect all these $\theta_0$ values in $R(D)$.

# Starting Simple: Using Algorithm 2 to calculate the $p$-value of a Poisson distribution.

We start with the very simple likelihood 
$$L(\theta) = \frac{e^{-\theta} \theta^N}{N !}$$

In [2]:
# --- Settings for the simple one-parameter Poisson example ---
Bprime = 1_000_000  # number of (theta, Z) pairs to simulate
D = 9               # observed count; NOTE: this does not work for large values of D
L_obs = 30          # observed luminosity
# b = mean background (not defined yet)

print('The size of B: ', Bprime)
print('The observed signal signal N (or bold X in the paper): ', D)
print('The observed luminosity: ', L_obs)
# TODO: also print the observed background once it is defined

The size of B:  1000000
The observed signal signal N (or bold X in the paper):  9
The observed luminosity:  30


Note that $D$ is only a constant and appears only in the calculation of the test statistic $\lambda (D, \theta_0)$.


### Test statistic $\lambda (X, \theta_0)$

The test statistic could be the likelihood ratio, which is the ratio of the likelihood to the profiled likelihood (the likelihood evaluated at the MLE of one of the parameters), or $t = -2\log(\text{this ratio})$.

We have several options for this statistic, such as the likelihood ratio, etc. as is shown by Ann Lee's paper. In our case we'd probably like to use the likelihood ratio. For one parameter, $\hat{\theta} = N$ so $\lambda = -2 \log \frac{L(\theta)}{L(N)}$

In [3]:
def L(X, theta):
    """Poisson likelihood of one parameter.

    Evaluates the Poisson probability mass function at the count ``X``
    for a Poisson mean ``theta`` and returns that value.
    """
    likelihood = st.poisson.pmf(X, theta)
    return likelihood
    
def labd_one_param(X, theta):
    """Likelihood-ratio test statistic for the one-parameter Poisson model.

    Computes ``lambda(X, theta) = -2 * log(L(X, theta) / L(X, theta_hat))``
    where ``theta_hat = X`` is the maximum-likelihood estimate of the Poisson
    mean for the count ``X``.

    Parameters
    ----------
    X : int or array_like
        Count(s) at which the statistic is evaluated.
    theta : float or array_like
        Hypothesised Poisson mean(s).

    Returns
    -------
    float or numpy.ndarray
        The statistic; it is zero when ``theta`` equals ``X``.
    """
    # The denominator is the likelihood at the MLE of *this* count
    # (theta_hat = X), not at the notebook-level observed count D; otherwise
    # the statistic of a simulated count would be normalised by the wrong value.
    # Working with log-pmfs avoids the 0/0 that the ratio of raw pmfs gives
    # when both underflow for large counts.
    log_num = st.poisson.logpmf(X, mu=theta)
    log_den = st.poisson.logpmf(X, mu=X)
    return -2 * (log_num - log_den)

For our simple example we draw a single $X \sim F_{\theta}$.

In [4]:
#T=[[theta_i],[Z_i]]
def generate_training_data(Bprime, D):
    """Simulate ``Bprime`` training pairs ``(theta_i, Z_i)`` using the test statistic.

    For each sample a Poisson mean ``theta`` is drawn from an exponential
    prior, a count ``N`` is drawn from ``Poisson(theta)``, and the indicator
    ``Z_i`` is set to 1 when the test statistic of the simulated count is
    smaller than the test statistic of the observed count ``D`` (both
    evaluated at the same ``theta``), and to 0 otherwise.

    Parameters
    ----------
    Bprime : int
        Number of samples to simulate.
    D : int
        Observed count.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        The sampled ``theta`` values and the matching 0/1 indicators ``Z``.
        Both arrays are empty when ``Bprime`` is 0.
    """
    thetas = []
    indicators = []
    for _ in range(Bprime):
        # theta has to be positive because it is the mean of a Poisson.
        # This prior should also be close to the count D.
        theta = st.expon.rvs()

        # Simulated count for this theta (the "X" of the paper).
        N = np.random.poisson(lam=theta)

        lam_observed = labd_one_param(X=D, theta=theta)
        lam_i = labd_one_param(X=N, theta=theta)

        indicators.append(1 if lam_i < lam_observed else 0)
        thetas.append(theta)

    # Return only after the whole sample has been generated.
    return np.array(thetas), np.array(indicators)

For a one-parameter problem the statistic is the count itself.

In [6]:
def generate_training_data_one_parameter(Bprime, D, save_data=False):
    """Simulate ``Bprime`` training pairs ``(theta_i, Z_i)`` using the count itself.

    For each sample a Poisson mean ``theta`` is drawn from an exponential
    prior and a count ``N`` is drawn from ``Poisson(theta)``; the indicator
    ``Z_i`` is 1 when ``D < N`` and 0 otherwise.

    Parameters
    ----------
    Bprime : int
        Number of samples to simulate.
    D : int
        Observed count that each simulated count is compared with.
    save_data : bool, optional
        When true, the samples are also written to
        ``data/Training_data_1_param_1M.csv`` (columns ``theta`` and ``Z``).
        The ``data`` directory is created if it does not exist.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        The sampled ``theta`` values and the matching 0/1 indicators ``Z``.
    """
    thetas = []
    indicators = []
    for _ in range(Bprime):
        # theta has to be positive because it is the mean of a Poisson.
        # This prior should also be close to the count D.
        theta = st.expon.rvs()

        # Simulated count for this theta (the "X" of the paper).
        N = np.random.poisson(lam=theta)

        indicators.append(1 if D < N else 0)
        thetas.append(theta)

    if save_data:
        import os  # local import: only needed when writing to disk

        out_path = 'data/Training_data_1_param_1M.csv'
        # to_csv does not create missing parent directories.
        os.makedirs(os.path.dirname(out_path), exist_ok=True)
        training_frame = pd.DataFrame.from_dict({'theta': thetas, 'Z': indicators})
        training_frame.to_csv(out_path)

    return np.array(thetas), np.array(indicators)

$\widehat{\mathbb{E}}[Z \mid \theta] = \frac{N_{Z=1}}{N_{Z, \ total}}$

In [None]:
# Generate the one-parameter training set from the values set in the
# configuration cell (Bprime, D) instead of repeating the literals here, so the
# two cannot drift apart. save_data=True also writes the sample to a CSV file.
theta, Z = generate_training_data_one_parameter(Bprime=Bprime, D=D, save_data=True)
# Number of samples with Z == 1 (Z only contains 0s and 1s).
np.sum(Z)