### IMPORTS

In [1]:
import pymc as pm
import numpy as np
import pandas as pd
import arviz as az



$\textbf{QUESTION 1}$

In [2]:
babies = pd.read_csv('babies.csv')
cord_clamped = babies['x']
not_clamped = babies['y']

In [3]:
print(cord_clamped.describe())

count    16.000000
mean      9.643750
std       1.714631
min       8.000000
25%       8.350000
50%       9.150000
75%      10.300000
max      13.800000
Name: x, dtype: float64


In [4]:
print(not_clamped.describe())

count    16.00000
mean     12.09375
std       2.23591
min       8.20000
25%      11.00000
50%      12.05000
75%      13.52500
max      16.20000
Name: y, dtype: float64


In [5]:

with pm.Model() as model:
    
    # Define noninformative Gamma priors
    shape_prior = 0.001
    rate_prior = 0.001
    alpha1 = pm.Gamma(name="alpha1", alpha=shape_prior, beta=rate_prior)
    beta1 = pm.Gamma(name="beta1", alpha=shape_prior, beta=rate_prior)
    alpha2 = pm.Gamma(name="alpha2", alpha=shape_prior, beta=rate_prior)
    beta2 = pm.Gamma(name="beta2", alpha=shape_prior, beta=rate_prior)
    
    # Priors for the means
    # mean1 = pm.Gamma(name='mean1', alpha=shape_prior, beta=rate_prior)
    # mean2 = pm.Gamma(name='mean2', alpha=shape_prior, beta=rate_prior)
    mean1, mean2 = alpha1 / beta1, alpha2 / beta2 

    # Likelihoods for the data
    likelihood1 = pm.Gamma(name='likelihood1', alpha=alpha1, beta=beta1, observed=cord_clamped)
    likelihood2 = pm.Gamma(name='likelihood2', alpha=alpha2, beta=beta2, observed=not_clamped)

    # Difference in means
    diff = mean1 - mean2
    diff_means = pm.Deterministic(name='diff_means', var=diff)

    # Sampling
    trace = pm.sample(draws=1000, tune=1000, target_accept=0.90, cores=None)

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [alpha1, beta1, alpha2, beta2]


Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 131 seconds.


In [6]:
# View trace diff_means
diff_means_trace = az.summary(data=trace, var_names=['diff_means'], hdi_prob=.90)

# Print trace summary
print(f"Trace Summary:\n{az.summary(data=trace, hdi_prob=.90)}\n")

# Check Credible Set
print(f"The 90% Credible Set for the difference of means:\n{diff_means_trace}")

Trace Summary:
              mean      sd  hdi_5%  hdi_95%  mcse_mean  mcse_sd  ess_bulk  \
alpha1      35.108  12.483  16.389   55.413      0.336    0.237    1292.0   
beta1        3.643   1.304   1.609    5.696      0.035    0.025    1307.0   
alpha2      28.405  10.434  13.545   45.920      0.286    0.203    1316.0   
beta2        2.349   0.869   1.070    3.775      0.024    0.017    1313.0   
diff_means  -2.466   0.732  -3.631   -1.251      0.011    0.008    4255.0   

            ess_tail  r_hat  
alpha1        1217.0    1.0  
beta1         1166.0    1.0  
alpha2        1585.0    1.0  
beta2         1520.0    1.0  
diff_means    2997.0    1.0  

The 90% Credible Set for the difference of means:
             mean     sd  hdi_5%  hdi_95%  mcse_mean  mcse_sd  ess_bulk  \
diff_means -2.466  0.732  -3.631   -1.251      0.011    0.008    4255.0   

            ess_tail  r_hat  
diff_means    2997.0    1.0  


$$
\text{The 90\% credible set doesn't contain 0.} \Rightarrow \text{The difference is statistically significant.}
$$

$\textbf{QUESTION 2}$

In [7]:
intraocular_pressure = pd.read_excel('iop2.xlsx', header=None, names=['indicator', 'cornea_thickness'])
low_iop = intraocular_pressure['indicator']
corn_thickness = intraocular_pressure['cornea_thickness']
corn_mean = corn_thickness.mean()
corn_std = corn_thickness.std()

In [8]:
print(low_iop.describe())

count    140.000000
mean       0.242857
std        0.430349
min        0.000000
25%        0.000000
50%        0.000000
75%        0.000000
max        1.000000
Name: indicator, dtype: float64


In [9]:
print(corn_thickness.describe())

count    140.000000
mean     484.657143
std       39.032319
min      386.000000
25%      456.750000
50%      482.500000
75%      513.000000
max      590.000000
Name: cornea_thickness, dtype: float64


In [10]:
# Borrow some code from Aaron's GitHub
def standardize(x, mu, sig):
        return (x - mu) / (2 * sig)

$$
\textbf{PART A}\\~\\
\text{Model With Logit}
$$

In [11]:
# Borrow some code from Aaron's GitHub
with pm.Model() as mod_logistic:

    # Define x, y
    corn_standard_log = standardize(x=corn_thickness,
                                mu=corn_mean,
                                sig=corn_std)
    corn_log = pm.Data(name="corn_data", 
                        value=corn_standard_log, 
                        mutable=True)
    iop_log = pm.Data(name="iop_data", 
                       value=low_iop, 
                       mutable=False)

    # Define alpha, beta for logistic regression
    alpha = pm.Normal(name="alpha", 
                      mu=0, 
                      sigma=2)
    betas = pm.Normal(name="beta", 
                      mu=0, 
                      sigma=1)

    logist = alpha + pm.math.dot(l=corn_log, 
                                 r=betas)
    p = pm.math.invlogit(logist)

    pm.Bernoulli(name="low_iop", 
                 p=p, 
                 observed=iop_log)

    trace_log = pm.sample(draws=1000, 
                          tune=1000, 
                          cores=None, 
                          chains=4,
                          idata_kwargs=dict(log_likelihood=True))

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [alpha, beta]


  return x / y
Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 190 seconds.


$$
\textbf{PART B}\\~\\
\text{Model With Logit and X = 490}
$$

In [12]:
# Borrow some code from Aaron's GitHub
with pm.Model() as mod_logistic490:

    # Define x, y
    corn_log490 = standardize(x=490,
                            mu=corn_mean,
                            sig=corn_std)
    
    iop_log490 = pm.Data(name="iop_data", 
                       value=low_iop, 
                       mutable=False)

    # Define alpha, beta for logistic regression
    alpha = pm.Normal(name="alpha", 
                      mu=0, 
                      sigma=2)
    betas = pm.Normal(name="beta", 
                      mu=0, 
                      sigma=1)

    logist = alpha + pm.math.dot(l=corn_log490, 
                                 r=betas)
    p = pm.math.invlogit(logist)

    pm.Bernoulli(name="low_iop",
                p=p,
                observed=iop_log490)

    trace_log490 = pm.sample(draws=1000,
                             tune=1000,
                             cores=None,
                             chains=4,
                             idata_kwargs=dict(log_likelihood=True))

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [alpha, beta]


  return np.log1p(x)
  variables = ufunc(*ufunc_args, **ufunc_kwargs)
  return x / y
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 11 seconds.


In [13]:
with mod_logistic490:

    # Get Predictions
    preds_log490 = pm.sample_posterior_predictive(trace=trace_log490,
                                                predictions=True)
az.summary(data=preds_log490.predictions).mean()

Sampling: [low_iop]


mean            0.244807
sd              0.429957
hdi_3%          0.000000
hdi_97%         1.000000
mcse_mean       0.007000
mcse_sd         0.005000
ess_bulk     3945.971429
ess_tail     3902.364286
r_hat           1.000000
dtype: float64

$$
\textbf{PART C}\\~\\
\text{Model With Probit}
$$

In [14]:
# Borrow some code from Aaron's GitHub
with pm.Model() as mod_probit:

    # Define x, y
    corn_standard_prob = standardize(x=corn_thickness,
                                mu=corn_mean,
                                sig=corn_std)
    corn_prob = pm.Data(name="corn_data",
                        value=corn_standard_prob,
                        mutable=True)
    iop_prob = pm.Data(name="iop_data",
                        value=low_iop,
                        mutable=False)

    # Define alpha, beta for logistic regression
    alpha = pm.Normal(name="alpha",
                      mu=0,
                      sigma=2)
    betas = pm.Normal(name="beta",
                      mu=0,
                      sigma=1)

    logist = alpha + pm.math.dot(l=corn_prob,
                                 r=betas)
    p = pm.math.invprobit(logist)

    pm.Bernoulli(name="low_iop",
                 p=p,
                 observed=iop_prob)

    trace_prob = pm.sample(draws=1000,
                           tune=1000,
                           cores=None,
                           chains=4,
                           idata_kwargs=dict(log_likelihood=True))

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [alpha, beta]


  return x / y
  return np.log1p(x)
  return x / y
  return x / y
  return np.log1p(x)
  return x / y
Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 232 seconds.


In [25]:
with mod_probit:

    # View Deviances
    print(az.waic(data=trace_prob,
                scale="deviance"))

with mod_logistic:

    # View Deviances
    print(az.waic(data=trace_log,
                scale="deviance"))

Computed from 4000 posterior samples and 140 observations log-likelihood matrix.

              Estimate       SE
deviance_waic   135.71    12.03
p_waic            1.64        -
Computed from 4000 posterior samples and 140 observations log-likelihood matrix.

              Estimate       SE
deviance_waic   136.65    11.48
p_waic            1.49        -


$$
\text{Model With Probit and X = 490}
$$

In [16]:
# Borrow some code from Aaron's GitHub
with pm.Model() as mod_probit490:

    # Define x, y
    corn_probit490 = standardize(x=490,
                            mu=corn_mean,
                            sig=corn_std)
    iop_probit490 = pm.Data(name="iop_data",
                       value=low_iop,
                       mutable=False)

    # Define alpha, beta for logistic regression
    alpha = pm.Normal(name="alpha",
                      mu=0,
                      sigma=2)
    betas = pm.Normal(name="beta",
                      mu=0,
                      sigma=1)

    logist = alpha + pm.math.dot(l=corn_probit490,
                                 r=betas)
    p = pm.math.invprobit(logist)
    
    pm.Bernoulli(name="low_iop",
                 p=p,
                 observed=iop_probit490)

    trace_prob490 = pm.sample(draws=1000,
                              tune=1000,
                              cores=None,
                              chains=4,
                              idata_kwargs=dict(log_likelihood=True))

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [alpha, beta]


  return np.log1p(x)
  return np.log1p(x)
  variables = ufunc(*ufunc_args, **ufunc_kwargs)
  variables = ufunc(*ufunc_args, **ufunc_kwargs)
  return x / y
  return x / y
  return np.log1p(x)
  variables = ufunc(*ufunc_args, **ufunc_kwargs)
  return x / y
Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 13 seconds.


In [26]:
with mod_probit490:

    # View Deviances
    print(az.waic(data=trace_prob490,
                scale="deviance"))

with mod_logistic490:

    # View Deviances
    print(az.waic(data=trace_log490,
                scale="deviance"))

Computed from 4000 posterior samples and 140 observations log-likelihood matrix.

              Estimate       SE
deviance_waic   157.34    11.66
p_waic            1.06        -
Computed from 4000 posterior samples and 140 observations log-likelihood matrix.

              Estimate       SE
deviance_waic   157.17    11.63
p_waic            0.97        -


$\textbf{QUESTION 3}$

In [18]:
micronuclei = pd.read_csv('micronuclei.csv')
rad_dose = micronuclei['x']
freq = micronuclei['y']

In [19]:
print(rad_dose.describe())

count    6000.000000
mean        1.750000
std         1.406946
min         0.000000
25%         0.500000
50%         1.500000
75%         3.000000
max         4.000000
Name: x, dtype: float64


In [20]:
print(freq.describe())

count    6000.000000
mean        0.303000
std         0.634501
min         0.000000
25%         0.000000
50%         0.000000
75%         0.000000
max         6.000000
Name: y, dtype: float64


$$
\textbf{PART A}
$$

In [21]:
# with pm.Model() as mod_poisson:
    
#     # Priors for the regression coefficients
#     beta = pm.Normal('beta', mu=0, sigma=2)

#     # Expected value of the outcome (lambda parameter) using the log link function
#     mu = pm.math.exp(pm.math.dot(rad_dose, beta))

#     # Likelihood (sampling distribution) of observations
#     likelihood = pm.Poisson('likelihood', mu=mu, observed=freq)

#     # Use the No-U-Turn Sampler
#     trace = pm.sample(draws=1000, tune=1000, cores=None, chains=2)

# # Summarize the trace
# print(pm.summary(trace))

$$
\textbf{PART B}
$$