### IMPORTS

In [1]:
import pymc as pm
import numpy as np
import pandas as pd
import arviz as az



$\textbf{QUESTION 1}$

In [2]:
babies = pd.read_csv('babies.csv')
cord_clamped = babies['x']
not_clamped = babies['y']

In [3]:
print(cord_clamped.describe())

count    16.000000
mean      9.643750
std       1.714631
min       8.000000
25%       8.350000
50%       9.150000
75%      10.300000
max      13.800000
Name: x, dtype: float64


In [4]:
print(not_clamped.describe())

count    16.00000
mean     12.09375
std       2.23591
min       8.20000
25%      11.00000
50%      12.05000
75%      13.52500
max      16.20000
Name: y, dtype: float64


In [5]:

with pm.Model() as model:
    
    # Define noninformative Gamma priors
    shape_prior = 0.001
    rate_prior = 0.001
    alpha1 = pm.Gamma(name="alpha1", alpha=shape_prior, beta=rate_prior)
    beta1 = pm.Gamma(name="beta1", alpha=shape_prior, beta=rate_prior)
    alpha2 = pm.Gamma(name="alpha2", alpha=shape_prior, beta=rate_prior)
    beta2 = pm.Gamma(name="beta2", alpha=shape_prior, beta=rate_prior)
    
    # Priors for the means
    # mean1 = pm.Gamma(name='mean1', alpha=shape_prior, beta=rate_prior)
    # mean2 = pm.Gamma(name='mean2', alpha=shape_prior, beta=rate_prior)
    mean1, mean2 = alpha1 / beta1, alpha2 / beta2 

    # Likelihoods for the data
    likelihood1 = pm.Gamma(name='likelihood1', alpha=alpha1, beta=beta1, observed=cord_clamped)
    likelihood2 = pm.Gamma(name='likelihood2', alpha=alpha2, beta=beta2, observed=not_clamped)

    # Difference in means
    diff = mean1 - mean2
    diff_means = pm.Deterministic(name='diff_means', var=diff)

    # Sampling
    trace = pm.sample(draws=1000, tune=1000, target_accept=0.90, cores=None)

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [alpha1, beta1, alpha2, beta2]


Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 141 seconds.


In [6]:
# View trace diff_means
diff_means_trace = az.summary(data=trace, var_names=['diff_means'], hdi_prob=.90)
print(f"Trace Summary for difference in means:\n{diff_means_trace}\n")

# Print trace summary
print(f"Trace Summary:\n{az.summary(data=trace, hdi_prob=.90)}\n")

# Check Credible Set
print(f"The 90% Credible Set for the difference of means:\n{diff_means_trace}")

Trace Summary for difference in means:
             mean     sd  hdi_5%  hdi_95%  mcse_mean  mcse_sd  ess_bulk  \
diff_means -2.461  0.754  -3.734   -1.296      0.012    0.008    4198.0   

            ess_tail  r_hat  
diff_means    2948.0    1.0  

Trace Summary:
              mean      sd  hdi_5%  hdi_95%  mcse_mean  mcse_sd  ess_bulk  \
alpha1      35.430  12.541  15.326   54.754      0.340    0.241    1303.0   
beta1        3.674   1.307   1.567    5.689      0.036    0.025    1297.0   
alpha2      27.894  10.154  12.114   43.668      0.308    0.221    1090.0   
beta2        2.306   0.845   1.002    3.613      0.026    0.018    1089.0   
diff_means  -2.461   0.754  -3.734   -1.296      0.012    0.008    4198.0   

            ess_tail  r_hat  
alpha1        1174.0    1.0  
beta1         1206.0    1.0  
alpha2        1168.0    1.0  
beta2         1148.0    1.0  
diff_means    2948.0    1.0  

The 90% Credible Set for the difference of means:
             mean     sd  hdi_5%  hdi_95

$$
\text{The 90\% credible set doesn't contain 0.} \Rightarrow \text{The difference is statistically significant.}
$$

$\textbf{QUESTION 2}$

In [7]:
intraocular_pressure = pd.read_excel('iop2.xlsx', header=None, names=['indicator', 'cornea_thickness'])
low_iop = intraocular_pressure['indicator']
corn_thickness = intraocular_pressure['cornea_thickness']
corn_mean = corn_thickness.mean()
corn_std = corn_thickness.std()

In [8]:
print(low_iop.describe())

count    140.000000
mean       0.242857
std        0.430349
min        0.000000
25%        0.000000
50%        0.000000
75%        0.000000
max        1.000000
Name: indicator, dtype: float64


In [9]:
print(corn_thickness.describe())

count    140.000000
mean     484.657143
std       39.032319
min      386.000000
25%      456.750000
50%      482.500000
75%      513.000000
max      590.000000
Name: cornea_thickness, dtype: float64


In [10]:
# Borrow some code from Aaron's GitHub
def standardize(x, mu, sig):
        return (x - mu) / (2 * sig)

$$
\textbf{PART A}
$$

In [11]:
# Borrow some code from Aaron's GitHub
with pm.Model() as mod_logistic:

    # Define x, y
    corn_standard = standardize(x=corn_thickness,
                                mu=corn_mean,
                                sig=corn_std)
    corn_data = pm.Data(name="corn_data", 
                        value=corn_standard, 
                        mutable=True)
    iop_data = pm.Data(name="iop_data", 
                       value=low_iop, 
                       mutable=False)

    # Define alpha, beta for logistic regression
    alpha = pm.Normal(name="alpha", 
                      mu=0, 
                      sigma=2)
    betas = pm.Normal(name="beta", 
                      mu=0, 
                      sigma=1)

    logist = alpha + pm.math.dot(l=corn_data, 
                                 r=betas)
    p = pm.math.invlogit(logist)

    pm.Bernoulli(name="low_iop", 
                 p=p, 
                 observed=iop_data)

    trace_log = pm.sample(draws=1000, 
                          tune=1000, 
                          cores=None, 
                          chains=4)

# Summarize the trace
print(az.summary(data=trace_log,
                 hdi_prob=.95))

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [alpha, beta]


Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 184 seconds.


        mean     sd  hdi_2.5%  hdi_97.5%  mcse_mean  mcse_sd  ess_bulk  \
alpha -1.312  0.221    -1.747     -0.877      0.004    0.003    3548.0   
beta  -1.752  0.428    -2.602     -0.936      0.007    0.005    3314.0   

       ess_tail  r_hat  
alpha    2901.0    1.0  
beta     2817.0    1.0  


In [None]:
#  Get Predictions
# preds_log = pm.sample_posterior_predictive(trace=trace_log,
#                                              predictions=True)
# print(preds_log.predictions)

$$
\textbf{PART B}
$$

In [12]:
# Borrow some code from Aaron's GitHub
with pm.Model() as mod_logistic_490:

    # Define x, y
    corn_data = standardize(x=490,
                            mu=corn_mean,
                            sig=corn_std)
    
    iop_data = pm.Data(name="iop_data", 
                       value=low_iop, 
                       mutable=False)

    # Define alpha, beta for logistic regression
    alpha = pm.Normal(name="alpha", 
                      mu=0, 
                      sigma=2)
    betas = pm.Normal(name="beta", 
                      mu=0, 
                      sigma=1)

    logist = alpha + pm.math.dot(l=corn_data, 
                                 r=betas)
    p = pm.math.invlogit(logist)

    pm.Bernoulli(name="low_iop",
                p=p,
                observed=iop_data)

    trace_log490 = pm.sample(draws=1000,
                             tune=1000,
                             cores=None,
                             chains=4)

# Summarize the trace
print(az.summary(data=trace_log490,
                 hdi_prob=.95))

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [alpha, beta]


Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 10 seconds.


        mean     sd  hdi_2.5%  hdi_97.5%  mcse_mean  mcse_sd  ess_bulk  \
alpha -1.127  0.209    -1.552     -0.745      0.004    0.003    3321.0   
beta  -0.024  0.993    -1.831      1.970      0.018    0.015    3154.0   

       ess_tail  r_hat  
alpha    2516.0    1.0  
beta     2813.0    1.0  


In [13]:
# Get Predictions
# preds_log490 = pm.sample_posterior_predictive(trace=trace_log490,
#                                              predictions=True)
# print(preds_log490.predictions)

$$
\textbf{PART C}
$$

In [15]:
# Borrow some code from Aaron's GitHub
with pm.Model() as mod_probit:

    # Define x, y
    corn_standard = standardize(x=corn_thickness,
                                mu=corn_mean,
                                sig=corn_std)
    corn_data = pm.Data(name="corn_data",
                        value=corn_standard,
                        mutable=True)
    iop_data = pm.Data(name="iop_data",
                        value=low_iop,
                        mutable=False)

    # Define alpha, beta for logistic regression
    alpha = pm.Normal(name="alpha",
                      mu=0,
                      sigma=2)
    betas = pm.Normal(name="beta",
                      mu=0,
                      sigma=1)

    logist = alpha + pm.math.dot(l=corn_data,
                                 r=betas)
    p = pm.math.invprobit(logist)

    pm.Bernoulli(name="low_iop",
                 p=p,
                 observed=iop_data)

    trace_prob = pm.sample(draws=1000,
                           tune=1000,
                           cores=None,
                           chains=4,
                           idata_kwargs=dict(log_likelihood=True))

# Summarize the trace
print(az.summary(data=trace_prob,
                 hdi_prob=.95))

# View Deviances
print(az.waic(data=trace_prob,
              scale="deviance"))

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [alpha, beta]


  return x / y
  return np.log1p(x)
  return x / y
  return np.log1p(x)
  return x / y
  return np.log1p(x)
  return np.log(x)
  return x / y
  variables = ufunc(*ufunc_args, **ufunc_kwargs)
  out = self.ufunc.reduce(input, axis=axis, dtype=acc_dtype)
  return x / y
  return np.log1p(x)
  return x / y
Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 212 seconds.


        mean     sd  hdi_2.5%  hdi_97.5%  mcse_mean  mcse_sd  ess_bulk  \
alpha -0.822  0.131    -1.069     -0.556      0.002    0.002    3408.0   
beta  -1.211  0.281    -1.752     -0.673      0.005    0.004    3357.0   

       ess_tail  r_hat  
alpha    2711.0    1.0  
beta     3013.0    1.0  
Computed from 4000 posterior samples and 140 observations log-likelihood matrix.

              Estimate       SE
deviance_waic   135.68    11.99
p_waic            1.61        -


In [18]:
# Get Predictions
# preds_probs = pm.sample_posterior_predictive(trace=trace_prob,
#                                              predictions=True)
# print(preds_probs.predictions)

In [19]:
# Borrow some code from Aaron's GitHub
with pm.Model() as mod_probit_490:

    # Define x, y
    corn_data = standardize(x=490,
                            mu=corn_mean,
                            sig=corn_std)
    iop_data = pm.Data(name="iop_data",
                       value=low_iop,
                       mutable=False)

    # Define alpha, beta for logistic regression
    alpha = pm.Normal(name="alpha",
                      mu=0,
                      sigma=2)
    betas = pm.Normal(name="beta",
                      mu=0,
                      sigma=1)

    logist = alpha + pm.math.dot(l=corn_data,
                                 r=betas)
    p = pm.math.invprobit(logist)
    
    pm.Bernoulli(name="low_iop",
                 logit_p=p,
                 observed=iop_data)

    trace_prob490 = pm.sample(draws=1000,
                              tune=1000,
                              cores=None,
                              chains=4,
                              idata_kwargs=dict(log_likelihood=True))

# Summarize the trace
print(az.summary(data=trace_prob490,
                 hdi_prob=.95))

# View Deviances
print(az.waic(data=trace_prob490,
              scale="deviance"))

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [alpha, beta]


Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 9 seconds.


        mean     sd  hdi_2.5%  hdi_97.5%  mcse_mean  mcse_sd  ess_bulk  \
alpha -3.023  0.965    -4.805     -1.390      0.028    0.022    1708.0   
beta  -0.071  0.977    -2.032      1.785      0.018    0.018    3039.0   

       ess_tail  r_hat  
alpha    1444.0    1.0  
beta     2462.0    1.0  
Computed from 4000 posterior samples and 140 observations log-likelihood matrix.

              Estimate       SE
deviance_waic   194.71     0.09
p_waic            0.01        -


In [22]:
# Get Predictions
# preds_probs490 = pm.sample_posterior_predictive(trace=trace_prob490,
#                                                 predictions=True)
# print(preds_probs490.predictions)

$\textbf{QUESTION 3}$

In [23]:
micronuclei = pd.read_csv('micronuclei.csv')
rad_dose = micronuclei['x']
freq = micronuclei['y']

In [27]:
print(rad_dose.describe())

count    6000.000000
mean        1.750000
std         1.406946
min         0.000000
25%         0.500000
50%         1.500000
75%         3.000000
max         4.000000
Name: x, dtype: float64


In [26]:
print(freq.describe())

count    6000.000000
mean        0.303000
std         0.634501
min         0.000000
25%         0.000000
50%         0.000000
75%         0.000000
max         6.000000
Name: y, dtype: float64


$$
\textbf{PART A}
$$

In [None]:
# with pm.Model() as mod_poisson:
    
#     # Priors for the regression coefficients
#     beta = pm.Normal('beta', mu=0, sigma=2)

#     # Expected value of the outcome (lambda parameter) using the log link function
#     mu = pm.math.exp(pm.math.dot(rad_dose, beta))

#     # Likelihood (sampling distribution) of observations
#     likelihood = pm.Poisson('likelihood', mu=mu, observed=freq)

#     # Use the No-U-Turn Sampler
#     trace = pm.sample(draws=1000, tune=1000, cores=None, chains=2)

# # Summarize the trace
# print(pm.summary(trace))

$$
\textbf{PART B}
$$