### IMPORTS

In [1]:
import pymc as pm
import numpy as np
import pandas as pd
import arviz as az



$\textbf{QUESTION 1}$

In [2]:
babies = pd.read_csv('babies.csv')
cord_clamped = babies['x']
not_clamped = babies['y']

In [3]:
print(cord_clamped.describe())

count    16.000000
mean      9.643750
std       1.714631
min       8.000000
25%       8.350000
50%       9.150000
75%      10.300000
max      13.800000
Name: x, dtype: float64


In [4]:
print(not_clamped.describe())

count    16.00000
mean     12.09375
std       2.23591
min       8.20000
25%      11.00000
50%      12.05000
75%      13.52500
max      16.20000
Name: y, dtype: float64


In [5]:

with pm.Model() as model:
    
    # Define noninformative Gamma priors
    shape_prior = 0.001
    rate_prior = 0.001
    alpha1 = pm.Gamma(name="alpha1", alpha=shape_prior, beta=rate_prior)
    beta1 = pm.Gamma(name="beta1", alpha=shape_prior, beta=rate_prior)
    alpha2 = pm.Gamma(name="alpha2", alpha=shape_prior, beta=rate_prior)
    beta2 = pm.Gamma(name="beta2", alpha=shape_prior, beta=rate_prior)
    
    # Priors for the means
    # mean1 = pm.Gamma(name='mean1', alpha=shape_prior, beta=rate_prior)
    # mean2 = pm.Gamma(name='mean2', alpha=shape_prior, beta=rate_prior)
    mean1, mean2 = alpha1 / beta1, alpha2 / beta2 

    # Likelihoods for the data
    likelihood1 = pm.Gamma(name='likelihood1', alpha=alpha1, beta=beta1, observed=cord_clamped)
    likelihood2 = pm.Gamma(name='likelihood2', alpha=alpha2, beta=beta2, observed=not_clamped)

    # Difference in means
    diff = mean1 - mean2
    diff_means = pm.Deterministic(name='diff_means', var=diff)

    # Sampling
    trace = pm.sample(draws=1000, tune=1000, target_accept=0.90, cores=None)

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [alpha1, beta1, alpha2, beta2]


Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 127 seconds.


In [6]:
# View trace diff_means
diff_means_trace = az.summary(data=trace, var_names=['diff_means'], hdi_prob=.90)

# Print trace summary
print(f"Trace Summary:\n{az.summary(data=trace, hdi_prob=.90)}\n")

# Check Credible Set
print(f"The 90% Credible Set for the difference of means:\n{diff_means_trace}")

Trace Summary for difference in means:
             mean     sd  hdi_5%  hdi_95%  mcse_mean  mcse_sd  ess_bulk  \
diff_means -2.464  0.764  -3.709   -1.247      0.011    0.008    4472.0   

            ess_tail  r_hat  
diff_means    3351.0    1.0  

Trace Summary:
              mean      sd  hdi_5%  hdi_95%  mcse_mean  mcse_sd  ess_bulk  \
alpha1      34.983  12.705  13.837   54.055      0.350    0.251    1314.0   
beta1        3.627   1.327   1.433    5.682      0.037    0.026    1302.0   
alpha2      27.856  10.352  11.500   43.233      0.283    0.200    1378.0   
beta2        2.303   0.866   1.043    3.708      0.024    0.017    1320.0   
diff_means  -2.464   0.764  -3.709   -1.247      0.011    0.008    4472.0   

            ess_tail  r_hat  
alpha1        1071.0    1.0  
beta1         1150.0    1.0  
alpha2        1632.0    1.0  
beta2         1560.0    1.0  
diff_means    3351.0    1.0  

The 90% Credible Set for the difference of means:
             mean     sd  hdi_5%  hdi_95

$$
\text{The 90\% credible set doesn't contain 0.} \Rightarrow \text{The difference is statistically significant.}
$$

$\textbf{QUESTION 2}$

In [7]:
intraocular_pressure = pd.read_excel('iop2.xlsx', header=None, names=['indicator', 'cornea_thickness'])
low_iop = intraocular_pressure['indicator']
corn_thickness = intraocular_pressure['cornea_thickness']
corn_mean = corn_thickness.mean()
corn_std = corn_thickness.std()

In [8]:
print(low_iop.describe())

count    140.000000
mean       0.242857
std        0.430349
min        0.000000
25%        0.000000
50%        0.000000
75%        0.000000
max        1.000000
Name: indicator, dtype: float64


In [9]:
print(corn_thickness.describe())

count    140.000000
mean     484.657143
std       39.032319
min      386.000000
25%      456.750000
50%      482.500000
75%      513.000000
max      590.000000
Name: cornea_thickness, dtype: float64


In [10]:
# Borrow some code from Aaron's GitHub
def standardize(x, mu, sig):
        return (x - mu) / (2 * sig)

$$
\textbf{PART A}
$$

In [11]:
# Borrow some code from Aaron's GitHub
with pm.Model() as mod_logistic:

    # Define x, y
    corn_standard = standardize(x=corn_thickness,
                                mu=corn_mean,
                                sig=corn_std)
    corn_data = pm.Data(name="corn_data", 
                        value=corn_standard, 
                        mutable=True)
    iop_data = pm.Data(name="iop_data", 
                       value=low_iop, 
                       mutable=False)

    # Define alpha, beta for logistic regression
    alpha = pm.Normal(name="alpha", 
                      mu=0, 
                      sigma=2)
    betas = pm.Normal(name="beta", 
                      mu=0, 
                      sigma=1)

    logist = alpha + pm.math.dot(l=corn_data, 
                                 r=betas)
    p = pm.math.invlogit(logist)

    pm.Bernoulli(name="low_iop", 
                 p=p, 
                 observed=iop_data)

    trace_log = pm.sample(draws=1000, 
                          tune=1000, 
                          cores=None, 
                          chains=4)

# Summarize the trace
print(az.summary(data=trace_log,
                 hdi_prob=.95))

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [alpha, beta]


  return x / y
Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 191 seconds.


        mean     sd  hdi_2.5%  hdi_97.5%  mcse_mean  mcse_sd  ess_bulk  \
alpha -1.313  0.221    -1.750     -0.894      0.004    0.003    3291.0   
beta  -1.755  0.435    -2.541     -0.868      0.007    0.005    3443.0   

       ess_tail  r_hat  
alpha    2833.0    1.0  
beta     2823.0    1.0  


$$
\textbf{PART B}
$$

In [13]:
# Borrow some code from Aaron's GitHub
with pm.Model() as mod_logistic_490:

    # Define x, y
    corn_data = standardize(x=490,
                            mu=corn_mean,
                            sig=corn_std)
    
    iop_data = pm.Data(name="iop_data", 
                       value=low_iop, 
                       mutable=False)

    # Define alpha, beta for logistic regression
    alpha = pm.Normal(name="alpha", 
                      mu=0, 
                      sigma=2)
    betas = pm.Normal(name="beta", 
                      mu=0, 
                      sigma=1)

    logist = alpha + pm.math.dot(l=corn_data, 
                                 r=betas)
    p = pm.math.invlogit(logist)

    pm.Bernoulli(name="low_iop",
                p=p,
                observed=iop_data)

    trace_log490 = pm.sample(draws=1000,
                             tune=1000,
                             cores=None,
                             chains=4)

# Summarize the trace
print(az.summary(data=trace_log490,
                 hdi_prob=.95))

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [alpha, beta]


Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 10 seconds.


        mean     sd  hdi_2.5%  hdi_97.5%  mcse_mean  mcse_sd  ess_bulk  \
alpha -1.136  0.203    -1.539     -0.756      0.003    0.002    3495.0   
beta  -0.003  0.985    -1.936      1.911      0.016    0.015    3603.0   

       ess_tail  r_hat  
alpha    3166.0    1.0  
beta     2771.0    1.0  


In [14]:
# Get Predictions
# preds_log490 = pm.sample_posterior_predictive(trace=trace_log490,
#                                              predictions=True)
# print(preds_log490.predictions)

$$
\textbf{PART C}
$$

In [15]:
# Borrow some code from Aaron's GitHub
with pm.Model() as mod_probit:

    # Define x, y
    corn_standard = standardize(x=corn_thickness,
                                mu=corn_mean,
                                sig=corn_std)
    corn_data = pm.Data(name="corn_data",
                        value=corn_standard,
                        mutable=True)
    iop_data = pm.Data(name="iop_data",
                        value=low_iop,
                        mutable=False)

    # Define alpha, beta for logistic regression
    alpha = pm.Normal(name="alpha",
                      mu=0,
                      sigma=2)
    betas = pm.Normal(name="beta",
                      mu=0,
                      sigma=1)

    logist = alpha + pm.math.dot(l=corn_data,
                                 r=betas)
    p = pm.math.invprobit(logist)

    pm.Bernoulli(name="low_iop",
                 p=p,
                 observed=iop_data)

    trace_prob = pm.sample(draws=1000,
                           tune=1000,
                           cores=None,
                           chains=4,
                           idata_kwargs=dict(log_likelihood=True))

# Summarize the trace
print(az.summary(data=trace_prob,
                 hdi_prob=.95))

# View Deviances
print(az.waic(data=trace_prob,
              scale="deviance"))

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [alpha, beta]


  return x / y
  return x / y
  return np.log1p(x)
  return np.log1p(x)
  return x / y
  return np.log(x)
  variables = ufunc(*ufunc_args, **ufunc_kwargs)
  out = self.ufunc.reduce(input, axis=axis, dtype=acc_dtype)
  return x / y
  return x / y
  return np.log1p(x)
  out = self.ufunc.reduce(input, axis=axis, dtype=acc_dtype)
  return x / y
  return np.log(x)
  return x / y
  return np.log1p(x)
Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 206 seconds.


        mean     sd  hdi_2.5%  hdi_97.5%  mcse_mean  mcse_sd  ess_bulk  \
alpha -0.824  0.132    -1.096     -0.575      0.002    0.002    3537.0   
beta  -1.220  0.279    -1.757     -0.654      0.005    0.003    3536.0   

       ess_tail  r_hat  
alpha    2832.0    1.0  
beta     2876.0    1.0  
Computed from 4000 posterior samples and 140 observations log-likelihood matrix.

              Estimate       SE
deviance_waic   135.75    12.03
p_waic            1.66        -


In [17]:
# Borrow some code from Aaron's GitHub
with pm.Model() as mod_probit_490:

    # Define x, y
    corn_data = standardize(x=490,
                            mu=corn_mean,
                            sig=corn_std)
    iop_data = pm.Data(name="iop_data",
                       value=low_iop,
                       mutable=False)

    # Define alpha, beta for logistic regression
    alpha = pm.Normal(name="alpha",
                      mu=0,
                      sigma=2)
    betas = pm.Normal(name="beta",
                      mu=0,
                      sigma=1)

    logist = alpha + pm.math.dot(l=corn_data,
                                 r=betas)
    p = pm.math.invprobit(logist)
    
    pm.Bernoulli(name="low_iop",
                 p=p,
                 observed=iop_data)

    trace_prob490 = pm.sample(draws=1000,
                              tune=1000,
                              cores=None,
                              chains=4,
                              idata_kwargs=dict(log_likelihood=True))

# Summarize the trace
print(az.summary(data=trace_prob490,
                 hdi_prob=.95))

# View Deviances
print(az.waic(data=trace_prob490,
              scale="deviance"))

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [alpha, beta]


  return np.log1p(x)
  variables = ufunc(*ufunc_args, **ufunc_kwargs)
  return x / y
  return x / y
  variables = ufunc(*ufunc_args, **ufunc_kwargs)
  return np.log1p(x)
  variables = ufunc(*ufunc_args, **ufunc_kwargs)
  return x / y
Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 12 seconds.


        mean     sd  hdi_2.5%  hdi_97.5%  mcse_mean  mcse_sd  ess_bulk  \
alpha -0.695  0.132    -0.948     -0.434      0.003    0.002    2316.0   
beta  -0.005  1.000    -1.951      1.903      0.020    0.017    2403.0   

       ess_tail  r_hat  
alpha    2394.0    1.0  
beta     2196.0    1.0  
Computed from 4000 posterior samples and 140 observations log-likelihood matrix.

              Estimate       SE
deviance_waic   157.21    11.63
p_waic            0.99        -


$\textbf{QUESTION 3}$

In [19]:
micronuclei = pd.read_csv('micronuclei.csv')
rad_dose = micronuclei['x']
freq = micronuclei['y']

In [20]:
print(rad_dose.describe())

count    6000.000000
mean        1.750000
std         1.406946
min         0.000000
25%         0.500000
50%         1.500000
75%         3.000000
max         4.000000
Name: x, dtype: float64


In [21]:
print(freq.describe())

count    6000.000000
mean        0.303000
std         0.634501
min         0.000000
25%         0.000000
50%         0.000000
75%         0.000000
max         6.000000
Name: y, dtype: float64


$$
\textbf{PART A}
$$

In [22]:
# with pm.Model() as mod_poisson:
    
#     # Priors for the regression coefficients
#     beta = pm.Normal('beta', mu=0, sigma=2)

#     # Expected value of the outcome (lambda parameter) using the log link function
#     mu = pm.math.exp(pm.math.dot(rad_dose, beta))

#     # Likelihood (sampling distribution) of observations
#     likelihood = pm.Poisson('likelihood', mu=mu, observed=freq)

#     # Use the No-U-Turn Sampler
#     trace = pm.sample(draws=1000, tune=1000, cores=None, chains=2)

# # Summarize the trace
# print(pm.summary(trace))

$$
\textbf{PART B}
$$