# Ordered Logistic Regression of Happiness in the GSS with PyMC


If you are not familiar with PyMC, you can [start with this chapter from *Think Bayes*](https://allendowney.github.io/ThinkBayes2/chap19.html), especially the World Cup Problem. Or you can [run that chapter on Colab](https://colab.research.google.com/github/AllenDowney/ThinkBayes2/blob/master/notebooks/chap19_v3.ipynb).

The slides I used to present this example are coming soon.

[Click here to run this notebook on Colab](https://colab.research.google.com/github/AllenDowney/SurveyDataPyMC/blob/main/notebooks/01_tutorial.ipynb)

In [91]:
# Get utils.py

from os.path import basename, exists

def download(url):
    """Fetch `url` into the current directory unless the file is already there.

    The local name is the last path component of `url`. When a file with
    that name exists nothing is fetched and nothing is printed; otherwise
    the file is retrieved and its local path is reported on stdout.
    Returns None in both cases.
    """
    filename = basename(url)
    if exists(filename):
        return

    # Imported lazily: only needed when something actually has to be fetched
    from urllib.request import urlretrieve
    saved_path, _headers = urlretrieve(url, filename)
    print('Downloaded ' + saved_path)
        
# utils.py supplies the `decorate` and `value_counts` helpers imported further down
download('https://github.com/AllenDowney/SurveyDataPyMC/raw/main/notebooks/utils.py')

In [92]:
try:
    import empiricaldist
except ImportError:
    !pip install empiricaldist

In [93]:
import arviz as az
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pymc as pm

from utils import decorate, value_counts

In [94]:
# Shrink the default figures to save screen space and left-align axes titles
plt.rcParams.update({
    'figure.dpi': 75,
    'figure.figsize': [6, 3.5],
    'axes.titlelocation': 'left',
})

In [95]:
# Get the GSS data

# This dataset is prepared in GssExtract/notebooks/02_make_extract-2022_3a.ipynb

# Base URL of the extract and its file name; `download` saves the file under
# the URL's basename, so `filename` is also the name of the local copy.
DATA_PATH = "https://github.com/AllenDowney/GssExtract/raw/main/data/interim/"
filename = "gss_extract_2022_3a.hdf"
download(DATA_PATH + filename)

In [96]:
# Load the table stored under the key "gss" in the downloaded HDF file;
# the trailing expression displays (rows, columns)
gss = pd.read_hdf(filename, "gss")
gss.shape

(72390, 57)

In [97]:
# https://gssdataexplorer.norc.org/variables/452/vshow

# Wording of the survey question analysed below as gss['happy']
question = """Taken all together, how would you say things are these days--
would you say that you are very happy, pretty happy, or not too happy?
"""

# Response codes:
# 1 = very happy
# 2 = pretty happy
# 3 = not too happy

# Labels and a y-axis range, presumably for figures; not used in the cells shown here
title = "Are you happy?"
subtitle = "Percent saying very happy"
ylim = [10, 45]

In [98]:
# Tabulate the responses; the displayed table also has a row for missing values
value_counts(gss['happy'])

Unnamed: 0_level_0,counts
values,Unnamed: 1_level_1
1.0,21550
2.0,37446
3.0,8681
,4713


In [99]:
# Build one 0/1 indicator column per response code (y1, y2, y3),
# leaving NaN wherever the original response is missing
missing = gss['happy'].isna()
for code in (1, 2, 3):
    gss[f'y{code}'] = gss['happy'].eq(code).astype(float).mask(missing)
value_counts(gss['y1'])

Unnamed: 0_level_0,counts
values,Unnamed: 1_level_1
0.0,46127
1.0,21550
,4713


In [100]:
# shift so the codes are 0, 1, 2
# (missing responses stay NaN, which is why the values are floats;
# value_counts leaves NaN out of the tally by default)
data = gss['happy'].values - 1
pd.Series(data).value_counts().sort_index()

0.0    21550
1.0    37446
2.0     8681
Name: count, dtype: int64

In [101]:
# Summary of birth years; note the max of 9999 in the output, which looks like
# a missing-data code rather than a real cohort — TODO confirm
gss['cohort'].describe()

count    71987.000000
mean      1991.908095
std        561.717010
min       1883.000000
25%       1938.000000
50%       1954.000000
75%       1968.000000
max       9999.000000
Name: cohort, dtype: float64

In [102]:
# Birth-year boundaries of the generations: each entry is the first year of a
# bin, and the last entry closes the final bin.
# With right=False each bin is [start, end): e.g. 'Silent' is 1928 <= cohort < 1946.
bins = [1928, 1946, 1965, 1981, 1997, 2013, 2025] 
labels = ['Silent', 'Boomer', 'GenX', 'Millennial', 'GenZ', 'Alpha']

# Assign each cohort to a generation
# Cohorts outside [1928, 2025), as well as missing cohorts, get NaN
gss['generation'] = pd.cut(gss['cohort'], bins=bins, labels=labels, right=False)

In [109]:
# Keep only respondents with a generation, a sex and a happiness response.
# For faster experiments, subsample instead:
# gss_clean = gss.dropna(subset=['generation', 'sex', 'happy']).sample(10000)
gss_clean = gss.dropna(subset=['generation', 'sex', 'happy'])
gss_clean.shape

(56641, 61)

In [110]:
# Respondents per generation; 'Alpha' has no rows in the output but remains a category
gss_clean['generation'].value_counts()

generation
Boomer        24087
Silent        13348
GenX          12106
Millennial     6049
GenZ           1051
Alpha             0
Name: count, dtype: int64

In [111]:
# Number of respondents in each survey year, broken down by generation
pd.crosstab(gss_clean['year'], gss_clean['generation'])

generation,Silent,Boomer,GenX,Millennial,GenZ
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1972,484,310,0,0,0
1973,497,327,0,0,0
1974,432,374,0,0,0
1975,470,413,0,0,0
1976,476,431,0,0,0
1977,470,451,0,0,0
1978,444,554,0,0,0
1980,406,569,0,0,0
1982,491,805,0,0,0
1983,414,726,21,0,0


In [112]:
# Convert generation to a categorical variable
# Codes are positions in `labels` (0 = 'Silent', ..., 5 = 'Alpha');
# the models use them to index the per-generation coefficients
generation = pd.Categorical(gss_clean['generation'], categories=labels)
generation_codes = generation.codes

In [113]:
# Count respondents per survey year, ordered by year;
# the sorted index gives the distinct survey years present in gss_clean
years = gss_clean['year'].value_counts().sort_index()
year_labels = years.index.values

In [114]:
# Convert year to a categorical variable
# year_codes[i] is the position of row i's survey year within year_labels
year = pd.Categorical(gss_clean['year'], categories=year_labels)
year_codes = year.codes

In [116]:
# Independent variable (sex)
# Raw values of the `sex` column, used as a numeric predictor in Model 1
# NOTE(review): the coding (e.g. 1/2) is not shown here — confirm before interpreting beta_sex
sex = gss_clean['sex'].values

In [117]:
# Dependent variable (happiness levels)
# Shifted from 1..3 to 0..2: 0 = very happy, 1 = pretty happy, 2 = not too happy
y = gss_clean['happy'].values - 1
pd.Series(y).value_counts()

1.0    31900
0.0    17384
2.0     7357
Name: count, dtype: int64

## Model 1

In [118]:
# Build the model
# Ordered logistic regression of happiness (0, 1, 2) on sex and generation
with pm.Model() as ordered_logistic_model1:
    
    # Priors for the coefficients
    intercept = pm.Normal('intercept', mu=0, sigma=2)
    beta_sex = pm.Normal('beta_sex', mu=0, sigma=2)
    
    # Priors for the generation coefficients (one per generation)
    # The length follows `labels`, so 'Alpha' gets a coefficient despite having no rows
    beta_gen = pm.Normal('beta_gen', mu=0, sigma=2, shape=len(labels))
    
    # Priors for the cutpoints (thresholds) between categories
    # Two cutpoints separate the three response levels; the ordered transform
    # keeps them increasing
    cutpoints = pm.Normal('cutpoints', mu=np.array([-1, 1]), sigma=2, shape=2,
                          transform=pm.distributions.transforms.ordered)
    
    # Linear combination of coefficients and independent variables
    # generation_codes selects each respondent's generation coefficient
    eta = intercept + beta_sex * sex + beta_gen[generation_codes]
    
    # Likelihood: ordered logistic regression
    # compute_p=False: do not store the per-observation category probabilities
    y_obs = pm.OrderedLogistic('y_obs', eta=eta, cutpoints=cutpoints, 
                               compute_p=False, observed=y)

In [119]:
with ordered_logistic_model1:
    # Fixed seed so that Restart & Run All reproduces the same posterior draws
    idata = pm.sample(random_seed=17)

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [intercept, beta_sex, beta_gen, cutpoints]


Output()

ValueError: Not enough samples to build a trace.

In [52]:
# Posterior summary per parameter: mean, sd, HDI, effective sample sizes and r_hat
pm.summary(idata)

Unnamed: 0,mean,sd,hdi_3%,hdi_97%,mcse_mean,mcse_sd,ess_bulk,ess_tail,r_hat
intercept,-0.135,1.269,-2.368,2.341,0.038,0.03,1110.0,1226.0,1.0
beta_sex,-0.05,0.053,-0.147,0.05,0.001,0.001,1971.0,1400.0,1.0
beta_gen[0],-0.523,0.877,-2.188,1.066,0.032,0.024,744.0,796.0,1.01
beta_gen[1],-0.377,0.877,-1.973,1.273,0.032,0.025,749.0,837.0,1.0
beta_gen[2],-0.249,0.876,-1.797,1.451,0.032,0.025,751.0,858.0,1.0
beta_gen[3],-0.125,0.878,-1.746,1.488,0.032,0.025,749.0,871.0,1.01
beta_gen[4],0.845,0.895,-0.854,2.462,0.032,0.023,786.0,888.0,1.01
beta_gen[5],0.075,1.954,-3.703,3.607,0.053,0.046,1368.0,1087.0,1.0
cutpoints[0],-1.334,1.177,-3.627,0.82,0.033,0.025,1250.0,1166.0,1.0
cutpoints[1],1.42,1.177,-0.832,3.633,0.033,0.025,1250.0,1209.0,1.0


In [80]:
# Write the inference data to a NetCDF file (arviz is imported in the top import cell)
az.to_netcdf(idata, "ordered_logistic_model1.nc")

'ordered_logistic_model_idata.nc'

In [81]:
!ls -lh ordered_logistic_model1.nc

-rw-rw-r-- 1 downey downey 3.3M Aug 29 20:32 ordered_logistic_model_idata.nc


## Model 2

In [120]:
# Build the model
# Ordered logistic regression of happiness (0, 1, 2) on generation and survey year
with pm.Model() as ordered_logistic_model2:
    
    # Priors for the coefficients
    intercept = pm.Normal('intercept', mu=0, sigma=2)
    
    # Priors for the generation coefficients (one per generation)
    beta_gen = pm.Normal('beta_gen', mu=0, sigma=2, shape=len(labels))
    
    # Priors for the year coefficients (one per survey year)
    beta_year = pm.Normal('beta_year', mu=0, sigma=2, shape=len(year_labels))
    
    # Priors for the cutpoints (thresholds) between categories
    # Two cutpoints separate the three response levels; the ordered transform
    # keeps them increasing
    cutpoints = pm.Normal('cutpoints', mu=np.array([-1, 1]), sigma=2, shape=2,
                          transform=pm.distributions.transforms.ordered)
    
    # Linear combination of coefficients and independent variables
    # year_codes and generation_codes select each respondent's coefficients
    eta = intercept + beta_year[year_codes] + beta_gen[generation_codes]
    
    # Likelihood: ordered logistic regression
    # compute_p=False: do not store the per-observation category probabilities
    y_obs = pm.OrderedLogistic('y_obs', eta=eta, cutpoints=cutpoints, 
                               compute_p=False, observed=y)

In [121]:
with ordered_logistic_model2:
    # Fixed seed so that Restart & Run All reproduces the same posterior draws
    idata = pm.sample(random_seed=17)

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [intercept, beta_gen, beta_year, cutpoints]


Output()

ValueError: Not enough samples to build a trace.

In [61]:
# Posterior summary per parameter: mean, sd, HDI, effective sample sizes and r_hat
pm.summary(idata)

Unnamed: 0,mean,sd,hdi_3%,hdi_97%,mcse_mean,mcse_sd,ess_bulk,ess_tail,r_hat
intercept,-0.149,1.245,-2.58,2.052,0.051,0.036,607.0,999.0,1.0
beta_gen[0],-0.469,0.868,-1.961,1.315,0.047,0.034,338.0,665.0,1.01
beta_gen[1],-0.33,0.866,-1.882,1.37,0.048,0.034,332.0,663.0,1.01
beta_gen[2],-0.171,0.866,-1.74,1.508,0.047,0.034,336.0,688.0,1.01
beta_gen[3],-0.107,0.87,-1.661,1.584,0.048,0.034,334.0,612.0,1.01
beta_gen[4],0.708,0.884,-1.044,2.293,0.047,0.034,348.0,684.0,1.0
beta_gen[5],0.009,1.984,-4.011,3.601,0.052,0.043,1490.0,1431.0,1.0
beta_year[0],0.344,0.411,-0.357,1.168,0.037,0.026,122.0,497.0,1.03
beta_year[1],-0.221,0.397,-0.993,0.466,0.039,0.027,106.0,350.0,1.03
beta_year[2],-0.176,0.414,-0.947,0.575,0.035,0.025,142.0,393.0,1.03


In [80]:
# Write the inference data to a NetCDF file (arviz is imported in the top import cell)
az.to_netcdf(idata, "ordered_logistic_model2.nc")

'ordered_logistic_model_idata.nc'

In [81]:
!ls -lh ordered_logistic_model2.nc

-rw-rw-r-- 1 downey downey 3.3M Aug 29 20:32 ordered_logistic_model_idata.nc


## Model 3

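This model has a separate parameter for each (year, generation) pair; all of them are drawn from a shared prior whose mean and standard deviation are estimated from the data.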

In [122]:

with pm.Model() as interaction_model:

    # Prior for the global intercept
    intercept = pm.Normal('intercept', mu=0, sigma=2)

    # Hyperpriors: a common mean and spread shared by all year-generation effects
    # NOTE(review): intercept, mu_gen_year and the cutpoints can all absorb the
    # same constant shift in eta, so they are pinned down mostly by their priors;
    # this may contribute to slow mixing — confirm before interpreting them
    # individually.
    mu_gen_year = pm.Normal('mu_gen_year', mu=0, sigma=2)
    sigma_gen_year = pm.HalfNormal('sigma_gen_year', sigma=2)

    # One coefficient per (year, generation) cell, tied together by the
    # hyperpriors above. The second dimension follows `labels`, the list of
    # generation names; the name `generation_labels` used here before is not
    # defined anywhere in the notebook and raised NameError on a fresh kernel.
    beta_gen_year = pm.Normal('beta_gen_year', mu=mu_gen_year, sigma=sigma_gen_year,
                              shape=(len(year_labels), len(labels)))

    # Priors for the cutpoints (thresholds) between categories; the ordered
    # transform keeps them increasing
    cutpoints = pm.Normal('cutpoints', mu=np.array([-1, 1]), sigma=2, shape=2,
                          transform=pm.distributions.transforms.ordered)

    # Linear predictor: pick each respondent's (year, generation) coefficient
    eta = intercept + beta_gen_year[year_codes, generation_codes]

    # Likelihood: ordered logistic regression
    # compute_p=False: do not store the per-observation category probabilities
    y_obs = pm.OrderedLogistic('y_obs', eta=eta, cutpoints=cutpoints,
                               compute_p=False, observed=y)

In [123]:
with interaction_model:
    # Fixed seed so that Restart & Run All reproduces the same posterior draws
    idata = pm.sample(random_seed=17)

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [intercept, mu_gen_year, sigma_gen_year, beta_gen_year, cutpoints]


Output()

  self.vm()
  self.vm()


ValueError: Not enough samples to build a trace.

In [77]:
# Posterior summary per parameter. In the stored output r_hat is far above 1
# for most parameters (e.g. 2.93 for mu_gen_year) and ess_bulk is about 5,
# which indicates the chains did not converge.
pm.summary(idata)

Unnamed: 0,mean,sd,hdi_3%,hdi_97%,mcse_mean,mcse_sd,ess_bulk,ess_tail,r_hat
intercept,-0.265,1.365,-2.866,2.341,0.326,0.235,18.0,165.0,1.17
mu_gen_year,-0.131,0.960,-1.951,1.290,0.459,0.349,5.0,11.0,2.93
"beta_gen_year[0, 0]",-0.025,0.981,-1.874,1.684,0.460,0.349,5.0,11.0,2.61
"beta_gen_year[0, 1]",-0.018,0.987,-1.911,1.644,0.460,0.348,5.0,12.0,2.51
"beta_gen_year[0, 2]",-0.127,0.996,-2.057,1.536,0.461,0.349,5.0,11.0,2.46
...,...,...,...,...,...,...,...,...,...
"beta_gen_year[33, 4]",0.093,0.980,-1.904,1.613,0.458,0.347,5.0,11.0,2.56
"beta_gen_year[33, 5]",-0.132,0.998,-2.036,1.585,0.457,0.346,5.0,11.0,2.31
sigma_gen_year,0.276,0.041,0.202,0.355,0.003,0.002,199.0,381.0,1.01
cutpoints[0],-1.216,1.210,-3.564,0.923,0.048,0.034,631.0,1129.0,1.02


In [80]:
# Write the inference data to a NetCDF file (arviz is imported in the top import cell)
az.to_netcdf(idata, "interaction_model.nc")

'ordered_logistic_model_idata.nc'

In [81]:
!ls -lh interaction_model.nc

-rw-rw-r-- 1 downey downey 3.3M Aug 29 20:32 ordered_logistic_model_idata.nc


In [82]:
# Reload the inference data from the NetCDF file written above
idata = az.from_netcdf("interaction_model.nc")

## Model 4

Hierarchical interaction model

In [127]:
with pm.Model() as hierarchical_model:

    # Prior for the global intercept
    intercept = pm.Normal('intercept', mu=0, sigma=2)

    # Top level: year effects drawn from a common distribution
    mu_year = pm.Normal('mu_year', mu=0, sigma=2)
    sigma_year = pm.HalfNormal('sigma_year', sigma=2)
    beta_year = pm.Normal('beta_year', mu=mu_year, sigma=sigma_year, shape=len(year_labels))

    # Second level: each year has its own mean and spread for the generation effects
    mu_gen = pm.Normal('mu_gen', mu=0, sigma=2, shape=len(year_labels))
    sigma_gen = pm.HalfNormal('sigma_gen', sigma=2, shape=len(year_labels))

    # One coefficient per (year, generation) cell. `[:, None]` turns the per-year
    # hyperparameters into a column so they broadcast across the generation axis.
    # The second dimension follows `labels`, the list of generation names; the
    # name `generation_labels` used here before is not defined anywhere in the
    # notebook and raised NameError on a fresh kernel.
    beta_gen = pm.Normal('beta_gen', mu=mu_gen[:, None], sigma=sigma_gen[:, None],
                         shape=(len(year_labels), len(labels)))

    # Priors for the cutpoints (thresholds) between categories; the ordered
    # transform keeps them increasing
    cutpoints = pm.Normal('cutpoints', mu=np.array([-1, 1]), sigma=2, shape=2,
                          transform=pm.distributions.transforms.ordered)

    # Linear predictor: year effect plus the generation effect within that year
    eta = intercept + beta_year[year_codes] + beta_gen[year_codes, generation_codes]

    # Likelihood: ordered logistic regression
    # compute_p=False: do not store the per-observation category probabilities
    y_obs = pm.OrderedLogistic('y_obs', eta=eta, cutpoints=cutpoints,
                               compute_p=False, observed=y)

In [128]:
with hierarchical_model:
    # Fixed seed so that Restart & Run All reproduces the same posterior draws
    idata = pm.sample(random_seed=17)

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [intercept, mu_year, sigma_year, beta_year, mu_gen, sigma_gen, beta_gen, cutpoints]


Output()

ValueError: Not enough samples to build a trace.

In [None]:
# Posterior summary per parameter: mean, sd, HDI, effective sample sizes and r_hat
pm.summary(idata)

In [80]:
# Write the inference data to a NetCDF file (arviz is imported in the top import cell)
az.to_netcdf(idata, "hierarchical_model.nc")

'ordered_logistic_model_idata.nc'

In [81]:
!ls -lh hierarchical_model.nc

-rw-rw-r-- 1 downey downey 3.3M Aug 29 20:32 ordered_logistic_model_idata.nc


In [82]:
# Reload the inference data from the NetCDF file written above
idata = az.from_netcdf("hierarchical_model.nc")