In [1]:
# Chapter 8: Modeling Explicit and Latent Hierarchy in Data
# Applying Mixed models
# Note: a full implementation of this mixed model is not possible with current Python libraries; R is required.

In [9]:
import pandas as pd
import numpy as np

import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.genmod.bayes_mixed_glm import BinomialBayesMixedGLM



In [13]:
!pip install pymer4


Collecting pymer4
  Downloading pymer4-0.9.2-py3-none-any.whl.metadata (4.1 kB)
Downloading pymer4-0.9.2-py3-none-any.whl (2.3 MB)
   ---------------------------------------- 0.0/2.3 MB ? eta -:--:--
   ------------- -------------------------- 0.8/2.3 MB 5.5 MB/s eta 0:00:01
   ---------------------------------------- 2.3/2.3 MB 6.5 MB/s eta 0:00:00
Installing collected packages: pymer4
Successfully installed pymer4-0.9.2


In [3]:
# Read the speed dating data from the CSV file in the working directory.
df = pd.read_csv("speed_dating.csv")

# Display the first five rows as a quick sanity check of the layout.
df.head(5)

Unnamed: 0,iid,gender,match,samerace,race,goal,dec,attr,intel,prob,agediff
0,1,0,0,0,4.0,2.0,1,6.0,7.0,6.0,6.0
1,1,0,0,0,4.0,2.0,1,7.0,7.0,5.0,1.0
2,1,0,1,1,4.0,2.0,1,5.0,9.0,,1.0
3,1,0,1,0,4.0,2.0,1,7.0,8.0,6.0,2.0
4,1,0,1,0,4.0,2.0,1,5.0,7.0,6.0,3.0


In [4]:
# Overview of the raw frame: column names, dtypes, missing counts, and
# descriptive statistics.

# Each entry pairs a printed heading with the object shown beneath it.
overview_sections = [
    ("==== Column Names ====", df.columns.tolist()),
    ("==== Data Types ====", df.dtypes),
    ("==== Missing Values by Column ====", df.isna().sum()),
]

for heading, content in overview_sections:
    print(heading)
    print(content)
    print("\n")  # blank spacer between sections

# Descriptive statistics; include='all' asks describe() to cover every column,
# not only the numeric ones.
print("==== Descriptive Statistics ====")
print(df.describe(include='all'))


==== Column Names ====
['iid', 'gender', 'match', 'samerace', 'race', 'goal', 'dec', 'attr', 'intel', 'prob', 'agediff']


==== Data Types ====
iid           int64
gender        int64
match         int64
samerace      int64
race        float64
goal        float64
dec           int64
attr        float64
intel       float64
prob        float64
agediff     float64
dtype: object


==== Missing Values by Column ====
iid           0
gender        0
match         0
samerace      0
race         63
goal         79
dec           0
attr        202
intel       296
prob        309
agediff     198
dtype: int64


==== Descriptive Statistics ====
               iid       gender        match     samerace         race  \
count  8378.000000  8378.000000  8378.000000  8378.000000  8315.000000   
mean    283.675937     0.500597     0.164717     0.395799     2.757186   
std     158.583367     0.500029     0.370947     0.489051     1.230905   
min       1.000000     0.000000     0.000000     0.000000     1.0

In [5]:
# Recast the coded columns as pandas categoricals in one pass.
categorical_cols = ['race', 'goal', 'gender']
df = df.astype({name: 'category' for name in categorical_cols})

print("==== Updated Data Types ====")
print(df.dtypes)
print("\n")

# Share of missing entries per column, expressed as a percentage of all rows.
print("==== Missing Value Percentages ====")
missing_percent = df.isna().sum().div(len(df)).mul(100)
print(missing_percent)


==== Updated Data Types ====
iid            int64
gender      category
match          int64
samerace       int64
race        category
goal        category
dec            int64
attr         float64
intel        float64
prob         float64
agediff      float64
dtype: object


==== Missing Value Percentages ====
iid         0.000000
gender      0.000000
match       0.000000
samerace    0.000000
race        0.751969
goal        0.942946
dec         0.000000
attr        2.411077
intel       3.533063
prob        3.688231
agediff     2.363333
dtype: float64


In [6]:
# Build the modelling sample: keep only rows that are complete on every
# variable used by the logistic model (the model itself is fitted separately).

# Outcome and predictors of the initial logistic model
model_cols = ['dec', 'agediff', 'samerace', 'attr', 'intel', 'prob']

# A row is kept only when none of the model columns is missing.
is_complete = df[model_cols].notna().all(axis=1)
df_clean = df.loc[is_complete]

# Report how much data the listwise deletion removed.
n_before = len(df)
n_after = len(df_clean)
print("Original dataset size:", n_before)
print("Cleaned dataset size:", n_after)
print("Rows dropped:", n_before - n_after)


Original dataset size: 8378
Cleaned dataset size: 7789
Rows dropped: 589


In [7]:
# Pooled binomial logistic regression: every row is treated as an independent
# observation, i.e. the grouping of rows by individual (iid) is ignored.
logit_formula = "dec ~ agediff + samerace + attr + intel + prob"

# Specify the GLM with a binomial family, then fit it as a separate step.
logit_spec = smf.glm(
    formula=logit_formula,
    data=df_clean,
    family=sm.families.Binomial(),
)
model_logit = logit_spec.fit()

# Render the fitted-model summary as the cell output.
model_logit.summary()

0,1,2,3
Dep. Variable:,dec,No. Observations:,7789.0
Model:,GLM,Df Residuals:,7783.0
Model Family:,Binomial,Df Model:,5.0
Link Function:,Logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-4041.5
Date:,"Mon, 24 Nov 2025",Deviance:,8082.9
Time:,19:14:12,Pearson chi2:,8040.0
No. Iterations:,5,Pseudo R-squ. (CS):,0.2805
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-5.8129,0.184,-31.534,0.000,-6.174,-5.452
agediff,-0.0105,0.009,-1.165,0.244,-0.028,0.007
samerace,-0.0934,0.056,-1.677,0.094,-0.203,0.016
attr,0.6611,0.019,34.111,0.000,0.623,0.699
intel,-0.0045,0.021,-0.216,0.829,-0.045,0.036
prob,0.2706,0.015,18.575,0.000,0.242,0.299


In [8]:
# Translate the fitted coefficients into odds ratios and percent changes in odds.

# Coefficients of a logit model are on the log-odds scale.
log_odds = model_logit.params

# Exponentiating a log-odds coefficient gives the corresponding odds ratio.
odds_ratios = np.exp(log_odds)

# Percent change in odds implied by each odds ratio.
pct_change = odds_ratios.sub(1).mul(100)

# Place the three aligned series side by side, one column per quantity.
summary_table = pd.concat(
    [log_odds, odds_ratios, pct_change],
    axis=1,
    keys=["log_odds (coef)", "odds_ratio", "%_change_in_odds"],
)

summary_table


Unnamed: 0,log_odds (coef),odds_ratio,%_change_in_odds
Intercept,-5.8129,0.002989,-99.701125
agediff,-0.010518,0.989537,-1.046268
samerace,-0.093422,0.910809,-8.919069
attr,0.661139,1.936997,93.69973
intel,-0.004485,0.995525,-0.447489
prob,0.270553,1.310689,31.068872


In [16]:
# Mixed-effects logistic model: same fixed effects as the pooled model, plus a
# random intercept for each individual (iid).
#
# Note: this is statsmodels' *Bayesian* mixed GLM, fitted here by variational
# Bayes (fit_vb). It is an approximation, so its estimates are not expected to
# match a maximum-likelihood GLMM fitted in R.
# NOTE(review): the posterior SDs reported by a variational fit may be too
# small to use as-is for inference -- confirm against the statsmodels docs
# or an R fit.

# fit_vb() appears to draw its default starting values from NumPy's global
# random state (TODO confirm against the installed statsmodels source); fixing
# the seed keeps the fit reproducible under Restart & Run All.
VB_SEED = 42
np.random.seed(VB_SEED)

mixed_mod = BinomialBayesMixedGLM.from_formula(
    "dec ~ agediff + samerace + attr + intel + prob",
    # One variance component named "iid": indicator columns for each individual,
    # with no extra intercept ("0 +").
    {"iid": "0 + C(iid)"},
    df_clean
)

mixed_res = mixed_mod.fit_vb()
print(mixed_res.summary())


               Binomial Mixed GLM Results
          Type Post. Mean Post. SD   SD  SD (LB) SD (UB)
--------------------------------------------------------
Intercept    M   -12.3923   0.0341                      
agediff      M    -0.0387   0.0073                      
samerace     M     0.1905   0.0529                      
attr         M     1.0548   0.0051                      
intel        M     0.2893   0.0045                      
prob         M     0.6034   0.0059                      
iid          V     0.7996   0.0304 2.225   2.094   2.364
Parameter types are mean structure (M) and variance
structure (V)
Variance parameters are modeled as log standard
deviations
