In [1]:
"""
    Linear mixed effects modeling for CPIG items
"""
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import numpy as np

In [2]:
df = pd.read_csv("/home/aml7990/Code/creativity-item-generation/item_evaluation/linear_regression/Summary_CPS_trials.csv")
# df = df[["Item Gen Model", "Item Response Gen Model", "Shot Selection Method", "Response Prompt Type", "Mean Originality - Round 5"]]
df.dropna(inplace=True)
df = pd.melt(df, id_vars=["Item Gen Model", "Item Response Gen Model", "Shot Selection Method", "Response Prompt Type"], var_name="Round", value_name="Mean Originality")
df.rename(columns={
    "Item Gen Model": "IGM",
    "Item Response Gen Model": "IRGM",
    "Shot Selection Method": "SSM",
    "Response Prompt Type": "RPT",
    "Mean Originality": "Originality"
}, inplace=True)
df

Unnamed: 0,IGM,IRGM,SSM,RPT,Round,Originality
0,meta-llama/Llama-3.1-70B-Instruct,meta-llama/Llama-3.1-8B-Instruct,CS,Baseline,Mean Originality - Round 1,1.419
1,meta-llama/Llama-3.1-70B-Instruct,meta-llama/Llama-3.1-8B-Instruct,CS,Baseline,Mean Originality - Round 1,1.414
2,meta-llama/Llama-3.1-70B-Instruct,meta-llama/Llama-3.1-8B-Instruct,CS,Baseline,Mean Originality - Round 1,1.355
3,meta-llama/Llama-3.1-70B-Instruct,meta-llama/Llama-3.1-8B-Instruct,CS,Demographic,Mean Originality - Round 1,1.305
4,meta-llama/Llama-3.1-70B-Instruct,meta-llama/Llama-3.1-8B-Instruct,CS,Demographic,Mean Originality - Round 1,1.398
...,...,...,...,...,...,...
171,lmsys/vicuna-13b-v1.5,meta-llama/Llama-2-7b-chat-hf,CS,Baseline,Mean Originality - Round 5,1.336
172,lmsys/vicuna-13b-v1.5,meta-llama/Llama-2-7b-chat-hf,CS,Psychometric,Mean Originality - Round 5,1.454
173,lmsys/vicuna-13b-v1.5,meta-llama/Llama-2-7b-chat-hf,CS,Psychometric,Mean Originality - Round 5,1.454
174,lmsys/vicuna-13b-v1.5,meta-llama/Llama-2-7b-chat-hf,CS,Psychometric,Mean Originality - Round 5,1.454


In [3]:
df["IGM"].value_counts()

IGM
gpt-4o-mini                              72
claude-3-haiku                           30
meta-llama/Llama-3.1-70B-Instruct        18
mistralai/Mistral-Large-Instruct-2407    18
lmsys/vicuna-13b-v1.5                    16
meta-llama/Llama-2-13b-chat-hf           14
meta-llama/Llama-2-70b-chat-hf            6
lmsys/vicuna-7b-v1.5                      2
Name: count, dtype: int64

In [4]:
df["IRGM"].value_counts()

IRGM
meta-llama/Llama-3.1-8B-Instruct    90
meta-llama/Llama-2-7b-chat-hf       50
meta-llama/Llama-3.2-3B-Instruct    18
lmsys/vicuna-7b-v1.5                18
Name: count, dtype: int64

In [5]:
# The simpliest possible model: simply treat all variables as independent
model1 = smf.ols(formula="Originality ~ C(IGM) + C(IRGM) + C(SSM) + C(RPT) + C(Round)", data=df)
res = model1.fit()
res.summary()

0,1,2,3
Dep. Variable:,Originality,R-squared:,0.797
Model:,OLS,Adj. R-squared:,0.778
Method:,Least Squares,F-statistic:,41.98
Date:,"Wed, 18 Dec 2024",Prob (F-statistic):,1.3400000000000002e-47
Time:,11:47:06,Log-Likelihood:,89.096
No. Observations:,176,AIC:,-146.2
Df Residuals:,160,BIC:,-95.46
Df Model:,15,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.6448,0.099,6.533,0.000,0.450,0.840
C(IGM)[T.gpt-4o-mini],0.2499,0.089,2.794,0.006,0.073,0.426
C(IGM)[T.lmsys/vicuna-13b-v1.5],-0.0147,0.071,-0.208,0.836,-0.155,0.125
C(IGM)[T.lmsys/vicuna-7b-v1.5],0.1853,0.136,1.363,0.175,-0.083,0.454
C(IGM)[T.meta-llama/Llama-2-13b-chat-hf],-0.1051,0.072,-1.457,0.147,-0.248,0.037
C(IGM)[T.meta-llama/Llama-2-70b-chat-hf],0.3748,0.083,4.516,0.000,0.211,0.539
C(IGM)[T.meta-llama/Llama-3.1-70B-Instruct],0.1003,0.051,1.967,0.051,-0.000,0.201
C(IGM)[T.mistralai/Mistral-Large-Instruct-2407],0.1268,0.051,2.486,0.014,0.026,0.227
C(IRGM)[T.meta-llama/Llama-2-7b-chat-hf],0.2091,0.135,1.546,0.124,-0.058,0.476

0,1,2,3
Omnibus:,9.456,Durbin-Watson:,1.358
Prob(Omnibus):,0.009,Jarque-Bera (JB):,18.516
Skew:,-0.144,Prob(JB):,9.54e-05
Kurtosis:,4.563,Cond. No.,30.2


In [6]:
# add interactions between generator and responder
# the nas worry me in these models, I think its better to fit separate models that don't have them
model2 = smf.ols(formula="Originality ~ C(IGM) : C(IRGM) + C(SSM) + C(RPT) + C(Round)", data=df)
res = model2.fit()
res.summary()

0,1,2,3
Dep. Variable:,Originality,R-squared:,0.797
Model:,OLS,Adj. R-squared:,0.778
Method:,Least Squares,F-statistic:,41.98
Date:,"Wed, 18 Dec 2024",Prob (F-statistic):,1.3400000000000002e-47
Time:,11:55:16,Log-Likelihood:,89.096
No. Observations:,176,AIC:,-146.2
Df Residuals:,160,BIC:,-95.46
Df Model:,15,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.7976,0.025,31.453,0.000,0.748,0.848
C(SSM)[T.Greedy],-0.1142,0.089,-1.277,0.203,-0.291,0.062
C(SSM)[T.Random],-0.2042,0.073,-2.779,0.006,-0.349,-0.059
C(RPT)[T.Demographic],-0.0123,0.031,-0.405,0.686,-0.073,0.048
C(RPT)[T.Psychometric],0.1929,0.030,6.391,0.000,0.133,0.252
C(Round)[T.Mean Originality - Round 5],0.3247,0.023,14.079,0.000,0.279,0.370
C(IRGM)[T.meta-llama/Llama-2-7b-chat-hf],0.0564,0.049,1.152,0.251,-0.040,0.153
C(IRGM)[T.meta-llama/Llama-3.1-8B-Instruct],0.4381,0.032,13.513,0.000,0.374,0.502
C(IRGM)[T.meta-llama/Llama-3.2-3B-Instruct],0.2060,0.018,11.255,0.000,0.170,0.242

0,1,2,3
Omnibus:,9.456,Durbin-Watson:,1.358
Prob(Omnibus):,0.009,Jarque-Bera (JB):,18.516
Skew:,-0.144,Prob(JB):,9.54e-05
Kurtosis:,4.563,Cond. No.,2.42e+16


In [7]:
# interaction between generator and shot selection method
model3 = smf.ols(formula="Originality ~ C(IGM) : C(SSM)  + C(IRGM)  + C(RPT) + C(Round)", data=df)
res = model3.fit()
res.summary()

0,1,2,3
Dep. Variable:,Originality,R-squared:,0.8
Model:,OLS,Adj. R-squared:,0.78
Method:,Least Squares,F-statistic:,39.67
Date:,"Wed, 18 Dec 2024",Prob (F-statistic):,3.67e-47
Time:,11:55:44,Log-Likelihood:,90.089
No. Observations:,176,AIC:,-146.2
Df Residuals:,159,BIC:,-92.28
Df Model:,16,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.7543,0.027,27.961,0.000,0.701,0.808
C(IRGM)[T.meta-llama/Llama-2-7b-chat-hf],0.1325,0.051,2.601,0.010,0.032,0.233
C(IRGM)[T.meta-llama/Llama-3.1-8B-Instruct],0.4814,0.033,14.662,0.000,0.417,0.546
C(IRGM)[T.meta-llama/Llama-3.2-3B-Instruct],0.3149,0.051,6.191,0.000,0.214,0.415
C(RPT)[T.Demographic],-0.0123,0.030,-0.406,0.686,-0.072,0.048
C(RPT)[T.Psychometric],0.1929,0.030,6.407,0.000,0.133,0.252
C(Round)[T.Mean Originality - Round 5],0.3247,0.023,14.114,0.000,0.279,0.370
C(SSM)[T.Greedy],0.0579,0.046,1.261,0.209,-0.033,0.149
C(SSM)[T.Random],-0.2698,0.088,-3.063,0.003,-0.444,-0.096

0,1,2,3
Omnibus:,8.809,Durbin-Watson:,1.365
Prob(Omnibus):,0.012,Jarque-Bera (JB):,16.159
Skew:,-0.152,Prob(JB):,0.00031
Kurtosis:,4.453,Cond. No.,1.44e+16


In [8]:
# interaction between prompt type and round
model4 = smf.ols(formula="Originality ~ C(IGM) : C(RPT) + C(IGM) + C(RPT) + C(Round) + C(IRGM) + C(SSM)", data=df)
res = model4.fit()
res.summary()

0,1,2,3
Dep. Variable:,Originality,R-squared:,0.821
Model:,OLS,Adj. R-squared:,0.791
Method:,Least Squares,F-statistic:,27.54
Date:,"Wed, 18 Dec 2024",Prob (F-statistic):,6.3e-44
Time:,11:57:26,Log-Likelihood:,100.04
No. Observations:,176,AIC:,-148.1
Df Residuals:,150,BIC:,-65.64
Df Model:,25,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.6548,0.106,6.150,0.000,0.444,0.865
C(IGM)[T.gpt-4o-mini],0.2373,0.103,2.305,0.023,0.034,0.441
C(IGM)[T.lmsys/vicuna-13b-v1.5],0.0500,0.119,0.421,0.675,-0.185,0.285
C(IGM)[T.lmsys/vicuna-7b-v1.5],0.1157,0.077,1.496,0.137,-0.037,0.269
C(IGM)[T.meta-llama/Llama-2-13b-chat-hf],-0.1122,0.147,-0.765,0.445,-0.402,0.177
C(IGM)[T.meta-llama/Llama-2-70b-chat-hf],0.2104,0.057,3.692,0.000,0.098,0.323
C(IGM)[T.meta-llama/Llama-3.1-70B-Instruct],0.1087,0.086,1.268,0.207,-0.061,0.278
C(IGM)[T.mistralai/Mistral-Large-Instruct-2407],0.0475,0.086,0.554,0.580,-0.122,0.217
C(RPT)[T.Demographic],-0.0503,0.086,-0.587,0.558,-0.220,0.119

0,1,2,3
Omnibus:,2.001,Durbin-Watson:,1.352
Prob(Omnibus):,0.368,Jarque-Bera (JB):,1.793
Skew:,-0.011,Prob(JB):,0.408
Kurtosis:,3.494,Cond. No.,1.43e+16


In [12]:
# likelihood ratio test TODO: confirm this is accurate and the test is appropriate (assumes models are nested I belive)
from scipy.stats import chi2
np.random.seed(0)

full_ll = 100.04 # model4
reduced_ll = 89.096 # model1
full_params = model4.df_resid
reduced_params = model1.df_resid

# Calculate the likelihood ratio
lr = -2 * (reduced_ll - full_ll)
lrdf = (reduced_params - full_params)
p_value = 1 - chi2.sf(lr, df=lrdf)

print("Likelihood Ratio:", lr)
print("P-value:", p_value)

Likelihood Ratio: 21.888000000000005
P-value: 0.984314552076855
