In [24]:
'''
author: EdgardoCS @FSU Jena
date: 18.06.2025
'''

import itertools
import numpy as np
import pandas as pd
import seaborn as sns
import scipy.stats as stats
import matplotlib.pyplot as plt
import statsmodels.genmod as sm
import statsmodels.formula.api as smf
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm
from statsmodels.stats.multicomp import pairwise_tukeyhsd

In [25]:
input_data = 'output/data_sorted.xlsx'
columns = ['Id', 'Rating', 'Gender', 'Segment', 'Location', 'Type']

data = pd.read_excel(input_data, usecols=columns)

# focus on Female and Male for now
data = (data[data['Gender'].isin(['Female', 'Male'])])

In [29]:
# Define all factor levels
c1 = ['Self', 'Other']
c2 = ['Front', 'Back']
c3 = ['Armpits', 'Chest', 'Feet', 'Hair', 'Hands', 'Legs', 'Mouth', 'Neck', 'Pelvis']
c4 = ['Male', 'Female']

In [31]:
# Unique subjects
ids = data['Id'].unique()

# Build full grid for each subject × all combinations of the 4 factors
all_combos = pd.DataFrame([
    (i, g, t, l, s)
    for i in ids
    for g, t, l, s in itertools.product(c4, c1, c2, c3)
], columns=['Id', 'Gender', 'Type', 'Location', 'Segment'])

# Add indicator of whether that combination was rated
data['Marked'] = (data['Rating'] > 0).astype(int)

# Merge data with all possible combinations
merged = all_combos.merge(
    data[['Id', 'Gender', 'Type', 'Location', 'Segment', 'Marked']],
    on=['Id', 'Gender', 'Type', 'Location', 'Segment'],
    how='left'
)

# Fill missing with 0 = not marked
merged['Marked'] = merged['Marked'].fillna(0).astype(int)

In [32]:
# Logistic mixed-effects model
model = smf.glm(
    "Marked ~ Segment * Gender * Location * Type",
    data=merged,
    family=sm.families.Binomial()
)
result = model.fit()
print(result.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:                 Marked   No. Observations:               173448
Model:                            GLM   Df Residuals:                   173376
Model Family:                Binomial   Df Model:                           71
Link Function:                  Logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -66101.
Date:                Wed, 18 Jun 2025   Deviance:                   1.3220e+05
Time:                        16:18:55   Pearson chi2:                 1.73e+05
No. Iterations:                     7   Pseudo R-squ. (CS):            0.09603
Covariance Type:            nonrobust                                         
                                                                      coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------