## Demographic analysis

In [1]:
import glob
import os

import numpy as np
import pandas as pd
from scipy import stats
from scipy.stats import chi2_contingency, fisher_exact
from statsmodels.stats.multitest import multipletests

from lib.EventTracking import Events

# Load the demographics table.
# The path is assembled with os.path.join instead of the literal
# 'result\demographics.csv': in a normal (non-raw) string '\d' is an invalid
# escape sequence, and a hard-coded backslash only resolves on Windows.
df = pd.read_csv(os.path.join('result', 'demographics.csv'))

# Recode the numeric flags into readable labels:
# group 1 -> 'ASD', 0 -> 'TD'; sex 1 -> 'Male', 0 -> 'Female'.
# Values other than 0/1 are left untouched by replace().
df['组别'] = df['组别'].replace({1: 'ASD', 0: 'TD'})
df['性别'] = df['性别'].replace({1: 'Male', 0: 'Female'})

# Report the number of loaded rows
print(f"数量：{len(df)}")

数量：184


In [2]:
# Calculate experimental interaction duration (minutes) for every participant.
#
# For each row, the event log '<name>.json' is looked up under result/Events.
# The duration is the span from the first event's StartTime to the last
# event's EndTime. Participants without an event log keep NaN.

# Seed the column with NaN so it is float64 from the start (seeding with None
# would create an object column that needs a later numeric conversion).
df['Experimental_duration'] = np.nan

# Build the directory path portably rather than hard-coding a backslash.
events_path = os.path.join('result', 'Events')

# Only the name column is needed, so iterate it directly instead of full rows
for index, name in df['姓名'].items():
    json_path = os.path.join(events_path, f'{name}.json')
    if not os.path.exists(json_path):
        # No event log for this participant: leave the duration as NaN
        continue
    events = Events(json_path)
    # NOTE(review): assumes events.events is a non-empty, time-ordered sequence
    # whose items expose datetime-like StartTime/EndTime — confirm in lib.EventTracking
    first_event, last_event = events.events[0], events.events[-1]
    # Seconds -> minutes
    duration = (last_event.EndTime - first_event.StartTime).total_seconds() / 60
    df.at[index, 'Experimental_duration'] = duration

# Optional export of the enriched table (kept disabled):
# df.to_csv(os.path.join('result', 'demographics_with_duration.csv'), index=False, encoding='utf-8-sig')


In [3]:
# Partition the participants by diagnostic group label
asd_group = df[df['组别'] == 'ASD']
td_group = df[df['组别'] == 'TD']

# Headcount per group
num_asd, num_td = len(asd_group), len(td_group)

# Gender ratio: per-group {label: count}; a label absent from a group counts as 0
gender_asd_count = asd_group['性别'].value_counts().to_dict()
gender_td_count = td_group['性别'].value_counts().to_dict()
male_asd, female_asd = (gender_asd_count.get(label, 0) for label in ('Male', 'Female'))
male_td, female_td = (gender_td_count.get(label, 0) for label in ('Male', 'Female'))

# Print the summary, one line per statistic
summary_lines = (
    f'ASD组人数    : {num_asd:>5}',
    f'TD组人数     : {num_td:>5}',
    f'ASD组男女比例: 男 = {male_asd:>5}, 女 = {female_asd:>5}',
    f'TD组男女比例 : 男 = {male_td:>5}, 女 = {female_td:>5}',
)
for summary_line in summary_lines:
    print(summary_line)

ASD组人数    :    99
TD组人数     :    85
ASD组男女比例: 男 =    83, 女 =    16
TD组男女比例 : 男 =    47, 女 =    38


In [4]:
# Total experimental duration per group, reported in hours.
# Experimental_duration is stored in minutes, hence the division by 60.
MINUTES_PER_HOUR = 60
asd_total_duration = asd_group['Experimental_duration'].sum() / MINUTES_PER_HOUR
td_total_duration = td_group['Experimental_duration'].sum() / MINUTES_PER_HOUR
overall_total_duration = asd_total_duration + td_total_duration

print(f'ASD组实验总时长: {asd_total_duration:.2f}小时')
print(f'TD组实验总时长 : {td_total_duration:.2f}小时')
print(f'实验总时长: {overall_total_duration:.2f}小时')


ASD组实验总时长: 40.06小时
TD组实验总时长 : 32.83小时
实验总时长: 72.90小时


In [8]:
# Group differences (ASD vs TD)
#
# Builds `result`, one row per demographic variable, holding the test that was
# used, the per-group descriptive summary, the test statistic and raw p-value,
# plus Benjamini-Hochberg (FDR) corrected p-values and a significance flag.
# chi2_contingency / fisher_exact come from the notebook's top import cell.

ALPHA = 0.05  # threshold for the Shapiro-Wilk normality check and for the significance flag
RESULT_COLUMNS = ["item", "统计方法", "ASD mean±std", "TD mean±std", "统计量Z/T", "P值"]


def _mean_std(values):
    """Format a numeric Series as 'mean±std' with two decimals."""
    return f"{values.mean():.2f}±{values.std():.2f}"


def _compare_continuous(asd_values, td_values, alpha=ALPHA):
    """Choose and run a two-sample test for one continuous variable.

    stats.ttest_ind (default settings) is used when Shapiro-Wilk does not
    reject normality in either group at `alpha`; otherwise a two-sided
    Mann-Whitney U test is used.

    Returns:
        (method_name, statistic, p_value)
    """
    _, p_normal_asd = stats.shapiro(asd_values)
    _, p_normal_td = stats.shapiro(td_values)
    if p_normal_asd > alpha and p_normal_td > alpha:
        statistic, p_value = stats.ttest_ind(asd_values, td_values)
        return 't-test', statistic, p_value
    statistic, p_value = stats.mannwhitneyu(asd_values, td_values, alternative='two-sided')
    return 'Mann-Whitney U', statistic, p_value


# Rows are collected in a list and turned into a DataFrame once, instead of
# growing the frame row by row.
rows = []
raw_p_values = []

# Age comparison: always a two-sided Mann-Whitney U test (no normality pre-check).
# Missing values are dropped so the test sees the same observations that
# mean()/std() summarise (those skip NaN); a NaN would otherwise propagate
# into the test result under scipy's default nan_policy.
age_asd = asd_group['Age'].dropna()
age_td = td_group['Age'].dropna()
age_statistic, age_p_value = stats.mannwhitneyu(age_asd, age_td, alternative='two-sided')
raw_p_values.append(age_p_value)
rows.append(["Age", "Mann-Whitney U", _mean_std(age_asd), _mean_std(age_td), age_statistic, age_p_value])

# Gender comparison on the group x sex contingency table
contingency_table = pd.crosstab(df['组别'], df['性别'])
chi2, p_chi2, dof, expected = chi2_contingency(contingency_table)

# Use Fisher's exact test if any expected frequency < 5, otherwise use chi-square.
# Fisher's test is only evaluated when it is actually needed, so a table it
# cannot handle no longer breaks the chi-square path.
if (expected < 5).any():
    oddsratio, p_fisher = fisher_exact(contingency_table)
    gender_p_value = p_fisher
    gender_method = "Fisher's exact test"
    gender_statistic = oddsratio  # the reported "statistic" is the odds ratio here
else:
    gender_p_value = p_chi2
    gender_method = "Chi-square test"
    gender_statistic = chi2

raw_p_values.append(gender_p_value)
rows.append([
    "Gender (Male vs Female)",
    gender_method,
    f"Male: {male_asd}, Female: {female_asd}",
    f"Male: {male_td}, Female: {female_td}",
    f"{gender_statistic:.4f}",
    gender_p_value,
])

# Other continuous variables: test chosen per variable by the normality check
for col in ['ABC', '克氏', 'Experimental_duration']:
    asd_value = asd_group[col].dropna()
    td_value = td_group[col].dropna()
    method, col_statistic, col_p_value = _compare_continuous(asd_value, td_value)
    raw_p_values.append(col_p_value)
    rows.append([col, method, _mean_std(asd_value), _mean_std(td_value), col_statistic, col_p_value])

result = pd.DataFrame(rows, columns=RESULT_COLUMNS)

# Correct p-values for multiple comparisons (Benjamini-Hochberg FDR);
# element [1] of the multipletests return value is the corrected p-value array.
corrected_p_values = multipletests(raw_p_values, method='fdr_bh')[1]

result['corrected_p_values'] = corrected_p_values
result['Significance'] = result['corrected_p_values'].apply(lambda p: '*' if p < ALPHA else '')

result

Unnamed: 0,item,统计方法,ASD mean±std,TD mean±std,统计量Z/T,P值,corrected_p_values,Significance
0,Age,Mann-Whitney U,75.93±17.99,78.41±18.88,3910.5,0.4102545,0.4102545,
1,Gender (Male vs Female),Chi-square test,"Male: 83, Female: 16","Male: 47, Female: 38",16.6207,4.564892e-05,7.608154e-05,*
2,ABC,Mann-Whitney U,38.43±23.48,1.72±3.49,8300.5,7.670033000000001e-31,3.835017e-30,*
3,克氏,Mann-Whitney U,10.75±5.39,1.13±1.62,8115.0,6.858173e-28,1.7145430000000003e-27,*
4,Experimental_duration,Mann-Whitney U,24.28±4.45,23.18±4.02,4837.0,0.08075221,0.1009403,
