In [1]:
"""
Gender Differences Analysis on Facial Expression Features (MANOVA)
分析性别差异对面部表情特征的影响（多变量分析）
"""

import pandas as pd
import numpy as np
from statsmodels.multivariate.manova import MANOVA
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import os
import sys
import warnings
warnings.filterwarnings('ignore')

# 禁用输出缓冲
sys.stdout.reconfigure(encoding='utf-8') if hasattr(sys.stdout, 'reconfigure') else None

def load_demographics():
    """加载人口统计学数据"""
    df = pd.read_csv('result/demographics.csv')
    df['性别'] = df['性别'].replace({1: 'Male', 0: 'Female'})
    return df[['姓名', '性别']]

def load_features(feature_type):
    """加载特征数据"""
    file_map = {
        'emotion': 'result/machine_learning/merged_emotion_features.csv',
        'au_intensity': 'result/machine_learning/merged_au_intensities.csv',
        'au_correlation': 'result/machine_learning/merged_au_correlations.csv'
    }
    file_path = file_map.get(feature_type)
    if not file_path or not os.path.exists(file_path):
        return None
    return pd.read_csv(file_path)

def perform_manova(feature_cols, df, group_col='group', gender_col='性别', use_pca=True, n_components=0.95):
    """进行多变量方差分析（MANOVA），使用PCA降维处理高维特征"""
    # 准备数据
    data = df[feature_cols + [group_col, gender_col]].copy()
    
    # 处理缺失值：删除包含缺失值的行
    data = data.dropna(subset=feature_cols)
    
    # 检查是否有足够的样本
    if len(data) < 4:
        print(f"  警告: 删除缺失值后样本数不足 ({len(data)})")
        return None
    
    # 转换组别
    data[group_col] = data[group_col].replace({0: 'TD', 1: 'ASD'})
    
    # 检查样本量
    group_counts = data.groupby([group_col, gender_col]).size()
    if group_counts.min() < 2:
        print(f"  警告: 某个组的样本数不足 (最小: {group_counts.min()})")
        return None
    
    try:
        # 提取特征矩阵
        X = data[feature_cols].values
        
        # 检查是否有无穷大或NaN值
        if np.any(np.isinf(X)) or np.any(np.isnan(X)):
            print(f"  警告: 特征矩阵包含无穷大或NaN值")
            X = np.nan_to_num(X, nan=0.0, posinf=0.0, neginf=0.0)
        
        # 如果特征数量过多或使用PCA，进行降维
        n_samples, n_features = X.shape
        max_features = min(n_samples - 4, 50)  # MANOVA要求特征数 < 样本数-组数
        
        # 强制使用PCA（因为即使特征数不多，也可能存在线性相关）
        if use_pca or n_features > max_features or n_features >= n_samples:
            # 标准化
            scaler = StandardScaler()
            X_scaled = scaler.fit_transform(X)
            
            # PCA降维：保留95%方差或最多max_features个主成分
            if isinstance(n_components, float):
                pca = PCA(n_components=n_components, random_state=42)
            else:
                pca = PCA(n_components=min(n_components, max_features), random_state=42)
            
            X_pca = pca.fit_transform(X_scaled)
            n_components_used = X_pca.shape[1]
            
            # 创建主成分列名
            pc_cols = [f'PC{i+1}' for i in range(n_components_used)]
            data_pca = pd.DataFrame(X_pca, columns=pc_cols, index=data.index)
            data_pca[group_col] = data[group_col].values
            data_pca[gender_col] = data[gender_col].values
            
            # 使用主成分进行MANOVA
            formula = '(' + ' + '.join(pc_cols) + ')' + f' ~ C({group_col}) * C({gender_col})'
            manova = MANOVA.from_formula(formula, data=data_pca)
        else:
            # 如果特征数合理，直接使用原始特征
            print(f"  直接使用原始特征: {n_features} 个特征")
            formula = '(' + ' + '.join(feature_cols) + ')' + f' ~ C({group_col}) * C({gender_col})'
            manova = MANOVA.from_formula(formula, data=data)
        
        result = manova.mv_test()
        
        # 提取结果
        manova_results = {}
        
        # 提取组别主效应
        if f'C({group_col})' in result.results:
            group_result = result.results[f'C({group_col})']
            stat_table = group_result['stat']
            if 'Wilks\' lambda' in stat_table.index:
                # 检查列名（可能是'F'或'F Value'等）
                wilks_row = stat_table.loc['Wilks\' lambda']
                manova_results['group_Wilks'] = wilks_row.get('Value', np.nan)
                # 尝试不同的F值列名
                for f_col in ['F', 'F Value', 'F-value', 'F_value']:
                    if f_col in wilks_row.index:
                        manova_results['group_F'] = wilks_row[f_col]
                        break
                else:
                    manova_results['group_F'] = np.nan
                # 尝试不同的p值列名
                for p_col in ['Pr > F', 'p-value', 'p_value', 'P>F']:
                    if p_col in wilks_row.index:
                        manova_results['group_p'] = wilks_row[p_col]
                        break
                else:
                    manova_results['group_p'] = np.nan
        
        # 提取性别主效应
        if f'C({gender_col})' in result.results:
            gender_result = result.results[f'C({gender_col})']
            stat_table = gender_result['stat']
            if 'Wilks\' lambda' in stat_table.index:
                wilks_row = stat_table.loc['Wilks\' lambda']
                manova_results['gender_Wilks'] = wilks_row.get('Value', np.nan)
                for f_col in ['F', 'F Value', 'F-value', 'F_value']:
                    if f_col in wilks_row.index:
                        manova_results['gender_F'] = wilks_row[f_col]
                        break
                else:
                    manova_results['gender_F'] = np.nan
                for p_col in ['Pr > F', 'p-value', 'p_value', 'P>F']:
                    if p_col in wilks_row.index:
                        manova_results['gender_p'] = wilks_row[p_col]
                        break
                else:
                    manova_results['gender_p'] = np.nan
        
        # 提取交互效应
        interaction_key = f'C({group_col}):C({gender_col})'
        if interaction_key in result.results:
            interaction_result = result.results[interaction_key]
            stat_table = interaction_result['stat']
            if 'Wilks\' lambda' in stat_table.index:
                wilks_row = stat_table.loc['Wilks\' lambda']
                manova_results['interaction_Wilks'] = wilks_row.get('Value', np.nan)
                for f_col in ['F', 'F Value', 'F-value', 'F_value']:
                    if f_col in wilks_row.index:
                        manova_results['interaction_F'] = wilks_row[f_col]
                        break
                else:
                    manova_results['interaction_F'] = np.nan
                for p_col in ['Pr > F', 'p-value', 'p_value', 'P>F']:
                    if p_col in wilks_row.index:
                        manova_results['interaction_p'] = wilks_row[p_col]
                        break
                else:
                    manova_results['interaction_p'] = np.nan
        
        # 添加PCA信息
        if use_pca or n_features > max_features or n_features >= n_samples:
            manova_results['PCA_used'] = True
            manova_results['Original_features'] = n_features
            manova_results['PCA_components'] = n_components_used if 'n_components_used' in locals() else n_features
        else:
            manova_results['PCA_used'] = False
            manova_results['Original_features'] = n_features
            manova_results['PCA_components'] = n_features
        
        return manova_results
    except Exception as e:
        print(f"MANOVA执行错误: {e}")
        import traceback
        traceback.print_exc()
        return None

def main():
    """主函数"""
    demo_df = load_demographics()
    
    feature_configs = [
        {'type': 'emotion', 'name': '情绪特征', 'exclude_cols': ['姓名', 'group', 'ABC', '克氏']},
        {'type': 'au_intensity', 'name': 'AU强度特征', 'exclude_cols': ['姓名', 'group']},
        {'type': 'au_correlation', 'name': 'AU协同性特征', 'exclude_cols': ['姓名', 'group']}
    ]
    
    manova_results = []
    
    for config in feature_configs:
        df = load_features(config['type'])
        if df is None:
            print(f"  警告: 未找到 {config['type']} 特征文件")
            continue
        
        feature_cols = [col for col in df.columns if col not in config['exclude_cols']]
        
        # 多变量分析（MANOVA）- 将同一类型的所有特征作为整体
        df_merged = df.merge(demo_df[['姓名', '性别']], on='姓名', how='inner')
        
        if 'group' not in df_merged.columns:
            print(f"  警告: 未找到group列")
            continue
        
        if len(feature_cols) <= 1:
            print(f"  警告: 特征数量不足 ({len(feature_cols)})")
            continue
        
        manova_result = perform_manova(feature_cols, df_merged, 'group', '性别')
        if manova_result:
            manova_result['Feature_Type'] = config['name']
            manova_result['N_Features'] = len(feature_cols)
            manova_results.append(manova_result)
        else:
            print(f"  MANOVA分析失败或样本量不足")
    
    # 保存结果
    output_dir = 'result/gender_group_interaction'
    os.makedirs(output_dir, exist_ok=True)
    
    # MANOVA结果表（多变量分析）
    if manova_results:
        manova_df = pd.DataFrame(manova_results)
        # 重新排列列顺序
        cols_order = ['Feature_Type',
                     'group_Wilks', 'group_F', 'group_p',
                     'gender_Wilks', 'gender_F', 'gender_p',
                     'interaction_Wilks', 'interaction_F', 'interaction_p']
        manova_df = manova_df[[c for c in cols_order if c in manova_df.columns]]
        
        # 标记显著性
        manova_df['gender_significant'] = manova_df['gender_p'].apply(lambda p: '*' if pd.notna(p) and p < 0.05 else '')
        manova_df['interaction_significant'] = manova_df['interaction_p'].apply(lambda p: '*' if pd.notna(p) and p < 0.05 else '')
        
        print("\n" + "="*100)
        print("多变量分析 (MANOVA) 结果:")
        print("="*100)
        print(manova_df.to_string(index=False))
        manova_df.to_csv(f'{output_dir}/summary_manova.csv', index=False, encoding='utf-8-sig')
        print(f"\n结果已保存至: {output_dir}/summary_manova.csv")
    else:
        print("\n警告: 没有获得任何MANOVA结果")

if __name__ == '__main__':
    main()



多变量分析 (MANOVA) 结果:
Feature_Type  group_Wilks  group_F      group_p  gender_Wilks  gender_F  gender_p  interaction_Wilks  interaction_F  interaction_p gender_significant interaction_significant
        情绪特征     0.808335 1.618544 4.558000e-02      0.828898  1.409045  0.113801           0.855218       1.155609       0.293601                                           
      AU强度特征     0.276838 3.685096 6.226385e-08      0.533319  1.234446  0.192761           0.542066       1.191761       0.234344                                           
     AU协同性特征     0.112384 1.696230 4.063505e-02      0.135763  1.367149  0.150719           0.167559       1.066964       0.431232                                           

结果已保存至: result/gender_group_interaction/summary_manova.csv
