## 情绪特征预处理

In [2]:
import pandas as pd
import os
import numpy as np
import scipy.stats as stats

def smooth_emotions2(emotion_series, window_size=10, min_duration=10):
    """
    Smooth a per-frame emotion label sequence with two rule-based passes.

    1. Majority vote: every frame whose label is not 'undefined' is replaced
       by the most frequent non-'undefined' label among the frames from
       ``i - window_size // 2`` to ``i + window_size // 2`` (inclusive,
       clipped to the series). 'undefined' frames are left untouched. The
       vote always reads the original labels, never already-smoothed ones.
    2. Short-run removal: a run of identical labels shorter than
       ``min_duration`` frames is split in the middle; the first half takes
       the label of the frame before the run and the second half the label
       of the frame after it ('undefined' is used where the run touches the
       start or end of the series). The scan restarts from the beginning
       after every split and stops once a full scan makes no change.

    Args:
        emotion_series: pandas Series holding one emotion label per frame.
        window_size: size of the majority-vote window, in frames.
        min_duration: minimum number of frames a run must last to be kept.

    Returns:
        pandas Series of smoothed labels carrying the index of the input.
        When the series has fewer than ``min_duration`` frames only the
        majority vote is applied (see the guard below).
    """
    emotions = emotion_series.values
    smoothed = emotions.copy()
    n = len(emotions)
    half_window = window_size // 2

    # Pass 1: sliding-window majority vote over the original labels.
    for i in range(n):
        if emotions[i] == 'undefined':
            continue  # undefined frames are never relabelled by the vote
        window = emotions[max(0, i - half_window):min(n, i + half_window + 1)]
        valid_emotions = window[window != 'undefined']
        if len(valid_emotions) > 0:
            labels, counts = np.unique(valid_emotions, return_counts=True)
            smoothed[i] = labels[counts.argmax()]

    # A series shorter than min_duration can never contain a long-enough run:
    # pass 2 would rewrite the whole series to 'undefined' on every scan and
    # loop forever, so stop after the majority vote.
    if n < min_duration:
        return pd.Series(smoothed, index=emotion_series.index)

    # Pass 2: split runs shorter than min_duration between their neighbours.
    while True:
        i = 0
        modified = False
        # The scan stops before the last frame, so a run made up of only the
        # final frame is never examined.
        while i < n - 1:
            start = i
            while i < n - 1 and smoothed[i] == smoothed[i + 1]:
                i += 1
            end = i + 1  # exclusive end of the current run

            if end - start < min_duration:
                prev_emotion = smoothed[start - 1] if start > 0 else 'undefined'
                next_emotion = smoothed[end] if end < n else 'undefined'

                # First half joins the previous run, second half the next one.
                mid = start + (end - start) // 2
                smoothed[start:mid] = prev_emotion
                smoothed[mid:end] = next_emotion

                modified = True
                break  # run boundaries changed: rescan from the beginning

            i += 1

        if not modified:
            break

    return pd.Series(smoothed, index=emotion_series.index)

def process_and_save_smoothed_emotions(input_dir, output_dir, window_size=10, min_duration=10):
    """
    Smooth the 'Emotion' column of every CSV file in ``input_dir`` and write
    the result, under the same file name, into ``output_dir``.

    Each input file is expected to start with two header lines followed by a
    regular CSV table. The two header lines are copied verbatim to the output,
    followed by an empty line and the table with an added 'SmoothedEmotion'
    column. A file that fails to process is reported and skipped; the
    remaining files are still processed.

    Args:
        input_dir: directory containing the input ``.csv`` files.
        output_dir: directory to write to; created if it does not exist.
        window_size: forwarded to ``smooth_emotions2``.
        min_duration: forwarded to ``smooth_emotions2``.

    Returns:
        True when the directory was processed (individual files may still have
        been skipped with a warning), False when the run as a whole failed,
        e.g. because ``input_dir`` contains no CSV files.
    """
    os.makedirs(output_dir, exist_ok=True)

    try:
        csv_files = [f for f in os.listdir(input_dir) if f.endswith('.csv')]
        if not csv_files:
            raise ValueError(f"在 {input_dir} 中没有找到CSV文件")

        total = len(csv_files)
        print(f"开始处理{total}个文件的数据...")

        for i, csv_file in enumerate(csv_files, 1):
            input_file = os.path.join(input_dir, csv_file)
            output_file = os.path.join(output_dir, csv_file)

            print(f"\r处理进度：{i}/{total} ({i/total*100:.1f}%) - "
                  f"当前处理：{csv_file}", end='')
            try:
                # Keep the two leading header lines so they can be copied
                # unchanged to the output file.
                with open(input_file, 'r') as f:
                    header_lines = [next(f) for _ in range(2)]

                df = pd.read_csv(input_file, skiprows=2)
                df['SmoothedEmotion'] = smooth_emotions2(df['Emotion'], window_size, min_duration)

                with open(output_file, 'w', newline='') as f:
                    f.writelines(header_lines)
                    f.write('\n')
                    df.to_csv(f, index=False)

            except Exception as e:
                # Best effort: one bad file must not abort the whole batch.
                print(f"\n警告：处理 {csv_file} 时出错: {str(e)}")
                continue

        # Report the directory that was actually written to.
        print(f"\n所有数据处理完成，平滑后的结果已保存至 {output_dir}")
        return True

    except Exception as e:
        print(f"错误：{str(e)}")
        return False

# 执行处理
# process_and_save_smoothed_emotions(input_dir="result/Emotions", output_dir="result/Emotions_smooth2", window_size=10, min_duration=10)
# process_and_save_smoothed_emotions(input_dir="result/Emotions", output_dir="result/Emotions_smooth3", window_size=5, min_duration=3)


## 提取特征

In [80]:
import pandas as pd
import os
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns

# 提取特征
def _compare_transition_groups(trans_data, output_dir, is_save):
    """Compare mean transition probabilities between group 1 and group 0.

    ``trans_data`` is the per-subject 'TransWithOutSelf' table. For every
    ordered pair of different emotions the difference of the group means
    (组别 == 1 minus 组别 == 0) and the p-value of an independent-samples
    t-test are stored at [from, to]. Diagonal entries are left at 0 in both
    matrices. When ``is_save`` is true both matrices are written to an Excel
    workbook in ``output_dir``.
    """
    emotion_order = ['neutral', 'happy', 'sad', 'surprise', 'anger']
    n_emotions = len(emotion_order)

    asd_rows = trans_data[trans_data['组别'] == 1]
    td_rows = trans_data[trans_data['组别'] == 0]

    mean_diff = np.zeros((n_emotions, n_emotions))
    p_values = np.zeros((n_emotions, n_emotions))

    for i, from_emotion in enumerate(emotion_order):
        for j, to_emotion in enumerate(emotion_order):
            if from_emotion == to_emotion:
                continue
            col_name = f'TransWithOutSelf_{from_emotion}_to_{to_emotion}'
            asd_data, td_data = asd_rows[col_name], td_rows[col_name]
            mean_diff[i, j] = asd_data.mean() - td_data.mean()
            _, p_value = stats.ttest_ind(asd_data, td_data)
            p_values[i, j] = p_value

    if is_save:
        df_data = pd.DataFrame(mean_diff, index=emotion_order, columns=emotion_order)
        df_p_value = pd.DataFrame(p_values, index=emotion_order, columns=emotion_order)
        output_file = os.path.join(output_dir, 'TransBetweenGroups_analysis(From_Row_to_Col).xlsx')
        with pd.ExcelWriter(output_file) as writer:
            df_data.to_excel(writer, sheet_name='Mean_Difference')
            df_p_value.to_excel(writer, sheet_name='P_Values')
        print(f"分析结果已保存至：{output_file}")

    return mean_diff, p_values


def _replace_outliers_with_group_median(results_df, feature_cols):
    """Replace, in place, values outside 1.5*IQR of their group by the group median.

    Quartiles and the median are computed per column and per group
    (组别 0 and 1) from the values before any replacement.
    """
    for col in feature_cols:
        for group in [0, 1]:
            mask = results_df['组别'] == group
            group_data = results_df.loc[mask, col]
            q1 = group_data.quantile(0.25)
            q3 = group_data.quantile(0.75)
            iqr = q3 - q1
            lower_bound = q1 - 1.5 * iqr
            upper_bound = q3 + 1.5 * iqr
            median = np.nanmedian(group_data)
            is_outlier = (group_data < lower_bound) | (group_data > upper_bound)
            results_df.loc[group_data.index[is_outlier], col] = median


def analyze_emotion_features(emotion_dir, feature_type, remove_outlier=True, demographic_csv=r'result/demographics.csv', output_dir="result/Emotion_features", is_save=True):
    """
    Extract one family of emotion features for every subject CSV in
    ``emotion_dir`` and join it with the demographic table.

    Every file is read with its first two lines skipped, rows whose 'Emotion'
    is not one of the five known emotions are dropped, and the remaining
    labels are smoothed with ``smooth_emotions2`` (default parameters). A file
    that raises during processing is reported and skipped.

    Supported ``feature_type`` values:
        'GEV'              share of frames carrying each smoothed emotion.
        'Frequency'        onsets per second of each emotion; an onset is a
                           frame with the emotion whose previous frame had a
                           different label (the first frame is never counted).
        'Duration'         mean run length of each emotion in milliseconds.
        'Probability'      mean and standard deviation (rounded to 2 decimals)
                           of the 'Probability' column per smoothed emotion.
        'TransWithSelf'    row-normalised frame-to-frame transition matrix,
                           self-transitions included.
        'TransWithOutSelf' row-normalised transition matrix over label changes
                           only.
        'GroupCompareTransWithOutSelf'
                           group comparison of the 'TransWithOutSelf' features
                           (see Returns).

    Args:
        emotion_dir: directory holding one CSV per subject; the file name
            without extension is used as the subject name.
        feature_type: one of the values listed above.
        remove_outlier: replace per-group 1.5*IQR outliers by the group median.
        demographic_csv: CSV with the columns 姓名, 组别, ABC, S1, R, B, L, S2,
            克氏 and Age; joined to the features on 姓名 == subject name.
        output_dir: directory for the result files; created if missing.
        is_save: write the result to ``output_dir``.

    Returns:
        A DataFrame with one row per subject (demographics plus features), or,
        for 'GroupCompareTransWithOutSelf', a tuple
        ``(mean_difference_matrix, p_value_matrix)`` of 5x5 numpy arrays
        indexed [from_emotion, to_emotion].
    """
    EMOTIONS = ['neutral', 'happy', 'sad', 'surprise', 'anger']
    FRAME_DURATION = 20  # milliseconds per frame
    # NOTE(review): GEV and Frequency store the value measured for 'happy'
    # under the '*_sad' column and vice versa, while Duration, Probability and
    # the transition features keep the labels as they are. The swap is kept
    # unchanged here; confirm whether it is intentional.
    SWAPPED_LABEL = {'happy': 'sad', 'sad': 'happy'}

    os.makedirs(output_dir, exist_ok=True)

    csv_files = [f for f in os.listdir(emotion_dir) if f.endswith('.csv')]
    print(f"开始处理{len(csv_files)}个文件...")

    # The group comparison works on the complete per-subject table, so it is
    # computed once here instead of inside the per-file loop.
    if feature_type == 'GroupCompareTransWithOutSelf':
        trans_data = analyze_emotion_features(emotion_dir, 'TransWithOutSelf',
                                              remove_outlier=remove_outlier, demographic_csv=demographic_csv,
                                              output_dir=output_dir, is_save=False)
        print(trans_data)
        return _compare_transition_groups(trans_data, output_dir, is_save)

    # One feature dict per successfully processed subject.
    results = []

    for file in csv_files:
        try:
            subject_name = os.path.splitext(file)[0]

            df = pd.read_csv(os.path.join(emotion_dir, file), skiprows=2)

            # Copy the filtered frame so the new column is added to an
            # independent object rather than to a slice of the original.
            df = df[df['Emotion'].isin(EMOTIONS)].copy()
            df['SmoothedEmotion'] = smooth_emotions2(df['Emotion'])
            smoothed = df['SmoothedEmotion']

            # Total duration of the kept frames, in seconds.
            total_duration = len(df) * FRAME_DURATION / 1000

            subject_features = {'Person': subject_name}

            if feature_type == 'GEV':
                gev_series = df.groupby('SmoothedEmotion').size() / len(df)
                for emotion in EMOTIONS:
                    label = SWAPPED_LABEL.get(emotion, emotion)
                    subject_features[f'GEV_{label}'] = gev_series.get(emotion, 0)

            if feature_type == 'Frequency':
                for emotion in EMOTIONS:
                    is_emotion = (smoothed == emotion).to_numpy(dtype=bool)
                    # Onset: this frame has the emotion, the previous one does not.
                    onsets = int(np.count_nonzero(is_emotion[1:] & ~is_emotion[:-1]))
                    label = SWAPPED_LABEL.get(emotion, emotion)
                    subject_features[f'Frequency_{label}'] = onsets / total_duration

            if feature_type == 'Duration':
                # True on the first frame of every run of identical labels.
                run_starts = smoothed != smoothed.shift()
                runs_per_emotion = smoothed[run_starts].value_counts()
                frames_per_emotion = df.groupby('SmoothedEmotion').size()
                for emotion in EMOTIONS:
                    if emotion in runs_per_emotion.index and emotion in frames_per_emotion.index:
                        avg_duration = (frames_per_emotion[emotion] / runs_per_emotion[emotion]) * FRAME_DURATION
                        subject_features[f'Duration_{emotion}'] = avg_duration
                    else:
                        subject_features[f'Duration_{emotion}'] = 0

            if feature_type == 'Probability':
                mean_probability = df.groupby('SmoothedEmotion')['Probability'].mean().round(2)
                std_probability = df.groupby('SmoothedEmotion')['Probability'].std().round(2)
                for emotion in EMOTIONS:
                    subject_features[f'Probability_mean_{emotion}'] = mean_probability.get(emotion, 0)
                    subject_features[f'Probability_std_{emotion}'] = std_probability.get(emotion, 0)

            # Transition matrix including self-transitions.
            if feature_type == 'TransWithSelf':
                transition_counts = pd.crosstab(
                    smoothed,
                    smoothed.shift(-1),
                    normalize='index'
                )
                for e1 in EMOTIONS:
                    for e2 in EMOTIONS:
                        # Column e2 is the target emotion, row e1 the source.
                        subject_features[f'TransWithSelf_{e1}_to_{e2}'] = transition_counts.get(e2, {}).get(e1, 0)

            # Transition matrix over label changes only.
            if feature_type == 'TransWithOutSelf':
                current_emotions = smoothed.iloc[:-1].reset_index(drop=True)
                next_emotions = smoothed.iloc[1:].reset_index(drop=True)
                mask = current_emotions != next_emotions
                current_emotions = current_emotions[mask]
                next_emotions = next_emotions[mask]
                transition_counts = pd.crosstab(
                    current_emotions,
                    next_emotions,
                    normalize='index'
                )
                for e1 in EMOTIONS:
                    for e2 in EMOTIONS:
                        subject_features[f'TransWithOutSelf_{e1}_to_{e2}'] = transition_counts.get(e2, {}).get(e1, 0)

            results.append(subject_features)

            print(f"\r处理进度：{len(results)}/{len(csv_files)}", end='')

        except Exception as e:
            # Best effort: report the file and carry on with the next one.
            print(f"\n处理文件 {file} 时出错: {str(e)}")
            continue

    results_df = pd.DataFrame(results)

    # Attach the demographic columns; subjects without a match are dropped by
    # the inner join.
    info_cols = ['姓名', '组别', 'ABC', 'S1', 'R', 'B', 'L', 'S2', '克氏', 'Age']
    demographic_df = pd.read_csv(demographic_csv)[info_cols]
    results_df = pd.merge(demographic_df, results_df, left_on='姓名', right_on='Person')
    results_df = results_df.drop('姓名', axis=1)

    feature_cols = [col for col in results_df.columns if col.startswith(feature_type)]

    if remove_outlier:
        _replace_outliers_with_group_median(results_df, feature_cols)

    print("\n处理完成！")

    if is_save:
        output_file = os.path.join(output_dir, f'{feature_type}.csv')
        results_df.to_csv(output_file, index=False, encoding='utf-8-sig')
        print(f"结果已保存至：{output_file}")

    return results_df

# 统计分析
def _significance_marker(p_value):
    """Return '***', '**', '*' or 'ns' for p < 0.001, < 0.01, < 0.05 or otherwise (NaN gives 'ns')."""
    if p_value < 0.001:
        return '***'
    if p_value < 0.01:
        return '**'
    if p_value < 0.05:
        return '*'
    return 'ns'


def _format_p_value(p_value):
    """Format a p-value with three decimals, '<0.001' for small values and 'nan' for NaN."""
    if pd.isna(p_value):
        return "nan"
    return f"{p_value:.3f}" if p_value >= 0.001 else "<0.001"


def statistics_analysis(feature_type, feature_dir="result/Emotion_features"):
    """
    Compare every ``feature_type`` column between the TD group (组别 == 0)
    and the ASD group (组别 == 1) and print a summary table.

    For each column whose name starts with ``feature_type``:
    1. descriptive statistics (mean ± standard deviation per group);
    2. Shapiro-Wilk normality test per group;
    3. Levene test for equal variances;
    4. group comparison: independent t-test when both groups look normal
       (Welch variant when the Levene test rejects equal variances),
       otherwise a two-sided Mann-Whitney U test;
    5. FDR correction across all columns (computed on the finite p-values
       only, so an undefined test result does not affect the others);
    6. significance markers for the raw and the corrected p-value.

    Args:
        feature_type: feature family; ``{feature_type}.csv`` is read from
            ``feature_dir``.
        feature_dir: directory holding the feature CSV files.

    Returns:
        DataFrame with one row per feature and the columns 特征,
        TD组 (M±SD), ASD组 (M±SD), 检验方法, p值, FDR校正p值 and 显著性.
        Rows named after one of the five emotions come first, in the fixed
        emotion order; all other features follow in file order.

    Raises:
        ValueError: if the file has no column starting with ``feature_type``.
    """
    from statsmodels.stats.multitest import fdrcorrection

    EMOTION_ORDER = ['Neutral', 'Happy', 'Sad', 'Surprise', 'Anger']

    feature_df = pd.read_csv(os.path.join(feature_dir, f'{feature_type}.csv'))
    feature_cols = [col for col in feature_df.columns if col.startswith(feature_type)]

    print(feature_cols)

    if not feature_cols:
        raise ValueError(f"{feature_type}.csv 中没有以 {feature_type} 开头的特征列")

    stats_results = []

    for col in feature_cols:
        td_data = feature_df[feature_df['组别'] == 0][col]
        asd_data = feature_df[feature_df['组别'] == 1][col]

        desc_td = f"{td_data.mean():.3f}±{td_data.std():.3f}"
        desc_asd = f"{asd_data.mean():.3f}±{asd_data.std():.3f}"

        _, p_normal_td = stats.shapiro(td_data)
        _, p_normal_asd = stats.shapiro(asd_data)
        is_normal = (p_normal_td > 0.05) and (p_normal_asd > 0.05)

        _, p_levene = stats.levene(td_data, asd_data)

        if is_normal:
            _, p_value = stats.ttest_ind(td_data, asd_data, equal_var=(p_levene > 0.05))
            test_method = "t检验"
        else:
            _, p_value = stats.mannwhitneyu(td_data, asd_data, alternative='two-sided')
            test_method = "Mann-Whitney U"

        stats_results.append({
            '特征': col.replace(f'{feature_type}_', '').capitalize(),
            'TD组': desc_td,
            'ASD组': desc_asd,
            '检验方法': test_method,
            'p值': p_value,
        })

    # FDR correction on the finite p-values only: a NaN fed to the correction
    # would otherwise spread to the corrected values of the other features.
    raw_p_values = np.array([result['p值'] for result in stats_results], dtype=float)
    fdr_p_values = np.full(raw_p_values.shape, np.nan)
    finite = np.isfinite(raw_p_values)
    if finite.any():
        fdr_p_values[finite] = fdrcorrection(raw_p_values[finite])[1]

    for result, fdr_p in zip(stats_results, fdr_p_values):
        result['FDR校正p值'] = fdr_p
        result['显著性'] = f"{_significance_marker(result['p值'])}/{_significance_marker(fdr_p)}"

    results_df = pd.DataFrame(stats_results)

    for col in ['p值', 'FDR校正p值']:
        results_df[col] = results_df[col].apply(_format_p_value)

    # Order emotions first; keep any other feature name as an extra category
    # so it is not replaced by NaN.
    extra_names = [name for name in dict.fromkeys(results_df['特征']) if name not in EMOTION_ORDER]
    results_df['特征'] = pd.Categorical(
        results_df['特征'],
        categories=EMOTION_ORDER + extra_names,
        ordered=True
    )
    results_df = results_df.sort_values('特征')

    results_df.columns = ['特征', 'TD组 (M±SD)', 'ASD组 (M±SD)', '检验方法', 'p值', 'FDR校正p值', '显著性']

    print(f"\n{feature_type}特征组间差异分析结果：")
    print("="*100)

    pd.set_option('display.max_columns', None)
    pd.set_option('display.width', 120)
    pd.set_option('display.max_colwidth', 30)
    pd.set_option('display.colheader_justify', 'center')

    from tabulate import tabulate
    print(tabulate(results_df, headers='keys', tablefmt='pipe', showindex=False))

    print("="*100)

    return results_df


def plot_emotion_bars(feature_type, feature_dir="result/Emotion_features", show_points=True, save_fig=False):
    """
    Draw a grouped bar chart of one feature family, TD group next to ASD group.

    For each of the five emotions the bar height is the group mean of the
    column ``{feature_type}_{emotion}`` and the error bar is the standard
    deviation divided by the square root of the group size.

    Args:
        feature_type: feature family (e.g. 'GEV'); ``{feature_type}.csv`` is
            read from ``feature_dir``.
        feature_dir: directory holding the feature CSV files.
        show_points: overlay the individual subject values as scatter points.
        save_fig: also save the figure as
            ``result/figures/{feature_type}_comparison.png``.
    """
    import matplotlib.pyplot as plt

    # Font settings so that Chinese labels and minus signs render.
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False

    # Emotion name -> tick label, in plotting order.
    label_by_emotion = {
        'Neutral': '中性',
        'Happy': '快乐',
        'Sad': '悲伤',
        'Surprise': '惊讶',
        'Anger': '愤怒',
    }
    emotion_names = list(label_by_emotion)

    table = pd.read_csv(os.path.join(feature_dir, f'{feature_type}.csv'))

    plt.figure(figsize=(12, 6))

    width = 0.35
    centers = np.arange(len(emotion_names))
    columns = [f'{feature_type}_{name.lower()}' for name in emotion_names]

    # (group code, legend label, horizontal offset, colour): TD left, ASD right.
    groups = (
        (0, 'TD组', -width / 2, '#9AC9DB'),
        (1, 'ASD组', width / 2, '#2878B5'),
    )

    def values_for(code, column):
        return table[table['组别'] == code][column]

    # Per group: list of means and list of standard errors, one per emotion.
    summary = {code: ([], []) for code, *_ in groups}
    for column in columns:
        for code, *_ in groups:
            values = values_for(code, column)
            means, sems = summary[code]
            means.append(values.mean())
            sems.append(values.std() / np.sqrt(len(values)))

    for code, legend_label, offset, colour in groups:
        means, sems = summary[code]
        plt.bar(centers + offset, means, width, label=legend_label,
                yerr=sems, capsize=5, color=colour, alpha=0.8)

    if show_points:
        for position, column in enumerate(columns):
            for code, _, offset, colour in groups:
                values = values_for(code, column)
                plt.scatter(np.repeat(position + offset, len(values)),
                            values,
                            color=colour,
                            alpha=0.3,
                            s=30)

    plt.grid(True, axis='y', linestyle='--', alpha=0.7)

    plt.xlabel('情绪类别', fontsize=12)
    plt.ylabel(f'{feature_type}值', fontsize=12)
    plt.title(f'{feature_type}特征在TD组和ASD组的对比', fontsize=14, pad=20)

    plt.xticks(centers, [label_by_emotion[name] for name in emotion_names])

    plt.legend(loc='upper right')

    plt.tight_layout()

    if save_fig:
        os.makedirs('result/figures', exist_ok=True)
        plt.savefig(f'result/figures/{feature_type}_comparison.png', dpi=300, bbox_inches='tight')

    plt.show()
    

def plot_transition_matrix(feature_type, group, include_self_transition=False, is_standardized=False, feature_dir="result/Emotion_features"):
    """
    Plot the mean emotion transition matrix of one group as a heat map.

    Entry [from, to] is the group mean of the column
    ``{feature_type}_{from}_to_{to}``. Relies on the module-level ``plt``
    (matplotlib.pyplot) and ``sns`` (seaborn) imports.

    Args:
        feature_type: feature family; ``{feature_type}.csv`` is read from
            ``feature_dir``.
        group: value of the 组别 column to select (0 or 1).
        include_self_transition: when False the diagonal is left at 0.
        is_standardized: alternately normalise rows and columns
            (Sinkhorn-Knopp) until all row and column sums are within 1e-10
            of 1, or 1000 iterations have run.
        feature_dir: directory holding the feature CSV files.
    """
    labels = ['neutral', 'happy', 'sad', 'surprise', 'anger']
    size = len(labels)

    table = pd.read_csv(os.path.join(feature_dir, f'{feature_type}.csv'))
    members = table[table['组别'] == group]

    # Fill the matrix with the group mean of every transition column.
    matrix = np.zeros((size, size))
    for row, source in enumerate(labels):
        for col, target in enumerate(labels):
            if source == target and not include_self_transition:
                continue
            matrix[row, col] = members[f'{feature_type}_{source}_to_{target}'].mean()

    if is_standardized:
        tolerance = 1e-10
        for _ in range(1000):
            # Row step; an all-zero row is divided by 1 and stays zero.
            row_totals = matrix.sum(axis=1, keepdims=True)
            row_totals[row_totals == 0] = 1
            matrix = matrix / row_totals

            # Column step, with the same protection against division by zero.
            col_totals = matrix.sum(axis=0, keepdims=True)
            col_totals[col_totals == 0] = 1
            matrix = matrix / col_totals

            rows_converged = np.all(np.abs(matrix.sum(axis=1) - 1) < tolerance)
            cols_converged = np.all(np.abs(matrix.sum(axis=0) - 1) < tolerance)
            if rows_converged and cols_converged:
                break

    plt.figure(figsize=(10, 8))
    heatmap_options = dict(
        annot=True,       # print the value in every cell
        fmt='.3f',        # three decimals
        cmap='YlOrRd',
        xticklabels=labels,
        yticklabels=labels,
        vmin=0,
        vmax=1,
    )
    sns.heatmap(matrix, **heatmap_options)

    plt.title(f'组{group}的{feature_type}特征转移概率矩阵')
    plt.xlabel('转移到')
    plt.ylabel('转移自')

    # Font settings so that Chinese labels and minus signs render.
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False

    plt.tight_layout()
    plt.show()


In [1]:
# # 1. GEV特征
# analyze_emotion_features(emotion_dir="result/Emotions", feature_type="GEV", remove_outlier=False)
# statistics_analysis(feature_type="GEV")
# plot_emotion_bars(feature_type="GEV", show_points=False)

# # 2. Frequency特征
# analyze_emotion_features(emotion_dir="result/Emotions_smooth2", feature_type="Frequency", remove_outlier=False)
# statistics_analysis(feature_type="Frequency")
# plot_emotion_bars(feature_type="Frequency", show_points=False)

# # # 3. Duration特征
# analyze_emotion_features(emotion_dir="result/Emotions_smooth2", feature_type="Duration", remove_outlier=False)
# statistics_analysis(feature_type="Duration")
# plot_emotion_bars(feature_type="Duration", show_points=False)

# 4. 识别概率
# analyze_emotion_features(emotion_dir="result/Emotions", feature_type="Probability", remove_outlier=False)
# statistics_analysis(feature_type="Probability")
# plot_emotion_bars(feature_type="Probability", show_points=False)

# # 5. 状态转移矩阵 计算弦图
# analyze_emotion_features(emotion_dir="result/Emotions_smooth2", feature_type="TransWithSelf", remove_outlier=False)
# plot_transition_matrix(feature_type="TransWithSelf", group=0, is_standardized=True)

# # 6. 状态转移矩阵  不包括相同情绪 计算转移概率组间相差的统计学使用这个
# analyze_emotion_features(emotion_dir="result/Emotions_smooth2", feature_type="TransWithOutSelf", remove_outlier=False)
# plot_transition_matrix(feature_type="TransWithOutSelf", group=0)

# # 6.1. 状态转移差值统计学比较
# analyze_emotion_features(emotion_dir="result/Emotions_smooth2", feature_type="GroupCompareTransWithOutSelf", remove_outlier=False)


## 特征汇总

In [83]:
# 汇总
def merge_emotion_features(feature_dir='result/Emotion_features',
                           output_file='result/machine_learning/merged_emotion_features.csv'):
    """
    Merge the GEV, Duration, Frequency and TransWithOutSelf feature tables
    into one table with one row per subject and save it as CSV.

    The demographic columns are taken from ``Duration.csv``. Every feature
    table is joined on the 'Person' column (inner join, validated as 1:1), so
    a subject missing from any table is dropped and feature values can never
    be attached to the wrong subject. Probability features are not part of
    the merged table.

    Args:
        feature_dir: directory holding ``GEV.csv``, ``Duration.csv``,
            ``Frequency.csv`` and ``TransWithOutSelf.csv``.
        output_file: path of the merged CSV; its directory is created if
            missing.

    Returns:
        The merged DataFrame. Columns: 姓名, group, ABC, 克氏, followed by
        all feature columns in the order GEV, Duration, Frequency,
        TransWithOutSelf.
    """
    base_cols = ['组别', 'ABC', 'S1', 'R', 'B', 'L', 'S2', '克氏', 'Age', 'Person']
    feature_names = ['GEV', 'Duration', 'Frequency', 'TransWithOutSelf']

    frames = {
        name: pd.read_csv(os.path.join(feature_dir, f'{name}.csv'))
        for name in feature_names
    }

    # Demographic columns come from the Duration table.
    duration_df = frames['Duration']
    result_df = duration_df[base_cols]

    # Keep 'Person' in each feature table as the join key and drop the
    # demographic columns that are already present in result_df.
    non_key_base_cols = [col for col in base_cols if col != 'Person']
    for name in feature_names:
        features_only = frames[name].drop(columns=non_key_base_cols)
        result_df = result_df.merge(features_only, on='Person', validate='1:1')

    result_df = result_df.rename(columns={
        'Person': '姓名',
        '组别': 'group'
    })

    cols_to_drop = ['S1', 'R', 'B', 'L', 'S2', 'Age']
    result_df = result_df.drop(columns=cols_to_drop)

    # Identification and score columns first, features afterwards.
    ordered_cols = ['姓名', 'group', 'ABC', '克氏']
    other_cols = [col for col in result_df.columns if col not in ordered_cols]
    result_df = result_df[ordered_cols + other_cols]

    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    result_df.to_csv(output_file, index=False, encoding='utf-8-sig')
    print(f"特征汇总完成，已保存至：{output_file}")

    # Row counts make it visible when the inner joins dropped subjects.
    print(f"原始数据行数: {len(duration_df)}")
    print(f"合并后数据行数: {len(result_df)}")

    return result_df

# Run the merge and print the shape (rows, columns) of the merged table
merged_df = merge_emotion_features()
print(merged_df.shape)

特征汇总完成，已保存至：result/machine_learning/merged_emotion_features_0.csv
原始数据行数: 184
合并后数据行数: 184
(184, 44)
