## Emotion Feature Preprocessing

In [None]:
import pandas as pd
import os
import numpy as np
import scipy.stats as stats

def smooth_emotions2(emotion_series, window_size=10, min_duration=10):
    """
    Smooth a per-frame emotion label sequence using a rule-based method.

    Step 1 replaces every valid frame with the most common valid label found
    in a sliding window centred on it ('undefined' frames are neither changed
    nor counted). Step 2 repeatedly dissolves runs shorter than
    ``min_duration`` frames: the first half of the run takes the label of the
    preceding frame and the second half the label of the following frame,
    with 'undefined' standing in for a missing neighbour at either end of the
    sequence.

    Parameters:
        emotion_series: pandas Series of emotion labels, one per frame
        window_size: Sliding window size (frames) used for majority voting
        min_duration: Minimum number of frames a run needs in order to be kept

    Returns:
        pandas Series of smoothed labels carrying the index of the input
    """
    # Object dtype lets labels of any length (e.g. 'undefined') be written back
    emotions = emotion_series.to_numpy(dtype=object)
    smoothed = emotions.copy()
    n = len(emotions)
    half_window = window_size // 2

    # 1. Sliding window majority voting for initial smoothing
    for i in range(n):
        if emotions[i] == 'undefined':
            continue  # 'undefined' frames stay as they are (smoothed is a copy)

        window = emotions[max(0, i - half_window):min(n, i + half_window + 1)]
        # The window always contains frame i itself, so this is never empty
        valid_emotions = window[window != 'undefined']
        unique, counts = np.unique(valid_emotions, return_counts=True)
        # Ties go to the first label in np.unique's sorted order
        smoothed[i] = unique[counts.argmax()]

    # 2. Dissolve runs shorter than min_duration, one run per pass, until a
    #    full pass makes no change
    while True:
        i = 0
        modified = False
        # The scan stops at n - 1, so a final one-frame run is never examined
        while i < n - 1:
            # Find the run of identical labels starting at i
            start = i
            while i < n - 1 and smoothed[i] == smoothed[i + 1]:
                i += 1
            end = i + 1  # Exclusive end of the run

            if end - start < min_duration:
                if start == 0 and end == n:
                    # The whole sequence is a single short run: there is no
                    # neighbour to merge into. Relabelling it would recreate
                    # the same short run on every pass and never terminate,
                    # so it is left unchanged.
                    break

                prev_emotion = smoothed[start - 1] if start > 0 else 'undefined'
                next_emotion = smoothed[end] if end < n else 'undefined'

                # First half goes to the previous label, second half to the next
                mid = start + (end - start) // 2
                smoothed[start:mid] = prev_emotion
                smoothed[mid:end] = next_emotion

                modified = True
                break  # Run boundaries changed; rescan from the beginning

            i += 1

        if not modified:
            break

    return pd.Series(smoothed, index=emotion_series.index)

def process_and_save_smoothed_emotions(input_dir, output_dir, window_size=10, min_duration=10):
    """
    Smooth the 'Emotion' column of every CSV file in a directory and save the results.

    Each input file is expected to start with two header lines followed by a
    regular CSV table containing an 'Emotion' column. The output file keeps
    the two header lines, followed by a blank line and the table with an
    added 'SmoothedEmotion' column. A file that fails is reported and
    skipped; the remaining files are still processed.

    Parameters:
        input_dir: Directory containing the source CSV files
        output_dir: Directory the smoothed CSV files are written to (created if missing)
        window_size: Sliding window size passed to smooth_emotions2
        min_duration: Minimum run length passed to smooth_emotions2

    Returns:
        True when the directory was processed, False when it could not be
        (for example when it contains no CSV files)
    """
    # Create output directory
    os.makedirs(output_dir, exist_ok=True)

    try:
        # Get all CSV files
        csv_files = [f for f in os.listdir(input_dir) if f.endswith('.csv')]
        if not csv_files:
            raise ValueError(f"在 {input_dir} 中没有找到CSV文件")

        total = len(csv_files)
        print(f"开始处理{total}个文件的数据...")

        for i, csv_file in enumerate(csv_files, 1):
            input_file = os.path.join(input_dir, csv_file)
            output_file = os.path.join(output_dir, csv_file)

            print(f"\r处理进度：{i}/{total} ({i/total*100:.1f}%) - "
                  f"当前处理：{csv_file}", end='')
            try:
                # Keep the two header lines so they can be written back verbatim
                with open(input_file, 'r') as f:
                    header_lines = [next(f) for _ in range(2)]

                # Read the table that follows the header lines
                df = pd.read_csv(input_file, skiprows=2)

                # Smooth emotion sequence
                df['SmoothedEmotion'] = smooth_emotions2(df['Emotion'], window_size, min_duration)

                # Save results, including the two original header lines
                with open(output_file, 'w', newline='') as f:
                    f.writelines(header_lines)
                    f.write('\n')
                    df.to_csv(f, index=False)

            except Exception as e:
                # Best effort: report the failing file and carry on with the rest
                print(f"\n警告：处理 {csv_file} 时出错: {str(e)}")
                continue

        # Report the directory that was actually written to
        print(f"\n所有数据处理完成，平滑后的结果已保存至 {output_dir}")
        return True

    except Exception as e:
        print(f"错误：{str(e)}")
        return False

# Execute processing
# process_and_save_smoothed_emotions(input_dir="result/Emotions", output_dir="result/Emotions_smooth2", window_size=10, min_duration=10)
# process_and_save_smoothed_emotions(input_dir="result/Emotions", output_dir="result/Emotions_smooth3", window_size=5, min_duration=3)


## Extract features

In [None]:
import pandas as pd
import os
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns

# Extract features
def analyze_emotion_features(emotion_dir, feature_type, remove_outlier=True, demographic_csv=r'result/demographics.csv', output_dir="result/Emotion_features", is_save = True):
    """
    Extract one family of emotion features for every subject and merge them with demographics.

    Every CSV file in ``emotion_dir`` is one subject (file name = subject
    name). Frames whose 'Emotion' is not one of the five known emotions are
    dropped, the remaining labels are smoothed with smooth_emotions2 and the
    requested features are computed from the smoothed labels.

    Parameters:
        emotion_dir: Directory with one emotion CSV per subject (two header lines are skipped)
        feature_type: One of
            'GEV'              - share of frames per emotion
            'Frequency'        - onsets of each emotion per second
            'Duration'         - mean run length of each emotion (milliseconds)
            'Probability'      - mean/std of the 'Probability' column per emotion
            'TransWithSelf'    - row-normalised transition matrix including self transitions
            'TransWithOutSelf' - row-normalised transition matrix between different emotions
            'GroupCompareTransWithOutSelf' - group comparison of the
                'TransWithOutSelf' features (different return value, see below)
        remove_outlier: Replace per-group IQR outliers of every feature with the group median
        demographic_csv: CSV with the demographic columns, joined on '姓名' == subject name
        output_dir: Directory results are written to (created if missing)
        is_save: Whether to write the result to ``output_dir``

    Returns:
        DataFrame with the demographic columns, 'Person' and the feature
        columns. For 'GroupCompareTransWithOutSelf' instead a tuple
        (mean_difference, p_values) of two 5x5 arrays indexed
        [from_emotion, to_emotion].

    Raises:
        ValueError: If no subject file could be processed.
    """
    # Define constants
    EMOTIONS = ['neutral', 'happy', 'sad', 'surprise', 'anger']
    FRAME_DURATION = 20  # 20 milliseconds per frame

    # Create output directory
    os.makedirs(output_dir, exist_ok=True)

    # The group comparison works on the feature table of all subjects, so it
    # is handled before (and independently of) the per-file loop.
    if feature_type == 'GroupCompareTransWithOutSelf':
        return _compare_transitions_between_groups(
            emotion_dir, EMOTIONS, remove_outlier, demographic_csv, output_dir, is_save)

    # Read emotion CSV data
    csv_files = [f for f in os.listdir(emotion_dir) if f.endswith('.csv')]
    print(f"开始处理{len(csv_files)}个文件...")

    # Store results for all subjects
    results = []

    for file in csv_files:
        try:
            subject_name = os.path.splitext(file)[0]

            df = pd.read_csv(os.path.join(emotion_dir, file), skiprows=2)
            # Copy so that adding a column does not write into a slice of the raw frame
            df = df[df['Emotion'].isin(EMOTIONS)].copy()  # Filter invalid emotions
            df['SmoothedEmotion'] = smooth_emotions2(df['Emotion'])  # Smooth emotions

            subject_features = {'Person': subject_name}
            subject_features.update(
                _subject_emotion_features(df, feature_type, EMOTIONS, FRAME_DURATION))
            results.append(subject_features)

            print(f"\r处理进度：{len(results)}/{len(csv_files)}", end='')

        except Exception as e:
            # Best effort: report the failing subject and carry on with the rest
            print(f"\n处理文件 {file} 时出错: {str(e)}")
            continue

    if not results:
        # Without this check the merge below fails with an opaque KeyError: 'Person'
        raise ValueError(f"在 {emotion_dir} 中没有成功处理任何CSV文件")

    results_df = pd.DataFrame(results)

    # Read demographic.csv and merge (inner join: subjects without demographics are dropped)
    info_cols = ['姓名', '组别', 'ABC', 'S1', 'R', 'B', 'L', 'S2', '克氏', 'Age']
    demographic_df = pd.read_csv(demographic_csv)[info_cols]
    results_df = pd.merge(demographic_df, results_df, left_on='姓名', right_on='Person')
    results_df = results_df.drop('姓名', axis=1)

    feature_cols = [col for col in results_df.columns if col.startswith(feature_type)]

    if remove_outlier:
        _replace_outliers_with_group_median(results_df, feature_cols)

    print("\n处理完成！")

    if is_save:
        output_file = os.path.join(output_dir, f'{feature_type}.csv')
        results_df.to_csv(output_file, index=False, encoding='utf-8-sig')
        print(f"结果已保存至：{output_file}")

    return results_df


def _subject_emotion_features(df, feature_type, emotions, frame_duration):
    """Compute the `feature_type` features of one subject from its smoothed frames."""
    features = {}
    smoothed = df['SmoothedEmotion']

    # NOTE(review): GEV and Frequency used to store the 'happy' value under
    # '*_sad' and the 'sad' value under '*_happy', unlike every other feature
    # type. Columns now carry the label they were computed from. If the swap
    # compensated for swapped labels in the input data, remap df['Emotion']
    # once after loading instead so that all feature types agree.
    if feature_type == 'GEV':
        # Share of frames carrying each emotion
        gev_series = df.groupby('SmoothedEmotion').size() / len(df)
        for emotion in emotions:
            features[f'GEV_{emotion}'] = gev_series.get(emotion, 0)

    elif feature_type == 'Frequency':
        total_duration = len(df) * frame_duration / 1000  # seconds
        sequence = smoothed.to_numpy()
        for emotion in emotions:
            is_emotion = sequence == emotion
            # An onset is a frame showing the emotion right after a frame that
            # does not; a run starting on the very first frame is not counted.
            onsets = int(np.count_nonzero(is_emotion[1:] & ~is_emotion[:-1]))
            features[f'Frequency_{emotion}'] = onsets / total_duration

    elif feature_type == 'Duration':
        # First frame of every run of identical labels
        run_starts = smoothed != smoothed.shift()
        runs_per_emotion = df[run_starts]['SmoothedEmotion'].value_counts()
        frames_per_emotion = df.groupby('SmoothedEmotion').size()
        for emotion in emotions:
            if emotion in runs_per_emotion.index and emotion in frames_per_emotion.index:
                # Mean run length in milliseconds
                features[f'Duration_{emotion}'] = (
                    frames_per_emotion[emotion] / runs_per_emotion[emotion]) * frame_duration
            else:
                features[f'Duration_{emotion}'] = 0

    elif feature_type == 'Probability':
        by_emotion = df.groupby('SmoothedEmotion')['Probability']
        mean_probability = by_emotion.mean().round(2)
        std_probability = by_emotion.std().round(2)
        for emotion in emotions:
            # Emotions that never occur get 0
            features[f'Probability_mean_{emotion}'] = mean_probability.get(emotion, 0)
            features[f'Probability_std_{emotion}'] = std_probability.get(emotion, 0)

    elif feature_type in ('TransWithSelf', 'TransWithOutSelf'):
        if feature_type == 'TransWithSelf':
            # Every frame paired with the following frame, self transitions included
            current_emotions = smoothed
            next_emotions = smoothed.shift(-1)
        else:
            current_emotions = smoothed.iloc[:-1].reset_index(drop=True)  # Remove last one
            next_emotions = smoothed.iloc[1:].reset_index(drop=True)      # Remove first one
            # Only keep transitions between different emotions
            changed = current_emotions != next_emotions
            current_emotions = current_emotions[changed]
            next_emotions = next_emotions[changed]

        # Rows: from-emotion, columns: to-emotion; every row sums to 1
        transition_counts = pd.crosstab(current_emotions, next_emotions, normalize='index')
        for e1 in emotions:
            for e2 in emotions:
                features[f'{feature_type}_{e1}_to_{e2}'] = transition_counts.get(e2, {}).get(e1, 0)

    return features


def _compare_transitions_between_groups(emotion_dir, emotions, remove_outlier, demographic_csv, output_dir, is_save):
    """Compare the 'TransWithOutSelf' features of group 1 (ASD) against group 0 (TD)."""
    trans_data = analyze_emotion_features(emotion_dir, 'TransWithOutSelf',
                                          remove_outlier=remove_outlier, demographic_csv=demographic_csv,
                                          output_dir=output_dir, is_save=False)
    print(trans_data)
    trans_data_ASD = trans_data[trans_data['组别'] == 1]
    trans_data_TD = trans_data[trans_data['组别'] == 0]

    n_emotions = len(emotions)
    trans_matrix_data = np.zeros((n_emotions, n_emotions))
    trans_matrix_p_value = np.zeros((n_emotions, n_emotions))

    for i, from_emotion in enumerate(emotions):
        for j, to_emotion in enumerate(emotions):
            if from_emotion == to_emotion:
                continue  # The diagonal stays 0 in both matrices
            col_name = f'TransWithOutSelf_{from_emotion}_to_{to_emotion}'
            asd_data, td_data = trans_data_ASD[col_name], trans_data_TD[col_name]

            # Mean difference (ASD - TD) and the p-value of an independent t-test
            trans_matrix_data[i, j] = asd_data.mean() - td_data.mean()
            _, p_value = stats.ttest_ind(asd_data, td_data)
            trans_matrix_p_value[i, j] = p_value

    if is_save:
        # Convert to DataFrame and add row/column labels
        df_data = pd.DataFrame(trans_matrix_data, index=emotions, columns=emotions)
        df_p_value = pd.DataFrame(trans_matrix_p_value, index=emotions, columns=emotions)
        output_file = os.path.join(output_dir, 'TransBetweenGroups_analysis(From_Row_to_Col).xlsx')
        with pd.ExcelWriter(output_file) as writer:
            df_data.to_excel(writer, sheet_name='Mean_Difference')
            df_p_value.to_excel(writer, sheet_name='P_Values')
        print(f"分析结果已保存至：{output_file}")

    return trans_matrix_data, trans_matrix_p_value


def _replace_outliers_with_group_median(results_df, feature_cols):
    """Replace, in place, per-group 1.5*IQR outliers of every feature column with the group median."""
    for col in feature_cols:
        for group in [0, 1]:
            mask = results_df['组别'] == group
            group_data = results_df.loc[mask, col]
            q1 = group_data.quantile(0.25)
            q3 = group_data.quantile(0.75)
            iqr = q3 - q1
            lower_bound = q1 - 1.5 * iqr
            upper_bound = q3 + 1.5 * iqr
            # Median of the group's values before any replacement
            median = np.nanmedian(group_data)
            results_df.loc[mask & (group_data < lower_bound), col] = median
            results_df.loc[mask & (group_data > upper_bound), col] = median

# Statistical analysis
def statistics_analysis(feature_type, feature_dir="result/Emotion_features"):
    """
    Perform statistical analysis on emotion features, including:
    1. Descriptive statistics (mean±SD per group)
    2. Normality test (Shapiro-Wilk)
    3. Homogeneity of variance test (Levene)
    4. Group difference test (t-test when both groups pass the normality
       test, Welch-corrected if Levene rejects equal variances; otherwise
       Mann-Whitney U)
    5. FDR correction across all features of this feature type
    6. Significance marking (raw / FDR-corrected)

    No effect size is computed.

    Parameters:
        feature_type: Feature type; '<feature_dir>/<feature_type>.csv' is read and
            every column starting with ``feature_type`` is tested
        feature_dir: Feature data directory

    Returns:
        DataFrame with one row per feature (also printed as a table)

    Raises:
        ValueError: If the file has no column starting with ``feature_type``.
    """
    from statsmodels.stats.multitest import fdrcorrection
    from tabulate import tabulate

    # Define emotion order
    EMOTION_ORDER = ['Neutral', 'Happy', 'Sad', 'Surprise', 'Anger']

    def significance_mark(p):
        # An undefined p-value must not be reported as "not significant"
        if pd.isna(p):
            return 'nan'
        if p < 0.001:
            return '***'
        if p < 0.01:
            return '**'
        if p < 0.05:
            return '*'
        return 'ns'

    def format_p(p):
        # NaN fails every comparison and would otherwise be printed as "<0.001"
        if pd.isna(p):
            return 'nan'
        return f"{p:.3f}" if p >= 0.001 else "<0.001"

    # Read data
    feature_df = pd.read_csv(os.path.join(feature_dir, f'{feature_type}.csv'))
    feature_cols = [col for col in feature_df.columns if col.startswith(feature_type)]

    print(feature_cols)

    if not feature_cols:
        raise ValueError(f"在 {feature_type}.csv 中没有找到以 {feature_type} 开头的特征列")

    is_td = feature_df['组别'] == 0    # TD group (组别=0)
    is_asd = feature_df['组别'] == 1   # ASD group (组别=1)

    # Store statistical results
    stats_results = []

    for col in feature_cols:
        # Missing values would turn every test statistic into NaN
        td_data = feature_df.loc[is_td, col].dropna()
        asd_data = feature_df.loc[is_asd, col].dropna()

        # 1. Descriptive statistics
        desc_td = f"{td_data.mean():.3f}±{td_data.std():.3f}"
        desc_asd = f"{asd_data.mean():.3f}±{asd_data.std():.3f}"

        # 2. Normality test (Shapiro-Wilk test)
        _, p_normal_td = stats.shapiro(td_data)
        _, p_normal_asd = stats.shapiro(asd_data)
        is_normal = (p_normal_td > 0.05) and (p_normal_asd > 0.05)

        # 3. Homogeneity of variance test (Levene's test)
        _, p_levene = stats.levene(td_data, asd_data)

        # 4. Choose test method based on normality
        if is_normal:
            _, p_value = stats.ttest_ind(td_data, asd_data, equal_var=(p_levene > 0.05))
            test_method = "t检验"
        else:
            _, p_value = stats.mannwhitneyu(td_data, asd_data, alternative='two-sided')
            test_method = "Mann-Whitney U"

        stats_results.append({
            '特征': col.replace(f'{feature_type}_', '').capitalize(),  # Simplify feature name
            'TD组': desc_td,
            'ASD组': desc_asd,
            '检验方法': test_method,
            'p值': p_value,
        })

    # 5. FDR correction over the tests that produced a p-value; undefined
    #    p-values (e.g. from constant data) stay NaN and are not counted as tests
    p_values = np.array([result['p值'] for result in stats_results], dtype=float)
    p_fdr = np.full(p_values.shape, np.nan)
    finite = np.isfinite(p_values)
    if finite.any():
        _, corrected = fdrcorrection(p_values[finite])
        p_fdr[finite] = corrected

    # 6. Significance marking for raw and FDR-corrected p-values
    for result, fdr_p in zip(stats_results, p_fdr):
        result['FDR校正p值'] = fdr_p
        result['显著性'] = f"{significance_mark(result['p值'])}/{significance_mark(fdr_p)}"

    # Convert to DataFrame and format
    results_df = pd.DataFrame(stats_results)

    for col in ['p值', 'FDR校正p值']:
        results_df[col] = results_df[col].apply(format_p)

    # Sort the emotions in the fixed order. Names outside EMOTION_ORDER (e.g.
    # 'Mean_neutral' or 'Neutral_to_happy') are appended as extra categories
    # in their original order, so they keep their label instead of becoming NaN.
    categories = list(dict.fromkeys(EMOTION_ORDER + results_df['特征'].tolist()))
    results_df['特征'] = pd.Categorical(
        results_df['特征'],
        categories=categories,
        ordered=True
    )
    results_df = results_df.sort_values('特征', kind='stable')

    # Set column names
    results_df.columns = ['特征', 'TD组 (M±SD)', 'ASD组 (M±SD)', '检验方法', 'p值', 'FDR校正p值', '显著性']

    # Print results table
    print(f"\n{feature_type}特征组间差异分析结果：")
    print("="*100)

    # Set pandas display options (global; kept because later cells may rely on them)
    pd.set_option('display.max_columns', None)
    pd.set_option('display.width', 120)
    pd.set_option('display.max_colwidth', 30)
    pd.set_option('display.colheader_justify', 'center')

    # Use tabulate to print aligned table
    print(tabulate(results_df, headers='keys', tablefmt='pipe', showindex=False))

    print("="*100)

    return results_df


def plot_emotion_bars(feature_type, feature_dir="result/Emotion_features", show_points=True, save_fig=False):
    """
    Draw a grouped bar chart (TD vs ASD) of one feature type across the five emotions.

    Bars show the group mean with the standard error of the mean as error
    bar; the individual values can be overlaid as scatter points.

    Parameters:
        feature_type: Feature type (e.g. 'GEV'); reads '<feature_dir>/<feature_type>.csv'
            and its columns '<feature_type>_<emotion>'
        feature_dir: Feature data directory
        show_points: Whether to overlay the individual values as scatter points
        save_fig: Whether to save the figure under 'result/figures'
    """
    import matplotlib.pyplot as plt

    # Font able to render the Chinese labels, and a plain minus sign
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False

    # Emotion order on the x axis and the Chinese tick label of each emotion
    EMOTION_ORDER = ['Neutral', 'Happy', 'Sad', 'Surprise', 'Anger']
    EMOTION_LABELS = ['中性', '快乐', '悲伤', '惊讶', '愤怒']
    EMOTION_DICT = dict(zip(EMOTION_ORDER, EMOTION_LABELS))

    df = pd.read_csv(os.path.join(feature_dir, f'{feature_type}.csv'))

    plt.figure(figsize=(12, 6))

    bar_width = 0.35
    x = np.arange(len(EMOTION_ORDER))
    colors = ['#9AC9DB', '#2878B5']  # Light blue (TD), Dark blue (ASD)

    # Values of every emotion column, split by group (组别: 0 = TD, 1 = ASD)
    columns = [f'{feature_type}_{emotion.lower()}' for emotion in EMOTION_ORDER]
    in_td = df['组别'] == 0
    in_asd = df['组别'] == 1
    td_values = []
    asd_values = []
    for column in columns:
        td_values.append(df.loc[in_td, column])
        asd_values.append(df.loc[in_asd, column])

    def mean_and_sem(series_list):
        # Mean and standard error of the mean of every series
        means = [values.mean() for values in series_list]
        sems = [values.std() / np.sqrt(len(values)) for values in series_list]
        return means, sems

    means_td, sems_td = mean_and_sem(td_values)
    means_asd, sems_asd = mean_and_sem(asd_values)

    # TD bars sit left of each tick, ASD bars right of it
    td_offset = -bar_width / 2
    asd_offset = bar_width / 2
    plt.bar(x + td_offset, means_td, bar_width, label='TD组',
            yerr=sems_td, capsize=5, color=colors[0], alpha=0.8)
    plt.bar(x + asd_offset, means_asd, bar_width, label='ASD组',
            yerr=sems_asd, capsize=5, color=colors[1], alpha=0.8)

    # Individual values, drawn on the centre line of their bar
    if show_points:
        for position, (td_data, asd_data) in enumerate(zip(td_values, asd_values)):
            plt.scatter(np.repeat(position + td_offset, len(td_data)),
                        td_data, color=colors[0], alpha=0.3, s=30)
            plt.scatter(np.repeat(position + asd_offset, len(asd_data)),
                        asd_data, color=colors[1], alpha=0.3, s=30)

    plt.grid(True, axis='y', linestyle='--', alpha=0.7)

    # Axis labels, title, ticks and legend
    plt.xlabel('情绪类别', fontsize=12)
    plt.ylabel(f'{feature_type}值', fontsize=12)
    plt.title(f'{feature_type}特征在TD组和ASD组的对比', fontsize=14, pad=20)
    plt.xticks(x, [EMOTION_DICT[emotion] for emotion in EMOTION_ORDER])
    plt.legend(loc='upper right')

    plt.tight_layout()

    if save_fig:
        # Make sure the target directory exists before writing the image
        os.makedirs('result/figures', exist_ok=True)
        plt.savefig(f'result/figures/{feature_type}_comparison.png', dpi=300, bbox_inches='tight')

    plt.show()
    

def plot_transition_matrix(feature_type, group, include_self_transition=False, is_standardized=False, feature_dir="result/Emotion_features"):
    """
    Draw the mean emotion transition matrix of one group as a heatmap.

    Parameters:
        feature_type: Feature type; reads '<feature_dir>/<feature_type>.csv' and its
            columns '<feature_type>_<from>_to_<to>'
        group: Group (0 or 1), matched against the '组别' column
        include_self_transition: Whether the diagonal (same emotion to same
            emotion) is filled; otherwise it stays 0
        is_standardized: Whether to alternately normalise rows and columns
            (Sinkhorn-Knopp iteration) before plotting
        feature_dir: Feature data directory
    """
    emotions = ['neutral', 'happy', 'sad', 'surprise', 'anger']
    n_emotions = len(emotions)

    # Rows of the requested group
    feature_table = pd.read_csv(os.path.join(feature_dir, f'{feature_type}.csv'))
    group_rows = feature_table[feature_table['组别'] == group]

    # Group mean of every transition; rows = from-emotion, columns = to-emotion
    trans_matrix = np.zeros((n_emotions, n_emotions))
    skip_diagonal = not include_self_transition
    for row, source in enumerate(emotions):
        for col, target in enumerate(emotions):
            if skip_diagonal and row == col:
                continue
            trans_matrix[row, col] = group_rows[f'{feature_type}_{source}_to_{target}'].mean()

    # Sinkhorn-Knopp iteration: alternate row and column normalisation until
    # all row and column sums are within tolerance of 1 (or the cap is hit)
    if is_standardized:
        tolerance = 1e-10
        max_iter = 1000

        for _ in range(max_iter):
            # Sums of 0 are replaced by 1 to avoid division by zero
            row_totals = trans_matrix.sum(axis=1, keepdims=True)
            trans_matrix = trans_matrix / np.where(row_totals == 0, 1, row_totals)

            col_totals = trans_matrix.sum(axis=0, keepdims=True)
            trans_matrix = trans_matrix / np.where(col_totals == 0, 1, col_totals)

            rows_converged = np.all(np.abs(trans_matrix.sum(axis=1) - 1) < tolerance)
            cols_converged = np.all(np.abs(trans_matrix.sum(axis=0) - 1) < tolerance)
            if rows_converged and cols_converged:
                break

    # Annotated heatmap on a fixed 0..1 colour scale
    plt.figure(figsize=(10, 8))
    sns.heatmap(trans_matrix,
                annot=True,
                fmt='.3f',
                cmap='YlOrRd',
                xticklabels=emotions,
                yticklabels=emotions,
                vmin=0,
                vmax=1)

    plt.title(f'组{group}的{feature_type}特征转移概率矩阵')
    plt.xlabel('转移到')
    plt.ylabel('转移自')

    # Font able to render the Chinese labels, and a plain minus sign
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False

    plt.tight_layout()
    plt.show()


In [None]:
# # 1. GEV features
# analyze_emotion_features(emotion_dir="result/Emotions", feature_type="GEV", remove_outlier=False)
# statistics_analysis(feature_type="GEV")
# plot_emotion_bars(feature_type="GEV", show_points=False)

# # 2. Frequency features
# analyze_emotion_features(emotion_dir="result/Emotions_smooth2", feature_type="Frequency", remove_outlier=False)
# statistics_analysis(feature_type="Frequency")
# plot_emotion_bars(feature_type="Frequency", show_points=False)

# # # 3. Duration features
# analyze_emotion_features(emotion_dir="result/Emotions_smooth2", feature_type="Duration", remove_outlier=False)
# statistics_analysis(feature_type="Duration")
# plot_emotion_bars(feature_type="Duration", show_points=False)

# 4. Recognition probability
# analyze_emotion_features(emotion_dir="result/Emotions", feature_type="Probability", remove_outlier=False)
# statistics_analysis(feature_type="Probability")
# plot_emotion_bars(feature_type="Probability", show_points=False)

# # 5. State transition matrix for chord diagram
# analyze_emotion_features(emotion_dir="result/Emotions_smooth2", feature_type="TransWithSelf", remove_outlier=False)
# plot_transition_matrix(feature_type="TransWithSelf", group=0, is_standardized=True)

# # 6. State transition matrix excluding same emotions for statistical analysis of group differences
# analyze_emotion_features(emotion_dir="result/Emotions_smooth2", feature_type="TransWithOutSelf", remove_outlier=False)
# plot_transition_matrix(feature_type="TransWithOutSelf", group=0)

# # 6.1. Statistical comparison of state transition differences
# analyze_emotion_features(emotion_dir="result/Emotions_smooth2", feature_type="GroupCompareTransWithOutSelf", remove_outlier=False)


## Feature Summary

In [None]:
# Summary
def merge_emotion_features(feature_dir='result/Emotion_features', output_file='result/machine_learning/merged_emotion_features.csv'):
    """
    Merge the per-feature-type CSV files into one table with one row per subject.

    The GEV, Duration, Frequency and TransWithOutSelf feature files are joined
    on the 'Person' column (inner join, each person at most once per file),
    starting from the basic information columns of the Duration file. The
    Probability features are not part of the merged table.

    Parameters:
        feature_dir: Directory containing GEV.csv, Duration.csv, Frequency.csv
            and TransWithOutSelf.csv
        output_file: Path of the merged CSV file (its directory is created if missing)

    Returns:
        The merged DataFrame; columns start with '姓名', 'group', 'ABC', '克氏'
        followed by the feature columns
    """
    base_cols = ['组别', 'ABC', 'S1', 'R', 'B', 'L', 'S2', '克氏', 'Age', 'Person']
    # Everything except the join key is dropped from each feature file
    info_cols = [col for col in base_cols if col != 'Person']

    def load(name):
        return pd.read_csv(os.path.join(feature_dir, f'{name}.csv'))

    # Use basic information columns from the Duration file as base
    duration_df = load('Duration')
    result_df = duration_df[base_cols]

    # Every feature file is joined on Person. Duration used to be joined by
    # row position, which attaches values to the wrong subject as soon as an
    # earlier merge drops a row.
    for name in ('GEV', 'Duration', 'Frequency', 'TransWithOutSelf'):
        feature_df = duration_df if name == 'Duration' else load(name)
        result_df = result_df.merge(
            feature_df.drop(columns=info_cols),
            on='Person',
            validate='1:1'
        )

    # Rename columns
    result_df = result_df.rename(columns={
        'Person': '姓名',
        '组别': 'group'
    })

    # Remove unnecessary columns
    cols_to_drop = ['S1', 'R', 'B', 'L', 'S2', 'Age']
    result_df = result_df.drop(columns=cols_to_drop)

    # Reorder basic information columns in specified order
    ordered_cols = ['姓名', 'group', 'ABC', '克氏']
    other_cols = [col for col in result_df.columns if col not in ordered_cols]
    result_df = result_df[ordered_cols + other_cols]

    # Save summary results
    output_parent = os.path.dirname(output_file)
    if output_parent:
        os.makedirs(output_parent, exist_ok=True)
    result_df.to_csv(output_file, index=False, encoding='utf-8-sig')
    print(f"特征汇总完成，已保存至：{output_file}")

    # Check for data loss (the inner joins drop subjects missing from any file)
    print(f"原始数据行数: {len(duration_df)}")
    print(f"合并后数据行数: {len(result_df)}")

    return result_df

# Execute summary: build the merged feature table (which also writes it to disk)
# and print its (rows, columns) shape
merged_df = merge_emotion_features()
print(merged_df.shape)

特征汇总完成，已保存至：result/machine_learning/merged_emotion_features_0.csv
原始数据行数: 184
合并后数据行数: 184
(184, 44)
