## 进行数据过滤

数据过滤：

- 只保留 fold == 'all-folds'和test 的记录，忽略 Fold-1 到 Fold-5，以聚焦跨折叠的平均性能。

- 保留所有方法，包括基于物理的（Peak、fft）和监督学习的（ResNet、Transformer、Mamba2、inception_time 等）。

- 按 task（hr、resp Gonzalo_rr、spo2、BP_sys、BP_dia）和 ring_type（ring1、ring2）分组，提取 mae、rmse、mape、pearson 等指标。

In [1]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
import os

# 1. 加载数据
file_path = "/root/RingTool/output/all_results.csv"
raw_df = pd.read_csv(file_path)

# 2. 添加 scene 列（如果未在 resultsCollection.ipynb 中添加）
if 'scene' not in raw_df.columns:
    raw_df['scene'] = raw_df['exp_name'].str.extract(r'-(stationary|motion|all)-')
    raw_df['scene'] = raw_df['scene'].fillna('unknown')

# 3. 过滤
mask_mode = raw_df['mode'].isin(['test', '5fold'])
mask_fold = (raw_df['fold'] == 'all-folds') | (raw_df['fold'].isna())
df = raw_df[mask_mode & mask_fold].copy()

# 4. 检查过滤结果
print(f"原始数据行数: {len(raw_df)}")
print(f"过滤后数据行数: {len(df)}")
print("\nMode 分布：")
print(df['mode'].value_counts())
print("\nFold 分布：")
print(df['fold'].value_counts(dropna=False))
print("\nScene 分布：")
print(df['scene'].value_counts())
print("\n方法分布：")
print(df['method_name'].value_counts())

原始数据行数: 990
过滤后数据行数: 189

Mode 分布：
test     137
5fold     52
Name: mode, dtype: int64

Fold 分布：
all-folds    115
NaN           74
Name: fold, dtype: int64

Scene 分布：
all           55
motion        53
stationary    49
unknown       32
Name: scene, dtype: int64

方法分布：
peak              36
fft               36
inception_time    30
resnet            30
transformer       30
mamba2            25
ratio              2
Name: method_name, dtype: int64


In [2]:
# 5. 去重
df_deduped = df.drop_duplicates(
    subset=['exp_name', 'task', 'ring_type', 'method_name', 'scene', 'mae_with_std', 'rmse_with_std', 
            'mape_with_std', 'pearson_with_std', 'mode'],
    keep='first'
)

# 6. 保存去重数据
output_dir = 'output'
os.makedirs(output_dir, exist_ok=True)
df_deduped.to_csv(os.path.join(output_dir, 'deduped_results.csv'), index=False, encoding='utf-8-sig')
print(f"\n去重后记录数：{len(df_deduped)}")
print("\n去重后方法分布：")
print(df_deduped['method_name'].value_counts())
print("\n去重后 test 模式前几行：")
print(df_deduped[df_deduped['mode'] == 'test'][['exp_name', 'task', 'ring_type', 'scene', 'method_name', 'mae_with_std']].head(10))
print("\n去重后 5fold 模式前几行：")
print(df_deduped[df_deduped['mode'] == '5fold'][['exp_name', 'task', 'ring_type', 'scene', 'method_name', 'mae_with_std']].head(10))


去重后记录数：189

去重后方法分布：
peak              36
fft               36
inception_time    30
resnet            30
transformer       30
mamba2            25
ratio              2
Name: method_name, dtype: int64

去重后 test 模式前几行：
                                         exp_name            task ring_type  \
11  inception-time-ring1-samsung_hr-stationary-ir      samsung_hr     ring1   
23                                    peak-motion              hr     ring2   
24                                    peak-motion              hr     ring1   
25                                    peak-motion      samsung_hr     ring2   
26                                    peak-motion      samsung_hr     ring1   
27                                    peak-motion         oura_hr     ring2   
28                                    peak-motion         oura_hr     ring1   
29                                    peak-motion  samsung_hr_com     ring2   
30                                    peak-motion  samsung_hr_com     r

In [3]:
# 7. 提取均值和标准差
def extract_mean_std(col):
    try:
        matches = col.str.extract(r'(\d+\.\d+|\d+)±(\d+\.\d+|\d+)')
        if matches.empty:
            return pd.DataFrame({'mean': np.nan, 'std': np.nan})
        return matches.astype(float).rename(columns={0: 'mean', 1: 'std'})
    except:
        return pd.DataFrame({'mean': np.nan, 'std': np.nan})

df_deduped[['mae', 'mae_std']] = extract_mean_std(df_deduped['mae_with_std'])
df_deduped[['rmse', 'rmse_std']] = extract_mean_std(df_deduped['rmse_with_std'])
df_deduped[['mape', 'mape_std']] = extract_mean_std(df_deduped['mape_with_std'])
df_deduped[['pearson', 'pearson_std']] = extract_mean_std(df_deduped['pearson_with_std'])

# 8. 分组（添加 scene）
grouped = df_deduped.groupby(['task', 'ring_type', 'method_name', 'mode', 'scene'])[['mae', 'rmse', 'mape', 'pearson']].mean().reset_index()
grouped.to_csv(os.path.join(output_dir, 'results_grouped.csv'), index=False, encoding='utf-8-sig')
print(f"\n分组后记录数：{len(grouped)}")
print("\n分组后方法分布：")
print(grouped['method_name'].value_counts())

# 9. 去重分组数据
grouped_deduped = grouped.drop_duplicates(
    subset=['task', 'ring_type', 'method_name', 'scene', 'mae', 'rmse', 'mape', 'pearson'],
    keep='first'
)
grouped_deduped.to_csv(os.path.join(output_dir, 'results_grouped_deduped.csv'), index=False, encoding='utf-8-sig')
print(f"\n分组去重后记录数：{len(grouped_deduped)}")
print("\n分组去重后方法分布：")
print(grouped_deduped['method_name'].value_counts())


分组后记录数：176

分组后方法分布：
fft               36
peak              36
inception_time    30
transformer       30
resnet            30
mamba2            12
ratio              2
Name: method_name, dtype: int64

分组去重后记录数：176

分组去重后方法分布：
fft               36
peak              36
inception_time    30
transformer       30
resnet            30
mamba2            12
ratio              2
Name: method_name, dtype: int64


In [4]:
# 7. 提取均值和标准差
def extract_mean_std(col):
    try:
        matches = col.str.extract(r'(\d+\.\d+|\d+)±(\d+\.\d+|\d+)')
        if matches.empty:
            return pd.DataFrame({'mean': np.nan, 'std': np.nan})
        return matches.astype(float).rename(columns={0: 'mean', 1: 'std'})
    except:
        return pd.DataFrame({'mean': np.nan, 'std': np.nan})

df_deduped[['mae', 'mae_std']] = extract_mean_std(df_deduped['mae_with_std'])
df_deduped[['rmse', 'rmse_std']] = extract_mean_std(df_deduped['rmse_with_std'])
df_deduped[['mape', 'mape_std']] = extract_mean_std(df_deduped['mape_with_std'])
df_deduped[['pearson', 'pearson_std']] = extract_mean_std(df_deduped['pearson_with_std'])

# 8. 分组（添加 scene）
grouped = df_deduped.groupby(['task', 'ring_type', 'method_name', 'mode', 'scene'])[['mae', 'rmse', 'mape', 'pearson']].mean().reset_index()
grouped.to_csv(os.path.join(output_dir, 'results_grouped.csv'), index=False, encoding='utf-8-sig')
print(f"\n分组后记录数：{len(grouped)}")
print("\n分组后方法分布：")
print(grouped['method_name'].value_counts())

# 9. 去重分组数据
grouped_deduped = grouped.drop_duplicates(
    subset=['task', 'ring_type', 'method_name', 'scene', 'mae', 'rmse', 'mape', 'pearson'],
    keep='first'
)
grouped_deduped.to_csv(os.path.join(output_dir, 'results_grouped_deduped.csv'), index=False, encoding='utf-8-sig')
print(f"\n分组去重后记录数：{len(grouped_deduped)}")
print("\n分组去重后方法分布：")
print(grouped_deduped['method_name'].value_counts())


分组后记录数：176

分组后方法分布：
fft               36
peak              36
inception_time    30
transformer       30
resnet            30
mamba2            12
ratio              2
Name: method_name, dtype: int64

分组去重后记录数：176

分组去重后方法分布：
fft               36
peak              36
inception_time    30
transformer       30
resnet            30
mamba2            12
ratio              2
Name: method_name, dtype: int64


In [5]:

# 11. MAE 柱状图
tasks = df_deduped['task'].unique()
for task in tasks:
    for mode in ['test', '5fold']:
        task_mode_data = df_deduped[(df_deduped['task'] == task) & (df_deduped['mode'] == mode)]
        if task_mode_data.empty:
            continue
        plt.figure(figsize=(15, 8))
        ax = sns.barplot(
            data=task_mode_data,
            x='method_name',
            y='mae',
            hue='ring_type',
            dodge=True,
            errorbar=None,
            palette='Set2'
        )
        for p in ax.patches:
            height = p.get_height()
            if not pd.isna(height) and height > 0:
                ax.text(
                    p.get_x() + p.get_width() / 2.,
                    height + 0.005 * max(task_mode_data['mae'].fillna(0)) + 0.1,
                    f'{height:.2f}',
                    ha='center', va='bottom', fontsize=10, color='black'
                )
        plt.title(f'{task.upper()} - MAE Comparison ({mode} Mode)')
        plt.xlabel('Method')
        plt.ylabel('MAE')
        plt.xticks(rotation=45)
        plt.legend(title='Ring Type')
        plt.tight_layout()
        plt.savefig(os.path.join(output_dir, f'mae_barplot_{task}_{mode}.png'), dpi=300)
        plt.close()
print(f"\n已生成 MAE 柱状图，保存到 {output_dir}/mae_barplot_<task>_<mode>.png")

scenes = df_deduped['scene'].unique()
print(f"\n场景列表：{scenes}")

# 12. MAE 热力图（按 scene 分组）
for mode in ['test', '5fold']:
    for scene in scenes:
        mode_scene_data = grouped_deduped[(grouped_deduped['mode'] == mode) & 
                                         (grouped_deduped['scene'] == scene)]
        if mode_scene_data.empty:
            continue
        pivot_mae = mode_scene_data.pivot_table(
            index='task',
            columns=['ring_type', 'method_name'],
            values='mae',
            aggfunc='mean'
        )
        plt.figure(figsize=(12, 6))
        sns.heatmap(pivot_mae, annot=True, fmt='.2f', cmap='YlOrRd')
        plt.title(f'MAE Heatmap ({mode} Mode, {scene} Scene)')
        plt.xlabel('Method (by Ring Type)')
        plt.ylabel('Task')
        plt.tight_layout()
        plt.savefig(os.path.join(output_dir, f'mae_heatmap_{mode}_{scene}.png'), dpi=300)
        plt.close()
print(f"\n已生成 MAE 热力图，保存到 {output_dir}/mae_heatmap_<mode>_<scene>.png")


已生成 MAE 柱状图，保存到 output/mae_barplot_<task>_<mode>.png

场景列表：['motion' 'stationary' 'all' 'unknown']

已生成 MAE 热力图，保存到 output/mae_heatmap_<mode>_<scene>.png


In [6]:
import pandas as pd
import numpy as np
import os

# 1. 加载数据
file_path = "/root/RingTool/output/all_results.csv"
raw_df = pd.read_csv(file_path)

# 2. 添加 scene 列（如果未在 resultsCollection.ipynb 中添加）
if 'scene' not in raw_df.columns:
    raw_df['scene'] = raw_df['exp_name'].str.extract(r'-(stationary|motion|all)-')
    raw_df['scene'] = raw_df['scene'].fillna('unknown')

# 3. 过滤
mask_mode = raw_df['mode'].isin(['test', '5fold'])
mask_fold = (raw_df['fold'] == 'all-folds') | (raw_df['fold'].isna())
df = raw_df[mask_mode & mask_fold].copy()

# 4. 计算缺失的 all-folds 行
def compute_all_folds(df):
    all_folds_rows = []
    grouped = df.groupby('exp_name')
    for exp_name, group in grouped:
        if 'all-folds' not in group['fold'].values:
            folds = group[group['fold'].str.contains('Fold-', na=False)]
            if len(folds) > 0:  # 至少有 1 折
                def compute_mean_std(col):
                    try:
                        # 提取均值和标准差
                        means = [float(str(x).split('±')[0]) for x in folds[col] if '±' in str(x)]
                        stds = [float(str(x).split('±')[1]) for x in folds[col] if '±' in str(x)]
                        if means:
                            mean = sum(means) / len(means)
                            std = sum(stds) / len(stds)
                            return f"{mean:.2f}±{std:.2f}"
                        return None
                    except:
                        return None

                # 创建 all-folds 行
                all_folds_row = group.iloc[0].copy()
                all_folds_row['fold'] = 'all-folds'
                all_folds_row['mode'] = '5fold'
                all_folds_row['mae_with_std'] = compute_mean_std('mae_with_std')
                all_folds_row['rmse_with_std'] = compute_mean_std('rmse_with_std')
                all_folds_row['mape_with_std'] = compute_mean_std('mape_with_std')
                all_folds_row['pearson_with_std'] = compute_mean_std('pearson_with_std')
                all_folds_row['sample_len'] = folds['sample_len'].sum()
                all_folds_rows.append(all_folds_row)
    
    # 将 all-folds 行添加回原 DataFrame
    if all_folds_rows:
        return pd.concat([df, pd.DataFrame(all_folds_rows)], ignore_index=True)
    return df

# 添加 missing all-folds 行
df = compute_all_folds(df)

# 5. 去重
df_deduped = df.drop_duplicates(
    subset=['exp_name', 'task', 'ring_type', 'method_name', 'scene', 'mae_with_std', 'rmse_with_std', 
            'mape_with_std', 'pearson_with_std', 'mode'],
    keep='first'
)

# 6. 保存去重数据
output_dir = 'output'
os.makedirs(output_dir, exist_ok=True)
df_deduped.to_csv(os.path.join(output_dir, 'deduped_results.csv'), index=False, encoding='utf-8-sig')
print(f"\n去重后记录数：{len(df_deduped)}")

# 7. 提取均值和标准差
def extract_mean_std(col):
    try:
        matches = col.str.extract(r'(\d+\.\d+|\d+)±(\d+\.\d+|\d+)')
        if matches.empty:
            return pd.DataFrame({'mean': np.nan, 'std': np.nan})
        return matches.astype(float).rename(columns={0: 'mean', 1: 'std'})
    except:
        return pd.DataFrame({'mean': np.nan, 'std': np.nan})

df_deduped[['mae', 'mae_std']] = extract_mean_std(df_deduped['mae_with_std'])
df_deduped[['rmse', 'rmse_std']] = extract_mean_std(df_deduped['rmse_with_std'])
df_deduped[['mape', 'mape_std']] = extract_mean_std(df_deduped['mape_with_std'])
df_deduped[['pearson', 'pearson_std']] = extract_mean_std(df_deduped['pearson_with_std'])

# 8. 分组（按 task、ring_type、method_name、mode 和 scene）
grouped = df_deduped.groupby(['task', 'ring_type', 'method_name', 'mode', 'scene'])[['mae', 'rmse', 'mape', 'pearson']].mean().reset_index()

# 9. 去重分组数据
grouped_deduped = grouped.drop_duplicates(
    subset=['task', 'ring_type', 'method_name', 'scene', 'mae', 'rmse', 'mape', 'pearson'],
    keep='first'
)
grouped_deduped.to_csv(os.path.join(output_dir, 'results_grouped_deduped.csv'), index=False, encoding='utf-8-sig')
print(f"\n分组去重后记录数：{len(grouped_deduped)}")

# 10. 保存分组结果
grouped.to_csv(os.path.join(output_dir, 'results_grouped.csv'), index=False, encoding='utf-8-sig')

# 11. MAE 绘图
import matplotlib.pyplot as plt
import seaborn as sns

# MAE 柱状图
tasks = df_deduped['task'].unique()
for task in tasks:
    for mode in ['test', '5fold']:
        task_mode_data = df_deduped[(df_deduped['task'] == task) & (df_deduped['mode'] == mode)]
        if task_mode_data.empty:
            continue
        plt.figure(figsize=(15, 8))
        ax = sns.barplot(
            data=task_mode_data,
            x='method_name',
            y='mae',
            hue='ring_type',
            dodge=True,
            errorbar=None,
            palette='Set2'
        )
        for p in ax.patches:
            height = p.get_height()
            if not pd.isna(height) and height > 0:
                ax.text(
                    p.get_x() + p.get_width() / 2. ,
                    height + 0.005 * max(task_mode_data['mae'].fillna(0)) + 0.1,
                    f'{height:.2f}',
                    ha='center', va='bottom', fontsize=10, color='black'
                )
        plt.title(f'{task.upper()} - MAE Comparison ({mode} Mode)')
        plt.xlabel('Method')
        plt.ylabel('MAE')
        plt.xticks(rotation=45)
        plt.legend(title='Ring Type')
        plt.tight_layout()
        plt.savefig(os.path.join(output_dir, f'mae_barplot_{task}_{mode}.png'), dpi=300)
        plt.close()



去重后记录数：189

分组去重后记录数：176


In [7]:
# 12. MAE 热力图（按 scene 分组）
for mode in ['test', '5fold']:
    for scene in scenes:
        # 筛选出特定 mode 和 scene 的数据
        mode_scene_data = grouped_deduped[(grouped_deduped['mode'] == mode) & 
                                         (grouped_deduped['scene'] == scene)]
        if mode_scene_data.empty:
            continue
        # 创建热力图的透视表
        pivot_mae = mode_scene_data.pivot_table(
            index='task',  # 行：任务
            columns=['ring_type', 'method_name'],  # 列：按 ring_type 和 method_name 分类
            values='mae',  # 值：MAE
            aggfunc='mean'  # 聚合函数：计算均值
        )

        # 绘制热力图
        plt.figure(figsize=(12, 6))
        sns.heatmap(pivot_mae, annot=True, fmt='.2f', cmap='YlOrRd')  # 使用黄色到红色的色系
        plt.title(f'MAE Heatmap ({mode} Mode, {scene} Scene)')
        plt.xlabel('Method (by Ring Type)')
        plt.ylabel('Task')
        plt.tight_layout()
        
        # 保存热力图
        plt.savefig(os.path.join(output_dir, f'mae_heatmap_{mode}_{scene}.png'), dpi=300)
        plt.close()
