In [1]:
import pickle
import numpy as np
import neurokit2 as nk
import os
import pandas as pd

In [2]:
def categorize_emotion(row):
    if row['Arousal'] > 5 and row['Valence'] > 5:
        return 'HighArousal_HighValence'
    elif row['Arousal'] > 5 and row['Valence'] <= 5:
        return 'HighArousal_LowValence'
    elif row['Arousal'] <= 5 and row['Valence'] > 5:
        return 'LowArousal_HighValence'
    else:
        return 'LowArousal_LowValence'

In [6]:
import os
import pickle
import pandas as pd
import numpy as np
import neurokit2 as nk

data_dir = "data_preprocessed_python/"
all_results = []

sampling_rate = 128  # DEAP 数据的采样率

# 定义情感分类函数
def categorize_emotion(row):
    if row['Arousal'] > 5 and row['Valence'] > 5:
        return 'HighArousal_HighValence'
    elif row['Arousal'] > 5 and row['Valence'] <= 5:
        return 'HighArousal_LowValence'
    elif row['Arousal'] <= 5 and row['Valence'] > 5:
        return 'LowArousal_HighValence'
    else:
        return 'LowArousal_LowValence'

# 初始化一个日志列表，用于记录无法处理的文件和试验
error_log = []

# 提取每次试验的PRV（HRV代理）特征
for file_name in os.listdir(data_dir):
    if file_name.endswith(".dat"):  
        file_path = os.path.join(data_dir, file_name)
        print(f"Processing file: {file_path}")
        
        # 加载每个被试的数据
        try:
            with open(file_path, 'rb') as file:
                data = pickle.load(file, encoding='latin1')
        except Exception as e:
            print(f"Failed to load file {file_name}: {e}")
            error_log.append({"File": file_name, "Trial": "ALL", "Error": str(e)})
            continue
        
        # 提取信号和标签
        signals = data['data']   # shape: (40 trials, 40 channels, 8064 samples)
        labels = data['labels']
        
        # 提取Plethysmograph信号 (索引39为PPG通道)
        ppg_signals = signals[:, 38]
        
        # 提取每次试验的PRV特征
        hrv_features = []
        for trial_idx, trial_ppg in enumerate(ppg_signals):
            try:
                # 检查信号是否有效
                if trial_ppg is None or len(trial_ppg) == 0 or np.all(trial_ppg == 0):
                    print(f"Invalid PPG signal in file {file_name}, trial {trial_idx}")
                    error_log.append({"File": file_name, "Trial": trial_idx, "Error": "Invalid PPG signal"})
                    continue

                # 使用ppg_process处理PPG信号
                signals_ppg, info_ppg = nk.ppg_process(trial_ppg, sampling_rate=sampling_rate)

                # 检查PPG峰是否有效
                if 'PPG_Peaks' not in info_ppg or len(info_ppg['PPG_Peaks']) < 2:
                    print(f"No valid PPG peaks detected in file {file_name}, trial {trial_idx}")
                    error_log.append({"File": file_name, "Trial": trial_idx, "Error": "No valid PPG peaks detected"})
                    continue

                # 由于hrv_time函数是针对ECG R-peaks设计，这里我们假装PPG Peaks为R-peaks输入
                rpeaks = {"ECG_R_Peaks": info_ppg["PPG_Peaks"]}
                
                # 计算HRV（实际上是PRV）指标
                hrv = nk.hrv_time(rpeaks, sampling_rate=sampling_rate)
                hrv_features.append(hrv)

            except Exception as e:
                print(f"Error processing trial {trial_idx} in file {file_name}: {e}")
                error_log.append({"File": file_name, "Trial": trial_idx, "Error": str(e)})
                continue

        # 转为DataFrame格式
        if hrv_features:
            hrv_df = pd.concat(hrv_features, ignore_index=True)
        else:
            print(f"No valid HRV features extracted for file {file_name}")
            error_log.append({"File": file_name, "Trial": "ALL", "Error": "No valid HRV features"})
            continue
                           
        # 将情感标签转为DataFrame
        labels_df = pd.DataFrame(labels, columns=['Arousal', 'Valence', 'Dominance', 'Liking'])
        
        # 合并HRV特征和情感标签
        result_df = pd.concat([hrv_df, labels_df[["Arousal", "Valence"]]], axis=1)
        
        # 添加情感分类列
        result_df['Emotion_Category'] = result_df.apply(categorize_emotion, axis=1)
        
        # 添加被试编号列（如 s01, s02...）
        result_df['Subject'] = file_name.split('.')[0]  # 提取文件名作为被试编号
        
        # 将每个被试的结果添加到总列表中
        all_results.append(result_df)

# 合并所有被试的DataFrame
if all_results:
    final_result_df = pd.concat(all_results, ignore_index=True)
else:
    final_result_df = pd.DataFrame()

# 删除包含 NaN 的列和行
final_result_df = final_result_df.dropna(axis=1, how='all')  # 删除所有值为 NaN 的列
final_result_df = final_result_df.dropna(axis=0, how='any')  # 删除包含 NaN 的行

# 输出无法处理的文件日志
error_log_df = pd.DataFrame(error_log)
print("Error Log:")
print(error_log_df)

# 查看最终结果
print(final_result_df.head())

# 如果需要保存结果
final_result_df.to_csv("deap_prv_features.csv", index=False)
error_log_df.to_csv("deap_prv_error_log.csv", index=False)


Processing file: data_preprocessed_python/s01.dat
Processing file: data_preprocessed_python/s02.dat
Processing file: data_preprocessed_python/s03.dat
Processing file: data_preprocessed_python/s04.dat
Processing file: data_preprocessed_python/s05.dat
Processing file: data_preprocessed_python/s06.dat
Processing file: data_preprocessed_python/s07.dat
Processing file: data_preprocessed_python/s08.dat
Processing file: data_preprocessed_python/s09.dat
Processing file: data_preprocessed_python/s10.dat
Processing file: data_preprocessed_python/s11.dat
Processing file: data_preprocessed_python/s12.dat
Processing file: data_preprocessed_python/s13.dat
Processing file: data_preprocessed_python/s14.dat
Processing file: data_preprocessed_python/s15.dat
Processing file: data_preprocessed_python/s16.dat
Processing file: data_preprocessed_python/s17.dat
Processing file: data_preprocessed_python/s18.dat
Processing file: data_preprocessed_python/s19.dat
Processing file: data_preprocessed_python/s20.dat


In [15]:
final_result_df

Unnamed: 0,HRV_MeanNN,HRV_SDNN,HRV_RMSSD,HRV_SDSD,HRV_CVNN,HRV_CVSD,HRV_MedianNN,HRV_MadNN,HRV_MCVNN,HRV_IQRNN,...,HRV_pNN50,HRV_pNN20,HRV_MinNN,HRV_MaxNN,HRV_HTI,HRV_TINN,Arousal,Valence,Emotion_Category,Subject
0,923.973881,60.447814,78.978529,79.557900,0.065422,0.085477,921.87500,46.331250,0.050258,66.406250,...,37.313433,70.149254,757.8125,1117.1875,11.166667,171.8750,7.71,7.60,HighArousal_HighValence,s01
1,1015.368852,50.650216,53.559802,54.009237,0.049884,0.052749,1015.62500,46.331250,0.045618,62.500000,...,29.508197,73.770492,898.4375,1132.8125,10.166667,125.0000,8.10,7.31,HighArousal_HighValence,s01
2,910.615809,72.630760,48.489380,48.851811,0.079760,0.053249,898.43750,81.079687,0.090245,109.375000,...,35.294118,69.117647,781.2500,1078.1250,13.600000,78.1250,8.58,7.54,HighArousal_HighValence,s01
3,1013.191598,53.589061,76.280305,76.919997,0.052891,0.075287,1007.81250,57.914062,0.057465,78.125000,...,62.295082,83.606557,898.4375,1117.1875,12.200000,125.0000,4.94,6.01,LowArousal_HighValence,s01
4,958.618164,71.503630,95.333233,96.089137,0.074590,0.099449,949.21875,69.496875,0.073215,87.890625,...,48.437500,76.562500,742.1875,1140.6250,10.666667,156.2500,6.96,3.92,HighArousal_LowValence,s01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1275,723.473837,278.660153,415.687709,418.042013,0.385170,0.574572,664.06250,289.570312,0.436059,435.546875,...,89.534884,94.186047,304.6875,1343.7500,28.666667,62.5000,3.91,6.96,LowArousal_HighValence,s32
1276,719.840116,307.921801,423.836331,426.328604,0.427764,0.588792,671.87500,289.570312,0.430988,408.203125,...,89.534884,95.348837,304.6875,1843.7500,21.500000,562.5000,2.81,6.13,LowArousal_HighValence,s32
1277,686.885534,256.077432,347.226057,349.186217,0.372809,0.505508,640.62500,266.404687,0.415851,335.937500,...,88.764045,94.382022,304.6875,1460.9375,22.250000,257.8125,3.05,7.01,LowArousal_HighValence,s32
1278,714.260057,306.562952,397.236920,399.548344,0.429204,0.556152,609.37500,254.821875,0.418169,406.250000,...,90.804598,95.402299,312.5000,1601.5625,21.750000,125.0000,3.99,7.17,LowArousal_HighValence,s32


In [19]:
import pandas as pd

df = pd.read_csv("deap_prv_features.csv")

print("原始数据大小:", df.shape)


numeric_cols = df.select_dtypes(include=[float, int]).columns


clean_df = df.copy()

for col in numeric_cols:
    Q1 = clean_df[col].quantile(0.25)
    Q3 = clean_df[col].quantile(0.75)
    IQR = Q3 - Q1

    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    clean_df = clean_df[(clean_df[col] >= lower_bound) & (clean_df[col] <= upper_bound)]


print("原始数据大小:", df.shape)
print("清理后数据大小:", clean_df.shape)
clean_df.to_csv("deap_prv_features.csv", index=False)


原始数据大小: (1280, 23)
原始数据大小: (1280, 23)
清理后数据大小: (1129, 23)


In [20]:
df = pd.read_csv("deap_prv_features.csv")

print("原始数据大小:", df.shape)

原始数据大小: (1129, 23)


In [21]:
df = pd.read_csv("deap_prv_features.csv")

df.describe()

Unnamed: 0,HRV_MeanNN,HRV_SDNN,HRV_RMSSD,HRV_SDSD,HRV_CVNN,HRV_CVSD,HRV_MedianNN,HRV_MadNN,HRV_MCVNN,HRV_IQRNN,...,HRV_Prc20NN,HRV_Prc80NN,HRV_pNN50,HRV_pNN20,HRV_MinNN,HRV_MaxNN,HRV_HTI,HRV_TINN,Arousal,Valence
count,1129.0,1129.0,1129.0,1129.0,1129.0,1129.0,1129.0,1129.0,1129.0,1129.0,...,1129.0,1129.0,1129.0,1129.0,1129.0,1129.0,1129.0,1129.0,1129.0,1129.0
mean,819.388068,138.332082,185.885646,187.074808,0.17829,0.240102,804.535264,123.23538,0.168729,168.589806,...,708.790965,919.78244,46.527753,70.67441,565.814604,1231.219553,13.623153,198.33647,5.264641,5.138175
std,104.508494,102.262578,151.479252,152.391149,0.143998,0.210777,119.835527,109.526045,0.165553,151.495273,...,171.345192,105.900305,32.961402,21.852244,188.153622,302.244721,7.020999,110.586728,2.131799,2.036932
min,608.455882,10.031438,13.94179,14.049766,0.010805,0.015017,546.875,11.582812,0.012253,15.625,...,393.75,695.3125,0.0,12.121212,304.6875,734.375,2.75,0.0,1.0,1.0
25%,734.468006,52.191109,45.5811,45.853404,0.061649,0.052553,710.9375,40.539844,0.048346,54.6875,...,507.8125,845.3125,14.492754,53.521127,328.125,1000.0,8.3,109.375,3.9,3.74
50%,801.440747,100.593758,142.538353,143.501486,0.116866,0.166618,792.96875,63.705469,0.07444,85.9375,...,739.0625,921.875,43.076923,73.333333,609.375,1164.0625,11.0,187.5,5.04,5.13
75%,901.194853,251.308971,345.148808,347.169505,0.363199,0.497876,906.25,237.447656,0.360632,324.21875,...,843.75,984.375,84.090909,92.5,703.125,1429.6875,19.5,273.4375,7.05,6.94
max,1161.261792,438.197682,605.714225,610.354616,0.481608,0.692563,1171.875,451.729688,0.578214,578.125,...,1117.1875,1203.125,96.385542,98.958333,1007.8125,2148.4375,36.0,539.0625,9.0,9.0


In [44]:
with open('data_preprocessed_python/s24.dat', 'rb') as file:
    data = pickle.load(file, encoding='latin1')

# 数据结构
ecg_signal = data['data'][:, 37]
rpeaks = nk.ecg_findpeaks(trial_ecg, sampling_rate=sampling_rate)
hrv = nk.hrv_time(rpeaks, sampling_rate=sampling_rate)
print(hrv)

  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  out["MeanNN"] = np.nanmean(rri)
  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,


IndexError: index 0 is out of bounds for axis 0 with size 0

In [48]:
for trial_idx, trial_ecg in enumerate(ecg_signal):
    try:
        # 检查信号是否有效
        if trial_ecg is None or len(trial_ecg) == 0 or np.all(trial_ecg == 0):
            print(f"Trial {trial_idx}: Invalid ECG signal, skipping.")
            continue

        # 滤波提升信号质量

        # 检测R波峰
        rpeaks = nk.ecg_findpeaks(trial_ecg, sampling_rate=sampling_rate)
        if 'ECG_R_Peaks' not in rpeaks or len(rpeaks['ECG_R_Peaks']) < 2:
            print(f"Trial {trial_idx}: No valid R-peaks detected, skipping.")
            continue

        # 计算HRV指标
        hrv = nk.hrv_time(rpeaks, sampling_rate=sampling_rate)
        print(f"Trial {trial_idx}: HRV features calculated successfully.")

    except Exception as e:
        print(f"Trial {trial_idx}: Error occurred - {e}")
        continue


Trial 0: HRV features calculated successfully.
Trial 1: No valid R-peaks detected, skipping.
Trial 2: No valid R-peaks detected, skipping.
Trial 3: HRV features calculated successfully.
Trial 4: No valid R-peaks detected, skipping.
Trial 5: No valid R-peaks detected, skipping.
Trial 6: No valid R-peaks detected, skipping.
Trial 7: HRV features calculated successfully.
Trial 8: No valid R-peaks detected, skipping.
Trial 9: HRV features calculated successfully.
Trial 10: HRV features calculated successfully.
Trial 11: HRV features calculated successfully.
Trial 12: HRV features calculated successfully.
Trial 13: HRV features calculated successfully.
Trial 14: HRV features calculated successfully.
Trial 15: No valid R-peaks detected, skipping.
Trial 16: No valid R-peaks detected, skipping.
Trial 17: No valid R-peaks detected, skipping.
Trial 18: No valid R-peaks detected, skipping.
Trial 19: No valid R-peaks detected, skipping.
Trial 20: No valid R-peaks detected, skipping.
Trial 21: No v

In [42]:
import pickle
import numpy as np

data_dir = "data_preprocessed_python/"

# 检查文件内容是否一致
for file_name in os.listdir(data_dir):
    if file_name.endswith(".dat"):
        file_path = os.path.join(data_dir, file_name)
        try:
            with open(file_path, 'rb') as file:
                data = pickle.load(file, encoding='latin1')
            
            # 检查文件结构
            print(f"Processing {file_name}")
            print(f"Data keys: {list(data.keys())}")
            print(f"Data shape: {data['data'].shape}, Labels shape: {data['labels'].shape}")
        
        except Exception as e:                      
            print(f"Error reading file {file_name}: {e}")

Processing s01.dat
Data keys: ['labels', 'data']
Data shape: (40, 40, 8064), Labels shape: (40, 4)
Processing s02.dat
Data keys: ['labels', 'data']
Data shape: (40, 40, 8064), Labels shape: (40, 4)
Processing s03.dat
Data keys: ['labels', 'data']
Data shape: (40, 40, 8064), Labels shape: (40, 4)
Processing s04.dat
Data keys: ['labels', 'data']
Data shape: (40, 40, 8064), Labels shape: (40, 4)
Processing s05.dat
Data keys: ['labels', 'data']
Data shape: (40, 40, 8064), Labels shape: (40, 4)
Processing s06.dat
Data keys: ['labels', 'data']
Data shape: (40, 40, 8064), Labels shape: (40, 4)
Processing s07.dat
Data keys: ['labels', 'data']
Data shape: (40, 40, 8064), Labels shape: (40, 4)
Processing s08.dat
Data keys: ['labels', 'data']
Data shape: (40, 40, 8064), Labels shape: (40, 4)
Processing s09.dat
Data keys: ['labels', 'data']
Data shape: (40, 40, 8064), Labels shape: (40, 4)
Processing s10.dat
Data keys: ['labels', 'data']
Data shape: (40, 40, 8064), Labels shape: (40, 4)
Processing