### 袋內訓練集測試寫法

In [2]:
import librosa
import os
import gc
import pandas as pd
import numpy as np
from tqdm import tqdm
from joblib import Parallel, delayed

### 訓練集使用函式庫 ###

# Compute the MFCC and second-order delta MFCC features of one signal
def mfccs(signal, sr=500, n_mfcc=15, n_fft=128, hop_length=128, n_mels=40):
    """Return the MFCC matrix of ``signal`` and its second-order delta.

    Integer and boolean input is converted to float before the transform,
    because librosa rejects non-floating-point audio with a ParameterError;
    without the conversion such rows would be silently dropped.

    Args:
        signal: 1-D sequence of samples.
        sr: Sampling rate forwarded to ``librosa.feature.mfcc``.
        n_mfcc: Number of MFCC coefficients.
        n_fft: FFT window length.
        hop_length: Hop between successive frames.
        n_mels: Number of mel bands.

    Returns:
        ``(mfcc, delta2_mfcc)`` on success, or ``(None, None)`` when librosa
        raises ``ParameterError`` (the error is printed).
    """
    signal = np.asarray(signal)
    # Only integer / unsigned / boolean data is cast; float input is untouched.
    if signal.dtype.kind in "iub":
        signal = signal.astype(float)

    try:
        mfcc = librosa.feature.mfcc(
            y=signal,
            sr=sr,
            n_mfcc=n_mfcc,
            n_fft=n_fft,
            hop_length=hop_length,
            n_mels=n_mels
        )
        delta2_mfccs = librosa.feature.delta(mfcc, order=2)
        return mfcc, delta2_mfccs
    except librosa.util.exceptions.ParameterError as e:
        print(f"Error processing signal: {e}")
        return None, None

# Worker for the parallel MFCC computation: one call per signal
def process_signal(filename, signal):
    """Compute the flattened MFCC features of a single signal.

    Args:
        filename: Identifier stored alongside the features.
        signal: Samples handed to ``mfccs`` with its default parameters.

    Returns:
        A dict with keys ``"filename"``, ``"mfcc"`` and ``"delta2_mfcc"``
        (both feature matrices flattened to 1-D), or ``None`` when ``mfccs``
        reported a failure.
    """
    coefficients, second_delta = mfccs(signal)

    # mfccs signals failure by returning None for the MFCC matrix.
    if coefficients is None:
        return None

    return {
        "filename": filename,
        "mfcc": coefficients.reshape(-1),
        "delta2_mfcc": second_delta.reshape(-1)
    }

# Convert every signal of a CSV file to MFCC features and save them
def raw_singals_compute_MFCC_each_file(input_filename: str, output_filename: str):
    """Compute MFCC features for each row of a CSV file and write them to CSV.

    The input file must contain a ``filename`` column; every column after the
    first is treated as signal samples. Signals for which ``process_signal``
    returns ``None`` are left out of the output. If no signal yields
    features, a file containing only the ``filename`` header is written
    instead of failing with an ``IndexError``.

    Args:
        input_filename: Input path without the ``.csv`` extension.
        output_filename: Output path without the ``.csv`` extension.
    """
    df = pd.read_csv(input_filename + '.csv')
    filenames = df['filename']
    # One matrix conversion replaces a per-row ``iloc`` lookup, and pairing by
    # position avoids label-based indexing into ``filenames``.
    signal_matrix = df.iloc[:, 1:].to_numpy()
    signals = list(zip(filenames, signal_matrix))

    # Compute the MFCC features in parallel
    results = Parallel(n_jobs=-1)(
        delayed(process_signal)(filename, signal)
        for filename, signal in tqdm(signals, desc="Processing MFCC")
    )

    # Drop the signals that could not be processed
    results = [res for res in results if res is not None]
    if not results:
        print("No MFCC features could be computed; writing an empty feature file.")

    # Feature lengths are taken from the first successful result.
    n_mfcc = len(results[0]["mfcc"]) if results else 0
    n_delta2 = len(results[0]["delta2_mfcc"]) if results else 0

    # Build the final table column by column
    columns = {"filename": [res["filename"] for res in results]}
    for i in range(n_mfcc):
        columns[f"MFCCS_{i+1}"] = [res["mfcc"][i] for res in results]
    for i in range(n_delta2):
        columns[f"MFCCS2_{i+1}"] = [res["delta2_mfcc"][i] for res in results]
    mfcc_df = pd.DataFrame(columns)

    # Save to CSV
    mfcc_df.to_csv(output_filename + '.csv', index=False)

# Median Euclidean distance from one row to all the other rows
def calculate_median_distance(idx_i, mfccs):
    """Return the median Euclidean distance between row ``idx_i`` and the rest.

    Args:
        idx_i: Index of the reference row in ``mfccs``.
        mfccs: 2-D array with one feature vector per row.

    Returns:
        The median of the distances from row ``idx_i`` to every other row;
        the distance of the row to itself is excluded.
    """
    offsets = mfccs - mfccs[idx_i]

    # Mask out the reference row so its zero self-distance is not counted.
    others = np.ones(len(mfccs), dtype=bool)
    others[idx_i] = False

    return np.median(np.linalg.norm(offsets, axis=1)[others])

# Multi-threaded computation of the median Euclidean distance of every row
def mfcc_euclidean_distance_median_train(result):
    """Compute, for each row, the median distance to all the other rows.

    Args:
        result: DataFrame with a ``filename`` column; every column after the
            first is used as a feature.

    Returns:
        DataFrame with columns ``filename`` and ``euclidean_distance_med``,
        one row per input row.
    """
    names = result['filename'].values
    features = result.iloc[:, 1:].values

    row_indices = tqdm(range(len(names)), desc="Calculating Euclidean distance median")
    tasks = (delayed(calculate_median_distance)(idx_i, features) for idx_i in row_indices)

    # The threading backend lets all workers share the feature matrix.
    medians = Parallel(n_jobs=-1, backend="threading")(tasks)

    return pd.DataFrame({
        "filename": names,
        "euclidean_distance_med": medians
    })

### 訓練集計算流程 ###

# 1. Convert the raw ECG signals (stored row-wise) into MFCC and second-order delta MFCC features
raw_singals_compute_MFCC_each_file('22-24M5_filtered_no_diabetes_mealstatus', '22-24M5_filtered_no_diabetes_mealstatus_mfcc_fs')

# 2. Compute the Euclidean-distance similarity on the features produced in step 1
singal_to_mfcc_features = pd.read_csv('22-24M5_filtered_no_diabetes_mealstatus_mfcc_fs.csv') 
train_distances_median = mfcc_euclidean_distance_median_train(singal_to_mfcc_features)
train_distances_median.to_csv('240822 22-24M5_filtered_mfcc_fs_median_similarity.csv', index=False)


Processing MFCC: 100%|██████████| 20571/20571 [00:03<00:00, 5887.19it/s]
Calculating Euclidean distance median: 100%|██████████| 20571/20571 [25:38<00:00, 13.37it/s]


### 袋外OOB測試

In [3]:
import librosa
import os
import gc
import pandas as pd
import numpy as np
from tqdm import tqdm
from joblib import Parallel, delayed

### 袋外集使用函式庫 ###

# MFCC computation for a table of signals
def mfccs(combined_data, sr, n_mfcc, n_fft, hop_length, n_mels):
    """Compute flattened MFCC and second-order delta MFCC features per row.

    Each row's samples are read from every column after the first, cast to
    float, and NaN / infinite values are replaced by 0.0 before the
    transform. Rows for which librosa raises ``ParameterError`` are reported
    and left out entirely, so the filenames in the result always line up
    with their own features.

    Args:
        combined_data: DataFrame with a ``filename`` column (expected to be
            the first column) followed by the signal samples.
        sr: Sampling rate forwarded to ``librosa.feature.mfcc``.
        n_mfcc: Number of MFCC coefficients.
        n_fft: FFT window length.
        hop_length: Hop between successive frames.
        n_mels: Number of mel bands.

    Returns:
        DataFrame with a ``filename`` column, then ``MFCCS_<k>`` columns and
        ``MFCCS2_<k>`` columns, one row per successfully processed signal.
    """
    mfcc_rows = []
    delta2_rows = []
    kept_filenames = []
    filenames = combined_data['filename'].tolist()

    for i in tqdm(range(len(combined_data)), desc='梅爾轉換'):
        y = combined_data.iloc[i, 1:].to_numpy().astype(float)

        # Replace NaN and infinite samples, which librosa would reject
        y = np.nan_to_num(y, nan=0.0, posinf=0.0, neginf=0.0)

        try:
            mfcc = librosa.feature.mfcc(
                y=y,
                sr=sr,
                n_mfcc=n_mfcc,
                n_fft=n_fft,
                hop_length=hop_length,
                n_mels=n_mels
            )
            delta2_mfcc = librosa.feature.delta(mfcc, order=2)
        except librosa.util.exceptions.ParameterError as e:
            print(f"Error processing row {i}: {e}")
            continue

        # Record the row only after both transforms succeeded, so the three
        # lists can never get out of step with each other.
        kept_filenames.append(filenames[i])
        mfcc_rows.append(mfcc.reshape(-1))
        delta2_rows.append(delta2_mfcc.reshape(-1))

    df_cc = pd.DataFrame(mfcc_rows)
    df_cc.columns = [f'MFCCS_{i+1}' for i in range(df_cc.shape[1])]

    df_ccc = pd.DataFrame(delta2_rows)
    df_ccc.columns = [f'MFCCS2_{i+1}' for i in range(df_ccc.shape[1])]

    return pd.concat(
        [pd.Series(kept_filenames, name='filename'), df_cc, df_ccc], axis=1
    ).reset_index(drop=True)

# Compute the MFCC features of every signal in a CSV file and save them
def raw_singals_compute_MFCC_each_file(input_filename: str, output_filename: str):
    """Compute MFCC features row by row and append them to a CSV file.

    The input file must contain a ``filename`` column; every column after the
    first is treated as signal samples. Any existing output file is removed
    first. Feature columns in the output are named ``0``, ``1``, ... and the
    header is written once, with the first row. Signals for which no features
    could be computed are skipped rather than written as a malformed row.

    Args:
        input_filename: Input path without the ``.csv`` extension.
        output_filename: Output path without the ``.csv`` extension.
    """
    output_path = output_filename + '.csv'

    df = pd.read_csv(input_filename + '.csv')
    filenames = df['filename']
    signal_matrix = df.iloc[:, 1:].to_numpy()

    # Start from a clean file, because rows are appended one signal at a time
    if os.path.exists(output_path):
        os.remove(output_path)

    header_pending = True

    # Pair names and signals by position instead of label-based lookups.
    for filename, signal in zip(filenames, signal_matrix):
        df_signal = pd.DataFrame(signal).transpose()
        df_signal.insert(0, 'filename', os.path.basename(filename))  # add the filename column
        mfcc_result = mfccs(df_signal, sr=500, n_mfcc=15, n_fft=128, hop_length=128, n_mels=40)

        # Nothing but the filename column (or no row at all) means the
        # transform failed for this signal; do not write a broken line.
        if mfcc_result.empty or mfcc_result.shape[1] <= 1:
            continue

        # Rename the feature columns to 0, 1, 2, ...
        mfcc_result.columns = ['filename'] + [str(i) for i in range(mfcc_result.shape[1] - 1)]

        # Append to the CSV file; only the first write carries the header
        mfcc_result.to_csv(output_path, index=False, mode='a', header=header_pending)
        header_pending = False

# Median Euclidean distance between each OOB sample and the reference set
def euclidean_distance_different_median(mfcc, oob_mfcc):
    """Return, per OOB row, the median distance to every reference row.

    Each output row carries the filename of its own OOB row, so the function
    is correct for multi-row input as well as for a single row. An empty
    ``oob_mfcc`` yields an empty result with the same two columns.

    Args:
        mfcc: Reference DataFrame; every column after the first is a feature.
        oob_mfcc: OOB DataFrame with a ``filename`` column; every column
            after the first is a feature. Must have as many feature columns
            as ``mfcc``.

    Returns:
        DataFrame with columns ``filename`` and ``euclidean_distance_med``,
        one row per row of ``oob_mfcc``.
    """
    reference_features = mfcc.iloc[:, 1:].values
    oob_features = oob_mfcc.iloc[:, 1:].values
    oob_filenames = oob_mfcc['filename'].values

    records = [
        {
            "filename": oob_name,
            # Distance from this OOB row to every reference row, then median.
            "euclidean_distance_med": np.median(
                np.linalg.norm(reference_features - oob_row, axis=1)
            ),
        }
        for oob_name, oob_row in zip(oob_filenames, oob_features)
    ]

    # Explicit columns keep the schema stable even when there are no records.
    return pd.DataFrame(records, columns=["filename", "euclidean_distance_med"])

# Entry point that runs the OOB distance test end to end
def oob_testing_pipeline(input_filename: str, raw_ms_data: str):
    """Score every OOB sample against the reference features and save it.

    Reads ``<raw_ms_data>.csv`` (reference features) and
    ``<input_filename>.csv`` (OOB features), computes the median Euclidean
    distance of each OOB row to the reference rows, and writes the result to
    ``<input_filename>_euclidean_distances_median.csv``.

    Args:
        input_filename: Path of the OOB feature file, without ``.csv``.
        raw_ms_data: Path of the reference feature file, without ``.csv``.
    """
    # NOTE(review): the original author flagged this hand-off as suspect.
    # MFCC extraction is not run here; both files are read as they are and
    # are expected to contain MFCC features already - confirm with callers.
    reference_features = pd.read_csv(raw_ms_data + '.csv')
    oob_features = pd.read_csv(input_filename + '.csv')

    per_sample_results = []

    # Score the OOB samples one row at a time, with a progress bar
    for row_idx in tqdm(range(len(oob_features)), desc="計算袋外檔案的歐式距離中位數"):
        one_sample = oob_features.iloc[row_idx:row_idx + 1].reset_index(drop=True)
        per_sample_results.append(
            euclidean_distance_different_median(reference_features, one_sample)
        )

    # Merge the per-sample tables and export them to CSV
    merged = pd.concat(per_sample_results, ignore_index=True)
    merged.to_csv(input_filename + '_euclidean_distances_median.csv', index=False)

# Compute the MFCC features of the OOB signals in 'M7_CHA_6-16S.csv' and save them to 'M7_CHA_6-16S_mfcc_fs.csv'
raw_singals_compute_MFCC_each_file('M7_CHA_6-16S', 'M7_CHA_6-16S_mfcc_fs')

# Run the OOB test: compare the OOB features against the training-set MFCC features
oob_testing_pipeline('M7_CHA_6-16S_mfcc_fs', '22-24M5_filtered_no_diabetes_mealstatus_mfcc_fs')



梅爾轉換: 100%|██████████| 1/1 [00:01<00:00,  1.97s/it]
梅爾轉換: 100%|██████████| 1/1 [00:00<00:00, 334.42it/s]
梅爾轉換: 100%|██████████| 1/1 [00:00<00:00, 250.84it/s]
梅爾轉換: 100%|██████████| 1/1 [00:00<00:00, 334.47it/s]
梅爾轉換: 100%|██████████| 1/1 [00:00<00:00, 200.66it/s]
梅爾轉換: 100%|██████████| 1/1 [00:00<00:00, 251.49it/s]
梅爾轉換: 100%|██████████| 1/1 [00:00<00:00, 334.47it/s]
梅爾轉換: 100%|██████████| 1/1 [00:00<00:00, 334.39it/s]
梅爾轉換: 100%|██████████| 1/1 [00:00<00:00, 334.45it/s]
梅爾轉換: 100%|██████████| 1/1 [00:00<00:00, 167.22it/s]
梅爾轉換: 100%|██████████| 1/1 [00:00<00:00, 334.53it/s]
梅爾轉換: 100%|██████████| 1/1 [00:00<00:00, 334.50it/s]
梅爾轉換: 100%|██████████| 1/1 [00:00<00:00, 501.71it/s]
梅爾轉換: 100%|██████████| 1/1 [00:00<00:00, 334.45it/s]
梅爾轉換: 100%|██████████| 1/1 [00:00<00:00, 334.47it/s]
梅爾轉換: 100%|██████████| 1/1 [00:00<00:00, 334.42it/s]
梅爾轉換: 100%|██████████| 1/1 [00:00<00:00, 250.84it/s]
梅爾轉換: 100%|██████████| 1/1 [00:00<00:00, 250.81it/s]
梅爾轉換: 100%|██████████| 1/1 [00:00<00:00, 334.42