### 袋外相似度計算(單筆加入原始MS資料進行中位數相似度計算)

In [6]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

df_a = pd.read_csv('./df_raw_ms/22-23feedback切訊號及過濾波_500.csv')
df_b = pd.read_csv('./df_raw_ms/24M1-M5feedback切訊號及過濾波_500.csv')
df_raw_ms = pd.concat([df_a, df_b], axis=0)

# 原始資料的filename
# df_raw_ms_filenames = df_raw_ms.iloc[:, 0].values # 'filename'

# 原始資料的嵌入向量
df_raw_ms_embeddings = df_raw_ms.iloc[:, 1:].values # 'single_5000'

# 計算df_raw_ms內部相似度的中位數
# precomputed_median = np.median(cosine_similarity(df_raw_ms_embeddings), axis=1)
# print('df_raw_ms內部相似度的中位數:', precomputed_median.shape)

# OOB data
df_oob = pd.read_csv('M7_CHA_6-16S.csv')
print('原始資料shape:', df_raw_ms.shape)
print('OOB資料shape:', df_oob.shape)
# ------------------------------------------------------------------------------------------------------------------------------------------------
from joblib import Parallel, delayed
from tqdm import tqdm

# 定義計算相似度的函數
def compute_similarity(row_oob):
    filename_oob = row_oob.iloc[0]
    embedding_oob = row_oob.iloc[1:].values.reshape(1, -1)
    oob_similarity_scores = cosine_similarity(embedding_oob, df_raw_ms_embeddings).flatten()
    median_similarity = np.median(oob_similarity_scores)
    print('OOB文件名:', filename_oob, '中位數相似度:', median_similarity)
    return {"filename": filename_oob, "SBERT_median_similarity": median_similarity}

# 使用並行計算處理袋外測試資料的每一行
oob_median_similarities = Parallel(n_jobs=4)(delayed(compute_similarity)(row_oob) for _, row_oob in tqdm(df_oob.iterrows(), total=df_oob.shape[0]))

# 將結果保存為DataFrame
oob_median_df = pd.DataFrame(oob_median_similarities)
print(oob_median_df.shape)
print(oob_median_df.head(5))

# 保存結果到CSV文件
oob_median_df.to_csv('2024M7 OOB_median_similarity.csv', index=False)

原始資料shape: (35817, 5001)
OOB資料shape: (888, 5001)


100%|██████████| 888/888 [08:15<00:00,  1.79it/s]


(888, 2)
                filename  SBERT_median_similarity
0  20240701030903_042169                -0.001030
1  20240701093119_011108                -0.000425
2  20240701095227_008400                 0.000147
3  20240701110216_031745                -0.003103
4  20240701114008_034928                -0.003772


### 袋內相似度計算(全部原始MS資料進行中位數相似度計算，取filename自身以外的中位數相似度)

In [5]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd

# 读取CSV文件到DataFrame中
df = pd.read_csv('22-24M5_filtered_no_diabetes_mealstatus.csv')

# 提取文件名和嵌入向量
filenames = df.iloc[:, 0].values
embeddings = df.iloc[:, 1:].values

# 计算余弦相似度矩阵
similarities = cosine_similarity(embeddings)

# 用于保存文件名和中位数相似度的列表
median_similarities = []

# 遍历每个文件
for idx_i, filename1 in enumerate(filenames):
    # 提取当前文件对其他文件的相似度（排除自身）
    similarity_scores = similarities[idx_i][np.arange(len(filenames)) != idx_i]
    
    # 计算当前文件的相似度中位数
    median_similarity = np.median(similarity_scores)
    
    # 将文件名和中位数相似度保存到列表中
    median_similarities.append({"filename": filename1, "SBERT_median_similarity": median_similarity})

# 将文件名和中位数相似度保存到CSV文件
median_df = pd.DataFrame(median_similarities)
median_df.to_csv('240822 22-24M5_filtered_SBERT.csv', index=False)
