## PCA+SVM

In [11]:
# PCA+SVM
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, StratifiedKFold,RandomizedSearchCV
from sklearn.metrics import classification_report, roc_auc_score
import os
from scipy.stats import loguniform, randint
from joblib import parallel_backend

In [None]:
train_df = pd.read_pickle("../B4PPI-main/data/medium_set/embeddings/train_data_with_embeddings.pkl")
#test1_df = pd.read_pickle("../B4PPI-main/data/medium_set/embeddings/val_data_with_embeddings.pkl")


In [12]:
train_df=pd.read_pickle('./first_three_rows.pkl')

In [14]:

def mean_pool(embedding_3d: np.ndarray) -> np.ndarray:
    """
    embedding_3d.shape == (seq_len, 960)
    对序列维做平均池化，返回固定 960 维向量。
    你也可以改成 max‑pool / attention‑pool 等。
    """
    return embedding_3d.mean(axis=0)

def build_feature_matrix(frame: pd.DataFrame) -> np.ndarray:
    """
    ① 解析 A、B 两个 embedding  
    ② 分别平均池化 → 960 维  
    ③ 拼接成 1920 维
    """
    pooled_A = []
    pooled_B = []

    for _, row in frame.iterrows():
        #print(row["embedding_A"])
        emb_a = mean_pool(row["embedding_A"][0])
        emb_b = mean_pool(row["embedding_B"][0])
        pooled_A.append(emb_a)
        pooled_B.append(emb_b)

    pooled_A = np.vstack(pooled_A)          # (n, 960)
    pooled_B = np.vstack(pooled_B)          # (n, 960)
    return np.hstack([pooled_A, pooled_B])  # (n, 1920)

def load_or_build(name, builder,num): #save 
    fname = f"{name}_{num}.npy"
    if os.path.exists(fname):
        return np.load(fname)
    arr = builder()
    np.save(fname, arr)
    return arr



In [None]:

num=1
X_train = load_or_build("X_train", lambda: build_feature_matrix(train_df),num)
y_train = load_or_build("y_train", lambda: train_df["isInteraction"].values.astype(int),num)
print(f"Train shape: {X_train.shape}")


Train shape: (10000, 1920)


In [None]:
# ────────────────────────────────────────────────────────────────
# 2. bulid pipe：StandardScaler --> PCA --> SVM
# ────────────────────────────────────────────────────────────────



# 1. 构建管道：StandardScaler --> PCA --> SVM
pca = PCA(n_components=0.95, svd_solver="full", random_state=42)
svm = SVC(probability=True, random_state=42)

pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("pca",    pca),
    ("svm",    svm),
])

# 2. 定义超参数空间
param_dist = {
    "svm__C"      : loguniform(0.01, 1000),
    "svm__kernel" : ['linear', 'poly', 'rbf', 'sigmoid'],
    "svm__degree" : randint(2, 6),
    "svm__gamma"  : loguniform(1e-5, 1),
}

# 3. 交叉验证设置
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# 4. 随机搜索超参 (限制进程数 + 控制预调度)
search = RandomizedSearchCV(
    pipe,
    param_distributions=param_dist,
    n_iter=50,
    cv=cv,
    scoring="roc_auc",
    n_jobs=4,                    # <= i7 的物理/逻辑核数
    pre_dispatch="4*n_jobs",     # 控制同时分发的任务数
    verbose=1,
    random_state=42
)

print("Searching best hyper-parameters ......")

# 5. （可选）如果多进程仍有问题，改用线程后端
with parallel_backend('threading', n_jobs=4):
    search.fit(X_train, y_train)

# 6. 输出结果
best_model = search.best_estimator_
print("Best parameters:", search.best_params_)
print("Best AUC:", search.best_score_)

#----------------



Searching best hyper-parameters ......
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best parameters: {'svm__C': np.float64(6.403036652671167), 'svm__degree': 4, 'svm__gamma': np.float64(0.00019069601332062687), 'svm__kernel': 'rbf'}
Best AUC: 0.6284645999999999


In [17]:
import joblib

# 训练完毕后，保存最优模型（Pipeline）
joblib.dump(best_model, "svm_pipeline.pkl")

# 如果你还想保存整个 GridSearchCV 对象（便于之后直接调用 search.predict 等）
joblib.dump(search, "grid_search_svm.pkl")

# ——加载时——
loaded_model = joblib.load("svm_pipeline.pkl")
# 或者
loaded_search = joblib.load("grid_search_svm.pkl")
best_pipeline = loaded_search.best_estimator_


In [7]:
test1_df = pd.read_pickle("../B4PPI-main/data/medium_set/embeddings/test1_data_with_embeddings.pkl")


In [18]:
test1_df=pd.read_pickle('./first_three_rows.pkl')

In [19]:

X_test1   = load_or_build("X_test1",   lambda: build_feature_matrix(test1_df),num)
y_test1   = load_or_build("y_test1",   lambda: test1_df["isInteraction"].values.astype(int),num)
print(f"Val shape:   {X_test1.shape}")

Val shape:   (2000, 1920)


In [20]:
y_test1_pred  = best_model.predict(X_test1)
y_test1_prob  = best_model.predict_proba(X_test1)[:, 1]

print("\n=== Classification report (validation) ===")
print(classification_report(y_test1, y_test1_pred, digits=4))

test1_auc = roc_auc_score(y_test1, y_test1_pred)
print(f"Validation ROC-AUC: {test1_auc:.4f}")



=== Classification report (validation) ===
              precision    recall  f1-score   support

           0     0.5702    0.5280    0.5483      1000
           1     0.5605    0.6020    0.5805      1000

    accuracy                         0.5650      2000
   macro avg     0.5654    0.5650    0.5644      2000
weighted avg     0.5654    0.5650    0.5644      2000

Validation ROC-AUC: 0.5650


In [21]:

test2_df = pd.read_pickle("../B4PPI-main/data/medium_set/embeddings/test2_data_with_embeddings.pkl")


In [None]:
test2_df=pd.read_pickle('./first_three_rows.pkl')

In [22]:

X_test2  = load_or_build("X_test2",  lambda: build_feature_matrix(test2_df),num)
y_test2  = load_or_build("y_test2",  lambda: test2_df["isInteraction"].values.astype(int),num)
print(f"Test shape:  {X_test2.shape}")


Test shape:  (10000, 1920)


In [23]:

y_test2_pred   = best_model.predict(X_test2)
y_test2_proba  = best_model.predict_proba(X_test2)[:, 1]

print("\n=== Classification report (test) ===")
print(classification_report(y_test2, y_test2_pred , digits=4))

test2_auc = roc_auc_score(y_test2, y_test2_pred)
print(f"Test ROC‑AUC: {test2_auc:.4f}")



=== Classification report (test) ===
              precision    recall  f1-score   support

           0     0.9403    0.5591    0.7013      9100
           1     0.1257    0.6411    0.2102       900

    accuracy                         0.5665     10000
   macro avg     0.5330    0.6001    0.4557     10000
weighted avg     0.8670    0.5665    0.6571     10000

Test ROC‑AUC: 0.6001
