In [4]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
from scipy.stats   import pearsonr, spearmanr

def pool_mean(x: np.ndarray) -> np.ndarray:
    """Return the mean of *x* taken along its first axis.

    Collapses axis 0, so the result has one fewer dimension than *x*.
    Presumably *x* is a (sequence_length, embedding_dim) array and the
    result is a fixed-size (embedding_dim,) vector — TODO confirm against
    the embedding files.
    """
    pooled = x.mean(axis=0)
    return pooled

def eval_metrics(y_true, y_pred):
    """Score predictions against ground truth with four regression metrics.

    Args:
        y_true: Observed target values.
        y_pred: Predicted target values, aligned with *y_true*.

    Returns:
        A tuple ``(mae, rmse, pearson, spearman)``: mean absolute error,
        root mean squared error, and the Pearson and Spearman correlation
        coefficients. The p-values of the correlation tests are discarded.
    """
    abs_err = mean_absolute_error(y_true, y_pred)
    # RMSE is the square root of the mean squared error.
    root_mse = np.sqrt(mean_squared_error(y_true, y_pred))
    # Both scipy results are (statistic, p-value); keep only the statistic.
    pearson_r = pearsonr(y_true, y_pred)[0]
    spearman_rho = spearmanr(y_true, y_pred)[0]
    return abs_err, root_mse, pearson_r, spearman_rho

# Load the precomputed embedding tables; rows are later looked up by the
# integer MoleculeIdx / ProteinIdx columns of the fold CSVs.
# NOTE(review): allow_pickle=True lets np.load unpickle object arrays, which
# can execute arbitrary code — only use with trusted .npy files. Presumably
# required because the per-item embeddings have differing lengths; confirm.
molecule_embeddings, protein_embeddings = (
    np.load(npy_path, allow_pickle=True)
    for npy_path in ('molecules_all_embeddings.npy', 'proteins_all_embeddings.npy')
)

# Per-fold metric accumulators, summarised after the loop.
maes, rmses, ps, ss = [], [], [], []


def make_dataset(df):
    """Build the feature matrix and target vector for one split.

    Each row of *df* names a molecule and a protein by integer index
    (columns ``MoleculeIdx`` and ``ProteinIdx``). Both embeddings are
    mean-pooled with ``pool_mean`` and concatenated into one feature
    vector; the ``Ki`` column is the regression target.

    Args:
        df: DataFrame with ``MoleculeIdx``, ``ProteinIdx`` and ``Ki`` columns.

    Returns:
        ``(X, y)`` where ``X`` stacks one feature vector per row of *df*
        and ``y`` holds the matching ``Ki`` values.
    """
    # Iterate the two index columns directly instead of df.iterrows(), which
    # builds a Series per row and upcasts mixed dtypes to a common type.
    # int() is kept so indices that were read as floats still work.
    features = [
        np.concatenate([
            pool_mean(molecule_embeddings[int(mol_idx)]),
            pool_mean(protein_embeddings[int(prot_idx)]),
        ])
        for mol_idx, prot_idx in zip(df["MoleculeIdx"], df["ProteinIdx"])
    ]
    return np.vstack(features), df["Ki"].to_numpy()


# Folds are numbered 1..5 and stored as train_<k>.csv / test_<k>.csv.
for fold in range(1, 6):
    train = pd.read_csv(f'train_{fold}.csv')
    test = pd.read_csv(f'test_{fold}.csv')

    X_train, y_train = make_dataset(train)
    X_test, y_test = make_dataset(test)

    # Fit an ordinary least-squares baseline on this fold's training split
    # and evaluate it on the held-out split.
    lr = LinearRegression().fit(X_train, y_train)
    y_pred = lr.predict(X_test)

    mae, rmse, p, s = eval_metrics(y_test, y_pred)
    maes.append(mae)
    rmses.append(rmse)
    ps.append(p)
    ss.append(s)

    print(f"Fold {fold} — MAE: {mae:.4f}, RMSE: {rmse:.4f}, Pearson: {p:.4f}, Spearman: {s:.4f}")

# Report mean ± standard deviation of each metric across the folds.
# np.std uses its default ddof=0 (population standard deviation).
print("\n=== 5-Fold Cross-Validation Summary ===")
for metric_label, fold_values in (
    ("MAE:", maes),
    ("RMSE:", rmses),
    ("Pearson:", ps),
    ("Spearman:", ss),
):
    # Left-pad the label to 9 characters so the numbers line up in a column.
    print(f"{metric_label:<9} {np.mean(fold_values):.4f} ± {np.std(fold_values):.4f}")


Fold 1 — MAE: 0.9528, RMSE: 1.2367, Pearson: 0.2278, Spearman: 0.2306
Fold 2 — MAE: 0.8969, RMSE: 1.1475, Pearson: 0.3669, Spearman: 0.3513
Fold 3 — MAE: 0.8745, RMSE: 1.1282, Pearson: 0.2944, Spearman: 0.2804
Fold 4 — MAE: 0.8704, RMSE: 1.1192, Pearson: 0.2676, Spearman: 0.2626
Fold 5 — MAE: 0.9068, RMSE: 1.1510, Pearson: 0.2459, Spearman: 0.2552

=== 5-Fold Cross-Validation Summary ===
MAE:      0.9003 ± 0.0296
RMSE:     1.1565 ± 0.0418
Pearson:  0.2805 ± 0.0486
Spearman: 0.2760 ± 0.0409
