## Initialization

In [None]:
from plapt import Plapt
import pandas as pd
from scipy.stats import spearmanr, pearsonr
import numpy as np

# Build the PLAPT predictor once; every benchmark section below calls
# plapt.predict_affinity on this same instance.
plapt = Plapt()

# Test_2016_290

In [2]:
# Load the Test2016_290 benchmark table.
benchmark_data = pd.read_csv("data/Test2016_290.csv")

# Split the table into plain Python lists: protein sequences, ligand SMILES,
# and the experimental affinities the predictions are scored against.
prot_seqs, mol_smiles, experimental_pKd = (
    benchmark_data[column].tolist()
    for column in ('seq', 'smiles_can', 'neg_log10_affinity_M')
)

In [3]:
# Run PLAPT on the paired (sequence, SMILES) lists; each returned record is
# later read through its 'neg_log10_affinity_M' key.
predictions = plapt.predict_affinity(prot_seqs, mol_smiles)

In [None]:
from sklearn.metrics import r2_score

# Pull the predicted value out of each prediction record.
predicted_pKd = [d['neg_log10_affinity_M'] for d in predictions]

# Every metric below pairs values element-wise, so a length mismatch means
# the predictions and the experimental values are not aligned.
if len(predicted_pKd) != len(experimental_pKd):
    raise ValueError("The lengths of the predicted and experimental lists do not match.")

# Convert lists to numpy arrays for vectorised arithmetic
predicted_pKd = np.array(predicted_pKd)
experimental_pKd = np.array(experimental_pKd)

# Residuals are shared by the three error metrics.
residuals = predicted_pKd - experimental_pKd

# Error metrics: mean squared error, mean absolute error, root mean squared error
mse = np.mean(residuals ** 2)
mae = np.mean(np.abs(residuals))
rmse = np.sqrt(mse)

# Rank (Spearman) and linear (Pearson) correlation; the p-values are discarded
spearman_corr, _ = spearmanr(predicted_pKd, experimental_pKd)
pearson_corr, _ = pearsonr(predicted_pKd, experimental_pKd)

# Coefficient of determination; r2_score expects (y_true, y_pred) in that order
r2 = r2_score(experimental_pKd, predicted_pKd)

print(f"MSE: {mse}")
print(f"MAE: {mae}")
print(f"RMSE: {rmse}")
print(f"Spearman's Correlation: {spearman_corr}")
print(f"Pearsons's Correlation: {pearson_corr}")
# R^2 was computed but never reported here, unlike the CSAR HiQ 36 section.
print(f"R^2: {r2}")

# CSAR HiQ 36

In [5]:
# Load the CSAR-HiQ_36 benchmark table.
benchmark_data = pd.read_csv("data/CSAR-HiQ_36.csv")

# Split the table into plain Python lists: protein sequences, ligand SMILES,
# and the experimental affinities the predictions are scored against.
prot_seqs, mol_smiles, experimental_pKd = (
    benchmark_data[column].tolist()
    for column in ('seq', 'smiles_can', 'neg_log10_affinity_M')
)

In [6]:
# Run PLAPT on the paired (sequence, SMILES) lists; each returned record is
# later read through its 'neg_log10_affinity_M' key.
predictions = plapt.predict_affinity(prot_seqs, mol_smiles)

In [None]:
# Pull the predicted value out of each prediction record.
predicted_pKd = [record['neg_log10_affinity_M'] for record in predictions]

# The metrics pair values element-wise, so both sides must be the same length.
if len(predicted_pKd) != len(experimental_pKd):
    raise ValueError("The lengths of the predicted and experimental lists do not match.")

# Switch to numpy arrays so the arithmetic below is vectorised.
predicted_pKd = np.array(predicted_pKd)
experimental_pKd = np.array(experimental_pKd)

# Error metrics, all derived from the same residual vector.
residuals = predicted_pKd - experimental_pKd
mse = np.mean(residuals ** 2)
mae = np.mean(np.abs(residuals))
rmse = np.sqrt(mse)

# Rank (Spearman) and linear (Pearson) correlation; the p-values are unused.
spearman_corr, _spearman_p = spearmanr(predicted_pKd, experimental_pKd)
pearson_corr, _pearson_p = pearsonr(predicted_pKd, experimental_pKd)

# Coefficient of determination; r2_score expects (y_true, y_pred) in that order.
from sklearn.metrics import r2_score
r2 = r2_score(experimental_pKd, predicted_pKd)

# Report every metric as "<label>: <value>".
for label, value in (
    ("MSE", mse),
    ("MAE", mae),
    ("RMSE", rmse),
    ("Spearman's Correlation", spearman_corr),
    ("Pearsons's Correlation", pearson_corr),
    ("R^2", r2),
):
    print(f"{label}: {value}")

# Benchmark1k2101

In [11]:
# Load the custom 1k benchmark (written by the "Recreate Custom Benchmark" cell).
benchmark_data = pd.read_csv("data/benchmark1k2101.csv")

# Convert each column to a plain Python list, matching the other benchmark
# loaders, so predict_affinity receives the same input type in every section.
prot_seqs = benchmark_data['seq'].tolist()
mol_smiles = benchmark_data['smiles_can'].tolist()
experimental_pKd = benchmark_data['neg_log10_affinity_M'].tolist()

In [12]:
# Run PLAPT on the paired sequence/SMILES inputs; each returned record is
# later read through its 'neg_log10_affinity_M' key.
predictions = plapt.predict_affinity(prot_seqs, mol_smiles)

In [16]:
# Pull the predicted value out of each prediction record.
predicted_pKd = [record['neg_log10_affinity_M'] for record in predictions]

# The metrics pair values element-wise, so both sides must be the same length.
if len(predicted_pKd) != len(experimental_pKd):
    raise ValueError("The lengths of the predicted and experimental lists do not match.")

# Switch to numpy arrays so the arithmetic below is vectorised.
predicted_pKd = np.array(predicted_pKd)
experimental_pKd = np.array(experimental_pKd)

# Error metrics, all derived from the same residual vector.
residuals = predicted_pKd - experimental_pKd
mse = np.mean(residuals ** 2)
mae = np.mean(np.abs(residuals))
rmse = np.sqrt(mse)

# Rank (Spearman) and linear (Pearson) correlation; the p-values are unused.
spearman_corr, _spearman_p = spearmanr(predicted_pKd, experimental_pKd)
pearson_corr, _pearson_p = pearsonr(predicted_pKd, experimental_pKd)

# Report every metric as "<label>: <value>".
for label, value in (
    ("MSE", mse),
    ("MAE", mae),
    ("RMSE", rmse),
    ("Spearman's Correlation", spearman_corr),
    ("Pearsons's Correlation", pearson_corr),
):
    print(f"{label}: {value}")


MSE: 0.8505429506092742
MAE: 0.6883426504135132
RMSE: 0.9222488550327803
Spearman's Correlation: 0.8821411225023195
Pearsons's Correlation: 0.8816018986216324


## Recreate Custom Benchmark

In [None]:
from datasets import load_dataset, Dataset
import random

# Seed the global RNG so the same row indices are drawn on every run.
random.seed(2101)

# Fetch the train split first, then draw the sample, preserving the original
# evaluation order.
train_split = load_dataset("jglaser/binding_affinity")['train']

# 1000 distinct row indices taken from 10001..20000 inclusive.
sampled_rows = random.sample(range(10001, 20001), 1000)

benchmark_data = train_split.select(sampled_rows)

In [None]:
# Persist the sampled subset; the Benchmark1k2101 section reads this file back.
benchmark_csv_path = "data/benchmark1k2101.csv"
benchmark_data.to_csv(benchmark_csv_path)