# Dataset Version 1

This notebook generates a dataframe containing information about data first version dataset:

- Overlap of AbDb data (filtered and enriched abag data) with SAbDab data (affinity values for conformtions) 

In [1]:
import os
import pandas as pd
import warnings
from tqdm import tqdm_notebook

warnings.filterwarnings("ignore")
from abag_affinity.utils.config import read_yaml, get_data_paths

In [2]:
config = read_yaml("../../abag_affinity/config.yaml")

summary_path, pdb_path = get_data_paths(config, "SAbDab")
summary_df = pd.read_csv(summary_path, sep="\t")

abdb_summary_path, abdb_pdb_path = get_data_paths(config, "AbDb")
abdb_pdb_ids = os.listdir(abdb_pdb_path)
abdb_pdb_ids = set([ pdb_id.split("_")[0].lower() for pdb_id in abdb_pdb_ids])

In [3]:
# get overlap using pdb ids
sabdab_pdb_ids = set(summary_df["pdb"].unique())
overlapping_ids = abdb_pdb_ids.intersection(sabdab_pdb_ids)

#### Incorporate redundant files

In [4]:
# load information on redundant AbDb files
redunant_file_path = os.path.join(config["DATA"]["path"], config["DATA"]["AbDb"]["folder_path"], "Redundant_LH_Protein_Martin.txt")
with open(redunant_file_path) as f:
    lines = f.readlines()
    
redundant_ids = {}
all_ids = set()
for line in lines:
    pdb_ids = line.split(",")
    pdb_ids = [pdb_id.strip().lower().split("_")[0] for pdb_id in pdb_ids]
    pdb_ids = [pdb_id for pdb_id in pdb_ids if pdb_id.strip() != ""]
    all_ids.update(pdb_ids)
    for i, pdb_id in enumerate(pdb_ids):
        redundant_ids[pdb_id] = pdb_ids

In [5]:
for pdb_id in overlapping_ids:
    for redundant_id in redundant_ids[pdb_id]:
        all_ids.discard(redundant_id)

In [6]:
add_pdbs = set()

remaining_in_sabdab = all_ids.intersection(sabdab_pdb_ids)
while len(remaining_in_sabdab) > 0:
    pdb_id = remaining_in_sabdab.pop()
    add_pdbs.add(pdb_id)
    for redundant_id in redundant_ids[pdb_id]:
        remaining_in_sabdab.discard(redundant_id)

In [7]:
all_ids.intersection(sabdab_pdb_ids)

set()

In [8]:
# download extra pdb files from AbDb database
import urllib.request 

url = "http://www.abybank.org/abdb/Data/LH_Protein_Martin/{}_1.pdb"

for pdb_id in add_pdbs:
    filepath = os.path.join(abdb_pdb_path, pdb_id + "_1.pdb")
    if os.path.exists(filepath):
        print(pdb, "already exists")
        continue
    try:
        urllib.request.urlretrieve(url.format(pdb_id.upper()), filepath)
    except:
        print(pdb_id)
    print("Downloaded", pdb_id)

In [9]:
overlapping_ids.update(add_pdbs)

In [10]:
dataset = summary_df[summary_df["pdb"].isin(overlapping_ids)].copy()

In [11]:
import math
def get_chains(row):
    pdb_id = row["pdb"]
    pdb_id_rows = dataset[dataset["pdb"] == pdb_id]
    
    antibody_chains = []
    antigen_chains = []
    
    for i, pdb_row in pdb_id_rows.iterrows():
        antibody_chains.append(pdb_row["Hchain"])
        antibody_chains.append(pdb_row["Lchain"])
        
        if isinstance(pdb_row["antigen_chain"], str):
            if "|" in pdb_row["antigen_chain"]:
                for chain_id in pdb_row["antigen_chain"].split("|"):
                    antigen_chains.append(chain_id.strip())

            else:
                antigen_chains.append(pdb_row["antigen_chain"])
    
    return antibody_chains, antigen_chains

In [12]:
dataset["antibody_chains"], dataset["antigen_chains"] = zip(*dataset.apply(lambda row: get_chains(row), axis=1))

In [13]:
import numpy as np
gas_constant =  8.31446261815324 # 0.0821

def calc_temp(row):
    kd = row["affinity"]
    delta_g = row["delta_g"] * 4184 # convert kcal to joule 
    if kd == 0 or math.isnan(kd) or math.isnan(delta_g):
        return None

    return round((delta_g / (gas_constant * np.log(kd))) - 273.15) # convert kelvin to celsius

def calc_delta_g(row):
    delta_g = gas_constant * row["temperature_kelvin"] * np.log(row["affinity"])
    return delta_g / 4184 # convert to kcal

In [14]:
dataset["calculated_temp"] = dataset.apply(lambda row: calc_temp(row), axis=1)
dataset = dataset[~dataset["calculated_temp"].isnull()]

In [15]:
# add pdb file name from AbDb

abdb_pdb_path = os.path.join(config["DATA"]["path"], config["DATA"]["AbDb"]["folder_path"], config["DATA"]["AbDb"]["pdb_path"])

pdb_files = os.listdir(abdb_pdb_path)

id2file = {file.split("_")[0].lower(): file for file in pdb_files}

In [16]:
dataset["abdb_file"] = dataset["pdb"].apply(lambda pdb_id: id2file[pdb_id])

In [17]:
# clean dataset
dataset.drop_duplicates("pdb", inplace=True)
dataset = dataset.reset_index().drop("index", axis=1)
dataset = dataset[["pdb", "abdb_file", "antigen_type", "antibody_chains", "antigen_chains", 'affinity_method', 'temperature', "calculated_temp", 'affinity', 'delta_g', ]]

### Mark testset and delete redundant files

Use Antibody Benchmark as the final testset --> Remove redundant files.

In [18]:
dataset["test"] = False

In [19]:
benchmark_summary_path, benchmark_pdb_path = get_data_paths(config, "AntibodyBenchmark")
benchmark_summary_df = pd.read_excel(benchmark_summary_path)
benchmark_summary_df = benchmark_summary_df[~benchmark_summary_df["ΔG (kcal/mol)"].isna() & (benchmark_summary_df["ΔG (kcal/mol)"] != " ")]
benchmark_summary_df.reset_index(inplace=True).drop("index", axis=1)
print("Numer of available benchmark structures >>>", len(benchmark_summary_df))

Numer of available benchmark structures >>> 51


In [20]:
benchmark_pdb_ids = set([ pdb_id.split("_")[0].lower() for pdb_id in benchmark_summary_df["Complex PDB"].unique() ])
overlapping_ids = benchmark_pdb_ids.intersection(set(dataset["pdb"].unique()))
print("Number of identical PDB benchmark_pdb_ids", len(overlapping_ids))

Number of identical PDB benchmark_pdb_ids 32


In [21]:
# Set these strucutres as testset strucutres in the dataset
dataset["test"] = dataset["pdb"].apply(lambda pdb_id: pdb_id in benchmark_pdb_ids)

In [22]:
benchmark_summary_df["pdb"] = benchmark_summary_df["Complex PDB"].apply(lambda pdb_id: pdb_id.split("_")[0].lower() )
remaining_benchmark = benchmark_summary_df[~benchmark_summary_df["pdb"].isin(overlapping_ids)].copy()
print("Remaining Benchmark Entries", len(remaining_benchmark.isin(overlapping_ids)))

Remaining Benchmark Entries 19


In [23]:
# load information on redundant AbDb files
redunant_file_path = os.path.join(config["DATA"]["path"], config["DATA"]["AbDb"]["folder_path"], "Redundant_LH_Protein_Martin.txt")
with open(redunant_file_path) as f:
    lines = f.readlines()
    
redundant_ids = {}
all_ids = set()
for line in lines:
    pdb_ids = line.split(",")
    pdb_ids = [pdb_id.strip().lower().split("_")[0] for pdb_id in pdb_ids]
    pdb_ids = [pdb_id for pdb_id in pdb_ids if pdb_id.strip() != ""]
    all_ids.update(pdb_ids)
    for i, pdb_id in enumerate(pdb_ids):
        redundant_ids[pdb_id] = pdb_ids

In [24]:
benchmark_pdb_ids = set(remaining_benchmark["pdb"].unique())
overlapping_ids = benchmark_pdb_ids.intersection(all_ids)
print("Number of found PDB benchmark_pdb_ids in redundant list >>>", len(overlapping_ids))
print("Number of found pdb_ids in that are also in dataset v1 >>>", len(overlapping_ids.intersection(set(dataset["pdb"].unique()))))
print("Number of found pdb_ids in that are also in SAbDab >>>", len(overlapping_ids.intersection(sabdab_pdb_ids)))
print("Number of found pdb_ids in that are also in AbDb >>>", len(overlapping_ids.intersection(abdb_pdb_ids)))
print("Number of found pdb_ids in that are also in AbDb and SAbDab >>>", len(overlapping_ids.intersection(abdb_pdb_ids).intersection(sabdab_pdb_ids)))
print("Number of found pdb_ids in neither in AbDb and SAbDab >>>", len(overlapping_ids - overlapping_ids.intersection(abdb_pdb_ids).union(overlapping_ids.intersection((sabdab_pdb_ids)))))

Number of found PDB benchmark_pdb_ids in redundant list >>> 7
Number of found pdb_ids in that are also in dataset v1 >>> 0
Number of found pdb_ids in that are also in SAbDab >>> 1
Number of found pdb_ids in that are also in AbDb >>> 7
Number of found pdb_ids in that are also in AbDb and SAbDab >>> 1
Number of found pdb_ids in neither in AbDb and SAbDab >>> 0


One strucutre is in AbDb but not in SAbDab - should be included in the dataset as well

In [25]:
pdb_ids = list(overlapping_ids.intersection(abdb_pdb_ids).intersection(sabdab_pdb_ids))
summary_df[summary_df["pdb"].isin(pdb_ids)].head(1)

Unnamed: 0,pdb,Hchain,Lchain,model,antigen_chain,antigen_type,antigen_het_name,antigen_name,short_header,date,...,scfv,engineered,heavy_subclass,light_subclass,light_ctype,affinity,delta_g,affinity_method,temperature,pmid
25,4gxu,M,N,0,A,protein,,hemagglutinin ha1 chain,VIRAL PROTEIN/IMMUNE SYSTEM,09/04/12,...,False,True,IGHV3,IGLV1,Lambda,6.24e-09,,SPR,,23236279


In [26]:
benchmark_summary_df[benchmark_summary_df["pdb"].isin(pdb_ids)].head(1)

Unnamed: 0,index,Complex PDB,Antibody PDB,Antibody,Antigen PDB,Antigen,I-RMSD (Å),ΔASA (Å2),Category,New,Kd (nM),ΔG (kcal/mol),pdb
30,36,4GXU_MN:ABEFCD,4GXV_HL,1F1 antibody,1RUZ_HIJKLM,1918 H1 Hemagglutinin,0.78,1830.0,Rigid,,6.2,-11.2,4gxu


There is no delta G and no temperature value available --> We can use the information from the benchmark paper and add this strucutre to the dataset.v1

In [27]:
pdb_id = pdb_ids[0]
new_entry = {
            "pdb": pdb_id,
            "abdb_file": id2file[pdb_id],
            "antigen_type": "protein",
            "antibody_chains": ["L", "H"],
            "antigen_chains": "K", # manuel extracted
            "affinity_method": "SPR", # manuel extracted
            "temperature": np.nan,
            "calculated_temp": calc_temp({"affinity": 6.24e-09, "delta_g": -11.2}), # manuel extracted
            "affinity": 6.24e-09, # manuel extracted
            "delta_g":  -11.2, # manuel extracted
            "test": True
}
dataset = dataset.append(new_entry, ignore_index=True)

#### Add additional strucutures from AbDb
Add the 6 Strucutures that are also present in AbDb and use the affinitx value of the benchmark dataset

In [28]:
pdb_antigen_chain_names = {
    "1ahw": ["F"],
    "1s78": ["A"],
    "3u7y": ["G"],
    "4gxu": ["K"],
    "5o14": ["A"],
    "5y9j": ["A"],
    "4fp8": ["A"]
}

for i, row in remaining_benchmark[remaining_benchmark["pdb"].isin(overlapping_ids.intersection(abdb_pdb_ids))].iterrows():
    new_entry = {
        "pdb": row["pdb"],
        "abdb_file": id2file[row["pdb"]],
        "antigen_type": "protein",
        "antibody_chains": ["L", "H"],
        "antigen_chains": pdb_antigen_chain_names[row["pdb"]],
        "affinity_method": "-",
        "temperature": np.nan,
        "calculated_temp": calc_temp({"affinity": row["Kd (nM)"] * 1e-9, "delta_g": row["ΔG (kcal/mol)"]}),
        "affinity": row["Kd (nM)"] * 1e-9,
        "delta_g":  row["ΔG (kcal/mol)"],
        "test": True
    }
    dataset = dataset.append(new_entry, ignore_index=True)

#### Find data that is in SAbDab but not in dataset_v1

In [29]:
benchmark_pdb_ids = set([ pdb_id.split("_")[0].lower() for pdb_id in benchmark_summary_df["Complex PDB"].unique() ])
missing = benchmark_pdb_ids - set(dataset["pdb"].unique())

missing_but_sabdab = missing.intersection(sabdab_pdb_ids)
missing_but_sabdab

{'4pou', '5grj', '5hgg', '5kov', '5sv3'}

In [30]:
missing_sabdab_and_redundant = missing_but_sabdab.intersection(all_ids)
missing_sabdab_and_redundant

set()

Seems like there are 8 cases that are not present in the overlap between AbDb and SAbDab but have affinity values in the benchmark dataset.

In [31]:
benchmark_pdb_ids = set([ pdb_id.split("_")[0].lower() for pdb_id in benchmark_summary_df["Complex PDB"].unique() ])
overlapping_ids = benchmark_pdb_ids.intersection(set(dataset["pdb"].unique()))
print("There are still {} cases not in dataset_v1".format(len(benchmark_summary_df) - len(overlapping_ids)))

There are still 12 cases not in dataset_v1


In [32]:
remaining_benchmark = remaining_benchmark[~remaining_benchmark["pdb"].isin(overlapping_ids)]
print("Remaining Benchmark Entries", len(remaining_benchmark.isin(overlapping_ids)))

Remaining Benchmark Entries 12


In [33]:
remaining_benchmark

Unnamed: 0,index,Complex PDB,Antibody PDB,Antibody,Antigen PDB,Antigen,I-RMSD (Å),ΔASA (Å2),Category,New,Kd (nM),ΔG (kcal/mol),pdb
10,11,2I25_N:L,2I24_N,Shark single domain antigen receptor,3LZT,Lysozyme,1.21,1425.0,Rigid,,,-12.28,2i25
32,39,4POU_B:A,4POY_A,VHHmetal,6ETL_A,bovine RNase A,1.83,1313.3,Medium,X,157.0,-9.28,4pou
33,40,4Y7M_A:C,4QGY_A,nb25,4Y7L_A,E coli TssM CTD,0.84,1102.5,Rigid,X,1.61,-11.79,4y7m
35,42,5CBA_AB:E,5C2B_HL,3B4 scFv,4ZAI_A,CXCL13,1.49,1790.2,Medium,X,0.3715,-13.38,5cba
36,44,5GRJ_HL:A,4NKI_HL,avelumab scFv (Bavencio),4Z18_A,PD-L1,1.14,1752.5,Rigid,X,0.0421,-14.15,5grj
37,45,5HGG_T:A,5HDO_A,Nb4,4FUD_A,uPA,0.84,1969.0,Medium,X,0.054,-14.01,5hgg
39,48,5KOV_C:AB,5I30_HL,PL-2 scFv,5KOU_AB,astrovirus 2 capsid protein spike domain,1.69,1735.0,Medium,X,1.87,-11.91,5kov
41,51,5SV3_C:D,5SV4_A,A3C8,1IFT_A,Ricin,0.57,1293.6,Rigid,X,0.0627,-13.92,5sv3
42,52,5VNW_D:A,5VNV_A,Nb.b201,1E78_A,human serum albumin,1.49,966.8,Medium,X,430.0,-8.68,5vnw
48,63,6CWG_B:A,6CWK_A,A9,1IFT_A,Ricin,0.76,1151.2,Rigid,X,0.1,-13.64,6cwg


In [34]:
print("There are in total {} complexes in the testset that are taken from the benchmark dataset".format(dataset["test"].sum()))

There are in total 40 complexes in the testset that are taken from the benchmark dataset


### Validation Splits

Split the train data randomly in 3 parts that can each be used for validation during model training and for hyperparameter tuning.

In [35]:
dataset["validation"] = 0
dataset = dataset.sort_values(by="test")
dataset.reset_index(inplace=True)

In [36]:
total_train_data = len(dataset[dataset["test"] == False])
print("There are in total {} complexes in available for training".format(total_train_data))

There are in total 371 complexes in available for training


In [37]:
np.random.seed(123)
indices = np.arange(total_train_data)
np.random.shuffle(indices)
val_indices = np.split(indices, [124, 248])

In [38]:
for i, val_idx in enumerate(val_indices):
    i += 1
    dataset.loc[val_idx, "validation"] = i

In [39]:
dataset.head()

Unnamed: 0,index,pdb,abdb_file,antigen_type,antibody_chains,antigen_chains,affinity_method,temperature,calculated_temp,affinity,delta_g,test,validation
0,0,2r56,2R56_1.pdb,protein,"[I, M, H, L]","[B, A]",SPR,,25.0,1.3e-09,-12.12,False,2
1,271,3l5x,3L5X_1.pdb,protein,"[H, L]",[A],SPR,25.0,25.0,2.69e-10,-13.05,False,3
2,270,3sdy,3SDY_1.pdb,protein | protein,"[H, L]","[B, A]",Other,30.0,30.0,1e-09,-12.48,False,3
3,269,5i5k,5I5K_1.pdb,protein,"[X, Y, H, L]","[A, B]",SPR,,25.0,1.76e-11,-14.671746,False,3
4,268,3kr3,3KR3_1.pdb,protein,"[H, L]",[D],SPR,25.0,25.0,4.9e-11,-14.06,False,2


In [40]:
dataset.to_csv(os.path.join(config["DATA"]["path"], config["DATA"]["Dataset_v1"]["folder_path"], config["DATA"]["Dataset_v1"]["summary"]), index=False)