# Combination of AbDb and SAbDab for affinity values and redundancy check

Additionally, the PDB IDs and affinity values are compared to the Antibody Benchmark.

In [1]:
import pandas as pd
import yaml
import os
from tqdm import tqdm
import numpy as np

In [2]:
import warnings
# NOTE(review): this silences *every* warning for the rest of the notebook,
# including pandas warnings raised by later cells. Consider narrowing the filter
# to specific categories once the notebook is stable.
warnings.filterwarnings("ignore")

# Project helpers for reading the YAML config and resolving resource locations.
from abag_affinity.utils.config import read_config, get_data_paths, get_resources_paths

In [3]:
# Load the project configuration (path is relative to this notebook's directory).
config = read_config("../../config.yaml")
# Each call returns a (summary file path, PDB directory path) pair for the named
# resource — presumably resolved from the config; verify against get_resources_paths.
sabdab_summary_path, sabdab_pdb_path = get_resources_paths(config, "SAbDab")
abdb_summary_path, abdb_pdb_path = get_resources_paths(config, "AbDb")

In [4]:
# Every entry in the AbDb structure directory contributes the part of its name
# before the first underscore, lower-cased, as a PDB ID.
abdb_filenames = os.listdir(abdb_pdb_path)
abdb_pdb_ids = {filename.split("_")[0].lower() for filename in abdb_filenames}

In [5]:
# The SAbDab summary is a tab-separated table.
sabdab_df = pd.read_csv(sabdab_summary_path, sep="\t")
# Optional restriction to protein/peptide antigens (currently disabled):
#sabdab_df = sabdab_df[sabdab_df["antigen_type"].isin(["protein", "peptide"])]
unique_sabdab_ids = sabdab_df["pdb"].unique()
sabdab_pdb_ids = set(unique_sabdab_ids)

In [6]:
# Report how many distinct PDB IDs each database contributes.
abdb_id_count = len(abdb_pdb_ids)
sabdab_id_count = len(sabdab_pdb_ids)
print("Number of Total PDB IDs in AbDb >>> {}".format(abdb_id_count))
print("Number of Total PDB IDs in SAbDab >>> {}".format(sabdab_id_count))

Number of Total PDB IDs in AbDb >>> 2140
Number of Total PDB IDs in SAbDab >>> 746


In [7]:
# PDB IDs that occur in both AbDb and SAbDab.
overlapping_ids = abdb_pdb_ids & sabdab_pdb_ids
overlap_count = len(overlapping_ids)
print("Number of overlapping PDB IDs >>> {}".format(overlap_count))

Number of overlapping PDB IDs >>> 476


## Comparison with Antibody Benchmark

In [8]:
benchmark_summary_path, benchmark_pdb_path = get_resources_paths(config, "AntibodyBenchmark")

# Cells holding a single blank are treated as missing values.
benchmark_df = pd.read_excel(benchmark_summary_path).replace(" ", np.nan)

# Keep only the cases for which both ΔG and Kd are reported.
has_delta_g = benchmark_df["ΔG (kcal/mol)"].notnull()
has_kd = benchmark_df["Kd (nM)"].notnull()
benchmark_df = benchmark_df[has_delta_g & has_kd]
print("There are {} ab-ag cases with affinity".format(len(benchmark_df)))

# The PDB ID is the part of "Complex PDB" before the first underscore, lower-cased.
benchmark_pdb_ids = {
    complex_id.split("_")[0].lower()
    for complex_id in benchmark_df["Complex PDB"].unique()
}

There are 42 ab-ag cases with affinity


In [9]:
# Benchmark cases whose PDB ID is also present in AbDb.
print("Number of overlapping PDB IDs with AbDb >>> {}".format(len(abdb_pdb_ids & benchmark_pdb_ids)))

Number of overlapping PDB IDs with AbDb >>> 31


In [10]:
# Benchmark cases whose PDB ID is also present in SAbDab.
print("Number of overlapping PDB IDs with SAbDab >>> {}".format(len(sabdab_pdb_ids & benchmark_pdb_ids)))

Number of overlapping PDB IDs with SAbDab >>> 31


In [11]:
# Benchmark cases whose PDB ID is present in AbDb and SAbDab at the same time.
print("Number of overlapping PDB IDs with both >>> {}".format(len(overlapping_ids & benchmark_pdb_ids)))

Number of overlapping PDB IDs with both >>> 26


## Comparison of Affinity Values between SAbDab and Antibody Benchmark

In [12]:
# Derive the lower-case PDB ID (text before the first underscore of "Complex PDB")
# as the merge key. `assign` builds a new frame rather than writing a column into
# `benchmark_df`, which is a boolean-filtered slice: this avoids chained assignment
# and keeps the cell idempotent when it is re-run.
benchmark_df = benchmark_df.assign(
    pdb=benchmark_df["Complex PDB"].apply(lambda complex_id: complex_id.split("_")[0].lower())
)

In [13]:
# Inner join on the PDB ID: one output row per matching (SAbDab row, benchmark row) pair.
overlapping_data = pd.merge(sabdab_df, benchmark_df, how="inner", on="pdb")
overlapping_data.head()

Unnamed: 0,pdb,Hchain,Lchain,model,antigen_chain,antigen_type,antigen_het_name,antigen_name,short_header,date,...,Antibody PDB,Antibody,Antigen PDB,Antigen,I-RMSD (Å),ΔASA (Å2),Category,New,Kd (nM),ΔG (kcal/mol)
0,5wux,C,D,0,G,protein,,tumor necrosis factor alpha,IMMUNE SYSTEM,12/21/16,...,5WUV_HL,certolizumab (Cimzia),1TNF_CBA,TNFalpha,0.8,2072.8,Rigid,X,0.0274,-14.41
1,5wux,A,B,0,F,protein,,tumor necrosis factor alpha,IMMUNE SYSTEM,12/21/16,...,5WUV_HL,certolizumab (Cimzia),1TNF_CBA,TNFalpha,0.8,2072.8,Rigid,X,0.0274,-14.41
2,5wux,H,L,0,E,protein,,tumor necrosis factor alpha,IMMUNE SYSTEM,12/21/16,...,5WUV_HL,certolizumab (Cimzia),1TNF_CBA,TNFalpha,0.8,2072.8,Rigid,X,0.0274,-14.41
3,4gxu,M,N,0,A,protein,,hemagglutinin ha1 chain,VIRAL PROTEIN/IMMUNE SYSTEM,09/04/12,...,4GXV_HL,1F1 antibody,1RUZ_HIJKLM,1918 H1 Hemagglutinin,0.78,1830.0,Rigid,,6.2,-11.2
4,4gxu,W,X,0,K,protein,,hemagglutinin ha1 chain,VIRAL PROTEIN/IMMUNE SYSTEM,09/04/12,...,4GXV_HL,1F1 antibody,1RUZ_HIJKLM,1918 H1 Hemagglutinin,0.78,1830.0,Rigid,,6.2,-11.2


In [14]:
# Distinct PDB IDs present in the merged SAbDab/benchmark table.
merged_pdb_column = overlapping_data["pdb"]
merged_pdb_column.unique()

array(['5wux', '4gxu', '4etq', '5hys', '3hi6', '3wd5', '4dn4', '4pou',
       '5kov', '3g6d', '2fjg', '5sv3', '6b0s', '3mj9', '3v6z', '5whk',
       '5grj', '4fqi', '2w9e', '2dd8', '5hgg', '3l5w', '5c7x', '4m5z',
       '3eoa', '3rvw', '4g6j', '4g6m', '2vxt', '6a0z', '3mxw'],
      dtype=object)

In [15]:
# Pearson correlation between the SAbDab affinity column and the benchmark Kd column.
benchmark_kd_nm = overlapping_data["Kd (nM)"].astype(float)
overlapping_data["affinity"].corr(benchmark_kd_nm)

0.9999951144763157

## Analysis of overlapping Complexes

In [16]:
#sabdab_df.drop_duplicates('pdb', inplace=True)
overlapping_data = sabdab_df[sabdab_df["pdb"].isin(overlapping_ids)].copy()
overlapping_data = overlapping_data.reset_index().drop(["index"], axis=1)
overlapping_data["pdb_filename"] = overlapping_data["pdb"].apply(lambda x: x.upper() + "_1.pdb")
overlapping_data.head()

Unnamed: 0,pdb,Hchain,Lchain,model,antigen_chain,antigen_type,antigen_het_name,antigen_name,short_header,date,...,engineered,heavy_subclass,light_subclass,light_ctype,affinity,delta_g,affinity_method,temperature,pmid,pdb_filename
0,1hh9,B,A,0,C,peptide,,pep-2,IMMUNE SYSTEM/PEPTIDE,12/21/00,...,False,IGHV1,IGKV14,Kappa,1e-05,,Other,4.0,10990450,1HH9_1.pdb
1,2r56,I,M,0,B,protein,,beta-lactoglobulin,IMMUNE SYSTEM,09/03/07,...,False,IGHV3,IGKV1,Kappa,1.3e-09,-12.12,SPR,,17997967,2R56_1.pdb
2,2r56,H,L,0,A,protein,,beta-lactoglobulin,IMMUNE SYSTEM,09/03/07,...,False,IGHV3,IGKV1,Kappa,1.3e-09,-12.12,SPR,,17997967,2R56_1.pdb
3,5wux,C,D,0,G,protein,,tumor necrosis factor alpha,IMMUNE SYSTEM,12/21/16,...,True,IGHV3,IGKV1,Kappa,2.74e-11,-14.409486,SPR,,TBD,5WUX_1.pdb
4,5wux,A,B,0,F,protein,,tumor necrosis factor alpha,IMMUNE SYSTEM,12/21/16,...,True,IGHV3,IGKV1,Kappa,2.74e-11,-14.409486,SPR,,TBD,5WUX_1.pdb


In [17]:
# Rows without an affinity value (expected to be empty).
affinity_missing = overlapping_data["affinity"].isnull()
overlapping_data[affinity_missing]

Unnamed: 0,pdb,Hchain,Lchain,model,antigen_chain,antigen_type,antigen_het_name,antigen_name,short_header,date,...,engineered,heavy_subclass,light_subclass,light_ctype,affinity,delta_g,affinity_method,temperature,pmid,pdb_filename


## Calculate the temperature of the affinity measurement experiment

Explore how the reported temperature values relate to the $K_d$ and $\Delta G$ values via $\Delta G = R\,T\,\ln(K_d)$.

In [18]:
import numpy as np

gas_constant = 8.31446261815324  # molar gas constant R in J/(mol*K)
JOULE_PER_KCAL = 4184  # factor used to convert between kcal and joule
KELVIN_OFFSET = 273.15  # offset between the Kelvin and Celsius scales


def calc_temp(row):
    """Return the temperature in degrees Celsius implied by a row's Kd and ΔG.

    Solves ΔG = R * T * ln(Kd) for T.

    Args:
        row: Mapping (e.g. a DataFrame row) with the keys "affinity"
            (Kd, presumably in mol/l — TODO confirm) and "delta_g" (kcal/mol).

    Returns:
        The temperature in degrees Celsius.
    """
    energy_joule = row["delta_g"] * JOULE_PER_KCAL
    temperature_kelvin = energy_joule / (gas_constant * np.log(row["affinity"]))
    return temperature_kelvin - KELVIN_OFFSET


def calc_delta_g(row):
    """Return ΔG in kcal/mol computed as R * T * ln(Kd).

    Args:
        row: Mapping (e.g. a DataFrame row) with the keys "temperature_kelvin"
            and "affinity" (Kd, presumably in mol/l — TODO confirm).

    Returns:
        ΔG converted from joule to kcal.
    """
    energy_joule = gas_constant * row["temperature_kelvin"] * np.log(row["affinity"])
    return energy_joule / JOULE_PER_KCAL

In [19]:
# Parse the temperature column numerically. Non-numeric placeholders (e.g. "Unknown",
# "SPR", "BLI") become NaN instead of having to be listed one by one, and fractional
# temperatures are kept rather than truncated by an integer cast.
temperature_celsius = pd.to_numeric(overlapping_data["temperature"], errors="coerce")
has_measurement = temperature_celsius.notnull() & overlapping_data["affinity"].notnull()

temperature_data = overlapping_data[has_measurement].copy()
temperature_data["temperature_kelvin"] = temperature_celsius[has_measurement] + 273.15
temperature_data["affinity"] = temperature_data["affinity"].astype(float)

# Recompute ΔG from Kd and the reported temperature so it can be compared with
# the "delta_g" column.
temperature_data["calc_dg"] = temperature_data.apply(calc_delta_g, axis=1)

temperature_data[["temperature", "affinity", "calc_dg", "delta_g"]]

Unnamed: 0,temperature,affinity,calc_dg,delta_g
0,4,1.000000e-05,-6.340786,
9,25,1.000000e-10,-13.642470,-13.63
32,25,2.860000e-09,-11.655627,-11.45
53,25,9.100000e-08,-9.605607,-9.60
54,25,9.100000e-08,-9.605607,-9.60
...,...,...,...,...
818,25,1.300000e-09,-12.122776,-12.12
822,25,4.100000e-08,-10.077988,-10.07
823,25,1.420000e-08,-10.706217,-10.70
832,22,8.900000e-09,-10.872509,-10.87


In [20]:
# Pearson correlation between the recomputed and the reported ΔG values.
reported_delta_g = temperature_data["delta_g"]
temperature_data["calc_dg"].corr(reported_delta_g)

0.9996887835584191

In [21]:
# Only rows with both Kd and ΔG allow the temperature to be back-calculated.
has_kd_and_delta_g = overlapping_data["affinity"].notnull() & overlapping_data["delta_g"].notnull()
affinity_data = overlapping_data[has_kd_and_delta_g].copy()

# Back-calculate the temperature per row and round to whole degrees Celsius.
implied_temperature = affinity_data.apply(calc_temp, axis=1)
affinity_data["calculated_temp"] = implied_temperature.round(0).astype(int)

In [22]:
# Number of rows per back-calculated temperature.
calculated_temp_counts = affinity_data.value_counts(subset="calculated_temp")
calculated_temp_counts

calculated_temp
25    755
37     24
20     11
23      5
16      4
30      4
28      3
4       2
22      2
29      2
24      1
27      1
43      1
dtype: int64

## Chain Analysis

In [23]:
from abag_affinity.utils.pdb_reader import read_file

# Use the first row of one example complex and load its structure file from the AbDb directory.
example_rows = overlapping_data[overlapping_data["pdb"] == "2r56"]
row = example_rows.iloc[0]
print("Getting Strcuture for {}".format(row["pdb"]))
path = os.path.join(abdb_pdb_path, row["pdb_filename"])
structure, header = read_file(row["pdb"], path)

Getting Strcuture for 2r56


In [24]:
# Affinity value stored for the example row.
row.loc["affinity"]

1.3e-09

In [25]:
# Collect all chains of the loaded structure.
chain_list = [chain for chain in structure.get_chains()]
chain_list

[<Chain id=L>, <Chain id=H>, <Chain id=A>]

In [26]:
from Bio.PDB import Selection

# Alternative way to list the chains: unfold the structure down to level 'C'.
target_level = 'C'
Selection.unfold_entities(structure, target_level)

[<Chain id=L>, <Chain id=H>, <Chain id=A>]

In [27]:
# Residues of the first chain. NOTE(review): the variable name is misspelt
# ("resiude"); it is kept because later cells may refer to it.
resiude_list = [residue for residue in chain_list[0].get_residues()]

In [28]:
from Bio.SeqUtils import seq1


def _chain_sequence(chain):
    """Return the sequence of a chain as a one-letter-code string."""
    three_letter_codes = "".join(residue.resname for residue in chain)
    return seq1(three_letter_codes)


# Map every chain ID of the structure to its one-letter sequence.
chains = {chain.id: _chain_sequence(chain) for chain in structure.get_chains()}
chains

{'L': 'DIVMTQSPSSLSASVGDRVTITCRASQGISSRLAWYQQKPGKAPKLLIYAASSLQSGVPSRFSGSGSGTEFTLTISSLQPEDFATYYCQQYHSYPWTFGQGTKLEIKRTV',
 'H': 'QVSLRESGGGLVQPGRSLRLSCTASGFTFRHHGMTWVRQAPGKGLEWVASLSGSGTKTHFADSVKGRFTISRDNSNNTLYLQMDNVRDEDTAIYYCAKAKRVGATGYFDLWGRGTLVTVSS',
 'A': 'TQTMKGLDIQKVAGTWYSLAMAASDISLLDAQSAPLRVYVEELKPTPEGDLEILLQKWENGECAQKKIIAEKTKIPAVFKIDALNENKVLVLDTDYKKYLLFCMENSAEPEQSLACQCLVRTPEVDDEALEKFDKALKALPMHIRLSFNPTQLEEQCHI'}