# Dataset Version 1

This notebook generates a dataframe describing the first version of the dataset:

- Overlap of AbDb data (filtered and enriched abag data) with SAbDab data (affinity values for conformations)

In [1]:
# Enable the `widgetsnbextension` Jupyter notebook extension (environment setup).
# NOTE(review): presumably required for the tqdm notebook progress bar set up in the next cell - confirm.
!jupyter nbextension enable --py widgetsnbextension

Enabling notebook extension jupyter-js-widgets/extension...
      - Validating: [32mOK[0m


In [2]:
import os
import pandas as pd
import warnings
# `tqdm.tqdm_notebook` is deprecated in favour of `tqdm.notebook.tqdm`; the old
# name is kept as an alias so any existing reference to `tqdm_notebook` still works.
from tqdm.notebook import tqdm as tqdm_notebook
# Register `progress_apply` on pandas objects. `pandas()` is a classmethod, so
# no (empty) progress bar instance has to be created just to call it.
tqdm_notebook.pandas()

# NOTE(review): this silences *every* warning for the rest of the notebook,
# including pandas SettingWithCopy warnings - consider narrowing the filter.
warnings.filterwarnings("ignore")
from abag_affinity.utils.config import read_yaml, get_data_paths

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  tqdm_notebook().pandas()


0it [00:00, ?it/s]

In [3]:
# Project configuration (paths to all data sources are looked up through it).
config = read_yaml("../abag_affinity/config.yaml")

# SAbDab: tab-separated summary table.
summary_path, pdb_path = get_data_paths(config, "SAbDab")
summary_df = pd.read_csv(summary_path, sep="\t")

# AbDb: the PDB id is the part of each directory entry before the first
# underscore; it is lower-cased before being collected into a set.
abdb_summary_path, abdb_pdb_path = get_data_paths(config, "AbDb")
abdb_pdb_ids = {
    entry.split("_")[0].lower() for entry in os.listdir(abdb_pdb_path)
}

In [4]:
# get overlap using pdb ids

# PDB ids listed in the SAbDab summary, then those also present in AbDb.
sabdab_pdb_ids = set(pd.unique(summary_df["pdb"]))
overlapping_ids = abdb_pdb_ids & sabdab_pdb_ids

In [5]:
# Keep only SAbDab rows whose PDB id also exists in AbDb; copy so that later
# column assignments do not touch `summary_df`.
dataset = summary_df.loc[summary_df["pdb"].isin(overlapping_ids)].copy()

In [6]:
import math
def get_chains(row, frame=None):
    """Collect the chain ids of all entries that share this row's PDB id.

    Args:
        row: Mapping or pandas Series providing a "pdb" entry.
        frame: DataFrame with the columns "pdb", "Lchain", "Hchain" and
            "antigen_chain" in which matching entries are looked up. When
            omitted, the notebook-level `dataset` frame is used, so existing
            `get_chains(row)` calls keep working.

    Returns:
        Tuple `(light_chains, heavy_chains, antigen_chains)` of lists, in the
        row order of `frame`. Light/heavy values are taken over as-is (missing
        values included). Antigen entries that are not strings are skipped;
        entries containing "|" are split on it and each part is stripped.
    """
    if frame is None:
        # Resolved at call time so the current notebook-level frame is used.
        frame = dataset

    same_pdb = frame[frame["pdb"] == row["pdb"]]

    light_chains = same_pdb["Lchain"].tolist()
    heavy_chains = same_pdb["Hchain"].tolist()

    antigen_chains = []
    for entry in same_pdb["antigen_chain"]:
        if not isinstance(entry, str):
            continue  # no antigen chain annotated for this entry (e.g. NaN)
        if "|" in entry:
            antigen_chains.extend(part.strip() for part in entry.split("|"))
        else:
            antigen_chains.append(entry)

    return light_chains, heavy_chains, antigen_chains

In [7]:
# `get_chains` yields one (light, heavy, antigen) tuple per row; zip(*...)
# transposes those tuples into three per-column sequences.
(
    dataset["light_chains"],
    dataset["heavy_chains"],
    dataset["antigen_chains"],
) = zip(*dataset.apply(get_chains, axis=1))

In [8]:
import math
import numpy as np

# Molar gas constant R in J/(mol*K). (The previously noted 0.0821 would be R in
# L*atm/(mol*K); it is not used here.)
gas_constant = 8.31446261815324
JOULE_PER_KCAL = 4184   # delta_g values are converted between kcal and joule
KELVIN_OFFSET = 273.15  # 0 degrees Celsius in kelvin


def calc_temp(row):
    """Back-calculate the temperature (degrees Celsius) from delta G and affinity.

    Solves delta_g = R * T * ln(affinity) for T.

    Args:
        row: Mapping or pandas Series with "affinity" and "delta_g" (kcal).
            NOTE(review): "affinity" is presumably a dissociation constant Kd
            in molar units - confirm against the SAbDab summary.

    Returns:
        The temperature rounded to whole degrees Celsius, or None when it
        cannot be derived: missing affinity or delta_g, a non-positive
        affinity (logarithm undefined), or an affinity of exactly 1
        (ln(1) == 0 would be a division by zero).
    """
    kd = row["affinity"]
    delta_g = row["delta_g"] * JOULE_PER_KCAL  # convert kcal to joule
    if math.isnan(kd) or math.isnan(delta_g) or kd <= 0 or kd == 1:
        return None

    kelvin = delta_g / (gas_constant * np.log(kd))
    return round(kelvin - KELVIN_OFFSET)  # convert kelvin to celsius


def calc_delta_g(row):
    """Compute delta G in kcal as R * T * ln(affinity).

    Args:
        row: Mapping or pandas Series with "temperature_kelvin" and "affinity".

    Returns:
        delta G converted from joule to kcal.
    """
    delta_g = gas_constant * row["temperature_kelvin"] * np.log(row["affinity"])
    return delta_g / JOULE_PER_KCAL  # convert to kcal

In [9]:
# Derive the measurement temperature for every row from delta_g and affinity.
dataset["calculated_temp"] = dataset.apply(calc_temp, axis=1)
# Keep only rows for which a temperature could be derived. The explicit copy
# makes `dataset` an independent frame, so the column assignments in later
# cells do not operate on a slice of the previous frame.
dataset = dataset[dataset["calculated_temp"].notnull()].copy()

In [10]:
# add pdb file name from AbDb

abdb_pdb_path = os.path.join(
    config["DATA"]["path"],
    config["DATA"]["AbDb"]["folder_path"],
    config["DATA"]["AbDb"]["pdb_path"],
)

pdb_files = os.listdir(abdb_pdb_path)

# Map lower-cased PDB id (file name part before the first underscore) to a file
# name. Several files can share one id and `os.listdir` returns entries in
# arbitrary order, so iterate in reverse-sorted order: the last assignment per
# id wins, which is always the lexicographically smallest file name.
id2file = {
    file.split("_")[0].lower(): file
    for file in sorted(pdb_files, reverse=True)
}

In [11]:
# Look up the AbDb file name for every PDB id (raises KeyError for unknown ids).
dataset["abdb_file"] = dataset["pdb"].apply(id2file.__getitem__)

In [12]:
# clean dataset
# One row per PDB id (the first occurrence is kept), with a fresh 0..n-1 index.
dataset = dataset.drop_duplicates("pdb").reset_index(drop=True)
# Restrict to the columns that make up the published dataset, in this order.
dataset = dataset[
    [
        "pdb",
        "abdb_file",
        "antigen_type",
        "light_chains",
        "heavy_chains",
        "antigen_chains",
        "affinity_method",
        "temperature",
        "calculated_temp",
        "affinity",
        "delta_g",
    ]
]

In [13]:
# Preview the first rows of the cleaned dataset.
dataset.head()

Unnamed: 0,pdb,abdb_file,antigen_type,light_chains,heavy_chains,antigen_chains,affinity_method,temperature,calculated_temp,affinity,delta_g
0,2r56,2R56_1.pdb,protein,"[M, L]","[I, H]","[B, A]",SPR,,25.0,1.3e-09,-12.12
1,5wux,5WUX_1.pdb,protein,"[D, B, L]","[C, A, H]","[G, F, E]",SPR,,25.0,2.74e-11,-14.409486
2,4xi5,4XI5_1.pdb,protein | protein,[C],[D],"[B, A]",SPR,,25.0,4.9e-10,-12.700845
3,4k9e,4K9E_1.pdb,protein,[L],[H],[C],SPR,,25.0,6.4e-12,-15.271103
4,4k94,4K94_1.pdb,carbohydrate | protein,[L],[H],"[C, C]",TBD,SPR,25.0,6.3e-10,-12.551945


In [14]:
# Write the dataset summary to the location configured for Dataset_v1
# (the row index is not written).
dataset.to_csv(
    os.path.join(
        config["DATA"]["path"],
        config["DATA"]["Dataset_v1"]["folder_path"],
        config["DATA"]["Dataset_v1"]["summary"],
    ),
    index=False,
)

## Data Analysis

#### Chain ID analysis

In [16]:
from collections import Counter
# Flatten the per-complex antigen chain lists and count each chain id.
all_antigen_chains = [
    chain_id
    for chains in dataset["antigen_chains"]
    for chain_id in chains
]
print(Counter(all_antigen_chains))

Counter({'A': 164, 'C': 78, 'B': 66, 'P': 49, 'E': 46, 'D': 35, 'F': 33, 'G': 28, 'I': 25, 'J': 18, 'S': 18, 'Q': 16, 'X': 13, 'Y': 12, 'V': 11, 'K': 10, 'R': 10, 'O': 7, 'T': 6, 'N': 6, 'M': 6, 'W': 5, 'U': 5, 'L': 4, 'c': 4, 'a': 3, 'e': 3, 'g': 3, 'H': 2, 's': 2, 'u': 2, 'j': 2, 'h': 2, 'q': 2, 'm': 2, 'o': 2, 'Z': 2, 'w': 1, 'x': 1, 'i': 1, 'k': 1})


In [17]:
# Flatten the per-complex heavy chain lists and count each chain id.
# Stored under its own name so the antigen chain list is not overwritten.
all_heavy_chains = [
    chain_id
    for chains in dataset["heavy_chains"]
    for chain_id in chains
]
print(Counter(all_heavy_chains))

Counter({'H': 299, 'B': 64, 'A': 52, 'I': 38, 'C': 38, 'D': 33, 'E': 24, 'J': 20, 'G': 15, 'K': 14, 'M': 12, 'F': 11, 'Q': 8, 'O': 6, 'N': 5, 'X': 4, 'W': 4, 'T': 4, 'L': 3, 'R': 3, 'Z': 3, 'P': 3, 'S': 3, 'U': 3, 'h': 1, 'Y': 1})


In [18]:
# Flatten the per-complex light chain lists and count each chain id.
# Stored under its own name so the antigen chain list is not overwritten.
all_light_chains = [
    chain_id
    for chains in dataset["light_chains"]
    for chain_id in chains
]
print(Counter(all_light_chains))

Counter({'L': 301, 'B': 63, 'A': 57, 'M': 37, 'D': 36, 'C': 32, 'F': 19, 'N': 18, 'E': 16, 'K': 14, 'J': 11, 'I': 10, 'H': 9, 'P': 7, 'Y': 6, 'G': 6, 'R': 5, 'V': 4, 'Q': 4, 'X': 4, 'O': 4, 'T': 3, 'S': 2, 'U': 1, 'W': 1, 'l': 1})
