# AbDb Data Analysis

The metadata files have been downloaded from http://www.abybank.org/abdb/ using the "Complete Antibodies - Martin Numbering - Non Redundant (NR)" Dataset. (10.05.2022)

In [1]:
import pandas as pd
import yaml
import os
from ast import literal_eval

In [2]:
from abag_affinity.utils.config import read_config, get_data_paths, get_resources_paths

config = read_config("../../config.yaml")
_, pdb_path = get_resources_paths(config, "AbDb")

In [3]:
abdb_df = pd.DataFrame()
abdb_df = pd.DataFrame()

abdb_df.head()

In [4]:
pdb_files = os.listdir(pdb_path)

print("Number of PDB Files >>>", len(pdb_files))

Number of PDB Files >>> 3446


In [5]:
if len(abdb_df) == 0:
    abdb_df["abdb_filename"] = pdb_files
    pdb_ids = [pdb_file.split("_")[0].lower() for pdb_file in pdb_files]

    abdb_df["pdb"] = pdb_ids

In [6]:
abdb_df.head()

Unnamed: 0,abdb_filename,pdb
0,6N5E_1.pdb,6n5e
1,6CUE_5.pdb,6cue
2,4TTD_1.pdb,4ttd
3,5kvd.pdb,5kvd.pdb
4,1XCQ_3.pdb,1xcq


## Chain Protein Connection Analysis

Check how to identify if a chain belongs to antibody or antigen:

- Check if in every conformation a light (L) and heavy (H) chain exist that belong to the antibody
- Check if there are any specific naming conventions for antigen chains

In [7]:
import warnings
warnings.filterwarnings("ignore")
from abag_affinity.utils.pdb_reader import read_file

def get_chain_ids(row):
    pdb_id = row["pdb"]
    path = os.path.join(pdb_path, row["abdb_filename"])
    
    structure, header = read_file(pdb_id, path)
    
    chain_ids = []
    for chain in structure.get_chains():
        chain_ids.append(chain.id)
    return chain_ids

def check_LH_chains(row):
    if "L" in row["chains"] and "H" in row["chains"]:
        return True
    else:
        return False

In [8]:
from tqdm.auto import tqdm
tqdm.pandas()

if "antibody_chains" not in abdb_df.columns and "chains" not in abdb_df.columns:
    abdb_df["chains"] = abdb_df.apply(lambda row: get_chain_ids(row), axis=1)
if "antibody_chains" not in abdb_df.columns and "LH_exist" not in abdb_df.columns:
    abdb_df["LH_exist"] = abdb_df.apply(lambda row: check_LH_chains(row), axis=1)
    print("PDB Files where chain L and H do not exist >>> {}".format(abdb_df["LH_exist"].count() - abdb_df["LH_exist"].sum()))

PDB Files where chain L and H do not exist >>> 0


In [9]:
if "antibody_chains" not in abdb_df.columns:
    abdb_df["antibody_chains"] = [["L", "H"] for i in abdb_df.index]

In [10]:
def get_antigen_chains(row):
    antigen_chains = set(row["chains"]) - set(row["antibody_chains"])
    return list(antigen_chains)

if "antigen_chains" not in abdb_df.columns:
    abdb_df["antigen_chains"] = abdb_df.apply(lambda row: get_antigen_chains(row), axis=1)

In [11]:
from Bio import PDB

def get_antigen_size(row):
    antigen_chains = row["antigen_chains"]
    if isinstance(antigen_chains, str):
        antigen_chains = literal_eval(antigen_chains)
    pdb_id = row["pdb"]
    path = os.path.join(pdb_path, row["abdb_filename"])
    
    structure, header = read_file(pdb_id, path)
    
    total_residues = 0
    total_non_residues = 0

    for chain_id in antigen_chains:
        chain = structure[0][chain_id]
        for residue in chain.get_residues():
            if PDB.is_aa(residue):
                total_residues += 1
            else:
                total_non_residues
                
    row["antigen_length"] = total_residues
    row["antigen_length_no_residues"] = total_non_residues

    return row


if "antigen_length" not in abdb_df.columns:
    abdb_df = abdb_df.apply(lambda row: get_antigen_size(row), axis=1)

In [13]:
abdb_df = abdb_df[["abdb_filename", "pdb", "antibody_chains", "antigen_chains", "antigen_length"]]

abdb_df.head()

Unnamed: 0,abdb_filename,pdb,antibody_chains,antigen_chains,antigen_length
0,6N5E_1.pdb,6n5e,"[L, H]",[B],282
1,6CUE_5.pdb,6cue,"[L, H]","[2, 1]",585
2,4TTD_1.pdb,4ttd,"[L, H]",[A],127
3,5kvd.pdb,5kvd.pdb,"[L, H]",[E],108
4,1XCQ_3.pdb,1xcq,"[L, H]",[P],24
