This notebook performs a lookup on a set of therapeutic antibodies to see whether antibody-antigen complex structures exist for them in the NCBI Structure database. It is necessary to obtain structures for both the antibody and the associated antigen to visualize binding.

# Prepare Environment

In [1]:
# Import libraries
import os
import re
import gdown
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from google.colab import files

# Download the latest TheraSAbDab Database

In [2]:
# Download the latest TheraSAbDab database and save it next to the notebook.
url_TheraSAbDab = 'https://opig.stats.ox.ac.uk/webapps/sabdab-sabpred/static/downloads/TheraSAbDab_SeqStruc_OnlineDownload.xlsx'

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops us from silently saving an HTML error page as .xlsx.
r = requests.get(url_TheraSAbDab, timeout=60)
r.raise_for_status()

# Local file name = last path component of the download URL.
fp_TheraSAbDab = url_TheraSAbDab.split('/')[-1]
with open(fp_TheraSAbDab, 'wb') as f:
    f.write(r.content)

In [3]:
# Filter out Therapeutics without any structure
df = pd.read_excel(fp_TheraSAbDab)

cols = ['100% SI Structure', '99% SI Structure', '95-98% SI Structure']

# A cell carries no structure when it is 'None' or a ';'-joined run of 'None'
# (e.g. 'None;None'). Testing each cell on its own also drops rows whose three
# columns mix both spellings, which two separate all-or-nothing comparisons
# against ['None'] * 3 and ['None;None'] * 3 would let through.
is_empty = df[cols].apply(lambda s: s.astype(str).str.fullmatch(r'None(?:;None)*'))

# Keep rows with at least one non-empty structure column; .copy() so the
# column assignments in later cells write to an independent frame, not a slice.
df = df[~is_empty.all(axis=1)].copy()

# Identify structures with antigen


In [4]:
# For each structure, search the NCBI structure database
# and extract the title

# This cell takes about 5~10 minutes to run
def extract_pdb_title_ncbi(str_pdbid)->str:
    """Return the HTML <title> text of the NCBI Structure page for a PDB ID.

    Parameters
    ----------
    str_pdbid : str
        PDB identifier; it is appended to the NCBI Structure base URL.

    Returns
    -------
    str
        Text of the page's <title> element, or 'N/A' when the request fails,
        the response status is not 200, or the page has no <title>.
    """
    url_ncbi_structure = 'https://www.ncbi.nlm.nih.gov/Structure/pdb'
    url = '{}/{}'.format(url_ncbi_structure, str_pdbid)
    try:
        # Timeout so one stalled request cannot hang the whole lookup loop.
        r = requests.get(url, timeout=30)
    except requests.RequestException:
        # Treat a network failure like any other unavailable page instead of
        # aborting a run that issues one request per PDB ID.
        return 'N/A'
    if r.status_code != 200:
        return 'N/A'
    soup = BeautifulSoup(r.text, 'html.parser')
    # soup.title is None when the document has no <title> tag.
    if soup.title is None:
        return 'N/A'
    return soup.title.text


cols = ['100% SI Structure', '99% SI Structure', '95-98% SI Structure']


def _parse_pdb_ids(cell):
    """Turn one structure cell into a list of PDB IDs, or NaN when empty.

    Any cell containing 'None' is treated as having no structure. Otherwise the
    cell is split on '/' and the text before the first ':' of each entry is
    kept as the PDB ID.
    """
    # Non-string cells (e.g. missing values) are treated as "no structure"
    # rather than raising inside the string handling below.
    if not isinstance(cell, str) or 'None' in cell:
        # NOTE(review): a cell that mixes 'None' with real entries is discarded
        # as a whole here - confirm against the TheraSAbDab cell format.
        return np.nan  # np.NaN was removed in NumPy 2.0; np.nan works everywhere
    return [entry.split(':')[0] for entry in cell.split('/')]


for col in cols:
    # replace the raw cell text with a list of PDB IDs (NaN where there is none)
    df[col] = df[col].map(_parse_pdb_ids)

    # extract ncbi structure page title for each pdb id; rows without any
    # PDB ID keep NaN. Built in one pass so the column never starts out as a
    # float column that is later filled with Python lists.
    df['title_{}'.format(col)] = df[col].map(
        lambda pdbids: [extract_pdb_title_ncbi(pdbid) for pdbid in pdbids]
        if isinstance(pdbids, list) else np.nan
    )

In [5]:
# Identify titles containing the term 'complex.'
# Titles with 'complex, complexed' are likely
# to contain structure for both the Abs and the antigen.

def id_pdbid_wcomplex(str_title):
    """Return the leading ID of a title that mentions 'complex', else None.

    The title is lower-cased first, so the match is case-insensitive and the
    returned value (the text before the first ':') is lower-case as well.
    """
    lowered = str_title.lower()
    if 'complex' not in lowered:
        return None
    return lowered.split(':')[0]

# One 'title_<structure column>' column per sequence-identity bin.
cols = ['title_' + name
        for name in ('100% SI Structure', '99% SI Structure', '95-98% SI Structure')]
for col in cols:
    # Only rows that actually hold a list of titles are processed.
    titles = df[col].dropna()
    # 'title_<name>'.split('_')[1] recovers <name> for the output column.
    target = 'ret_' + col.split('_')[1]
    df.loc[titles.index, target] = titles.apply(
        lambda per_row: [id_pdbid_wcomplex(title) for title in per_row]
    )

In [6]:
# combine all identified PDB IDs into a single string, separating each PDB ID with a semicolon
def combine_to_list(x, cols):
    """Join the PDB IDs found in the list-valued fields of a row.

    For every name in `cols` whose value in `x` is a list, its items are
    collected in order; None and empty-string items are dropped and the rest
    are joined with ';'. Returns '' when nothing is left.
    """
    kept = [
        pdbid
        for name in cols
        if type(x[name]) == list  # non-list values (e.g. NaN) are skipped
        for pdbid in x[name]
        if pdbid is not None and pdbid != ''
    ]
    return ';'.join(kept)

# The 'ret_*' columns hold, per sequence-identity bin, the lists produced by
# id_pdbid_wcomplex for each title.
cols = ['ret_' + name
        for name in ('100% SI Structure', '99% SI Structure', '95-98% SI Structure')]

# Row-wise: merge the three lists into one ';'-separated string.
df['structure_w_complex'] = df[cols].apply(combine_to_list, axis=1, cols=cols)
df[['Therapeutic', *cols, 'structure_w_complex']]

Unnamed: 0,Therapeutic,ret_100% SI Structure,ret_99% SI Structure,ret_95-98% SI Structure,structure_w_complex
1,Abciximab,[6v4p],,,6v4p
2,Abelacimab,,,[6r8x],6r8x
3,Abituzumab,[6r8x],,,6r8x
8,Acimtamig,,,,
11,Adalimumab,"[None, None]",[3wd5],,3wd5
...,...,...,...,...,...
858,Vudalimab,,,,
869,Zanidatamab,,,"[1s78, None, None, None, None, None, None, Non...",1s78;3be1;3bdy
874,Zenocutuzumab,"[5o4o, 5o7p]",,,5o4o;5o7p
881,Zinlirvimab,"[6orn, 6udk, 6udj, None, 6ccb, 6okp, 7ucf, 7uc...","[5t3x, 5t3z]",[None],6orn;6udk;6udj;6ccb;6okp;7ucf;7ucg;7ugn;7ugo;7...


# Export selected columns to csv

In [7]:
# prepare df for export

# The clinical-trial column header embeds a date ("Highest_Clin_Trial (Jan '23)").
# Because the notebook downloads the latest TheraSAbDab file, that suffix
# presumably changes between releases - resolve the column by its prefix
# instead of hard-coding one release's header.
trial_cols = [c for c in df.columns if str(c).startswith('Highest_Clin_Trial')]
if not trial_cols:
    raise KeyError("No column starting with 'Highest_Clin_Trial' found in the TheraSAbDab table")
trial_col = trial_cols[0]

pdb_col = 'PDB ID for structure with an antigen'
flag_col = 'Is there a structure with an antigen?'

cols = ['Therapeutic', 'Format', trial_col, 'Est. Status', 'Target']
df_export = df[cols].copy()

# 'yes' when at least one PDB ID was collected for the therapeutic
# (structure_w_complex is '' when none was).
df_export[flag_col] = df['structure_w_complex'].map(lambda ids: 'yes' if ids != '' else 'no')
df_export[pdb_col] = df['structure_w_complex']

# Final column order: descriptive columns, the yes/no flag, then the PDB IDs.
cols = cols + [flag_col, pdb_col]
df_export = df_export[cols]

In [8]:
# export to csv
fp_updated = 'Therapeutics_w_AntigenStructure.csv'
# index=False: after filtering, the index is just the leftover row labels of
# the source sheet and would otherwise be written as an unnamed first column.
df_export.to_csv(fp_updated, index=False)

# Download Therapeutics with antigen structure(s) in .csv

In [10]:
# Hand the exported CSV (path in fp_updated) to files.download; `files` is
# imported from google.colab in the first cell, so this cell is Colab-only.
files.download(fp_updated)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>