In [1]:
# Packages
import csv
import os
from pathlib import Path

import pandas as pd
import numpy as np

import natsort

from Bio.Data.IUPACData import protein_letters_3to1
from Bio.Data import CodonTable
from Bio.PDB import PDBParser

import camelot # "pip install "camelot-py[base]"", and maybe also "pip install opencv-python" and "pip install 'PyPDF2<3.0'"
from scipy.spatial.distance import cdist, pdist, squareform

In [2]:
# Directories: all paper directories are resolved relative to the parent of the current working directory
working_dir = Path.cwd().parent

# General comments

- DMS data was obtained from the following sources: 1) the [Jesse Bloom lab](https://github.com/orgs/jbloomlab/repositories); 2) [Timothy Whitehead](https://scholar.google.com/citations?hl=en&user=fOvtrZUAAAAJ&view_op=list_works&sortby=pubdate); 3) the references cited in the following publication: https://www.sciencedirect.com/science/article/pii/S0165614721002273?via%3Dihub. Only DMS data of antibody-protein complexes were considered, meaning those involving non-protein binding partners were excluded.

- General structure of the directories is as follows: "antigen name"\_"author|lab"\_"date (+ optional letter)". Each directory contains a "DMS_raw_data" and a "PDB_structures" sub-directory. This directory structure needs to be created and populated with the corresponding files each time a new paper is added. The "DMS_interface_data_dir" sub-directory is created by the main functions below.

- All PDB structures were edited with PyMOL such that 1) there is only one antibody-antigen complex per PDB file; 2) heavy and light chain antibodies are always named H and L respectively, and antigen chains are named A,B,...

- TODO: For many DMS datasets, the number of extracted data points is lower than what is theoretically expected. Sometimes it's because the wild-type amino acid differs between the PDB structure and the DMS data, in which case we could remodel the PDBs so that they have the same amino acid as in the DMS data (using ColabFold or FoldX). Other times the DMS data is not really a full DMS but only a partial mutational scan, in which case there is nothing we can do about the missing amino acids.

- TODO: Need to run ColabFold or FoldX ReconstructSideChains+RepairPDB to complete some RBD interfaces and other missing atoms.

- TODO: Double check that the residue IDs of spike protein RBD residues in the different PDB files are correct so that we can map them directly with DMS data.

- TODO: In our future publication, provide examples in our dataset of mutations with a strong impact on binding affinity that are far from the binding interface.

In [3]:
# Main functions to generate the interface DMS data CSV files

def get_interface_residues(file, distance_thr=4, interface_to_return='antigen'):
    """
    Extract the interface residues of an antibody-antigen complex from a PDB file.

    An atom belongs to the interface when it lies within ``distance_thr`` of at least one atom
    of the binding partner; a residue belongs to the interface as soon as one of its atoms does.
    Chains named 'H' and 'L' are treated as the antibody, every other chain as the antigen.

    Parameters
    ----------
    file : pathlib.Path
        Path to the PDB file (``file.stem`` is used as the structure ID).
    distance_thr : float, default 4
        Maximum atom-atom distance (inclusive), in the units of the PDB coordinates.
    interface_to_return : {'antigen', 'antibody'}, default 'antigen'
        Which side of the interface to return.

    Returns
    -------
    set of str
        Interface residue IDs formatted as '<chain>:<one-letter amino acid><residue number>',
        eg: 'A:V123'. Residues whose name is not a standard amino acid (eg: waters) are ignored.
        An empty set is returned when the structure has no antigen (or no antibody) atoms.

    Raises
    ------
    ValueError
        If ``interface_to_return`` is neither 'antibody' nor 'antigen'.
    KeyError
        If the structure has no chain 'H' or no chain 'L'.
    """
    # Validate the argument before doing any expensive parsing / distance computation
    if interface_to_return not in ('antibody', 'antigen'):
        raise ValueError("interface_to_return must be either 'antibody' or 'antigen'")

    # Read PDB structure and extract antibody and antigen chains
    parser = PDBParser(QUIET=True)
    structure = parser.get_structure(id=file.stem, file=file)
    chains = {chain.id: chain for chain in structure.get_chains()}

    antibody_atoms = list(chains['H'].get_atoms()) + list(chains['L'].get_atoms())
    antigen_atoms = [
        atom
        for chain_id, chain in chains.items()
        if chain_id not in ('H', 'L')
        for atom in chain.get_atoms()
    ]

    # Nothing can be in contact if one of the two partners is missing
    if not antigen_atoms or not antibody_atoms:
        return set()

    # Antigen vs antibody distance matrix only (rows = antigen atoms, columns = antibody atoms).
    # This avoids computing and storing the antigen-antigen and antibody-antibody distances.
    antigen_coords = np.array([atom.coord for atom in antigen_atoms], dtype=float)
    antibody_coords = np.array([atom.coord for atom in antibody_atoms], dtype=float)
    is_within_thr = cdist(antigen_coords, antibody_coords, metric='euclidean') <= distance_thr

    # All interface atoms within X distance of any atom of the other interface
    if interface_to_return == 'antigen':
        candidate_atoms, atom_mask = antigen_atoms, is_within_thr.any(axis=1)
    else:
        candidate_atoms, atom_mask = antibody_atoms, is_within_thr.any(axis=0)

    # Collapse interface atoms into their (unique) parent residues
    interface_residues = set()
    for atom, is_interface_atom in zip(candidate_atoms, atom_mask):
        if not is_interface_atom:
            continue

        resname_3 = atom.parent.resname.title()
        if resname_3 not in protein_letters_3to1: # Ignore non-amino acid interface atoms like waters
            continue

        resname_1 = protein_letters_3to1[resname_3]
        residue_number = str(atom.parent.id[1])
        chain = atom.parent.parent.id

        interface_residues.add(f'{chain}:{resname_1}{residue_number}') # eg: A:V123

    return interface_residues

def add_chain_info_to_mut_name(chainless_mut_name, interface_residues):
    """
    Prefix a chain-less mutation name with the chain ID of the matching interface residue.

    Parameters
    ----------
    chainless_mut_name : str
        Mutation name without chain information, formatted as
        '<wild-type><site><mutant>' with a single-character mutant, eg: 'V123G'.
    interface_residues : iterable of str
        Interface residue IDs formatted as '<chain>:<wild-type><site>', eg: 'A:V123'.

    Returns
    -------
    str or None
        The mutation name with its chain ID, eg: 'A:V123G', or None when no interface
        residue matches. When the same residue ID exists in several chains, the
        alphabetically first chain is returned so that the result is reproducible
        (iteration order over a set of strings can change from one run to the next).
    """
    residue_ID = chainless_mut_name[:-1] # Drop the trailing mutant amino acid, eg: V123G -> V123

    for full_residue_ID in sorted(interface_residues):
        if full_residue_ID.endswith(residue_ID):
            # Everything before the first ':' is the chain ID (works for multi-character chain IDs too)
            chain_ID = full_residue_ID.split(':', 1)[0]
            return f'{chain_ID}:{chainless_mut_name}'

    return None
        
def write_interface_DMS_data_files(paper_dir, data, DMS_data_col_name, interface_to_return='antigen', distance_thr=4):
    """
    For each antibody-antigen complex in the paper_dir, creates a CSV file containing DMS data of interface residues only.

    Parameters
    ----------
    paper_dir : pathlib.Path
        Directory of the paper. Must contain a 'DMS_raw_data' and a 'PDB_structures' sub-directory;
        the 'DMS_interface_data' sub-directory is created if needed.
    data : dict of str -> str
        Maps each raw DMS CSV file name (in 'DMS_raw_data') to the file name of the corresponding
        PDB structure (in 'PDB_structures').
    DMS_data_col_name : str
        Name of the column of the raw DMS CSV file holding the DMS score to keep.
    interface_to_return : {'antigen', 'antibody'}, default 'antigen'
        Side of the interface on which the DMS was performed. The raw CSV file must have the columns
        'wildtype', 'site' and 'mutation', plus 'chain' when interface_to_return='antibody'.
    distance_thr : float, default 4
        Distance threshold forwarded to get_interface_residues.

    Returns
    -------
    None
        One '<PDB file name without .pdb>_interface_DMS_data.csv' file (columns 'mut_name' and
        DMS_data_col_name) is written per entry of ``data``, and a summary line is printed.

    Raises
    ------
    ValueError
        If ``interface_to_return`` is neither 'antibody' nor 'antigen'.
    """
    if interface_to_return not in ('antibody', 'antigen'):
        raise ValueError("interface_to_return must be either 'antibody' or 'antigen'")

    DMS_raw_data_dir = paper_dir / 'DMS_raw_data'
    PDB_structures_dir = paper_dir / 'PDB_structures'
    DMS_interface_data_dir = paper_dir / 'DMS_interface_data'
    DMS_interface_data_dir.mkdir(exist_ok=True)

    for DMS_data_file_name, PDB_structure_file_name in data.items():
        PDB_ID = PDB_structure_file_name.removesuffix('.pdb')
        DMS_data_file_path = DMS_raw_data_dir / DMS_data_file_name
        PDB_structure_file_path = PDB_structures_dir / PDB_structure_file_name

        df_DMS = pd.read_csv(DMS_data_file_path, header=0)
        interface_residues = get_interface_residues(
            file=PDB_structure_file_path, distance_thr=distance_thr, interface_to_return=interface_to_return
        )

        if interface_to_return == 'antigen':
            # Antigen DMS raw data files do not contain chain information, unlike antibody DMS data. To obtain the chain information for antigens, we assume that their
            # interface residue IDs are unique (including when the interface involves multiple antigen chains). We then cross-check these residue IDs with the
            # interface residues derived from the PDB structure to obtain the chain IDs.
            chainless_interface_residues = {
                full_residue_ID.split(':', 1)[1] for full_residue_ID in interface_residues # eg: A:V123 -> V123
            }
            df_DMS['mut_name'] = [
                f'{wildtype}{site}{mutation}' # No chain information
                for wildtype, site, mutation in zip(df_DMS['wildtype'], df_DMS['site'], df_DMS['mutation'])
            ]
            is_interface_mutation = [
                mut_name[:-1] in chainless_interface_residues # eg: mut_name=V123G -> V123
                for mut_name in df_DMS['mut_name']
            ]
            # .copy() so that the assignment below modifies an independent frame, not a view of df_DMS
            interface_df = df_DMS.loc[is_interface_mutation, ['mut_name', DMS_data_col_name]].copy()
            interface_df['mut_name'] = interface_df['mut_name'].apply(add_chain_info_to_mut_name, args=(interface_residues,))

        else:
            residue_IDs = [
                f'{chain}:{wildtype}{site}' # eg: H:V123
                for chain, wildtype, site in zip(df_DMS['chain'], df_DMS['wildtype'], df_DMS['site'])
            ]
            df_DMS['mut_name'] = [f'{residue_ID}{mutation}' for residue_ID, mutation in zip(residue_IDs, df_DMS['mutation'])]
            # Exact residue ID match: a prefix match would wrongly pair interface residue H:S3 with mutation H:S31A
            is_interface_mutation = [residue_ID in interface_residues for residue_ID in residue_IDs]
            interface_df = df_DMS.loc[is_interface_mutation, ['mut_name', DMS_data_col_name]]

        interface_df.to_csv(DMS_interface_data_dir / f'{PDB_ID}_interface_DMS_data.csv', index=False)

        # Denominator = number of interface residues x 20 amino acids
        print(f'{PDB_ID}: {len(interface_df)}/{len(interface_residues)*20} data points.')


# Zika virus

Publication: https://journals.asm.org/doi/full/10.1128/jvi.01291-19#F5

GitHub: https://github.com/jbloomlab/ZIKV_DMS_with_EvansLab

The paper has DMS data for antibodies ZK-64 and ZK-185, but there is only a PDB structure for the first antibody (5KVF), so we can't exploit the DMS of the second antibody.

In [4]:
paper_dir = working_dir / 'zika_JBloom_2019'

In [5]:
# Raw DMS CSV file name (in DMS_raw_data) -> PDB file name of the corresponding complex (in PDB_structures)
data = {
    'summary_ZKA64-meanmutdiffsel.csv':'ZV64_5kvf.pdb',
    #'summary_ZKA185-meanmutdiffsel.csv':'....pdb'
}

# 'mutdiffsel' is the column of the raw CSV files that is kept as the DMS score
write_interface_DMS_data_files(paper_dir, data, DMS_data_col_name='mutdiffsel')

ZV64_5kvf: 280/300 data points.


Publication: https://journals.asm.org/doi/full/10.1128/jvi.01414-23

GitHub: https://github.com/jbloomlab/ZIKV_MAP_GooLab

Paper with DMS data for 5 antibodies.

Note: Antibody ZV67 might be a negative control, maybe there is no mutational data ? Check

In [6]:
paper_dir = working_dir / 'zika_JBloom_2023'

In [7]:
# Raw DMS CSV file name (in DMS_raw_data) -> PDB file name of the corresponding complex (in PDB_structures)
data = {
    'summary_EDE1-C8-1800-meanmutdiffsel.csv':'EDE1-C8_5lbs.pdb',
    'summary_EDE1-C10-300-meanmutdiffsel.csv':'EDE1-C10_5h37.pdb',
    'summary_MZ4-4800-medianmutdiffsel.csv':'MZ4_6niu.pdb',
    'summary_SIgN-3C-20000-meanmutdiffsel.csv':'SIgN-3C_7bua.pdb',
    'summary_ZV-67-40000-meanmutdiffsel.csv':'ZV-67_5kvg.pdb'
}

# 'mutdiffsel' is the column of the raw CSV files that is kept as the DMS score
write_interface_DMS_data_files(paper_dir, data, DMS_data_col_name='mutdiffsel')

EDE1-C8_5lbs: 80/100 data points.
EDE1-C10_5h37: 400/400 data points.
MZ4_6niu: 320/320 data points.
SIgN-3C_7bua: 340/340 data points.
ZV-67_5kvg: 440/480 data points.


# COVID-19

Publication: https://www.nature.com/articles/s41564-021-00972-2#MOESM3

GitHub: https://github.com/jbloomlab/SARS-CoV-2-RBD_MAP_AZ_Abs

Paper with DMS data for 2 antibodies.

In [21]:
paper_dir = working_dir / 'COVID-19_JBloom_2021a'

In [22]:
# Split the combined raw data into one mutational escape CSV file per antibody
DMS_raw_data = paper_dir / 'DMS_raw_data'

# Rows with missing values are dropped; the index (first CSV column) holds the antibody name
df = pd.read_csv(DMS_raw_data / 'AZ_cocktail_raw_data.csv', header=0, index_col=0).dropna()
for antibody_name in ('AZD8895', 'AZD1061'):
    antibody_df = df.loc[df.index == antibody_name]
    antibody_df.to_csv(DMS_raw_data / f'{antibody_name}_mutational_escape.csv')

In [24]:
# Raw DMS CSV file name (in DMS_raw_data) -> PDB file name of the corresponding complex (in PDB_structures)
data = {
    'AZD1061_mutational_escape.csv':'AZD1061_7l7e.pdb',
    'AZD8895_mutational_escape.csv':'AZD8895_7l7d.pdb',
}

# 'mut_escape' is the column of the raw CSV files that is kept as the DMS score
write_interface_DMS_data_files(paper_dir, data, DMS_data_col_name='mut_escape')

AZD1061_7l7e: 261/320 data points.
AZD8895_7l7d: 218/300 data points.


Publication: https://www.sciencedirect.com/science/article/pii/S1931312820306247

GitHub: https://github.com/jbloomlab/SARS-CoV-2-RBD_MAP_Crowe_antibodies

Paper with DMS data for 9 SARS-CoV-2 antibodies. There are no PDB structures of the complexes, only negative-stain electron microscopy structures from which they derive the spike protein residues in interaction (i.e the epitope residues).

In [25]:
paper_dir = working_dir / 'COVID-19_JBloom_2021b'

In [26]:
# Split the combined raw data into one mutational escape CSV file per antibody
DMS_raw_data = paper_dir / 'DMS_raw_data'

# The index (first CSV column) holds the antibody name
df = pd.read_csv(DMS_raw_data / 'MAP_paper_antibodies_raw_data.csv', header=0, index_col=0)
for antibody_name, antibody_df in df.groupby(by=df.index):
    # rCR3022 is a SARS-CoV-1 antibody, so no file is written for it
    if antibody_name != 'rCR3022':
        output_file = DMS_raw_data / f'{antibody_name}_mutational_escape.csv'
        antibody_df.to_csv(output_file)

In [13]:
...

Ellipsis

Publication: https://www.nature.com/articles/s41467-021-24435-8

GitHub: https://github.com/jbloomlab/SARS-CoV-2-RBD_MAP_Crowe_antibodies

Paper with DMS data and structures of 10 antibodies.

In [27]:
paper_dir = working_dir / 'COVID-19_JBloom_2021c'

In [28]:
# Split the combined raw data into one mutational escape CSV file per antibody
DMS_raw_data = paper_dir / 'DMS_raw_data'

# The index (first CSV column) holds the sample name
df = pd.read_csv(DMS_raw_data / 'all_samples_raw_data.csv', header=0, index_col=0)
for antibody_name, antibody_df in df.groupby(by=df.index):
    # Samples whose name starts with 'subject' or 'COV' are non-antibody data and are skipped
    is_antibody = not antibody_name.startswith(('subject', 'COV'))
    if is_antibody:
        output_file = DMS_raw_data / f'{antibody_name}_mutational_escape.csv'
        antibody_df.to_csv(output_file)

In [29]:
# Raw DMS CSV file name (in DMS_raw_data) -> PDB file name of the corresponding complex (in PDB_structures)
data = {
    'C002_mutational_escape.csv':'C002_7k8s.pdb',
    'C105_mutational_escape.csv':'C105_6xcm.pdb',
    'C110_mutational_escape.csv':'C110_7k8v.pdb',
    'C121_mutational_escape.csv':'C121_7k8x.pdb',
    'C135_mutational_escape.csv':'C135_7k8z.pdb',
    'C144_mutational_escape.csv':'C144_7k90.pdb',
    'LY-CoV016_mutational_escape.csv':'LY-CoV016_7c01.pdb',
    'LY-CoV555_mutational_escape.csv':'LY-CoV555_7kmg.pdb',
    'REGN10933_mutational_escape.csv':'REGN10933_6xdg.pdb',
    'REGN10987_mutational_escape.csv':'REGN10987_6xdg.pdb'
}

# 'mut_escape' is the column of the raw CSV files that is kept as the DMS score
write_interface_DMS_data_files(paper_dir, data, DMS_data_col_name='mut_escape')

C002_7k8s: 250/300 data points.
C105_6xcm: 214/380 data points.
C110_7k8v: 190/280 data points.
C121_7k8x: 188/240 data points.
C135_7k8z: 79/140 data points.
C144_7k90: 300/400 data points.
LY-CoV016_7c01: 272/540 data points.
LY-CoV555_7kmg: 218/320 data points.
REGN10933_6xdg: 147/220 data points.
REGN10987_6xdg: 153/180 data points.


Publication: https://www.nature.com/articles/s41586-021-03807-6

GitHub: https://github.com/jbloomlab/SARS-CoV-2-RBD_MAP_Vir_mAbs

Paper with DMS data of 13 antibodies. There is no structure for 4 antibodies (S2H58, S2X16, S2X58 and S2X227).

In [30]:
paper_dir = working_dir / 'COVID-19_JBloom_2021d'

In [31]:
# Split the combined raw data into one mutational escape CSV file per antibody
DMS_raw_data = paper_dir / 'DMS_raw_data'

# The index (first CSV column) holds the antibody name
df = pd.read_csv(DMS_raw_data / 'all_antibodies_raw_data.csv', header=0, index_col=0)
for antibody_name, antibody_df in df.groupby(by=df.index):
    # Only entries whose name starts with 'S' are written
    if antibody_name.startswith('S'):
        output_file = DMS_raw_data / f'{antibody_name}_mutational_escape.csv'
        antibody_df.to_csv(output_file)

In [32]:
# Raw DMS CSV file name (in DMS_raw_data) -> PDB file name of the corresponding complex (in PDB_structures).
# Commented-out entries are antibodies for which no PDB file name has been filled in.
data = {
    'S2D106_mutational_escape.csv': 'S2D106_7r7n.pdb',
    'S2E12_mutational_escape.csv':'S2E12_7r6x.pdb',
    'S2H13_mutational_escape.csv':'S2H13_7jv6.pdb',
    'S2H14_mutational_escape.csv':'S2H14_7jx3.pdb',
    # 'S2H58_mutational_escape.csv':'....pdb',
    'S2H97_mutational_escape.csv':'S2H97_7m7w.pdb',
    # 'S2X16_mutational_escape.csv':'....pdb',
    'S2X35_mutational_escape.csv':'S2X35_7r6w.pdb',
    # 'S2X58_mutational_escape.csv':'....pdb',
    # 'S2X227_mutational_escape.csv':'....pdb',
    'S2X259_mutational_escape.csv':'S2X259_7m7w.pdb',
    'S304_mutational_escape.csv':'S304_7jx3.pdb',
    'S309_mutational_escape.csv':'S309_7r6w.pdb',
}

# 'mut_escape' is the column of the raw CSV files that is kept as the DMS score
write_interface_DMS_data_files(paper_dir, data, DMS_data_col_name='mut_escape')

S2D106_7r7n: 181/260 data points.
S2E12_7r6x: 183/260 data points.
S2H13_7jv6: 122/140 data points.
S2H14_7jx3: 232/360 data points.
S2H97_7m7w: 162/340 data points.
S2X35_7r6w: 240/440 data points.
S2X259_7m7w: 240/420 data points.
S304_7jx3: 247/440 data points.
S309_7r6w: 144/240 data points.


Publication: https://www.nature.com/articles/s41586-022-04980-y

GitHub: https://github.com/jbloomlab/SARS2_RBD_Ab_escape_maps


**Check https://github.com/jbloomlab/SARS-CoV-2-RBD_MAP_NUS-mAbs/tree/main/data/pdbs for additional PDB structures and "The epitopes of class 1 antibody S2K146 (PDB ID: 7TAT) is represented in red. The class 3 S309 epitope (PDB ID: 6WPT) is represented in blue, and the class 4 epitopes of C022 (PDB ID: 7RKU), C118 (PDB ID: 7RKS). DH1047 (PDB ID: 7SG4) and ADG-2 (PDB ID: 7U2D) are represented in yellow." !!!**

Paper with DMS data for 100s of antibodies against the RBD, but antibody-RBD 3D structures are only available for 7 antibodies. In some PDBs, the RBD sequence is Delta/Omicron/etc rather than the Wuhan sequence, yet the DMS data seems to have been generated from the Wuhan sequence, so this needs to be accounted for in some way (how ???).

2 additional PDB structures (containing 3 antibodies in total) have been taken from https://www.nature.com/articles/s41422-021-00555-0 (7EYA(BD-804), 7EZV(BD-812 and BD-836)). I suspect there are other publications with PDB structures of other antibody-RBD complexes that could be used to exploit more of the available DMS data.

In [8]:
paper_dir = working_dir / 'COVID-19_JBloom_2022'

In [9]:
# Split the combined raw data into one mutational escape CSV file per antibody
DMS_raw_data = paper_dir / 'DMS_raw_data'

# Only these antibodies are written to their own CSV file
antibodies_to_keep = (
    'BD-804', 'BD-812', 'BD-836',
    'BD55-1239', 'BD55-3152', 'BD55-3372', 'BD55-5840',
    'FC08', 'LY-CoV1404',
)

# The index (first CSV column) holds the antibody name
df = pd.read_csv(DMS_raw_data / 'escape_data.csv.zip', header=0, index_col=0, low_memory=False).sort_index()
for antibody_name, antibody_df in df.groupby(by=df.index):
    if antibody_name in antibodies_to_keep:
        output_file = DMS_raw_data / f'{antibody_name}_mutational_escape.csv'
        antibody_df.to_csv(output_file)

In [10]:
# Raw DMS CSV file name (in DMS_raw_data) -> PDB file name of the corresponding complex (in PDB_structures)
data = {
    'BD-804_mutational_escape.csv':'BD-804_7eya.pdb',
    'BD-812_mutational_escape.csv':'BD-812_7ezv.pdb',
    'BD-836_mutational_escape.csv':'BD-836_7ezv.pdb',
    'BD55-1239_mutational_escape.csv':'BD55-1239_Omicron_7wrl.pdb',
    'BD55-3152_mutational_escape.csv':'BD55-3152_Fab_B11529_7wr8.pdb',
    'BD55-3372_mutational_escape.csv':'BD55-3372_Delta_7wro.pdb',
    'BD55-5840_mutational_escape.csv':'BD55-5840_Omicron_7wrz.pdb', # or BD55-5840_BA2_7x6a ?
    'FC08_mutational_escape.csv':'FC08_7dx4.pdb',
    'LY-CoV1404_mutational_escape.csv':'LY-CoV1404_7mmo.pdb',
}

# 'mut_escape' is the column of the raw CSV files that is kept as the DMS score
write_interface_DMS_data_files(paper_dir, data, DMS_data_col_name='mut_escape')

BD-804_7eya: 56/340 data points.
BD-812_7ezv: 55/260 data points.
BD-836_7ezv: 43/280 data points.
BD55-1239_Omicron_7wrl: 21/380 data points.
BD55-3152_Fab_B11529_7wr8: 68/320 data points.
BD55-3372_Delta_7wro: 63/260 data points.
BD55-5840_Omicron_7wrz: 65/320 data points.
FC08_7dx4: 74/420 data points.
LY-CoV1404_7mmo: 151/300 data points.


Publication: https://www.science.org/doi/full/10.1126/sciadv.ade3470

GitHub: https://github.com/jbloomlab/SARS-CoV-2-RBD_MAP_NUS-mAbs

Paper with DMS data for 6 new antibodies, of which 1 or 2 have an antibody-antigen 3D structure. 

Problem: It is not clear what the mapping is between the names of the antibodies in the paper and the names in the DMS CSV file ...

In [23]:
...

Ellipsis

Publication: https://www.biorxiv.org/content/10.1101/2024.05.05.592584v1.full.pdf

GitHub: https://github.com/dms-vep/SARS-CoV-2_XBB.1.5_spike_DMS_Barnes_mAbs

Paper with DMS data for 3-4 antibodies, but the DMS data are generated on antibody combinations rather than individual antibodies, so it's unclear how to make use of this data.

In [24]:
...

Ellipsis

Publication: pre-print not available yet

GitHub: https://github.com/jbloomlab/SARS-CoV-2-RBD_MAP_Karolinska

DMS data for 3 antibodies, with only 1 with an available antibody-antigen 3D structure. Once the pre-print is available check if they also resolved the structures of the other 2 complexes.

In [39]:
paper_dir = working_dir / 'COVID-19_JBloom_2024'

In [40]:
# Split the combined raw data into one mutational escape CSV file per antibody
DMS_raw_data = paper_dir / 'DMS_raw_data'

# The index (first CSV column) holds the antibody name; every antibody gets its own file
df = pd.read_csv(DMS_raw_data / 'karolinska_raw_data.csv', header=0, index_col=0)
for antibody_name, antibody_df in df.groupby(by=df.index):
    output_file = DMS_raw_data / f'{antibody_name}_mutational_escape.csv'
    antibody_df.to_csv(output_file)

In [42]:
# Raw DMS CSV file name (in DMS_raw_data) -> PDB file name of the corresponding complex (in PDB_structures).
# Commented-out entries are antibodies for which no PDB file name has been filled in.
data = {
    'CAB-A17_mutational_escape.csv':'CAB-A17_8c2r.pdb',
    # 'CAB-A49_mutational_escape.csv':'....pdb',
    # 'CAB-C19_mutational_escape.csv':'....pdb',
}

# 'mut_escape' is the column of the raw CSV files that is kept as the DMS score
write_interface_DMS_data_files(paper_dir, data, DMS_data_col_name='mut_escape')

CAB-A17_8c2r: 194/440 data points.


# HIV

Publication: https://www.sciencedirect.com/science/article/pii/S1931312817301968?via%3Dihub

GitHub: https://github.com/jbloomlab/SARS-CoV-2-RBD_MAP_Vir_mAbs

Very special antibody (PDB: 5fuu), makes contact with 3 different chains as well as 2 N-glycans. Do we keep it ???

In [28]:
paper_dir = working_dir / 'HIV_JBloom_2017'

In [29]:
...

Ellipsis

Publications: https://journals.plos.org/plospathogens/article?id=10.1371/journal.ppat.1007159

GitHub: https://github.com/jbloomlab/MAP_Vaccine_FP_Abs

Paper with DMS data for 3 antibodies, with antibody-antigen 3D structures taken from https://www.nature.com/articles/s41591-018-0042-6.

In [43]:
paper_dir = working_dir / 'HIV_JBloom_2018'

In [44]:
# Raw DMS CSV file name (in DMS_raw_data) -> PDB file name of the corresponding complex (in PDB_structures)
data = {
    'summary_FP16-02-500ug-meanmutdiffsel.csv':'FP16-02_6cdi.pdb',
    'summary_FP20-01-500ug-meanmutdiffsel.csv':'FP20-01_6cde.pdb',
    'summary_VRC34-33ug-meanmutdiffsel.csv':'VRC34-01_5i8h.pdb'
}

# 'mutdiffsel' is the column of the raw CSV files that is kept as the DMS score
write_interface_DMS_data_files(paper_dir, data, DMS_data_col_name='mutdiffsel')

FP16-02_6cdi: 420/420 data points.
FP20-01_6cde: 440/440 data points.
VRC34-01_5i8h: 340/340 data points.


Publications: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6435357/

GitHub: https://github.com/jbloomlab/EnvsAntigenicAtlas

Paper with DMS data for 8 antibodies, of which 7 have a usable antibody-antigen 3D structure.

In [45]:
paper_dir = working_dir / 'HIV_JBloom_2019a'

In [46]:
# Raw DMS CSV file name (in DMS_raw_data) -> PDB file name of the corresponding complex (in PDB_structures)
data = {
    'summary_3BNC117-meanmutdiffsel.csv':'3BNC117_5v8m.pdb',
    'summary_101074-meanmutdiffsel.csv':'3BN-1074_5t3z.pdb',
    'summary_PG9-meanmutdiffsel.csv':'PG9_3u4e.pdb',
    'summary_PGT121-meanmutdiffsel.csv':'PGT121_usingPGT122_5fyl.pdb',
    'summary_PGT145-meanmutdiffsel.csv':'PGT145_5v8l.pdb',
    'summary_PGT151-meanmutdiffsel.csv':'PGT151_5fuu.pdb',
    'summary_VRC01-meanmutdiffsel.csv':'VRC01_5fyk.pdb',
}

# 'mutdiffsel' is the column of the raw CSV files that is kept as the DMS score
write_interface_DMS_data_files(paper_dir, data, DMS_data_col_name='mutdiffsel')

3BNC117_5v8m: 540/540 data points.
3BN-1074_5t3z: 200/240 data points.
PG9_3u4e: 240/340 data points.
PGT121_usingPGT122_5fyl: 200/260 data points.
PGT145_5v8l: 480/480 data points.
PGT151_5fuu: 420/520 data points.
VRC01_5fyk: 380/480 data points.


Publications: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6755680/

GitHub: https://github.com/jbloomlab/MAP_NHP_FP_Abs

Paper with DMS data for 8 antibodies, of which 6 have a usable antibody-antigen 3D structure.

In [47]:
paper_dir = working_dir / 'HIV_JBloom_2019b'

In [48]:
# Raw DMS CSV file name (in DMS_raw_data) -> PDB file name of the corresponding complex (in PDB_structures).
# Commented-out entries are antibodies for which no PDB file name has been filled in.
data = {
    'summary_17D4-meanmutdiffsel.csv':'17D4_6n1v.pdb',
    # 'summary_106B6-meanmutdiffsel.csv':'....pdb',
    'summary_106E6-meanmutdiffsel.csv':'106E6_6n1w.pdb',
    'summary_110D12-meanmutdiffsel.csv':'110D12_6mph.pdb',
    # 'summary_DF1W203-meanmutdiffsel.csv': '....pdb',
    'summary_DF1W314-meanmutdiffsel.csv':'DF1W314_6mph.pdb',
    'summary_OPV12-meanmutdiffsel.csv':'OPV12_6ot1.pdb',
    'summary_OPV20-meanmutdiffsel.csv':'OPV20_6osy.pdb',
}

# 'mutdiffsel' is the column of the raw CSV files that is kept as the DMS score
write_interface_DMS_data_files(paper_dir, data, DMS_data_col_name='mutdiffsel')

17D4_6n1v: 280/280 data points.
106E6_6n1w: 340/340 data points.
110D12_6mph: 300/300 data points.
DF1W314_6mph: 300/300 data points.
OPV12_6ot1: 280/280 data points.
OPV20_6osy: 300/300 data points.


Publications: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7042716/

GitHub: https://github.com/jbloomlab/MAP_118

Paper with DMS data for 1 antibody and its antibody-antigen 3D structure. Note that the DMS data here is survival fraction rather than mutation selection.

In [50]:
paper_dir = working_dir / 'HIV_JBloom_2020'

In [51]:
# Raw DMS CSV file name (in DMS_raw_data) -> PDB file name of the corresponding complex (in PDB_structures)
data = {
    'summary_118-meanmutfracsurvive.csv':'118_6udj.pdb',
}

# 'mutfracsurvive' is the column of the raw CSV file that is kept as the DMS score
write_interface_DMS_data_files(paper_dir, data, DMS_data_col_name='mutfracsurvive')

118_6udj: 180/200 data points.


Publications: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8376252/

GitHub: XXX

Paper with DMS data for 1 antibody and its antibody-antigen 3D structure (7N65). DMS data can be seen in Figure 7a, but the raw data is not available. I have sent an email to the authors.

In [38]:
...

Ellipsis

Publication: https://www.sciencedirect.com/science/article/pii/S2589004220308142

GitHub: https://github.com/meghangarrett/Phage-DMS

There might be some data to extract from this paper. It seems like the DMS data needs to be generated using the code on the GitHub, and no PDB structures are referenced in the paper, so need to check if structures are available.

In [39]:
...

Ellipsis

Publication: https://www.science.org/doi/full/10.1126/science.aad9195

GitHub: *NA* (data in supplementary materials)

They provide heatmaps in supp mat but not the original values. I have sent an email to ask for the raw values. Potentially 2-3 antibodies.

In [40]:
...

Ellipsis

Publications: https://www.science.org/doi/10.1126/science.aad9195?url_ver=Z39.88-2003&rfr_id=ori:rid:crossref.org&rfr_dat=cr_pub%20%200pubmed

GitHub: 

Although I can access the supplementary material, I can't access the main text of this paper, so I am unable to extract its main points.

# Influenza

Publications: https://www.nature.com/articles/s41467-018-03665-3

GitHub: https://github.com/jbloomlab/HA_antibody_ease_of_escape. Median frac survive values were taken from the paper's supplementary Data 5.

Careful with the numbering (https://github.com/jbloomlab/HA_numbering) !!!

Paper with DMS data for 6 antibodies, of which 3 have an antibody-antigen 3D structure.

In [52]:
paper_dir = working_dir / 'Influenza_JBloom_2018'

In [53]:
# Raw DMS CSV file name (in DMS_raw_data) -> PDB file name of the corresponding complex (in PDB_structures).
# Commented-out entries are antibodies for which no PDB file name has been filled in.
data = {
    'antibody_C179_median_fracsurvive.csv': 'C179_4hlz.pdb',
    # NOTE(review): '33ztn' looks like a typo for PDB ID 3ztn -- confirm against the file name on disk
    'antibody_FI6v3_median_fracsurvive.csv':'FI6V_33ztn.pdb',
    # 'antibody_H17L7_median_fracsurvive.csv':'....pdb',
    # 'antibody_H17L10_median_fracsurvive.csv': '....pdb',
    # 'antibody_H17L19_median_fracsurvive.csv': '....pdb',
    'antibody_S139_median_fracsurvive.csv':'S139_4gms.pdb',
}

# 'mutfracsurvive' is the column of the raw CSV files that is kept as the DMS score
write_interface_DMS_data_files(paper_dir, data, DMS_data_col_name='mutfracsurvive')

C179_4hlz: 20/260 data points.
FI6V_33ztn: 0/380 data points.
S139_4gms: 160/360 data points.


Publications: https://elifesciences.org/articles/49324

GitHub: https://github.com/jbloomlab/map_flu_serum_Perth2009_H3_HA

Paper with DMS data for 6 antibodies, but 0 antibody-antigen 3D structure !!!

In [43]:
...

Ellipsis

Publications: https://www.science.org/doi/full/10.1126/science.aaz5143

GitHub: https://github.com/wchnicholas/HAstemEscape

Paper with DMS data for 3 antibodies, of which 3 have an antibody-antigen 3D structure.

**There is a problem with the 'site' column in the raw DMS data !!!**

In [54]:
paper_dir = working_dir / 'Influenza_JBloom_2020'

In [55]:
# Raw DMS CSV file name (in DMS_raw_data) -> PDB file name of the corresponding complex (in PDB_structures)
data = {
    'Perth09_antibody_FI6v3_median_fracsurvive.csv':'FI6V3-H3_3ztj.pdb',
    'WSN_antibody_CR9114_median_fracsurvive.csv':'CR9114_4fqy.pdb',
    'WSN_antibody_FI6v3_median_fracsurvive.csv':'FI6V3-H1_3ztn.pdb',
}

# 'mutfracsurvive' is the column of the raw CSV files that is kept as the DMS score
write_interface_DMS_data_files(paper_dir, data, DMS_data_col_name='mutfracsurvive')

FI6V3-H3_3ztj: 0/320 data points.
CR9114_4fqy: 20/340 data points.
FI6V3-H1_3ztn: 0/380 data points.


# EGFR

Publications: https://www.science.org/doi/full/10.1126/science.aaz5143

GitHub: *NA* (data taken from supplementary material)

Paper with DMS data for 1 antibody with a 3D structure. DMS is on the antibody !

In [76]:
paper_dir = working_dir / 'EGFR_AbbVie_2013'

In [82]:
# Extract the enrichment ratio (ER) for each CDR mutation from supplementary material PDF, page 14,
# and save it as a long-format CSV file with the columns chain, site, wildtype, mutation and ER.
DMS_raw_data = paper_dir / 'DMS_raw_data'

# camelot returns one table per detected table on the page; only the first one is used
pdf_table = camelot.read_pdf(str(DMS_raw_data / 'mabs-5-523-s01.pdf'), pages="14")
df = pdf_table[0].df

# Set first row as header (ie column names)
df = df.rename(columns=df.iloc[0,:]).drop(0)

# Map DNA codons to amino acids: both the codon column names and the values of the 'WT codon' column
# are translated. Column names that are not keys of the codon table are left unchanged by rename().
DNA_to_AA_map = CodonTable.standard_dna_table.forward_table
df.rename(columns=DNA_to_AA_map, inplace=True)
df['WT codon'] = [DNA_to_AA_map[codon] for codon in df['WT codon']]

# Change dtype of ER columns from str to float
df.replace('', np.nan, inplace=True) # Deal with missing values
df = df.astype({amino_acid:'float' for amino_acid in set(DNA_to_AA_map.values())})

# Add site, wildtype amino acid and chain columns
# site = text after the ':' of the position string, minus its first character
df['site'] = df.position.apply(lambda value:value.split(':')[1][1:])
df['wildtype'] = df['WT codon']
# chain = second character of the position string
df['chain'] = df.position.apply(lambda value:value[1])

# Wide to long table reformatting
# NOTE(review): iloc[:, 3:] skips the first three columns -- presumably the original non-ER columns of the PDF table; confirm
df = pd.melt(df.iloc[:, 3:], id_vars=('chain', 'site', 'wildtype'), var_name='mutation', value_name='ER')

# Some mutations are duplicated, so we take the average per mutation
# Natural sort so that sites are ordered numerically; sort=False below keeps this order in the grouped output
df.sort_values(by=['chain', 'site', 'mutation'], key=natsort.natsort_keygen(), inplace=True) # Needed for groupby to work correctly
df = df.groupby(by=['chain', 'site', 'wildtype', 'mutation'], as_index=False, sort=False).aggregate('mean').round(2)

df.to_csv(DMS_raw_data / 'Cetuximab_ER_raw_data.csv', index=False)

In [83]:
data = {
    'Cetuximab_ER_raw_data.csv':'Cetuximab_1yy9.pdb',
}

write_interface_DMS_data_files(paper_dir, data, DMS_data_col_name='ER', interface_to_return='antibody')

Cetuximab_1yy9: 280/360 data points.


# VEGF

Publication: https://www.pnas.org/doi/full/10.1073/pnas.1613231114

I have sent an email to have the raw DMS data. A PDB structure is available.

In [49]:
...

Ellipsis

# Lysozyme

Publications: https://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1007207

GitHub: *NA* (data taken from supplementary material)

Paper with DMS data for 1 antibody with a 3D structure. DMS is on the antibody !

**The ER values are -ln() transformed, so need to check that the direction of the values is the same as ER values from other publications !!**

In [59]:
paper_dir = working_dir / 'lysozyme_fleishmanlab_2019'

In [60]:
# Reshape the enrichment ratios (ER) of the Excel table into a long-format CSV file.
DMS_raw_data = paper_dir / 'DMS_raw_data'

df_excel = pd.read_excel(DMS_raw_data / 'pcbi.1007207.s010.xlsx')
mutations = df_excel.iloc[2:, 0].values # Order of the mutations in the table

# Columns 1-47 are written as light chain (L), the remaining columns as heavy chain (H)
df_light = df_excel.iloc[:, 1:48]
df_heavy = df_excel.iloc[:, 48:]

with open(DMS_raw_data / 'D441_ER_data.csv', 'w', newline='') as csv_file:
    csv_writer = csv.writer(csv_file)
    csv_writer.writerow(['chain', 'site', 'wildtype', 'mutation', 'ER'])

    # Light-chain positions are written first, followed by heavy-chain positions
    for chain_ID, chain_df in (('L', df_light), ('H', df_heavy)):
        for _, column in chain_df.items():
            # First row = site number, second row = wild-type amino acid, remaining rows = ER values
            site = int(column[0])
            wildtype = column[1]
            csv_writer.writerows(
                [chain_ID, site, wildtype, mutation, float(ER)]
                for mutation, ER in zip(mutations, column.values[2:])
            )

In [61]:
# Raw DMS CSV file name (in DMS_raw_data) -> PDB file name of the corresponding complex (in PDB_structures)
data = {
    'D441_ER_data.csv':'D441_1mlc.pdb',
}

# The DMS was performed on the antibody, so the antibody side of the interface is extracted
write_interface_DMS_data_files(paper_dir, data, DMS_data_col_name='ER', interface_to_return='antibody')

D441_1mlc: 340/420 data points.


# Nerve growth factor (NGF)

Publication: https://analyticalsciencejournals.onlinelibrary.wiley.com/doi/full/10.1002/bit.26706

GitHub: *NA* (data in supplementary material)

DMS data is available, but we need to understand how they generated the heatmaps from the raw data; it's not clear.

In [53]:
...

Ellipsis

# TNFaR

Publication: https://www.sciencedirect.com/science/article/pii/S0006291X12020384

GitHub: *NA*

Could try to ask for the raw DMS data, but it's a joint publication with a pharma company, so they are unlikely to share it.

In [54]:
...

Ellipsis

# Other

Publications with DMS data for multiple antigens at the same time

Publication: https://pubmed.ncbi.nlm.nih.gov/26296891/

GitHub: *NA* (data in supplementary materials)

DMS data is available, but we need to understand how they generated the heatmaps from the raw data; it's not clear. 3 antibodies in total with Ab-Ag structures available.

In [55]:
...

Ellipsis

Publication: https://www.jbc.org/article/S0021-9258(20)44841-0/fulltext

GitHub: *NA* (data as heatmaps in main paper)

DMS data for 2 antibodies targeting VEGF and angiopoietin, each with an available 3D structure.

In [56]:
...

Ellipsis

Publications: https://sci-hub.st/10.1016/j.bbrc.2012.10.066

...

Publication: https://pubmed.ncbi.nlm.nih.gov/33859386/

Should have a look, it contains a dataset of ~ 9000 mutations of Trastuzumab's VH-CDR3 (which targets HER2).

DL47

Publication: https://www.tandfonline.com/doi/full/10.1080/19420862.2020.1829335

Title: Beyond affinity: selection of antibody variants with optimal biophysical properties and reduced immunogenicity from mammalian display libraries.

GitHub: *NA* (data in supplementary material)

Not DMS data: multiple mutations are introduced at the same time, and mutations are not introduced at all positions, so the scan is not "deep".

DL49

Publication: https://www.sciencedirect.com/science/article/abs/pii/S0022283614005750?via%3Dihub

Title: Conformation-dependent epitopes recognized by prion protein antibodies probed using mutational scanning and deep sequencing.

GitHub: 

About prion protein-binding antibodies. Mutations were introduced into the prion protein, i.e., the antigen, and the binding affinities were then determined. However, I could not find detailed affinity values.

DL50

Publication: https://www-sciencedirect-com.ezproxy.ulb.ac.be/science/article/pii/S0022283614005191#s0020

Title: Precise and efficient antibody epitope determination through library design, yeast display and next-generation sequencing.

GitHub:

About an epitope-identifying method in which the epitope residues are determined by DMS and observation of the affinity change. Only the Kd values of the wild-type antibodies (targeting S. aureus antigens) are shown in Table S4. I could not find Kd or Kd-like values for all the DMS variants.

DL51

Publication: https://www.jbc.org/article/S0021-9258(20)49521-3/fulltext#supplementaryMaterial

Title: Rapid fine conformational epitope mapping using comprehensive mutagenesis and deep sequencing.

GitHub: NA

Experimental epitope-mapping method. Affinity change values are formatted in heatmaps. They defined their own "fitness metric".

DL!52

Publication: https://analyticalsciencejournals-onlinelibrary-wiley-com.ezproxy.ulb.ac.be/doi/full/10.1002/bit.26706

Title: Pro region engineering of nerve growth factor by deep mutational scanning enables a yeast platform for conformational epitope mapping of anti-NGF monoclonal antibodies.

GitHub: https://github.com/JKlesmith/Deep_Sequencing_Analysis

Study of tanezumab, a humanized anti-NGF mAb.

DL53

Publication: https://www.mcponline.org/article/S1535-9476(20)33088-7/fulltext#supplementaryMaterial

Title: Proteome-wide Epitope Mapping of Antibodies Using Ultra-dense Peptide Arrays

No DMS data

DL!54

Publication: https://journals.plos.org/plospathogens/article?id=10.1371/journal.ppat.1009453

Title: A human coronavirus evolves antigenically to escape antibody immunity

GitHub: https://github.com/jbloomlab/CoV_229E_antigenic_drift

DL!55

Publication: https://journals.asm.org/doi/epub/10.1128/jvi.01291-19

Title: Deep Mutational Scanning Comprehensively Maps How Zika Envelope Protein Mutations Affect Viral Growth and Antibody Escape

GitHub: https://github.com/jbloomlab/ZIKV_DMS_with_EvansLab

DL!56

Publication: https://journals.plos.org/plospathogens/article?id=10.1371/journal.ppat.1006271#sec016

Title: Complete mapping of viral escape from neutralizing antibodies

GitHub: https://github.com/mbdoud/mutational_antigenic_profiling

DL!57

Publication: https://www.pnas.org/doi/abs/10.1073/pnas.1806133115?url_ver=Z39.88-2003&rfr_id=ori%3Arid%3Acrossref.org&rfr_dat=cr_pub++0pubmed

Title: Deep mutational scanning of hemagglutinin helps predict evolutionary fates of human H3N2 influenza variants

GitHub: https://github.com/jbloomlab/Perth2009-DMS-Manuscript

DL58

Publication: https://www.science.org/doi/epdf/10.1126/science.aaz5143

Title: Different genetic barriers for resistance to HA stem antibodies in influenza H3 and H1 viruses.

DL!59

Publication: https://www.cell.com/cell-host-microbe/fulltext/S1931-3128(17)30204-4?_returnURL=https%3A%2F%2Flinkinghub.elsevier.com%2Fretrieve%2Fpii%2FS1931312817302044%3Fshowall%3Dtrue

Title: Diversity of Functionally Permissive Sequences in the Receptor-Binding Site of Influenza Hemagglutinin

DL!60

Publication: https://www.cell.com/cell-host-microbe/fulltext/S1931-3128(17)30196-8?_returnURL=https%3A%2F%2Flinkinghub.elsevier.com%2Fretrieve%2Fpii%2FS1931312817301968%3Fshowall%3Dtrue#supplementaryMaterial

Title: Comprehensive mapping of HIV-1 escape from a broadly neutralizing antibody.

DL!62

Publication: https://www.cell.com/cell/fulltext/S0092-8674(20)31003-5?_returnURL=https%3A%2F%2Flinkinghub.elsevier.com%2Fretrieve%2Fpii%2FS0092867420310035%3Fshowall%3Dtrue

Title: Deep Mutational Scanning of SARS-CoV-2 Receptor Binding Domain Reveals Constraints on Folding and ACE2 Binding

GitHub: https://github.com/jbloomlab/SARS-CoV-2-RBD_DMS

DL!63

Publication: https://www.nature.com/articles/s41586-021-03807-6#Sec7

Title: SARS-CoV-2 RBD antibodies that maximize breadth and resistance to escape.

GitHub: https://github.com/jbloomlab/SARS-CoV-2-RBD_MAP_Vir_mAbs

DMS data is located in the GitHub sub-directory: results/supp_data

DL!64

Publication: https://www.cell.com/cell-reports-medicine/fulltext/S2666-3791(21)00071-9?_returnURL=https%3A%2F%2Flinkinghub.elsevier.com%2Fretrieve%2Fpii%2FS2666379121000719%3Fshowall%3Dtrue

Title: Complete map of SARS-CoV-2 RBD mutations that escape the monoclonal antibody LY-CoV555 and its cocktail with LY-CoV016

GitHub: https://jbloomlab.github.io/SARS-CoV-2-RBD_MAP_LY-CoV555

DL!65

Publication: https://www.science.org/doi/full/10.1126/science.abf9302?

Title: Prospective mapping of viral mutations that escape antibodies used to treat COVID-19

GitHub: https://github.com/jbloomlab/SARS-CoV-2-RBD_MAP_clinical_Abs

DMS data for antibody LY-CoV016 and REGN10933 and REGN10987.

In [None]:
paper_dir = working_dir / 'COVID-19_JBloom_2021e'
DMS_raw_data_dir = paper_dir / "DMS_raw_data"

# Split the combined raw-data table into one CSV file per distinct value of
# the "condition" column; each file is written next to the combined file.
df = pd.read_csv(DMS_raw_data_dir / "REGN_and_LY-CoV016_raw_data.csv")
for name in df["condition"].drop_duplicates():
    per_condition_path = DMS_raw_data_dir / f"{name}_raw_data.csv"
    df.loc[df["condition"] == name].to_csv(per_condition_path)

# NOTE(review): the disabled mapping below uses file names (EDE1-*, MZ4, SIgN-3C, ZV-67)
# that do not match the REGN / LY-CoV016 input read above — presumably pasted from
# another dataset's cell; replace with this paper's files before enabling.
# data = {
#     'summary_EDE1-C8-1800-meanmutdiffsel.csv':'EDE1-C8_5lbs.pdb',
#     'summary_EDE1-C10-300-meanmutdiffsel.csv':'EDE1-C10_5h37.pdb',
#     'summary_MZ4-4800-medianmutdiffsel.csv':'MZ4_6niu.pdb',
#     'summary_SIgN-3C-20000-meanmutdiffsel.csv':'SIgN-3C_7bua.pdb',
#     'summary_ZV-67-40000-meanmutdiffsel.csv':'ZV-67_5kvg.pdb'
# }

# write_interface_DMS_data_files(paper_dir, data, DMS_data_col_name='mutdiffsel')

DL!66 (already included in DMS_antibody_dataset/COVID-19_JBloom_2021d)

Publication: https://www.nature.com/articles/s41586-021-03817-4#data-availability

Title: Broad sarbecovirus neutralization by a human monoclonal antibody.

GitHub: https://github.com/jbloomlab/SARS-CoV-2-RBD_MAP_Vir_mAbs

DL!67 (already included in DMS_antibody_dataset/COVID-19_JBloom_2021c)

Publication: https://www.cell.com/cell-host-microbe/fulltext/S1931-3128(20)30624-7

Title: Complete Mapping of Mutations to the SARS-CoV-2 Spike Receptor-Binding Domain that Escape Antibody Recognition

GitHub: https://github.com/jbloomlab/SARS-CoV-2-RBD_MAP_Crowe_antibodies

Ten antibodies binding to corresponding Coronavirus spike protein RBD (9 for ACE2 RBD in SARS-CoV-2; 1 for rCR3022 RBD). 

DL!68

Publication: https://www.nature.com/articles/s41564-021-00972-2

Title: Genetic and structural basis for SARS-CoV-2 variant neutralization by a two-antibody cocktail

GitHub: https://github.com/jbloomlab/SARS-CoV-2-RBD_MAP_AZ_Abs

DL69

Publication: https://www.frontiersin.org/journals/microbiology/articles/10.3389/fmicb.2021.698365/full

Title: Comprehensive Deep Mutational Scanning Reveals the Immune-Escaping Hotspots of SARS-CoV-2 Receptor-Binding Domain Targeting Neutralizing Antibodies

They have DMS data but the values were computationally predicted, not from experimental tests.

DL70

Publication: https://academic.oup.com/bioinformatics/article/37/22/4041/6295884?login=false#405570562

Title: Humanization of antibodies using a machine learning approach on large-scale repertoire data 

This is a methodology paper from Charlotte Deane's lab about a classifier to discriminate human antibody sequences from non-human ones. Users can introduce mutations into an Ab sequence and then score its level of humanization.

DL71

Publication: https://academic.oup.com/bioinformatics/article/36/13/3996/5823885?login=false

Title: Learning context-aware structural representations to predict antigen and antibody binding interfaces

PECAN's paper, which is a paratope and epitope interface predictor.

DL!72

Publication: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7223817/

Title: A topology-based network tree for the prediction of protein–protein binding affinity changes following mutation

An in-silico ddG predictor. They used the AB-Bind and SKEMPI datasets, which are interesting.

DL!73

Publication: https://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-020-3439-4#availability-of-data-and-materials

Title: Variant effect predictions capture some aspects of deep mutational scanning experiments

Benchmarking analysis of several SAV predictors. Their test sets are given at: https://data.mendeley.com/datasets/2rwrkp7mfk/1. These are DMS data, but for general proteins, so we need to further check whether there is any Ab DMS data.

DL!74

Publication: https://onlinelibrary.wiley.com/doi/epdf/10.1002/prot.26184

Title: Predicting mutant outcome by combining deep mutational scanning and machine learning.

A mutation outcome predictor. One of the datasets they used is immunoglobulin-related, namely IGBPG (immunoglobulin G-binding β1 domain of streptococcal protein G). Need to check whether this IGBPG is an antigen or a non-antigen Ab-binding protein.

DL!75

Publication: https://www.biorxiv.org/content/10.1101/2020.07.14.201475v4.full

Title: MAVE-NN: quantitative modeling of genotype–phenotype maps as information bottlenecks.

They used the GB1 DMS dataset, which is similar to the one above.

DL76

Publication: https://www.mdpi.com/2073-4468/9/2/12

Title: A Review of Deep Learning Methods for Antibodies

A review. In "datasets/benchmarks" sections, the authors list two DMS-related datasets: AB-Bind and SKEMPI, which were already covered by the above papers. 

DL!77

Publication: https://www.nature.com/articles/s41467-021-22732-w

Title: Protein design and variant prediction using autoregressive generative models

GitHub: https://github.com/debbiemarkslab/SeqDesign

DL?78

Publication: https://www.nature.com/articles/s41551-021-00699-9#data-availability

Title: Optimization of therapeutic antibodies by predicting antigen specificity from antibody sequence via deep learning

GitHub: https://github.com/dahjan/DMS_opt

The authors said "The raw and analysed datasets generated during the study are too large to be publicly shared".

DL?79

Publication: https://academic.oup.com/bioinformatics/article/36/7/2126/5645171?login=false#409064291

Title: Antibody complementarity determining region design using high-capacity machine learning 

GitHub: https://github.com/gifford-lab/antibody-2019

A predictor of phage display panning enrichment. I am not sure if this is related to DMS.

DL80

Publication: https://www.biorxiv.org/content/10.1101/2021.07.09.450648v2.full

Title: Language models enable zero-shot prediction of the effects of mutations on protein function

A predictor of mutation outcome, like #74 and #75. The dataset is the same as in #78.

DL81

Publication: https://www.pnas.org/doi/full/10.1073/pnas.2016239118

Title: Biological structure and function emerge from scaling unsupervised learning to 250 million protein sequences

A large-scale unsupervised learning on protein sequences. They applied the embeddings to Prediction of Mutational Effects. The two datasets they used are: 1) the same as in #78; 2) in this paper: https://www.cell.com/cms/10.1016/j.cels.2017.11.003/attachment/e0dd3c5d-9f2a-4b4c-a403-4aa4f5716717/mmc1 (see Table S1) 
which are already covered by the above papers.