In [1]:
# (Re)load the autoreload extension and reload imported modules before each cell executes,
# so edits to the local helper modules are picked up without restarting the kernel
%reload_ext autoreload
%autoreload 2

##### Define ChEMBL and Papyrus versions

In [2]:
# ChEMBL release: used below in the assay file names and as the version passed to the ChEMBL query
chembl_version = '31'
# Papyrus release and flavor; not referenced by the cells shown in this part of the notebook
papyrus_version = '05.5'
papyrus_flavor = 'nostereo'

##### Define annotation round

In [3]:
# Round produced by this notebook; the round before it supplies the annotated assays that get curated
annotation_round = 2
previous_annotation_round = annotation_round - 1

##### Define data directory

In [4]:
from data_path import get_data_path
# Directory the annotated and rejected assay files are read from / written to below
data_dir = get_data_path()

##### Import analysis functions

In [5]:
import ast
import os

import numpy as np
import pandas as pd

from preprocessing import obtain_chembl_data,combine_chembl_papyrus_mutants,merge_chembl_papyrus_mutants
from annotation import *

#### Curate round 1 annotation (semi-manual curation)

In [6]:
# Columns containing lists are stored as text in the CSV and must be parsed back into lists
def convert_to_list(s):
    """
    Turn the text of a list-valued CSV cell into a Python object.

    :param s: cell text holding a Python literal, e.g. "['T790M', 'C797S']"
    :return: the evaluated literal; an empty list when the cell is blank
    """
    # A blank cell has nothing to evaluate and would make literal_eval raise a SyntaxError
    if isinstance(s, str) and not s.strip():
        return []
    return ast.literal_eval(s)
# Both list-valued columns go through the same parser while the file is being read
converters = dict.fromkeys(['aa_change', 'mutants'], convert_to_list)

# Annotated assays of the previous round (the file is tab-separated despite its .csv extension)
previous_round_assays = pd.read_csv(
    os.path.join(data_dir, f'chembl{chembl_version}_annotated_assays_round{previous_annotation_round}.csv'),
    converters=converters,
    sep='\t',
)
previous_round_assays

Unnamed: 0,assay_id,description,accession,sequence,mutation,aa_change,mutants,target_id,Protein_Type
0,83907,In vivo inhibitory activity against human Hepa...,Q9Y251,MLLRSKPALPPPLMLLLLGPLGPLSPGALPRPAQAQDVVDLDFFTQ...,,[],[],Q9Y251_WT,WT
1,154606,Inhibitory activity against Palmitoyl-CoA oxid...,P07872,MNPDLRKERASATFNPELITHILDGSPENTRRRREIENLILNDPDF...,,[],[],P07872_WT,WT
2,51352,Inhibition of cytochrome P450 1A2 of isolated ...,P05177,MALSQSVPFSATELLLASAIFCLVFWVLKGLRPRVPKGLKSPPEPW...,,[],[],P05177_WT,WT
3,51895,Inhibition of cytochrome P450 3A4 of isolated ...,P08684,MALIPDLAMETWLLLAVSLVLLYLYGTHSHGLFKKLGIPGPTPLPF...,,[],[],P08684_WT,WT
4,51528,Inhibition of cytochrome P450 2C9 of isolated ...,P11712,MDSLVVLVLCLSCLLLLSLWRQSSGRGKLPPGPTPLPVIGNILQIG...,,[],[],P11712_WT,WT
...,...,...,...,...,...,...,...,...,...
376228,2136225,KinomeScan assay: inhibition of LATS2,Q9NRM7,MRPKTFPATTYSGNSRQRLQEIREGLKQPSKSSVQGLPAGPNSDTS...,,[],[],Q9NRM7_WT,WT
376229,2136385,KinomeScan assay: inhibition of ROCK1,Q13464,MSTGDSFETRFEKMDNLLRDPKSEVNSDCLLDGLDALVYDLDFPAL...,,[],[],Q13464_WT,WT
376230,2136386,KinomeScan assay: inhibition of ROCK2,O75116,MSRPPPTGKMPGAPETAPGDGAGASRQRKLEALIRDPRSPINVESL...,,[],[],O75116_WT,WT
376231,2136430,KinomeScan assay: inhibition of TGFBR2,P37173,MGRGLLRGLWPLHIVLWTRIASTIPPHVQKSVNNDMIVTDNNGAVK...,,[],[],P37173_WT,WT


In [7]:
# Pull extra ChEMBL context for each assay to trace where local and ChEMBL annotations diverge
def get_assay_info(assay_data:pd.DataFrame, chembl_version: str):
    """
    Extend annotated assays with the ChEMBL assay, target and component-sequence records.

    :param assay_data: annotated assays; must provide the 'assay_id' and 'accession' merge keys
    :param chembl_version: ChEMBL release handed to the ChEMBL query
    :return: assay_data left-merged with the query result; every input row is kept and may be
             repeated when ChEMBL returns several matching records
    """
    # NOTE(review): chembl_downloader is not imported explicitly in this notebook; presumably it
    # reaches scope through `from annotation import *` - confirm
    assay_target_sql = """
            SELECT *
            FROM assays
                INNER JOIN target_dictionary
                    ON assays.tid = target_dictionary.tid
                INNER JOIN target_components
                    ON target_dictionary.tid = target_components.tid
                INNER JOIN component_sequences
                    ON target_components.component_id = component_sequences.component_id
            """

    chembl_records = chembl_downloader.query(assay_target_sql,
                                             prefix=['mutants-in-pcm', 'chembl'],
                                             version=chembl_version)

    # Columns present on both sides (e.g. description, sequence) come back with _x / _y suffixes
    return pd.merge(assay_data, chembl_records, on=['assay_id','accession'], how='left')

previous_round_assays_ExtraInfo = get_assay_info(previous_round_assays, chembl_version)

##### False positives (i.e. annotated by us but not by ChEMBL)

False positives were annotated by manually checking the new annotations and the "rescued" UNDEFINED
MUTATION assay-accession pairs (651 and 182 in round 1). The assay descriptions were checked against the annotated
mutations. Assay-accession pairs that were considered false positives were collected in
`C:\Users\gorostiolam\Documents\Gorostiola Gonzalez, Marina\PROJECTS\6_Mutants_PCM\DATA\2_Analysis\0_annotation_analysis\round_1\new_annotations_incorrect.xlsx`
and given a "reason" flag describing why the mutation was likely annotated. This should help categorize the
false positives and suggest improvements to the annotation pipeline. The different reasons were further grouped into
more general groups in the "group_reason" flag.

In [8]:
# Newly annotated assays: a mutant target was assigned here although ChEMBL lists no mutation
previous_round_assays_ExtraInfo.loc[
    lambda assays: assays['mutation'].isnull() & ~assays['target_id'].str.contains('_WT')
]

Unnamed: 0,assay_id,description_x,accession,sequence_x,mutation,aa_change,mutants,target_id,Protein_Type,doc_id,...,homologue,component_id,component_type,sequence_y,sequence_md5sum,description_y,tax_id,organism,db_source,db_version
3103,226874,Ratio of IC50 value against hNK1 receptor to t...,P25103,MDNVLPVDSDLSPNISTNTSEPNQFVQPAWQIVLWAAAYTVIVVTS...,,[Q165A],[Q165A],P25103_Q165A,Q165A,13275,...,0,235,PROTEIN,MDNVLPVDSDLSPNISTNTSEPNQFVQPAWQIVLWAAAYTVIVVTS...,2ef7318627a5eee5d4155c6f70c38a09,Substance-P receptor,9606,Homo sapiens,SWISS-PROT,2022_02
5557,153379,Transcriptional activation activity on human I...,Q07869,MVDTESPLCPLSPLEAGDLESPLSEEFLQEMGNIQEISQSIGEDSS...,,[I272F],[I272F],Q07869_I272F,I272F,5772,...,0,245,PROTEIN,MVDTESPLCPLSPLEAGDLESPLSEEFLQEMGNIQEISQSIGEDSS...,f758f3566fa5c02ea133ec7a89a5d06d,Peroxisome proliferator-activated receptor alpha,9606,Homo sapiens,SWISS-PROT,2022_02
5558,153380,Transcriptional activation activity on human T...,Q07869,MVDTESPLCPLSPLEAGDLESPLSEEFLQEMGNIQEISQSIGEDSS...,,[T279M],[T279M],Q07869_T279M,T279M,5772,...,0,245,PROTEIN,MVDTESPLCPLSPLEAGDLESPLSEEFLQEMGNIQEISQSIGEDSS...,f758f3566fa5c02ea133ec7a89a5d06d,Peroxisome proliferator-activated receptor alpha,9606,Homo sapiens,SWISS-PROT,2022_02
5559,153557,Ratio of transcriptional activation of I272F m...,Q07869,MVDTESPLCPLSPLEAGDLESPLSEEFLQEMGNIQEISQSIGEDSS...,,[I272F],[I272F],Q07869_I272F,I272F,5772,...,0,245,PROTEIN,MVDTESPLCPLSPLEAGDLESPLSEEFLQEMGNIQEISQSIGEDSS...,f758f3566fa5c02ea133ec7a89a5d06d,Peroxisome proliferator-activated receptor alpha,9606,Homo sapiens,SWISS-PROT,2022_02
5560,153560,Ratio of transcriptional activation of T279M m...,Q07869,MVDTESPLCPLSPLEAGDLESPLSEEFLQEMGNIQEISQSIGEDSS...,,[T279M],[T279M],Q07869_T279M,T279M,5772,...,0,245,PROTEIN,MVDTESPLCPLSPLEAGDLESPLSEEFLQEMGNIQEISQSIGEDSS...,f758f3566fa5c02ea133ec7a89a5d06d,Peroxisome proliferator-activated receptor alpha,9606,Homo sapiens,SWISS-PROT,2022_02
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
371279,2113388,Inhibition of human non-phosphorylated ABL1 F3...,P00519,MLEICLKLVGCKSKKGLSSSSSCYLEEALQRPVASDFEPQGLSEAA...,,[F317L],[F317L],P00519_F317L,F317L,120543,...,0,173,PROTEIN,MLEICLKLVGCKSKKGLSSSSSCYLEEALQRPVASDFEPQGLSEAA...,d24f1ea01ac4c8d96e75dd9550a0fb09,Tyrosine-protein kinase ABL1,9606,Homo sapiens,SWISS-PROT,2022_02
373864,2122982,Inhibition of MLL1 (unknown origin)-mediated H...,Q03164,MAHSCRWRFPARPGTTGGGGGGGRRGLGGAPRQRVPALLLPPGPPV...,,[H3K],[H3K],Q03164_H3K,H3K,120915,...,0,5131,PROTEIN,MAHSCRWRFPARPGTTGGGGGGGRRGLGGAPRQRVPALLLPPGPPV...,f90e5c53fc570d85e49f666ad445dcde,Histone-lysine N-methyltransferase 2A,9606,Homo sapiens,SWISS-PROT,2022_02
374254,2125019,Activation of human PKM2-C424A expressed in Es...,P14618,MSKPHSEAGTAFIQTQQLHAAMADTFLEHMCRLDIDSPPITARNTG...,,[C424A],[C424A],P14618_C424A,C424A,120974,...,0,4349,PROTEIN,MSKPHSEAGTAFIQTQQLHAAMADTFLEHMCRLDIDSPPITARNTG...,b9f341db7edc1c70c1439f8176ea2889,Pyruvate kinase PKM,9606,Homo sapiens,SWISS-PROT,2022_02
374819,2127601,Binding affinity to human partial length MCL1 ...,Q07820,MFGLKRNAVIGLNLYCGGAGLGAGSGGATRPGGRLLATEKEASARR...,,"[D172N, L2S]",[D172N],Q07820_D172N,D172N,121070,...,0,2678,PROTEIN,MFGLKRNAVIGLNLYCGGAGLGAGSGGATRPGGRLLATEKEASARR...,ac05f6a57fb5fffa7a8ddcdf26e1ba8d,Induced myeloid leukemia cell differentiation ...,9606,Homo sapiens,SWISS-PROT,2022_02


In [9]:
# Rescued assays: ChEMBL labels them 'UNDEFINED MUTATION' but a concrete mutant target was assigned here
previous_round_assays_ExtraInfo.loc[
    lambda assays: assays['mutation'].eq('UNDEFINED MUTATION') & ~assays['target_id'].str.contains('_WT')
]

Unnamed: 0,assay_id,description_x,accession,sequence_x,mutation,aa_change,mutants,target_id,Protein_Type,doc_id,...,homologue,component_id,component_type,sequence_y,sequence_md5sum,description_y,tax_id,organism,db_source,db_version
225304,1433633,Potentiation of human CFTR F508del/G551D mutan...,P13569,MQRSPLEKASVVSKLFFSWTRPILRKGYRQRLELSDIYQIPSVDSA...,UNDEFINED MUTATION,[G551D],[G551D],P13569_G551D,G551D,89790,...,0,2370,PROTEIN,MQRSPLEKASVVSKLFFSWTRPILRKGYRQRLELSDIYQIPSVDSA...,f29b8c0a9056a0f7680f3290d259b6ac,Cystic fibrosis transmembrane conductance regu...,9606,Homo sapiens,SWISS-PROT,2022_02
227327,1442036,Inhibition of KIT V559/T670I mutant (unknown o...,P10721,MRGARGAWDFLCVLLLLLRVQTGSSQPSVSPGEPSPPSIHPGKSDL...,UNDEFINED MUTATION,[T670I],[T670I],P10721_T670I,T670I,89287,...,0,255,PROTEIN,MRGARGAWDFLCVLLLLLRVQTGSSQPSVSPGEPSPPSIHPGKSDL...,f753f2b2d9752a30646c83e8630d1df8,Mast/stem cell growth factor receptor Kit,9606,Homo sapiens,SWISS-PROT,2022_02
228138,1435798,Inhibition of wild type HIV1 reverse transcrip...,Q72547,PISPIETVPVKLKPGMDGPKVKQWPLTEEKIKALVEICTEMEKEGK...,UNDEFINED MUTATION,"[Y181I, Y181C]","[Y181C, Y181I]",Q72547_Y181C_Y181I,Y181C;Y181I,89737,...,0,375,PROTEIN,PISPIETVPVKLKPGMDGPKVKQWPLTEEKIKALVEICTEMEKEGK...,347541fa98e55ebf1791ded88f229d8c,Reverse transcriptase/RNaseH,11676,Human immunodeficiency virus 1,TREMBL,2022_02
228447,1440950,Inhibition of EGFR T790M/del746 to 750 mutant ...,P00533,MRPSGTAGAALLALLAALCPASRALEEKKVCQGTSNKLTQLGTFED...,UNDEFINED MUTATION,[T790M],[T790M],P00533_T790M,T790M,89694,...,0,147,PROTEIN,MRPSGTAGAALLALLAALCPASRALEEKKVCQGTSNKLTQLGTFED...,99d03b567dbcdd7fb0dd21fb40d5d283,Epidermal growth factor receptor,9606,Homo sapiens,SWISS-PROT,2022_02
228673,1441757,Corrector activity at human bronchial epitheli...,P13569,MQRSPLEKASVVSKLFFSWTRPILRKGYRQRLELSDIYQIPSVDSA...,UNDEFINED MUTATION,[G551D],[G551D],P13569_G551D,G551D,89697,...,0,2370,PROTEIN,MQRSPLEKASVVSKLFFSWTRPILRKGYRQRLELSDIYQIPSVDSA...,f29b8c0a9056a0f7680f3290d259b6ac,Cystic fibrosis transmembrane conductance regu...,9606,Homo sapiens,SWISS-PROT,2022_02
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
374735,2127220,Binding affinity to 6His-FLAG-Tev-BRDT (1 to 3...,Q58F21,MSLPSRQTAIIVNPPPPEYINTKKNGRLTNQLQYLQKVVLKDLWKH...,UNDEFINED MUTATION,"[Y309A, Y66A]","[Y66A, Y309A]",Q58F21_Y66A_Y309A,Y66A;Y309A,121054,...,0,5553,PROTEIN,MSLPSRQTAIIVNPPPPEYINTKKNGRLTNQLQYLQKVVLKDLWKH...,2ac9e601c29e15829546463a6894420e,Bromodomain testis-specific protein,9606,Homo sapiens,SWISS-PROT,2022_02
374749,2127280,Inhibition of 6His-FLAG-Tev-BRDT (1 to 397 res...,Q58F21,MSLPSRQTAIIVNPPPPEYINTKKNGRLTNQLQYLQKVVLKDLWKH...,UNDEFINED MUTATION,"[Y309A, Y66A]","[Y66A, Y309A]",Q58F21_Y66A_Y309A,Y66A;Y309A,121056,...,0,5553,PROTEIN,MSLPSRQTAIIVNPPPPEYINTKKNGRLTNQLQYLQKVVLKDLWKH...,2ac9e601c29e15829546463a6894420e,Bromodomain testis-specific protein,9606,Homo sapiens,SWISS-PROT,2022_02
374751,2127282,Inhibition of 6His-Thr-BRD3 (1 to 435 residues...,Q15059,MSTATTVAPAGIPATPGPVNPPPPEVSNPSKPGRKTNQLQYMQNVV...,UNDEFINED MUTATION,"[Y348A, Y73A]","[Y73A, Y348A]",Q15059_Y73A_Y348A,Y73A;Y348A,121056,...,0,5554,PROTEIN,MSTATTVAPAGIPATPGPVNPPPPEVSNPSKPGRKTNQLQYMQNVV...,b946c4c397f75fdd24b1c5acba15efa0,Bromodomain-containing protein 3,9606,Homo sapiens,SWISS-PROT,2022_02
374752,2127283,Inhibition of 6His-Thr-BRD2 (1 to 473 residues...,P25440,MLQNVTPHNKLPGEGNAGLLGLGPEAAAPGKRIRKPSLLYEGFESP...,UNDEFINED MUTATION,"[Y386A, Y113A]","[Y113A, Y386A]",P25440_Y113A_Y386A,Y113A;Y386A,121056,...,0,5013,PROTEIN,MLQNVTPHNKLPGEGNAGLLGLGPEAAAPGKRIRKPSLLYEGFESP...,5dd925de15997e9e3938ab23517cdcfc,Bromodomain-containing protein 2,9606,Homo sapiens,SWISS-PROT,2022_02


##### False negatives (i.e. annotated by ChEMBL but not by us)

False negatives were annotated by classifying all rejected ChEMBL mutations into groups (the
"rejection_flag") defined from an initial visual inspection and from discussions with the variant annotation team at
ChEMBL. The relevance of, and confidence in, each category was then assessed by further visual inspection.

In [10]:
# Goal of this cell: count the ChEMBL-annotated mutations that fail sequence validation (or exceptions).
# Start from the assays ChEMBL annotated with a defined mutation (neither missing nor 'UNDEFINED MUTATION')
previous_round_assays_original = previous_round_assays_ExtraInfo.loc[
    lambda assays: assays['mutation'].ne('UNDEFINED MUTATION')
].dropna(subset=['mutation'])

def check_original_valid(row):
    """
    Tell whether every mutation ChEMBL assigned to the assay passed validation.

    :param row: mapping/row exposing 'mutation' (comma-separated string) and 'mutants'
                (iterable of validated mutations)
    :return: True when each ChEMBL mutation appears in 'mutants', False otherwise
    """
    chembl_mutations = row['mutation'].split(',')
    validated = list(row['mutants'])
    for chembl_mutation in chembl_mutations:
        if chembl_mutation not in validated:
            return False
    return True

# Keep the ChEMBL-annotated assays for which at least one original mutation was not validated
original_is_valid = previous_round_assays_original.apply(check_original_valid, axis=1)
previous_round_assays_original_rejected = previous_round_assays_original[~original_is_valid]
previous_round_assays_original_rejected

Unnamed: 0,assay_id,description_x,accession,sequence_x,mutation,aa_change,mutants,target_id,Protein_Type,doc_id,...,homologue,component_id,component_type,sequence_y,sequence_md5sum,description_y,tax_id,organism,db_source,db_version
88483,317562,Binding affinity for mutant T877A Androgen rec...,P10275,MEVQLGLGRVYPRPPSKTYRGAFQNLFQSVREVIQNPGPRHPEAAS...,T878A,[T877A],[],P10275_WT,WT,20674,...,0,187,PROTEIN,MEVQLGLGRVYPRPPSKTYRGAFQNLFQSVREVIQNPGPRHPEAAS...,0e8126a2b4e057dff4e5e448802e64bf,Androgen receptor,9606,Homo sapiens,SWISS-PROT,2022_02
94476,322454,Inhibitory concentration against IL-3 independ...,P10721,MRGARGAWDFLCVLLLLLRVQTGSSQPSVSPGEPSPPSIHPGKSDL...,D816V,[D816V],[],P10721_WT,WT,30197,...,0,255,PROTEIN,MRGARGAWDFLCVLLLLLRVQTGSSQPSVSPGEPSPPSIHPGKSDL...,f753f2b2d9752a30646c83e8630d1df8,Mast/stem cell growth factor receptor Kit,9606,Homo sapiens,SWISS-PROT,2022_02
105052,426971,Ratio of Ki to Km for HCV NS5B RNA polymerase ...,Q8JXU8,RTEEAIYQCCDLDPQARVAIRSLTERLYVGGPLTNSRGENCGYRRX...,S282T,"[S5B, S282T]",[],Q8JXU8_WT,WT,35099,...,0,3671,PROTEIN,RTEEAIYQCCDLDPQARVAIRSLTERLYVGGPLTNSRGENCGYRRX...,3f5012b8af293164261075aeb623fec5,RNA-directed RNA polymerase,11103,Hepatitis C virus,TREMBL,2022_02
105060,426978,Activity of HCV NS5B RNA polymerase S282T asse...,Q8JXU8,RTEEAIYQCCDLDPQARVAIRSLTERLYVGGPLTNSRGENCGYRRX...,S282T,"[S5B, S282T]",[],Q8JXU8_WT,WT,35099,...,0,3671,PROTEIN,RTEEAIYQCCDLDPQARVAIRSLTERLYVGGPLTNSRGENCGYRRX...,3f5012b8af293164261075aeb623fec5,RNA-directed RNA polymerase,11103,Hepatitis C virus,TREMBL,2022_02
107598,445385,Agonist activity at GluR1 I687A mutant express...,P42261,MQHIFAFFCTGFLGAVVGANFPNNIQIGGLFPNQQSQEHAAFRFAL...,I705A,[I687A],[],P42261_WT,WT,36637,...,0,4996,PROTEIN,MQHIFAFFCTGFLGAVVGANFPNNIQIGGLFPNQQSQEHAAFRFAL...,1875787bc6f3513d65588f76d95dede6,Glutamate receptor 1,9606,Homo sapiens,SWISS-PROT,2022_02
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
374141,2124228,Stabilization of TTR V3OM mutant (unknown orig...,P02766,MASHRLLLLCLAGLVFVSEAGPTGTGESKCPLMVKVLDAVRGSPAI...,V50M,[V3O],[],P02766_WT,WT,120955,...,0,1517,PROTEIN,MASHRLLLLCLAGLVFVSEAGPTGTGESKCPLMVKVLDAVRGSPAI...,6b2e8f183e8bc0d7efc3e9933236616e,Transthyretin,9606,Homo sapiens,SWISS-PROT,2022_02
374142,2124230,Stabilization of TTR V3OM mutant (unknown orig...,P02766,MASHRLLLLCLAGLVFVSEAGPTGTGESKCPLMVKVLDAVRGSPAI...,V50M,[V3O],[],P02766_WT,WT,120955,...,0,1517,PROTEIN,MASHRLLLLCLAGLVFVSEAGPTGTGESKCPLMVKVLDAVRGSPAI...,6b2e8f183e8bc0d7efc3e9933236616e,Transthyretin,9606,Homo sapiens,SWISS-PROT,2022_02
374144,2124234,Binding affinity to TTR V30M mutant (unknown o...,P02766,MASHRLLLLCLAGLVFVSEAGPTGTGESKCPLMVKVLDAVRGSPAI...,V50M,[V30M],[],P02766_WT,WT,120955,...,0,1517,PROTEIN,MASHRLLLLCLAGLVFVSEAGPTGTGESKCPLMVKVLDAVRGSPAI...,6b2e8f183e8bc0d7efc3e9933236616e,Transthyretin,9606,Homo sapiens,SWISS-PROT,2022_02
376126,2135003,Protac activity at CRBN/AR T878A mutant in hum...,Q96SW2,MAGEGDQQDAAHNMGNHLPLLPAESEEEDEMEVEDQDSKEAKKPNI...,T878A,[T878A],[],Q96SW2_WT,WT,121349,...,0,16927,PROTEIN,MAGEGDQQDAAHNMGNHLPLLPAESEEEDEMEVEDQDSKEAKKPNI...,6322efb1bd9f9859167c6800b30ed7ea,Protein cereblon,9606,Homo sapiens,SWISS-PROT,2022_02


In [11]:
# Classify rejected annotated variants into categories
def give_rejection_flag(row):
    """
    Assign a rejection category to an assay whose ChEMBL mutation was not validated.

    :param row: mapping/row exposing 'mutation' (comma-separated ChEMBL mutations), 'aa_change'
                (mutations extracted from the description) and 'mutants' (validated mutations);
                'target_type' and 'curated_by' are read only in the branches that need them
    :return: one of 'original_deletion', 'original_undefined', 'no_extraction',
             'original_shift_exception', 'protein_family', 'original_not_valid',
             'original_exception_<curated_by>' or 'other'
    """
    original_mutations = row['mutation'].split(',')
    extracted_mutations = list(row['aa_change'])
    valid_mutations = list(row['mutants'])

    original_has_deletion = any('del' in om for om in original_mutations)

    # Category 1: no amino acid change was extracted from the description with the regular expression
    if not extracted_mutations:
        # Category 1A: original mutation is a deletion
        if original_has_deletion:
            return 'original_deletion'
        # Category 1B: original mutation is undefined (THIS SHOULD BE ZERO BECAUSE WE FILTER THEM OUT)
        if any('UNDEFINED MUTATION' in om for om in original_mutations):
            return 'original_undefined'
        # Category 1C: other reasons (e.g. tricky definition in description)
        return 'no_extraction'

    # Category 2: amino acid changes were extracted but they are not (all) valid
    # Category 2A: original mutation contains a deletion
    if original_has_deletion:
        return 'original_deletion'

    fewer_valid_than_original = len(valid_mutations) < len(original_mutations)

    # Category 2B: original and extracted mutations match in all but sequence position (e.g. possible
    # sequence renumbering of which ChEMBL was aware of). Compare the residue pair (first and last
    # character) separately from the position (everything in between).
    extracted_residue_pairs = [f'{em[0]}{em[-1]}' for em in extracted_mutations]
    extracted_positions = [em[1:-1] for em in extracted_mutations]
    residues_match = all(f'{om[0]}{om[-1]}' in extracted_residue_pairs for om in original_mutations)
    positions_match = all(om[1:-1] in extracted_positions for om in original_mutations)
    if residues_match and not positions_match and fewer_valid_than_original:
        return 'original_shift_exception'

    # Category 2C: original and extracted mutations match, but they are not valid (e.g. wrong accession)
    if any(om in extracted_mutations for om in original_mutations) and fewer_valid_than_original:
        if row['target_type'] == 'PROTEIN FAMILY':
            return 'protein_family'
        return 'original_not_valid'

    # Category 2D: original and extracted mutations do not match at all (e.g. tricky definition in
    # description that was known in ChEMBL as an exception)
    if all(om not in extracted_mutations for om in original_mutations):
        return f'original_exception_{row["curated_by"]}'

    # Anything left over is labelled explicitly; these rows used to fall through and return None,
    # which silently removed them from the per-category counts
    return 'other'

# Classify each rejected assay. assign() returns a new frame, so the flag column is not written into a
# slice of previous_round_assays_original (avoids pandas' SettingWithCopyWarning) and the cell can be re-run
previous_round_assays_original_rejected = previous_round_assays_original_rejected.assign(
    rejection_flag=previous_round_assays_original_rejected.apply(give_rejection_flag, axis=1)
)
# Write out file with false negatives.
# NOTE: an existing file is left untouched, so delete it first to export a refreshed classification
false_negative_file = os.path.join(data_dir,f'chembl{chembl_version}_rejected_assays_round{previous_annotation_round}.csv')
if not os.path.exists(false_negative_file):
    previous_round_assays_original_rejected.to_csv(false_negative_file, sep='\t', index=False)

In [12]:
# Inspect the assays of one rejection category (change the flag value to browse another one)
previous_round_assays_original_rejected.loc[
    lambda rejected: rejected['rejection_flag'].eq('original_not_valid')
]

Unnamed: 0,assay_id,description_x,accession,sequence_x,mutation,aa_change,mutants,target_id,Protein_Type,doc_id,...,component_id,component_type,sequence_y,sequence_md5sum,description_y,tax_id,organism,db_source,db_version,rejection_flag
94476,322454,Inhibitory concentration against IL-3 independ...,P10721,MRGARGAWDFLCVLLLLLRVQTGSSQPSVSPGEPSPPSIHPGKSDL...,D816V,[D816V],[],P10721_WT,WT,30197,...,255,PROTEIN,MRGARGAWDFLCVLLLLLRVQTGSSQPSVSPGEPSPPSIHPGKSDL...,f753f2b2d9752a30646c83e8630d1df8,Mast/stem cell growth factor receptor Kit,9606,Homo sapiens,SWISS-PROT,2022_02,original_not_valid
105052,426971,Ratio of Ki to Km for HCV NS5B RNA polymerase ...,Q8JXU8,RTEEAIYQCCDLDPQARVAIRSLTERLYVGGPLTNSRGENCGYRRX...,S282T,"[S5B, S282T]",[],Q8JXU8_WT,WT,35099,...,3671,PROTEIN,RTEEAIYQCCDLDPQARVAIRSLTERLYVGGPLTNSRGENCGYRRX...,3f5012b8af293164261075aeb623fec5,RNA-directed RNA polymerase,11103,Hepatitis C virus,TREMBL,2022_02,original_not_valid
105060,426978,Activity of HCV NS5B RNA polymerase S282T asse...,Q8JXU8,RTEEAIYQCCDLDPQARVAIRSLTERLYVGGPLTNSRGENCGYRRX...,S282T,"[S5B, S282T]",[],Q8JXU8_WT,WT,35099,...,3671,PROTEIN,RTEEAIYQCCDLDPQARVAIRSLTERLYVGGPLTNSRGENCGYRRX...,3f5012b8af293164261075aeb623fec5,RNA-directed RNA polymerase,11103,Hepatitis C virus,TREMBL,2022_02,original_not_valid
114623,473343,Binding affinity to NR2B receptor S645A/i641A ...,Q13224,MKPRAECCSPKFWLVLAVLAVSGSRARSQKSPPSIGIAVILVGTSD...,"S645A,I641A","[R2B, S645A]",[S645A],Q13224_S645A,S645A,39129,...,59,PROTEIN,MKPRAECCSPKFWLVLAVLAVSGSRARSQKSPPSIGIAVILVGTSD...,5bd7a79d95518f600df9ac85dd9d542c,"Glutamate receptor ionotropic, NMDA 2B",9606,Homo sapiens,SWISS-PROT,2022_02,original_not_valid
117235,534387,Inhibition of cross-linking of human Pgp TM3T1...,P08183,MDLEGDRNGGAKKKNFFKLNNKSEKDKKEKKPTVSVFSMFRYSNWL...,"T199R,L339C,F728C","[M3T, L339C, F728C]","[L339C, F728C]",P08183_L339C_F728C,L339C;F728C,42551,...,2619,PROTEIN,MDLEGDRNGGAKKKNFFKLNNKSEKDKKEKKPTVSVFSMFRYSNWL...,e8c72a9371a2775313fe0714108c5e40,ATP-dependent translocase ABCB1,9606,Homo sapiens,SWISS-PROT,2022_02,original_not_valid
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
370917,2111448,Inhibition of p110alpha E542K mutant/p85alpha ...,P27986,MSAEGYQYRALYDYKKEREEDIDLHLGDILTVNKGSLVALGFSDGQ...,E542K,[E542K],[],P27986_WT,WT,120456,...,846,PROTEIN,MSAEGYQYRALYDYKKEREEDIDLHLGDILTVNKGSLVALGFSDGQ...,5fcafd869a5a794742e5d699000e29f8,Phosphatidylinositol 3-kinase regulatory subun...,9606,Homo sapiens,SWISS-PROT,2022_02,original_not_valid
372775,2117080,Protac activity at CRBN/EGFR L858R/T790M mutan...,Q96SW2,MAGEGDQQDAAHNMGNHLPLLPAESEEEDEMEVEDQDSKEAKKPNI...,"L858R,T790M","[L858R, T790M]",[],Q96SW2_WT,WT,120695,...,16927,PROTEIN,MAGEGDQQDAAHNMGNHLPLLPAESEEEDEMEVEDQDSKEAKKPNI...,6322efb1bd9f9859167c6800b30ed7ea,Protein cereblon,9606,Homo sapiens,SWISS-PROT,2022_02,original_not_valid
372777,2117102,Protac activity at CRBN/EGFR L858R/T790M mutan...,Q96SW2,MAGEGDQQDAAHNMGNHLPLLPAESEEEDEMEVEDQDSKEAKKPNI...,"L858R,T790M","[L858R, T790M]",[],Q96SW2_WT,WT,120695,...,16927,PROTEIN,MAGEGDQQDAAHNMGNHLPLLPAESEEEDEMEVEDQDSKEAKKPNI...,6322efb1bd9f9859167c6800b30ed7ea,Protein cereblon,9606,Homo sapiens,SWISS-PROT,2022_02,original_not_valid
376126,2135003,Protac activity at CRBN/AR T878A mutant in hum...,Q96SW2,MAGEGDQQDAAHNMGNHLPLLPAESEEEDEMEVEDQDSKEAKKPNI...,T878A,[T878A],[],Q96SW2_WT,WT,121349,...,16927,PROTEIN,MAGEGDQQDAAHNMGNHLPLLPAESEEEDEMEVEDQDSKEAKKPNI...,6322efb1bd9f9859167c6800b30ed7ea,Protein cereblon,9606,Homo sapiens,SWISS-PROT,2022_02,original_not_valid


In [13]:
# Number of rejected assays per rejection category (rows without a flag are not counted)
previous_round_assays_original_rejected.groupby('rejection_flag')['assay_id'].count()

rejection_flag
no_extraction                      175
original_deletion                   77
original_exception_Autocuration    168
original_not_valid                 248
original_shift_exception           323
protein_family                     609
Name: assay_id, dtype: int64

#### Re-annotate assays based on curation steps from previous round annotation

In [15]:
# Re-annotate with both curation outcomes of the previous round applied
new_round_assays = manual_reannotation(
    chembl_version=chembl_version,
    annotation_round=annotation_round,
    correct_false_negatives=True,
    correct_false_positives=True,
)
new_round_assays

Unnamed: 0,assay_id,description,accession,sequence,mutation,aa_change,mutants,target_id,Protein_Type
0,83907,In vivo inhibitory activity against human Hepa...,Q9Y251,MLLRSKPALPPPLMLLLLGPLGPLSPGALPRPAQAQDVVDLDFFTQ...,,[],[],Q9Y251_WT,WT
1,154606,Inhibitory activity against Palmitoyl-CoA oxid...,P07872,MNPDLRKERASATFNPELITHILDGSPENTRRRREIENLILNDPDF...,,[],[],P07872_WT,WT
2,51352,Inhibition of cytochrome P450 1A2 of isolated ...,P05177,MALSQSVPFSATELLLASAIFCLVFWVLKGLRPRVPKGLKSPPEPW...,,[],[],P05177_WT,WT
3,51895,Inhibition of cytochrome P450 3A4 of isolated ...,P08684,MALIPDLAMETWLLLAVSLVLLYLYGTHSHGLFKKLGIPGPTPLPF...,,[],[],P08684_WT,WT
4,51528,Inhibition of cytochrome P450 2C9 of isolated ...,P11712,MDSLVVLVLCLSCLLLLSLWRQSSGRGKLPPGPTPLPVIGNILQIG...,,[],[],P11712_WT,WT
...,...,...,...,...,...,...,...,...,...
376228,2136225,KinomeScan assay: inhibition of LATS2,Q9NRM7,MRPKTFPATTYSGNSRQRLQEIREGLKQPSKSSVQGLPAGPNSDTS...,,[],[],Q9NRM7_WT,WT
376229,2136385,KinomeScan assay: inhibition of ROCK1,Q13464,MSTGDSFETRFEKMDNLLRDPKSEVNSDCLLDGLDALVYDLDFPAL...,,[],[],Q13464_WT,WT
376230,2136386,KinomeScan assay: inhibition of ROCK2,O75116,MSRPPPTGKMPGAPETAPGDGAGASRQRKLEALIRDPRSPINVESL...,,[],[],O75116_WT,WT
376231,2136430,KinomeScan assay: inhibition of TGFBR2,P37173,MGRGLLRGLWPLHIVLWTRIASTIPPHVQKSVNNDMIVTDNNGAVK...,,[],[],P37173_WT,WT


In [34]:
# Check how many assays have mutant annotations (including undefined mutations).
# Wild types are matched on the '_WT' suffix: a bare 'WT' substring can also occur inside a UniProt
# accession (the accession format allows e.g. Q9WTL8), which would wrongly drop mutants of those proteins
new_round_assays[~new_round_assays['target_id'].str.endswith('_WT')]

Unnamed: 0,assay_id,description,accession,sequence,mutation,aa_change,mutants,target_id,Protein_Type
2008,164502,Inhibitory activity against HIV-1 reverse tran...,Q72547,PISPIETVPVKLKPGMDGPKVKQWPLTEEKIKALVEICTEMEKEGK...,M184V,['M184V'],['M184V'],Q72547_M184V,M184V
2009,164500,Inhibitory activity against HIV-1 reverse tran...,Q72547,PISPIETVPVKLKPGMDGPKVKQWPLTEEKIKALVEICTEMEKEGK...,M184I,['M184I'],['M184I'],Q72547_M184I,M184I
2163,164503,Substrate Activity against HIV-1 reverse trans...,Q72547,PISPIETVPVKLKPGMDGPKVKQWPLTEEKIKALVEICTEMEKEGK...,M184V,['M184V'],['M184V'],Q72547_M184V,M184V
2164,164501,Substrate Activity against HIV-1 reverse trans...,Q72547,PISPIETVPVKLKPGMDGPKVKQWPLTEEKIKALVEICTEMEKEGK...,M184I,['M184I'],['M184I'],Q72547_M184I,M184I
3102,205569,Binding affinity against Gln165Ala mutant type...,P25103,MDNVLPVDSDLSPNISTNTSEPNQFVQPAWQIVLWAAAYTVIVVTS...,Q165A,['Q165A'],['Q165A'],P25103_Q165A,Q165A
...,...,...,...,...,...,...,...,...,...
375937,2135067,Protac activity at CRBN/AR T878A mutant in hum...,Q96SW2,MAGEGDQQDAAHNMGNHLPLLPAESEEEDEMEVEDQDSKEAKKPNI...,T878A,['T878A'],[],Q96SW2_MUTANT,WT
375967,2135214,Inhibition of EGFR L858R mutant (unknown origin),P00533,MRPSGTAGAALLALLAALCPASRALEEKKVCQGTSNKLTQLGTFED...,L858R,['L858R'],['L858R'],P00533_L858R,L858R
376193,2135936,Inhibition of GDP-loaded His-tagged KRAS G12C ...,P01116,MTEYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVI...,G12C,['G12C'],['G12C'],P01116_G12C,G12C
376194,2135937,Inhibition of KRAS G12C mutant in human NCI-H3...,P01116,MTEYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVI...,G12C,['G12C'],['G12C'],P01116_G12C,G12C


In [28]:
# Assays that gained a concrete mutant annotation ChEMBL does not have (after reannotation):
# target is neither wild type nor the generic MUTANT label, and ChEMBL lists no mutation
new_round_assays.loc[
    lambda assays: ~assays['target_id'].str.contains('_WT')
                   & ~assays['target_id'].str.contains('_MUTANT')
                   & assays['mutation'].isnull()
]

Unnamed: 0,assay_id,description,accession,sequence,mutation,aa_change,mutants,target_id,Protein_Type
3103,226874,Ratio of IC50 value against hNK1 receptor to t...,P25103,MDNVLPVDSDLSPNISTNTSEPNQFVQPAWQIVLWAAAYTVIVVTS...,,['Q165A'],['Q165A'],P25103_Q165A,Q165A
5557,153379,Transcriptional activation activity on human I...,Q07869,MVDTESPLCPLSPLEAGDLESPLSEEFLQEMGNIQEISQSIGEDSS...,,['I272F'],['I272F'],Q07869_I272F,I272F
5558,153380,Transcriptional activation activity on human T...,Q07869,MVDTESPLCPLSPLEAGDLESPLSEEFLQEMGNIQEISQSIGEDSS...,,['T279M'],['T279M'],Q07869_T279M,T279M
5559,153557,Ratio of transcriptional activation of I272F m...,Q07869,MVDTESPLCPLSPLEAGDLESPLSEEFLQEMGNIQEISQSIGEDSS...,,['I272F'],['I272F'],Q07869_I272F,I272F
5560,153560,Ratio of transcriptional activation of T279M m...,Q07869,MVDTESPLCPLSPLEAGDLESPLSEEFLQEMGNIQEISQSIGEDSS...,,['T279M'],['T279M'],Q07869_T279M,T279M
...,...,...,...,...,...,...,...,...,...
367794,2098264,Inhibition of human GCN2 S808G mutant kinase d...,Q9P2K8,MAGGRGAPGRGRDEPPESYPQRQDHELQALEAIYGADFQDLRPDAC...,,['S808G'],['S808G'],Q9P2K8_S808G,S808G
371084,2113388,Inhibition of human non-phosphorylated ABL1 F3...,P00519,MLEICLKLVGCKSKKGLSSSSSCYLEEALQRPVASDFEPQGLSEAA...,,['F317L'],['F317L'],P00519_F317L,F317L
374059,2125019,Activation of human PKM2-C424A expressed in Es...,P14618,MSKPHSEAGTAFIQTQQLHAAMADTFLEHMCRLDIDSPPITARNTG...,,['C424A'],['C424A'],P14618_C424A,C424A
374624,2127601,Binding affinity to human partial length MCL1 ...,Q07820,MFGLKRNAVIGLNLYCGGAGLGAGSGGATRPGGRLLATEKEASARR...,,"['D172N', 'L2S']",['D172N'],Q07820_D172N,D172N


In [29]:
# Rescued assays after reannotation: 'UNDEFINED MUTATION' in ChEMBL, concrete mutant target here
new_round_assays.loc[
    lambda assays: ~assays['target_id'].str.contains('_WT')
                   & ~assays['target_id'].str.contains('_MUTANT')
                   & assays['mutation'].eq('UNDEFINED MUTATION')
]

Unnamed: 0,assay_id,description,accession,sequence,mutation,aa_change,mutants,target_id,Protein_Type
225249,1433633,Potentiation of human CFTR F508del/G551D mutan...,P13569,MQRSPLEKASVVSKLFFSWTRPILRKGYRQRLELSDIYQIPSVDSA...,UNDEFINED MUTATION,['G551D'],['G551D'],P13569_G551D,G551D
228081,1435798,Inhibition of wild type HIV1 reverse transcrip...,Q72547,PISPIETVPVKLKPGMDGPKVKQWPLTEEKIKALVEICTEMEKEGK...,UNDEFINED MUTATION,"['Y181I', 'Y181C']","['Y181C', 'Y181I']",Q72547_Y181C_Y181I,Y181C;Y181I
228390,1440950,Inhibition of EGFR T790M/del746 to 750 mutant ...,P00533,MRPSGTAGAALLALLAALCPASRALEEKKVCQGTSNKLTQLGTFED...,UNDEFINED MUTATION,['T790M'],['T790M'],P00533_T790M,T790M
228616,1441757,Corrector activity at human bronchial epitheli...,P13569,MQRSPLEKASVVSKLFFSWTRPILRKGYRQRLELSDIYQIPSVDSA...,UNDEFINED MUTATION,['G551D'],['G551D'],P13569_G551D,G551D
231419,1472456,Inhibition of FLT3 ITD D835V mutant (unknown o...,P36888,MPALARDGGQLPLLVVFSAMIFGTITNQDLPVIKCVLINHKNNDSS...,UNDEFINED MUTATION,['D835V'],['D835V'],P36888_D835V,D835V
...,...,...,...,...,...,...,...,...,...
372157,2114787,Inhibition of FLT3 ITD/F691L double mutant (un...,P36888,MPALARDGGQLPLLVVFSAMIFGTITNQDLPVIKCVLINHKNNDSS...,UNDEFINED MUTATION,['F691L'],['F691L'],P36888_F691L,F691L
372171,2114825,Inhibition of FLT3 ITD/D835Y double mutant (un...,P36888,MPALARDGGQLPLLVVFSAMIFGTITNQDLPVIKCVLINHKNNDSS...,UNDEFINED MUTATION,['D835Y'],['D835Y'],P36888_D835Y,D835Y
372172,2114826,Inhibition of FLT3 ITD/D835V double mutant (un...,P36888,MPALARDGGQLPLLVVFSAMIFGTITNQDLPVIKCVLINHKNNDSS...,UNDEFINED MUTATION,['D835V'],['D835V'],P36888_D835V,D835V
373444,2121848,Inhibition of EGFR del19/T790M/C797S triple mu...,P00533,MRPSGTAGAALLALLAALCPASRALEEKKVCQGTSNKLTQLGTFED...,UNDEFINED MUTATION,"['T790M', 'C797S']","['T790M', 'C797S']",P00533_T790M_C797S,T790M;C797S


In [26]:
# Assays whose target carries the generic MUTANT label instead of concrete mutations
new_round_assays.loc[lambda assays: assays['target_id'].str.contains('MUTANT')]

Unnamed: 0,assay_id,description,accession,sequence,mutation,aa_change,mutants,target_id,Protein_Type
94476,322454,Inhibitory concentration against IL-3 independ...,P10721,MRGARGAWDFLCVLLLLLRVQTGSSQPSVSPGEPSPPSIHPGKSDL...,D816V,['D816V'],[],P10721_MUTANT,WT
105052,426971,Ratio of Ki to Km for HCV NS5B RNA polymerase ...,Q8JXU8,RTEEAIYQCCDLDPQARVAIRSLTERLYVGGPLTNSRGENCGYRRX...,S282T,"['S5B', 'S282T']",[],Q8JXU8_MUTANT,WT
105060,426978,Activity of HCV NS5B RNA polymerase S282T asse...,Q8JXU8,RTEEAIYQCCDLDPQARVAIRSLTERLYVGGPLTNSRGENCGYRRX...,S282T,"['S5B', 'S282T']",[],Q8JXU8_MUTANT,WT
114613,473331,Binding affinity to NR2B receptor M3c T647A mu...,Q13224,MKPRAECCSPKFWLVLAVLAVSGSRARSQKSPPSIGIAVILVGTSD...,T647T,"['R2B', 'T647A']",['T647A'],Q13224_MUTANT,T647A
114623,473343,Binding affinity to NR2B receptor S645A/i641A ...,Q13224,MKPRAECCSPKFWLVLAVLAVSGSRARSQKSPPSIGIAVILVGTSD...,"S645A,I641A","['R2B', 'S645A']",['S645A'],Q13224_MUTANT,S645A
...,...,...,...,...,...,...,...,...,...
374554,2127280,Inhibition of 6His-FLAG-Tev-BRDT (1 to 397 res...,Q58F21,MSLPSRQTAIIVNPPPPEYINTKKNGRLTNQLQYLQKVVLKDLWKH...,UNDEFINED MUTATION,"['Y309A', 'Y66A']","['Y66A', 'Y309A']",Q58F21_MUTANT,Y66A;Y309A
374556,2127282,Inhibition of 6His-Thr-BRD3 (1 to 435 residues...,Q15059,MSTATTVAPAGIPATPGPVNPPPPEVSNPSKPGRKTNQLQYMQNVV...,UNDEFINED MUTATION,"['Y348A', 'Y73A']","['Y73A', 'Y348A']",Q15059_MUTANT,Y73A;Y348A
374557,2127283,Inhibition of 6His-Thr-BRD2 (1 to 473 residues...,P25440,MLQNVTPHNKLPGEGNAGLLGLGPEAAAPGKRIRKPSLLYEGFESP...,UNDEFINED MUTATION,"['Y386A', 'Y113A']","['Y113A', 'Y386A']",P25440_MUTANT,Y113A;Y386A
375931,2135003,Protac activity at CRBN/AR T878A mutant in hum...,Q96SW2,MAGEGDQQDAAHNMGNHLPLLPAESEEEDEMEVEDQDSKEAKKPNI...,T878A,['T878A'],[],Q96SW2_MUTANT,WT


In [32]:
# Goal of this cell: list the ChEMBL-annotated mutants that are still fully rejected.
# Start from the assays ChEMBL annotated with a defined mutation (neither missing nor 'UNDEFINED MUTATION')
new_round_assays_original = new_round_assays.loc[
    lambda assays: assays['mutation'].ne('UNDEFINED MUTATION')
].dropna(subset=['mutation'])

def check_original_valid(row):
    """Tell whether the ChEMBL-annotated mutations of an assay are all kept in its target_id.

    :param row: mapping-like record (e.g. a DataFrame row) with a comma-separated
                'mutation' string and a 'target_id' string whose '_'-separated parts
                after the first one are the validated mutations
    :return: True if the target is labelled as 'MUTANT' only, or if every mutation listed in
             'mutation' is among the parts of 'target_id'; False otherwise
    """
    chembl_mutations = row['mutation'].split(',')
    # The first part of target_id is the accession; the remaining parts are the mutation labels
    _, *validated = row['target_id'].split('_')
    return validated == ['MUTANT'] or all(mut in validated for mut in chembl_mutations)

# Show the assays for which check_original_valid fails, i.e. at least one ChEMBL-annotated
# mutation is missing from target_id
new_round_assays_original.loc[lambda assays: ~assays.apply(check_original_valid, axis=1)]

Unnamed: 0,assay_id,description,accession,sequence,mutation,aa_change,mutants,target_id,Protein_Type
143911,624186,Inhibition of EGFR Leu858Arg and Thr790Met mut...,P04626,MELAALCRWGLLLALLPPGAASTQVCTGTDMKLRLPASPETHLDML...,"L858R,T790M","['L858R', 'T790M']",[],P04626_WT,WT
143913,624186,Inhibition of EGFR Leu858Arg and Thr790Met mut...,Q15303,MKPATGLWVWVSLLVAAGTVQPSDSQSVCAGTENKLSSLSDLEQQY...,"L858R,T790M","['L858R', 'T790M']",[],Q15303_WT,WT
143914,624186,Inhibition of EGFR Leu858Arg and Thr790Met mut...,P21860,MRANDALQVLGLLFSLARGSEVGNSQAVCPGTLNGLSVTGDAENQY...,"L858R,T790M","['L858R', 'T790M']",['L858R'],P21860_L858R,L858R
143915,624185,Inhibition of EGFR Leu858Arg mutant by HTRF assay,P04626,MELAALCRWGLLLALLPPGAASTQVCTGTDMKLRLPASPETHLDML...,L858R,['L858R'],[],P04626_WT,WT
143917,624185,Inhibition of EGFR Leu858Arg mutant by HTRF assay,Q15303,MKPATGLWVWVSLLVAAGTVQPSDSQSVCAGTENKLSSLSDLEQQY...,L858R,['L858R'],[],Q15303_WT,WT
...,...,...,...,...,...,...,...,...,...
364470,2086921,Synergistic potentiator activity at CFTR F508d...,P13569,MQRSPLEKASVVSKLFFSWTRPILRKGYRQRLELSDIYQIPSVDSA...,F508del,[],[],P13569_WT,WT
364471,2086922,Synergistic potentiator activity at CFTR F508d...,P13569,MQRSPLEKASVVSKLFFSWTRPILRKGYRQRLELSDIYQIPSVDSA...,F508del,[],[],P13569_WT,WT
365662,2094601,Potentiation of CFTR F508del mutant (unknown o...,P13569,MQRSPLEKASVVSKLFFSWTRPILRKGYRQRLELSDIYQIPSVDSA...,F508del,[],[],P13569_WT,WT
365666,2094620,Potentiation of CFTR F508del mutant in human H...,P13569,MQRSPLEKASVVSKLFFSWTRPILRKGYRQRLELSDIYQIPSVDSA...,F508del,[],[],P13569_WT,WT


#### Re-annotate bioactivity data for new round based on manual curation

Also check some simple statistics. Full statistics can be computed in the `annotation_analysis` notebook with
`annotation_round = 2`.

In [35]:
# Annotate the ChEMBL bioactivity data for the new round and display the result
chembl_dataset_new_round = chembl_annotation(
    chembl_version=chembl_version,
    annotation_round=annotation_round,
)
chembl_dataset_new_round

  chembl_bioactivity_dataset = map_activity_mutations(chembl_data, chembl_assays_annotated)


Unnamed: 0,target_id,chembl_id,assay_id,activity_id,accession,pchembl_value,activity_comment,canonical_smiles,mutation,year,sequence,pchembl_value_Mean,Activity_class
0,A0A024AXB9_WT,CHEMBL1230673,[1986298],[20667385],A0A024AXB9,[6.37],[nan],CN[C@@H](C)C(=O)N[C@H](C(=O)N1CC[C@H](C)[C@H]1...,,2020.0,AETDEDHAHWLEARMLLDNIYLQDGLIASLYRPEADKVAAIEGEFK...,6.370000,
1,A0A024AXB9_WT,CHEMBL1332616,"[1986298, 1986301]","[20667410, 20667411]",A0A024AXB9,"[5.77, 6.22]","[nan, nan]",O=C1OC2(c3ccccc31)c1cc(I)c(O)c(I)c1Oc1c2cc(I)c...,,2020.0,AETDEDHAHWLEARMLLDNIYLQDGLIASLYRPEADKVAAIEGEFK...,5.995000,
2,A0A024AXB9_WT,CHEMBL151,[1855280],[19053938],A0A024AXB9,[4.28],[nan],O=c1cc(-c2ccc(O)c(O)c2)oc2cc(O)cc(O)c12,,2020.0,AETDEDHAHWLEARMLLDNIYLQDGLIASLYRPEADKVAAIEGEFK...,4.280000,
3,A0A024AXB9_WT,CHEMBL1535,[1986298],[20667415],A0A024AXB9,[4.04],[nan],CCN(CCO)CCCC(C)Nc1ccnc2cc(Cl)ccc12,,2020.0,AETDEDHAHWLEARMLLDNIYLQDGLIASLYRPEADKVAAIEGEFK...,4.040000,
4,A0A024AXB9_WT,CHEMBL164,"[1855280, 1855317, 1855317]","[19053935, 19053936, 19053937]",A0A024AXB9,"[4.66, 5.89, 6.1]","[nan, nan, nan]",O=c1c(O)c(-c2cc(O)c(O)c(O)c2)oc2cc(O)cc(O)c12,,2020.0,AETDEDHAHWLEARMLLDNIYLQDGLIASLYRPEADKVAAIEGEFK...,5.550000,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1870743,nan_WT,CHEMBL529,"[582849, 582849, 582849]","[2702686, 2702686, 2702686]",,"[6.52, 6.52, 6.52]","[nan, nan, nan]",CC[C@H]1OC(=O)[C@H](C)[C@@H](O[C@H]2C[C@@](C)(...,,2009.0,UGCCUGGCGGCCGUAGCGCGGUGGUCCCACCUGACCCCAUGCCGAA...,6.520000,
1870744,nan_WT,CHEMBL532,"[576183, 576183, 576183, 576182, 576182, 57618...","[2663798, 2663798, 2663798, 2663799, 2663799, ...",,"[7.54, 7.54, 7.54, 7.96, 7.96, 7.96, 5.72, 5.7...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...",CC[C@H]1OC(=O)[C@H](C)[C@@H](O[C@H]2C[C@@](C)(...,,2007.0,UGCCUGGCGGCCGUAGCGCGGUGGUCCCACCUGACCCCAUGCCGAA...,6.418889,
1870745,nan_WT,CHEMBL553222,"[582849, 582849, 582849]","[2702683, 2702683, 2702683]",,"[7.0, 7.0, 7.0]","[nan, nan, nan]",CC[C@H]1OC(=O)[C@H](C)[C@@H](O[C@H]2C[C@@](C)(...,,2009.0,UGCCUGGCGGCCGUAGCGCGGUGGUCCCACCUGACCCCAUGCCGAA...,7.000000,
1870746,nan_WT,CHEMBL557931,"[582849, 582849, 582849]","[2702682, 2702682, 2702682]",,"[5.89, 5.89, 5.89]","[nan, nan, nan]",CC[C@H]1OC(=O)[C@H](C)[C@@H](O[C@H]2C[C@@](C)(...,,2009.0,UGCCUGGCGGCCGUAGCGCGGUGGUCCCACCUGACCCCAUGCCGAA...,5.890000,


In [37]:
# Check how many bioactivity datapoints in ChEMBL are tested in mutants.
# Wild-type rows are recognised by the '_WT' suffix of target_id: a plain substring search
# for 'WT' would also match accessions that happen to contain those letters and would
# wrongly exclude the mutants of such targets.
chembl_dataset_new_round[~chembl_dataset_new_round['target_id'].str.endswith('_WT')]

Unnamed: 0,target_id,chembl_id,assay_id,activity_id,accession,pchembl_value,activity_comment,canonical_smiles,mutation,year,sequence,pchembl_value_Mean,Activity_class
257,A0A045ISB3_F161S,CHEMBL4637373,[1992590],[20694452],A0A045ISB3,[5.13],[nan],O=C(O)C(=O)Nc1ccc(C#Cc2ccc(C(F)(F)F)cc2Br)cc1,F161S,2020.0,MAVRELPGAWNFRDVADTATALRPGRLFRSSELSRLDDAGRATLRR...,5.130,
258,A0A045ISB3_F161S,CHEMBL4637459,[1992590],[20694450],A0A045ISB3,[5.07],[nan],O=C(O)C(=O)Nc1ccc(C#Cc2ccc(C(F)(F)F)cc2)cc1,F161S,2020.0,MAVRELPGAWNFRDVADTATALRPGRLFRSSELSRLDDAGRATLRR...,5.070,
259,A0A045ISB3_F161S,CHEMBL4638011,[1992590],[20694453],A0A045ISB3,[5.04],[nan],O=C(O)C(=O)Nc1ccc(C#Cc2ccc(C(F)(F)F)cc2F)cc1,F161S,2020.0,MAVRELPGAWNFRDVADTATALRPGRLFRSSELSRLDDAGRATLRR...,5.040,
260,A0A045ISB3_F161S,CHEMBL4647367,[1992590],[20694451],A0A045ISB3,[5.27],[nan],O=C(O)C(=O)Nc1ccc(C#Cc2ccc(C(F)(F)F)cc2Cl)cc1,F161S,2020.0,MAVRELPGAWNFRDVADTATALRPGRLFRSSELSRLDDAGRATLRR...,5.270,
261,A0A045ISB3_I203A,CHEMBL4637373,[1992587],[20694440],A0A045ISB3,[7.68],[nan],O=C(O)C(=O)Nc1ccc(C#Cc2ccc(C(F)(F)F)cc2Br)cc1,I203A,2020.0,MAVRELPGAWNFRDVADTATALRPGRLFRSSELSRLDDAGRATLRR...,7.680,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1868112,Q9YQ12_V82F,CHEMBL3797292,"[1576372, 1576372, 1822317, 1822317]","[16572620, 16572621, 18916325, 18916333]",Q9YQ12,"[7.62, 7.89, 7.62, 7.89]","[nan, nan, nan, nan]",COc1ccc(S(=O)(=O)N(CC(C)C)[C@@H](O)[C@H](Cc2cc...,V82F,2016.0,PQITLWQRPFVTIKIEGQLKEALLDTGADDTVLEEMNLPGRWKPKM...,7.755,
1868113,Q9YQ12_V82F,CHEMBL3799941,"[1576372, 1576372, 1822317, 1822317]","[16572618, 16572619, 18916332, 18916379]",Q9YQ12,"[7.72, 8.0, 8.0, 7.72]","[nan, nan, nan, nan]",COc1ccc(S(=O)(=O)N(CC(C)C)[C@@H](O)[C@H](Cc2cc...,V82F,2016.0,PQITLWQRPFVTIKIEGQLKEALLDTGADDTVLEEMNLPGRWKPKM...,7.860,
1868114,Q9YQ12_V82F,CHEMBL4452050,[1854379],[19051909],Q9YQ12,[8.77],[nan],COc1ccc(S(=O)(=O)N(CC(C)C)C[C@H](O)[C@@H](Cc2c...,V82F,2019.0,PQITLWQRPFVTIKIEGQLKEALLDTGADDTVLEEMNLPGRWKPKM...,8.770,
1870322,T1QYY3_G140S_Q148H,CHEMBL4779601,[2070418],[22876490],T1QYY3,[7.77],[nan],CN(C)C(=O)C(=O)NC12CCC(CC1)Cn1c2nc(C(=O)NCc2cc...,"G140S,Q148H",2020.0,FLDGIDKAQEDHEKYHSNWRAMASDFNMPPIXAKEIVASCDKCQQK...,7.770,


Unnamed: 0,target_id,chembl_id,assay_id,activity_id,accession,pchembl_value,activity_comment,canonical_smiles,mutation,year,sequence,pchembl_value_Mean,Activity_class
257,A0A045ISB3_F161S,CHEMBL4637373,[1992590],[20694452],A0A045ISB3,[5.13],[nan],O=C(O)C(=O)Nc1ccc(C#Cc2ccc(C(F)(F)F)cc2Br)cc1,F161S,2020.0,MAVRELPGAWNFRDVADTATALRPGRLFRSSELSRLDDAGRATLRR...,5.130,
258,A0A045ISB3_F161S,CHEMBL4637459,[1992590],[20694450],A0A045ISB3,[5.07],[nan],O=C(O)C(=O)Nc1ccc(C#Cc2ccc(C(F)(F)F)cc2)cc1,F161S,2020.0,MAVRELPGAWNFRDVADTATALRPGRLFRSSELSRLDDAGRATLRR...,5.070,
259,A0A045ISB3_F161S,CHEMBL4638011,[1992590],[20694453],A0A045ISB3,[5.04],[nan],O=C(O)C(=O)Nc1ccc(C#Cc2ccc(C(F)(F)F)cc2F)cc1,F161S,2020.0,MAVRELPGAWNFRDVADTATALRPGRLFRSSELSRLDDAGRATLRR...,5.040,
260,A0A045ISB3_F161S,CHEMBL4647367,[1992590],[20694451],A0A045ISB3,[5.27],[nan],O=C(O)C(=O)Nc1ccc(C#Cc2ccc(C(F)(F)F)cc2Cl)cc1,F161S,2020.0,MAVRELPGAWNFRDVADTATALRPGRLFRSSELSRLDDAGRATLRR...,5.270,
261,A0A045ISB3_I203A,CHEMBL4637373,[1992587],[20694440],A0A045ISB3,[7.68],[nan],O=C(O)C(=O)Nc1ccc(C#Cc2ccc(C(F)(F)F)cc2Br)cc1,I203A,2020.0,MAVRELPGAWNFRDVADTATALRPGRLFRSSELSRLDDAGRATLRR...,7.680,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1868112,Q9YQ12_V82F,CHEMBL3797292,"[1576372, 1576372, 1822317, 1822317]","[16572620, 16572621, 18916325, 18916333]",Q9YQ12,"[7.62, 7.89, 7.62, 7.89]","[nan, nan, nan, nan]",COc1ccc(S(=O)(=O)N(CC(C)C)[C@@H](O)[C@H](Cc2cc...,V82F,2016.0,PQITLWQRPFVTIKIEGQLKEALLDTGADDTVLEEMNLPGRWKPKM...,7.755,
1868113,Q9YQ12_V82F,CHEMBL3799941,"[1576372, 1576372, 1822317, 1822317]","[16572618, 16572619, 18916332, 18916379]",Q9YQ12,"[7.72, 8.0, 8.0, 7.72]","[nan, nan, nan, nan]",COc1ccc(S(=O)(=O)N(CC(C)C)[C@@H](O)[C@H](Cc2cc...,V82F,2016.0,PQITLWQRPFVTIKIEGQLKEALLDTGADDTVLEEMNLPGRWKPKM...,7.860,
1868114,Q9YQ12_V82F,CHEMBL4452050,[1854379],[19051909],Q9YQ12,[8.77],[nan],COc1ccc(S(=O)(=O)N(CC(C)C)C[C@H](O)[C@@H](Cc2c...,V82F,2019.0,PQITLWQRPFVTIKIEGQLKEALLDTGADDTVLEEMNLPGRWKPKM...,8.770,
1870322,T1QYY3_G140S_Q148H,CHEMBL4779601,[2070418],[22876490],T1QYY3,[7.77],[nan],CN(C)C(=O)C(=O)NC12CCC(CC1)Cn1c2nc(C(=O)NCc2cc...,"G140S,Q148H",2020.0,FLDGIDKAQEDHEKYHSNWRAMASDFNMPPIXAKEIVASCDKCQQK...,7.770,


In [38]:
# Of those, check how many are tested on undefined mutants ('MUTANT' label in target_id)
chembl_dataset_new_round.loc[lambda data: data['target_id'].str.contains('MUTANT')]

Unnamed: 0,target_id,chembl_id,assay_id,activity_id,accession,pchembl_value,activity_comment,canonical_smiles,mutation,year,sequence,pchembl_value_Mean,Activity_class
4394,A5Z252_MUTANT,CHEMBL222813,[1804226],[18813644],A5Z252,[8.12],[nan],CC(=O)N[C@@H]1[C@@H](NC(=N)N)C=C(C(=O)O)O[C@H]...,H274Y,2018.0,MNPNQKIITIGSICMVIGIVSLMLQIGNMISIWVSHSIQTGNQRQA...,8.12,
4395,A5Z252_MUTANT,CHEMBL4276773,[1804226],[18813639],A5Z252,[7.44],[nan],CCC(CC)O[C@@H]1C=C(C(=O)O)C[C@H](NCc2ccc(Sc3cc...,H274Y,2018.0,MNPNQKIITIGSICMVIGIVSLMLQIGNMISIWVSHSIQTGNQRQA...,7.44,
4396,A5Z252_MUTANT,CHEMBL4278505,[1804226],[18813633],A5Z252,[6.04],[nan],CCC(CC)O[C@@H]1C=C(C(=O)O)C[C@H](NCc2ccc(Cc3cc...,H274Y,2018.0,MNPNQKIITIGSICMVIGIVSLMLQIGNMISIWVSHSIQTGNQRQA...,6.04,
4397,A5Z252_MUTANT,CHEMBL4281970,[1804226],[18813635],A5Z252,[4.52],[nan],CCC(CC)O[C@@H]1C=C(C(=O)O)C[C@H](NCc2cccc(OC)c...,H274Y,2018.0,MNPNQKIITIGSICMVIGIVSLMLQIGNMISIWVSHSIQTGNQRQA...,4.52,
4398,A5Z252_MUTANT,CHEMBL4282553,[1804226],[18813637],A5Z252,[6.44],[nan],CCC(CC)O[C@@H]1C=C(C(=O)O)C[C@H](NCc2ccc(Oc3cc...,H274Y,2018.0,MNPNQKIITIGSICMVIGIVSLMLQIGNMISIWVSHSIQTGNQRQA...,6.44,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1826325,Q9UM73_MUTANT,CHEMBL3669148,[1640332],[17701115],Q9UM73,[7.89],[nan],COc1ncc(F)cc1-c1c(C(N)=O)sc2cnc(Nc3cc(C4CCOCC4...,L1196M,2015.0,MGAIGLLWLLPLLLSTAAVGSGMGTGQRAGSPAAGPPLQPREPLSY...,7.89,
1826326,Q9UM73_MUTANT,CHEMBL3669149,[1640332],[17701116],Q9UM73,[7.37],[nan],COc1ncccc1-c1c(C(N)=O)sc2cnc(Nc3cccc(N4CCN(C)C...,L1196M,2015.0,MGAIGLLWLLPLLLSTAAVGSGMGTGQRAGSPAAGPPLQPREPLSY...,7.37,
1826327,Q9UM73_MUTANT,CHEMBL3669150,[1640332],[17701117],Q9UM73,[6.37],[nan],COc1ccccc1-c1c(C(N)=O)sc2cnc(Nc3cc4c(cc3OC(C)C...,L1196M,2015.0,MGAIGLLWLLPLLLSTAAVGSGMGTGQRAGSPAAGPPLQPREPLSY...,6.37,
1826328,Q9UM73_MUTANT,CHEMBL3669151,[1640332],[17701118],Q9UM73,[7.54],[nan],COc1ccc(F)cc1-c1c(CO)sc2cnc(Nc3ccc(N4CCN(C)CC4...,L1196M,2015.0,MGAIGLLWLLPLLLSTAAVGSGMGTGQRAGSPAAGPPLQPREPLSY...,7.54,


In [39]:
# Combine the newly annotated ChEMBL bioactivity data with the non-ChEMBL part of Papyrus
# (restricted to targets that have at least one annotated variant)
chembl_papyrus_bioactivity_dataset_new_round = combine_chembl_papyrus_mutants(
    chembl_version,
    papyrus_version,
    papyrus_flavor,
    1_000_000,  # presumably the chunk size used to read Papyrus -- TODO confirm against preprocessing
    annotation_round,
)
chembl_papyrus_bioactivity_dataset_new_round

  chembl_annotated = chembl_annotation(chembl_version, annotation_round)


Unnamed: 0,CID,connectivity,target_id,AID,accession,pchembl_value_Mean,SMILES,sequence,source,Activity_class,Year
0,CHEMBL4637373,LUVQLVJTOZVLDB,A0A045ISB3_F161S,[1992590],A0A045ISB3,5.130,O=C(O)C(=O)Nc1ccc(C#Cc2ccc(C(F)(F)F)cc2Br)cc1,MAVRELPGAWNFRDVADTATALRPGRLFRSSELSRLDDAGRATLRR...,ChEMBL31,,2020.0
1,CHEMBL4637459,ALIJNJWQOZKBPP,A0A045ISB3_F161S,[1992590],A0A045ISB3,5.070,O=C(O)C(=O)Nc1ccc(C#Cc2ccc(C(F)(F)F)cc2)cc1,MAVRELPGAWNFRDVADTATALRPGRLFRSSELSRLDDAGRATLRR...,ChEMBL31,,2020.0
2,CHEMBL4638011,IPTNWUSXZLVWFT,A0A045ISB3_F161S,[1992590],A0A045ISB3,5.040,O=C(O)C(=O)Nc1ccc(C#Cc2ccc(C(F)(F)F)cc2F)cc1,MAVRELPGAWNFRDVADTATALRPGRLFRSSELSRLDDAGRATLRR...,ChEMBL31,,2020.0
3,CHEMBL4647367,CXXIUJZSLPZZIM,A0A045ISB3_F161S,[1992590],A0A045ISB3,5.270,O=C(O)C(=O)Nc1ccc(C#Cc2ccc(C(F)(F)F)cc2Cl)cc1,MAVRELPGAWNFRDVADTATALRPGRLFRSSELSRLDDAGRATLRR...,ChEMBL31,,2020.0
4,CHEMBL4637373,LUVQLVJTOZVLDB,A0A045ISB3_I203A,[1992587],A0A045ISB3,7.680,O=C(O)C(=O)Nc1ccc(C#Cc2ccc(C(F)(F)F)cc2Br)cc1,MAVRELPGAWNFRDVADTATALRPGRLFRSSELSRLDDAGRATLRR...,ChEMBL31,,2020.0
...,...,...,...,...,...,...,...,...,...,...,...
481316,Christmann2016.compound.1413,VMJFTOSOFDEKTM,P08581_N1100Y,Christmann2016.assay,P08581,8.699,Cn1cc(-c2cnc3ccc4c(cc(CS(=O)(=O)NCc5ncccc5)cc4...,MKAPAVLAPGILVLLFTLVQRSNGECKEALAKSEMNVNMKYQLPNF...,Papyrus05.5_Christmann2016,,2011.0
481317,Christmann2016.compound.1413,VMJFTOSOFDEKTM,P08581_Y1230C,Christmann2016.assay,P08581,9.000,Cn1cc(-c2cnc3ccc4c(cc(CS(=O)(=O)NCc5ncccc5)cc4...,MKAPAVLAPGILVLLFTLVQRSNGECKEALAKSEMNVNMKYQLPNF...,Papyrus05.5_Christmann2016,,2011.0
481318,Christmann2016.compound.1413,VMJFTOSOFDEKTM,P08581_Y1230H,Christmann2016.assay,P08581,9.222,Cn1cc(-c2cnc3ccc4c(cc(CS(=O)(=O)NCc5ncccc5)cc4...,MKAPAVLAPGILVLLFTLVQRSNGECKEALAKSEMNVNMKYQLPNF...,Papyrus05.5_Christmann2016,,2011.0
481319,Christmann2016.compound.1640,WCIGMFCFPXZRMQ,Q5S007_A2016T,Christmann2016.assay,Q5S007,7.091,O=C(Nc1cnccc1)c1cc(-c2ccnc(F)c2)ccc1OCc1ccccc1,MASGSCQGCEEDEETLKKLIVRLNNVQEGKQIETLVQILEDLLVFT...,Papyrus05.5_Christmann2016,,2012.0


In [40]:
# Build the merged ChEMBL + Papyrus dataset for the new annotation round and display it
annotated_data_new_round = merge_chembl_papyrus_mutants(
    chembl_version,
    papyrus_version,
    papyrus_flavor,
    1_000_000,  # presumably the chunk size used to read Papyrus -- TODO confirm against preprocessing
    annotation_round,
)
annotated_data_new_round

  chunksize, annotation_round, predefined_variants)


Unnamed: 0,target_id,connectivity,pchembl_value_Mean,Activity_class_consensus,source,SMILES,CID,accession,sequence,Year,UniProtID,Organism,HGNC_symbol
0,A0A045ISB3_F161S,ALIJNJWQOZKBPP,5.07,"Series([], dtype: object)",ChEMBL31,O=C(O)C(=O)Nc1ccc(C#Cc2ccc(C(F)(F)F)cc2)cc1,CHEMBL4637459,A0A045ISB3,MAVRELPGAWNFRDVADTATALRPGRLFRSSELSRLDDAGRATLRR...,2020.0,A0A045ISB3_MYCTX,Mycobacterium tuberculosis,
1,A0A045ISB3_F161S,CXXIUJZSLPZZIM,5.27,"Series([], dtype: object)",ChEMBL31,O=C(O)C(=O)Nc1ccc(C#Cc2ccc(C(F)(F)F)cc2Cl)cc1,CHEMBL4647367,A0A045ISB3,MAVRELPGAWNFRDVADTATALRPGRLFRSSELSRLDDAGRATLRR...,2020.0,A0A045ISB3_MYCTX,Mycobacterium tuberculosis,
2,A0A045ISB3_F161S,IPTNWUSXZLVWFT,5.04,"Series([], dtype: object)",ChEMBL31,O=C(O)C(=O)Nc1ccc(C#Cc2ccc(C(F)(F)F)cc2F)cc1,CHEMBL4638011,A0A045ISB3,MAVRELPGAWNFRDVADTATALRPGRLFRSSELSRLDDAGRATLRR...,2020.0,A0A045ISB3_MYCTX,Mycobacterium tuberculosis,
3,A0A045ISB3_F161S,LUVQLVJTOZVLDB,5.13,"Series([], dtype: object)",ChEMBL31,O=C(O)C(=O)Nc1ccc(C#Cc2ccc(C(F)(F)F)cc2Br)cc1,CHEMBL4637373,A0A045ISB3,MAVRELPGAWNFRDVADTATALRPGRLFRSSELSRLDDAGRATLRR...,2020.0,A0A045ISB3_MYCTX,Mycobacterium tuberculosis,
4,A0A045ISB3_I203A,ALIJNJWQOZKBPP,7.37,"Series([], dtype: object)",ChEMBL31,O=C(O)C(=O)Nc1ccc(C#Cc2ccc(C(F)(F)F)cc2)cc1,CHEMBL4637459,A0A045ISB3,MAVRELPGAWNFRDVADTATALRPGRLFRSSELSRLDDAGRATLRR...,2020.0,A0A045ISB3_MYCTX,Mycobacterium tuberculosis,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
455834,Q9YQ12_WT,ZYLARFCKPNSSDA,6.17,"Series([], dtype: object)",ChEMBL31,CC(C)[C@H](NC(=O)COc1ccccc1)C(=O)N[C@@H](Cc1cc...,CHEMBL3331342,Q9YQ12,PQITLWQRPFVTIKIEGQLKEALLDTGADDTVLEEMNLPGRWKPKM...,2014.0,Q9YQ12_9HIV1,Human immunodeficiency virus 1,
455835,Q9YQ12_WT,ZYLFWZWRSRMGBQ,10.10,"Series([], dtype: object)",ChEMBL31,Nc1cccc(CN2C(=O)N(Cc3ccc4cn[nH]c4c3)C(Cc3ccccc...,CHEMBL2296983,Q9YQ12,PQITLWQRPFVTIKIEGQLKEALLDTGADDTVLEEMNLPGRWKPKM...,2013.0,Q9YQ12_9HIV1,Human immunodeficiency virus 1,
455836,Q9YQ12_WT,ZZCUJLFFPCGLAL,5.51,"Series([], dtype: object)",ChEMBL31,CC1(C)CC[C@]2(C(=O)NCCCCCC(=O)NCC(=O)O)CC[C@]3...,CHEMBL4530451,Q9YQ12,PQITLWQRPFVTIKIEGQLKEALLDTGADDTVLEEMNLPGRWKPKM...,2019.0,Q9YQ12_9HIV1,Human immunodeficiency virus 1,
455837,Q9YQ12_WT,ZZGMOZUZSAJAML,6.26,"Series([], dtype: object)",ChEMBL31,CC(C)c1nc(CN(C(=O)N[C@H](C(=O)N[C@H](CC[C@H](C...,CHEMBL3115161,Q9YQ12,PQITLWQRPFVTIKIEGQLKEALLDTGADDTVLEEMNLPGRWKPKM...,2014.0,Q9YQ12_9HIV1,Human immunodeficiency virus 1,


In [41]:
# Check how many bioactivity datapoints in ChEMBL + Papyrus are tested in mutants.
# Wild-type rows are recognised by the '_WT' suffix of target_id: a plain substring search
# for 'WT' would also match accessions that happen to contain those letters and would
# wrongly exclude the mutants of such targets.
annotated_data_new_round[~annotated_data_new_round['target_id'].str.endswith('_WT')]

Unnamed: 0,target_id,connectivity,pchembl_value_Mean,Activity_class_consensus,source,SMILES,CID,accession,sequence,Year,UniProtID,Organism,HGNC_symbol
0,A0A045ISB3_F161S,ALIJNJWQOZKBPP,5.070,"Series([], dtype: object)",ChEMBL31,O=C(O)C(=O)Nc1ccc(C#Cc2ccc(C(F)(F)F)cc2)cc1,CHEMBL4637459,A0A045ISB3,MAVRELPGAWNFRDVADTATALRPGRLFRSSELSRLDDAGRATLRR...,2020.0,A0A045ISB3_MYCTX,Mycobacterium tuberculosis,
1,A0A045ISB3_F161S,CXXIUJZSLPZZIM,5.270,"Series([], dtype: object)",ChEMBL31,O=C(O)C(=O)Nc1ccc(C#Cc2ccc(C(F)(F)F)cc2Cl)cc1,CHEMBL4647367,A0A045ISB3,MAVRELPGAWNFRDVADTATALRPGRLFRSSELSRLDDAGRATLRR...,2020.0,A0A045ISB3_MYCTX,Mycobacterium tuberculosis,
2,A0A045ISB3_F161S,IPTNWUSXZLVWFT,5.040,"Series([], dtype: object)",ChEMBL31,O=C(O)C(=O)Nc1ccc(C#Cc2ccc(C(F)(F)F)cc2F)cc1,CHEMBL4638011,A0A045ISB3,MAVRELPGAWNFRDVADTATALRPGRLFRSSELSRLDDAGRATLRR...,2020.0,A0A045ISB3_MYCTX,Mycobacterium tuberculosis,
3,A0A045ISB3_F161S,LUVQLVJTOZVLDB,5.130,"Series([], dtype: object)",ChEMBL31,O=C(O)C(=O)Nc1ccc(C#Cc2ccc(C(F)(F)F)cc2Br)cc1,CHEMBL4637373,A0A045ISB3,MAVRELPGAWNFRDVADTATALRPGRLFRSSELSRLDDAGRATLRR...,2020.0,A0A045ISB3_MYCTX,Mycobacterium tuberculosis,
4,A0A045ISB3_I203A,ALIJNJWQOZKBPP,7.370,"Series([], dtype: object)",ChEMBL31,O=C(O)C(=O)Nc1ccc(C#Cc2ccc(C(F)(F)F)cc2)cc1,CHEMBL4637459,A0A045ISB3,MAVRELPGAWNFRDVADTATALRPGRLFRSSELSRLDDAGRATLRR...,2020.0,A0A045ISB3_MYCTX,Mycobacterium tuberculosis,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
454742,Q9YQ12_V82A,SBBYGVNSLWBXFB,7.455,"Series([], dtype: object)",ChEMBL31,COc1ccc(S(=O)(=O)N(CC(C)C)[C@@H](O)[C@H](Cc2cc...,CHEMBL3799941,Q9YQ12,PQITLWQRPFVTIKIEGQLKEALLDTGADDTVLEEMNLPGRWKPKM...,2016.0,Q9YQ12_9HIV1,Human immunodeficiency virus 1,
454743,Q9YQ12_V82A,XHKGWTXTLQTTQA,7.400,"Series([], dtype: object)",ChEMBL31,COc1ccc(S(=O)(=O)N(CC(C)C)[C@@H](O)[C@H](Cc2cc...,CHEMBL3797292,Q9YQ12,PQITLWQRPFVTIKIEGQLKEALLDTGADDTVLEEMNLPGRWKPKM...,2016.0,Q9YQ12_9HIV1,Human immunodeficiency virus 1,
454744,Q9YQ12_V82F,MEWAZRJLRMEJDV,8.770,"Series([], dtype: object)",ChEMBL31,COc1ccc(S(=O)(=O)N(CC(C)C)C[C@H](O)[C@@H](Cc2c...,CHEMBL4452050,Q9YQ12,PQITLWQRPFVTIKIEGQLKEALLDTGADDTVLEEMNLPGRWKPKM...,2019.0,Q9YQ12_9HIV1,Human immunodeficiency virus 1,
454745,Q9YQ12_V82F,SBBYGVNSLWBXFB,7.860,"Series([], dtype: object)",ChEMBL31,COc1ccc(S(=O)(=O)N(CC(C)C)[C@@H](O)[C@H](Cc2cc...,CHEMBL3799941,Q9YQ12,PQITLWQRPFVTIKIEGQLKEALLDTGADDTVLEEMNLPGRWKPKM...,2016.0,Q9YQ12_9HIV1,Human immunodeficiency virus 1,


In [42]:
# Check how many bioactivity datapoints are tested in undefined mutants ('MUTANT' label in target_id)
annotated_data_new_round.loc[lambda data: data['target_id'].str.contains('MUTANT')]

Unnamed: 0,target_id,connectivity,pchembl_value_Mean,Activity_class_consensus,source,SMILES,CID,accession,sequence,Year,UniProtID,Organism,HGNC_symbol
1886,A5Z252_MUTANT,ARAIBEBZBOPLMB,8.12,"Series([], dtype: object)",ChEMBL31,CC(=O)N[C@@H]1[C@@H](NC(=N)N)C=C(C(=O)O)O[C@H]...,CHEMBL222813,A5Z252,MNPNQKIITIGSICMVIGIVSLMLQIGNMISIWVSHSIQTGNQRQA...,2018.0,A5Z252_9INFA,Influenza A virus (A/Turkey/651242/2006(H5N1)),
1887,A5Z252_MUTANT,BWDGROPAFQXXFI,6.44,"Series([], dtype: object)",ChEMBL31,CCC(CC)O[C@@H]1C=C(C(=O)O)C[C@H](NCc2ccc(Oc3cc...,CHEMBL4282553,A5Z252,MNPNQKIITIGSICMVIGIVSLMLQIGNMISIWVSHSIQTGNQRQA...,2018.0,A5Z252_9INFA,Influenza A virus (A/Turkey/651242/2006(H5N1)),
1888,A5Z252_MUTANT,DSRHDFMYUHIMKS,4.57,"Series([], dtype: object)",ChEMBL31,CCC(CC)O[C@@H]1C=C(C(=O)O)C[C@H](NCc2ccc(-c3cc...,CHEMBL4285029,A5Z252,MNPNQKIITIGSICMVIGIVSLMLQIGNMISIWVSHSIQTGNQRQA...,2018.0,A5Z252_9INFA,Influenza A virus (A/Turkey/651242/2006(H5N1)),
1889,A5Z252_MUTANT,ITKUZBOVXCDIPW,6.14,"Series([], dtype: object)",ChEMBL31,CCc1ccc(CN[C@H]2CC(C(=O)O)=C[C@@H](OC(CC)CC)[C...,CHEMBL4285767,A5Z252,MNPNQKIITIGSICMVIGIVSLMLQIGNMISIWVSHSIQTGNQRQA...,2018.0,A5Z252_9INFA,Influenza A virus (A/Turkey/651242/2006(H5N1)),
1890,A5Z252_MUTANT,MCRWKUDKDJFWEE,7.44,"Series([], dtype: object)",ChEMBL31,CCC(CC)O[C@@H]1C=C(C(=O)O)C[C@H](NCc2ccc(Sc3cc...,CHEMBL4276773,A5Z252,MNPNQKIITIGSICMVIGIVSLMLQIGNMISIWVSHSIQTGNQRQA...,2018.0,A5Z252_9INFA,Influenza A virus (A/Turkey/651242/2006(H5N1)),
...,...,...,...,...,...,...,...,...,...,...,...,...,...
451580,Q9UM73_MUTANT,OFVDSACHDVLZDS,8.52,"Series([], dtype: object)",ChEMBL31,COc1ncccc1-c1c(C(N)=O)sc2cnc(Nc3cc(C)c(N4CCN(C...,CHEMBL3669139,Q9UM73,MGAIGLLWLLPLLLSTAAVGSGMGTGQRAGSPAAGPPLQPREPLSY...,2015.0,ALK_HUMAN,Homo sapiens (Human),ALK
451581,Q9UM73_MUTANT,QFALLVRHZMVEMI,7.24,"Series([], dtype: object)",ChEMBL31,COc1ccccc1-c1c(C(N)=O)sc2cnc(Nc3ccc(NC4CCOCC4)...,CHEMBL3669147,Q9UM73,MGAIGLLWLLPLLLSTAAVGSGMGTGQRAGSPAAGPPLQPREPLSY...,2015.0,ALK_HUMAN,Homo sapiens (Human),ALK
451582,Q9UM73_MUTANT,SWRGFJBGNAEJOY,7.54,"Series([], dtype: object)",ChEMBL31,COc1ccc(F)cc1-c1c(CO)sc2cnc(Nc3ccc(N4CCN(C)CC4...,CHEMBL3669151,Q9UM73,MGAIGLLWLLPLLLSTAAVGSGMGTGQRAGSPAAGPPLQPREPLSY...,2015.0,ALK_HUMAN,Homo sapiens (Human),ALK
451583,Q9UM73_MUTANT,URHVQJZEFHMQBZ,7.27,"Series([], dtype: object)",ChEMBL31,COc1ccccc1-c1c(C(N)=O)sc2cnc(Nc3ccc(-c4cnn(C)c...,CHEMBL3669143,Q9UM73,MGAIGLLWLLPLLLSTAAVGSGMGTGQRAGSPAAGPPLQPREPLSY...,2015.0,ALK_HUMAN,Homo sapiens (Human),ALK


In [43]:
# Check the distribution per source of mutant bioactivity datapoints.
# Wild-type rows are recognised by the '_WT' suffix of target_id: a plain substring search
# for 'WT' would also match accessions that happen to contain those letters and would
# wrongly exclude the mutants of such targets from the counts.
(
    annotated_data_new_round[~annotated_data_new_round['target_id'].str.endswith('_WT')]
    .groupby(['source'])['pchembl_value_Mean']
    .count()
)

source
ChEMBL31                               22992
ChEMBL31;Papyrus05.5_Christmann2016     1422
Papyrus05.5_Christmann2016               672
Name: pchembl_value_Mean, dtype: int64

#### Quickly check EGFR as case study

##### Check how many deletions there are

In [44]:
# Reload the false negative (rejected) and false positive (wrongly annotated) assays of the
# previous round and keep only the deletion flags for EGFR (P00533)
false_negative_full = pd.read_csv(
    os.path.join(data_dir, f'chembl{chembl_version}_rejected_assays_round{previous_annotation_round}.csv'),
    sep='\t', usecols=['assay_id', 'accession', 'rejection_flag'])
false_positive_full = pd.read_csv(
    os.path.join(data_dir, f'chembl{chembl_version}_wrong_annotated_assays_round{previous_annotation_round}.csv'),
    sep='\t', usecols=['assay_id', 'accession', 'reason', 'group_reason'])

# Deletions are flagged differently in each file
false_negative_deletions = false_negative_full[false_negative_full['rejection_flag'] == 'original_deletion']
false_positive_deletions = false_positive_full[false_positive_full['reason'] == 'missing deletion']

# Stack the EGFR assays of both sets (false positives first) and mark them all as deletions
egfr_deletions = pd.concat(
    [flagged.loc[flagged['accession'] == 'P00533', ['assay_id', 'accession']]
     for flagged in (false_positive_deletions, false_negative_deletions)]
).assign(deletion=True)

In [45]:
# Attach deletion flags to round 2 assays (with updated target_id).
# - The flags are de-duplicated on the merge keys so that an assay listed more than once in
#   egfr_deletions cannot multiply assay rows in the left join (and inflate the counts below).
# - Only the 'deletion' column is completed after the join (no match -> False); a frame-wide
#   fillna(False) would also overwrite missing values in unrelated columns such as 'mutation'.
chembl_assays_with_egfr_deletions = (
    pd.merge(new_round_assays[new_round_assays['accession'] == 'P00533'],
             egfr_deletions.drop_duplicates(subset=['assay_id', 'accession']),
             how='left', on=['assay_id', 'accession'])
    .assign(deletion=lambda assays: assays['deletion'].eq(True))
)
# Check how many assays (from ChEMBL) have hidden deletions per EGFR annotated mutation
chembl_assays_with_egfr_deletions.groupby(['target_id', 'deletion'])['assay_id'].count()

target_id                 deletion
P00533_A750P              False          2
                          True          26
P00533_C797S              False          7
P00533_C797S_L858R        False          2
P00533_G719C              False         30
P00533_G719S              False         30
P00533_L858R              False        176
                          True           1
P00533_L861Q              False         58
P00533_P753S              False          2
                          True          25
P00533_T790M              False        106
                          True          15
P00533_T790M_C797S        False          2
                          True           5
P00533_T790M_C797S_L858R  False         30
P00533_T790M_L858M        False          1
P00533_T790M_L858R        False        293
P00533_T790M_L861R        False          1
P00533_WT                 False       1810
Name: assay_id, dtype: int64

In [46]:
# Attach ChEMBL data to round 2 assays with deletion flags
chembl_data = obtain_chembl_data(chembl_version)
# It is not possible to use ChEMBL + Papyrus round 2 (annotated_data_new_round) here because it
# no longer has the assay_id variable.
# Only the 'deletion' column is completed after the left join (no match -> False): a frame-wide
# fillna(False) would also turn a missing target_id, mutation or chembl_id into the value False.
chembl_data_with_egfr_deletions = (
    pd.merge(chembl_data[chembl_data['accession'] == 'P00533'][['assay_id', 'accession',
                                                                'chembl_id', 'mutation']],
             chembl_assays_with_egfr_deletions[['assay_id', 'accession', 'target_id',
                                                'deletion']],
             how='left', on=['assay_id', 'accession'])
    .assign(deletion=lambda data: data['deletion'].eq(True))
)
# Check how many activity datapoints (from ChEMBL) have hidden deletions per EGFR annotated mutation.
# dropna=False keeps activities whose assay has no round 2 annotation visible (NaN target_id)
# instead of silently dropping them from the summary.
chembl_data_with_egfr_deletions.groupby(['target_id', 'deletion'], dropna=False)['chembl_id'].count()

  


target_id                 deletion
P00533_A750P              False          20
                          True           62
P00533_C797S              False          30
P00533_C797S_L858R        False          34
P00533_G719C              False          91
P00533_G719S              False          82
P00533_L858R              False        1704
                          True            1
P00533_L861Q              False         901
P00533_P753S              False          18
                          True           58
P00533_T790M              False        1144
                          True          108
P00533_T790M_C797S        False           5
                          True           20
P00533_T790M_C797S_L858R  False         145
P00533_T790M_L858M        False          30
P00533_T790M_L858R        False        2757
P00533_T790M_L861R        False           1
P00533_WT                 False       14100
Name: chembl_id, dtype: int64