In [1]:
# Enable the IPython autoreload extension so that edits to imported modules
# are picked up without restarting the kernel
%reload_ext autoreload
%autoreload 2

In [2]:
# Define ChEMBL and Papyrus versions (and the Papyrus flavor) used throughout the notebook
chembl_version = '31'
papyrus_version = '05.5'
papyrus_flavor = 'nostereo'

In [3]:
# Define annotation round; it is passed on to get_mutant_analysis_path and
# map_chembl_compounds further down
annotation_round = 2

In [4]:
# Define the protein to analyse by its accession.
# Keep exactly one assignment uncommented; the others are alternative targets.
analysis_accession = 'P00533' #EGFR
# analysis_accession = 'Q72547'  #HIV
# analysis_accession = 'P00519' #ABL1
# analysis_accession = 'O60885' #BRD4
# analysis_accession = 'O75874' #IDH1

In [5]:
# Butina clustering cutoff per protein accession.
# NOTE(review): no cell visible in this part of the notebook reads this mapping;
# get_clustering_stats is called with cutoff='0.5' regardless of the accession.
# Confirm whether this dictionary is still needed or should drive that call.
butina_cutoff = {
    'P00533': 0.7,  # EGFR
    'Q72547': 0.5,  # HIV
    'P00519': 0.5,  # ABL1
    'O60885': 0.7,  # BRD4
    'O75874': 0.7,  # IDH1
}

In [6]:
# Define directories of interest.
# This overrides the `data_dir` attribute of the package-level `data_path` module
# with a relative path, so it must run before any package function that reads it.
# NOTE(review): presumably resolved against the notebook's working directory — confirm.
from mutants_in_pcm import data_path
data_path.data_dir = '../data'

In [7]:
from utils import get_mutant_analysis_path
# First argument handed to get_mutant_analysis_path below. This notebook-level
# `data_dir` is a separate variable from `data_path.data_dir` set above.
data_dir = '1_mutant_statistics'

In [8]:
# Path for the 'compound' analysis of the current annotation round; used below
# as `output_dir` when reading the clustering statistics
compound_analysis_dir = get_mutant_analysis_path(data_dir, 'compound', annotation_round)

In [9]:
# Import libraries and analysis functions
import os
import pandas as pd
import numpy as np

In [10]:
from mutants_in_pcm.mutant_analysis_compounds import get_clustering_stats,map_chembl_compounds,group_unique_df,explore_cluster_compound_info,annotate_cluster_compounds

### Check mechanism of action, mutation and approval status in cluster analysis

In [11]:
# Read compound information retrieved from ChEMBL.
# The ChEMBL/Papyrus versions and flavor are taken from the configuration cell at
# the top of the notebook instead of being repeated here as literals, so changing
# them in one place keeps this call in sync.
mapped_compounds = map_chembl_compounds(
    chembl_version,
    papyrus_version,
    papyrus_flavor,
    1_000_000,  # NOTE(review): presumably a chunk size for reading the data — confirm
    annotation_round=annotation_round,
)
mapped_compounds

ChEMBL/Papyrus compound mapping file already exists. Reading it.


  


Unnamed: 0,connectivity,CID,chembl_id,molregno,pref_name,max_phase,therapeutic_flag,natural_product,molecule_type,first_approval,...,chembl_id_child,pref_name_child,max_phase_child,accession_child,mutation_child,mechanism_of_action_child,action_type_child,mechanism_comment_child,selectivity_comment_child,indications_child
0,AAAAZQPHATYWOK,CHEMBL175513,CHEMBL175513,299040,,0,0,-1,Small molecule,,...,,,,,,,,,,
1,AAABHMIRDIOYOK,CHEMBL1527551,CHEMBL1527551,951435,,0,0,-1,Small molecule,,...,,,,,,,,,,
2,AAABTPAECTZDET,CHEMBL221553,CHEMBL221553,368668,,0,0,-1,Small molecule,,...,,,,,,,,,,
3,AAACBXVBBDAYRQ,CHEMBL4067228,CHEMBL4067228,2207139,,0,0,-1,Small molecule,,...,,,,,,,,,,
4,AAADPBLPXCELKR,CHEMBL495028,CHEMBL495028,478485,,0,0,-1,Small molecule,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
317276,ZZZNVZMIBLYSAK,CHEMBL1309011,CHEMBL1309011,732895,,0,0,-1,Small molecule,,...,,,,,,,,,,
317277,ZZZOPDYSYWKLLT,CHEMBL1779477,CHEMBL1779477,1147950,,0,0,-1,Small molecule,,...,,,,,,,,,,
317278,ZZZQZZTYLVXHSC,CHEMBL1165483,CHEMBL1165483,644719,,0,0,-1,Small molecule,,...,,,,,,,,,,
317279,ZZZZEECYUGCIIB,CHEMBL4098540,CHEMBL4098540,2238451,,0,0,-1,Small molecule,,...,,,,,,,,,,


In [12]:
# Read cluster statistics and compound-cluster mapping.
# The 'full' set still refers to compounds tested in at least two variants
# (normally WT and another).
# Butina cutoff was always 0.5 for clustering in this analysis.
cluster_compounds = get_clustering_stats(
    accession=analysis_accession,
    output_dir=compound_analysis_dir,
    subset_alias='full_dual_tested_set',
    cutoff='0.5',
)

Number of clusters: 118
Number of compounds in clusters: 1219
Number of compounds per cluster:
Cluster 1: 253 compounds
Cluster 2: 236 compounds
Cluster 3: 91 compounds
Cluster 4: 78 compounds
Cluster 5: 35 compounds
Cluster 6: 35 compounds
Cluster 7: 30 compounds
Cluster 8: 29 compounds
Cluster 9: 28 compounds
Cluster 10: 26 compounds
Cluster 11: 24 compounds
Cluster 12: 23 compounds
Cluster 13: 20 compounds
Cluster 14: 20 compounds
Cluster 15: 19 compounds
Cluster 16: 19 compounds
Cluster 17: 16 compounds
Cluster 18: 13 compounds
Cluster 19: 12 compounds
Cluster 20: 12 compounds
Cluster 21: 11 compounds
Cluster 22: 10 compounds
Cluster 23: 7 compounds
Cluster 24: 6 compounds
Cluster 25: 6 compounds
Cluster 26: 5 compounds
Cluster 27: 5 compounds
Cluster 28: 5 compounds
Cluster 29: 5 compounds
Cluster 30: 4 compounds
Cluster 31: 4 compounds
Cluster 32: 4 compounds
Cluster 33: 4 compounds
Cluster 34: 4 compounds
Cluster 35: 4 compounds
Cluster 36: 4 compounds
Cluster 37: 4 compounds
Cl

In [13]:
# Annotate cluster compounds with ChEMBL information.
# The displayed frame carries the ChEMBL mapping columns plus a `cluster` column.
cluster_df_unique = annotate_cluster_compounds(cluster_compounds, mapped_compounds)
cluster_df_unique

Unnamed: 0,connectivity,CID,chembl_id,molregno,pref_name,max_phase,therapeutic_flag,natural_product,molecule_type,first_approval,...,pref_name_child,max_phase_child,accession_child,mutation_child,mechanism_of_action_child,action_type_child,mechanism_comment_child,selectivity_comment_child,indications_child,cluster
0,AAFHSECTTHOVFV,CHEMBL2029442,CHEMBL2029442,1341486,,0,0,-1,Small molecule,,...,,,,,,,,,,1
1,AAKJLRGGTJKAMG,CHEMBL553,CHEMBL553,14785,ERLOTINIB,4,1,0,Small molecule,2004.0,...,ERLOTINIB HYDROCHLORIDE,4,P00533,,Epidermal growth factor receptor erbB1 inhibitor,INHIBITOR,,,Astrocytoma (1.0);Glioma (1.0);Meningioma (1.0...,2
2,ABEGTFCLSXRJQE,CHEMBL4077228,CHEMBL4077228,2217139,,0,0,-1,Small molecule,,...,,,,,,,,,,10
3,ABGXDJDXYCHCQN,CHEMBL4778502,CHEMBL4778502,2515093,,0,0,-1,,,...,,,,,,,,,,7
4,ABHDYLBMXLXCKE,CHEMBL4548481,CHEMBL4548481,2418909,,0,0,-1,,,...,,,,,,,,,,38
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1214,ZYZAJOCFJDOSLF,CHEMBL3353406,CHEMBL3353406,1817514,,0,0,-1,Small molecule,,...,,,,,,,,,,1
1215,ZZGGOXMEIWLVTJ,CHEMBL2029432,CHEMBL2029432,1341476,,0,0,-1,Small molecule,,...,,,,,,,,,,1
1216,ZZHGYQVOTUDVEE,CHEMBL4276842,CHEMBL4276842,2316293,,0,0,-1,Small molecule,,...,,,,,,,,,,3
1217,ZZSBPGIGIUFJRA,CHEMBL338956;CHEMBL1240703,CHEMBL338956;CHEMBL1240703,215441;699914,nan;CGP-52421,0,0,-1,Small molecule,,...,,,,,,,,,,37


In [14]:
# Count, per cluster, the compounds linked to the analysis accession in their MOA
explore_cluster_compound_info(
    cluster_df_unique,
    analysis_accession,
    analysis_type='MOA',
    output_type='stats',
    sort='both',
)

Number of clusters with at least one (parent or child) compound satisfying the condition: 11 (0.09%)


Unnamed: 0_level_0,P00533_MOA,P00533_MOA_child,P00533_MOA_total
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2,7,3,10
1,2,0,2
11,2,0,2
16,1,0,1
20,1,0,1
...,...,...,...
113,0,0,0
115,0,0,0
116,0,0,0
117,0,0,0


In [15]:
# List the compounds linked to the analysis accession in their MOA
explore_cluster_compound_info(
    cluster_df_unique,
    analysis_accession,
    analysis_type='MOA',
    output_type='df',
    sort='both',
)

Unnamed: 0,connectivity,pref_name,accession,accession_child,mutation,max_phase,cluster
203,DUYJMQONPNNFPI,OSIMERTINIB,P00533,,T790M,4,1
359,HUFOZJXAKZVRNJ,ROCILETINIB,P00533,,,3,1
1,AAKJLRGGTJKAMG,ERLOTINIB,,P00533,,4,2
1050,WVUNYSQLFKLYNI,PELITINIB,P00533,,,2,2
929,ULXXDDBFHOBEHA,AFATINIB,,P00533;P04626;Q15303,,4,2
917,UHTHHESEBZOYNR,VANDETANIB,P29317;P29323;P54756;P54764;P29322;Q9UF33;Q153...,,,4,2
688,OMZCMEYTWSXEPZ,CANERTINIB,,P00533;P04626;Q15303,,3,2
611,MXDSJQHFFDGFDK,AZD-3759,P00533,,,2,2
556,LVXJQMNHJWSHET,DACOMITINIB,P00533;P04626;Q15303,,,4,2
1063,XGALLCVXEZPNRQ,GEFITINIB,P00533,,,4,2


In [16]:
# Count, per cluster, the compounds linked to a mutation in their MOA
explore_cluster_compound_info(
    cluster_df_unique,
    analysis_accession,
    analysis_type='mutation',
    output_type='stats',
    sort='both',
)

Number of clusters with at least one (parent or child) compound satisfying the condition: 4 (0.03%)


Unnamed: 0_level_0,P00533_mutation,P00533_mutation_child,P00533_mutation_total
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
11,2,0,2
1,1,0,1
16,1,0,1
29,1,0,1
2,0,0,0
...,...,...,...
114,0,0,0
115,0,0,0
116,0,0,0
117,0,0,0


In [17]:
# List the compounds linked to a mutation, together with the mutations involved
explore_cluster_compound_info(
    cluster_df_unique,
    analysis_accession,
    analysis_type='mutation',
    output_type='df',
    sort='both',
)

Unnamed: 0,connectivity,pref_name,accession,mutation,mutation_child,max_phase,cluster
203,DUYJMQONPNNFPI,OSIMERTINIB,P00533,T790M,,4,1
221,FDMQDKQUTRLUBU,OLMUTINIB,P00533,T790M,,2,11
935,UOFYSRZSLXWIQB,ABIVERTINIB,P00533;Q06187,UNDEFINED MUTATION;T790M;nan,,3,11
459,JYIUNVOCEFIUIU,MAVELERTINIB,P00533,"UNDEFINED MUTATION;L858R;T790M;T790M,L858M",,2,16
678,ODMXWZROLKITMS,PF-06459988,P00533,T790M,,2,29


In [18]:
# Check how many compounds per cluster are approved
explore_cluster_compound_info(
    cluster_df_unique,
    analysis_accession,
    analysis_type='approval',
    output_type='stats',
    sort='both',
)

Number of clusters with at least one (parent or child) compound satisfying the condition: 18 (0.15%)


Unnamed: 0_level_0,approved,approved_child,approved_total
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2,6,0,6
1,1,0,1
6,1,0,1
13,1,0,1
27,1,0,1
...,...,...,...
112,0,0,0
113,0,0,0
114,0,0,0
116,0,0,0


In [19]:
# List the approved compounds
explore_cluster_compound_info(
    cluster_df_unique,
    analysis_accession,
    analysis_type='approval',
    output_type='df',
    sort='both',
)

Unnamed: 0,connectivity,pref_name,accession,mutation,max_phase,max_phase_child,cluster
203,DUYJMQONPNNFPI,OSIMERTINIB,P00533,T790M,4,,1
1,AAKJLRGGTJKAMG,ERLOTINIB,,,4,4,2
1063,XGALLCVXEZPNRQ,GEFITINIB,P00533,,4,,2
929,ULXXDDBFHOBEHA,AFATINIB,,,4,4,2
917,UHTHHESEBZOYNR,VANDETANIB,P29317;P29323;P54756;P54764;P29322;Q9UF33;Q153...,,4,,2
457,JWNPDZNEKVCWMY,NERATINIB,P00533;P04626;Q15303,,4,,2
556,LVXJQMNHJWSHET,DACOMITINIB,P00533;P04626;Q15303,,4,,2
1101,XYFPWWZEPKGCCK,IBRUTINIB,Q06187,,4,,6
903,UBPYILGKFZZVDX,BOSUTINIB,P00519;P11274;P08631;P07948;P12931,,4,,13
505,KTUFNOKKBVMGRW,nan;IMATINIB,,,0;4,nan;4;0,27


In [20]:
# For every (cluster, max_phase) pair, count the distinct compounds and collect
# their preferred names, to see how clinical-trial stages spread over clusters
(
    cluster_df_unique
    .groupby(['cluster', 'max_phase'])
    .agg({'connectivity': 'nunique', 'pref_name': 'unique'})
)

Unnamed: 0_level_0,Unnamed: 1_level_0,connectivity,pref_name
cluster,max_phase,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0,250,[nan]
1,3,1,[ROCILETINIB]
1,4,1,[OSIMERTINIB]
1,0;2,1,[nan;BI-2536]
2,0,226,"[nan, AZD-1152-HQPA]"
...,...,...,...
114,1,1,[CEP-32496]
115,4,1,[SELUMETINIB]
116,3,1,[QUIZARTINIB]
117,4,1,[PAZOPANIB]
