# Compound Counts

Count distinct parent compounds

In [1]:
import pandas as pd
from sqlalchemy import create_engine

from local_utils.file_utils import backup_file

### Configuration

In [2]:
# ChEMBL connection: the connection string is read from the local file 'database.txt'
# (surrounding whitespace stripped). The file is opened in a 'with' block so that the
# handle is closed as soon as the string has been read...

with open('database.txt') as url_file:
    engine = create_engine(url_file.read().strip())

In [3]:
# Maximum number of heavy atoms allowed; bound to the ':max_nat' parameter of the count
# queries, which keep only rows with nat <= max_nat...

max_nat = 50

### Reload targets

In [4]:
# Reload the targets table, then report (number of rows, number of rows with exclude == 1)...

targets = pd.read_pickle('chembl_targets.pkl')

len(targets), len(targets.query("exclude == 1"))

(377, 19)

In [5]:
# Preview the first rows of the reloaded targets table...

targets.head()

Unnamed: 0,symbol,approved_name,targets,n_target,chembl_id,target_type,pref_name,species,exclude,target_class_0,target_class_1
0,ABCB1,"ATP-binding cassette, sub-family B (MDR/TAP), member 1","MDR1;7|MDR1;8|ATP-binding cassette, sub-family B (MDR/TAP), member 1;11",1,CHEMBL4302,SINGLE PROTEIN,P-glycoprotein 1,Human,0,Transporter > Primary active transporter,Transporter > Primary active transporter
1,ABCB1,"ATP-binding cassette, sub-family B (MDR/TAP), member 1","MDR1;7|MDR1;8|ATP-binding cassette, sub-family B (MDR/TAP), member 1;11",1,CHEMBL1075229,SINGLE PROTEIN,Multidrug resistance protein 1,Rat,0,,
2,ABCB11,"ATP-binding cassette, sub-family B (MDR/TAP), member 11",BSEP;6|BSEP;7|BSEP;8,1,CHEMBL6020,SINGLE PROTEIN,Bile salt export pump,Human,0,Transporter > Primary active transporter,Transporter > Primary active transporter
3,ABCB11,"ATP-binding cassette, sub-family B (MDR/TAP), member 11",BSEP;6|BSEP;7|BSEP;8,1,CHEMBL2073674,SINGLE PROTEIN,Bile salt export pump,Rat,0,,
4,ABCB4,"ATP-binding cassette, sub-family B (MDR/TAP), member 4",MDR3;7|MDR3;8,1,CHEMBL1743129,SINGLE PROTEIN,Multidrug resistance protein 3,Human,0,Transporter > Primary active transporter,Transporter > Primary active transporter


## Get counts for distinct parent compounds

### Individual ChEMBL targets

Aggregate at ChEMBL target level.

Note that, at this stage, multiple target_chemblid to symbol mappings have not been pruned, hence the `count(distinct ...)` aggregates in the query.

In [6]:
# Query tt_curve_data_v2 for one row per (symbol, species, target_chemblid) with...
#   n_active: number of distinct parent compounds having active = 1
#             (the 'case' yields NULL otherwise, which count(distinct ...) ignores)
#   n_total:  number of distinct parent compounds
# Only rows with exclude = 0 and nat <= max_nat are counted; max_nat is passed as a bound parameter.

count_by_target = pd.read_sql_query("""
select
    a.symbol
  , a.species
  , a.target_chemblid
  , count(distinct case when a.active = 1 then a.parent_cmpd_chemblid end) as n_active
  , count(distinct a.parent_cmpd_chemblid) as n_total
from
  tt_curve_data_v2 a 
where
    a.exclude = 0
and a.nat <= :max_nat 
group by
    a.symbol
  , a.species
  , a.target_chemblid
order by
    a.symbol
  , a.species
  , a.target_chemblid
""", engine, params={'max_nat': max_nat})

count_by_target.shape

(312, 5)

In [7]:
# Preview the per-target counts (first rows only, rather than rendering the whole table)...

count_by_target.head(10)

Unnamed: 0,symbol,species,target_chemblid,n_active,n_total
0,ABCB1,Human,CHEMBL4302,784,1249
1,ABCB11,Human,CHEMBL6020,1,16
2,ABCB11,Rat,CHEMBL2073674,8,9
3,ABCC2,Human,CHEMBL5748,5,27
4,ABCC2,Rat,CHEMBL2073676,14,55
5,ABCC3,Human,CHEMBL5918,1,1
6,ABCC3,Rat,CHEMBL2073682,1,4
7,ABCC4,Human,CHEMBL1743128,8,21
8,ABCG2,Human,CHEMBL5393,266,379
9,ABL1,Human,CHEMBL1862,978,1828


In [8]:
# Count, per species, the distinct symbols with at least one ChEMBL target having thirty or more
# distinct active parent compounds ('@x' in the query string is the comprehension variable)...

{x: len(set(count_by_target.query("species == @x and n_active >= 30")['symbol'].values)) for x in ['Human', 'Rat']}

{'Human': 142, 'Rat': 54}

In [9]:
# Human symbols with at least one individual ChEMBL target reaching thirty distinct actives...

found_targets = set(
    count_by_target
    .query("species == 'Human' and n_active >= 30")
    .loc[:, 'symbol']
)

### Symbol and Species

Aggregate at symbol/species level, _i.e._ merge across ChEMBL targets for a given symbol/species combination.

In [10]:
# Same counts as at the target level, but grouped by (symbol, species) only, so that distinct
# parent compounds are counted across all ChEMBL targets of a symbol/species combination.
# Only rows with exclude = 0 and nat <= max_nat are counted...

count_by_symbol_and_species = pd.read_sql_query("""
select
    a.symbol
  , a.species
  , count(distinct case when a.active = 1 then a.parent_cmpd_chemblid end) as n_active
  , count(distinct a.parent_cmpd_chemblid) as n_total
from
  tt_curve_data_v2 a
where
    a.exclude = 0
and a.nat <= :max_nat
group by
    a.symbol
  , a.species
order by
    a.symbol
  , a.species
""", engine, params={'max_nat': max_nat})

count_by_symbol_and_species.shape

(279, 4)

In [11]:
# Preview the symbol/species-level counts (first rows only, rather than rendering the whole table)...

count_by_symbol_and_species.head(10)

Unnamed: 0,symbol,species,n_active,n_total
0,ABCB1,Human,784,1249
1,ABCB11,Human,1,16
2,ABCB11,Rat,8,9
3,ABCC2,Human,5,27
4,ABCC2,Rat,14,55
5,ABCC3,Human,1,1
6,ABCC3,Rat,1,4
7,ABCC4,Human,8,21
8,ABCG2,Human,266,379
9,ABL1,Human,978,1828


In [12]:
# Count symbols (per species) having thirty or more distinct active parent compounds
# ('@x' in the query string is the comprehension variable)...

{x: len(set(count_by_symbol_and_species.query("species == @x and n_active >= 30")['symbol'].values)) for x in ['Human', 'Rat']}

{'Human': 143, 'Rat': 54}

In [13]:
# Human symbols reaching thirty distinct actives once their ChEMBL targets are merged...

found_symbol_and_species = set(
    count_by_symbol_and_species
    .query("species == 'Human' and n_active >= 30")
    .loc[:, 'symbol']
)

In [14]:
# Human symbols that pass the threshold only after merging their ChEMBL targets,
# i.e. those found at symbol/species level but not at individual target level...

recovered_symbol_and_species = found_symbol_and_species - found_targets

len(recovered_symbol_and_species)

1

In [15]:
# Symbol/species-level counts for the recovered Human symbols...

count_by_symbol_and_species.query("symbol in @recovered_symbol_and_species and species == 'Human'")

Unnamed: 0,symbol,species,n_active,n_total
225,PRKAA2,Human,31,96


In [16]:
# Per-target counts for the recovered Human symbols (by construction, none of their individual
# Human targets has n_active >= 30)...

count_by_target.query("symbol in @recovered_symbol_and_species and species == 'Human'")

Unnamed: 0,symbol,species,target_chemblid,n_active,n_total
257,PRKAA2,Human,CHEMBL2116,17,82
258,PRKAA2,Human,CHEMBL3038455,14,15


### Symbol only

Aggregate at symbol level, _i.e._ merge across ChEMBL target and species for a given symbol.

In [17]:
# Same counts again, grouped by symbol only, so that distinct parent compounds are counted
# across all ChEMBL targets and species of a symbol.
# Only rows with exclude = 0 and nat <= max_nat are counted...

count_by_symbol = pd.read_sql_query("""
select
    a.symbol
  , count(distinct case when a.active = 1 then a.parent_cmpd_chemblid end) as n_active
  , count(distinct a.parent_cmpd_chemblid) as n_total
from
  tt_curve_data_v2 a
where
    a.exclude = 0
and a.nat <= :max_nat
group by
    a.symbol
order by
    a.symbol
""", engine, params={'max_nat': max_nat})

count_by_symbol.shape

(184, 3)

In [18]:
# Preview the symbol-level counts (first rows only, rather than rendering the whole table)...

count_by_symbol.head(10)

Unnamed: 0,symbol,n_active,n_total
0,ABCB1,784,1249
1,ABCB11,9,22
2,ABCC2,17,79
3,ABCC3,2,5
4,ABCC4,8,21
5,ABCG2,266,379
6,ABL1,978,1828
7,ACE,742,992
8,ACHE,2550,4877
9,ADCY5,41,105


In [19]:
# Get and count symbols having thirty or more distinct active parent compounds
# when merged across ChEMBL targets and species...

found_symbol = set(
    count_by_symbol
    .query("n_active >= 30")
    .loc[:, 'symbol']
)

len(found_symbol)

147

In [20]:
# Extra symbols 'recovered' by merging targets *and* species: found at symbol level but at
# neither the individual target level nor the symbol/species level (both Human only)...

recovered_symbol = found_symbol - (found_targets | found_symbol_and_species)

len(recovered_symbol)

4

In [21]:
# Symbol-level counts for the extra recovered symbols...

count_by_symbol.query("symbol in @recovered_symbol")

Unnamed: 0,symbol,n_active,n_total
9,ADCY5,41,105
161,SLC15A2,30,117
162,SLC22A1,31,101
164,SLC22A6,35,73


In [22]:
# Per-target (and hence per-species) breakdown for the extra recovered symbols...

count_by_target.query("symbol in @recovered_symbol")

Unnamed: 0,symbol,species,target_chemblid,n_active,n_total
14,ADCY5,Human,CHEMBL3189,23,83
15,ADCY5,Rat,CHEMBL2880,18,22
276,SLC15A2,Human,CHEMBL1743125,22,78
277,SLC15A2,Rat,CHEMBL3325,10,66
278,SLC22A1,Human,CHEMBL5685,19,79
279,SLC22A1,Rat,CHEMBL2073670,16,43
282,SLC22A6,Human,CHEMBL1641347,26,49
283,SLC22A6,Rat,CHEMBL1777665,13,47


## What targets have been lost?

What is lost when aggregating at the symbol/species level after filtering by activity and compound size (heavy-atom count, `nat <= max_nat`)?

In [23]:
# Symbols from the targets table that do not pass the distinct active compound threshold
# at the Human symbol/species level...

lost = set(targets['symbol']) - found_symbol_and_species

len(lost)

52

In [24]:
# HTML(targets.query("symbol in @lost").to_html())

### In terms of original 'targets' and the tables they appear in

The 'lost' targets mainly seem to be...

* Ion channels
* Transporters
* Xenobiotic metabolising enzymes

In [25]:
# Load the per-symbol table (symbol, approved name and the 'targets' entries it was derived from)
# and preview the first rows...

symbols = pd.read_pickle('unique_symbols.pkl')

symbols.head()

Unnamed: 0_level_0,symbol,approved_name,targets
symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ABCB1,ABCB1,"ATP-binding cassette, sub-family B (MDR/TAP), member 1","((MDR1, 7), (MDR1, 8), (ATP-binding cassette, sub-family B (MDR/TAP), member 1, 11))"
ABCB11,ABCB11,"ATP-binding cassette, sub-family B (MDR/TAP), member 11","((BSEP, 6), (BSEP, 7), (BSEP, 8))"
ABCB4,ABCB4,"ATP-binding cassette, sub-family B (MDR/TAP), member 4","((MDR3, 7), (MDR3, 8))"
ABCC2,ABCC2,"ATP-binding cassette, sub-family C (CFTR/MRP), member 2","((MRP2, 7), (MRP2, 8), (ATP-binding cassette, sub-family C (CFTR/MRP), member 2, 11))"
ABCC3,ABCC3,"ATP-binding cassette, sub-family C (CFTR/MRP), member 3","((MRP3, 7), (MRP3, 8))"


In [26]:
def f(key, group):
    """Summarise one group of the symbols table as a record.

    key is the group key and group the sub-frame belonging to it. The returned dict
    holds the key ('target'), the array of the group's 'symbol' values ('symbols')
    and the number of those values ('count').
    """
    members = group['symbol'].values
    return dict(target=key, symbols=members, count=len(members))
    
# Group the 'lost' symbols by their 'targets' entry and order the groups by the table number of
# the first (name, table) pair. Rows are selected with a boolean mask instead of passing the set
# 'lost' to .loc: set indexers are rejected by recent pandas, labels missing from the index
# cannot fail the lookup, and rows keep the order of the symbols table rather than set
# iteration order...
pd.DataFrame(
    [
        f(key, group)
        for key, group in sorted(
            symbols.loc[symbols.index.isin(lost)].groupby('targets'),
            key=lambda item: float(item[0][0][1]),
        )
    ],
    columns=['target', 'symbols', 'count'],
)

Unnamed: 0,target,symbols,count
0,"((Potassium voltage-gated channel KQT-like member 1 and minimal potassium channel MinK, 1),)",[KCNE1],1
1,"((ANF, 2.2),)",[NPR1],1
2,"((APJ, 2.2),)",[APLNR],1
3,"((CGRP1, 2.2),)",[CALCR],1
4,"((GPR103, 2.2),)",[QRFPR],1
5,"((adenylyl cyclase, 3),)",[ADCY5],1
6,"((tyrosine hydroxylase, 3),)",[TH],1
7,"((IK1, 4),)",[KCNJ2],1
8,"((IKAch, 4),)","[KCNJ3, KCNJ5]",2
9,"((IKP, 4),)",[KCNK3],1


In [27]:
# Rows of the symbols table for the 'lost' symbols...

symbols.query("symbol in @lost")

Unnamed: 0_level_0,symbol,approved_name,targets
symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ABCB11,ABCB11,"ATP-binding cassette, sub-family B (MDR/TAP), member 11","((BSEP, 6), (BSEP, 7), (BSEP, 8))"
ABCB4,ABCB4,"ATP-binding cassette, sub-family B (MDR/TAP), member 4","((MDR3, 7), (MDR3, 8))"
ABCC2,ABCC2,"ATP-binding cassette, sub-family C (CFTR/MRP), member 2","((MRP2, 7), (MRP2, 8), (ATP-binding cassette, sub-family C (CFTR/MRP), member 2, 11))"
ABCC3,ABCC3,"ATP-binding cassette, sub-family C (CFTR/MRP), member 3","((MRP3, 7), (MRP3, 8))"
ABCC4,ABCC4,"ATP-binding cassette, sub-family C (CFTR/MRP), member 4","((MRP4, 7), (MRP4, 8))"
ABCC6,ABCC6,"ATP-binding cassette, sub-family C (CFTR/MRP), member 6","((MRP6, 8),)"
ADCY5,ADCY5,adenylate cyclase 5,"((adenylyl cyclase, 3),)"
APLNR,APLNR,apelin receptor,"((APJ, 2.2),)"
CALCR,CALCR,calcitonin receptor,"((CGRP1, 2.2),)"
CYP2E1,CYP2E1,"cytochrome P450, family 2, subfamily E, polypeptide 1","((cytochrome P450, family 2, subfamily E, polypeptide 1, 11),)"


In [28]:
def f(x):
    """Expand one row of the symbols table into one line per entry of its 'targets'.

    x must provide 'symbol' and 'targets', the latter being a sequence whose items carry
    the table identifier at position 1. The result has a 'symbol' column (the row's symbol
    repeated) and a 'table' column (the identifier of each entry).
    """
    entries = x['targets']
    table_ids = [entry[1] for entry in entries]
    return pd.DataFrame({'symbol': [x['symbol'] for _ in entries], 'table': table_ids})

# Tally how often each table is referenced by the 'lost' symbols...

subset = symbols.query("symbol in @lost")

tables = pd.concat([f(row) for _, row in subset.iterrows()])

tables['table'].value_counts()

11     16
8      15
7      11
4      10
9       4
2.2     4
6       3
5       2
3       2
1       1
dtype: int64

## Add counts to targets file

In [29]:
# Add the per-target ('_tgt') and per-symbol/species ('_ss') distinct compound counts to the
# targets table...

count_columns = ['n_active_tgt', 'n_total_tgt', 'n_active_ss', 'n_total_ss']

n_rows_before = targets.shape[0]

# Drop count columns left over from an earlier execution (this cell re-assigns 'targets', and
# the pickle it is loaded from is overwritten with the augmented table further down), so that
# re-running cannot produce clashing or duplicated column names.
stale_columns = [c for c in targets.columns if c in count_columns + ['n_active', 'n_total']]

targets = targets.drop(stale_columns, axis=1)

targets = (
    targets
    .merge(count_by_target[['target_chemblid', 'n_active', 'n_total']], left_on='chembl_id', right_on='target_chemblid', how='left')
    .drop('target_chemblid', axis=1)
    .merge(count_by_symbol_and_species[['symbol', 'species', 'n_active', 'n_total']], on=['symbol', 'species'], how='left', suffixes=('_tgt', '_ss'))
)

# A left merge must not multiply rows; if it does, a key is duplicated on the right-hand side.
assert targets.shape[0] == n_rows_before, "merging the counts changed the number of target rows"

# Targets without matching counts get zero. Only the count columns are filled, so that missing
# values in other columns are not overwritten with 0; counts are stored as integers.
targets[count_columns] = targets[count_columns].fillna(0).astype(int)

targets.shape

(377, 15)

In [30]:
# Shape of the augmented table restricted to rows with exclude == 0...

targets.query("exclude == 0").shape

(358, 15)

### Save/restore

File now includes per-target counts.

In [31]:
# backup_file('chembl_targets.pkl')

# NOTE(review): this overwrites the same file that the 'Reload targets' cell reads, and the
# backup call above is commented out, so a later run starts from the table that already
# carries the count columns — confirm that this is intended.
targets.to_pickle('chembl_targets.pkl')

In [32]:
# targets = pd.read_pickle('chembl_targets.pkl')

# targets.shape