In [32]:
"""Notebook for inspecting the number of significant targets, novel targets, known targets, assessed targets etc. for flow of targets figure"""

'Notebook for inspecting the number of significant targets, novel targets, known targets, assessed targets etc. for flow of targets figure'

In [33]:
import os
import pickle

import pandas as pd

import analysis_functions

In [4]:
# Root directory of the project checkout that all data/result paths are built from.
# Defaults to the original location; set the AE_CODE_BASEDIR environment variable
# to run the notebook against a checkout somewhere else.
basedir = os.environ.get('AE_CODE_BASEDIR') or '/scratch/ias41/ae_code'

In [5]:
# Load the pickled mapping that the notebook later indexes by run name.
# NOTE(review): pickle.load executes arbitrary code from the file; only use a trusted dirs_info.pkl.
with open(basedir + '/analysis/data/dirs_info.pkl', mode='rb') as dirs_file:
    dirs = pickle.load(dirs_file)

In [6]:
# Target information, restricted to rows whose accession organism is Homo sapiens
target_info = (
    pd.read_csv(basedir + '/ae_target_links/data/target_names.txt', sep='\t')
    .loc[lambda names: names['accession_organism'] == 'Homo sapiens', :]
)

In [7]:
# Previously reported associations
# Known associations, merged with the MedDRA hierarchy export to attach an HLT to each annotated PT
known_associations = pd.read_excel(basedir + '/prev_reported_safety_associations/data/safety_meddra_annotated_effects.xls')
known_associations['Annotated MedDRA PT'] = known_associations['Annotated MedDRA PT'].str.upper()
known_meddra_hier = pd.read_excel(basedir + '/prev_reported_safety_associations/data/safety_meddra_annotated_effects_for_hierarchy_output.xlsx', skiprows=4)
# Upper-case the term columns on both sides so the merge on term text below ignores case
known_meddra_hier['PT'] = known_meddra_hier['PT'].str.upper()
known_meddra_hier[' Term'] = known_meddra_hier[' Term'].str.upper()
known_meddra_hier['HLT'] = known_meddra_hier['HLT'].str.upper()
# Keep only rows flagged as primary SOC so each term maps through its primary path
known_meddra_hier_selection = known_meddra_hier.loc[known_meddra_hier['Primary SOC']=='Y',['PT','HLT',' Term']].drop_duplicates()
known_merged = known_associations.merge(known_meddra_hier_selection, left_on='Annotated MedDRA PT', right_on=' Term')

# Manually annotated HLT-level effects.
# `index=False` is not a read_excel option (it raises TypeError on current pandas), so it is not passed.
# rename/drop return new frames; the result is assigned so the HLT code column is actually removed.
hlt_manual = (
    pd.read_excel(basedir + '/prev_reported_safety_associations/data/safety_meddra_manually_annotated_hlt_effects.xls')
    .rename(columns={'Annotated MedDRA HLT': 'HLT'})
    .drop(columns=['Annotated MedDRA HLT Code'])
)
hlt_manual['HLT'] = hlt_manual['HLT'].str.upper()

# Rows from hlt_manual have no PT column, so PT is null for them after the concat
known_merged = pd.concat([known_merged, hlt_manual], sort=False).reset_index(drop=True)

# MedDRA hierarchy
meddra_hier = pd.read_excel(basedir + '/analysis/data/all_faers_and_sider_aes_hier_output.xlsx', skiprows=4)
meddra_hier_selection = meddra_hier.loc[meddra_hier['Primary SOC']=='Y',[' Term','HLT','SOC','PT']].drop_duplicates()
meddra_hier_selection['HLT'] = meddra_hier_selection['HLT'].str.upper()

# Known associations as (accession, term) tuples for set-membership checks
known_hlt_tuples = set(known_merged[['Accession', 'HLT']].itertuples(index=False, name=None))
# PT-level tuples only exist for rows that carry a PT
known_pt_tuples = set(
    known_merged.loc[known_merged['PT'].notnull(), ['Accession', 'PT']].itertuples(index=False, name=None)
)

In [8]:
# Number of extracted known targets (from publications):
# distinct accessions across the merged previously reported associations
len(set(known_merged['Accession']))

91

In [9]:
# Run entries and significant-target overview for the unbound-margin prediction results
faers_data_unbound, sider_data_unbound = (
    dirs['20200110_faers_unbound_margin_pred_005_PRR2'],
    dirs['20200110_sider_unbound_margin_pred'],
)
sign_overview_unbound = pd.read_csv(
    basedir + '/analysis/results/unbound_margin_pred_faers_vs_sider/20200121_sign_target_overview.txt',
    sep='\t',
)

In [10]:
# Run entries and significant-target overview for the cutoff-based prediction results
faers_data_cutoff, sider_data_cutoff = (
    dirs['20200110_faers_cutoff6_pred_005_PRR2'],
    dirs['20200110_sider_cutoff6_pred'],
)
sign_overview_cutoff = pd.read_csv(
    basedir + '/analysis/results/cutoff_pred_faers_vs_sider/20200128_sign_target_overview.txt',
    sep='\t',
)

In [11]:
def print_target_overview(sign_targets_overview):
    """Print summary counts for a table of significant targets.

    Parameters
    ----------
    sign_targets_overview : pandas.DataFrame
        One row per significant target. Must contain the columns
        'dataset' and 'Previously reported safety target'.

    Prints the total number of rows, the number of rows whose 'dataset'
    is 'Both', and the number of rows flagged 1 (safety target) and
    0 (novel target) in 'Previously reported safety target'.
    Returns None.
    """
    overview = sign_targets_overview
    print(f'Number of significant targets: {len(overview)}')

    # Counting True values in a boolean mask equals the length of the filtered frame
    n_both = (overview['dataset'] == 'Both').sum()
    print(f'Number of targets overlapping FAERS & SIDER: {n_both}')

    reported_flag = overview['Previously reported safety target']
    print(f'Number of safety targets: {(reported_flag == 1).sum()}')
    print(f'Number of novel targets: {(reported_flag == 0).sum()}')

In [12]:
# Summary counts for the overview table read from the unbound_margin_pred results
print_target_overview(sign_overview_unbound)

Number of significant targets: 45
Number of targets overlapping FAERS & SIDER: 16
Number of safety targets: 30
Number of novel targets: 15


In [13]:
# Summary counts for the overview table read from the cutoff_pred results
print_target_overview(sign_overview_cutoff)

Number of significant targets: 96
Number of targets overlapping FAERS & SIDER: 48
Number of safety targets: 34
Number of novel targets: 62


In [14]:
def print_targets_assessed(faers_data, sider_data):
    """Print how many targets were assessed and how many of those are known.

    Parameters
    ----------
    faers_data, sider_data : mapping
        Run entries whose 'dir' value names a directory under
        `basedir + '/ae_target_links/output/'`.

    Loads all associations for both runs via
    `analysis_functions.find_all_associations`, then prints the size of the
    union of their 'accession' values and the size of that union's overlap
    with `known_merged['Accession']`. Relies on the notebook-level names
    `basedir` and `known_merged`. Returns None.
    """
    output_root = basedir + '/ae_target_links/output/'
    faers_associations = analysis_functions.find_all_associations(output_root + faers_data['dir'])
    sider_associations = analysis_functions.find_all_associations(output_root + sider_data['dir'])

    # Every accession tested in at least one of the two datasets
    assessed_accessions = set(faers_associations['accession']).union(sider_associations['accession'])
    print(f'Number of targets assessed/tested: {len(assessed_accessions)}')

    # Subset of assessed accessions that also appear among the known associations
    known_assessed = assessed_accessions.intersection(known_merged['Accession'])
    print(f'Number of targets assessed/tested that are known: {len(known_assessed)}')

In [15]:
# Assessed / known target counts for the unbound-margin FAERS and SIDER runs
print_targets_assessed(faers_data_unbound, sider_data_unbound)

Number of targets assessed/tested: 104
Number of targets assessed/tested that are known: 40


In [16]:
# Assessed / known target counts for the cutoff FAERS and SIDER runs
print_targets_assessed(faers_data_cutoff, sider_data_cutoff)

Number of targets assessed/tested: 234
Number of targets assessed/tested that are known: 55


In [17]:
# Allow up to 800 rows to be rendered when a frame or series is displayed
pd.options.display.max_rows = 800

In [18]:
# Show the row-display limit currently in effect
pd.options.display.max_rows

800

In [19]:
# All associations for the unbound-margin FAERS and SIDER runs, loaded in that order
faers_all, sider_all = (
    analysis_functions.find_all_associations(basedir + '/ae_target_links/output/' + run_entry['dir'])
    for run_entry in (faers_data_unbound, sider_data_unbound)
)

In [20]:
# Known-target accessions that appear in neither the FAERS nor the SIDER associations
targets_not_in_study = set(known_merged['Accession']).difference(
    faers_all['accession'], sider_all['accession']
)

In [21]:
# Total number of distinct known-target accessions in known_merged
len(set(known_merged['Accession']))

91

In [22]:
# Number of known-target accessions absent from both faers_all and sider_all
len(targets_not_in_study)

51

In [25]:
# Target metadata for the known targets that are not part of the study
target_info[target_info['accession'].isin(targets_not_in_study)]

Unnamed: 0,tid,pref_name,target_type,accession,accession_organism,target_organism
0,10819,Acetylcholine receptor protein alpha chain,SINGLE PROTEIN,P02708,Homo sapiens,Homo sapiens
16,10670,Neuronal acetylcholine receptor protein alpha-...,SINGLE PROTEIN,P43681,Homo sapiens,Homo sapiens
20,161,Glutamate (NMDA) receptor subunit zeta 1,SINGLE PROTEIN,Q05586,Homo sapiens,Homo sapiens
31,247,Endothelin receptor ET-B,SINGLE PROTEIN,P24530,Homo sapiens,Homo sapiens
49,29,Sodium/potassium-transporting ATPase alpha-1 c...,SINGLE PROTEIN,P05023,Homo sapiens,Homo sapiens
51,259,Cannabinoid CB2 receptor,SINGLE PROTEIN,P34972,Homo sapiens,Homo sapiens
62,185,TNF-alpha,SINGLE PROTEIN,P01375,Homo sapiens,Homo sapiens
66,251,Platelet activating factor receptor,SINGLE PROTEIN,P25105,Homo sapiens,Homo sapiens
92,114,Adenosine A1 receptor,SINGLE PROTEIN,P30542,Homo sapiens,Homo sapiens
103,87,Cannabinoid CB1 receptor,SINGLE PROTEIN,P21554,Homo sapiens,Homo sapiens


In [50]:
# NOTE: despite the `nr_` prefix, both names hold sets of accessions, not counts.
# Known targets that were assessed in FAERS and/or SIDER
nr_targets_assessed_known = set(known_merged['Accession']).intersection(
    set(faers_all['accession']).union(sider_all['accession'])
)
# Known targets that appear in the unbound-margin significant-target overview
nr_targets_assessed_sign = set(known_merged['Accession']).intersection(
    sign_overview_unbound['accession']
)

In [51]:
# Known targets that were assessed but are not in the significant-target overview
nr_targets_assessed_known.difference(nr_targets_assessed_sign)

{'P07550',
 'P08908',
 'P0DMS8',
 'P10275',
 'P14867',
 'P22303',
 'P25021',
 'P28222',
 'Q13936',
 'Q14524'}

In [57]:
# Comma-separated ChEMBL names of the known targets that were assessed but not significant
', '.join(
    known_merged.loc[
        known_merged['Accession'].isin(nr_targets_assessed_known - nr_targets_assessed_sign),
        'ChEMBL target name',
    ].drop_duplicates()
)

'Androgen Receptor, GABA receptor alpha-1 subunit, Acetylcholinesterase, Beta-2 adrenergic receptor, Histamine H2 receptor, Serotonin 1a (5-HT1a) receptor, Sodium channel protein type V alpha subunit, Voltage-gated L-type calcium channel alpha-1C subunit, Serotonin 1b (5-HT1b) receptor, Adenosine A3 receptor'

In [28]:
# Significant targets in the unbound-margin overview not flagged as previously reported safety targets
sign_overview_unbound[sign_overview_unbound['Previously reported safety target'].eq(0)]

Unnamed: 0,accession,Target name,Target Class,dataset,Previously reported safety target
0,P21918,Dopamine D5 receptor,Family A G protein-coupled receptor,FAERS,0
1,P21917,Dopamine D4 receptor,Family A G protein-coupled receptor,FAERS,0
2,P10636,Microtubule-associated protein tau,Unclassified protein,FAERS,0
3,P35462,Dopamine D3 receptor,Family A G protein-coupled receptor,Both,0
4,P25100,Alpha-1d adrenergic receptor,Family A G protein-coupled receptor,Both,0
5,P00918,Carbonic anhydrase II,Lyase,Both,0
6,P22748,Carbonic anhydrase IV,Lyase,Both,0
7,P50406,Serotonin 6 (5-HT6) receptor,Family A G protein-coupled receptor,SIDER,0
8,Q9ULX7,Carbonic anhydrase XIV,Lyase,SIDER,0
9,Q16790,Carbonic anhydrase IX,Lyase,SIDER,0


### Example: terms in the 'Investigations' SOC that are in the dataset but are not adverse events themselves

In [35]:
# MedDRA hierarchy export: keep primary-SOC rows only, de-duplicated, with upper-cased HLT
meddra_hier = pd.read_excel(
    basedir + '/analysis/data/all_faers_and_sider_aes_hier_output.xlsx',
    skiprows=4,
)
meddra_hier_selection = (
    meddra_hier
    .loc[meddra_hier['Primary SOC'] == 'Y', [' Term', 'HLT', 'SOC', 'PT']]
    .drop_duplicates()
    .assign(HLT=lambda hier: hier['HLT'].apply(lambda term: term.upper()))
)

In [36]:
# Attach HLT / SOC / PT to each FAERS association by matching the adverse event text to ' Term'
faers_all_soc = pd.merge(
    faers_all,
    meddra_hier_selection,
    left_on='Adverse Event',
    right_on=' Term',
)

In [40]:
# Row counts per adverse event among associations whose SOC name contains 'Investi'
# (presumably the MedDRA "Investigations" SOC; confirm against the hierarchy export)
faers_all_soc['Adverse Event'][faers_all_soc['SOC'].str.contains('Investi')].value_counts()

EOSINOPHIL COUNT INCREASED                           103
BLOOD LACTATE DEHYDROGENASE INCREASED                103
BLOOD AMYLASE INCREASED                              102
BLOOD TRIGLYCERIDES INCREASED                        102
CLOSTRIDIUM TEST POSITIVE                            102
CARDIAC MURMUR                                       102
ELECTROENCEPHALOGRAM ABNORMAL                        101
NEUTROPHIL COUNT INCREASED                           101
COMPUTERISED TOMOGRAM ABNORMAL                       101
BLOOD PRESSURE DIASTOLIC DECREASED                   100
PLATELET COUNT INCREASED                             100
BLOOD SODIUM DECREASED                               100
GENERAL PHYSICAL CONDITION ABNORMAL                  100
ELECTROCARDIOGRAM T WAVE ABNORMAL                    100
FIBRIN D DIMER INCREASED                             100
HAEMATOCRIT DECREASED                                100
BLOOD ALKALINE PHOSPHATASE INCREASED                 100
ELECTROCARDIOGRAM QRS COMPLEX P