In [1]:
import os

import pandas as pd 
import numpy as np
import networkx as nx
from lccsig import LCC_zscore

In [2]:
# Read the PPI network: one edge per row of the gene-gene table,
# endpoints taken from the 'source' and 'target' columns.
G = nx.from_pandas_edgelist(
    pd.read_csv('data/Gene_Gene.csv'),
    source='source',
    target='target',
)

In [3]:
#evidence count for disease_gene association
# Directory holding the Open Targets exports. Set the OPEN_TARGETS_DATA_DIR environment
# variable to run this notebook on another machine; the fallback is the original local
# path, so existing runs behave exactly as before.
path = os.environ.get(
    'OPEN_TARGETS_DATA_DIR',
    '/Users/f.nasirian/Dropbox (CCNR)/Biology/99_Toolbox/data/Open_Targets_Platform/data/out',
)
# Drop the 'targets' column, then remove rows that became exact duplicates.
dgs = pd.read_csv(path+'/disease_gene_v2.csv').drop('targets', axis=1).drop_duplicates()
# Keep only rows that carry a gene identifier.
dgs = dgs[dgs.GeneId.notnull()]
# Discard associations whose evidence source is 'literature'.
dgs = dgs[(~dgs.source.isin(['literature']))]
# Disabled: would sum evidenceCount per (disease, gene, source).
#dgs = dgs.groupby(['efoId', 'efoName', 'GeneId', 'source'])['evidenceCount'].apply(sum).reset_index()

In [4]:
# Evidence sources still present after the 'literature' rows were removed.
dgs.source.unique()

array(['genetic_association', 'known_drug', 'somatic_mutation',
       'rna_expression', 'affected_pathway', 'animal_model'], dtype=object)

In [5]:
# Disease -> therapeutic area lookup.
# Left join from the distinct disease IDs in `dgs`, so a disease with no entry in the
# therapeutic-area file is kept with taName = NaN.
ta = pd.merge(
    dgs[['efoId']].drop_duplicates(),
    pd.read_csv(path+'/therapeutic_areas_v2.csv').rename(columns={'efoName':'taName'}),
    on='efoId',
    how='left',
)

In [7]:
# Distinct therapeutic-area names (NaN marks diseases without a matched area).
ta.taName.unique()

array(['infectious disease or post-infectious disorder',
       'infectious disease',
       'musculoskeletal or connective tissue disease',
       'respiratory or thoracic disease',
       'reproductive system or breast disease', 'urinary system disease',
       'hematologic disease', 'genetic, familial or congenital disease',
       'cancer or benign tumor', 'immune system disease',
       'gastrointestinal disease', 'endocrine system disease',
       'integumentary system disease', 'nutritional or metabolic disease',
       'nervous system disease', 'cardiovascular disease',
       'pancreas disease', 'psychiatric disorder',
       'disorder of visual system', 'measurement', 'biological process',
       'pregnancy or perinatal disease',
       'injury, poisoning or other complication', 'phenotype',
       'disorder of ear', nan], dtype=object)

In [5]:
# IDs of all diseases whose therapeutic area is 'cardiovascular disease'.
# EFO_0000319 is removed explicitly -- presumably the generic "cardiovascular disease"
# term itself rather than a specific disease; TODO confirm.
cvd_area = set(
    ta.loc[ta.taName.eq('cardiovascular disease'), 'efoId']
).difference({'EFO_0000319'})

In [6]:
# Narrow `dgs` to cardiovascular diseases, genetic-association evidence only,
# with an association score strictly above 0.1.
# NOTE: this overwrites `dgs`; the unfiltered table is no longer available afterwards.
dgs = dgs.loc[
    dgs.efoId.isin(cvd_area)
    & dgs.source.eq('genetic_association')
    & dgs.score.gt(0.1)
]

In [7]:
# CVDs with MORE than 20 association rows (Count > 20, i.e. 21 or more).
# Count is the number of rows per (efoId, efoName) group.
df = (
    dgs.groupby(['efoId', 'efoName'])['GeneId']
    .size()
    .reset_index(name='Count')
)
sel_cvd = df.loc[df.Count > 20, 'efoId'].unique()
len(sel_cvd)

53

In [10]:
# Association-row counts for the selected diseases, smallest first.
dgs[dgs.efoId.isin(sel_cvd)].groupby(['efoId', 'efoName'])['GeneId'].size().sort_values()

efoId            efoName                                                     
EFO_0010977      macrovascular complications of diabetes                          21
Orphanet_217656  familial isolated arrhythmogenic right ventricular dysplasia     22
Orphanet_871     familial progressive cardiac conduction defect                   23
MONDO_0004596    cor pulmonale                                                    23
Orphanet_91387   familial thoracic aortic aneurysm and aortic dissection          24
Orphanet_1480    ventricular septal defect                                        24
EFO_0005306      ventricular tachycardia                                          25
EFO_0004246      mucocutaneous lymph node syndrome                                25
Orphanet_104     leber hereditary optic neuropathy                                26
EFO_0003827      pulmonary embolism                                               26
MONDO_0024573    familial hypertrophic cardiomyopathy                   

In [46]:
# Independent copy of the associations for the selected diseases, so that columns
# added later do not touch `dgs`.
selected_rows = dgs.efoId.isin(sel_cvd)
cvd_sel = dgs.loc[selected_rows].copy()

In [19]:
# One value per disease: collect the disease's genes into a set, pass it with the
# network to LCC_zscore, and keep the first element of what it returns.
# NOTE(review): the first element is used downstream as a p-value (compared to 0.05)
# -- confirm against lccsig. If LCC_zscore relies on random sampling, a seed should be
# fixed for reproducibility -- TODO confirm.
pvalues = (
    cvd_sel.groupby('efoId')['GeneId']
    .apply(set)
    .apply(lambda gene_set: LCC_zscore(G, gene_set)[0])
)

In [37]:
# Per-disease summary: number of association rows (column 'GeneId'), the value computed
# by LCC_zscore ('lcc_sig'), and whether it passes the 0.05 threshold ('sig').
df_lcc = dgs[dgs.efoId.isin(sel_cvd)].groupby(['efoId', 'efoName'])['GeneId'].size().reset_index()
# `pvalues` is indexed by efoId, so look each disease up by ID instead of relying on the
# two groupbys (by efoId vs. by efoId + efoName) producing rows in the same order.
df_lcc['lcc_sig'] = df_lcc.efoId.map(pvalues)
# Vectorised comparison; a missing value compares as False, as it did before.
df_lcc['sig'] = df_lcc.lcc_sig <= 0.05

In [38]:
# How many selected diseases pass (True) / fail (False) the lcc_sig <= 0.05 threshold.
df_lcc.sig.value_counts()

True     31
False    22
Name: sig, dtype: int64

In [40]:
# Lookup table: disease ID -> boolean significance flag from df_lcc.
dict_temp = {efo_id: flag for efo_id, flag in zip(df_lcc.efoId, df_lcc.sig)}

In [47]:
# Tag every association row with its disease's boolean flag.
# Note: this column is named 'lcc_sig' but holds the True/False `sig` flag,
# not the numeric df_lcc.lcc_sig value.
cvd_sel['lcc_sig'] = cvd_sel['efoId'].map(lambda efo_id: dict_temp[efo_id])

In [48]:
# Save the selected CVD associations, including the per-disease flag, without the index.
cvd_sel.to_csv('data/cvd_genes.csv', index=False)

### possible non-therapeutic areas for Amla

In [13]:
# Associations for diseases in the immune, respiratory/thoracic and infectious areas.
# `dgs` was narrowed to cardiovascular diseases in an earlier cell, so selecting from it
# here would silently drop every disease that is not also in the CVD area when the
# notebook is run top to bottom. Rebuild the association table from the file instead,
# with the same clean-up as the initial load (drop 'targets', de-duplicate, require a
# GeneId, exclude 'literature').
dgs_all = pd.read_csv(path+'/disease_gene_v2.csv').drop('targets', axis=1).drop_duplicates()
dgs_all = dgs_all[dgs_all.GeneId.notnull() & ~dgs_all.source.isin(['literature'])]
neg_df = dgs_all[dgs_all.efoId.isin(
    ta[ta.taName.isin(['immune system disease', 'respiratory or thoracic disease', 'infectious disease'])].efoId.unique())]

In [33]:
# Keep genetic-association evidence with a score strictly above 0.1.
neg_df = neg_df.loc[
    neg_df.source.eq('genetic_association') & neg_df.score.gt(0.1)
]

In [34]:
# Association-row count per disease, ascending. Note: this reuses the name `df`.
df = (
    neg_df.groupby(['efoId', 'efoName'])['GeneId']
    .size()
    .sort_values()
    .reset_index(name='Count')
)

In [35]:
# Diseases with more than 100 association rows.
df[df.Count > 100]

Unnamed: 0,efoId,efoName,Count
1114,EFO_0003779,hashimoto's thyroiditis,101
1115,MONDO_0021661,coronary atherosclerosis,102
1116,EFO_0003911,atrial flutter,111
1117,EFO_0001071,lung carcinoma,113
1118,EFO_1002011,adult onset asthma,118
1119,Orphanet_145,hereditary breast and ovarian cancer syndrome,129
1120,EFO_0000538,hypertrophic cardiomyopathy,139
1121,EFO_0003948,gastroesophageal reflux disease,142
1122,EFO_0001060,celiac disease,149
1123,EFO_0000407,dilated cardiomyopathy,149


Candidates are: celiac disease (EFO_0001060), chronic obstructive pulmonary disease (EFO_0000341), multiple sclerosis (MONDO_0005301), rheumatoid arthritis (EFO_0000685), type 1 diabetes mellitus (MONDO_0005147), systemic lupus erythematosus (EFO_0002690), Crohn's disease (EFO_0000384)

In [37]:
# Export the associations of the hand-picked candidate diseases.
candidate_ids = [
    'EFO_0001060',    # celiac disease
    'EFO_0000341',    # chronic obstructive pulmonary disease
    'MONDO_0005301',  # multiple sclerosis
    'EFO_0000685',    # rheumatoid arthritis
    'MONDO_0005147',  # type 1 diabetes mellitus
    'EFO_0002690',    # systemic lupus
    'EFO_0000384',    # crohn's disease
]
neg_df[neg_df.efoId.isin(candidate_ids)].to_csv('data/amla_non_therapeutic_areas.csv', index=False)