# LBD for NGS - How many nodes are needed to connect the gene with the phenotype

In [1]:
# Libraries
import pandas as pd
from py2neo import Graph

from datetime import datetime

# Accessing PubMed
from Bio import Entrez


In [3]:
# Connect to the Neo4J database
# graph = Graph("bolt://212.235.239.171:7687", 
#               user='neo4j', 
#               password='semMedDb2020', 
#               name='semmed43')

# graph = Graph("bolt://212.235.239.171:7687", 
#               user='neo4j', 
#               password='semMedDb2020', 
#               name='semmed4321')

graph = Graph("bolt://localhost:7687", 
              user='neo4j', 
              password='semmed40', 
              name='semmed40')


## Load variants table

In [43]:
# Load gene table

variants_df = pd.read_csv('Test_Data/VARIANTS.tsv', sep='\t')


---

## Example 1: PX4957

In [44]:
# Select candidate genes for patient

patient = 'PX4957'
candidate_genes = variants_df.loc[variants_df['PatientID'] == patient]['GeneName'].to_list()


In [61]:
# Loop queries

results = {}

for gene_name in candidate_genes:
    print(gene_name)

    query = f"\
    MATCH (x:Concept), (y:Concept) \
    WHERE (x.name =~ '(?i).*{gene_name}.*') AND \
    (y:sosy) AND y.cui in ['C0242422', 'C0015371', 'C0278061', 'C0013421', 'C0026650'] \
    MATCH p = allShortestPaths((x)-[:CAUSES|AFFECTS*..6]-(y)) \
    WHERE all(n in nodes(p)[1..-1] WHERE n:gngm OR n:aapp OR n:dsyn OR n:patf) \
    RETURN \
    count(p) as count, length(p) as length \
    LIMIT 100"
    
    out = graph.run(query).to_data_frame()
    if len(out) > 0:
        out_series = out.iloc[0]
    else:
        out_series = pd.Series({'count': 0, 'length': 0})
    out_series.name = gene_name
    
    results[gene_name] = out_series
    


ABCA7
ABCC3
ABCC8
ADAMTS20
AKAP11
ALPP
ANKRD36C
ANKRD36C
AQP9
ARAP1
ARHGEF1
ASB18
ATN1
ATN1
ATP2A3
ATXN2
AZI2
B3GAT2
BMP10
BOP1
C2CD4D
CBL
CCDC88C
CDC20
CFHR3
COL26A1
CPT1A
CREBBP
CSF1R
CUBN
CYP11B1
CYP26A1
CYP2R1
CYP4F11
DBF4
DCP1B
DDN
DDX53
DNAH17
DYSF
EAF1
EFS
EPB41L2
FANCA
FBN1
FBXO17
FEM1C
FGD5
FMR1NB
FRMPD1
GPS1
GRIK4
GYS2
HTT
HTT
IFRD2
INTS1
KCNN3
KCNN3
KCP
KIAA1841
KIF27
KIRREL3
LARP4B
LRTM2
MAML1
MAN1C1
MAP3K14
MB21D2
MED14
MPHOSPH8
MSLN
MUC19
NPC1
NR2C1
NR4A2
NTN5
NUDCD3
NUTM2A
NYNRIN
PAPSS2
PAX9
PCOLCE2
PDE10A
PHACTR3
PICK1
PKD1
PKD1
PKD1L3
PKD2L1
PMPCA
PNKP
POTED
PTCH2
PTPN7
RECQL4
RFPL4B
RHOXF2B
SACS
SCRIB
SEC31A
SEC31A
SECISBP2L
SEMA3B
SENP5
SGK1
SGO1
SH2D2A
SIGLEC1
SMARCC2
SPG11
SPINT1
STARD10
STAU2
SUN2
SYNE4
SYTL4
TAF1
TEX11
TNF
TPSAB1
TTN
TTN
UMAD1
VPS13C
VWDE
WSCD2
ZBBX
ZC3H12D
ZNF517
ZNF614
ZNF682


In [64]:
results_df = pd.DataFrame(results).T

In [67]:
results_df.loc[results_df['count'] > 0].sort_values(by=['length', 'count'], ascending=(True, False))

Unnamed: 0,count,length
DYSF,304,2
TNF,153,2
NR4A2,7,2
HTT,5,2
ATXN2,4,2
SGK1,4,2
RECQL4,3,2
VPS13C,3,2
ABCA7,2,2
AQP9,2,2


## Example 2: PX8847

In [155]:
# Select candidate genes for patient

patient = 'PX8847'
candidate_genes = variants_df.loc[variants_df['PatientID'] == patient]['GeneName'].to_list()


In [156]:
len(candidate_genes)

107

In [159]:
variants_df.loc[variants_df['PatientID'] == patient].iloc[51:100]

Unnamed: 0,GeneName,PatientID,VariantPosition,FunctionalImpact,CADDscore,Entrez.Gene.ID,Ensembl.Gene.ID,HTZinSLO,HMZinSLO,GnomADAlleleCount,GnomADHMZ,SIFT,PolyPhen2,MutationTaster,ClinVar,SIFT_modified,PolyPhen2_modified,MutationTaster_modified
256,LAMA3,PX8847,chr18:21492774C>T,MODERATE,28.0,3909,ENSG00000053747,1,0,0,0,Deleterious,,Disease causing,,-1.0,,-1.0
260,LDHA,PX8847,chr11:18427109C>T,MODERATE,27.700001,3939,ENSG00000134333,2,0,1,0,Deleterious,Possibly damaging,Disease causing,,-1.0,-1.0,-1.0
262,LIN7A,PX8847,chr12:81283065C>G,MODERATE,26.200001,8825,ENSG00000111052,1,0,0,0,Deleterious,Probably damaging,Disease causing,,-1.0,-1.0,-1.0
265,LPA,PX8847,chr6:161056313C>T,MODERATE,22.5,4018,ENSG00000198670,1,0,0,0,Deleterious,,Benign,,-1.0,,-1.0
267,LRRC75B,PX8847,chr22:24982098G>A,MODERATE,21.700001,388886,ENSG00000178026,3,0,5,0,Deleterious,Benign,Benign,,-1.0,-1.0,-1.0
268,LRSAM1,PX8847,chr9:130243534ATT>A,MODIFIER,-1.0,90678,ENSG00000148356,0,0,11,0,,,,,,,
276,MAP2K2,PX8847,chr19:4097323C>T,MODERATE,24.6,5605,ENSG00000126934,2,0,2,0,Tolerated,Probably damaging,Disease causing,VUS,-1.0,-1.0,-1.0
278,MAP3K21,PX8847,chr1:233463913T>A,MODERATE,24.9,84451,ENSG00000143674,1,0,0,0,Deleterious,Probably damaging,Disease causing,,-1.0,-1.0,-1.0
279,MATR3,PX8847,chr5:138653129G>T,MODIFIER,-1.0,9782,ENSG00000015479,2,0,1,0,,,,,,,
281,MBTPS1,PX8847,chr16:84094333C>A,MODERATE,22.200001,8720,ENSG00000140943,2,0,0,0,Tolerated,Benign,Disease causing,,-1.0,-1.0,-1.0


In [110]:
# List of symptoms CUI

symptoms_cui = [
    'C0343146', 
    'C0221369', 
    'C0231679', 
    'C0241521', 
    'C0410740', 
    'C2674737', 
    'C0685409', 
    'C0009917', 
    'C0239830', 
    'C0018564', 
    'C1836019', 
    'C0009081', 
    'C0016506', 
    'C1096086', 
    'C0231678', 
    'C1406835', 
    'C0221369', 
    'C2674738', 
]

symptoms_cui_query_string = "','".join(symptoms_cui)


In [144]:
# Loop queries

results = {}

for gene_name in candidate_genes:
    print(gene_name)

    query = f"\
    MATCH (x:Concept), (y:Concept) \
    WHERE (x.name =~ '(?i).*{gene_name}.*') AND \
    y.cui in ['C0343146','C0221369', 'C0231679', 'C0241521','C0410740', 'C2674737', \
    'C0685409', 'C0009917', 'C0239830', 'C0018564', 'C1836019', 'C0009081', 'C0016506', 'C1096086', 'C0231678', \
    'C1406835', 'C0221369', 'C2674738'] \
    MATCH p = allShortestPaths((x)-[:CAUSES|AFFECTS|STIMULATES|INHIBITS|AUGMENTS|ASSOCIATED_WITH*..6]-(y)) \
    WHERE all(n in nodes(p)[1..-1] WHERE n:gngm OR n:aapp OR n:dsyn OR n:patf) \
    RETURN \
    count(p) as count, length(p) as length \
    LIMIT 100"
    
    out = graph.run(query).to_data_frame()
    if len(out) > 0:
        out_series = out.iloc[0]
    else:
        out_series = pd.Series({'count': 0, 'length': 0})
    out_series.name = gene_name
    
    results[gene_name] = out_series
    


AAMDC
ACCS
ADAMTS12
ADAMTS18
ADGRV1
AKAP3
ANKRD11
ANKRD36B
ANKRD36C
ANKRD36C
APOBEC3F
ARHGAP31
ARHGEF37
ARID1A
ARRB2
ATXN3
ATXN3
C2
CAPN12
CATSPERG
CCDC36
CDHR4
CEACAM7
CELSR1
CHML
COA5
COL6A6
DAG1
DLC1
DNAH5
DNAJA3
DNAJB3
DYNC1H1
EFHC1
EMID1
ENTHD1
FAM81A
FBRS
FLVCR1
GABRP
GDPD3
GRIN2A
HCAR1
HRNR
IGFBP7
INSRR
ITGB2
KCNN3
KDR
KEAP1
KIR2DS4
LAMA3
LDHA
LIN7A
LPA
LRRC75B
LRSAM1
MAP2K2
MAP3K21
MATR3
MBTPS1
MICAL2
MIR1268A
MKS1
MSTO1
MTMR14
MTMR14
NDUFV3
OBSCN
OLFML3
ORC3
PACS1
PHACTR3
PIP5KL1
PSPH
RPS6KB2
SBSN
SCN10A
SDHA
SETX
SLC2A13
SLC38A4
SNORD68
SOWAHC
SPAG9
SPG7
SPTBN5
TAS1R3
TCHH
TDP1
TFF2
TMEM107
TMEM43
TMEM87B
TPK1
TRRAP
TRUB1
TTC39C
UBR4
VPS13B
ZFP57
ZGRF1
ZNF200
ZNF43
ZNF467
ZNF560
ZNF714


In [145]:
patient_2_df = pd.DataFrame(results).T

In [146]:
patient_2_df.loc[patient_2_df['count'] > 0].sort_values(by=['length', 'count'], ascending=(True, False))

Unnamed: 0,count,length
LPA,373,2
FLVCR1,41,2
FBRS,21,2
GRIN2A,18,2
MBTPS1,15,2
...,...,...
ADGRV1,11,3
CEACAM7,9,3
GDPD3,8,3
GABRP,2,3


## Example 3: PX9097

In [148]:
# Select candidate genes for patient

patient = 'PX9097'
candidate_genes = variants_df.loc[variants_df['PatientID'] == patient]['GeneName'].to_list()


In [149]:
candidate_genes

['A2M',
 'ABCA3',
 'ACADM',
 'ACBD7',
 'ACVRL1',
 'ADAMTSL2',
 'ADAMTSL2',
 'ADCY4',
 'ADCY4',
 'ADGRV1',
 'ADSL',
 'AHR',
 'ALDH18A1',
 'ALDH5A1',
 'ALDH5A1',
 'ALG9',
 'ALG9',
 'AMER1',
 'ANK3',
 'ANKH',
 'ANKRD36C',
 'ANKRD36C',
 'ANOS1',
 'ANTXR2',
 'ARFGEF2',
 'ARHGAP17',
 'ARHGEF1',
 'ARMC2',
 'ASPM',
 'ATM',
 'ATP5A1',
 'ATRX',
 'ATXN1',
 'ATXN7L2',
 'BRAF',
 'BTN2A1',
 'BTN2A1',
 'BUB1B',
 'C10orf111',
 'C14orf93',
 'C16orf96',
 'C16orf96',
 'CACNA1A',
 'CACNA1E',
 'CADPS',
 'CADPS',
 'CADPS2',
 'CBS',
 'CCDC88B',
 'CD93',
 'CDK5',
 'CDON',
 'CELA2B',
 'CEP164',
 'CFAP58',
 'CHN2',
 'CLN5',
 'CNKSR2',
 'COL18A1',
 'COL4A1',
 'COL4A1',
 'CPOX',
 'CPT1A',
 'CRIPAK',
 'CTBP2',
 'CUL4B',
 'CYBA',
 'DDX5',
 'DEPDC5',
 'DEPDC5',
 'DGKH',
 'DIAPH1',
 'DOCK8',
 'DPP10',
 'DPYSL3',
 'DSCC1',
 'DUSP8',
 'DZIP1L',
 'EAF2',
 'EBP',
 'EFHC1',
 'EFNB1',
 'EGFLAM',
 'EIF4B',
 'ELMO3',
 'ENTPD5',
 'ENTPD5',
 'ENTPD5',
 'ENTPD5',
 'EPHA6',
 'ERMARD',
 'FAM153B',
 'FAM3C',
 'FAM83A',
 'FARS2',
 

In [150]:
# List of symptoms CUI

symptoms_cui = [
    'C0424230', 
    'C0037769', 
    'C0037769', 
    'C1836833', 
    'C0023012', 
    'C0037822', 
    'C0543888', 
    'C0085584', 
    'C0002871', 
    'C0391870', 
    'C0850715', 
    'C0560046', 
    'C0575081', 
    'C0026650'
]

symptoms_cui_query_string = "','".join(symptoms_cui)


In [151]:
# Loop queries

results = {}

for gene_name in candidate_genes:
    print(gene_name)

    query = f"\
    MATCH (x:Concept), (y:Concept) \
    WHERE (x.name =~ '(?i).*{gene_name}.*') AND \
    y.cui in ['C0424230', 'C0037769', 'C0037769', 'C1836833', 'C0023012', 'C0037822', 'C0543888', \
    'C0085584', 'C0002871', 'C0391870', 'C0850715', 'C0560046', 'C0575081', 'C0026650'] \
    MATCH p = allShortestPaths((x)-[:CAUSES|AFFECTS|STIMULATES|INHIBITS|AUGMENTS|ASSOCIATED_WITH*..6]-(y)) \
    WHERE all(n in nodes(p)[1..-1] WHERE n:gngm OR n:aapp OR n:dsyn OR n:patf) \
    RETURN \
    count(p) as count, length(p) as length \
    LIMIT 100"
    
    out = graph.run(query).to_data_frame()
    if len(out) > 0:
        out_series = out.iloc[0]
    else:
        out_series = pd.Series({'count': 0, 'length': 0})
    out_series.name = gene_name
    
    results[gene_name] = out_series
    


A2M
ABCA3
ACADM
ACBD7
ACVRL1
ADAMTSL2
ADAMTSL2
ADCY4
ADCY4
ADGRV1
ADSL
AHR
ALDH18A1
ALDH5A1
ALDH5A1
ALG9
ALG9
AMER1
ANK3
ANKH
ANKRD36C
ANKRD36C
ANOS1
ANTXR2
ARFGEF2
ARHGAP17
ARHGEF1
ARMC2
ASPM
ATM
ATP5A1
ATRX
ATXN1
ATXN7L2
BRAF
BTN2A1
BTN2A1
BUB1B
C10orf111
C14orf93
C16orf96
C16orf96
CACNA1A
CACNA1E
CADPS
CADPS
CADPS2
CBS
CCDC88B
CD93
CDK5
CDON
CELA2B
CEP164
CFAP58
CHN2
CLN5
CNKSR2
COL18A1
COL4A1
COL4A1
CPOX
CPT1A
CRIPAK
CTBP2
CUL4B
CYBA
DDX5
DEPDC5
DEPDC5
DGKH
DIAPH1
DOCK8
DPP10
DPYSL3
DSCC1
DUSP8
DZIP1L
EAF2
EBP
EFHC1
EFNB1
EGFLAM
EIF4B
ELMO3
ENTPD5
ENTPD5
ENTPD5
ENTPD5
EPHA6
ERMARD
FAM153B
FAM3C
FAM83A
FARS2
FBN3
FDXACB1
FDXACB1
FGFR2
FGFR2
FGFR3
FMN2
FMR1
FOXA2
FOXG1-AS1
FOXK1
FOXO1
FUCA1
GFM1
GINS1
GJC2
GLIS2
GLRA1
GMPPB
GNAQ
GOLGA6L6
GPHN
GPHN
GRIN2A
GTF3C6
HARS2
HCFC1
HDAC1
HEPH
HNRNPU
HNRNPU
HTATIP2
IL20RA
IMPDH2
KANSL3
KCNIP3
KCNN3
KCNN3
KCNQ2
KDM6A
KDM6A
KIAA1671
KIF1A
KIF3C
KIR2DS4
KRAS
KRT38
LAMB1
LAMB1
LIAS
LMBRD1
LMO7
LRFN2
LTBP1
MAN1B1
MAN2A2
MANBA
MCM7
MID2
MIP
MIR1268A

In [152]:
patient_3_df = pd.DataFrame(results).T

In [153]:
patient_3_df.loc[patient_3_df['count'] > 0].sort_values(by=['length', 'count'], ascending=(True, False)).iloc[0:50]

Unnamed: 0,count,length
VIT,46215,2
MUT,8837,2
MIP,2866,2
NF1,929,2
FGFR3,550,2
KRAS,514,2
FOXO1,495,2
BRAF,367,2
GNAQ,350,2
SLC6A1,199,2


In [154]:
patient_3_df.loc['TMEM63B']

count     0
length    0
Name: TMEM63B, dtype: int64

---

In [139]:
url_table = 'http://sembt.mf.uni-lj.si/user_guide/SemBT_semantic_types.html'

tables = pd.read_html(url_table, skiprows=1)
table_df = tables[0]
table_df.columns = ['Abbreviation', 'Semantic_type', 'Relation_count', 'Instance_count']


In [141]:
table_df.sort_values(by='Relation_count', ascending=False).iloc[0:30]

Unnamed: 0,Abbreviation,Semantic_type,Relation_count,Instance_count
1,aapp,"Amino Acid, Peptide, or Protein",4345793,11503829
4,gngm,Gene or Genome,3422406,6946503
0,dsyn,Disease or Syndrome,2603234,12591865
10,orch,Organic Chemical,1550322,3485563
3,bpoc,"Body Part, Organ, or Organ Component",1392967,8711584
9,phsu,Pharmacologic Substance,1100842,4226225
5,topp,Therapeutic or Preventive Procedure,991912,5108457
11,bacs,Biologically Active Substance,951043,2922549
7,cell,Cell,807092,4346530
6,neop,Neoplastic Process,802419,4650747


In [142]:
url_table = 'http://sembt.mf.uni-lj.si/user_guide/SemBT_relation_types_and_instances_counts.html'

tables = pd.read_html(url_table, skiprows=1)
table_df = tables[0]
table_df.columns = ['Relation_type', 'Relation_count', 'Instance_count']


In [143]:
table_df.sort_values(by='Relation_count', ascending=False)

Unnamed: 0,Relation_type,Relation_count,Instance_count
1,LOCATION_OF,2033678,9560753
5,COEXISTS_WITH,1149505,2496628
2,PART_OF,1132906,8736983
6,AFFECTS,1008068,2124063
3,TREATS,993417,5435929
7,INTERACTS_WITH,956926,1824826
0,PROCESS_OF,739217,12908669
9,ASSOCIATED_WITH,544318,1316494
10,CAUSES,525082,1164132
12,STIMULATES,442904,845725
