In [1]:
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
from Bio import SeqIO
from Bio import Entrez
import json
import sqlite3
from bs4 import BeautifulSoup
import time

# Diseases - MeSH

In [2]:
# replace and lower
def relo(name):
    tmp = name.strip().lower().replace('-', ' ').replace('/', ' ').replace(',', '').split(' ')
    tmp.sort()
    return tuple(tmp)

## d2024.bin

In [3]:
mesh_terms = {}
mtn_ui = {} # MeSH Tree Number - Unique ID
with open('entities/d2024.bin', 'r', encoding='UTF-8') as f:
    for row in f:
        row = row.strip()
        if row == '*NEWRECORD':
            tmp_mesh_term = {
                'heading': '',
                'mesh scope note': '',
                'mesh tree numbers': set(),
                'entries': set()
            }
        elif row[:4] == 'MH =':
            tmp_mesh_term['heading'] = row[5:]
        elif row[:11] == 'PRINT ENTRY':
            tmp_mesh_term['entries'].add(relo(row[14:].split('|')[0]))
        elif row[:5] == 'ENTRY':
            tmp_mesh_term['entries'].add(relo(row[8:].split('|')[0]))
        elif row[:2] == 'MN':
            tmp_mesh_term['mesh tree numbers'].add(row[5:])
        elif row[:2] == 'MS':
            tmp_mesh_term['mesh scope note'] = row[5:]
        elif row[:2] == 'UI':
            unique_id = row[5:]
            flag = False
            for mtn in tmp_mesh_term['mesh tree numbers']:
                if mtn[:1] == 'C':
                    flag = True
                    mtn_ui[mtn] = unique_id
            if flag:
                mesh_terms[unique_id] = tmp_mesh_term

In [4]:
print(len(mesh_terms))
print(len(mtn_ui))

5032
13109


In [5]:
mesh_terms

{'D000006': {'heading': 'Abdomen, Acute',
  'mesh scope note': 'A clinical syndrome with acute abdominal pain that is severe, localized, and rapid in onset. Acute abdomen may be caused by a variety of disorders, injuries, or diseases.',
  'mesh tree numbers': {'C23.888.592.612.054.200', 'C23.888.821.030.249'},
  'entries': {('abdomen', 'acute'), ('abdomens', 'acute')}},
 'D000007': {'heading': 'Abdominal Injuries',
  'mesh scope note': 'General or unspecified injuries involving organs in the abdominal cavity.',
  'mesh tree numbers': {'C26.017'},
  'entries': {('abdominal', 'injuries'), ('abdominal', 'injury')}},
 'D000008': {'heading': 'Abdominal Neoplasms',
  'mesh scope note': 'New abnormal growth of tissue in the ABDOMEN.',
  'mesh tree numbers': {'C04.588.033'},
  'entries': {('abdominal', 'neoplasm'), ('abdominal', 'neoplasms')}},
 'D000012': {'heading': 'Abetalipoproteinemia',
  'mesh scope note': 'An autosomal recessive disorder of lipid metabolism. It is caused by mutation of 

In [7]:
{'D007938': mesh_terms['D007938']}

{'D007938': {'heading': 'Leukemia',
  'mesh scope note': 'A progressive, malignant disease of the blood-forming organs, characterized by distorted proliferation and development of leukocytes and their precursors in the blood and bone marrow. Leukemias were originally termed acute or chronic based on life expectancy but now are classified according to cellular maturity. Acute leukemias consist of predominately immature cells; chronic leukemias are composed of more mature cells. (From The Merck Manual, 2006)',
  'mesh tree numbers': {'C04.557.337', 'C15.378.508'},
  'entries': {('leucocythaemia',),
   ('leucocythaemias',),
   ('leucocythemia',),
   ('leucocythemias',),
   ('leukemias',)}}}

In [7]:
pd.DataFrame({
    'ID': mesh_terms.keys(),
    'Disease Name': [mesh_terms[key]['heading'] for key in mesh_terms.keys()],
    'MeSH Scope Note': [mesh_terms[key]['mesh scope note'] for key in mesh_terms.keys()]
}).to_csv('our_data/nodes/diseases.tsv', sep='\t', index=False)

In [8]:
disease_node_list = []
disease_edge_list = []
for unique_id in list(mesh_terms):
    disease_node_list.append((unique_id, mesh_terms[unique_id]))
    for mtn in mesh_terms[unique_id]['mesh tree numbers']:
        if (mtn[:1] == 'C') and ('.' in mtn):
            father_number = '.'.join(mtn.split('.')[:-1])
            disease_edge_list.append((unique_id, mtn_ui[father_number], mtn, father_number))
print(len(disease_node_list))
print(len(disease_edge_list))

5032
13086


In [9]:
disease_edge_list

[('D000006', 'D015746', 'C23.888.821.030.249', 'C23.888.821.030'),
 ('D000006', 'D015746', 'C23.888.592.612.054.200', 'C23.888.592.612.054'),
 ('D000007', 'D014947', 'C26.017', 'C26'),
 ('D000008', 'D009371', 'C04.588.033', 'C04.588'),
 ('D000012',
  'D006995',
  'C18.452.584.500.875.440.500',
  'C18.452.584.500.875.440'),
 ('D000012',
  'D006995',
  'C16.320.565.398.500.440.500',
  'C16.320.565.398.500.440'),
 ('D000012',
  'D006995',
  'C18.452.584.563.500.440.500',
  'C18.452.584.563.500.440'),
 ('D000012',
  'D006995',
  'C18.452.648.398.500.440.500',
  'C18.452.648.398.500.440'),
 ('D000013', 'D009358', 'C16.131', 'C16'),
 ('D000014', 'D000013', 'C16.131.042', 'C16.131'),
 ('D000015', 'D000013', 'C16.131.077', 'C16.131'),
 ('D000016', 'D011832', 'C26.733.031', 'C26.733'),
 ('D000016', 'D000013', 'C16.131.080', 'C16.131'),
 ('D000022', 'D011248', 'C12.050.703.039', 'C12.050.703'),
 ('D000026', 'D000022', 'C12.050.703.039.089', 'C12.050.703.039'),
 ('D000027', 'D000022', 'C12.050.70

In [11]:
pd.DataFrame({
    'ID1': [edge[0] for edge in disease_edge_list],
    'ID2': [edge[1] for edge in disease_edge_list],
    'MeSH Tree Number 1': [edge[2] for edge in disease_edge_list],
    'MeSH Tree Number 2': [edge[3] for edge in disease_edge_list]
}).to_csv('our_data/edges/disease_disease.tsv', sep='\t', index=False)

## output disease_tree.json

In [34]:
def get_children(mtn):
    children = []
    for tmp_mtn in mtn_ui:
        if (len(tmp_mtn) == (len(mtn) + 4)) & (tmp_mtn[:len(mtn)] == mtn):
            children.append(tmp_mtn)
    return children

In [37]:
# how to traverse a tree
def traverse_tree(mtn):
    children = get_children(mtn)
    if len(children) == 0:
        return {'id': mtn, 'label': mesh_terms[mtn_ui[mtn]]['heading']}
    else:
        return {'id': mtn, 'label': mesh_terms[mtn_ui[mtn]]['heading'], 'children': [traverse_tree(child) for child in children]}

In [38]:
tree = [traverse_tree('C01'), traverse_tree('C04'), traverse_tree('C05'), traverse_tree('C06'), \
    traverse_tree('C07'), traverse_tree('C08'), traverse_tree('C09'), traverse_tree('C10'), \
        traverse_tree('C11'), traverse_tree('C12'), traverse_tree('C14'), \
            traverse_tree('C15'), traverse_tree('C16'), traverse_tree('C17'), traverse_tree('C18'), \
                traverse_tree('C19'), traverse_tree('C20'), traverse_tree('C21'), traverse_tree('C22'), \
                    traverse_tree('C23'), traverse_tree('C24'), traverse_tree('C25'), traverse_tree('C26')]

In [41]:
tree[0]['children']

[{'id': 'C01.936', 'label': 'Waterborne Diseases'},
 {'id': 'C01.918', 'label': 'Vaccine-Preventable Diseases'},
 {'id': 'C01.920',
  'label': 'Vector Borne Diseases',
  'children': [{'id': 'C01.920.852',
    'label': 'Mosquito-Borne Diseases',
    'children': [{'id': 'C01.920.852.937', 'label': 'Zika Virus Infection'},
     {'id': 'C01.920.852.157',
      'label': 'Dengue',
      'children': [{'id': 'C01.920.852.157.200', 'label': 'Severe Dengue'}]},
     {'id': 'C01.920.852.188', 'label': 'Dirofilariasis'},
     {'id': 'C01.920.852.250', 'label': 'Elephantiasis, Filarial'},
     {'id': 'C01.920.852.500',
      'label': 'Encephalitis, Arbovirus',
      'children': [{'id': 'C01.920.852.500.340',
        'label': 'Encephalitis, California'},
       {'id': 'C01.920.852.500.345', 'label': 'Encephalitis, Japanese'},
       {'id': 'C01.920.852.500.350', 'label': 'Encephalitis, St. Louis'},
       {'id': 'C01.920.852.500.655',
        'label': 'Encephalomyelitis, Equine',
        'children':

In [42]:
# save the tree as json
with open('our_data/edges/disease_tree.json', 'w') as f:
    json.dump(tree, f)

In [44]:
with open('our_data/edges/disease_mtn_ui.json', 'w') as f:
    json.dump(mtn_ui, f)

# MiRNAs - miRBase

## aliases.txt

In [14]:
id_aliases = {}
with open('entities/aliases.txt', 'r', encoding='UTF-8') as f:
    for row in f:
        row = row.strip()
        tmp_id_aliases = row.split('\t')
        if tmp_id_aliases[1][:3] == 'hsa':
            id = tmp_id_aliases[0]
            aliases = tmp_id_aliases[1].split(';')[:-1]
            id_aliases[id] = set(aliases)

## hairpin.fa

In [16]:
hairpins = SeqIO.parse('entities/hairpin.fa', 'fasta')

# id, name, description, seq
pre_mirnas = {}
for hairpin in hairpins:
    if hairpin.name[:3] == 'hsa':
        tmp_pre_mirna = {}
        id = hairpin.description.split(' ')[1]
        tmp_pre_mirna['name'] = hairpin.name
        tmp_pre_mirna['seq'] = hairpin.seq
        tmp_pre_mirna['aliases'] = id_aliases[id]
        pre_mirnas[id] = tmp_pre_mirna
print(len(pre_mirnas))

1917


In [None]:
# pd.DataFrame({
#     'ID': pre_mirnas.keys(),
#     'Name': [pre_mirnas[key]['name'] for key in pre_mirnas.keys()],
#     'Sequence': [pre_mirnas[key]['seq'] for key in pre_mirnas.keys()]
# }).to_csv('our_data/nodes/pre_mirnas.tsv', sep='\t', index=False)

## mature.fa

In [18]:
matures = SeqIO.parse('entities/mature.fa', 'fasta')

# id, name, description, seq
mature_mirnas = {}
for mature in matures:
    if mature.name[:3] == 'hsa':
        tmp_mature_mirna = {}
        id = mature.description.split(' ')[1]
        tmp_mature_mirna['name'] = mature.name
        tmp_mature_mirna['seq'] = mature.seq
        tmp_mature_mirna['aliases'] = id_aliases[id]
        mature_mirnas[id] = tmp_mature_mirna
print(len(mature_mirnas))

2656


In [None]:
# pd.DataFrame({
#     'ID': mature_mirnas.keys(),
#     'Name': [mature_mirnas[key]['name'] for key in mature_mirnas.keys()],
#     'Sequence': [mature_mirnas[key]['seq'] for key in mature_mirnas.keys()]
# }).to_csv('our_data/nodes/mature_mirnas.tsv', sep='\t', index=False)

## miRNA.xls

In [21]:
mirnas = pd.read_excel('entities/miRNA.xls')
mirnas = mirnas[['Accession', 'ID', 'Sequence', 'Mature1_Acc', 'Mature1_ID', 'Mature1_Seq', 'Mature2_Acc', 'Mature2_ID', 'Mature2_Seq']]
mirnas = mirnas[mirnas['ID'].str.contains('hsa')]
mirnas

Unnamed: 0,Accession,ID,Sequence,Mature1_Acc,Mature1_ID,Mature1_Seq,Mature2_Acc,Mature2_ID,Mature2_Seq
57,MI0000060,hsa-let-7a-1,UGGGAUGAGGUAGUAGGUUGUAUAGUUUUAGGGUCACACCCACCAC...,MIMAT0000062,hsa-let-7a-5p,UGAGGUAGUAGGUUGUAUAGUU,MIMAT0004481,hsa-let-7a-3p,CUAUACAAUCUACUGUCUUUC
58,MI0000061,hsa-let-7a-2,AGGUUGAGGUAGUAGGUUGUAUAGUUUAGAAUUACAUCAAGGGAGA...,MIMAT0000062,hsa-let-7a-5p,UGAGGUAGUAGGUUGUAUAGUU,MIMAT0010195,hsa-let-7a-2-3p,CUGUACAGCCUCCUAGCUUUCC
59,MI0000062,hsa-let-7a-3,GGGUGAGGUAGUAGGUUGUAUAGUUUGGGGCUCUGCCCUGCUAUGG...,MIMAT0000062,hsa-let-7a-5p,UGAGGUAGUAGGUUGUAUAGUU,MIMAT0004481,hsa-let-7a-3p,CUAUACAAUCUACUGUCUUUC
60,MI0000063,hsa-let-7b,CGGGGUGAGGUAGUAGGUUGUGUGGUUUCAGGGCAGUGAUGUUGCC...,MIMAT0000063,hsa-let-7b-5p,UGAGGUAGUAGGUUGUGUGGUU,MIMAT0004482,hsa-let-7b-3p,CUAUACAACCUACUGCCUUCCC
61,MI0000064,hsa-let-7c,GCAUCCGGGUUGAGGUAGUAGGUUGUAUGGUUUAGAGUUACACCCU...,MIMAT0000064,hsa-let-7c-5p,UGAGGUAGUAGGUUGUAUGGUU,MIMAT0026472,hsa-let-7c-3p,CUGUACAACCUUCUAGCUUUCC
...,...,...,...,...,...,...,...,...,...
37307,MI0039734,hsa-mir-12132,UUAACAUCUUUUCCAUCAUAAUUCUCAUAGUAAUAAUAGUAAUGUU...,MIMAT0049026,hsa-miR-12132,UAUUACUGUGAGAAUUAUGAUG,,,
37308,MI0039735,hsa-mir-12133,GAAGUGUACUUUUUAAUGGUGCCAAACAGCAGUUGAUCUAUAAUAA...,MIMAT0049027,hsa-miR-12133,CUUGGCACCAUUAAAAAGUACA,,,
37312,MI0039739,hsa-mir-12135,UGUGGAUAUUCUUUUUUGAUACUACAGCAAAACUCAGCAAGUUGUA...,MIMAT0049031,hsa-miR-12135,UAAAGGUUUGUUUGUAAA,,,
37313,MI0039740,hsa-mir-12136,GAAAAAGUCAUGGAGGCCAUGGGGUUGGCUUGAAACCAGCUUUGGG...,MIMAT0049032,hsa-miR-12136,GAAAAAGUCAUGGAGGCC,,,


In [22]:
mirnas.isnull().sum(axis=0)

Accession        0
ID               0
Sequence         0
Mature1_Acc      0
Mature1_ID       0
Mature1_Seq      0
Mature2_Acc    955
Mature2_ID     955
Mature2_Seq    955
dtype: int64

In [23]:
mirnas.to_csv('our_data/nodes/mirnas.tsv', sep='\t', index=False)

In [24]:
mirna_pre_mature_list = []
for index, row in mirnas[['Accession', 'Mature1_Acc', 'Mature2_Acc']].iterrows():
    mirna_pre_mature_list.append((row['Accession'], row['Mature1_Acc']))
    if not pd.isna(row['Mature2_Acc']):
        mirna_pre_mature_list.append((row['Accession'], row['Mature2_Acc']))
print(len(mirna_pre_mature_list))

2879


In [26]:
mirna_pre_mature = pd.DataFrame({
    'miRBase Pre ID': [edge[0] for edge in mirna_pre_mature_list],
    'miRBase Mature ID': [edge[1] for edge in mirna_pre_mature_list]
})
mirna_pre_mature

Unnamed: 0,miRBase Pre ID,miRBase Mature ID
0,MI0000060,MIMAT0000062
1,MI0000060,MIMAT0004481
2,MI0000061,MIMAT0000062
3,MI0000061,MIMAT0010195
4,MI0000062,MIMAT0000062
...,...,...
2874,MI0039734,MIMAT0049026
2875,MI0039735,MIMAT0049027
2876,MI0039739,MIMAT0049031
2877,MI0039740,MIMAT0049032


In [46]:
{'MIMAT0000062': mature_mirnas['MIMAT0000062']}

{'MIMAT0000062': {'name': 'hsa-let-7a-5p',
  'seq': Seq('UGAGGUAGUAGGUUGUAUAGUU'),
  'aliases': {'hsa-let-7a', 'hsa-let-7a-5p'}}}

In [45]:
{'MIMAT0004481': mature_mirnas['MIMAT0004481']}

{'MIMAT0004481': {'name': 'hsa-let-7a-3p',
  'seq': Seq('CUAUACAAUCUACUGUCUUUC'),
  'aliases': {'hsa-let-7a*', 'hsa-let-7a-3p'}}}

## miFam.dat

In [29]:
mirna_families = {}
with open('entities/miFam.dat', 'r', encoding='UTF-8') as f:
    for row in f:
        row = row.strip()
        if row[:2] == 'AC':
            tmp_mirna_family = {
                'id': row[5:],
                'name': '',
                'members': set()
            }
        elif row[:2] == 'ID':
            tmp_mirna_family['name'] = row[5:]
        elif row[:2] == 'MI':
            if row[16:19] == 'hsa':
                tmp_mirna_family['members'].add(row[5:14])
                # tmp_mirna_family['members'].add((row[5:14], row[16:])) # .add( (ID, Name) )
        elif row == '//':
            if len(tmp_mirna_family['members']) > 0:
                mirna_families[tmp_mirna_family['id']] = tmp_mirna_family
print(len(mirna_families))

589


In [52]:
# pd.DataFrame({
#     'ID': mirna_families.keys(),
#     'Name': [mirna_families[key]['name'] for key in mirna_families.keys()],
#     'Members': [','.join(list(member)) for member in [mirna_families[key]['members'] for key in mirna_families.keys()]]
# }).to_csv('our_data/nodes/mirna_families.tsv', sep='\t', index=False)

In [30]:
mirna_family_edge_list = []
for mirna_family_id in mirna_families.keys():
    mirna_family = mirna_families[mirna_family_id]
    members = list(mirna_family['members'])
    # print(members)
    member_num = len(members)
    for i in range(member_num - 1):
        for j in range(i + 1, member_num):
            mirna_family_edge_list.append((members[i], members[j]))
            # print((members[i], members[j]))
print(len(mirna_family_edge_list))

4513


In [31]:
pd.DataFrame({
    'ID1': [edge[0] for edge in mirna_family_edge_list],
    'ID2': [edge[1] for edge in mirna_family_edge_list]
}).to_csv('our_data/edges/families_pre_mirna_pre_mirna.tsv', sep='\t', index=False)

# PCGs - HGNC

In [33]:
pcgs_hgnc = pd.read_table('entities/protein-coding_gene.txt')
pcgs_hgnc.head(3)

  pcgs_hgnc = pd.read_table('entities/protein-coding_gene.txt')


Unnamed: 0,hgnc_id,symbol,name,locus_group,locus_type,status,location,location_sortable,alias_symbol,alias_name,...,cd,lncrnadb,enzyme_id,intermediate_filament_db,rna_central_ids,lncipedia,gtrnadb,agr,mane_select,gencc
0,HGNC:5,A1BG,alpha-1-B glycoprotein,protein-coding gene,gene with protein product,Approved,19q13.43,19q13.43,,,...,,,,,,,,HGNC:5,ENST00000263100.8|NM_130786.4,
1,HGNC:24086,A1CF,APOBEC1 complementation factor,protein-coding gene,gene with protein product,Approved,10q11.23,10q11.23,ACF|ASP|ACF64|ACF65|APOBEC1CF,,...,,,,,,,,HGNC:24086,ENST00000373997.8|NM_014576.4,
2,HGNC:7,A2M,alpha-2-macroglobulin,protein-coding gene,gene with protein product,Approved,12p13.31,12p13.31,FWP007|S863-7|CPAMD5,,...,,,,,,,,HGNC:7,ENST00000318602.12|NM_000014.6,HGNC:7


In [34]:
pcgs_hgnc.columns

Index(['hgnc_id', 'symbol', 'name', 'locus_group', 'locus_type', 'status',
       'location', 'location_sortable', 'alias_symbol', 'alias_name',
       'prev_symbol', 'prev_name', 'gene_group', 'gene_group_id',
       'date_approved_reserved', 'date_symbol_changed', 'date_name_changed',
       'date_modified', 'entrez_id', 'ensembl_gene_id', 'vega_id', 'ucsc_id',
       'ena', 'refseq_accession', 'ccds_id', 'uniprot_ids', 'pubmed_id',
       'mgd_id', 'rgd_id', 'lsdb', 'cosmic', 'omim_id', 'mirbase', 'homeodb',
       'snornabase', 'bioparadigms_slc', 'orphanet', 'pseudogene.org',
       'horde_id', 'merops', 'imgt', 'iuphar', 'kznf_gene_catalog',
       'mamit-trnadb', 'cd', 'lncrnadb', 'enzyme_id',
       'intermediate_filament_db', 'rna_central_ids', 'lncipedia', 'gtrnadb',
       'agr', 'mane_select', 'gencc'],
      dtype='object')

In [35]:
pcgs_hgnc[['locus_group', 'locus_type']].drop_duplicates()

Unnamed: 0,locus_group,locus_type
0,protein-coding gene,gene with protein product


In [36]:
pcgs_hgnc = pcgs_hgnc[['hgnc_id', 'symbol', 'name', 'alias_symbol', 'alias_name', 'prev_symbol', 'prev_name', 'gene_group', 'gene_group_id', 'entrez_id', 'ensembl_gene_id']]
pcgs_hgnc

Unnamed: 0,hgnc_id,symbol,name,alias_symbol,alias_name,prev_symbol,prev_name,gene_group,gene_group_id,entrez_id,ensembl_gene_id
0,HGNC:5,A1BG,alpha-1-B glycoprotein,,,,,Immunoglobulin like domain containing,594,1,ENSG00000121410
1,HGNC:24086,A1CF,APOBEC1 complementation factor,ACF|ASP|ACF64|ACF65|APOBEC1CF,,,,RNA binding motif containing,725,29974,ENSG00000148584
2,HGNC:7,A2M,alpha-2-macroglobulin,FWP007|S863-7|CPAMD5,,,,Alpha-2-macroglobulin family,2148,2,ENSG00000175899
3,HGNC:23336,A2ML1,alpha-2-macroglobulin like 1,FLJ25179|p170,,CPAMD9,"C3 and PZP-like, alpha-2-macroglobulin domain ...",Alpha-2-macroglobulin family,2148,144568,ENSG00000166535
4,HGNC:30005,A3GALT2,"alpha 1,3-galactosyltransferase 2",IGBS3S|IGB3S,iGb3 synthase|isoglobotriaosylceramide synthase,A3GALT2P,"alpha 1,3-galactosyltransferase 2, pseudogene",Glycosyltransferase family 6,429,127550,ENSG00000184389
...,...,...,...,...,...,...,...,...,...,...,...
19253,HGNC:32058,ZYG11A,"zyg-11 family member A, cell cycle regulator",ZYG11,,,zyg-11 homolog A (C. elegans),ZYG11 cell cycle regulator family|Armadillo li...,6|1492,440590,ENSG00000203995
19254,HGNC:25820,ZYG11B,"zyg-11 family member B, cell cycle regulator",FLJ13456,,ZYG11,zyg-11 homolog (C. elegans)|zyg-11 homolog B (...,ZYG11 cell cycle regulator family|Armadillo li...,6|1492,79699,ENSG00000162378
19255,HGNC:13200,ZYX,zyxin,,,,,Zyxin family|MicroRNA protein coding host genes,1402|1691,7791,ENSG00000159840
19256,HGNC:29027,ZZEF1,zinc finger ZZ-type and EF-hand domain contain...,KIAA0399|ZZZ4|FLJ10821,,,"zinc finger, ZZ-type with EF hand domain 1",Zinc fingers ZZ-type|EF-hand domain containing,91|863,23140,ENSG00000074755


In [37]:
pcgs_hgnc.isna().sum()

hgnc_id                0
symbol                 0
name                   0
alias_symbol        3645
alias_name         12606
prev_symbol        12013
prev_name           5883
gene_group          4620
gene_group_id       4620
entrez_id              0
ensembl_gene_id       36
dtype: int64

In [38]:
pcgs_hgnc[['symbol', 'entrez_id']].drop_duplicates(subset=['entrez_id'])

Unnamed: 0,symbol,entrez_id
0,A1BG,1
1,A1CF,29974
2,A2M,2
3,A2ML1,144568
4,A3GALT2,127550
...,...,...
19253,ZYG11A,440590
19254,ZYG11B,79699
19255,ZYX,7791
19256,ZZEF1,23140


In [39]:
pcgs = {}
for index, row in pcgs_hgnc.iterrows():
    tmp_pcg = {
        'symbol': row['symbol'],
        'name': row['name'],
        'ensembl_gene_id': '',
        'alias_symbols': set(),
        'alias_names': set(),
        'prev_symbols': set(),
        'prev_names': set(),
        'gene_groups': set(),
        'gene_group_ids': set()
    }
    if not pd.isna(row['ensembl_gene_id']):
        tmp_pcg['ensembl_gene_id'] = row['ensembl_gene_id']
    if not pd.isna(row['alias_symbol']):
        tmp_pcg['alias_symbols'] = set(row['alias_symbol'].split('|'))
    if not pd.isna(row['alias_name']):
        tmp_pcg['alias_names'] = set(row['alias_name'].split('|'))
    if not pd.isna(row['prev_symbol']):
        tmp_pcg['prev_symbols'] = set(row['prev_symbol'].split('|'))
    if not pd.isna(row['prev_name']):
        tmp_pcg['prev_names'] = set(row['prev_name'].split('|'))
    if not pd.isna(row['gene_group']):
        tmp_pcg['gene_groups'] = set(row['gene_group'].split('|'))
    if not pd.isna(row['gene_group_id']):
        tmp_pcg['gene_group_ids'] = set(row['gene_group_id'].split('|'))
    pcgs[str(row['entrez_id'])] = tmp_pcg
print(len(pcgs))

19258


In [31]:
{'144568': pcgs['144568']}

{'144568': {'symbol': 'A2ML1',
  'name': 'alpha-2-macroglobulin like 1',
  'ensembl_gene_id': 'ENSG00000166535',
  'alias_symbols': {'FLJ25179', 'p170'},
  'alias_names': set(),
  'prev_symbols': {'CPAMD9'},
  'prev_names': {'C3 and PZP-like, alpha-2-macroglobulin domain containing 9'},
  'gene_groups': {'Alpha-2-macroglobulin family'},
  'gene_group_ids': {'2148'}}}

In [42]:
pd.DataFrame({
    'ID': pcgs.keys(),
    'Symbol': [pcgs[key]['symbol'] for key in pcgs.keys()],
    'Name': [pcgs[key]['name'] for key in pcgs.keys()],
    'Gene Groups': [', '.join(pcgs[key]['gene_groups']) for key in pcgs.keys()]
}).to_csv('our_data/nodes/pcgs.tsv', sep='\t', index=False)

# PCG-PCG - HumanNet

## HumanNet-FN.tsv

In [45]:
pcg_edge_list = pd.read_csv('associations/HumanNet-FN.tsv', sep='\t', header=None)
pcg_edge_list.columns = ['ID1', 'ID2', 'Score']
# convert to string
pcg_edge_list['ID1'] = pcg_edge_list['ID1'].astype(str)
pcg_edge_list['ID2'] = pcg_edge_list['ID2'].astype(str)
pcg_edge_list

Unnamed: 0,ID1,ID2,Score
0,7046,7048,5.987545
1,5211,5213,5.951359
2,5213,5214,5.922024
3,5211,5214,5.920467
4,5160,5162,5.913664
...,...,...,...
977490,2124,23643,1.485350
977491,7205,9722,1.485342
977492,10209,397,1.485340
977493,5997,6129,1.485339


In [46]:
pcg_edge_list = pcg_edge_list[pcg_edge_list['ID1'].isin(pcgs.keys()) & pcg_edge_list['ID2'].isin(pcgs.keys())]

In [48]:
pcg_edge_list.drop_duplicates(subset=['ID1', 'ID2'])

Unnamed: 0,ID1,ID2,Score
0,7046,7048,5.987545
1,5211,5213,5.951359
2,5213,5214,5.922024
3,5211,5214,5.920467
4,5160,5162,5.913664
...,...,...,...
977490,2124,23643,1.485350
977491,7205,9722,1.485342
977492,10209,397,1.485340
977493,5997,6129,1.485339


In [49]:
pcg_edge_list.to_csv('our_data/edges/pcg_pcg_humannet.tsv', sep='\t', index=False)

# MiRNA-Disease - RNADisease

## RNADiseasev4.0_RNA-disease_experiment_miRNA.xlsx

In [50]:
mda_rnadisease = pd.read_excel('associations/RNADiseasev4.0_RNA-disease_experiment_miRNA.xlsx')
mda_rnadisease

Unnamed: 0,RDID,specise,RNA Symbol,RNA Type,Disease Name,DO ID,MeSH ID,KEGG disease ID,PMID,score
0,RD-E-mi-00088983,Homo sapiens,AP001273.1,miRNA,Pancreatic Neuroendocrine Tumor,,,,32183367.0,0.999831
1,RD-E-mi-00088984,Homo sapiens,AP001524.1,miRNA,Pancreatic Neuroendocrine Tumor,,,,32183367.0,0.999831
2,RD-E-mi-00089413,Caenorhabditis elegans,cel-let-7,miRNA,Alzheimer Disease,DOID:10652,D000544,,18262516.0,0.973352
3,RD-E-mi-00089414,Caenorhabditis elegans,cel-let-7,miRNA,Parkinson Disease,DOID:14330,D010300,,20091141.0,0.328976
4,RD-E-mi-00090759,Homo sapiens,hsa-let-7a-1,miRNA,Acth-Independent Macronodular Adrenal Hyperplasia,,,,18840638.0,0.694506
...,...,...,...,...,...,...,...,...,...,...
214381,RD-E-mi-00303270,Sus scrofa,vsRNA_15627,miRNA,Adverse Response To Chemotherapy(Neutropenia/L...,,,,34481273.0,0.968971
214382,RD-E-mi-00303271,Sus scrofa,vsRNA_15766,miRNA,Adverse Response To Chemotherapy(Neutropenia/L...,,,,34481273.0,0.968971
214383,RD-E-mi-00303272,Sus scrofa,vsRNA_17027,miRNA,Adverse Response To Lamotrigine And Phenytoin,,,,34481273.0,0.968971
214384,RD-E-mi-00303273,Sus scrofa,vsRNA_7505,miRNA,Affective Disorder,,,,34481273.0,0.968971


In [51]:
print('Originally:', mda_rnadisease.shape[0], 'rows')
mda_rnadisease = mda_rnadisease[mda_rnadisease['specise'] == 'Homo sapiens']
print('Homo sapiens only:', mda_rnadisease.shape[0], 'rows')
mda_rnadisease = mda_rnadisease[['RNA Symbol', 'Disease Name', 'MeSH ID', 'PMID', 'score']]
mda_rnadisease = mda_rnadisease.dropna(subset=['PMID'])
print('With PMID:', mda_rnadisease.shape[0], 'rows')
mda_rnadisease

Originally: 214386 rows
Homo sapiens only: 192536 rows
With PMID: 159415 rows


Unnamed: 0,RNA Symbol,Disease Name,MeSH ID,PMID,score
0,AP001273.1,Pancreatic Neuroendocrine Tumor,,32183367.0,0.999831
1,AP001524.1,Pancreatic Neuroendocrine Tumor,,32183367.0,0.999831
4,hsa-let-7a-1,Acth-Independent Macronodular Adrenal Hyperplasia,,18840638.0,0.694506
5,hsa-let-7a-1,Acth-Secreting Pituitary Adenoma,D049913,18840638.0,0.694506
6,hsa-let-7a-1,Acute Myeloid Leukemia,D015470,17363563.0,0.973352
...,...,...,...,...,...
213973,solexa-3277-272,Pancreatic Cancer,D010190,25149530.0,0.328976
213974,solexa-578-1915,Pancreatic Cancer,D010190,25149530.0,0.328976
213975,solexa-826-1288,Pancreatic Cancer,D010190,25149530.0,0.328976
214377,unconservative_12_239971,Drug-Induced Liver Injury,D056486,32355577.0,0.328976


## Match to MeSH terms

In [52]:
mda_rnadisease.isna().sum(axis=0)

RNA Symbol          0
Disease Name        0
MeSH ID         25083
PMID                0
score               0
dtype: int64

In [53]:
rnadisease_diseasename_meshid = {}
count1 = 0
count2 = 0
for disease in mda_rnadisease[mda_rnadisease['MeSH ID'].isna()]['Disease Name'].drop_duplicates():
    # A special phenomenon in RNADisease: No word 'Neoplasms', there is only 'Neoplasm'.
    disease = relo(disease.lower().replace('neoplasm', 'neoplasms'))
    for unique_id in mesh_terms.keys():
        tmp_mesh_term = mesh_terms[unique_id]
        if disease == relo(tmp_mesh_term['heading']):
            rnadisease_diseasename_meshid[disease] = unique_id
            count1 = count1 + 1
            break
        if disease in tmp_mesh_term['entries']:
            rnadisease_diseasename_meshid[disease] = unique_id
            count2 = count2 + 1
            break
print('matched to the heading:', count1)
print('matched to the entries:', count2)
rnadisease_diseasename_meshid

matched to the heading: 105
matched to the entries: 137


{('breast', 'neoplasms'): 'D001943',
 ('colon', 'neoplasms'): 'D003110',
 ('digestive', 'neoplasms', 'system'): 'D004067',
 ('endometrial', 'neoplasms'): 'D016889',
 ('gastric', 'neoplasms'): 'D013274',
 ('gastrointestinal', 'neoplasms'): 'D005770',
 ('and', 'head', 'neck', 'neoplasms'): 'D006258',
 ('failure', 'heart'): 'D006333',
 ('inflammation',): 'D007249',
 ('lung', 'neoplasms'): 'D008175',
 ('leukemia', 'myelogenous'): 'D007951',
 ('neoplasms',): 'D009369',
 ('carcinoma', 'cell', 'oral', 'squamous'): 'D000077195',
 ('neoplasms', 'ovarian'): 'D010051',
 ('neoplasms', 'pituitary'): 'D010911',
 ('neoplasms', 'prostate'): 'D011471',
 ('gland', 'neoplasms', 'salivary'): 'D012468',
 ('brain', 'neoplasms', 'stem'): 'D020295',
 ('carcinosarcoma',): 'D002296',
 ('adenoma', 'follicular'): 'D000236',
 ('dysplasia', 'ectodermal', 'hypohidrotic'): 'D053358',
 ('disease', 'heart', 'ischemic'): 'D017202',
 ('liver', 'neoplasms'): 'D008113',
 ('disease', 'pancreatic'): 'D010182',
 ('inhalation'

In [54]:
for index, row in mda_rnadisease[mda_rnadisease['MeSH ID'].isna()].iterrows():
    disease = relo(row['Disease Name'].lower().replace('neoplasm', 'neoplasms'))
    if disease in rnadisease_diseasename_meshid.keys():
        mda_rnadisease.loc[index, 'MeSH ID'] = rnadisease_diseasename_meshid[disease]
mda_rnadisease

Unnamed: 0,RNA Symbol,Disease Name,MeSH ID,PMID,score
0,AP001273.1,Pancreatic Neuroendocrine Tumor,,32183367.0,0.999831
1,AP001524.1,Pancreatic Neuroendocrine Tumor,,32183367.0,0.999831
4,hsa-let-7a-1,Acth-Independent Macronodular Adrenal Hyperplasia,,18840638.0,0.694506
5,hsa-let-7a-1,Acth-Secreting Pituitary Adenoma,D049913,18840638.0,0.694506
6,hsa-let-7a-1,Acute Myeloid Leukemia,D015470,17363563.0,0.973352
...,...,...,...,...,...
213973,solexa-3277-272,Pancreatic Cancer,D010190,25149530.0,0.328976
213974,solexa-578-1915,Pancreatic Cancer,D010190,25149530.0,0.328976
213975,solexa-826-1288,Pancreatic Cancer,D010190,25149530.0,0.328976
214377,unconservative_12_239971,Drug-Induced Liver Injury,D056486,32355577.0,0.328976


In [55]:
mda_rnadisease[mda_rnadisease['MeSH ID'].isna()]

Unnamed: 0,RNA Symbol,Disease Name,MeSH ID,PMID,score
0,AP001273.1,Pancreatic Neuroendocrine Tumor,,32183367.0,0.999831
1,AP001524.1,Pancreatic Neuroendocrine Tumor,,32183367.0,0.999831
4,hsa-let-7a-1,Acth-Independent Macronodular Adrenal Hyperplasia,,18840638.0,0.694506
57,hsa-let-7a-1,Colorectal Adenocarcinoma,,20413677.0,0.328976
87,hsa-let-7a-1,Glioblastoma Multiforme Somatic,,17363563.0,0.694506
...,...,...,...,...,...
213941,SCoV-2-miR-5,Sars-Cov2,,34159729.0,0.328976
213942,SCoV-2-miR-6,Sars-Cov2,,34159729.0,0.328976
213943,SCoV-2-miR-7,Sars-Cov2,,34159729.0,0.328976
213944,SCoV-2-miR-8,Sars-Cov2,,34159729.0,0.328976


In [32]:
mda_rnadisease.isna().sum(axis=0)

RNA Symbol          0
Disease Name        0
MeSH ID         11327
PMID                0
score               0
dtype: int64

In [33]:
mda_rnadisease = mda_rnadisease[mda_rnadisease['MeSH ID'].isin(mesh_terms.keys())]
print('Matched to MeSH terms:', mda_rnadisease.shape[0], 'rows')
mda_rnadisease

Matched to MeSH terms: 144160 rows


Unnamed: 0,RNA Symbol,Disease Name,MeSH ID,PMID,score
5,hsa-let-7a-1,Acth-Secreting Pituitary Adenoma,D049913,18840638.0,0.694506
6,hsa-let-7a-1,Acute Myeloid Leukemia,D015470,17363563.0,0.973352
7,hsa-let-7a-1,Acute Myeloid Leukemia,D015470,18308931.0,0.973352
8,hsa-let-7a-1,Acute Myeloid Leukemia,D015470,20425795.0,0.973352
9,hsa-let-7a-1,Acute Myeloid Leukemia,D015470,23676502.0,0.973352
...,...,...,...,...,...
213973,solexa-3277-272,Pancreatic Cancer,D010190,25149530.0,0.328976
213974,solexa-578-1915,Pancreatic Cancer,D010190,25149530.0,0.328976
213975,solexa-826-1288,Pancreatic Cancer,D010190,25149530.0,0.328976
214377,unconservative_12_239971,Drug-Induced Liver Injury,D056486,32355577.0,0.328976


## Match to miRBase

In [34]:
rnadisease_rnaname_mirbaseid = {}
count1 = 0
count2 = 0
count3 = 0
count4 = 0
for rna_name in mda_rnadisease['RNA Symbol'].drop_duplicates():
    flag = True
    for mature_mirna_id in mature_mirnas.keys():
        tmp_mature_mirna = mature_mirnas[mature_mirna_id]
        if rna_name == tmp_mature_mirna['name']:
            flag = False
            rnadisease_rnaname_mirbaseid[rna_name] = mature_mirna_id
            count1 = count1 + 1
            break
        if rna_name in tmp_mature_mirna['aliases']:
            flag = False
            rnadisease_rnaname_mirbaseid[rna_name] = mature_mirna_id
            count2 = count2 + 1
            break
    if flag:
        for pre_mirna_id in pre_mirnas.keys():
            tmp_pre_mirna = pre_mirnas[pre_mirna_id]
            if rna_name == tmp_pre_mirna['name']:
                rnadisease_rnaname_mirbaseid[rna_name] = pre_mirna_id
                count3 = count3 + 1
                break
            if rna_name in tmp_pre_mirna['aliases']:
                rnadisease_rnaname_mirbaseid[rna_name] = pre_mirna_id
                count4 = count4 + 1
                break
print('Mature miRNA, name:', count1)
print('Mature miRNA, aliases:', count2)
print('Pri-miRNA, name', count3)
print('Pri-miRNA, aliases:', count4)
rnadisease_rnaname_mirbaseid

Mature miRNA, name: 2588
Mature miRNA, aliases: 90
Pri-miRNA, name 326
Pri-miRNA, aliases: 22


{'hsa-let-7a-1': 'MI0000060',
 'hsa-let-7a-2': 'MI0000061',
 'hsa-let-7a-3': 'MI0000062',
 'hsa-let-7b': 'MIMAT0000063',
 'hsa-let-7c': 'MIMAT0000064',
 'hsa-let-7d': 'MIMAT0000065',
 'hsa-let-7e': 'MIMAT0000066',
 'hsa-let-7f-1': 'MI0000067',
 'hsa-let-7f-2': 'MI0000068',
 'hsa-mir-15a': 'MI0000069',
 'hsa-mir-16-1': 'MI0000070',
 'hsa-mir-17': 'MI0000071',
 'hsa-mir-18a': 'MI0000072',
 'hsa-mir-19a': 'MI0000073',
 'hsa-mir-19b-1': 'MI0000074',
 'hsa-mir-19b-2': 'MI0000075',
 'hsa-mir-20a': 'MI0000076',
 'hsa-mir-21': 'MI0000077',
 'hsa-mir-22': 'MI0000078',
 'hsa-mir-23a': 'MI0000079',
 'hsa-mir-24-1': 'MI0000080',
 'hsa-mir-25': 'MI0000082',
 'hsa-mir-26a-1': 'MI0000083',
 'hsa-mir-26b': 'MI0000084',
 'hsa-mir-27a': 'MI0000085',
 'hsa-mir-28': 'MI0000086',
 'hsa-mir-29a': 'MI0000087',
 'hsa-mir-30a': 'MI0000088',
 'hsa-mir-31': 'MI0000089',
 'hsa-mir-32': 'MI0000090',
 'hsa-mir-33a': 'MI0000091',
 'hsa-mir-92a-1': 'MI0000093',
 'hsa-mir-92a-2': 'MI0000094',
 'hsa-mir-93': 'MI0000095

In [35]:
mda_rnadisease = mda_rnadisease.join(
    pd.DataFrame(
        {'miRBase ID': rnadisease_rnaname_mirbaseid.values()}, 
        index=rnadisease_rnaname_mirbaseid.keys()
    ), on='RNA Symbol'
)
mda_rnadisease

Unnamed: 0,RNA Symbol,Disease Name,MeSH ID,PMID,score,miRBase ID
5,hsa-let-7a-1,Acth-Secreting Pituitary Adenoma,D049913,18840638.0,0.694506,MI0000060
6,hsa-let-7a-1,Acute Myeloid Leukemia,D015470,17363563.0,0.973352,MI0000060
7,hsa-let-7a-1,Acute Myeloid Leukemia,D015470,18308931.0,0.973352,MI0000060
8,hsa-let-7a-1,Acute Myeloid Leukemia,D015470,20425795.0,0.973352,MI0000060
9,hsa-let-7a-1,Acute Myeloid Leukemia,D015470,23676502.0,0.973352,MI0000060
...,...,...,...,...,...,...
213973,solexa-3277-272,Pancreatic Cancer,D010190,25149530.0,0.328976,
213974,solexa-578-1915,Pancreatic Cancer,D010190,25149530.0,0.328976,
213975,solexa-826-1288,Pancreatic Cancer,D010190,25149530.0,0.328976,
214377,unconservative_12_239971,Drug-Induced Liver Injury,D056486,32355577.0,0.328976,


In [36]:
mda_rnadisease = mda_rnadisease.dropna(subset=['miRBase ID'])
print('Matched to miRBase:', mda_rnadisease.shape[0], 'rows')
mda_rnadisease

Matched to miRBase: 137262 rows


Unnamed: 0,RNA Symbol,Disease Name,MeSH ID,PMID,score,miRBase ID
5,hsa-let-7a-1,Acth-Secreting Pituitary Adenoma,D049913,18840638.0,0.694506,MI0000060
6,hsa-let-7a-1,Acute Myeloid Leukemia,D015470,17363563.0,0.973352,MI0000060
7,hsa-let-7a-1,Acute Myeloid Leukemia,D015470,18308931.0,0.973352,MI0000060
8,hsa-let-7a-1,Acute Myeloid Leukemia,D015470,20425795.0,0.973352,MI0000060
9,hsa-let-7a-1,Acute Myeloid Leukemia,D015470,23676502.0,0.973352,MI0000060
...,...,...,...,...,...,...
210373,hsa-miR-92,T-Cell Leukemia,D015458,15944707.0,0.994838,MIMAT0000092
210374,hsa-miR-92,Temporal Lobe Epilepsy,D004833,22615744.0,0.328976,MIMAT0000092
210375,hsa-miR-92,Tonsil Cancer,D014067,26867589.0,0.328976,MIMAT0000092
210376,hsa-miR-92,Vascular Disease,D014652,28464406.0,0.328976,MIMAT0000092


## Bio.Entrez Year

In [37]:
mda_rnadisease['PMID'] = mda_rnadisease['PMID'].astype(int).astype(str)
mda_rnadisease

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mda_rnadisease['PMID'] = mda_rnadisease['PMID'].astype(int).astype(str)


Unnamed: 0,RNA Symbol,Disease Name,MeSH ID,PMID,score,miRBase ID
5,hsa-let-7a-1,Acth-Secreting Pituitary Adenoma,D049913,18840638,0.694506,MI0000060
6,hsa-let-7a-1,Acute Myeloid Leukemia,D015470,17363563,0.973352,MI0000060
7,hsa-let-7a-1,Acute Myeloid Leukemia,D015470,18308931,0.973352,MI0000060
8,hsa-let-7a-1,Acute Myeloid Leukemia,D015470,20425795,0.973352,MI0000060
9,hsa-let-7a-1,Acute Myeloid Leukemia,D015470,23676502,0.973352,MI0000060
...,...,...,...,...,...,...
210373,hsa-miR-92,T-Cell Leukemia,D015458,15944707,0.994838,MIMAT0000092
210374,hsa-miR-92,Temporal Lobe Epilepsy,D004833,22615744,0.328976,MIMAT0000092
210375,hsa-miR-92,Tonsil Cancer,D014067,26867589,0.328976,MIMAT0000092
210376,hsa-miR-92,Vascular Disease,D014652,28464406,0.328976,MIMAT0000092


In [38]:
pmid_list = mda_rnadisease['PMID'].drop_duplicates().values
print(len(pmid_list))
pmid_lists = [
    ','.join(pmid_list[:9999]), 
    ','.join(pmid_list[9999: 9999 * 2]), 
    ','.join(pmid_list[9999 * 2: 9999 * 3]), 
    ','.join(pmid_list[9999 * 3:])
]

36308


In [40]:
rnadisese_pmid_year = {}
Entrez.email = 'zhouyi2@stu.scu.edu.cn'
handle = Entrez.efetch(db='pubmed', id=pmid_lists[0], retmod='xml')
records = handle.read()
print('Got Data!')
result = BeautifulSoup(records, 'xml')
# with open('our_data/rnadisease_pubmed_records_1.xml', 'w', encoding='utf-8') as f:
#     f.write(result.prettify())
rnadiseaes_articles = result.find_all('PubmedArticle')
print(len(rnadiseaes_articles))
for article in rnadiseaes_articles:
    rnadisese_pmid_year[article.find('PMID').text] = article.find('Year').text
print(len(rnadisese_pmid_year))
pd.DataFrame({'PMID': rnadisese_pmid_year.keys(), 'Year': rnadisese_pmid_year.values()}).to_csv('our_data/csv/rnadisese_pmid_year_1.csv')

Got Data!
9985
9985


In [41]:
Entrez.email = 'zhouyi2@stu.scu.edu.cn'
handle = Entrez.efetch(db='pubmed', id=pmid_lists[1], retmod='xml')
records = handle.read()
print('Got Data!')
result = BeautifulSoup(records, 'xml')
# with open('our_data/rnadisease_pubmed_records_2.xml', 'w', encoding='utf-8') as f:
#     f.write(result.prettify())
rnadiseaes_articles = result.find_all('PubmedArticle')
print(len(rnadiseaes_articles))
for article in rnadiseaes_articles:
    rnadisese_pmid_year[article.find('PMID').text] = article.find('Year').text
print(len(rnadisese_pmid_year))
pd.DataFrame({'PMID': rnadisese_pmid_year.keys(), 'Year': rnadisese_pmid_year.values()}).to_csv('our_data/csv/rnadisese_pmid_year_2.csv')

Got Data!
9990
19975


In [42]:
Entrez.email = 'zhouyi2@stu.scu.edu.cn'
handle = Entrez.efetch(db='pubmed', id=pmid_lists[2], retmod='xml')
records = handle.read()
print('Got Data!')
result = BeautifulSoup(records, 'xml')
# with open('our_data/rnadisease_pubmed_records_3.xml', 'w', encoding='utf-8') as f:
#     f.write(result.prettify())
rnadiseaes_articles = result.find_all('PubmedArticle')
print(len(rnadiseaes_articles))
for article in rnadiseaes_articles:
    rnadisese_pmid_year[article.find('PMID').text] = article.find('Year').text
print(len(rnadisese_pmid_year))
pd.DataFrame({'PMID': rnadisese_pmid_year.keys(), 'Year': rnadisese_pmid_year.values()}).to_csv('our_data/csv/rnadisese_pmid_year_3.csv')

Got Data!
9983
29958


In [43]:
Entrez.email = 'zhouyi2@stu.scu.edu.cn'
handle = Entrez.efetch(db='pubmed', id=pmid_lists[3], retmod='xml')
records = handle.read()
print('Got Data!')
result = BeautifulSoup(records, 'xml')
# with open('our_data/rnadisease_pubmed_records_4.xml', 'w', encoding='utf-8') as f:
#     f.write(result.prettify())
rnadiseaes_articles = result.find_all('PubmedArticle')
print(len(rnadiseaes_articles))
for article in rnadiseaes_articles:
    rnadisese_pmid_year[article.find('PMID').text] = article.find('Year').text
print(len(rnadisese_pmid_year))
pd.DataFrame({'PMID': rnadisese_pmid_year.keys(), 'Year': rnadisese_pmid_year.values()}).to_csv('our_data/csv/rnadisese_pmid_year_4.csv')

Got Data!
6304
36262


In [44]:
mda_rnadisease = mda_rnadisease.join(
    pd.DataFrame(
        {'Year': rnadisese_pmid_year.values()},
        index=rnadisese_pmid_year.keys()
    ), on='PMID'
)
mda_rnadisease

Unnamed: 0,RNA Symbol,Disease Name,MeSH ID,PMID,score,miRBase ID,Year
5,hsa-let-7a-1,Acth-Secreting Pituitary Adenoma,D049913,18840638,0.694506,MI0000060,2009
6,hsa-let-7a-1,Acute Myeloid Leukemia,D015470,17363563,0.973352,MI0000060,2007
7,hsa-let-7a-1,Acute Myeloid Leukemia,D015470,18308931,0.973352,MI0000060,2008
8,hsa-let-7a-1,Acute Myeloid Leukemia,D015470,20425795,0.973352,MI0000060,2010
9,hsa-let-7a-1,Acute Myeloid Leukemia,D015470,23676502,0.973352,MI0000060,2013
...,...,...,...,...,...,...,...
210373,hsa-miR-92,T-Cell Leukemia,D015458,15944707,0.994838,MIMAT0000092,2005
210374,hsa-miR-92,Temporal Lobe Epilepsy,D004833,22615744,0.328976,MIMAT0000092,2012
210375,hsa-miR-92,Tonsil Cancer,D014067,26867589,0.328976,MIMAT0000092,2016
210376,hsa-miR-92,Vascular Disease,D014652,28464406,0.328976,MIMAT0000092,2018


In [45]:
mda_rnadisease.to_csv('our_data/csv/mda_rnadisease.csv')

In [46]:
mda_rnadisease.isna().sum()

RNA Symbol       0
Disease Name     0
MeSH ID          0
PMID             0
score            0
miRBase ID       0
Year            79
dtype: int64

In [47]:
mda_rnadisease[mda_rnadisease['Year'].isna()]

Unnamed: 0,RNA Symbol,Disease Name,MeSH ID,PMID,score,miRBase ID,Year
2040,hsa-let-7e,Synovial Sarcoma,D013584,21140508,0.694506,MIMAT0000066,
3678,hsa-mir-129-2,Gastric Neoplasm,D013274,21213213,1.000000,MI0000473,
3975,hsa-mir-34b,Gastric Neoplasm,D013274,21213213,1.000000,MI0000742,
3982,hsa-mir-34b,Stomach Cancer,D013274,21213213,1.000000,MI0000742,
7360,hsa-miR-17-5p,Multiple Sclerosis,D009103,20201009,1.000000,MIMAT0000070,
...,...,...,...,...,...,...,...
194392,hsa-miR-6869-5p,Colorectal Cancer,D015179,28636562,0.901460,MIMAT0027638,
195918,hsa-miR-8075,Colorectal Cancer,D015179,28636562,0.901460,MIMAT0031002,
196919,hsa-miR-217-3p,Pancreatic Ductal Adenocarcinoma,D021441,28624807,1.000000,MIMAT0037308,
197098,hsa-miR-101-2-5p,Prostate Neoplasm,D011471,19285253,0.328976,MIMAT0037312,


In [48]:
print('We\'ve manually checked that these PMIDs are unavailable.')
mda_rnadisease = mda_rnadisease.dropna(subset=['Year'])

We've manually checked that these PMIDs are unavailable.


In [70]:
mda_rnadisease = pd.read_csv('our_data/csv/mda_rnadisease.csv', index_col='Unnamed: 0')
mda_rnadisease = mda_rnadisease.dropna(subset=['Year'])
mda_rnadisease['Year'] = mda_rnadisease['Year'].astype('int').astype(str)
mda_rnadisease['PMID'] = mda_rnadisease['PMID'].astype(str)
mda_rnadisease

Unnamed: 0,RNA Symbol,Disease Name,MeSH ID,PMID,score,miRBase ID,Year
5,hsa-let-7a-1,Acth-Secreting Pituitary Adenoma,D049913,18840638,0.694506,MI0000060,2009
6,hsa-let-7a-1,Acute Myeloid Leukemia,D015470,17363563,0.973352,MI0000060,2007
7,hsa-let-7a-1,Acute Myeloid Leukemia,D015470,18308931,0.973352,MI0000060,2008
8,hsa-let-7a-1,Acute Myeloid Leukemia,D015470,20425795,0.973352,MI0000060,2010
9,hsa-let-7a-1,Acute Myeloid Leukemia,D015470,23676502,0.973352,MI0000060,2013
...,...,...,...,...,...,...,...
210373,hsa-miR-92,T-Cell Leukemia,D015458,15944707,0.994838,MIMAT0000092,2005
210374,hsa-miR-92,Temporal Lobe Epilepsy,D004833,22615744,0.328976,MIMAT0000092,2012
210375,hsa-miR-92,Tonsil Cancer,D014067,26867589,0.328976,MIMAT0000092,2016
210376,hsa-miR-92,Vascular Disease,D014652,28464406,0.328976,MIMAT0000092,2018


In [57]:
# Take a look at those mature miRNAs with more than one pri-miRNAs
mda_rnadisease[mda_rnadisease['miRBase ID'].isin(mirna_pre_mature['miRBase Mature ID'].value_counts().index[mirna_pre_mature['miRBase Mature ID'].value_counts() > 1])]

Unnamed: 0,RNA Symbol,Disease Name,MeSH ID,PMID,score,miRBase ID,Year
4494,hsa-let-7a-5p,Acute Kidney Injury,D058186,32790341,0.999980,MIMAT0000062,2021
4495,hsa-let-7a-5p,Acute Myeloid Leukemia,D015470,18056805,0.328976,MIMAT0000062,2008
4496,hsa-let-7a-5p,Acute Myeloid Leukemia,D015470,26526573,0.328976,MIMAT0000062,2017
4497,hsa-let-7a-5p,Adenoid Cystic Carcinoma,D003528,17322030,0.973352,MIMAT0000062,2007
4498,hsa-let-7a-5p,Adrenal Cortical Carcinoma,D018268,21859927,0.635617,MIMAT0000062,2012
...,...,...,...,...,...,...,...
210373,hsa-miR-92,T-Cell Leukemia,D015458,15944707,0.994838,MIMAT0000092,2005
210374,hsa-miR-92,Temporal Lobe Epilepsy,D004833,22615744,0.328976,MIMAT0000092,2012
210375,hsa-miR-92,Tonsil Cancer,D014067,26867589,0.328976,MIMAT0000092,2016
210376,hsa-miR-92,Vascular Disease,D014652,28464406,0.328976,MIMAT0000092,2018


In [59]:
mda_rnadisease = pd.merge(mda_rnadisease, mirna_pre_mature, left_on='miRBase ID', right_on='miRBase Mature ID', how='outer')
mda_rnadisease['miRBase Pre ID'] = mda_rnadisease['miRBase Pre ID'].fillna(mda_rnadisease['miRBase ID'])
mda_rnadisease = mda_rnadisease.dropna(subset=['miRBase ID'])
mda_rnadisease = mda_rnadisease.drop(labels='miRBase Mature ID', axis=1)
mda_rnadisease

Unnamed: 0,RNA Symbol,Disease Name,MeSH ID,PMID,score,miRBase ID,Year,miRBase Pri ID
0,hsa-let-7a-1,Acth-Secreting Pituitary Adenoma,D049913,18840638,0.694506,MI0000060,2009,MI0000060
1,hsa-let-7a-1,Acute Myeloid Leukemia,D015470,17363563,0.973352,MI0000060,2007,MI0000060
2,hsa-let-7a-1,Acute Myeloid Leukemia,D015470,18308931,0.973352,MI0000060,2008,MI0000060
3,hsa-let-7a-1,Acute Myeloid Leukemia,D015470,20425795,0.973352,MI0000060,2010,MI0000060
4,hsa-let-7a-1,Acute Myeloid Leukemia,D015470,23676502,0.973352,MI0000060,2013,MI0000060
...,...,...,...,...,...,...,...,...
154690,hsa-mir-486,Chronic Lymphocytic Leukemia,D015451,33304362,0.968971,MI0002470,2020,MI0002470
154691,hsa-mir-486,Lung Neoplasm,D008175,28124991,0.988338,MI0002470,2017,MI0002470
154692,hsa-mir-486,Lung Neoplasm,D008175,28334118,0.988338,MI0002470,2019,MI0002470
154693,hsa-mir-509,Neoplasm,D009369,25144722,0.328976,MI0003196,2015,MI0003196


In [60]:
mda_rnadisease.to_csv('our_data/edges/mirna_disease_rnadisease.tsv', sep='\t', index=False)
print('Unified as pri-miRNA:', mda_rnadisease.shape[0], 'rows')

Unified as pri-miRNA: 154695 rows


In [63]:
print('There are', mda_rnadisease['MeSH ID'].drop_duplicates().shape[0], 'diseases')
print('There are', mda_rnadisease['miRBase Pre ID'].drop_duplicates().shape[0], 'miRNAs')
print('There are', mda_rnadisease[['MeSH ID', 'miRBase Pre ID']].drop_duplicates().shape[0], 'associations')

There are 975 diseases
There are 1874 miRNAs
There are 57349 associations


In [64]:
mda_rnadisease['Year'].value_counts()

2021    21062
2016    16389
2015    14262
2013    11326
2012    11302
2014    10732
2017    10240
2020     9511
2022     8816
2011     8637
2019     6962
2010     5721
2008     4914
2018     4683
2009     2809
2007     2478
2005     2093
2006     1678
2023      938
2004      114
2024       18
2003        4
1991        3
2002        2
1993        1
Name: Year, dtype: int64

# MiRNA-Disease HMDD 3.2

## alldata.xlsx

In [68]:
mda_hmdd_3 = pd.read_excel('associations/alldata.xlsx')
mda_hmdd_3['category'] = mda_hmdd_3['category'].str.replace('-', '_')
mda_hmdd_3

Unnamed: 0,category,mir,disease,pmid,description
0,circulation_biomarker_diagnosis_down,hsa-mir-15a,"Leukemia, Lymphocytic, Chronic, B-Cell",15737576,Some human miRNAs are linked to leukemias: the...
1,circulation_biomarker_diagnosis_down,hsa-mir-16,"Leukemia, Lymphocytic, Chronic, B-Cell",15737576,Some human miRNAs are linked to leukemias: the...
2,circulation_biomarker_diagnosis_down,hsa-mir-143,Colon Neoplasms,16195701,downregulation
3,circulation_biomarker_diagnosis_down,hsa-mir-145,Colon Neoplasms,16195701,downregulation
4,circulation_biomarker_diagnosis_down,hsa-mir-223,"Leukemia, Lymphocytic, Chronic, B-Cell",16251535,downregulated
...,...,...,...,...,...
35542,tissue_expression_up,hsa-mir-3648,Neoplasms [unspecific],30384176,Identification of key differentially expressed...
35543,tissue_expression_up,hsa-mir-3687,Neoplasms [unspecific],30384176,Identification of key differentially expressed...
35544,tissue_expression_up,hsa-mir-4746,Neoplasms [unspecific],30384176,Identification of key differentially expressed...
35545,tissue_expression_up,hsa-mir-760,Neoplasms [unspecific],30384176,Identification of key differentially expressed...


In [70]:
mda_hmdd_3.isna().sum()

category       0
mir            0
disease        0
pmid           0
description    0
dtype: int64

In [71]:
mda_hmdd_3['category'].value_counts()

other                                   8474
target gene                             7252
tissue_expression_ns                    3261
tissue_expression_up                    3047
tissue_expression_down                  2592
circulation_biomarker_diagnosis_ns      2463
therapeutic target                      2167
circulation_biomarker_diagnosis_up      1097
genetics_GWAS                           1080
epigenetics                              671
circulation_biomarker_diagnosis_down     654
genetics_overexpression_suppress         573
transcription factor target              554
lncRNA target                            390
circulation_biomarker_prognosis_ns       350
genetics_overexpression_promote          282
genetics_knock down_promote              228
genetics_knock down_suppress             202
circulation_biomarker_prognosis_up       137
circulation_biomarker_prognosis_down      73
Name: category, dtype: int64

## Match to MeSH terms

In [72]:
hmdd_diseasename_meshid = {}
count1 = 0
count2 = 0
for disease in mda_hmdd_3['disease'].drop_duplicates():
    if disease[-13:] == ' [unspecific]':
        disease = disease[:-13]
    disease = relo(disease)
    for unique_id in mesh_terms.keys():
        tmp_mesh_term = mesh_terms[unique_id]
        if disease == relo(tmp_mesh_term['heading']):
            hmdd_diseasename_meshid[disease] = unique_id
            count1 = count1 + 1
            break
        if disease in tmp_mesh_term['entries']:
            hmdd_diseasename_meshid[disease] = unique_id
            count2 = count2 + 1
            break
print('matched to the heading:', count1)
print('matched to the entries:', count2)
hmdd_diseasename_meshid

matched to the heading: 442
matched to the entries: 177


{('b', 'cell', 'chronic', 'leukemia', 'lymphocytic'): 'D015451',
 ('colon', 'neoplasms'): 'D003110',
 ('breast', 'neoplasms'): 'D001943',
 ('leukemia',): 'D007938',
 ('lung', 'neoplasms'): 'D008175',
 ('carcinoma', 'colorectal'): 'D015179',
 ('acute', 'leukemia', 'promyelocytic'): 'D015473',
 ('neoplasms',): 'D009369',
 ('neoplasms', 'pituitary'): 'D010911',
 ('carcinoma', 'hepatocellular'): 'D006528',
 ('cardiomyopathy', 'hypertrophic'): 'D002312',
 ('neoplasms', 'thyroid'): 'D013964',
 ('asthma',): 'D001249',
 ('b', 'cell', 'lymphoma'): 'D016393',
 ('neoplasms', 'ovarian'): 'D010051',
 ('polycythemia', 'vera'): 'D011087',
 ('acute', 'leukemia', 'myeloid'): 'D015470',
 ('leukemia', 'lymphoblastic'): 'D054198',
 ('hodgkin', 'lymphoma'): 'D006689',
 ('nasopharyngeal', 'neoplasms'): 'D009303',
 ('degeneration', 'retinal'): 'D012162',
 ('carcinoma', 'cell', 'renal'): 'D002292',
 ('myelodysplastic', 'syndromes'): 'D009190',
 ('diseases', 'neurodegenerative'): 'D019636',
 ('lupus', 'vulgari

In [73]:
for index, row in mda_hmdd_3.iterrows():
    disease = row['disease']
    if disease[-13:] == ' [unspecific]':
        disease = disease[:-13]
    disease = relo(disease)
    if disease in hmdd_diseasename_meshid.keys():
        mda_hmdd_3.loc[index, 'MeSH ID'] = hmdd_diseasename_meshid[disease]
mda_hmdd_3

Unnamed: 0,category,mir,disease,pmid,description,MeSH ID
0,circulation_biomarker_diagnosis_down,hsa-mir-15a,"Leukemia, Lymphocytic, Chronic, B-Cell",15737576,Some human miRNAs are linked to leukemias: the...,D015451
1,circulation_biomarker_diagnosis_down,hsa-mir-16,"Leukemia, Lymphocytic, Chronic, B-Cell",15737576,Some human miRNAs are linked to leukemias: the...,D015451
2,circulation_biomarker_diagnosis_down,hsa-mir-143,Colon Neoplasms,16195701,downregulation,D003110
3,circulation_biomarker_diagnosis_down,hsa-mir-145,Colon Neoplasms,16195701,downregulation,D003110
4,circulation_biomarker_diagnosis_down,hsa-mir-223,"Leukemia, Lymphocytic, Chronic, B-Cell",16251535,downregulated,D015451
...,...,...,...,...,...,...
35542,tissue_expression_up,hsa-mir-3648,Neoplasms [unspecific],30384176,Identification of key differentially expressed...,D009369
35543,tissue_expression_up,hsa-mir-3687,Neoplasms [unspecific],30384176,Identification of key differentially expressed...,D009369
35544,tissue_expression_up,hsa-mir-4746,Neoplasms [unspecific],30384176,Identification of key differentially expressed...,D009369
35545,tissue_expression_up,hsa-mir-760,Neoplasms [unspecific],30384176,Identification of key differentially expressed...,D009369


In [74]:
print('Originally:', mda_hmdd_3.shape[0], 'rows')
mda_hmdd_3 = mda_hmdd_3.dropna(subset=['MeSH ID'])
print('Matched to MeSH:', mda_hmdd_3.shape[0], 'rows')
mda_hmdd_3

Originally: 35547 rows
Matched to MeSH: 31234 rows


Unnamed: 0,category,mir,disease,pmid,description,MeSH ID
0,circulation_biomarker_diagnosis_down,hsa-mir-15a,"Leukemia, Lymphocytic, Chronic, B-Cell",15737576,Some human miRNAs are linked to leukemias: the...,D015451
1,circulation_biomarker_diagnosis_down,hsa-mir-16,"Leukemia, Lymphocytic, Chronic, B-Cell",15737576,Some human miRNAs are linked to leukemias: the...,D015451
2,circulation_biomarker_diagnosis_down,hsa-mir-143,Colon Neoplasms,16195701,downregulation,D003110
3,circulation_biomarker_diagnosis_down,hsa-mir-145,Colon Neoplasms,16195701,downregulation,D003110
4,circulation_biomarker_diagnosis_down,hsa-mir-223,"Leukemia, Lymphocytic, Chronic, B-Cell",16251535,downregulated,D015451
...,...,...,...,...,...,...
35542,tissue_expression_up,hsa-mir-3648,Neoplasms [unspecific],30384176,Identification of key differentially expressed...,D009369
35543,tissue_expression_up,hsa-mir-3687,Neoplasms [unspecific],30384176,Identification of key differentially expressed...,D009369
35544,tissue_expression_up,hsa-mir-4746,Neoplasms [unspecific],30384176,Identification of key differentially expressed...,D009369
35545,tissue_expression_up,hsa-mir-760,Neoplasms [unspecific],30384176,Identification of key differentially expressed...,D009369


## Match to miRBase

In [75]:
mda_hmdd_3['mir'].drop_duplicates()

0         hsa-mir-15a
1          hsa-mir-16
2         hsa-mir-143
3         hsa-mir-145
4         hsa-mir-223
             ...     
35498    hsa-mir-3195
35503     hsa-mir-21b
35531      hsa-mir-7a
35534    hsa-mir-3146
35544    hsa-mir-4746
Name: mir, Length: 1112, dtype: object

In [76]:
hmdd_rnaname_mirbaseid = {}
count1 = 0
count2 = 0
count3 = 0
count4 = 0
for rna_name in mda_hmdd_3['mir'].drop_duplicates():
    flag = True
    for pre_mirna_id in pre_mirnas.keys():
        tmp_pre_mirna = pre_mirnas[pre_mirna_id]
        if rna_name == tmp_pre_mirna['name']:
            flag = False
            hmdd_rnaname_mirbaseid[rna_name] = pre_mirna_id
            count3 = count3 + 1
            break
        if rna_name in tmp_pre_mirna['aliases']:
            flag = False
            hmdd_rnaname_mirbaseid[rna_name] = pre_mirna_id
            count4 = count4 + 1
            break
    if flag:
        for mature_mirna_id in mature_mirnas.keys():
            tmp_mature_mirna = mature_mirnas[mature_mirna_id]
            if rna_name == tmp_mature_mirna['name']:
                hmdd_rnaname_mirbaseid[rna_name] = mature_mirna_id
                count1 = count1 + 1
                break
            if rna_name in tmp_mature_mirna['aliases']:
                hmdd_rnaname_mirbaseid[rna_name] = mature_mirna_id
                count2 = count2 + 1
                break
print('Mature miRNA, name:', count1)
print('Mature miRNA, aliases:', count2)
print('Pri-miRNA, name', count3)
print('Pri-miRNA, aliases:', count4)
hmdd_rnaname_mirbaseid

Mature miRNA, name: 0
Mature miRNA, aliases: 2
Pri-miRNA, name 857
Pri-miRNA, aliases: 73


{'hsa-mir-15a': 'MI0000069',
 'hsa-mir-16': 'MI0000070',
 'hsa-mir-143': 'MI0000459',
 'hsa-mir-145': 'MI0000461',
 'hsa-mir-223': 'MI0000300',
 'hsa-mir-29a': 'MI0000087',
 'hsa-mir-29c': 'MI0000735',
 'hsa-mir-10b': 'MI0000267',
 'hsa-mir-125b-1': 'MI0000446',
 'hsa-mir-125b-2': 'MI0000470',
 'hsa-mir-106': 'MI0000113',
 'hsa-mir-10a': 'MI0000266',
 'hsa-mir-126': 'MI0000471',
 'hsa-mir-17': 'MI0000071',
 'hsa-mir-20': 'MI0000076',
 'hsa-let-7a-1': 'MI0000060',
 'hsa-let-7a-2': 'MI0000061',
 'hsa-let-7a-3': 'MI0000062',
 'hsa-let-7b': 'MI0000063',
 'hsa-let-7c': 'MI0000064',
 'hsa-let-7d': 'MI0000065',
 'hsa-let-7e': 'MI0000066',
 'hsa-let-7f-1': 'MI0000067',
 'hsa-let-7f-2': 'MI0000068',
 'hsa-let-7g': 'MI0000433',
 'hsa-let-7i': 'MI0000434',
 'hsa-mir-181a-2': 'MI0000269',
 'hsa-mir-181b-1': 'MI0000270',
 'hsa-mir-181b-2': 'MI0000683',
 'hsa-mir-181d': 'MI0003139',
 'hsa-mir-155': 'MI0000681',
 'hsa-mir-15b': 'MI0000438',
 'hsa-mir-16-1': 'MI0000070',
 'hsa-mir-16-2': 'MI0000115',


In [77]:
mda_hmdd_3 = mda_hmdd_3.join(
    pd.DataFrame(
        {'miRBase ID': hmdd_rnaname_mirbaseid.values()}, 
        index=hmdd_rnaname_mirbaseid.keys()
    ), on='mir'
)
mda_hmdd_3

Unnamed: 0,category,mir,disease,pmid,description,MeSH ID,miRBase ID
0,circulation_biomarker_diagnosis_down,hsa-mir-15a,"Leukemia, Lymphocytic, Chronic, B-Cell",15737576,Some human miRNAs are linked to leukemias: the...,D015451,MI0000069
1,circulation_biomarker_diagnosis_down,hsa-mir-16,"Leukemia, Lymphocytic, Chronic, B-Cell",15737576,Some human miRNAs are linked to leukemias: the...,D015451,MI0000070
2,circulation_biomarker_diagnosis_down,hsa-mir-143,Colon Neoplasms,16195701,downregulation,D003110,MI0000459
3,circulation_biomarker_diagnosis_down,hsa-mir-145,Colon Neoplasms,16195701,downregulation,D003110,MI0000461
4,circulation_biomarker_diagnosis_down,hsa-mir-223,"Leukemia, Lymphocytic, Chronic, B-Cell",16251535,downregulated,D015451,MI0000300
...,...,...,...,...,...,...,...
35542,tissue_expression_up,hsa-mir-3648,Neoplasms [unspecific],30384176,Identification of key differentially expressed...,D009369,MI0016048
35543,tissue_expression_up,hsa-mir-3687,Neoplasms [unspecific],30384176,Identification of key differentially expressed...,D009369,
35544,tissue_expression_up,hsa-mir-4746,Neoplasms [unspecific],30384176,Identification of key differentially expressed...,D009369,MI0017385
35545,tissue_expression_up,hsa-mir-760,Neoplasms [unspecific],30384176,Identification of key differentially expressed...,D009369,MI0005567


In [78]:
mda_hmdd_3 = mda_hmdd_3.dropna(subset=['miRBase ID'])
print('Matched to miRBase:', mda_hmdd_3.shape[0], 'rows')
mda_hmdd_3

Matched to miRBase: 28463 rows


Unnamed: 0,category,mir,disease,pmid,description,MeSH ID,miRBase ID
0,circulation_biomarker_diagnosis_down,hsa-mir-15a,"Leukemia, Lymphocytic, Chronic, B-Cell",15737576,Some human miRNAs are linked to leukemias: the...,D015451,MI0000069
1,circulation_biomarker_diagnosis_down,hsa-mir-16,"Leukemia, Lymphocytic, Chronic, B-Cell",15737576,Some human miRNAs are linked to leukemias: the...,D015451,MI0000070
2,circulation_biomarker_diagnosis_down,hsa-mir-143,Colon Neoplasms,16195701,downregulation,D003110,MI0000459
3,circulation_biomarker_diagnosis_down,hsa-mir-145,Colon Neoplasms,16195701,downregulation,D003110,MI0000461
4,circulation_biomarker_diagnosis_down,hsa-mir-223,"Leukemia, Lymphocytic, Chronic, B-Cell",16251535,downregulated,D015451,MI0000300
...,...,...,...,...,...,...,...
35541,tissue_expression_up,hsa-mir-301b,Neoplasms [unspecific],30384176,Identification of key differentially expressed...,D009369,MI0005568
35542,tissue_expression_up,hsa-mir-3648,Neoplasms [unspecific],30384176,Identification of key differentially expressed...,D009369,MI0016048
35544,tissue_expression_up,hsa-mir-4746,Neoplasms [unspecific],30384176,Identification of key differentially expressed...,D009369,MI0017385
35545,tissue_expression_up,hsa-mir-760,Neoplasms [unspecific],30384176,Identification of key differentially expressed...,D009369,MI0005567


## Bio.Entrez Year

In [79]:
mda_hmdd_3['pmid'] = mda_hmdd_3['pmid'].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mda_hmdd_3['pmid'] = mda_hmdd_3['pmid'].astype(str)


In [80]:
pmid_list = mda_hmdd_3['pmid'].drop_duplicates().values
print(len(pmid_list))
pmid_lists = [
    ','.join(pmid_list[:9999]),
    ','.join(pmid_list[9999:])
]

15305


In [86]:
hmdd_pmid_year = {}
Entrez.email = 'zhouyi2@stu.scu.edu.cn'
handle = Entrez.efetch(db='pubmed', id=pmid_lists[0], retmod='xml')
records = handle.read()
print('Got Data!')
result = BeautifulSoup(records, 'xml')
# with open('our_data/hmdd_pubmed_records_1.xml', 'w', encoding='utf-8') as f:
#     f.write(result.prettify())
hmdd_articles = result.find_all('PubmedArticle')
print(len(hmdd_articles))
for article in hmdd_articles:
    hmdd_pmid_year[article.find('PMID').text] = article.find('Year').text
print(len(hmdd_pmid_year))
pd.DataFrame({'PMID': hmdd_pmid_year.keys(), 'Year': hmdd_pmid_year.values()}).to_csv('our_data/csv/hmdd_3_pmid_year_1.csv')

Got Data!
9977
9977


In [87]:
Entrez.email = 'zhouyi2@stu.scu.edu.cn'
handle = Entrez.efetch(db='pubmed', id=pmid_lists[1], retmod='xml')
records = handle.read()
print('Got Data!')
result = BeautifulSoup(records, 'xml')
# with open('our_data/hmdd_pubmed_records_2.xml', 'w', encoding='utf-8') as f:
#     f.write(result.prettify())
hmdd_articles = result.find_all('PubmedArticle')
print(len(hmdd_articles))
for article in hmdd_articles:
    hmdd_pmid_year[article.find('PMID').text] = article.find('Year').text
print(len(hmdd_pmid_year))
pd.DataFrame({'PMID': hmdd_pmid_year.keys(), 'Year': hmdd_pmid_year.values()}).to_csv('our_data/csv/hmdd_3_pmid_year_2.csv')

Got Data!
5303
15280


In [88]:
mda_hmdd_3 = mda_hmdd_3.join(
    pd.DataFrame(
        {'Year': hmdd_pmid_year.values()},
        index=hmdd_pmid_year.keys()
    ), on='pmid'
)
mda_hmdd_3

Unnamed: 0,category,mir,disease,pmid,description,MeSH ID,miRBase ID,Year
0,circulation_biomarker_diagnosis_down,hsa-mir-15a,"Leukemia, Lymphocytic, Chronic, B-Cell",15737576,Some human miRNAs are linked to leukemias: the...,D015451,MI0000069,2005
1,circulation_biomarker_diagnosis_down,hsa-mir-16,"Leukemia, Lymphocytic, Chronic, B-Cell",15737576,Some human miRNAs are linked to leukemias: the...,D015451,MI0000070,2005
2,circulation_biomarker_diagnosis_down,hsa-mir-143,Colon Neoplasms,16195701,downregulation,D003110,MI0000459,2006
3,circulation_biomarker_diagnosis_down,hsa-mir-145,Colon Neoplasms,16195701,downregulation,D003110,MI0000461,2006
4,circulation_biomarker_diagnosis_down,hsa-mir-223,"Leukemia, Lymphocytic, Chronic, B-Cell",16251535,downregulated,D015451,MI0000300,2005
...,...,...,...,...,...,...,...,...
35541,tissue_expression_up,hsa-mir-301b,Neoplasms [unspecific],30384176,Identification of key differentially expressed...,D009369,MI0005568,2019
35542,tissue_expression_up,hsa-mir-3648,Neoplasms [unspecific],30384176,Identification of key differentially expressed...,D009369,MI0016048,2019
35544,tissue_expression_up,hsa-mir-4746,Neoplasms [unspecific],30384176,Identification of key differentially expressed...,D009369,MI0017385,2019
35545,tissue_expression_up,hsa-mir-760,Neoplasms [unspecific],30384176,Identification of key differentially expressed...,D009369,MI0005567,2019


In [89]:
mda_hmdd_3[mda_hmdd_3['Year'].isna()]

Unnamed: 0,category,mir,disease,pmid,description,MeSH ID,miRBase ID,Year
865,circulation_biomarker_diagnosis_ns,hsa-mir-20a,Myocardial Infarction,21949348,deregulated between blood of patients with Uns...,D009203,MI0000076,
3470,circulation_biomarker_diagnosis_up,hsa-let-7e,Synovial Sarcoma,21140508,upregulated,D013584,MI0000066,
3471,circulation_biomarker_diagnosis_up,hsa-mir-125a,Synovial Sarcoma,21140508,miR-125a-3p: upregulated,D013584,MI0000469,
3472,circulation_biomarker_diagnosis_up,hsa-mir-99b,Synovial Sarcoma,21140508,upregulated,D013584,MI0000746,
3474,circulation_biomarker_diagnosis_up,hsa-mir-122,Liver Neoplasms,21154767,"in serum, miR-21, miR-122, and miR-223 were si...",D008113,MI0000442,
3475,circulation_biomarker_diagnosis_up,hsa-mir-223,Liver Neoplasms,21154767,"in serum, miR-21, miR-122, and miR-223 were si...",D008113,MI0000300,
4834,epigenetics,hsa-mir-126,Systemic Lupus Erythematosus,21165896,MicroRNA-126 regulates DNA methylation in CD4(...,D008180,MI0000471,
4835,epigenetics,hsa-mir-129-2,Gastric Neoplasms,21213213,upstream CpG-rich regions of mir-34b and mir-1...,D013274,MI0000473,
4836,epigenetics,hsa-mir-34b,Gastric Neoplasms,21213213,upstream CpG-rich regions of mir-34b and mir-1...,D013274,MI0000742,
5413,epigenetics,hsa-mir-101,Glioblastoma,29251856,Studies of intragenic and distant intergenic a...,D005909,MI0000103,


In [90]:
print('We\'ve manually checked that these MeSH IDs are unavailable.')
mda_hmdd_3 = mda_hmdd_3.dropna(subset=['Year'])

We've manually checked that these MeSH IDs are unavailable.


In [91]:
mda_hmdd_3['miRBase ID'][mda_hmdd_3['miRBase ID'].str.contains('MIMAT')].value_counts()

MIMAT0000062    151
MIMAT0000067     45
Name: miRBase ID, dtype: int64

In [92]:
mda_hmdd_3 = pd.merge(mda_hmdd_3, mirna_pre_mature, left_on='miRBase ID', right_on='miRBase Mature ID', how='outer')
mda_hmdd_3['miRBase Pre ID'] = mda_hmdd_3['miRBase Pre ID'].fillna(mda_hmdd_3['miRBase ID'])
mda_hmdd_3 = mda_hmdd_3.dropna(subset=['miRBase ID'])
mda_hmdd_3 = mda_hmdd_3.drop(labels='miRBase Mature ID', axis=1)
mda_hmdd_3

Unnamed: 0,category,mir,disease,pmid,description,MeSH ID,miRBase ID,Year,miRBase Pri ID
0,circulation_biomarker_diagnosis_down,hsa-mir-15a,"Leukemia, Lymphocytic, Chronic, B-Cell",15737576,Some human miRNAs are linked to leukemias: the...,D015451,MI0000069,2005,MI0000069
1,circulation_biomarker_diagnosis_down,hsa-mir-15a,Pituitary Neoplasms,17028302,Downregulation of miR-15 and miR-16 miRNAs als...,D010911,MI0000069,2007,MI0000069
2,circulation_biomarker_diagnosis_down,hsa-mir-15a,"Leukemia, Lymphocytic, Chronic, B-Cell",17327404,Both approaches show that miR-21 and miR-155 a...,D015451,MI0000069,2007,MI0000069
3,circulation_biomarker_diagnosis_down,hsa-mir-15a,"Leukemia, Lymphocytic, Chronic, B-Cell",18362358,lost or downregulated,D015451,MI0000069,2008,MI0000069
4,circulation_biomarker_diagnosis_down,hsa-mir-15a,"Diabetes Mellitus, Type 2",20651284,decreased in plasma,D003924,MI0000069,2010,MI0000069
...,...,...,...,...,...,...,...,...,...
28772,tissue_expression_up,hsa-mir-6724,Chronic Obstructive Pulmonary Disease,30115538,FGFR1 was also a predicted target for some up-...,D029424,MI0022559,2018,MI0022559
28773,tissue_expression_up,hsa-mir-920,Colon Neoplasms,30127618,"seven miRNAs (hsa-miR-920, hsa-miR-636, hsa-mi...",D003110,MI0005712,2022,MI0005712
28774,tissue_expression_up,hsa-mir-3195,Traumatic Brain Injury,30226895,The expression levels of miR-3195 and miR-328-...,D000070642,MI0014240,2019,MI0014240
28775,tissue_expression_up,hsa-mir-3146,Rhinosinusitis,30378273,"Five upregulated miRNAs, including miR-210-5p,...",D000096825,MI0014172,2019,MI0014172


In [93]:
mda_hmdd_3.to_csv('our_data/edges/mirna_disease_hmdd_3.tsv', sep='\t', index=False)
print('Unified as pri-miRNA:', mda_hmdd_3.shape[0], 'rows')

Unified as pri-miRNA: 28777 rows


In [94]:
print('There are', mda_hmdd_3['MeSH ID'].drop_duplicates().shape[0], 'diseases')
print('There are', mda_hmdd_3['miRBase Pre ID'].drop_duplicates().shape[0], 'miRNAs')
print('There are', mda_hmdd_3[['MeSH ID', 'miRBase Pre ID']].drop_duplicates().shape[0], 'associations')

There are 569 diseases
There are 865 miRNAs
There are 13081 associations


# MiRNA-Disease HMDD 4.0

## alldata_v4.xlsx

In [97]:
mda_hmdd_4 = pd.read_excel('associations/alldata_v4.xlsx')
mda_hmdd_4['code'] = mda_hmdd_4['code'].str.replace('-', '_')
mda_hmdd_4

Unnamed: 0,code,PMID,miRNA,disease,description
0,other,31214494,hsa-mir-200b,nonpapillary renal cell carcinoma,Methods: We examined the expression patterns o...
1,other,35749917,hsa-mir-200b,uterine corpus endometrial carcinoma,"OBJECTIVE: In this study, we focused on five m..."
2,circulation_biomarker_diagnosis_ns,35421672,hsa-mir-21,hereditary diffuse gastric cancer,The receiver operating characteristic curve ge...
3,other,31273630,hsa-mir-21,hereditary diffuse gastric cancer,The aim of this study was to investigate the e...
4,other,31095782,hsa-mir-21,hereditary diffuse gastric cancer,"Here, we aimed to investigate the expression r..."
...,...,...,...,...,...
53525,lncRNA target,31210326,hsa-mir-548c,Myocardial Fibrosis,CONCLUSIONS: MiR-548c-3p could improve myocard...
53526,lncRNA target,36224570,hsa-mir-4036,Hypertrophy Of Ligamentum Flavum,"Furthermore, miR-4036 negatively regulated by ..."
53527,transcription factor target,35497882,hsa-mir-3918,Primary Central Nervous System Lymphoma,The results of KEGG revealed that the targetin...
53528,lncRNA target,35280029,hsa-mir-3907,Meibomian Gland Carcinoma,"Moreover, miR-3907 can play a role in promotin..."


In [98]:
mda_hmdd_4.isna().sum()

code           0
PMID           0
miRNA          0
disease        0
description    0
dtype: int64

In [100]:
mda_hmdd_4['code'].value_counts()

other                                   17316
lncRNA target                            9099
therapeutic target                       4293
genetics_overexpression_promote          3685
transcription factor target              3558
circulation_biomarker_diagnosis_ns       3418
genetics_knock down_promote              2351
circulation_biomarker_diagnosis_up       2167
exosome                                  1544
genetics_GWAS                            1321
genetics_overexpression_suppress         1040
genetics_knock down_suppress              886
circRNA target                            737
circulation_biomarker_prognosis_ns        446
epigenetics                               383
circulation_biomarker_diagnosis_down      370
circulation_biomarker_prognosis_up        247
virus_miRNA                               173
tissue_expression_ns                      162
tissue_expression_up                      147
tissue_expression_down                     92
target gene                       

## Match to MeSH terms

In [101]:
hmdd_diseasename_meshid = {}
count1 = 0
count2 = 0
for disease in mda_hmdd_4['disease'].drop_duplicates():
    if disease[-13:] == ' [unspecific]':
        disease = disease[:-13]
    disease = relo(disease)
    for unique_id in mesh_terms.keys():
        tmp_mesh_term = mesh_terms[unique_id]
        if disease == relo(tmp_mesh_term['heading']):
            hmdd_diseasename_meshid[disease] = unique_id
            count1 = count1 + 1
            break
        if disease in tmp_mesh_term['entries']:
            hmdd_diseasename_meshid[disease] = unique_id
            count2 = count2 + 1
            break
print('matched to the heading:', count1)
print('matched to the entries:', count2)
hmdd_diseasename_meshid

matched to the heading: 1578
matched to the entries: 20


{('1', 'imperfecta', 'osteogenesis', 'type'): 'D010013',
 ('acute', 'cell', 'leukemia', 'lymphoblastic', 't'): 'D054218',
 ('c1', 'disease', 'niemann', 'pick', 'type'): 'D052556',
 ('cardiomyopathy', 'dilated'): 'D002311',
 ('cleft', 'isolated', 'palate'): 'D002972',
 ('asthma', 'cough', 'variant'): 'D000096823',
 ('carcinoma', 'cell', 'lung', 'non', 'small'): 'D002289',
 ('1b', 'charcot', 'disease', 'marie', 'tooth', 'type'): 'D002607',
 ('inflammatory', 'neovascular', 'vitreoretinopathy'): 'D018630',
 ('cancer', 'prostate'): 'D011471',
 ('hereditary', 'ia', 'lymphedema'): 'D008209',
 ('3', 'charcot', 'disease', 'marie', 'tooth', 'type'): 'D015417',
 ('and', 'injuries', 'wounds'): 'D014947',
 ('diabetes', 'mellitus'): 'D003920',
 ('colonic', 'neoplasms'): 'D003110',
 ('neoplasms', 'prostatic'): 'D011471',
 ('lung', 'neoplasms'): 'D008175',
 ('diseases', 'kidney'): 'D007674',
 ('diseases', 'pituitary'): 'D010900',
 ('failure', 'heart'): 'D006333',
 ('alzheimer', 'disease'): 'D000544',


In [102]:
for index, row in mda_hmdd_4.iterrows():
    disease = row['disease']
    if disease[-13:] == ' [unspecific]':
        disease = disease[:-13]
    disease = relo(disease)
    if disease in hmdd_diseasename_meshid.keys():
        mda_hmdd_4.loc[index, 'MeSH ID'] = hmdd_diseasename_meshid[disease]
mda_hmdd_4

Unnamed: 0,code,PMID,miRNA,disease,description,MeSH ID
0,other,31214494,hsa-mir-200b,nonpapillary renal cell carcinoma,Methods: We examined the expression patterns o...,
1,other,35749917,hsa-mir-200b,uterine corpus endometrial carcinoma,"OBJECTIVE: In this study, we focused on five m...",
2,circulation_biomarker_diagnosis_ns,35421672,hsa-mir-21,hereditary diffuse gastric cancer,The receiver operating characteristic curve ge...,
3,other,31273630,hsa-mir-21,hereditary diffuse gastric cancer,The aim of this study was to investigate the e...,
4,other,31095782,hsa-mir-21,hereditary diffuse gastric cancer,"Here, we aimed to investigate the expression r...",
...,...,...,...,...,...,...
53525,lncRNA target,31210326,hsa-mir-548c,Myocardial Fibrosis,CONCLUSIONS: MiR-548c-3p could improve myocard...,
53526,lncRNA target,36224570,hsa-mir-4036,Hypertrophy Of Ligamentum Flavum,"Furthermore, miR-4036 negatively regulated by ...",
53527,transcription factor target,35497882,hsa-mir-3918,Primary Central Nervous System Lymphoma,The results of KEGG revealed that the targetin...,
53528,lncRNA target,35280029,hsa-mir-3907,Meibomian Gland Carcinoma,"Moreover, miR-3907 can play a role in promotin...",


In [103]:
print('Originally:', mda_hmdd_4.shape[0], 'rows')
mda_hmdd_4 = mda_hmdd_4.dropna(subset=['MeSH ID'])
print('Matched to MeSH:', mda_hmdd_4.shape[0], 'rows')
mda_hmdd_4

Originally: 53530 rows
Matched to MeSH: 49580 rows


Unnamed: 0,code,PMID,miRNA,disease,description,MeSH ID
44,other,32047494,hsa-mir-155,osteogenesis imperfecta type 1,Methods: Expression profiles of miR3p and miR5...,D010013
45,other,35395974,hsa-mir-155,"Acute Lymphoblastic Leukemia, T-Cell","CONCLUSION: The expressions of miR-211, miR-15...",D054218
54,other,30870990,hsa-mir-155,Niemann-Pick disease type C1,"Here, we propose that miR-155 may be a novel i...",D052556
79,circulation_biomarker_diagnosis_ns,32319375,hsa-mir-146a,"Acute Lymphoblastic Leukemia, T-Cell",The expression levels of miR-146a and miR-221 ...,D054218
146,other,30792263,hsa-mir-182,Dilated Cardiomyopathy,"Likewise, 56% of BAG3+/DCM+, significantly co-...",D002311
...,...,...,...,...,...,...
53429,other,36406123,hsa-mir-151,"Ovarian Failure, Premature",We previously screened 6 differentially expres...,D016649
53468,lncRNA target,35845923,hsa-mir-100,"Ovarian Failure, Premature",miR5p regulation influenced the role of HUCMSC...,D016649
53509,transcription factor target,33166004,hsa-mir-146,"Ovarian Failure, Premature",CONCLUSIONS: This study demonstrated that HFHS...,D016649
53516,lncRNA target,35034562,hsa-mir-23a,"Ovarian Failure, Premature","In conclusion, TRERNA1 may sponge miR-23a to s...",D016649


## Match to miRBase

In [104]:
mda_hmdd_4['miRNA'].drop_duplicates()

44          hsa-mir-155
79         hsa-mir-146a
146         hsa-mir-182
175         hsa-mir-122
204         hsa-mir-361
              ...      
53235      hsa-mir-4775
53236    hsa-mir-134-2p
53237      hsa-mir-6859
53238      hsa-mir-4679
53240      hsa-mir-6755
Name: miRNA, Length: 1850, dtype: object

In [105]:
hmdd_rnaname_mirbaseid = {}
count1 = 0
count2 = 0
count3 = 0
count4 = 0
for rna_name in mda_hmdd_4['miRNA'].drop_duplicates():
    flag = True
    for pre_mirna_id in pre_mirnas.keys():
        tmp_pre_mirna = pre_mirnas[pre_mirna_id]
        if rna_name == tmp_pre_mirna['name']:
            flag = False
            hmdd_rnaname_mirbaseid[rna_name] = pre_mirna_id
            count3 = count3 + 1
            break
        if rna_name in tmp_pre_mirna['aliases']:
            flag = False
            hmdd_rnaname_mirbaseid[rna_name] = pre_mirna_id
            count4 = count4 + 1
            break
    if flag:
        for mature_mirna_id in mature_mirnas.keys():
            tmp_mature_mirna = mature_mirnas[mature_mirna_id]
            if rna_name == tmp_mature_mirna['name']:
                hmdd_rnaname_mirbaseid[rna_name] = mature_mirna_id
                count1 = count1 + 1
                break
            if rna_name in tmp_mature_mirna['aliases']:
                hmdd_rnaname_mirbaseid[rna_name] = mature_mirna_id
                count2 = count2 + 1
                break
print('Mature miRNA, name:', count1)
print('Mature miRNA, aliases:', count2)
print('Pri-miRNA, name', count3)
print('Pri-miRNA, aliases:', count4)
hmdd_rnaname_mirbaseid

Mature miRNA, name: 0
Mature miRNA, aliases: 2
Pri-miRNA, name 1160
Pri-miRNA, aliases: 76


{'hsa-mir-155': 'MI0000681',
 'hsa-mir-146a': 'MI0000477',
 'hsa-mir-182': 'MI0000272',
 'hsa-mir-122': 'MI0000442',
 'hsa-mir-361': 'MI0000760',
 'hsa-mir-221': 'MI0000298',
 'hsa-mir-34a': 'MI0000268',
 'hsa-mir-338': 'MI0000814',
 'hsa-mir-143': 'MI0000459',
 'hsa-mir-222': 'MI0000299',
 'hsa-mir-20b': 'MI0001519',
 'hsa-mir-210': 'MI0000286',
 'hsa-mir-137': 'MI0000454',
 'hsa-mir-106a': 'MI0000113',
 'hsa-mir-363': 'MI0000764',
 'hsa-mir-34b': 'MI0000742',
 'hsa-mir-369': 'MI0000777',
 'hsa-mir-34c': 'MI0000743',
 'hsa-mir-211': 'MI0000287',
 'hsa-mir-449': 'MI0001648',
 'hsa-mir-663b': 'MI0006336',
 'hsa-mir-22': 'MI0000078',
 'hsa-mir-365-2': 'MI0000769',
 'hsa-mir-630': 'MI0003644',
 'hsa-mir-17': 'MI0000071',
 'hsa-mir-96': 'MI0000098',
 'hsa-mir-18a': 'MI0000072',
 'hsa-mir-409': 'MI0001735',
 'hsa-mir-145': 'MI0000461',
 'hsa-mir-26a': 'MI0000083',
 'hsa-mir-494': 'MI0003134',
 'hsa-mir-495': 'MI0003135',
 'hsa-mir-497': 'MI0003138',
 'hsa-mir-374a': 'MI0000782',
 'hsa-mir-6

In [106]:
mda_hmdd_4 = mda_hmdd_4.join(
    pd.DataFrame(
        {'miRBase ID': hmdd_rnaname_mirbaseid.values()}, 
        index=hmdd_rnaname_mirbaseid.keys()
    ), on='miRNA'
)
mda_hmdd_4

Unnamed: 0,code,PMID,miRNA,disease,description,MeSH ID,miRBase ID
44,other,32047494,hsa-mir-155,osteogenesis imperfecta type 1,Methods: Expression profiles of miR3p and miR5...,D010013,MI0000681
45,other,35395974,hsa-mir-155,"Acute Lymphoblastic Leukemia, T-Cell","CONCLUSION: The expressions of miR-211, miR-15...",D054218,MI0000681
54,other,30870990,hsa-mir-155,Niemann-Pick disease type C1,"Here, we propose that miR-155 may be a novel i...",D052556,MI0000681
79,circulation_biomarker_diagnosis_ns,32319375,hsa-mir-146a,"Acute Lymphoblastic Leukemia, T-Cell",The expression levels of miR-146a and miR-221 ...,D054218,MI0000477
146,other,30792263,hsa-mir-182,Dilated Cardiomyopathy,"Likewise, 56% of BAG3+/DCM+, significantly co-...",D002311,MI0000272
...,...,...,...,...,...,...,...
53429,other,36406123,hsa-mir-151,"Ovarian Failure, Premature",We previously screened 6 differentially expres...,D016649,MI0000809
53468,lncRNA target,35845923,hsa-mir-100,"Ovarian Failure, Premature",miR5p regulation influenced the role of HUCMSC...,D016649,MI0000102
53509,transcription factor target,33166004,hsa-mir-146,"Ovarian Failure, Premature",CONCLUSIONS: This study demonstrated that HFHS...,D016649,MI0000477
53516,lncRNA target,35034562,hsa-mir-23a,"Ovarian Failure, Premature","In conclusion, TRERNA1 may sponge miR-23a to s...",D016649,MI0000079


In [107]:
mda_hmdd_4 = mda_hmdd_4.dropna(subset=['miRBase ID'])
print('Matched to miRBase:', mda_hmdd_4.shape[0], 'rows')
mda_hmdd_4

Matched to miRBase: 42652 rows


Unnamed: 0,code,PMID,miRNA,disease,description,MeSH ID,miRBase ID
44,other,32047494,hsa-mir-155,osteogenesis imperfecta type 1,Methods: Expression profiles of miR3p and miR5...,D010013,MI0000681
45,other,35395974,hsa-mir-155,"Acute Lymphoblastic Leukemia, T-Cell","CONCLUSION: The expressions of miR-211, miR-15...",D054218,MI0000681
54,other,30870990,hsa-mir-155,Niemann-Pick disease type C1,"Here, we propose that miR-155 may be a novel i...",D052556,MI0000681
79,circulation_biomarker_diagnosis_ns,32319375,hsa-mir-146a,"Acute Lymphoblastic Leukemia, T-Cell",The expression levels of miR-146a and miR-221 ...,D054218,MI0000477
146,other,30792263,hsa-mir-182,Dilated Cardiomyopathy,"Likewise, 56% of BAG3+/DCM+, significantly co-...",D002311,MI0000272
...,...,...,...,...,...,...,...
53429,other,36406123,hsa-mir-151,"Ovarian Failure, Premature",We previously screened 6 differentially expres...,D016649,MI0000809
53468,lncRNA target,35845923,hsa-mir-100,"Ovarian Failure, Premature",miR5p regulation influenced the role of HUCMSC...,D016649,MI0000102
53509,transcription factor target,33166004,hsa-mir-146,"Ovarian Failure, Premature",CONCLUSIONS: This study demonstrated that HFHS...,D016649,MI0000477
53516,lncRNA target,35034562,hsa-mir-23a,"Ovarian Failure, Premature","In conclusion, TRERNA1 may sponge miR-23a to s...",D016649,MI0000079


## Bio.Entrez Year

In [114]:
mda_hmdd_4['PMID'] = mda_hmdd_4['PMID'].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mda_hmdd_4['PMID'] = mda_hmdd_4['PMID'].astype(str)


In [118]:
pmid_list = mda_hmdd_4['PMID'].drop_duplicates().values
print(len(pmid_list))
pmid_lists = [
    ','.join(pmid_list[:9999]), 
    ','.join(pmid_list[9999: 9999 * 2]), 
    ','.join(pmid_list[9999 * 2: 9999 * 3]), 
    ','.join(pmid_list[9999 * 3:])
]

30896


In [119]:
hmdd_pmid_year = {}
Entrez.email = 'zhouyi2@stu.scu.edu.cn'
handle = Entrez.efetch(db='pubmed', id=pmid_lists[0], retmod='xml')
records = handle.read()
print('Got Data!')
result = BeautifulSoup(records, 'xml')
# with open('our_data/hmdd_pubmed_records_1.xml', 'w', encoding='utf-8') as f:
#     f.write(result.prettify())
hmdd_articles = result.find_all('PubmedArticle')
print(len(hmdd_articles))
for article in hmdd_articles:
    hmdd_pmid_year[article.find('PMID').text] = article.find('Year').text
print(len(hmdd_pmid_year))
pd.DataFrame({'PMID': hmdd_pmid_year.keys(), 'Year': hmdd_pmid_year.values()}).to_csv('our_data/csv/hmdd_4_pmid_year_1.csv')

Got Data!
9994
9994


In [120]:
Entrez.email = 'zhouyi2@stu.scu.edu.cn'
handle = Entrez.efetch(db='pubmed', id=pmid_lists[1], retmod='xml')
records = handle.read()
print('Got Data!')
result = BeautifulSoup(records, 'xml')
# with open('our_data/hmdd_pubmed_records_1.xml', 'w', encoding='utf-8') as f:
#     f.write(result.prettify())
hmdd_articles = result.find_all('PubmedArticle')
print(len(hmdd_articles))
for article in hmdd_articles:
    hmdd_pmid_year[article.find('PMID').text] = article.find('Year').text
print(len(hmdd_pmid_year))
pd.DataFrame({'PMID': hmdd_pmid_year.keys(), 'Year': hmdd_pmid_year.values()}).to_csv('our_data/csv/hmdd_4_pmid_year_2.csv')

Got Data!
9994
19988


In [121]:
Entrez.email = 'zhouyi2@stu.scu.edu.cn'
handle = Entrez.efetch(db='pubmed', id=pmid_lists[2], retmod='xml')
records = handle.read()
print('Got Data!')
result = BeautifulSoup(records, 'xml')
# with open('our_data/hmdd_pubmed_records_1.xml', 'w', encoding='utf-8') as f:
#     f.write(result.prettify())
hmdd_articles = result.find_all('PubmedArticle')
print(len(hmdd_articles))
for article in hmdd_articles:
    hmdd_pmid_year[article.find('PMID').text] = article.find('Year').text
print(len(hmdd_pmid_year))
pd.DataFrame({'PMID': hmdd_pmid_year.keys(), 'Year': hmdd_pmid_year.values()}).to_csv('our_data/csv/hmdd_4_pmid_year_3.csv')

Got Data!
9987
29975


In [122]:
Entrez.email = 'zhouyi2@stu.scu.edu.cn'
handle = Entrez.efetch(db='pubmed', id=pmid_lists[3], retmod='xml')
records = handle.read()
print('Got Data!')
result = BeautifulSoup(records, 'xml')
# with open('our_data/hmdd_pubmed_records_1.xml', 'w', encoding='utf-8') as f:
#     f.write(result.prettify())
hmdd_articles = result.find_all('PubmedArticle')
print(len(hmdd_articles))
for article in hmdd_articles:
    hmdd_pmid_year[article.find('PMID').text] = article.find('Year').text
print(len(hmdd_pmid_year))
pd.DataFrame({'PMID': hmdd_pmid_year.keys(), 'Year': hmdd_pmid_year.values()}).to_csv('our_data/csv/hmdd_4_pmid_year_4.csv')

Got Data!
899
30874


In [123]:
mda_hmdd_4 = mda_hmdd_4.join(
    pd.DataFrame(
        {'Year': hmdd_pmid_year.values()},
        index=hmdd_pmid_year.keys()
    ), on='PMID'
)
mda_hmdd_4

Unnamed: 0,code,PMID,miRNA,disease,description,MeSH ID,miRBase ID,Year
44,other,32047494,hsa-mir-155,osteogenesis imperfecta type 1,Methods: Expression profiles of miR3p and miR5...,D010013,MI0000681,2020
45,other,35395974,hsa-mir-155,"Acute Lymphoblastic Leukemia, T-Cell","CONCLUSION: The expressions of miR-211, miR-15...",D054218,MI0000681,2022
54,other,30870990,hsa-mir-155,Niemann-Pick disease type C1,"Here, we propose that miR-155 may be a novel i...",D052556,MI0000681,2019
79,circulation_biomarker_diagnosis_ns,32319375,hsa-mir-146a,"Acute Lymphoblastic Leukemia, T-Cell",The expression levels of miR-146a and miR-221 ...,D054218,MI0000477,2020
146,other,30792263,hsa-mir-182,Dilated Cardiomyopathy,"Likewise, 56% of BAG3+/DCM+, significantly co-...",D002311,MI0000272,2020
...,...,...,...,...,...,...,...,...
53429,other,36406123,hsa-mir-151,"Ovarian Failure, Premature",We previously screened 6 differentially expres...,D016649,MI0000809,2022
53468,lncRNA target,35845923,hsa-mir-100,"Ovarian Failure, Premature",miR5p regulation influenced the role of HUCMSC...,D016649,MI0000102,2022
53509,transcription factor target,33166004,hsa-mir-146,"Ovarian Failure, Premature",CONCLUSIONS: This study demonstrated that HFHS...,D016649,MI0000477,2021
53516,lncRNA target,35034562,hsa-mir-23a,"Ovarian Failure, Premature","In conclusion, TRERNA1 may sponge miR-23a to s...",D016649,MI0000079,2022


In [124]:
mda_hmdd_4[mda_hmdd_4['Year'].isna()]

Unnamed: 0,code,PMID,miRNA,disease,description,MeSH ID,miRBase ID,Year
2432,exosome,36775554,hsa-mir-21,Rett Syndrome,"In the present study, we sought to determine w...",D015518,MI0000077,
3867,genetics_overexpression_suppress,36775560,hsa-mir-150,Colonic Neoplasms,"Mechanistically, miR-150-3p can downregulate S...",D003110,MI0000479,
6309,genetics_overexpression_promote,36775547,hsa-mir-182,Spinal Cord Injuries,These findings suggest that upregulation of mi...,D013119,MI0000272,
8479,genetics_overexpression_promote,36775453,hsa-mir-96,"Carcinoma, Hepatocellular","MiR5p transfection promoted proliferation, mig...",D006528,MI0000098,
18128,other,36775545,hsa-mir-140,Alcoholism,Prostacyclin synthase (PTGIS) levels are decre...,D000437,MI0000456,
21493,exosome,36775383,hsa-mir-1246,Glioma,"In conclusion, our findings suggest that the h...",D005910,MI0006381,
21714,exosome,36775383,hsa-mir-10b,Glioma,"In conclusion, our findings suggest that the h...",D005910,MI0000267,
23345,genetics_overexpression_promote,36775444,hsa-mir-193a,"Carcinoma, Non-Small-Cell Lung","In conclusion, our study highlights a novel re...",D002289,MI0000487,
23451,genetics_overexpression_promote,36775421,hsa-mir-148a,Nasopharyngeal Carcinoma,Our results suggest that STAT3 regulates SRGN ...,D000077274,MI0000253,
25960,genetics_overexpression_promote,36775374,hsa-mir-34b,Asthma,Our study provides evidence that EZH2 promotes...,D001249,MI0000742,


In [125]:
print('We\'ve manually checked that these MeSH IDs are unavailable.')
mda_hmdd_4 = mda_hmdd_4.dropna(subset=['Year'])

We've manually checked that these MeSH IDs are unavailable.


In [126]:
mda_hmdd_4['miRBase ID'][mda_hmdd_4['miRBase ID'].str.contains('MIMAT')].value_counts()

MIMAT0000062    186
MIMAT0000067     74
Name: miRBase ID, dtype: int64

In [127]:
mda_hmdd_4 = pd.merge(mda_hmdd_4, mirna_pre_mature, left_on='miRBase ID', right_on='miRBase Mature ID', how='outer')
mda_hmdd_4['miRBase Pre ID'] = mda_hmdd_4['miRBase Pre ID'].fillna(mda_hmdd_4['miRBase ID'])
mda_hmdd_4 = mda_hmdd_4.dropna(subset=['miRBase ID'])
mda_hmdd_4 = mda_hmdd_4.drop(labels='miRBase Mature ID', axis=1)
mda_hmdd_4

Unnamed: 0,code,PMID,miRNA,disease,description,MeSH ID,miRBase ID,Year,miRBase Pri ID
0,other,32047494,hsa-mir-155,osteogenesis imperfecta type 1,Methods: Expression profiles of miR3p and miR5...,D010013,MI0000681,2020,MI0000681
1,other,35395974,hsa-mir-155,"Acute Lymphoblastic Leukemia, T-Cell","CONCLUSION: The expressions of miR-211, miR-15...",D054218,MI0000681,2022,MI0000681
2,other,30870990,hsa-mir-155,Niemann-Pick disease type C1,"Here, we propose that miR-155 may be a novel i...",D052556,MI0000681,2019,MI0000681
3,other,30563325,hsa-mir-155,Colonic Neoplasms,"Finally, the biosensor was successfully applie...",D003110,MI0000681,2020,MI0000681
4,other,36511578,hsa-mir-155,Colonic Neoplasms,"In the second group, miR5p allows to predict l...",D003110,MI0000681,2023,MI0000681
...,...,...,...,...,...,...,...,...,...
43067,transcription factor target,33959508,hsa-mir-6836,"Carcinoma, Hepatocellular","Apigenin can inhibit the growth of HCC cells, ...",D006528,MI0022682,2022,MI0022682
43068,transcription factor target,33959508,hsa-mir-6892,"Carcinoma, Hepatocellular","Apigenin can inhibit the growth of HCC cells, ...",D006528,MI0022739,2022,MI0022739
43069,transcription factor target,33959508,hsa-mir-7107,"Carcinoma, Hepatocellular","Apigenin can inhibit the growth of HCC cells, ...",D006528,MI0022958,2022,MI0022958
43070,lncRNA target,32636648,hsa-mir-4775,Lung Neoplasms,LncRNA SLC7A11-AS1 Contributes to Lung Cancer ...,D008175,MI0017418,2022,MI0017418


In [128]:
mda_hmdd_4.to_csv('our_data/edges/mirna_disease_hmdd_4.tsv', sep='\t', index=False)
print('Unified as pri-miRNA:', mda_hmdd_4.shape[0], 'rows')

Unified as pri-miRNA: 43072 rows


In [129]:
print('There are', mda_hmdd_4['MeSH ID'].drop_duplicates().shape[0], 'diseases')
print('There are', mda_hmdd_4['miRBase Pre ID'].drop_duplicates().shape[0], 'miRNAs')
print('There are', mda_hmdd_4[['MeSH ID', 'miRBase Pre ID']].drop_duplicates().shape[0], 'associations')

There are 1484 diseases
There are 1179 miRNAs
There are 23559 associations


# MiRNA-PCG - miRTarBase

## hsa_MTI.tsv

In [103]:
mga_mirtarbase = pd.read_excel('associations/hsa_MTI.xlsx')
mga_mirtarbase

Unnamed: 0,miRTarBase ID,miRNA,Species (miRNA),Target Gene,Target Gene (Entrez ID),Species (Target Gene),Experiments,Support Type,References (PMID)
0,MIRT000002,hsa-miR-20a-5p,Homo sapiens,HIF1A,3091,Homo sapiens,Luciferase reporter assay//Western blot//North...,Functional MTI,18632605
1,MIRT000002,hsa-miR-20a-5p,Homo sapiens,HIF1A,3091,Homo sapiens,HITS-CLIP,Functional MTI (Weak),22473208
2,MIRT000002,hsa-miR-20a-5p,Homo sapiens,HIF1A,3091,Homo sapiens,Luciferase reporter assay//qRT-PCR//Western blot,Functional MTI,23911400
3,MIRT000006,hsa-miR-146a-5p,Homo sapiens,CXCR4,7852,Homo sapiens,qRT-PCR//Luciferase reporter assay//Western blot,Functional MTI,18568019
4,MIRT000006,hsa-miR-146a-5p,Homo sapiens,CXCR4,7852,Homo sapiens,Microarray,Functional MTI (Weak),20375304
...,...,...,...,...,...,...,...,...,...
957035,MIRT755297,hsa-miR-4762-3p,Homo sapiens,LINC01555,439927,Homo sapiens,CLIP-Seq dataset analysis,,22100165
957036,MIRT755298,hsa-miR-5011-5p,Homo sapiens,TMEM263,90488,Homo sapiens,CLIP-Seq dataset analysis,,22100165
957037,MIRT755299,hsa-miR-4480,Homo sapiens,TMEM263,90488,Homo sapiens,CLIP-Seq dataset analysis,,22100165
957038,MIRT755300,hsa-miR-190a-3p,Homo sapiens,TMEM263,90488,Homo sapiens,CLIP-Seq dataset analysis,,22100165


In [104]:
mga_mirtarbase.isna().sum()

miRTarBase ID                   0
miRNA                           0
Species (miRNA)                 0
Target Gene                     0
Target Gene (Entrez ID)         0
Species (Target Gene)           0
Experiments                     0
Support Type               454388
References (PMID)               0
dtype: int64

In [105]:
print('Originally:', mga_mirtarbase.shape[0], 'rows')
mga_mirtarbase = mga_mirtarbase[['miRNA', 'Target Gene', 'Target Gene (Entrez ID)']].drop_duplicates()
print('De-duplicated:', mga_mirtarbase.shape[0], 'rows')
mga_mirtarbase['Target Gene (Entrez ID)'] = mga_mirtarbase['Target Gene (Entrez ID)'].astype(str)
mga_mirtarbase

Originally: 957040 rows
De-duplicated: 396788 rows


Unnamed: 0,miRNA,Target Gene,Target Gene (Entrez ID)
0,hsa-miR-20a-5p,HIF1A,3091
3,hsa-miR-146a-5p,CXCR4,7852
6,hsa-miR-122-5p,CYP7A1,1581
7,hsa-miR-222-3p,STAT5A,6776
9,hsa-miR-21-5p,RASGRP1,10125
...,...,...,...
957035,hsa-miR-4762-3p,LINC01555,439927
957036,hsa-miR-5011-5p,TMEM263,90488
957037,hsa-miR-4480,TMEM263,90488
957038,hsa-miR-190a-3p,TMEM263,90488


## Match to HGNC

In [119]:
mga_not_in_pcg = mga_mirtarbase[~(mga_mirtarbase['Target Gene (Entrez ID)'].isin(pcgs.keys()))]
mga_not_in_pcg

Unnamed: 0,miRNA,Target Gene,Target Gene (Entrez ID)
311,hsa-miR-196a-5p,SPRR2C,6702
3104,hsa-miR-1-3p,MGC27345,157247
3170,hsa-miR-1-3p,PTMAP7,326626
3504,hsa-miR-210-3p,XIST,7503
4408,hsa-miR-197-3p,MMP23A,8511
...,...,...,...
957031,hsa-miR-3662,LINC01555,439927
957032,hsa-miR-1252-3p,LINC01555,439927
957033,hsa-miR-4766-5p,LINC01555,439927
957034,hsa-miR-6807-5p,LINC01555,439927


In [120]:
mirtarbase_genename_entrezid = {}
count1 = 0
count2 = 0
count3 = 0
for genename in mga_not_in_pcg['Target Gene'].drop_duplicates():
    for pcg_entrezid in pcgs.keys():
        if genename == pcgs[pcg_entrezid]['symbol']:
            mirtarbase_genename_entrezid[genename] = pcg_entrezid
            count1 = count1 + 1
            break
        if genename in pcgs[pcg_entrezid]['alias_symbols']:
            mirtarbase_genename_entrezid[genename] = pcg_entrezid
            count2 = count2 + 1
            break
        if genename in pcgs[pcg_entrezid]['prev_symbols']:
            mirtarbase_genename_entrezid[genename] = pcg_entrezid
            count3 = count3 + 1
            break
print('Matched to the symbol:', count1)
print('Matched to alias symbols:', count2)
print('Matched to previous symbols:', count3)
mirtarbase_genename_entrezid

Matched to the symbol: 9
Matched to alias symbols: 11
Matched to previous symbols: 16


{'TIAF1': '399687',
 'C2orf48': '6241',
 'KIAA1107': '284697',
 'LGTN': '1939',
 'TEC': '7006',
 'TMEM133': '143872',
 'FRAXA': '2332',
 'C9orf47': '1903',
 'MICALCL': '9645',
 'KIAA0754': '23499',
 'TXNRD3NB': '114112',
 'HCC': '84668',
 'ST2': '9173',
 'MIA2': '387885',
 'CXorf30': '286464',
 'AREGB': '374',
 'PRG4': '10216',
 'ANXA8L2': '728113',
 'TMEM257': '84631',
 'KIAA0899': '161',
 'PRO0992': '7965',
 'KIAA1491': '55833',
 'KIAA1307': '23352',
 'KIAA0060': '10007',
 'CCDC7': '79741',
 'FAM21B': '387680',
 'SETD5': '55209',
 'DYT10': '112476',
 'TBC1D3G': '101060321',
 'CBWD7': '644019',
 'C10orf12': '84458',
 'NBPF20': '100288142',
 'TIMM23B': '100652748',
 'CSE': '1491',
 'GAS1': '2619',
 'KL': '9365'}

In [124]:
mga_mirtarbase = mga_mirtarbase.join(
    pd.DataFrame(
        {'Gene Entrez ID': mirtarbase_genename_entrezid.values()},
        index=mirtarbase_genename_entrezid.keys()
    ), on='Target Gene'
)
mga_mirtarbase['Gene Entrez ID'] = mga_mirtarbase['Gene Entrez ID'].fillna(mga_mirtarbase['Target Gene (Entrez ID)'])
mga_mirtarbase

Unnamed: 0,miRNA,Target Gene,Target Gene (Entrez ID),Gene Entrez ID
0,hsa-miR-20a-5p,HIF1A,3091,3091
3,hsa-miR-146a-5p,CXCR4,7852,7852
6,hsa-miR-122-5p,CYP7A1,1581,1581
7,hsa-miR-222-3p,STAT5A,6776,6776
9,hsa-miR-21-5p,RASGRP1,10125,10125
...,...,...,...,...
957035,hsa-miR-4762-3p,LINC01555,439927,439927
957036,hsa-miR-5011-5p,TMEM263,90488,90488
957037,hsa-miR-4480,TMEM263,90488,90488
957038,hsa-miR-190a-3p,TMEM263,90488,90488


In [127]:
mga_mirtarbase = mga_mirtarbase[mga_mirtarbase['Gene Entrez ID'].isin(pcgs.keys())]
print('Matched to HGNC:', mga_mirtarbase.shape[0], 'rows')
mga_mirtarbase

Matched to HGNC: 391105 rows


Unnamed: 0,miRNA,Target Gene,Target Gene (Entrez ID),Gene Entrez ID
0,hsa-miR-20a-5p,HIF1A,3091,3091
3,hsa-miR-146a-5p,CXCR4,7852,7852
6,hsa-miR-122-5p,CYP7A1,1581,1581
7,hsa-miR-222-3p,STAT5A,6776,6776
9,hsa-miR-21-5p,RASGRP1,10125,10125
...,...,...,...,...
957030,hsa-miR-6077,RITA1,84934,84934
957036,hsa-miR-5011-5p,TMEM263,90488,90488
957037,hsa-miR-4480,TMEM263,90488,90488
957038,hsa-miR-190a-3p,TMEM263,90488,90488


## Match to miRBase

In [128]:
mga_mirtarbase['miRNA'].drop_duplicates()

0          hsa-miR-20a-5p
3         hsa-miR-146a-5p
6          hsa-miR-122-5p
7          hsa-miR-222-3p
9           hsa-miR-21-5p
               ...       
951263        hsa-miR-541
951360        hsa-miR-145
951388       hsa-miR-1245
951412        hsa-miR-616
951446    hsa-miR-3647-5p
Name: miRNA, Length: 2945, dtype: object

In [129]:
mirtarbase_rnaname_mirbaseid = {}
count1 = 0
count2 = 0
count3 = 0
count4 = 0
for rna_name in mga_mirtarbase['miRNA'].drop_duplicates():
    flag = True
    for mature_mirna_id in mature_mirnas.keys():
        tmp_mature_mirna = mature_mirnas[mature_mirna_id]
        if rna_name == tmp_mature_mirna['name']:
            flag = False
            mirtarbase_rnaname_mirbaseid[rna_name] = mature_mirna_id
            count1 = count1 + 1
            break
        if rna_name in tmp_mature_mirna['aliases']:
            flag = False
            mirtarbase_rnaname_mirbaseid[rna_name] = mature_mirna_id
            count2 = count2 + 1
            break
    if flag:
        for pre_mirna_id in pre_mirnas.keys():
            tmp_pre_mirna = pre_mirnas[pre_mirna_id]
            if rna_name == tmp_pre_mirna['name']:
                mirtarbase_rnaname_mirbaseid[rna_name] = pre_mirna_id
                count3 = count3 + 1
                break
            if rna_name in tmp_pre_mirna['aliases']:
                mirtarbase_rnaname_mirbaseid[rna_name] = pre_mirna_id
                count4 = count4 + 1
                break
print('Mature miRNA, name:', count1)
print('Mature miRNA, aliases:', count2)
print('Pri-miRNA, name', count3)
print('Pri-miRNA, aliases:', count4)
mirtarbase_rnaname_mirbaseid

Mature miRNA, name: 2547
Mature miRNA, aliases: 278
Pri-miRNA, name 0
Pri-miRNA, aliases: 0


{'hsa-miR-20a-5p': 'MIMAT0000075',
 'hsa-miR-146a-5p': 'MIMAT0000449',
 'hsa-miR-122-5p': 'MIMAT0000421',
 'hsa-miR-222-3p': 'MIMAT0000279',
 'hsa-miR-21-5p': 'MIMAT0000076',
 'hsa-miR-148a-3p': 'MIMAT0000243',
 'hsa-miR-93-5p': 'MIMAT0000093',
 'hsa-miR-9-5p': 'MIMAT0000441',
 'hsa-miR-765': 'MIMAT0003945',
 'hsa-miR-7-5p': 'MIMAT0000252',
 'hsa-miR-661': 'MIMAT0003324',
 'hsa-miR-659-3p': 'MIMAT0003337',
 'hsa-miR-562': 'MIMAT0003226',
 'hsa-miR-559': 'MIMAT0003223',
 'hsa-miR-548d-3p': 'MIMAT0003323',
 'hsa-miR-532-5p': 'MIMAT0002888',
 'hsa-miR-504-5p': 'MIMAT0002875',
 'hsa-miR-485-3p': 'MIMAT0002176',
 'hsa-miR-451a': 'MIMAT0001631',
 'hsa-miR-429': 'MIMAT0001536',
 'hsa-miR-375': 'MIMAT0000728',
 'hsa-miR-373-3p': 'MIMAT0000726',
 'hsa-miR-372-3p': 'MIMAT0000724',
 'hsa-miR-34b-5p': 'MIMAT0000685',
 'hsa-miR-34a-5p': 'MIMAT0000255',
 'hsa-miR-346': 'MIMAT0000773',
 'hsa-miR-328-3p': 'MIMAT0000752',
 'hsa-miR-326': 'MIMAT0000756',
 'hsa-miR-31-5p': 'MIMAT0000089',
 'hsa-miR-302d-

In [133]:
mga_mirtarbase = mga_mirtarbase.join(
    pd.DataFrame(
        {'miRBase ID': mirtarbase_rnaname_mirbaseid.values()},
        index=mirtarbase_rnaname_mirbaseid.keys()
    ), on='miRNA'
)
mga_mirtarbase

Unnamed: 0,miRNA,Target Gene,Target Gene (Entrez ID),Gene Entrez ID,miRBase ID
0,hsa-miR-20a-5p,HIF1A,3091,3091,MIMAT0000075
3,hsa-miR-146a-5p,CXCR4,7852,7852,MIMAT0000449
6,hsa-miR-122-5p,CYP7A1,1581,1581,MIMAT0000421
7,hsa-miR-222-3p,STAT5A,6776,6776,MIMAT0000279
9,hsa-miR-21-5p,RASGRP1,10125,10125,MIMAT0000076
...,...,...,...,...,...
957030,hsa-miR-6077,RITA1,84934,84934,MIMAT0023702
957036,hsa-miR-5011-5p,TMEM263,90488,90488,MIMAT0021045
957037,hsa-miR-4480,TMEM263,90488,90488,MIMAT0019014
957038,hsa-miR-190a-3p,TMEM263,90488,90488,MIMAT0026482


In [136]:
mga_mirtarbase = pd.merge(mga_mirtarbase, mirna_pre_mature, left_on='miRBase ID', right_on='miRBase Mature ID', how='outer').dropna(subset=['miRBase ID'])
mga_mirtarbase = mga_mirtarbase.drop(labels='miRBase Mature ID', axis=1)
mga_mirtarbase

Unnamed: 0,miRNA,Target Gene,Target Gene (Entrez ID),Gene Entrez ID,miRBase ID,miRBase Pre ID
0,hsa-miR-20a-5p,HIF1A,3091,3091,MIMAT0000075,MI0000076
1,hsa-miR-20a-5p,TCEAL1,9338,9338,MIMAT0000075,MI0000076
2,hsa-miR-20a-5p,CCND1,595,595,MIMAT0000075,MI0000076
3,hsa-miR-20a-5p,E2F1,1869,1869,MIMAT0000075,MI0000076
4,hsa-miR-20a-5p,BMPR2,659,659,MIMAT0000075,MI0000076
...,...,...,...,...,...,...
433222,hsa-miR-1471,CDK4,1019,1019,MIMAT0007349,MI0007076
433223,hsa-miR-1471,SHH,6469,6469,MIMAT0007349,MI0007076
433224,hsa-miR-1471,NF2,4771,4771,MIMAT0007349,MI0007076
433225,hsa-miR-1471,TBXA2R,6915,6915,MIMAT0007349,MI0007076


In [139]:
mga_mirtarbase = mga_mirtarbase.drop_duplicates(subset=['Gene Entrez ID', 'miRBase Pre ID'])
mga_mirtarbase

Unnamed: 0,miRNA,Target Gene,Target Gene (Entrez ID),Gene Entrez ID,miRBase ID,miRBase Pre ID
0,hsa-miR-20a-5p,HIF1A,3091,3091,MIMAT0000075,MI0000076
1,hsa-miR-20a-5p,TCEAL1,9338,9338,MIMAT0000075,MI0000076
2,hsa-miR-20a-5p,CCND1,595,595,MIMAT0000075,MI0000076
3,hsa-miR-20a-5p,E2F1,1869,1869,MIMAT0000075,MI0000076
4,hsa-miR-20a-5p,BMPR2,659,659,MIMAT0000075,MI0000076
...,...,...,...,...,...,...
433222,hsa-miR-1471,CDK4,1019,1019,MIMAT0007349,MI0007076
433223,hsa-miR-1471,SHH,6469,6469,MIMAT0007349,MI0007076
433224,hsa-miR-1471,NF2,4771,4771,MIMAT0007349,MI0007076
433225,hsa-miR-1471,TBXA2R,6915,6915,MIMAT0007349,MI0007076


In [140]:
mga_mirtarbase.to_csv('our_data/edges/mirna_pcg_mirtarbase.tsv', sep='\t', index=False)

In [141]:
print('There are', mga_mirtarbase['Gene Entrez ID'].drop_duplicates().shape[0], 'PCGs')
print('There are', mga_mirtarbase['miRBase Pre ID'].drop_duplicates().shape[0], 'miRNAs')
print('There are', mga_mirtarbase[['miRBase Pre ID', 'Gene Entrez ID']].drop_duplicates().shape[0], 'associations')

There are 14795 PCGs
There are 1855 miRNAs
There are 413498 associations


# PCG-Disease - DisGeNet

## disgenet_2020.db

In [142]:
conn = sqlite3.connect('associations/disgenet_2020.db')
pd.read_sql('select * from geneDiseaseNetwork limit 3;', conn)

Unnamed: 0,NID,diseaseNID,geneNID,source,association,associationType,sentence,pmid,score,EL,EI,year
0,1130681,2107,793,BEFREE,,Biomarker,No correlation could be found between Broder's...,1000501,0.1,,0.956175,1976
1,261998,431,775,BEFREE,,GeneticVariation,"However, there are few reports describing soma...",10021299,0.4,,0.987013,1999
2,369637,450,775,BEFREE,,GeneticVariation,WAF1 genotype and endometrial cancer susceptib...,10021299,0.02,,1.0,1999


In [143]:
gda_disgenet = pd.read_sql('select diseaseNID, geneNID, source, association, associationType, pmid, score, EL, EI from geneDiseaseNetwork', conn)
gda_disgenet

Unnamed: 0,diseaseNID,geneNID,source,association,associationType,pmid,score,EL,EI
0,2107,793,BEFREE,,Biomarker,1000501.0,0.10,,0.956175
1,431,775,BEFREE,,GeneticVariation,10021299.0,0.40,,0.987013
2,450,775,BEFREE,,GeneticVariation,10021299.0,0.02,,1.000000
3,10033,775,BEFREE,,GeneticVariation,10021299.0,0.07,,1.000000
4,431,7385,BEFREE,,Biomarker,10021369.0,0.01,,1.000000
...,...,...,...,...,...,...,...,...,...
3261319,30235,1698,ORPHANET,,ChromosomalRearrangement,,0.30,,
3261320,30244,4952,CLINVAR,,GeneticVariation,,0.20,,1.000000
3261321,30255,5756,GENOMICS_ENGLAND,,Biomarker,,0.30,strong,
3261322,30255,5756,GENOMICS_ENGLAND,,Biomarker,,0.30,strong,


In [144]:
gda_disgenet['association'].describe()

count    88228.000000
mean         0.070374
std          0.255779
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max          1.000000
Name: association, dtype: float64

In [145]:
print('Not sure what\'s the meaning of the field \'association\', here I just drop it.')
del gda_disgenet['association']

Not sure what's the meaning of the field 'association', here I just drop it.


In [146]:
gda_disgenet['score'].describe()

count    3.261324e+06
mean     1.796928e-01
std      2.365360e-01
min      1.000000e-02
25%      3.000000e-02
50%      1.000000e-01
75%      2.000000e-01
max      1.000000e+00
Name: score, dtype: float64

In [147]:
gda_disgenet['source'].value_counts()

source
BEFREE              2700332
HPO                  164198
CLINVAR               98551
CTD_human             88180
GWASCAT               56795
LHGDN                 52794
GENOMICS_ENGLAND      20759
UNIPROT               17564
GWASDB                14663
RGD                   13093
MGD                    8569
ORPHANET               8340
CLINGEN                7865
PSYGENET               7214
CGI                    1841
CTD_mouse               518
CTD_rat                  48
Name: count, dtype: int64

In [148]:
gda_disgenet['associationType'].value_counts()

associationType
Biomarker                        1708286
GeneticVariation                  800444
AlteredExpression                 638378
CausalMutation                     65588
PosttranslationalModification      28738
Therapeutic                        10744
GermlineCausalMutation              6275
SusceptibilityMutation              1260
GenomicAlterations                   397
ModifyingMutation                    326
FusionGene                           315
SomaticCausalMutation                285
ChromosomalRearrangement             223
GermlineModifyingMutation             65
Name: count, dtype: int64

In [149]:
print('Originally:', gda_disgenet.shape[0], 'rows')
gda_disgenet

Originally: 3261324 rows


Unnamed: 0,diseaseNID,geneNID,source,associationType,pmid,score,EL,EI
0,2107,793,BEFREE,Biomarker,1000501.0,0.10,,0.956175
1,431,775,BEFREE,GeneticVariation,10021299.0,0.40,,0.987013
2,450,775,BEFREE,GeneticVariation,10021299.0,0.02,,1.000000
3,10033,775,BEFREE,GeneticVariation,10021299.0,0.07,,1.000000
4,431,7385,BEFREE,Biomarker,10021369.0,0.01,,1.000000
...,...,...,...,...,...,...,...,...
3261319,30235,1698,ORPHANET,ChromosomalRearrangement,,0.30,,
3261320,30244,4952,CLINVAR,GeneticVariation,,0.20,,1.000000
3261321,30255,5756,GENOMICS_ENGLAND,Biomarker,,0.30,strong,
3261322,30255,5756,GENOMICS_ENGLAND,Biomarker,,0.30,strong,


## Match to MeSH terms

In [150]:
disgenet_diseases = pd.read_sql('select * from diseaseAttributes', conn)
disgenet_diseases

Unnamed: 0,diseaseNID,diseaseId,diseaseName,type
0,1,C0000727,"Abdomen, Acute",phenotype
1,2,C0000729,Abdominal Cramps,phenotype
2,3,C0000731,Abdomen distended,phenotype
3,4,C0000734,Abdominal mass,phenotype
4,5,C0000735,Abdominal Neoplasms,group
...,...,...,...,...
30288,30289,C4755314,Autosomal recessive cutis laxa type 2B,disease
30289,30290,C4757950,Isolated ATP synthase deficiency,disease
30290,30291,C4757951,Desmoplastic infantile astrocytoma and ganglio...,disease
30291,30292,C4759295,Non-metastatic prostate cancer,disease


In [151]:
disgenet_diseases['type'].drop_duplicates()

0    phenotype
4        group
6      disease
Name: type, dtype: object

In [152]:
disgenet_disease_umlsid_meshid = pd.read_table('associations/disease_mappings.tsv')
disgenet_disease_umlsid_meshid

Unnamed: 0,diseaseId,name,vocabulary,code,vocabularyName
0,C0018923,Hemangiosarcoma,DO,0001816,angiosarcoma
1,C0854893,Angiosarcoma non-metastatic,DO,0001816,angiosarcoma
2,C0033999,Pterygium,DO,0002116,pterygium
3,C0025517,Metabolic Diseases,DO,0014667,disease of metabolism
4,C0155862,Streptococcal pneumonia,DO,0040084,Streptococcus pneumonia
...,...,...,...,...,...
242884,C0279628,Adenocarcinoma Of Esophagus,ORDO,99976,Adenocarcinoma of the esophagus
242885,C0279626,Squamous cell carcinoma of esophagus,ORDO,99977,Squamous cell carcinoma of the esophagus
242886,C0206702,Klatskin Tumor,ORDO,99978,Klatskin tumor
242887,C0007462,Causalgia,ORDO,99994,Complex regional pain syndrome type 2


In [153]:
disgenet_disease_umlsid_meshid['vocabulary'].drop_duplicates()

0              DO
13636         EFO
18678         HPO
39433       ICD10
42221     ICD10CM
52440      ICD9CM
56511       MONDO
75023         MSH
104358        NCI
225070       OMIM
236355       ORDO
Name: vocabulary, dtype: object

In [154]:
disgenet_disease_umlsid_meshid = disgenet_disease_umlsid_meshid[disgenet_disease_umlsid_meshid['vocabulary'] == 'MSH']
disgenet_disease_umlsid_meshid = disgenet_disease_umlsid_meshid[disgenet_disease_umlsid_meshid['code'].str.contains('D')]
print('MeSH Descriptors only:', disgenet_disease_umlsid_meshid.shape[0], 'rows')

MeSH Descriptors only: 23939 rows


In [155]:
disgenet_disease_umlsid_meshid[~(disgenet_disease_umlsid_meshid['code'].isin(mesh_terms.keys()))]

Unnamed: 0,diseaseId,name,vocabulary,code,vocabularyName
80437,C0000921,Accidental Falls,MSH,D000058,Accidental Falls
80438,C0233514,Abnormal behavior,MSH,D000066553,Problem Behavior
80440,C0750731,"Physical Appearance, Body",MSH,D000067029,"Physical Appearance, Body"
80441,C3203533,Psychological Trauma,MSH,D000067073,Psychological Trauma
80443,C0150080,Social Communication Disorder,MSH,D000067404,Social Communication Disorder
...,...,...,...,...,...
104136,C0200665,Platelet mean volume determination (procedure),MSH,D063847,Mean Platelet Volume
104328,C3850153,Ovarian Reserve,MSH,D065851,Ovarian Reserve
104329,C1535926,Neurodevelopmental Disorders,MSH,D065886,Neurodevelopmental Disorders
104340,C4019167,Speech Sound Disorders,MSH,D066229,Speech Sound Disorders


In [156]:
print('In the MeSH tree, they are not \'disease\' (C)')
disgenet_disease_umlsid_meshid = disgenet_disease_umlsid_meshid[disgenet_disease_umlsid_meshid['code'].isin(mesh_terms.keys())]
print('Our MeSH terms only:', disgenet_disease_umlsid_meshid.shape[0], 'rows')
disgenet_disease_umlsid_meshid

In the MeSH tree, they are not 'disease' (C)
Our MeSH terms only: 23256 rows


Unnamed: 0,diseaseId,name,vocabulary,code,vocabularyName
80407,C0000727,"Abdomen, Acute",MSH,D000006,"Abdomen, Acute"
80408,C0000735,Abdominal Neoplasms,MSH,D000008,Abdominal Neoplasms
80409,C0000744,Abetalipoproteinemia,MSH,D000012,Abetalipoproteinemia
80410,C0000768,Congenital Abnormality,MSH,D000013,Congenital Abnormalities
80411,C0000768,Congenital Abnormality,MSH,D000013,Congenital Abnormality
...,...,...,...,...,...
104339,C0158731,Congenital pectus carinatum,MSH,D066166,Pectus Carinatum
104342,C3852953,Pulmonary Arterial Remodeling,MSH,D066253,Vascular Remodeling
104343,C3850148,Vascular Remodeling,MSH,D066253,Vascular Remodeling
104344,C3850148,Vascular Remodeling,MSH,D066253,Pulmonary Arterial Remodeling


In [157]:
print('UMLS IDs:', disgenet_disease_umlsid_meshid['diseaseId'].drop_duplicates().shape[0])
disgenet_disease_umlsid_meshid = disgenet_disease_umlsid_meshid[['diseaseId', 'code']].drop_duplicates()
disgenet_disease_umlsid_meshid = disgenet_disease_umlsid_meshid.set_index('diseaseId')
disgenet_disease_umlsid_meshid

UMLS IDs: 6499


Unnamed: 0_level_0,code
diseaseId,Unnamed: 1_level_1
C0000727,D000006
C0000735,D000008
C0000744,D000012
C0000768,D000013
C0000771,D000014
...,...
C0549159,D066088
C0876994,D066126
C0158731,D066166
C3852953,D066253


In [158]:
disgenet_diseases = disgenet_diseases.join(disgenet_disease_umlsid_meshid, on='diseaseId')
print('Originally:', disgenet_diseases.shape[0], 'UMLS disease terms')
disgenet_diseases = disgenet_diseases.dropna(subset=['code'])
print('With MeSH ID:', disgenet_diseases.shape[0], 'rows')
disgenet_diseases = disgenet_diseases.set_index('diseaseNID')
disgenet_diseases

Originally: 30293 UMLS disease terms
With MeSH ID: 6499 rows


Unnamed: 0_level_0,diseaseId,diseaseName,type,code
diseaseNID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,C0000727,"Abdomen, Acute",phenotype,D000006
2,C0000729,Abdominal Cramps,phenotype,D003085
5,C0000735,Abdominal Neoplasms,group,D000008
6,C0000737,Abdominal Pain,phenotype,D015746
7,C0000744,Abetalipoproteinemia,disease,D000012
...,...,...,...,...
29817,C4721916,HMSN Type V,disease,D015419
29821,C4721952,Familial Idiopathic Pulmonary Fibrosis,disease,D054990
29831,C4722227,Hypoprothrombinemias,disease,D007020
29840,C4722330,Generalized Thyroid Hormone Resistance,disease,D018382


In [159]:
gda_disgenet = gda_disgenet.join(disgenet_diseases, on='diseaseNID')
gda_disgenet

Unnamed: 0,diseaseNID,geneNID,source,associationType,pmid,score,EL,EI,diseaseId,diseaseName,type,code
0,2107,793,BEFREE,Biomarker,1000501.0,0.10,,0.956175,C0027651,Neoplasms,group,D009369
1,431,775,BEFREE,GeneticVariation,10021299.0,0.40,,0.987013,C0006826,Malignant Neoplasms,group,D009369
2,450,775,BEFREE,GeneticVariation,10021299.0,0.02,,1.000000,,,,
3,10033,775,BEFREE,GeneticVariation,10021299.0,0.07,,1.000000,C0476089,Endometrial Carcinoma,disease,D016889
4,431,7385,BEFREE,Biomarker,10021369.0,0.01,,1.000000,C0006826,Malignant Neoplasms,group,D009369
...,...,...,...,...,...,...,...,...,...,...,...,...
3261319,30235,1698,ORPHANET,ChromosomalRearrangement,,0.30,,,,,,
3261320,30244,4952,CLINVAR,GeneticVariation,,0.20,,1.000000,,,,
3261321,30255,5756,GENOMICS_ENGLAND,Biomarker,,0.30,strong,,,,,
3261322,30255,5756,GENOMICS_ENGLAND,Biomarker,,0.30,strong,,,,,


In [160]:
gda_disgenet = gda_disgenet.dropna(subset=['code'])
print('Match to MeSH terms:', gda_disgenet.shape[0], 'rows')
gda_disgenet

Match to MeSH terms: 2166659 rows


Unnamed: 0,diseaseNID,geneNID,source,associationType,pmid,score,EL,EI,diseaseId,diseaseName,type,code
0,2107,793,BEFREE,Biomarker,1000501.0,0.10,,0.956175,C0027651,Neoplasms,group,D009369
1,431,775,BEFREE,GeneticVariation,10021299.0,0.40,,0.987013,C0006826,Malignant Neoplasms,group,D009369
3,10033,775,BEFREE,GeneticVariation,10021299.0,0.07,,1.000000,C0476089,Endometrial Carcinoma,disease,D016889
4,431,7385,BEFREE,Biomarker,10021369.0,0.01,,1.000000,C0006826,Malignant Neoplasms,group,D009369
5,2480,7385,BEFREE,Biomarker,10021369.0,0.10,,0.944444,C0032580,Adenomatous Polyposis Coli,disease,D011125
...,...,...,...,...,...,...,...,...,...,...,...,...
3261039,29817,7081,ORPHANET,GermlineCausalMutation,,0.31,,1.000000,C4721916,HMSN Type V,disease,D015419
3261040,29821,4652,CTD_human,Biomarker,,0.31,,1.000000,C4721952,Familial Idiopathic Pulmonary Fibrosis,disease,D054990
3261041,29821,19627,CTD_human,Biomarker,,0.30,,,C4721952,Familial Idiopathic Pulmonary Fibrosis,disease,D054990
3261044,29831,1579,GENOMICS_ENGLAND,Biomarker,,0.60,strong,0.928571,C4722227,Hypoprothrombinemias,disease,D007020


## Match to HGNC

In [161]:
disgenet_genes = pd.read_sql('select geneNID, geneId, geneName, geneDescription from geneAttributes', conn)
disgenet_genes['geneId'] = disgenet_genes['geneId'].astype(str)
disgenet_genes

Unnamed: 0,geneNID,geneId,geneName,geneDescription
0,1,1,A1BG,alpha-1-B glycoprotein
1,2,2,A2M,alpha-2-macroglobulin
2,3,3,A2MP1,alpha-2-macroglobulin pseudogene 1
3,4,9,NAT1,N-acetyltransferase 1
4,5,10,NAT2,N-acetyltransferase 2
...,...,...,...,...
26132,26133,115482722,H3P33,H3 histone pseudogene 33
26133,26134,115482723,H3P40,H3 histone pseudogene 40
26134,26135,115561815,LNC-LBCS,"lncRNA bladder and prostate cancer suppressor,..."
26135,26136,115804232,CEROX1,cytoplasmic endogenous regulator of oxidative ...


In [162]:
disgenet_genes.dtypes

geneNID             int64
geneId             object
geneName           object
geneDescription    object
dtype: object

In [163]:
disgenet_genes = disgenet_genes[disgenet_genes['geneId'].isin(pcgs.keys())]
print('Our mRNAs only:', disgenet_genes.shape[0], 'rows')
disgenet_genes = disgenet_genes.set_index('geneNID')
disgenet_genes

Our mRNAs only: 16947 rows


Unnamed: 0_level_0,geneId,geneName,geneDescription
geneNID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,A1BG,alpha-1-B glycoprotein
2,2,A2M,alpha-2-macroglobulin
4,9,NAT1,N-acetyltransferase 1
5,10,NAT2,N-acetyltransferase 2
6,12,SERPINA3,serpin family A member 3
...,...,...,...
25927,111064649,SMIM33,small integral membrane protein 33
26089,113455421,DERPC,DERPC proline and glycine rich nuclear protein
26090,113523636,SMIM40,small integral membrane protein 40
26091,113523638,SMIM41,small integral membrane protein 41


In [164]:
gda_disgenet = gda_disgenet.join(disgenet_genes, on='geneNID')
gda_disgenet

Unnamed: 0,diseaseNID,geneNID,source,associationType,pmid,score,EL,EI,diseaseId,diseaseName,type,code,geneId,geneName,geneDescription
0,2107,793,BEFREE,Biomarker,1000501.0,0.10,,0.956175,C0027651,Neoplasms,group,D009369,1048,CEACAM5,CEA cell adhesion molecule 5
1,431,775,BEFREE,GeneticVariation,10021299.0,0.40,,0.987013,C0006826,Malignant Neoplasms,group,D009369,1026,CDKN1A,cyclin dependent kinase inhibitor 1A
3,10033,775,BEFREE,GeneticVariation,10021299.0,0.07,,1.000000,C0476089,Endometrial Carcinoma,disease,D016889,1026,CDKN1A,cyclin dependent kinase inhibitor 1A
4,431,7385,BEFREE,Biomarker,10021369.0,0.01,,1.000000,C0006826,Malignant Neoplasms,group,D009369,10297,APC2,APC regulator of WNT signaling pathway 2
5,2480,7385,BEFREE,Biomarker,10021369.0,0.10,,0.944444,C0032580,Adenomatous Polyposis Coli,disease,D011125,10297,APC2,APC regulator of WNT signaling pathway 2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3261039,29817,7081,ORPHANET,GermlineCausalMutation,,0.31,,1.000000,C4721916,HMSN Type V,disease,D015419,9927,MFN2,mitofusin 2
3261040,29821,4652,CTD_human,Biomarker,,0.31,,1.000000,C4721952,Familial Idiopathic Pulmonary Fibrosis,disease,D054990,6440,SFTPC,surfactant protein C
3261041,29821,19627,CTD_human,Biomarker,,0.30,,,C4721952,Familial Idiopathic Pulmonary Fibrosis,disease,D054990,729238,SFTPA2,surfactant protein A2
3261044,29831,1579,GENOMICS_ENGLAND,Biomarker,,0.60,strong,0.928571,C4722227,Hypoprothrombinemias,disease,D007020,2147,F2,"coagulation factor II, thrombin"


In [165]:
gda_disgenet = gda_disgenet.dropna(subset=['geneId'])
print('Match to HNGC:', gda_disgenet.shape[0], 'rows')
gda_disgenet

Match to HNGC: 2033056 rows


Unnamed: 0,diseaseNID,geneNID,source,associationType,pmid,score,EL,EI,diseaseId,diseaseName,type,code,geneId,geneName,geneDescription
0,2107,793,BEFREE,Biomarker,1000501.0,0.10,,0.956175,C0027651,Neoplasms,group,D009369,1048,CEACAM5,CEA cell adhesion molecule 5
1,431,775,BEFREE,GeneticVariation,10021299.0,0.40,,0.987013,C0006826,Malignant Neoplasms,group,D009369,1026,CDKN1A,cyclin dependent kinase inhibitor 1A
3,10033,775,BEFREE,GeneticVariation,10021299.0,0.07,,1.000000,C0476089,Endometrial Carcinoma,disease,D016889,1026,CDKN1A,cyclin dependent kinase inhibitor 1A
4,431,7385,BEFREE,Biomarker,10021369.0,0.01,,1.000000,C0006826,Malignant Neoplasms,group,D009369,10297,APC2,APC regulator of WNT signaling pathway 2
5,2480,7385,BEFREE,Biomarker,10021369.0,0.10,,0.944444,C0032580,Adenomatous Polyposis Coli,disease,D011125,10297,APC2,APC regulator of WNT signaling pathway 2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3261039,29817,7081,ORPHANET,GermlineCausalMutation,,0.31,,1.000000,C4721916,HMSN Type V,disease,D015419,9927,MFN2,mitofusin 2
3261040,29821,4652,CTD_human,Biomarker,,0.31,,1.000000,C4721952,Familial Idiopathic Pulmonary Fibrosis,disease,D054990,6440,SFTPC,surfactant protein C
3261041,29821,19627,CTD_human,Biomarker,,0.30,,,C4721952,Familial Idiopathic Pulmonary Fibrosis,disease,D054990,729238,SFTPA2,surfactant protein A2
3261044,29831,1579,GENOMICS_ENGLAND,Biomarker,,0.60,strong,0.928571,C4722227,Hypoprothrombinemias,disease,D007020,2147,F2,"coagulation factor II, thrombin"


In [166]:
print('It\'s difficult to decide how filter the data...')
print('Although the fields \'source\', \'pmid\', \'score\', \'EL\' and \'EI\' all present the strength of evidence, there are some conflicts between them...')
print('Finally, I choose to trust in the GDA Score.')
gda_disgenet = gda_disgenet[gda_disgenet['score'] >= 0.1]
print('Score >= 0.1:', gda_disgenet.shape[0], 'rows')
gda_disgenet

It's difficult to decide how filter the data...
Although the fields 'source', 'pmid', 'score', 'EL' and 'EI' all present the strength of evidence, there are some conflicts between them...
Finally, I choose to trust in the GDA Score.
Score >= 0.1: 1277194 rows


Unnamed: 0,diseaseNID,geneNID,source,associationType,pmid,score,EL,EI,diseaseId,diseaseName,type,code,geneId,geneName,geneDescription
0,2107,793,BEFREE,Biomarker,1000501.0,0.10,,0.956175,C0027651,Neoplasms,group,D009369,1048,CEACAM5,CEA cell adhesion molecule 5
1,431,775,BEFREE,GeneticVariation,10021299.0,0.40,,0.987013,C0006826,Malignant Neoplasms,group,D009369,1026,CDKN1A,cyclin dependent kinase inhibitor 1A
5,2480,7385,BEFREE,Biomarker,10021369.0,0.10,,0.944444,C0032580,Adenomatous Polyposis Coli,disease,D011125,10297,APC2,APC regulator of WNT signaling pathway 2
9,2480,5845,BEFREE,Biomarker,10021369.0,0.10,,0.769231,C0032580,Adenomatous Polyposis Coli,disease,D011125,8313,AXIN2,axin 2
14,2712,2236,MGD,Biomarker,10021458.0,1.00,strong,0.986842,C0036161,Sandhoff Disease,disease,D012497,3074,HEXB,hexosaminidase subunit beta
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3261039,29817,7081,ORPHANET,GermlineCausalMutation,,0.31,,1.000000,C4721916,HMSN Type V,disease,D015419,9927,MFN2,mitofusin 2
3261040,29821,4652,CTD_human,Biomarker,,0.31,,1.000000,C4721952,Familial Idiopathic Pulmonary Fibrosis,disease,D054990,6440,SFTPC,surfactant protein C
3261041,29821,19627,CTD_human,Biomarker,,0.30,,,C4721952,Familial Idiopathic Pulmonary Fibrosis,disease,D054990,729238,SFTPA2,surfactant protein A2
3261044,29831,1579,GENOMICS_ENGLAND,Biomarker,,0.60,strong,0.928571,C4722227,Hypoprothrombinemias,disease,D007020,2147,F2,"coagulation factor II, thrombin"


In [167]:
gda_disgenet[['source', 'associationType', 'pmid', 'EL', 'EI', 
    'diseaseName', 'code', 'geneId', 'geneName', 'geneDescription']].to_csv('our_data/edges/pcg_disease_disgenet.tsv', sep='\t', index=False)

In [168]:
print('There are', gda_disgenet['code'].drop_duplicates().shape[0], 'diseases')
print('There are', gda_disgenet['geneId'].drop_duplicates().shape[0], 'PCGs')
print('There are', gda_disgenet[['code', 'geneId']].drop_duplicates().shape[0], 'associations')

There are 2910 diseases
There are 11317 PCGs
There are 134796 associations


In [182]:
gda_disgenet['source'].value_counts()

# Curated Data: UNIPROT, CTD_human, ORPHANET, CLINGEN, GENOMICS_ENGLAND, CGI, PSYGENET
# These data contain GDAs provided by the expert curated resources.

# Animal Models Data: CTD_mouse, CTD_rat, MGD, RGD
# These data include GDAs provided by the resources containing information about animal models (currently rat and mouse) of disease.

# Inferred Data: HPO, CLINVAR, GWASCAT, GWASDB
# These data refer to GDAs inferred from HPO and from VDAs.

# Literature Data: LHGDN, BEFREE

# Conclusion

So far, statistics on the built dataset:  

||Source| $\|N_{miRNA}\|$ | $\|N_{PCG}\|$ | $\|N_{disease}\|$ | $\|E\|$ |
|---|---|---|---|---|---|
|miRNA|miRBase|1917|/|/|4513|
|PCG|HGNC|/|19258|/|1189771|
|PCG|HumanNet|/|/|/||
|disease|MeSH|/|/|5032|13086|
|miRNA-disease|RNADisease|1874|/|975|57349|
||HMDD 3.2|865|/|569|13081|
||HMDD 4.0|1179|/|1484|23559|
|miRNA-PCG|ENCORI|1855|14451|/|144625|
|PCG-disease|DisGeNet|/|11316|2910|134796|

Later, preprocessing (e.g. de-duplicating) also leads to some changes in the volume of data.  