# <p style="text-align: center;">RNA Knowledge Graph Build Data Preparation</p>
    
***
***

**Authors:** [ECavalleri](https://mail.google.com/mail/u/0/?view=cm&fs=1&tf=1&to=emanuele.cavalleri@studenti.unimi.it), [TJCallahan](https://mail.google.com/mail/u/0/?view=cm&fs=1&tf=1&to=callahantiff@gmail.com), [MMesiti](https://mail.google.com/mail/u/0/?view=cm&fs=1&tf=1&to=marco.mesiti@unimi.it), [GValentini](https://mail.google.com/mail/u/0/?view=cm&fs=1&tf=1&to=giorgio.valentini@unimi.it)

**GitHub Repositories:** [RNA-KG](https://github.com/AnacletoLAB/RNA-KG/), [PheKnowLator](https://github.com/callahantiff/PheKnowLator/)  
<!--- **Release:** **[v2.0.0](https://github.com/callahantiff/PheKnowLator/wiki/v2.0.0)** --->
  
<br>  
  
**Purpose:** This notebook serves as a script to download, process, map, and clean data in order to build edges for RNA-KG. For more information on the data sources utilized within this script, please see the [PheKnowLator Data Sources](https://github.com/callahantiff/PheKnowLator/wiki/v2-Data-Sources) Wiki page.

<br>

**Assumptions:**   
- Edge data downloads ➞ `../resources/edge_data`  
- Ontologies ➞ `../resources/ontologies`    
- Processed data write location ➞ `../resources/processed_data`  

<br>

**Dependencies:**   
- **Scripts**: This notebook utilizes several helper functions, which are stored in the [`data_utils.py`](https://github.com/callahantiff/PheKnowLator/blob/master/pkt_kg/utils/data_utils.py) and [`kg_utils.py`](https://github.com/callahantiff/PheKnowLator/blob/master/pkt_kg/utils/kg_utils.py) scripts. 
_____
***

## Table of Contents
***

### [Download and process Ontologies](#create-ontologies)

### [Download and create Identifier Maps ](#create-identifier-maps)   

### [Download and process Edge Datasets](#create-edges)  

____

## Set-Up Environment
_____

In [None]:
%%capture
# Install the packages listed in requirements.txt into the interpreter running this kernel;
# the %%capture cell magic above hides the installer output.
import sys
!{sys.executable} -m pip install -r requirements.txt
# Add the parent directory to the import path
# (presumably so that `pkt_kg` and `builds`, imported in the next cell, resolve -- TODO confirm).
sys.path.append('../')

In [None]:
# import needed libraries
import datetime
import glob
import itertools
import networkx
import numpy
import os
import pickle
import re
import requests
import tarfile
import shutil
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)
import re

from collections import Counter
from functools import reduce
from rdflib import Graph, Namespace, URIRef, BNode, Literal
from rdflib.namespace import OWL, RDF, RDFS
from reactome2py import content
from tqdm import tqdm
from typing import Dict

from pkt_kg.utils import * 
from builds.ontology_cleaning import *

from typing import Tuple

#### Define Global Variables

In [None]:
# root directory for all resources used by this notebook
resource_data_location = '../resources/'

# raw/intermediate files (downloads and internal-only maps); note it lives under processed_data/
unprocessed_data_location = '../resources/processed_data/unprocessed_data/'

# cleaned identifier maps are written here
processed_data_location = '../resources/processed_data/'

# merged ontology files (*_with_imports.owl) are written here
ontology_data_location = '../resources/ontologies/'

# edge lists are written here
edge_data_location = '../resources/edge_data/'

# base URL of the processed data published with the current PheKnowLator build
processed_url = 'https://storage.googleapis.com/pheknowlator/current_build/data/processed_data/'

# base URL of the original (unprocessed) data published with the current PheKnowLator build
original_url = 'https://storage.googleapis.com/pheknowlator/current_build/data/original_data/'

# path of the owltools executable used to merge ontology imports
owltools_location = '../pkt_kg/libs/owltools'

In [None]:
# Fetch the two relation metadata files of the PheKnowLator build
rel_txt = 'INVERSE_RELATIONS.txt'
data_downloader(processed_url + rel_txt, '../resources/relations_data/')
rel_txt = 'RELATIONS_LABELS.txt'
data_downloader(processed_url + rel_txt, '../resources/relations_data/')

# Read the relation labels, report how many rows were loaded, and show the first five
ro_data_label = pd.read_csv('../resources/relations_data/RELATIONS_LABELS.txt', sep='\t', header=0)

print('There are {edge_count} RO Relations and Labels'.format(edge_count=len(ro_data_label)))
ro_data_label.head(5)

***
***
## DOWNLOAD AND PROCESS ONTOLOGIES  <a class="anchor" id="create-ontologies"></a>
***
***

In [None]:
def download_ontology(ontology):
    """Download an OBO ontology with its import closure merged, unless it already exists.

    Runs owltools on ``http://purl.obolibrary.org/obo/<ontology>.owl`` and writes the
    result to ``<ontology_data_location><ontology>_with_imports.owl``. Nothing happens
    when that output file is already present.

    Args:
        ontology: ontology name as used in its OBO PURL (e.g. ``'go'``).

    Returns:
        None. Failures are reported on stdout; they do not raise (best effort).
    """
    import subprocess  # local import so the cell does not depend on the import cell

    output_path = ontology_data_location + ontology + '_with_imports.owl'
    if os.path.exists(output_path):
        return

    # An argument list (no shell) avoids quoting problems in paths and URLs.
    command = [owltools_location, 'http://purl.obolibrary.org/obo/' + ontology + '.owl',
               '--merge-import-closure', '-o', output_path]
    try:
        return_code = subprocess.run(command).returncode
    except OSError as error:
        # e.g. owltools is missing or not executable
        print('WARNING: could not run owltools for {}: {}'.format(ontology, error))
        return

    # os.system ignored the exit status; surface failures without aborting the notebook.
    if return_code != 0:
        print('WARNING: owltools exited with code {} for {}'.format(return_code, ontology))

# Ontologies to download and merge with their imports
onto_list = [
    'ro', 'chebi', 'pr', 'mondo', 'go',
    'pw', 'bto', 'so', 'hp', 'uberon',
    'cob', 'obi', 'ero', 'vo', 'clo',
]

# Each call is a no-op when the merged file already exists
for ontology in onto_list: download_ontology(ontology)

***
***
## DOWNLOAD AND CREATE MAPPING DATASETS  <a class="anchor" id="create-identifier-maps"></a>
***
***

### Mappings provided by PheKnowLator ecosystem

In [None]:
# Identifier maps published with the PheKnowLator build
map_list = ['DISEASE_MONDO_MAP.txt', 'ENSEMBL_GENE_ENTREZ_GENE_MAP.txt',
            'ENTREZ_GENE_PRO_ONTOLOGY_MAP.txt', 'GENE_SYMBOL_ENSEMBL_TRANSCRIPT_MAP.txt',
            'MESH_CHEBI_MAP.txt', 'ENTREZ_GENE_ENSEMBL_TRANSCRIPT_MAP.txt', 'STRING_PRO_ONTOLOGY_MAP.txt',
            'UNIPROT_ACCESSION_PRO_ONTOLOGY_MAP.txt']

# Processed edge/map files needed later in the notebook. The original list named
# HPA_GTEX_RNA_GENE_PROTEIN_EDGES.txt three times; each file is now listed once.
edge_list = ['ENTREZ_GENE_PRO_ONTOLOGY_MAP.txt',
             'ENTREZ_GENE_ENSEMBL_TRANSCRIPT_MAP.txt',
             'HPA_GTEX_RNA_GENE_PROTEIN_EDGES.txt',
             'UNIPROT_PROTEIN_CATALYST.txt',
             'UNIPROT_PROTEIN_COFACTOR.txt',
             'ENSEMBL_TRANSCRIPT_PROTEIN_ONTOLOGY_MAP.txt',
             'CLINVAR_VARIANT_GENE_DISEASE_PHENOTYPE_EDGES.txt']

for edge in edge_list:
    data_downloader(processed_url+edge, processed_data_location)

for map_txt in map_list:
    # same URL and destination as above: skip files that were just downloaded
    if map_txt not in edge_list:
        data_downloader(processed_url+map_txt, processed_data_location)

***
### New mappings

***
### Chemical description from ChEBI - ChEBI mapping


**Purpose:** To map Chemical description from ChEBI to ChEBI identifiers.

**Output:** None, this mapping will be used only internally.

In [None]:
# Get the label (and exact-match) strings of all ontology classes
def gets_ontology_class_label(graph: Graph) -> Tuple:
    """Map lower-cased label / exact-match values to the class URIs that carry them.

    Only triples whose subject is a URIRef are considered. A triple counts as a
    "label" triple when its predicate contains 'label' and as an "exact match"
    triple when its predicate contains 'exactmatch' (both case-insensitive).

    Args:
        graph: an rdflib Graph, iterated as (subject, predicate, object) triples.

    Returns:
        A tuple of two dicts, both keyed by the lower-cased object string:
          - key -> list of subject URI strings
          - key -> 'DbXref' (label triples; name kept for compatibility) or 'ExactMatch'
        When a key occurs in both groups the exact-match entry wins.
    """
    label_uris: Dict = dict()
    label_type: Dict = dict()
    match_uris: Dict = dict()
    match_type: Dict = dict()

    for subj, pred, obj in graph:
        # The exact-match filter used to test isinstance([0], URIRef), which is always
        # False, so exact matches were never collected; both groups now test the subject.
        if not isinstance(subj, URIRef):
            continue
        predicate, key = str(pred).lower(), str(obj).lower()
        if 'label' in predicate:
            label_uris.setdefault(key, []).append(str(subj))
            label_type[key] = 'DbXref'
        if 'exactmatch' in predicate:
            match_uris.setdefault(key, []).append(str(subj))
            match_type[key] = 'ExactMatch'

    return {**label_uris, **match_uris}, {**label_type, **match_type}

In [None]:
chebi_graph = Graph().parse(ontology_data_location + 'chebi_with_imports.owl')

# lower-cased label -> set of identifiers (last path segment of each class URI)
chebi_label = gets_ontology_class_label(chebi_graph)[0]
chebi_dict = {str(k): {str(i).split('/')[-1] for i in v} for k, v in chebi_label.items()}

# One "label<TAB>id1, id2, ..." line per label.
# - explicit UTF-8: the file is read back with pandas, which assumes UTF-8
# - sorted ids: set order varies between runs, sorting keeps the file reproducible
with open(unprocessed_data_location + 'DESC_CHEBI_MAP.txt', 'w', encoding='utf-8') as outfile:
    for k, v in chebi_dict.items():
        outfile.write(str(k) + '\t' + ', '.join(sorted(v)) + '\n')

In [None]:
# Reload the label -> ChEBI id map (no header row; column 0 = label, column 1 = ids)
desc_chebi_map = pd.read_csv(unprocessed_data_location + 'DESC_CHEBI_MAP.txt', sep='\t', header=None)
desc_chebi_map

***
### GO description from GO - GO mapping


**Purpose:** To map GO description from GO to GO identifiers.

**Output:** None, this mapping will be used only internally.

In [None]:
go_graph = Graph().parse(ontology_data_location + 'go_with_imports.owl')

# lower-cased label -> set of identifiers (last path segment of each class URI)
go_label = gets_ontology_class_label(go_graph)[0]
go_dict = {str(k): {str(i).split('/')[-1] for i in v} for k, v in go_label.items()}

# One "label<TAB>id1, id2, ..." line per label.
# - explicit UTF-8: the file is read back with pandas, which assumes UTF-8
# - sorted ids: set order varies between runs, sorting keeps the file reproducible
with open(unprocessed_data_location + 'DESC_GO_MAP.txt', 'w', encoding='utf-8') as outfile:
    for k, v in go_dict.items():
        outfile.write(str(k) + '\t' + ', '.join(sorted(v)) + '\n')

In [None]:
# Reload the label -> GO id map (no header row; column 0 = label, column 1 = ids)
desc_go_map = pd.read_csv(unprocessed_data_location + 'DESC_GO_MAP.txt', sep='\t', header=None)
desc_go_map

***
### PW description from PW - PW mapping


**Purpose:** To map PW description from PW to PW identifiers.

**Output:** None, this mapping will be used only internally.

In [None]:
pw_graph = Graph().parse(ontology_data_location + 'pw_with_imports.owl')

# lower-cased label -> set of identifiers (last path segment of each class URI)
pw_label = gets_ontology_class_label(pw_graph)[0]
pw_dict = {str(k): {str(i).split('/')[-1] for i in v} for k, v in pw_label.items()}

# One "label<TAB>id1, id2, ..." line per label.
# - explicit UTF-8: the file is read back with pandas, which assumes UTF-8
# - sorted ids: set order varies between runs, sorting keeps the file reproducible
with open(unprocessed_data_location + 'DESC_PW_MAP.txt', 'w', encoding='utf-8') as outfile:
    for k, v in pw_dict.items():
        outfile.write(str(k) + '\t' + ', '.join(sorted(v)) + '\n')

In [None]:
# Reload the label -> PW id map (no header row; column 0 = label, column 1 = ids)
desc_pw_map = pd.read_csv(unprocessed_data_location + 'DESC_PW_MAP.txt', sep='\t', header=None)
desc_pw_map

In [None]:
# Get the synonym (and exact-match) strings of all ontology classes
def gets_ontology_class_synonym(graph: Graph) -> Tuple:
    """Map lower-cased synonym / exact-match values to the class URIs that carry them.

    Only triples whose subject is a URIRef are considered. A triple counts as a
    "synonym" triple when its predicate contains 'synonym' and as an "exact match"
    triple when its predicate contains 'exactmatch' (both case-insensitive).

    Args:
        graph: an rdflib Graph, iterated as (subject, predicate, object) triples.

    Returns:
        A tuple of two dicts, both keyed by the lower-cased object string:
          - key -> list of subject URI strings
          - key -> 'DbXref' (synonym triples; name kept for compatibility) or 'ExactMatch'
        When a key occurs in both groups the exact-match entry wins.
    """
    synonym_uris: Dict = dict()
    synonym_type: Dict = dict()
    match_uris: Dict = dict()
    match_type: Dict = dict()

    for subj, pred, obj in graph:
        # The exact-match filter used to test isinstance([0], URIRef), which is always
        # False, so exact matches were never collected; both groups now test the subject.
        if not isinstance(subj, URIRef):
            continue
        predicate, key = str(pred).lower(), str(obj).lower()
        if 'synonym' in predicate:
            synonym_uris.setdefault(key, []).append(str(subj))
            synonym_type[key] = 'DbXref'
        if 'exactmatch' in predicate:
            match_uris.setdefault(key, []).append(str(subj))
            match_type[key] = 'ExactMatch'

    return {**synonym_uris, **match_uris}, {**synonym_type, **match_type}

# lower-cased synonym -> set of identifiers (last path segment of each class URI)
# NOTE: pw_dict is rebound here; it previously held the label-based map.
pw_syn = gets_ontology_class_synonym(pw_graph)[0]
pw_dict = {str(k): {str(i).split('/')[-1] for i in v} for k, v in pw_syn.items()}

# One "synonym<TAB>id1, id2, ..." line per synonym.
# - explicit UTF-8: the file is read back with pandas, which assumes UTF-8
# - sorted ids: set order varies between runs, sorting keeps the file reproducible
with open(unprocessed_data_location + 'SYN_PW_MAP.txt', 'w', encoding='utf-8') as outfile:
    for k, v in pw_dict.items():
        outfile.write(str(k) + '\t' + ', '.join(sorted(v)) + '\n')

In [None]:
syn_pw_map = pd.read_csv(unprocessed_data_location+'SYN_PW_MAP.txt', header=None, delimiter='\t')
# Stack synonyms on top of labels. DataFrame.append was removed in pandas 2.0;
# pd.concat gives the same result (original indices kept, no column sorting).
desc_pw_map = pd.concat([syn_pw_map, desc_pw_map])
desc_pw_map

***
### miRNA - miRBase mapping <a class="anchor" id="ensemblgene-entrezgene"></a>


**Purpose:** To map miRNA and stem-loop miRNA to miRBase identifiers.

**Output:** `MIRNA_MIRBASE_MAP.txt`

Provided by [miRBase](https://www.mirbase.org/).

In [None]:
# miRBase alias table (the next cell reads it as `aliases.txt`,
# so the downloader presumably unpacks the archive)
mirbase_aliases_url = 'https://www.mirbase.org/ftp/CURRENT/aliases.txt.zip'
data_downloader(mirbase_aliases_url, unprocessed_data_location)

In [None]:
# Column 0 = miRBase accession, column 1 = alias string
mirna_mirbase_map = pd.read_csv(unprocessed_data_location + 'aliases.txt', header=None, sep="\t")
# Drop the last character of every alias string (presumably a trailing ';' -- verify against the file)
mirna_mirbase_map[1] = mirna_mirbase_map[1].str.slice(stop=-1)
mirna_mirbase_map

In [None]:
# Turn the ';'-separated alias string into one row per alias
mirna_mirbase_map[1] = mirna_mirbase_map[1].str.split(pat=';')
mirna_mirbase_map = mirna_mirbase_map.explode(column=1)
mirna_mirbase_map.loc[:, [1, 0]]

In [None]:
# Write alias -> accession pairs, tab-separated, without header or index
mirna_mirbase_map.loc[:, [1, 0]].to_csv(
    processed_data_location + 'MIRNA_MIRBASE_MAP.txt', sep='\t', header=False, index=False)

***
### Disease Ontology (DO) - MONDO mapping <a class="anchor" id="ensemblgene-entrezgene"></a>


**Purpose:** To map DO identifiers to MONDO identifiers.

**Output:** `DOID_MONDO_MAP.txt`

In [None]:
mondo_graph = Graph().parse(ontology_data_location + 'mondo_with_imports.owl')
mondo_dbxref = gets_ontology_class_dbxrefs(mondo_graph)[0]

# Keep only cross-references whose key mentions 'doid'; write keys as DOID_<n>
# (':' replaced by '_', upper-cased) and values as the last URI segment with ':' -> '_'
mondo_dict = {}
for k, v in mondo_dbxref.items():
    if 'doid' not in str(k):
        continue
    doid_key = str(k).replace(':','_').upper()
    mondo_dict[doid_key] = {str(i).split('/')[-1].replace(':','_') for i in v}

list(mondo_dict.items())[:5]

In [None]:
# One "DOID<TAB>id1, id2, ..." line per DOID.
# - explicit UTF-8: the file is read back with pandas, which assumes UTF-8
# - sorted ids: set order varies between runs, sorting keeps the file reproducible
with open(processed_data_location + 'DOID_MONDO_MAP.txt', 'w', encoding='utf-8') as outfile:
    for k, v in mondo_dict.items():
        outfile.write(str(k) + '\t' + ', '.join(sorted(v)) + '\n')

In [None]:
# Reload the DOID -> MONDO map (no header row)
doid_mondo_map = pd.read_csv(processed_data_location + 'DOID_MONDO_MAP.txt', sep='\t', header=None)
doid_mondo_map

***
### Disease description from DO - DO mapping <a class="anchor" id="ensemblgene-entrezgene"></a>


**Purpose:** To map Disease descriptions from DO to DO identifiers.

**Output:** None, this mapping will be used only internally.

Provided by [mir2Disease](http://watson.compbio.iupui.edu:8080/miR2Disease/).

In [None]:
# Disease description / DO identifier list published by miR2Disease
disease_list_url = 'http://watson.compbio.iupui.edu:8080/miR2Disease/download/diseaseList.txt'
data_downloader(disease_list_url, unprocessed_data_location)

In [None]:
# The first line of the file is consumed as the header and then renamed
desc_do_map = pd.read_csv(unprocessed_data_location + 'diseaseList.txt', sep="\t")
desc_do_map.columns = ['desc', 'doid']

# Normalise: descriptions lower-cased, identifiers upper-cased with ':' -> '_'
lowered_desc = desc_do_map['desc'].str.lower()
upper_doid = desc_do_map['doid'].str.upper()
desc_do_map['desc'] = lowered_desc
desc_do_map['doid'] = upper_doid.str.replace(':', '_')
desc_do_map

***
### TCGA - MONDO mapping <a class="anchor" id="ensemblgene-entrezgene"></a>


**Purpose:** To manually map the 32 TCGA cancer types to MONDO ontology.

**Output:** `TCGA_MONDO_MAP.txt`

In [None]:
# Manually curated TCGA study abbreviation -> MONDO class
tcga_to_mondo = {
    'ACC': 'MONDO_0004971', 'BLCA': 'MONDO_0004163', 'BRCA': 'MONDO_0006256',
    'CESC': 'MONDO_0005131', 'CHOL': 'MONDO_0019087', 'COAD': 'MONDO_0002271',
    'DLBC': 'MONDO_0018905', 'ESCA': 'MONDO_0019086', 'GBM': 'MONDO_0018177',
    'HNSC': 'MONDO_0010150', 'KICH': 'MONDO_0017885', 'KIRC': 'MONDO_0005005',
    'KIRP': 'MONDO_0017884', 'LGG': 'MONDO_0005499', 'LIHC': 'MONDO_0007256',
    'LUAD': 'MONDO_0005061', 'LUSC': 'MONDO_0005097', 'MESO': 'MONDO_0005065',
    'OV': 'MONDO_0006046', 'PAAD': 'MONDO_0006047', 'PCPG': 'MONDO_0035540',
    'PRAD': 'MONDO_0005082', 'READ': 'MONDO_0002169', 'SARC': 'MONDO_0005089',
    'SKCM': 'MONDO_0005012', 'STAD': 'MONDO_0005036', 'TGCT': 'MONDO_0010108',
    'THCA': 'MONDO_0015075', 'THYM': 'MONDO_0006456', 'UCEC': 'MONDO_0000553',
    'UCS': 'MONDO_0006485', 'UVM': 'MONDO_0006486',
}

# Two unnamed columns (0 = TCGA code, 1 = MONDO id), rows in the order listed above
cancer_mondo_map = pd.DataFrame(data=[[code, mondo] for code, mondo in tcga_to_mondo.items()])

cancer_mondo_map.to_csv(processed_data_location + 'TCGA_MONDO_MAP.txt', sep='\t', header=False, index=False)

***
### Amino Acid - ChEBI mapping 


**Purpose:** To manually map amino acids to the ChEBI ontology (SO could've been used too).

**Output:** `AminoAcid_ChEBI_MAP.txt`

In [None]:
# Lists the distinct values of the 'Amino Acid' column (presumably the codes the map below must cover).
# NOTE(review): `tRNA_aa` is not created by any cell above this one, so a top-to-bottom
# run raises NameError here -- confirm where it is loaded and move this cell after it.
tRNA_aa['Amino Acid'].unique()

In [None]:
# Manually curated three-letter amino-acid code -> ontology class.
# Two unnamed columns: 0 = code, 1 = identifier.
aa_chebi_map = pd.DataFrame(data=[['Leu','CHEBI_25017'],
                                 ['Phe','CHEBI_28044'],
                                 ['Ala','CHEBI_16449'],
                                 ['Asn','CHEBI_22653'],
                                 ['Glu','CHEBI_18237'],
                                 ['His','CHEBI_27570'],
                                 ['Asp','CHEBI_22660'],
                                 # was CHEBI_22660, a copy of the Asp row above
                                 ['Cys','CHEBI_15356'],
                                 ['Gly','CHEBI_15428'],
                                 ['Ile','CHEBI_24898'],
                                 ['Lys','CHEBI_25094'],
                                 ['Met','CHEBI_16811'],
                                 ['Ser','CHEBI_17822'],
                                 ['Val','CHEBI_27266'],
                                 ['Gln','CHEBI_28300'],
                                 ['Arg','CHEBI_29016'],
                                 ['Pro','CHEBI_26271'],
                                 ['Thr','CHEBI_26986'],
                                 # the only non-ChEBI target in this map (a PR identifier)
                                 ['iMe','PR_000021937'],
                                 ['Trp','CHEBI_27897'],
                                 ['Tyr','CHEBI_18186']#,
                                 #['Sup','tRNA-Suppressor NOT GROUNDED']
                                 ])

aa_chebi_map.to_csv(processed_data_location + 'AminoAcid_ChEBI_MAP.txt', header=None, sep='\t', index=None)

***
### Gene symbol - PRO mapping <a class="anchor" id="ensemblgene-entrezgene"></a>


**Purpose:** To map gene symbols to PRO identifiers.

**Output:** `GENE_SYMBOL_PRO_ONTOLOGY_MAP.txt`

In [None]:
# Gene symbol -> Ensembl transcript map from PheKnowLator (no header row); preview columns 0 and 1
symbol_ensembl_map = pd.read_csv(
    processed_data_location + 'GENE_SYMBOL_ENSEMBL_TRANSCRIPT_MAP.txt', header=None, sep="\t")
symbol_ensembl_map.loc[:, [0, 1]]

In [None]:
# Ensembl transcript -> PRO map from PheKnowLator (no header row); preview columns 1 and 0
ensembl_pro_map = pd.read_csv(
    processed_data_location + 'ENSEMBL_TRANSCRIPT_PROTEIN_ONTOLOGY_MAP.txt', header=None, sep="\t")
ensembl_pro_map.loc[:, [1, 0]]

In [None]:
# Join on the transcript id: column 1 of the symbol map against column 0 of the PRO map.
# Both frames use the labels 0 and 1, so pandas suffixes them (_x = left, _y = right).
symbol_side = symbol_ensembl_map[[0,1]]
pro_side = ensembl_pro_map[[1,0]]
symbol_to_pro = symbol_side.merge(pro_side, left_on=[1], right_on=[0])

# Keep the left frame's column 0 and the right frame's column 1, one row per distinct pair
symbol_to_pro = symbol_to_pro.loc[:, ['0_x', '1_y']].drop_duplicates()
symbol_to_pro

In [None]:
# Persist the distinct symbol -> PRO pairs, tab-separated, without header or index
unique_symbol_to_pro = symbol_to_pro.drop_duplicates()
unique_symbol_to_pro.to_csv(processed_data_location + 'GENE_SYMBOL_PRO_ONTOLOGY_MAP.txt',
                            sep='\t', header=False, index=False)

***
### PRO label - PRO mapping


**Purpose:** To map PRO labels to PRO identifiers.

**Output:** None, this mapping will be used only internally.

In [None]:
pro_graph = Graph().parse(ontology_data_location + 'pr_with_imports.owl')

# lower-cased label -> set of identifiers (last path segment of each class URI)
pro_label = gets_ontology_class_label(pro_graph)[0]
pro_dict = {str(k): {str(i).split('/')[-1] for i in v} for k, v in pro_label.items()}

# One "label<TAB>id1, id2, ..." line per label.
# - explicit UTF-8: the file is read back with pandas, which assumes UTF-8
# - sorted ids: set order varies between runs, sorting keeps the file reproducible
with open(unprocessed_data_location + 'DESC_PRO_MAP.txt', 'w', encoding='utf-8') as outfile:
    for k, v in pro_dict.items():
        outfile.write(str(k) + '\t' + ', '.join(sorted(v)) + '\n')

In [None]:
desc_pro_map = pd.read_csv(unprocessed_data_location+'DESC_PRO_MAP.txt', header=None, delimiter='\t', dtype=object)

# Normalise the labels. `regex` is passed explicitly because the pandas default changed
# from True to False in 2.0: without it the ",(.*)" pattern would be treated as literal
# text on recent pandas and nothing after the comma would be stripped.
desc_pro_map[0] = desc_pro_map[0].str.replace("human", '', regex=False)
desc_pro_map[0] = desc_pro_map[0].str.replace("(", '', regex=False)
desc_pro_map[0] = desc_pro_map[0].str.replace(")", '', regex=False)
# drop everything from the first comma to the end of the label
desc_pro_map[0] = desc_pro_map[0].str.replace(r",(.*)", '', regex=True)
desc_pro_map

***
### Gene symbol - ENTREZ mapping <a class="anchor" id="ensemblgene-entrezgene"></a>


**Purpose:** To map gene symbols to ENTREZ identifiers.

**Output:** `GENE_SYMBOL_ENTREZ_ID_MAP.txt`

In [None]:
# Entrez gene -> Ensembl transcript map from PheKnowLator (no header row)
entrez_enst_map = pd.read_csv(
    processed_data_location + 'ENTREZ_GENE_ENSEMBL_TRANSCRIPT_MAP.txt', header=None, sep="\t")
entrez_enst_map

In [None]:
# Join both maps on their shared column 1; the clashing column 0 is suffixed by pandas
# ('0_x' from the symbol map, '0_y' from the Entrez map)
symbol_entrez_map = symbol_ensembl_map.merge(entrez_enst_map, on=[1])
symbol_entrez_map = symbol_entrez_map.loc[:, ['0_x', '0_y']].drop_duplicates()
symbol_entrez_map

In [None]:
# Persist the symbol -> Entrez pairs, tab-separated, without header or index
symbol_entrez_map.to_csv(
    processed_data_location + 'GENE_SYMBOL_ENTREZ_ID_MAP.txt', sep='\t', header=False, index=False)

***
### tsRNA - tRNA mapping 

**Purpose:** To map tsRNA to tRNA identifiers.

**Output:** `tRNA_tsRNA_MAP.txt`

Provided by [tsRFun](https://rna.sysu.edu.cn/tsRFun/index.php).

In [None]:
# Currently offline
tsrfun_url = 'https://rna.sysu.edu.cn/tsRFun/download/newID_20210202.txt'
data_downloader(tsrfun_url, unprocessed_data_location)

# Keep only the tRNA and tsRNA identifier columns, in that order
tsRNA_tRF_map = pd.read_csv(unprocessed_data_location + 'newID_20210202.txt', sep="\t")
tsRNA_tRF_map = tsRNA_tRF_map.loc[:, ['tRNA', 'tsRNAid']]
tsRNA_tRF_map

tsRNA_tRF_map.to_csv(processed_data_location + 'tRNA_tsRNA_MAP.txt', sep='\t', header=False, index=False)

In [None]:
# Reload the stored tRNA -> tsRNA map (no header row)
tsRNA_tRF_map = pd.read_csv(processed_data_location + 'tRNA_tsRNA_MAP.txt', header=None, sep="\t")
tsRNA_tRF_map

***
### ribozyme - RFAM mapping 

**Purpose:** To map ribozymes to Rfam identifiers.

**Output:** `ribozyme_RFAM_MAP.txt`

In [None]:
# Manually curated ribozyme name -> Rfam family/clan path
ribozyme_rows = [
    ('LC ribozyme', 'family/RF00011'),
    ('hammerhead ribozyme', 'clan/CL00010'),
    ('glmS ribozyme', 'family/RF00234'),
    ('HDV-F-prausnitzii', 'family/RF02682'),
    ('HDV ribozyme', 'family/RF00094'),
    ('HDV_ribozyme', 'family/RF00094'),
    ('Hairpin', 'family/RF00173'),
    ('Hammerhead_1', 'clan/CL00010'),
    ('Hammerhead_HH9', 'clan/CL00010'),
    ('Hammerhead_3', 'clan/CL00010'),
    ('Hammerhead_HH10', 'clan/CL00010'),
    ('Hammerhead_II', 'clan/CL00010'),
    ('Pistol', 'family/RF02679'),
    ('Pistol ribozyme', 'family/RF02679'),
    ('twister ribozyme', 'clan/CL00120'),
    ('Twister-P5', 'clan/CL00120'),
    ('Twister-P3', 'clan/CL00120'),
    ('RNAse P', 'family/RF00009'),
    # 'VS ribozyme' is left out: absent in RFAM
]

# Two unnamed columns: 0 = ribozyme name, 1 = Rfam path
ribozyme_rfam_map = pd.DataFrame(data=ribozyme_rows)

ribozyme_rfam_map.to_csv(processed_data_location + 'ribozyme_RFAM_MAP.txt', sep='\t', header=False, index=False)

***
### MINTbase - GtRNAdb tRNA mapping 

**Purpose:** To map MINTbase to GtRNAdb identifiers.

**Output:** `tRNA_MINTbase_GtRNAdb_MAP.txt`

Provided by [MINTbase](https://cm.jefferson.edu/MINTbase/).

In [None]:
# The mapping file is read from the unprocessed folder; no cell above downloads it
tRNA_MINTbase_GtRNAdb_map = pd.read_csv(unprocessed_data_location + 'MINTbase-gtRNAdb_mapping.txt', sep='\t')

# Keep the two name columns and drop rows whose GtRNAdb name is '-'
tRNA_MINTbase_GtRNAdb_map = tRNA_MINTbase_GtRNAdb_map.loc[:, ['MINTbase tRNA name', 'gtRNAdb name']]
has_gtrnadb_name = tRNA_MINTbase_GtRNAdb_map['gtRNAdb name'] != '-'
tRNA_MINTbase_GtRNAdb_map = tRNA_MINTbase_GtRNAdb_map[has_gtrnadb_name]
tRNA_MINTbase_GtRNAdb_map

In [None]:
# Persist the MINTbase -> GtRNAdb name pairs, tab-separated, without header or index
tRNA_MINTbase_GtRNAdb_map.to_csv(processed_data_location + 'tRNA_MINTbase_GtRNAdb_MAP.txt',
                                 sep='\t', header=False, index=False)

***
### BTO label - BTO mapping


**Purpose:** To map BTO labels to BTO identifiers.

**Output:** `DESC_BTO_MAP.txt`

In [None]:
bto_graph = Graph().parse(ontology_data_location + 'bto_with_imports.owl')

# lower-cased label -> set of identifiers (last path segment of each class URI)
bto_label = gets_ontology_class_label(bto_graph)[0]
bto_dict = {str(k): {str(i).split('/')[-1] for i in v} for k, v in bto_label.items()}

# One "label<TAB>id1, id2, ..." line per label.
# - explicit UTF-8: the file is read back with pandas, which assumes UTF-8
# - sorted ids: set order varies between runs, sorting keeps the file reproducible
with open(unprocessed_data_location + 'DESC_BTO_MAP.txt', 'w', encoding='utf-8') as outfile:
    for k, v in bto_dict.items():
        outfile.write(str(k) + '\t' + ', '.join(sorted(v)) + '\n')

In [None]:
desc_bto_map = pd.read_csv(unprocessed_data_location+'DESC_BTO_MAP.txt', header=None, delimiter='\t', dtype=object)

# Normalise the labels. `regex` is passed explicitly because the pandas default changed
# from True to False in 2.0: without it the ",(.*)" pattern would be treated as literal
# text on recent pandas and nothing after the comma would be stripped.
desc_bto_map[0] = desc_bto_map[0].str.replace("human", '', regex=False)
desc_bto_map[0] = desc_bto_map[0].str.replace("(", '', regex=False)
desc_bto_map[0] = desc_bto_map[0].str.replace(")", '', regex=False)
# drop everything from the first comma to the end of the label
desc_bto_map[0] = desc_bto_map[0].str.replace(r",(.*)", '', regex=True)
desc_bto_map

***
***
## DOWNLOAD AND PROCESS EDGE DATASETS  <a class="anchor" id="create-edges"></a>
***
***

## Edges already provided by PheKnowLator ecosystem

In [None]:
# Original edge files published with the PheKnowLator build. The list used to name
# CTD_chem_gene_ixns.tsv and curated_gene_disease_associations.tsv twice, which only
# repeated the same download; each file is now listed once.
edge_files = ['CTD_chem_gene_ixns.tsv',
              'CTD_chem_go_enriched.tsv',
              'ChEBI2Reactome_All_Levels.txt',
              'CTD_chemicals_diseases.tsv',
              'phenotype.hpoa',
              'curated_gene_disease_associations.tsv',
              'COMBINED.DEFAULT_NETWORKS.BP_COMBINING.txt',
              'CTD_genes_pathways.tsv',
              'gene_association.reactome',
              'goa_human.gaf',
              'UniProt2Reactome_All_Levels.txt',
              '9606.protein.links.v11.0.txt']

for edge in edge_files:
    data_downloader(original_url+edge, edge_data_location)

### Gene-RNA

In [None]:
# Entrez gene -> Ensembl transcript map; list the distinct values of column 3,
# which the following cells treat as the transcript type
gene_rna = pd.read_csv(
    processed_data_location + 'ENTREZ_GENE_ENSEMBL_TRANSCRIPT_MAP.txt', header=None, sep='\t')
gene_rna[3].unique()

In [None]:
# Collapse fine-grained transcript type values into the coarser RNA classes used for edges
rna_type_groups = {
    'mRNA': ['processed_transcript', 'protein_coding', 'non_stop_decay', 'nonsense_mediated_decay'],
    'pseudo': ['transcribed_processed_pseudogene', 'transcribed_unitary_pseudogene',
               'transcribed_unprocessed_pseudogene', 'polymorphic_pseudogene',
               'unprocessed_pseudogene', 'processed_pseudogene', 'unitary_pseudogene',
               'pseudogene'],
    'mt_tRNA': ['Mt_tRNA'],
}
rna_type_map = {old: new for new, olds in rna_type_groups.items() for old in olds}

# Applied in place to every column of the frame, exactly like a literal dict would be
gene_rna.replace(rna_type_map, inplace=True)

In [None]:
# 1) miRNA rows, joined to gene symbols on column 0 ('0_y' of the symbol map renamed to 0)
mirna_rows = gene_rna[gene_rna[3] == 'miRNA']
gene_premiRNA2511 = mirna_rows.merge(symbol_entrez_map.rename(columns={'0_y':0}), on=0)

# 2) rewrite the symbols into miRBase-style names: lower-case, 'hsa-' prefix,
#    a dash after 'mir' / 'let'
lowered_symbols = gene_premiRNA2511['0_x'].str.lower()
gene_premiRNA2511['0_x'] = 'hsa-' + lowered_symbols.str.replace('mir','mir-').str.replace('let','let-')

# 3) join on that name against the miRBase alias table
named_genes = gene_premiRNA2511.rename(columns={'0_x':'a'})
alias_table = mirna_mirbase_map.rename(columns={1:'a'})
gene_premiRNA2511 = named_genes.merge(alias_table, on='a')
gene_premiRNA2511

In [None]:
# Distinct ('0_x', '0_y') pairs: column 0 of each side of the last merge
gene_premirna_edges = gene_premiRNA2511.loc[:, ['0_x', '0_y']].drop_duplicates()
gene_premirna_edges.to_csv(edge_data_location + 'Hgene-premiRNA.txt', sep='\t', header=False, index=False)

In [None]:
# Column 6 = "<column 0>#<column 3>", the identifier used for the non-miRNA edge files
gene_rna[6] = gene_rna[0].astype(str).str.cat(gene_rna[3].astype(str), sep='#')

In [None]:
# One edge file per distinct value of column 3 (miRNA is written by a separate cell)
for i in set(gene_rna[3]):
    if i == 'miRNA':
        continue

    gene_rna_ = gene_rna[gene_rna[3]==i]
    if gene_rna_.empty:
        continue

    # distinct (column 0, column 6) pairs
    gene_rna_.loc[:, [0, 6]].drop_duplicates().to_csv(
        edge_data_location + 'Hgene-' + i + '.txt', sep='\t', header=False, index=False)

### RNA-protein

In [None]:
# Transcript -> PRO map, restricted to rows flagged 'protein-coding' in column 4
mRNA_protein = pd.read_csv(
    processed_data_location + 'ENSEMBL_TRANSCRIPT_PROTEIN_ONTOLOGY_MAP.txt', header=None, sep='\t')
mRNA_protein = mRNA_protein[mRNA_protein[4] == 'protein-coding']

# Join on the transcript id (column 0 here, column 1 of the Entrez map), then keep
# column 0 (from the Entrez map) and column 1 (from the PRO map)
pro_by_transcript = mRNA_protein.rename(columns={0:'a'})
entrez_by_transcript = entrez_enst_map.rename(columns={1:'a'})
mRNA_protein = pro_by_transcript.merge(entrez_by_transcript, on='a')[[0, 1]]

# Tag column 0 with the '#mRNA' suffix
mRNA_protein[0] = mRNA_protein[0].astype(str) + '#mRNA'
mRNA_protein

In [None]:
# Persist the distinct mRNA -> protein pairs, tab-separated, without header or index
unique_mrna_protein = mRNA_protein.drop_duplicates()
unique_mrna_protein.to_csv(edge_data_location + 'HmRNA-protein.txt', sep='\t', header=False, index=False)

### RNA-anatomy

In [None]:
# HPA/GTEx rows with transcript-level evidence (column 3) for anatomy (column 4)
RNA_anatomy = pd.read_csv(
    processed_data_location + 'HPA_GTEX_RNA_GENE_PROTEIN_EDGES.txt', header=None, sep='\t')
is_transcript_level = RNA_anatomy[3]=='Evidence at transcript level'
is_anatomy = RNA_anatomy[4]=='anatomy'
RNA_anatomy = RNA_anatomy[is_transcript_level & is_anatomy]

# Cast both join keys to strings so they match (both frames are modified in place)
symbol_entrez_map['0_y'] = symbol_entrez_map['0_y'].astype(str)
gene_rna[0] = gene_rna[0].astype(str)

# gene_rna joined to the symbol map on '0_y', then to the filtered rows on the symbol ('a')
rna_pro = gene_rna.rename(columns={0:'0_y'}).merge(symbol_entrez_map, on='0_y')
RNA_anatomy = rna_pro.rename(columns={'0_x':'a'}).merge(RNA_anatomy.rename(columns={1:'a'}), on='a')

RNA_anatomy

In [None]:
# .copy(): work on an independent frame so the column assignment below does not target
# a filtered slice of RNA_anatomy (avoids pandas' SettingWithCopyWarning).
mirna_anatomy1025 = RNA_anatomy[RNA_anatomy['3_x'] == 'miRNA'].copy()

# Rewrite gene symbols into miRBase-style names: lower-case, 'hsa-' prefix,
# a dash after 'mir' / 'let'
mirna_anatomy1025['a'] = 'hsa-' + mirna_anatomy1025['a'].str.lower().str.replace(
    'mir','mir-').str.replace('let','let-')

# '0_y' is renamed to 'b' first so that, after the merge, '0_y' refers to column 0 of
# the miRBase alias table.
mirna_anatomy1025 = pd.merge(mirna_anatomy1025.rename(columns={'0_y':'b'}), mirna_mirbase_map.rename(columns={1:'a'}), on='a')
mirna_anatomy1025

In [None]:
# Distinct ('0_y', '5_y') pairs, tab-separated, without header or index
premirna_anatomy_edges = mirna_anatomy1025.loc[:, ['0_y', '5_y']].drop_duplicates()
premirna_anatomy_edges.to_csv(edge_data_location + 'HpremiRNA-anatomy.txt', sep='\t', header=False, index=False)

In [None]:
# Column 6 = "<'0_y'>#<'3_x'>", the identifier used for the non-miRNA edge files
RNA_anatomy[6] = RNA_anatomy['0_y'].astype(str).str.cat(RNA_anatomy['3_x'].astype(str), sep='#')

In [None]:
# One edge file per distinct value of '3_x' (miRNA is written by a separate cell)
for i in set(RNA_anatomy['3_x']):
    if i == 'miRNA':
        continue

    RNA_anatomy_ = RNA_anatomy[RNA_anatomy['3_x']==i]
    if RNA_anatomy_.empty:
        continue

    # distinct (column 6, '5_y') pairs
    RNA_anatomy_.loc[:, [6, '5_y']].drop_duplicates().to_csv(
        edge_data_location + 'H' + i + '-anatomy.txt', sep='\t', header=False, index=False)

### RNA-cell

In [None]:
# HPA/GTEx rows with transcript-level evidence (column 3) for cell lines (column 4)
RNA_cell = pd.read_csv(
    processed_data_location + 'HPA_GTEX_RNA_GENE_PROTEIN_EDGES.txt', header=None, sep='\t')
is_transcript_level = RNA_cell[3]=='Evidence at transcript level'
is_cell_line = RNA_cell[4]=='cell line'
RNA_cell = RNA_cell[is_transcript_level & is_cell_line]

# Cast both join keys to strings so they match (both frames are modified in place)
symbol_entrez_map['0_y'] = symbol_entrez_map['0_y'].astype(str)
gene_rna[0] = gene_rna[0].astype(str)

# gene_rna joined to the symbol map on '0_y', then to the filtered rows on the symbol ('a')
rna_pro = gene_rna.rename(columns={0:'0_y'}).merge(symbol_entrez_map, on='0_y')
RNA_cell = rna_pro.rename(columns={'0_x':'a'}).merge(RNA_cell.rename(columns={1:'a'}), on='a')

RNA_cell

In [None]:
# .copy(): work on an independent frame so the column assignment below does not target
# a filtered slice of RNA_cell (avoids pandas' SettingWithCopyWarning).
mirna_cell1025 = RNA_cell[RNA_cell['3_x'] == 'miRNA'].copy()

# Rewrite gene symbols into miRBase-style names: lower-case, 'hsa-' prefix,
# a dash after 'mir' / 'let'
mirna_cell1025['a'] = 'hsa-' + mirna_cell1025['a'].str.lower().str.replace(
    'mir','mir-').str.replace('let','let-')

# '0_y' is renamed to 'b' first so that, after the merge, '0_y' refers to column 0 of
# the miRBase alias table.
mirna_cell1025 = pd.merge(mirna_cell1025.rename(columns={'0_y':'b'}), mirna_mirbase_map.rename(columns={1:'a'}), on='a')
mirna_cell1025.head()

In [None]:
# Distinct ('0_y', '5_y') pairs, tab-separated, without header or index
premirna_cell_edges = mirna_cell1025.loc[:, ['0_y', '5_y']].drop_duplicates()
premirna_cell_edges.to_csv(edge_data_location + 'HpremiRNA-cell.txt', sep='\t', header=False, index=False)

In [None]:
# Column 6 = "<'0_y'>#<'3_x'>", the identifier used for the non-miRNA edge files
RNA_cell[6] = RNA_cell['0_y'].astype(str).str.cat(RNA_cell['3_x'].astype(str), sep='#')

In [None]:
# One edge file per distinct value of '3_x' (miRNA is written by a separate cell)
for i in set(RNA_cell['3_x']):
    if i == 'miRNA':
        continue

    RNA_cell_ = RNA_cell[RNA_cell['3_x']==i]
    if RNA_cell_.empty:
        continue

    # distinct (column 6, '5_y') pairs
    RNA_cell_.loc[:, [6, '5_y']].drop_duplicates().to_csv(
        edge_data_location + 'H' + i + '-cell.txt', sep='\t', header=False, index=False)

***
## New edges

***
### precursor miRNA-miRNA
* [miRBase](https://www.mirbase.org/) <br />  The miRBase database is a searchable database of published miRNA sequences and annotation. Each entry represents a predicted hairpin portion of a miRNA transcript (termed mir in the database), with information on the location and sequence of the mature miRNA sequence (termed miR).

In [None]:
# miRBase hairpin / mature miRNA spreadsheet (the next cell reads it as `miRNA.xls`)
mirbase_mirna_url = 'https://www.mirbase.org/ftp/CURRENT/miRNA.xls.gz'
data_downloader(mirbase_mirna_url, unprocessed_data_location)

In [None]:
# Only the first ten columns of the spreadsheet are used
premiRNA_miRNA = pd.read_excel(unprocessed_data_location + 'miRNA.xls').iloc[:, :10]

# For the time being, we keep only Homo sapiens rows (ID starting with "hsa")
is_human = premiRNA_miRNA['ID'].str.startswith("hsa")
premiRNA_miRNA = premiRNA_miRNA[is_human]
premiRNA_miRNA

In [None]:
# A hairpin can yield up to two mature products; this cell keeps the hairpin <-> Mature1 relations.
# NOTE(review): the earlier comment called Mature1 the -3p arm; in miRBase Mature1 is
# usually the -5p product -- confirm against the data.
first_seven_columns = premiRNA_miRNA.iloc[:, :7]
premiRNA_miRNAmature1 = first_seven_columns[first_seven_columns['Mature1_ID'].notna()]

# Drop the "1" from the column names so both mature tables share one schema
premiRNA_miRNAmature1 = premiRNA_miRNAmature1.rename(
    columns={'Mature1_Acc': 'Mature_Acc', 'Mature1_ID': 'Mature_ID', 'Mature1_Seq': 'Mature_Seq'})
premiRNA_miRNAmature1

In [None]:
# Same as for Mature1, but for the second mature product: remove the columns at
# positions 4-6 and keep rows that have a Mature2 entry
mature1_columns = premiRNA_miRNA.columns[4:7]
premiRNA_miRNAmature2 = premiRNA_miRNA.drop(mature1_columns, axis=1)
premiRNA_miRNAmature2 = premiRNA_miRNAmature2[premiRNA_miRNAmature2['Mature2_ID'].notna()]

# Drop the "2" from the column names so both mature tables share one schema
premiRNA_miRNAmature2 = premiRNA_miRNAmature2.rename(
    columns={'Mature2_Acc': 'Mature_Acc', 'Mature2_ID': 'Mature_ID', 'Mature2_Seq': 'Mature_Seq'})
premiRNA_miRNAmature2

In [None]:
# Stack both mature tables. DataFrame.append was removed in pandas 2.0;
# pd.concat gives the same result (original indices kept, no column sorting).
premiRNA_miRNAmature = pd.concat([premiRNA_miRNAmature1, premiRNA_miRNAmature2])
# Move the mature accession next to the hairpin accession (position 1)
premiRNA_miRNAmature.insert(1, 'Mature_Acc', premiRNA_miRNAmature.pop("Mature_Acc"))
premiRNA_miRNAmature

In [None]:
# Distinct hairpin accession -> mature accession pairs, tab-separated, no header or index
premirna_mirna_edges = premiRNA_miRNAmature.loc[:, ['Accession', 'Mature_Acc']].drop_duplicates()
premirna_mirna_edges.to_csv(edge_data_location + 'RpremiRNA-miRNA.txt', sep='\t', header=False, index=False)

***
### miRNA-mRNA
* [miRDB](https://mirdb.org/index.html) <br />  miRDB is an online database for miRNA target prediction and functional annotations. All the targets in miRDB were predicted by a bioinformatics tool, MirTarget, which was developed by analyzing thousands of miRNA-target interactions from high-throughput sequencing experiments.

In [None]:
# miRDB v6.0 target predictions (the next cell reads the file without the .gz suffix)
mirdb_url = 'https://mirdb.org/download/miRDB_v6.0_prediction_result.txt.gz'
data_downloader(mirdb_url, unprocessed_data_location)

In [None]:
# The file has no header; columns are named here
mirdb_columns = ['miRNA', 'mRNA', 'score']
miRNA_mRNA = pd.read_csv(unprocessed_data_location + 'miRDB_v6.0_prediction_result.txt',
                         sep='\t', names=mirdb_columns)

# For the time being, we keep only Homo sapiens rows
is_human = miRNA_mRNA['miRNA'].str.startswith("hsa")
miRNA_mRNA = miRNA_mRNA[is_human]

# Score cut-off, following the guidance quoted from miRDB:
#   - prediction scores range from 50 to 100; higher means more confidence
#   - a target with score > 80 is most likely to be real
#   - below 60, other supporting evidence is recommended
# Only predictions scoring above 80 are kept.
is_confident = miRNA_mRNA['score']>80
miRNA_mRNA = miRNA_mRNA[is_confident]
miRNA_mRNA

***
* [miRecords](http://c1.accurascience.com/miRecords/download_data.php?v=4) <br />  miRecords is a resource for animal miRNA-target interactions.

In [None]:
#http://c1.accurascience.com/miRecords/download_data.php?v=4
# Pipeline over the miRecords spreadsheet:
#   - for the time being, keep only rows where both the target gene species and
#     the miRNA species contain "apiens" (Homo sapiens);
#   - strip everything from the first "." onwards in the RefSeq accession;
#   - rename the join columns to 'mRNA' / 'miRNA' and drop the species/name columns.
miRNA_mRNA2 = (
    pd.read_excel(unprocessed_data_location+"miRecords_version4.xls")
    .loc[lambda df: df['Target gene_species_scientific'].str.contains("apiens")]
    .loc[lambda df: df['miRNA_species'].str.contains("apiens")]
    .assign(**{'Target gene_Refseq_acc':
               lambda df: df['Target gene_Refseq_acc'].str.split(".").str[0]})
    .rename(columns={'Target gene_Refseq_acc': 'mRNA', 'miRNA_mature_ID': 'miRNA'})
    .drop(columns=['Target gene_species_scientific', 'Target gene_name', 'miRNA_species'])
)

miRNA_mRNA2

In [None]:
# Outer-join the miRDB and miRecords tables on the (mRNA, miRNA) pair so that
# pairs present in only one of the two sources are kept.
mRNA_miRNA = miRNA_mRNA.merge(miRNA_mRNA2, on=['mRNA', 'miRNA'], how='outer')
mRNA_miRNA

In [None]:
# Dump the 'mRNA' column (one value per line, no header/index) to mRNA.txt.
# NOTE(review): the next cell reads 'mRNA.csv', which nothing visible here
# produces - presumably an external accession -> Entrez mapping step consumes
# mRNA.txt; confirm and document it.
mRNA_miRNA[['mRNA']].to_csv(unprocessed_data_location + 'mRNA.txt', header=None, index=None)

In [None]:
# Load the accession mapping table and expose its 'ACCNUM' column under the
# name 'mRNA' so it can serve as a join key.
mRNA = pd.read_csv(unprocessed_data_location+'mRNA.csv').rename(columns={'ACCNUM': 'mRNA'})
mRNA

In [None]:
# Attach the mapping table on 'mRNA' (inner join), move 'ENTREZID' to the
# second column position, then discard the 'mRNA' join column.
mRNA_miRNA = mRNA_miRNA.merge(mRNA, on=['mRNA'])
mRNA_miRNA.insert(1, 'ENTREZID', mRNA_miRNA.pop("ENTREZID"))
mRNA_miRNA = mRNA_miRNA.drop(columns=['mRNA'])
mRNA_miRNA

***
* [TarBase](https://dianalab.e-ce.uth.gr/html/diana/web/index.php?r=tarbasev8/index) <br />  DIANA-TarBase v8 is a reference database devoted to the indexing of experimentally supported microRNA (miRNA) targets.

In [None]:
# Fetch the TarBase v8 archive through the data_downloader helper, targeting
# the unprocessed-data directory.
data_downloader('https://dianalab.e-ce.uth.gr/downloads/tarbase_v8_data.tar.gz', unprocessed_data_location)

In [None]:
# Unpack the TarBase archive next to the other unprocessed downloads.
with tarfile.TarFile(unprocessed_data_location+'tarbase_v8_data.tar', 'r') as tar_ref:
    tar_ref.extractall(unprocessed_data_location)

mRNA_miRNA2 = pd.read_csv(unprocessed_data_location +
                          'TarBase_v8_download.txt', sep="\t",
                          dtype={"cell_line": "string"})

# For the time being, we keep only Homo sapiens rows
mRNA_miRNA2 = mRNA_miRNA2[mRNA_miRNA2['species'].str.contains("Homo sapiens")]
# Non-inplace drop returns a fresh frame, so the column assignment below does
# not write into a filtered slice of the original table.
mRNA_miRNA2 = mRNA_miRNA2.drop(columns=['geneId','species'])
# Remove the literal "(hsa)" marker from gene names. The previous pattern
# "\(hsa\)" was a non-raw regex: since pandas 2.0 str.replace defaults to
# regex=False, so that pattern was matched literally (backslashes included)
# and never removed anything. A plain literal replace is version-independent.
mRNA_miRNA2['geneName'] = mRNA_miRNA2['geneName'].str.replace("(hsa)", '', regex=False)
mRNA_miRNA2 = mRNA_miRNA2.rename(columns={'mirna': 'miRNA'})
# Map gene symbols ('0_x') to Entrez ids ('0_y') with an inner join, then drop
# the symbol column.
mRNA_miRNA2 = pd.merge(symbol_entrez_map[['0_x','0_y']].rename(columns={'0_x': 'geneName','0_y': 'ENTREZID'}),
                       mRNA_miRNA2, on='geneName')
mRNA_miRNA2.drop(columns=['geneName'], inplace=True)
mRNA_miRNA2

In [None]:
# Fold the TarBase rows into the running table; the outer join on
# (miRNA, ENTREZID) keeps pairs found in either side.
mRNA_miRNA = mRNA_miRNA.merge(mRNA_miRNA2, on=['miRNA', 'ENTREZID'], how='outer')
mRNA_miRNA

***
* [miRTarBase](https://mirtarbase.cuhk.edu.cn/~miRTarBase/miRTarBase_2022/php/index.php) <br /> miRTarBase has accumulated more than three hundred and sixty thousand miRNA-target interactions (MTIs), which are collected by manually surveying pertinent literature after NLP of the text systematically to filter research articles related to functional studies of miRNAs.

In [None]:
# https://mirtarbase.cuhk.edu.cn/~miRTarBase/miRTarBase_2022/cache/download/9.0/miRTarBase_MTI.xlsx
# Keep rows whose miRNA species and target-gene species both contain "apiens",
# drop the identifier/species columns, and expose the Entrez id column under
# the shared join name 'ENTREZID'.
mRNA_miRNA2 = (
    pd.read_excel(unprocessed_data_location+"miRTarBase_MTI.xlsx")
    .loc[lambda df: df['Species (miRNA)'].str.contains('apiens')]
    .loc[lambda df: df['Species (Target Gene)'].str.contains('apiens')]
    .drop(columns=['miRTarBase ID', 'Species (miRNA)', 'Target Gene', 'Species (Target Gene)'])
    .rename(columns={'Target Gene (Entrez ID)': 'ENTREZID'})
)
mRNA_miRNA2

In [None]:
# Fold the miRTarBase rows into the running table with an outer join on
# (miRNA, ENTREZID).
mRNA_miRNA = mRNA_miRNA.merge(mRNA_miRNA2, on=['miRNA', 'ENTREZID'], how='outer')
mRNA_miRNA

In [None]:
# Reconcile the duplicated provenance columns left by the outer joins:
# 'References (PMID)' is folded into 'Pubmed_id', and 'Experiments' into 'method'.
#
# fillna is applied by assignment: the chained form
# `df[col].fillna(..., inplace=True)` operates on a temporary Series, emits a
# FutureWarning in pandas 2.2 and has no effect under Copy-on-Write.
mRNA_miRNA['Pubmed_id'] = mRNA_miRNA['Pubmed_id'].fillna(mRNA_miRNA['References (PMID)'])
# Nullable-integer round trip renders ids without a trailing ".0" and missing
# values as the string "<NA>", which the mask below relies on.
mRNA_miRNA['Pubmed_id'] = mRNA_miRNA['Pubmed_id'].astype('Int64').astype(str)
mRNA_miRNA['References (PMID)'] = mRNA_miRNA['References (PMID)'].astype('Int64').astype(str)
# Where both sources give a different id, keep both joined by ' + '.
mRNA_miRNA.loc[(mRNA_miRNA["Pubmed_id"]!="<NA>") &
               (mRNA_miRNA["References (PMID)"]!="<NA>") &
               (mRNA_miRNA["Pubmed_id"] !=
                mRNA_miRNA["References (PMID)"]),
               ["Pubmed_id"]] = mRNA_miRNA["Pubmed_id"] + ' + ' + mRNA_miRNA["References (PMID)"]

mRNA_miRNA.drop(columns=['References (PMID)'],inplace=True)

# Same reconciliation for the experimental method description.
mRNA_miRNA['method'] = mRNA_miRNA['method'].fillna(mRNA_miRNA['Experiments'])
mRNA_miRNA.loc[(mRNA_miRNA["method"].notna()) &
               (mRNA_miRNA["Experiments"].notna()) &
               (mRNA_miRNA["method"] !=
                mRNA_miRNA["Experiments"]),
               ["method"]] = mRNA_miRNA["method"] + ' + ' + mRNA_miRNA["Experiments"]
mRNA_miRNA.drop(columns=['Experiments'],inplace=True)

mRNA_miRNA

***
* [TargetScan](https://www.targetscan.org/vert_80/) <br /> TargetScan predicts biological targets of miRNAs by searching for the presence of conserved 8mer, 7mer, and 6mer sites that match the seed region of each miRNA. 

In [None]:
#https://www.targetscan.org/vert_80/vert_80_data_download/Predicted_Targets_Context_Scores.default_predictions.txt.zip
# Keep only rows whose 'Gene Tax ID' is 9606, then drop the gene/transcript
# identifier columns and the taxonomy column itself.
mRNA_miRNA2 = (
    pd.read_csv(unprocessed_data_location+'Predicted_Targets_Context_Scores.default_predictions.txt',
                sep='\t')
    .loc[lambda df: df['Gene Tax ID'] == 9606]
    .drop(columns=['Gene ID', 'Transcript ID', 'Gene Tax ID'])
)
mRNA_miRNA2

In [None]:
# Resolve gene symbols through symbol_entrez_map (inner join on 'Gene Symbol'),
# then discard the symbol column.
mRNA_miRNA2 = (
    mRNA_miRNA2
    .merge(symbol_entrez_map.rename(columns={'0_x': 'Gene Symbol'}), on='Gene Symbol')
    .drop(columns=['Gene Symbol'])
)
# Outer-join into the running table; '0_y' is presented as 'ENTREZID' only for
# the join, mRNA_miRNA2 itself keeps the '0_y' name.
mRNA_miRNA = mRNA_miRNA.merge(
    mRNA_miRNA2.rename(columns={'0_y': 'ENTREZID'}),
    on=['miRNA', 'ENTREZID'], how='outer')

***
* [SomamiR](https://compbio.uthsc.edu/SomamiR/) <br /> SomamiR is a database of cancer somatic mutations in microRNAs (miRNA) and their target sites that potentially alter the interactions between miRNAs and competing endogenous RNAs (ceRNA) including mRNAs, circular RNAs (circRNA) and long noncoding RNAs (lncRNA).

In [None]:
#https://compbio.uthsc.edu/SomamiR/download/predicted_mRNA_targets_somamir_v2.0.txt.tar.gz
# Pipeline over the SomamiR predicted-target table:
#   - drop the positional/sequence/score columns that are not carried forward;
#   - keep only relationships validated by TargetScan (Targetscan == 1), then
#     drop that flag;
#   - resolve 'Genesymbol' through symbol_entrez_map (inner join) and expose the
#     mapped id column '0_y' as 'ENTREZID'.
mRNA_miRNA2 = (
    pd.read_csv(unprocessed_data_location+'predicted_mRNA_targets_somamir_v2.0.txt', sep='\t')
    .drop(columns=['Refseq', 'Chromosome', 'strand', 'Mutationlocation', 'WTallele', 'Mutantallele',
                   'Targetsiteclass', 'Seed_mod', 'mRNAseq', 'miRseedseq', 'Seedclass',
                   'WTconservation', 'Organisms', 'Sample_id', 'wildtype_csp',
                   'mutant_csp', 'display_first', 'pita_ref', 'pita_mut', 'pita_diff'])
    .loc[lambda df: df['Targetscan'] == 1]
    .drop(columns=['Targetscan'])
    .merge(symbol_entrez_map.rename(columns={'0_x': 'Genesymbol'}), on='Genesymbol')
    .drop(columns=['Genesymbol'])
    .rename(columns={'0_y': 'ENTREZID'})
)
mRNA_miRNA2

In [None]:
mRNA_miRNA = pd.merge(mRNA_miRNA, mRNA_miRNA2, how='outer', on=['miRNA','ENTREZID'])

# Fold SomamiR's 'Pubmedid' into 'Pubmed_id'.
#
# Presence must be decided BEFORE casting to str: astype(str) turns a missing
# value into the text 'nan', so the previous `.notna()` / `!= "<NA>"` tests were
# always true afterwards and rows lacking one of the ids ended up as
# "<id> + nan". In addition, 'Pubmed_id' already stores missing values as the
# string "<NA>" (it was rendered through Int64 -> str earlier), which fillna
# cannot see, so those rows never received the SomamiR id.
pubmed_id_text = mRNA_miRNA['Pubmed_id'].astype(str)
pubmedid_text = mRNA_miRNA['Pubmedid'].astype(str)
has_pubmed_id = mRNA_miRNA['Pubmed_id'].notna() & (pubmed_id_text != "<NA>")
has_pubmedid = mRNA_miRNA['Pubmedid'].notna()

# Prefer the existing id, fall back to the SomamiR id, else the "<NA>" placeholder.
merged_ids = pubmed_id_text.where(has_pubmed_id, pubmedid_text.where(has_pubmedid, "<NA>"))
# Where both sources give a different id, keep both joined by ' + '.
both_differ = has_pubmed_id & has_pubmedid & (pubmed_id_text != pubmedid_text)
mRNA_miRNA['Pubmed_id'] = merged_ids.mask(both_differ, pubmed_id_text + ' + ' + pubmedid_text)
mRNA_miRNA.drop(columns=['Pubmedid'],inplace=True)

***
* [miRdSNP](http://mirdsnp.ccr.buffalo.edu/index.php) <br /> miRdSNP is a database of disease-associated SNPs and microRNA target sites on 3'UTRs of human genes.

In [None]:
#http://mirdsnp.ccr.buffalo.edu/downloads/mirdsnp-dsnp-generated-mir-targets-v11.03.csv
# Drop the columns that are not carried forward, resolve 'gene_name' through
# symbol_entrez_map (inner join), and rename the id / miRNA columns to the
# shared join names.
#mRNA_miRNA2 = mRNA_miRNA2[mRNA_miRNA2['experimentally_confirmed']=='Yes']
mRNA_miRNA2 = (
    pd.read_csv(unprocessed_data_location+'mirdsnp-dsnp-generated-mir-targets-v11.03.csv')
    .drop(columns=['refseq_id', 'distance', 'experimentally_confirmed'])
    .merge(symbol_entrez_map.rename(columns={'0_x': 'gene_name'}), on='gene_name')
    .drop(columns=['gene_name'])
    .rename(columns={'0_y': 'ENTREZID', 'miR': 'miRNA'})
)
mRNA_miRNA2

In [None]:
mRNA_miRNA = pd.merge(mRNA_miRNA, mRNA_miRNA2, how='outer', on=['miRNA','ENTREZID'])

# Fold 'Cancertype' into 'diseases'. fillna is applied by assignment: the
# chained `df[col].fillna(..., inplace=True)` form works on a temporary Series,
# warns in pandas 2.2 and has no effect under Copy-on-Write.
mRNA_miRNA['diseases'] = mRNA_miRNA['diseases'].fillna(mRNA_miRNA['Cancertype'])
# Where both columns hold a different value, keep both joined by ' + '.
mRNA_miRNA.loc[(mRNA_miRNA["diseases"].notna()) &
               (mRNA_miRNA["Cancertype"].notna()) &
               (mRNA_miRNA["diseases"] !=
                mRNA_miRNA["Cancertype"]),
               ["diseases"]] = mRNA_miRNA["diseases"] + ' + ' + mRNA_miRNA["Cancertype"]
mRNA_miRNA.drop(columns=['Cancertype'],inplace=True)

Let's divide miRNA-mRNA interactions into mature_miRNA-mRNA interactions and stem-loop_miRNA-mRNA interactions.

In [None]:
# Replace miRNA names by the identifier held in column 0 of mirna_mirbase_map
# (inner join on the name), then drop the name column.
mRNA_miRNA = (
    mirna_mirbase_map.rename(columns={1: 'miRNA'})
    .merge(mRNA_miRNA, on='miRNA')
    .drop(columns=['miRNA'])
)

# Tag the gene id so it is written as "<id>#mRNA".
mRNA_miRNA['ENTREZID'] = mRNA_miRNA['ENTREZID'].astype('str') + '#mRNA'

# Identifiers starting with 'MIMAT' go to the mature-miRNA edge list, all
# others to the pre-miRNA edge list.
maturemRNA_miRNA = mRNA_miRNA.loc[mRNA_miRNA[0].str.startswith('MIMAT')]
premRNA_miRNA = mRNA_miRNA.loc[~mRNA_miRNA[0].str.startswith('MIMAT')]

mature_edges = maturemRNA_miRNA[[0, 'ENTREZID']].drop_duplicates()
mature_edges.to_csv(edge_data_location + 'RmiRNA-mRNA.txt', header=None, sep='\t', index=None)
pre_edges = premRNA_miRNA[[0, 'ENTREZID']].drop_duplicates()
pre_edges.to_csv(edge_data_location + 'RpremiRNA-mRNA.txt', header=None, sep='\t', index=None)

***
### miRNA-pseudogene

* [miRNet](https://www.mirnet.ca/miRNet/)

In [None]:
# https://www.dropbox.com/s/r01ppq5x42v4lyh/miRNet-mir-pseudogene.csv?dl=0
# Load the miRNet miRNA-pseudogene table without the columns that are not
# carried forward.
miRNA_pseudogene = (
    pd.read_csv(unprocessed_data_location+'miRNet-mir-pseudogene.csv')
    .drop(columns=['mirnet', 'mir_id', 'symbol', 'embl', 'gene_name', 'mbv'])
)
miRNA_pseudogene

In [None]:
# Does miRNet contain premiRNA-pseudogene interactions?
# Selects the rows whose 'mir_acc' does not start with 'MIMAT' and reduces them
# with DataFrame.any(), which reports one boolean per column (not a single
# yes/no); all-False output means no such rows carry truthy values.
miRNA_pseudogene[~miRNA_pseudogene['mir_acc'].str.startswith('MIMAT')].any()

In [None]:
# Tag the Entrez id so it is written as "<id>#pseudo".
miRNA_pseudogene['entrez'] = miRNA_pseudogene['entrez'].astype(str) + '#pseudo'

In [None]:
# Write the de-duplicated ('mir_acc', 'entrez') pairs as a tab-separated edge
# list without header or index.
miRNA_pseudogene[['mir_acc', 'entrez']].drop_duplicates().to_csv(
    edge_data_location+'RmiRNA-pseudogene.txt', header=None, sep='\t', index=None)

***
### miRNA-epigenetic modification

* [EpimiR](http://www.jianglab.cn/EpimiR/index.jsp) <br />
The EpimiR database contains 1974 regulatory relationships between 19 types of epigenetic modifications (including DNA methylation, histone acetylation, H3K4me3 and H3K27me3, etc.) and 617 miRNAs across 7 species (including Homo sapiens), curated from nearly 2000 publications.

In [None]:
#via miRNet --> https://www.dropbox.com/s/p852ndpck5jasxz/miRNet-mir-epi-hsa.csv?dl=0
# Load the table, drop the columns that are not carried forward, and turn each
# '/'-separated 'epi_modification' value into one row per modification.
miRNA_epiMod = (
    pd.read_csv(unprocessed_data_location + 'miRNet-mir-epi-hsa.csv')
    .drop(columns=['mirnet', 'mir_id', 'note', 'res_type'])
    .assign(epi_modification=lambda df: df['epi_modification'].str.split('/'))
    .explode('epi_modification')
)
# Move 'epi_modification' to the second column position.
miRNA_epiMod.insert(1, 'epi_modification', miRNA_epiMod.pop("epi_modification"))
miRNA_epiMod

In [None]:
# Substitute three modification labels with GO identifiers, applied in this
# order; every other label is left as-is. The final line lists the resulting
# distinct values for inspection.
miRNA_epiMod['epi_modification'] = (
    miRNA_epiMod['epi_modification']
    .str.replace('DNA Methylation', 'GO_0006306')
    .str.replace('Histone Acetylation', 'GO_0016573')
    .str.replace('Histone Modification', 'GO_0016570')
)
miRNA_epiMod['epi_modification'].unique()

Let's divide miRNA-epiMod interactions into mature_miRNA-epiMod interactions and stem-loop_miRNA-epiMod interactions. Furthermore, let's divide GO classes from entities.

In [None]:
# Accessions starting with 'MIMAT' form the mature-miRNA subset; the
# complement forms the pre-miRNA subset.
mature_mask = miRNA_epiMod['mir_acc'].str.startswith('MIMAT')
maturemiRNA_epiMod = miRNA_epiMod.loc[mature_mask]
premiRNA_epiMod = miRNA_epiMod.loc[~mature_mask]

In [None]:
# Edges whose modification was mapped to a GO class, split by miRNA kind.
# Both subsets were previously selected from the unfiltered miRNA_epiMod with
# the same GO-only mask, so the mature and pre-miRNA files were identical and
# each contained both kinds. The 'MIMAT' accession test is now combined with
# the GO test.
is_mature = miRNA_epiMod['mir_acc'].str.startswith('MIMAT')
is_go_class = miRNA_epiMod['epi_modification'].str.startswith('GO')

maturemiRNA_epiMod_class = miRNA_epiMod[is_mature & is_go_class]
premiRNA_epiMod_class = miRNA_epiMod[~is_mature & is_go_class]

maturemiRNA_epiMod_class[['mir_acc', 'epi_modification']].drop_duplicates().to_csv(
    edge_data_location + 'RmiRNA-epiModclass.txt', header=None, sep='\t', index=None)
premiRNA_epiMod_class[['mir_acc', 'epi_modification']].drop_duplicates().to_csv(
    edge_data_location + 'RpremiRNA-epiModclass.txt', header=None, sep='\t', index=None)

In [None]:
# Edges whose modification was NOT mapped to a GO class, split by miRNA kind.
# Both subsets were previously re-derived from the unfiltered miRNA_epiMod with
# the same non-GO mask, discarding the mature / pre-miRNA split and writing two
# identical files. The 'MIMAT' accession test is now combined with the non-GO
# test (masks are recomputed here so the cell is safe to re-run).
is_mature = miRNA_epiMod['mir_acc'].str.startswith('MIMAT')
is_go_class = miRNA_epiMod['epi_modification'].str.startswith('GO')

maturemiRNA_epiMod = miRNA_epiMod[is_mature & ~is_go_class]
premiRNA_epiMod = miRNA_epiMod[~is_mature & ~is_go_class]

maturemiRNA_epiMod[['mir_acc', 'epi_modification']].drop_duplicates().to_csv(
    edge_data_location + 'RmiRNA-epiMod.txt', header=None, sep='\t', index=None)
premiRNA_epiMod[['mir_acc', 'epi_modification']].drop_duplicates().to_csv(
    edge_data_location + 'RpremiRNA-epiMod.txt', header=None, sep='\t', index=None)

***
### miRNA-disease

* [miR2Disease](http://watson.compbio.iupui.edu:8080/miR2Disease/) <br />miR2Disease is a manually curated database that aims at providing a comprehensive resource of miRNA deregulation in various human diseases.

In [None]:
# Fetch the miR2Disease "AllEntries" file through the data_downloader helper,
# targeting the unprocessed-data directory.
data_downloader('http://watson.compbio.iupui.edu:8080/miR2Disease/download/AllEntries.txt', unprocessed_data_location)

In [None]:
# The file has no header row: name the first two columns 'mir_id' and
# 'disease' (the remaining columns keep their integer labels) and lower-case
# the disease names.
miRNA_disease = (
    pd.read_csv(unprocessed_data_location + 'AllEntries.txt', sep="\t", header=None)
    .rename(columns={0: 'mir_id', 1: 'disease'})
)
miRNA_disease['disease'] = miRNA_disease['disease'].str.lower()
miRNA_disease

* [HMDD](https://www.cuilab.cn/hmdd) <br /> HMDD (the Human microRNA Disease Database) is a database that curates experiment-supported evidence for human microRNA (miRNA) and disease associations. miRNAs are one class of important regulatory RNAs, which mainly repress gene expression at the post-transcriptional level.

In [None]:
#https://www.cuilab.cn/static/hmdd3/data/alldata.xlsx
miRNA_disease2 = pd.read_excel(unprocessed_data_location+'alldata.xlsx')
miRNA_disease2.rename(columns={'mir': 'mir_id'}, inplace=True)
# Lower-case this table's OWN disease names so they join on the same key format
# as the other sources. The right-hand side previously read from
# `miRNA_disease` (the other source's table), which overwrote these disease
# names, aligned by row label, with the other table's values.
miRNA_disease2['disease'] = miRNA_disease2['disease'].str.lower()
miRNA_disease2

In [None]:
# Outer-join the two miRNA-disease tables on (mir_id, disease).
miRNA_disease = pd.merge(miRNA_disease,miRNA_disease2,how='outer',on=['mir_id','disease'])

# Fold integer-labelled column 5 into 'description': fill gaps first, then,
# where both hold a different value, keep both joined by ' + '.
miRNA_disease['description'].fillna(miRNA_disease[5], inplace=True)
miRNA_disease.loc[(miRNA_disease['description'].notna()) & (miRNA_disease[5].notna()) &
                  (miRNA_disease['description'] != miRNA_disease[5]),
          ["description"]] = miRNA_disease["description"] + ' + ' + miRNA_disease[5]

# Same folding of integer-labelled column 2 into 'category'.
miRNA_disease['category'].fillna(miRNA_disease[2], inplace=True)
miRNA_disease.loc[(miRNA_disease['category'].notna()) & (miRNA_disease[2].notna()) &
                  (miRNA_disease['category'] != miRNA_disease[2]),
          ["category"]] = miRNA_disease["category"] + ' + ' + miRNA_disease[2]

# NOTE(review): this drops the two columns that were just combined, so the work
# above is discarded while the raw columns 2 and 5 remain in the table. Confirm
# whether the intent was to drop the raw columns instead (column 2 is read again
# by a later merge cell, column 5 is not touched again in the visible code).
# NOTE(review): chained `df[col].fillna(..., inplace=True)` warns in pandas 2.2
# and has no effect under Copy-on-Write.
miRNA_disease.drop(columns=['category','description'],inplace=True)
miRNA_disease

***
* [miRNet](https://www.mirnet.ca/miRNet/)

In [None]:
#https://www.dropbox.com/s/o27wz2kg9co76mo/miRNet-mir-disease.csv?dl=0
# Lower-case the disease names, discard rows whose 'database' value contains
# "miR2Disease", then drop the columns that are not carried forward.
miRNA_disease2 = (
    pd.read_csv(unprocessed_data_location + "miRNet-mir-disease.csv")
    .assign(disease=lambda df: df['disease'].str.lower())
    .loc[lambda df: ~df['database'].str.contains("miR2Disease")]
    .drop(columns=['database', 'mir_acc', 'mirnet'])
)
miRNA_disease2

In [None]:
# Outer-join the miRNet rows on (mir_id, disease).
miRNA_disease = pd.merge(miRNA_disease,miRNA_disease2,how='outer',on=['mir_id','disease'])

# Fold integer-labelled column 2 into 'evidence' and column 3 into 'method':
# fill gaps first, then, where both hold a different value, keep both joined
# by ' + '.
# fillna is applied by assignment: the chained
# `df[col].fillna(..., inplace=True)` form works on a temporary Series, warns in
# pandas 2.2 and has no effect under Copy-on-Write.
miRNA_disease['evidence'] = miRNA_disease['evidence'].fillna(miRNA_disease[2])
miRNA_disease.loc[(miRNA_disease['evidence'].notna()) & (miRNA_disease[2].notna()) &
                  (miRNA_disease['evidence'] != miRNA_disease[2]),
          ["evidence"]] = miRNA_disease["evidence"] + ' + ' + miRNA_disease[2]

miRNA_disease['method'] = miRNA_disease['method'].fillna(miRNA_disease[3])
miRNA_disease.loc[(miRNA_disease['method'].notna()) & (miRNA_disease[3].notna()) &
                  (miRNA_disease['method'] != miRNA_disease[3]),
          ["method"]] = miRNA_disease["method"] + ' + ' + miRNA_disease[3]

# The raw columns are no longer needed once folded.
miRNA_disease.drop(columns=[2,3],inplace=True)
miRNA_disease

***
* [dbDEMC](https://www.biosino.org/dbDEMC/index) <br /> dbDEMC (database of Differentially Expressed MiRNAs in human Cancers) is an integrated database designed to store and display differentially expressed microRNAs (miRNAs) in cancers.

In [None]:
# https://www.biosino.org/dbDEMC/download/MiRExpAll
# Keep rows whose 'Species' contains "apiens", drop the columns that are not
# carried forward, keep rows with adjPvalue < .01, and rename the miRNA /
# cancer-type columns to the shared join names.
miRNA_disease2 = (
    pd.read_csv(unprocessed_data_location+"miRExpAll.txt", sep="\t")
    .loc[lambda df: df['Species'].str.contains("apiens")]
    .drop(columns=['miRNA_ID', 'ExperimentID', 'logFC', 'SourceDataID',
                   'AveExpr', 'Tvalue', 'Pvalue', 'Bvalue', 'Species'])
    .loc[lambda df: df['adjPvalue'] < .01]
    .rename(columns={'miRBaseID': 'mir_id', 'CancerType': 'disease'})
)
miRNA_disease2

In [None]:
# Outer-join the dbDEMC rows on (mir_id, disease).
miRNA_disease = pd.merge(miRNA_disease,miRNA_disease2,how='outer',on=['mir_id','disease'])

# Fold 'Status' into 'evidence': fill gaps first, then, where both hold a
# different value, keep both joined by ' + '.
# fillna is applied by assignment: the chained
# `df[col].fillna(..., inplace=True)` form works on a temporary Series, warns in
# pandas 2.2 and has no effect under Copy-on-Write.
miRNA_disease['evidence'] = miRNA_disease['evidence'].fillna(miRNA_disease['Status'])
miRNA_disease.loc[(miRNA_disease['evidence'].notna()) & (miRNA_disease['Status'].notna()) &
                  (miRNA_disease['evidence'] != miRNA_disease['Status']),
          ["evidence"]] = miRNA_disease["evidence"] + ' + ' + miRNA_disease['Status']

miRNA_disease.drop(columns=['Status'],inplace=True)
miRNA_disease

***
* [miRdSNP](http://mirdsnp.ccr.buffalo.edu/index.php) <br /> miRdSNP is a database of disease-associated SNPs and microRNA target sites on 3'UTRs of human genes.

In [None]:
#http://mirdsnp.ccr.buffalo.edu/downloads/mirdsnp-dsnp-generated-mir-targets-v11.03.csv
# Lower-case the disease names, spell out the 'Yes' flag as
# 'experimentally confirmed', drop 'refseq_id', and rename the miRNA / disease
# columns to the shared join names.
miRdSNP = (
    pd.read_csv(unprocessed_data_location+'mirdsnp-dsnp-generated-mir-targets-v11.03.csv')
    .assign(
        diseases=lambda df: df['diseases'].str.lower(),
        experimentally_confirmed=lambda df: df['experimentally_confirmed'].str.replace(
            'Yes', 'experimentally confirmed'),
    )
    .drop(columns='refseq_id')
    .rename(columns={'miR': 'mir_id', 'diseases': 'disease'})
)
miRdSNP

In [None]:
# Outer-join the miRdSNP rows on (mir_id, disease).
miRNA_disease = pd.merge(miRNA_disease,miRdSNP,how='outer',on=['mir_id','disease'])

# Fold 'experimentally_confirmed' into 'method': fill gaps first, then, where
# both hold a different value, keep both joined by ' + '.
# fillna is applied by assignment: the chained
# `df[col].fillna(..., inplace=True)` form works on a temporary Series, warns in
# pandas 2.2 and has no effect under Copy-on-Write.
miRNA_disease['method'] = miRNA_disease['method'].fillna(miRNA_disease['experimentally_confirmed'])
miRNA_disease.loc[(miRNA_disease['method'].notna()) & (miRNA_disease['experimentally_confirmed'].notna()) &
                  (miRNA_disease['method'] != miRNA_disease['experimentally_confirmed']),
          ["method"]] = miRNA_disease["method"] + ' + ' + miRNA_disease['experimentally_confirmed']

miRNA_disease.drop(columns=['experimentally_confirmed'],inplace=True)
miRNA_disease

***
* [TAM](http://www.lirmed.com/tam2/) <br /> TAM groups miRNAs into six categories of miRNA sets: miRNA-family sets, miRNA cluster sets, miRNA-disease, miRNA-function sets, miRNA-TF sets and tissue specificity sets.

In [None]:
#http://www.lirmed.com/tam2/Public/static/data/mirset_v9.txt
# Rows have a varying number of tab-separated fields, so 500 integer-labelled
# columns are reserved up front and the entirely empty ones are dropped.
TAM = pd.read_csv(unprocessed_data_location+'mirset_v9.txt', sep='\t',names=range(500))
TAM = TAM.dropna(axis=1, how='all')
# Keep the rows whose first field is "HMDD". `.copy()` gives an independent
# frame, so the column assignment below does not write into a filtered slice
# of TAM (SettingWithCopyWarning).
miRNA_disease2 = TAM[TAM[0] == "HMDD"].copy()
miRNA_disease2[1] = miRNA_disease2[1].str.lower()
# Columns that are empty within this subset are dropped, then the "HMDD" marker.
miRNA_disease2 = miRNA_disease2.dropna(axis=1, how='all')
miRNA_disease2 = miRNA_disease2.drop(columns=[0])
miRNA_disease2

In [None]:
# Reshape from one wide row per disease to one (disease, mir_id) row per
# member: every column after the first is joined into a comma-separated string
# (missing cells skipped), which is then split and exploded.
miRNA_disease2 = (
    miRNA_disease2
    .assign(merged=miRNA_disease2[miRNA_disease2.columns[1:]].apply(
        lambda row: ','.join(row.dropna().astype(str)),
        axis=1,
    ))
    [[1, 'merged']]
    .assign(merged=lambda df: df['merged'].str.split(','))
    .explode('merged')
    .rename(columns={1: 'disease', 'merged': 'mir_id'})
)
miRNA_disease2

In [None]:
# Fold the TAM rows into the running table with an outer join on
# (mir_id, disease).
miRNA_disease = miRNA_disease.merge(miRNA_disease2, on=['mir_id', 'disease'], how='outer')
miRNA_disease

In [None]:
# Attach 'doid' by disease name (outer join) and move it to the second column.
miRNA_disease = miRNA_disease.merge(
    desc_do_map.rename(columns={'desc': 'disease'}), on=['disease'], how='outer')
miRNA_disease.insert(1, 'doid', miRNA_disease.pop("doid"))

# Then, with inner joins: swap the miRNA name for the identifier in column 0 of
# mirna_mirbase_map, and swap 'doid' for the identifier in column 1 of
# doid_mondo_map. The join keys are dropped once used.
miRNA_disease = (
    mirna_mirbase_map.rename(columns={1: 'mir_id'})
    .merge(miRNA_disease, on='mir_id')
    .drop(columns=['mir_id', 'disease'])
    .merge(doid_mondo_map.rename(columns={0: 'doid'}), on='doid')
    .drop(columns=['doid'])
)
miRNA_disease

Let's divide miRNA-disease interactions into mature_miRNA-disease interactions and stem-loop_miRNA-disease interactions.

In [None]:
# Identifiers starting with 'MIMAT' go to the mature-miRNA edge list, all
# others to the pre-miRNA edge list; each list holds the de-duplicated pairs of
# columns 0 and 1.
mature_mask = miRNA_disease[0].str.startswith('MIMAT')
maturemiRNA_disease = miRNA_disease.loc[mature_mask]
premiRNA_disease = miRNA_disease.loc[~mature_mask]

mature_edges = maturemiRNA_disease[[0, 1]].drop_duplicates()
mature_edges.to_csv(edge_data_location + 'RmiRNA-disease.txt', header=None, sep='\t', index=None)
pre_edges = premiRNA_disease[[0, 1]].drop_duplicates()
pre_edges.to_csv(edge_data_location + 'RpremiRNA-disease.txt', header=None, sep='\t', index=None)

## TODO: add miRCancer (down)

***
### miRNA-lncRNA

* [miRNet](https://www.mirnet.ca/)

In [None]:
# Load the miRNet miRNA-lncRNA table without the columns that are not carried
# forward.
miRNA_lncRNA = (
    pd.read_csv(unprocessed_data_location + "miRNet-mir-lncRNA.csv")
    .drop(columns=['mirnet', 'mir_acc', 'entrez', 'embl', 'gene_name', 'mbv'])
)
miRNA_lncRNA

In [None]:
# Are all miRNA molecules human ones?
# `.all()` answers the question asked; the built-in any() used before only
# showed that at least one id contains "hsa". na=False counts a missing id as
# non-human instead of letting NaN pass as truthy.
miRNA_lncRNA['mir_id'].str.contains("hsa", na=False).all()

***
* [LncRNAWiki](https://ngdc.cncb.ac.cn/lncrnawiki/) <br />  LncRNAWiki is devoted to community curation of human long non-coding RNAs (lncRNAs) to provide a comprehensive and up-to-date resource of functionally annotated lncRNAs. It incorporates a comprehensive collection of experimentally studied lncRNAs and integrates a wealth of their annotations based on a standardized curation model, and improves curation quality through expert curator review and community error report. 

In [None]:
LncRNAWiki = pd.read_csv(unprocessed_data_location+'LncRNAWiki_BrowseDownload.csv')
# Keep rows with a target type that mentions 'miRNA' (missing target types are
# removed first so the string test never sees NaN). `.copy()` gives an
# independent frame, so the assignment and in-place drop below do not write
# into a filtered slice of LncRNAWiki (SettingWithCopyWarning).
miRNA_lncRNA2 = LncRNAWiki[LncRNAWiki['target_type'].notna()]
miRNA_lncRNA2 = miRNA_lncRNA2[miRNA_lncRNA2.target_type.str.contains('miRNA')].copy()
# Prefix the target names with 'hsa-'.
miRNA_lncRNA2['target'] = 'hsa-' + miRNA_lncRNA2['target']
miRNA_lncRNA2.drop(columns=['gene_locus','synonyms','gene_id','transcript_id','target_interaction'],inplace=True)
miRNA_lncRNA2

In [None]:
# Expose the target column under the shared join name, then outer-join into
# the running table on (mir_id, symbol).
miRNA_lncRNA2 = miRNA_lncRNA2.rename(columns={'target': 'mir_id'})

miRNA_lncRNA = miRNA_lncRNA.merge(miRNA_lncRNA2, on=['mir_id', 'symbol'], how='outer')
miRNA_lncRNA

***
* [SomamiR](https://compbio.uthsc.edu/SomamiR/)

In [None]:
#https://compbio.uthsc.edu/SomamiR/download/lncRNA_somatic_v2.0.txt.tar.gz
# Drop the two unused columns, remove the 'lnc-' marker from gene names, and
# rename the gene / miRNA columns to the shared join names.
miRNA_lncRNA2 = (
    pd.read_csv(unprocessed_data_location+'lncRNA_somatic_v2.0.txt', sep='\t')
    .drop(columns=['Transcript', 'Unnamed: 18'])
    .assign(Gene=lambda df: df['Gene'].str.replace(r'lnc-', ''))
    .rename(columns={'Gene': 'symbol', 'miRNA': 'mir_id'})
)
miRNA_lncRNA2

In [None]:
miRNA_lncRNA = pd.merge(miRNA_lncRNA, miRNA_lncRNA2, how='outer', on=['mir_id','symbol'])

# Fold SomamiR's 'PMID' into 'pmid'.
#
# Presence must be decided BEFORE casting to str: astype(str) renders a missing
# value as text ('nan'), so the previous comparisons against "<NA>" were always
# true and rows lacking one of the ids ended up as "<id> + nan".
has_pmid = miRNA_lncRNA['pmid'].notna()
has_PMID = miRNA_lncRNA['PMID'].notna()
pmid_text = miRNA_lncRNA['pmid'].astype(str)
PMID_text = miRNA_lncRNA['PMID'].astype(str)

# Prefer the existing id and fall back to the SomamiR one (rows with neither
# keep the textual rendering of the missing value, as before).
merged_pmid = pmid_text.where(has_pmid, PMID_text)
# Where both sources give a different id, keep both joined by ' + '.
both_differ = has_pmid & has_PMID & (pmid_text != PMID_text)
miRNA_lncRNA['pmid'] = merged_pmid.mask(both_differ, pmid_text + ' + ' + PMID_text)

miRNA_lncRNA.drop(columns=['PMID'],inplace=True)

# Inner joins: attach the miRNA identifier (column 0 of mirna_mirbase_map) by
# name and the gene id ('0_y' of symbol_entrez_map) by symbol, then move '0_y'
# to the second column position.
miRNA_lncRNA = pd.merge(miRNA_lncRNA, mirna_mirbase_map.rename(columns={1:'mir_id'}), on='mir_id')
miRNA_lncRNA = pd.merge(miRNA_lncRNA, symbol_entrez_map[['0_x','0_y']].rename(columns={'0_x':'symbol'}), on='symbol')
miRNA_lncRNA.insert(1, '0_y', miRNA_lncRNA.pop("0_y"))

miRNA_lncRNA

In [None]:
# Tag the gene id so it is written as "<id>#lncRNA".
miRNA_lncRNA['0_y'] = miRNA_lncRNA['0_y'].astype(str) + '#lncRNA'

# Identifiers starting with 'MIMAT' go to the mature-miRNA edge list, all
# others to the pre-miRNA edge list.
mature_mask = miRNA_lncRNA[0].str.startswith('MIMAT')
maturemiRNA_lncRNA = miRNA_lncRNA.loc[mature_mask]
premiRNA_lncRNA = miRNA_lncRNA.loc[~mature_mask]

mature_edges = maturemiRNA_lncRNA[[0, '0_y']].drop_duplicates()
mature_edges.to_csv(edge_data_location + 'RmiRNA-lncRNA.txt', header=None, sep='\t', index=None)
pre_edges = premiRNA_lncRNA[[0, '0_y']].drop_duplicates()
pre_edges.to_csv(edge_data_location + 'RpremiRNA-lncRNA.txt', header=None, sep='\t', index=None)

## TODO: add [LncBook](https://ngdc.cncb.ac.cn/lncbook/); issue: 3h download

***
### miRNA-SNP

* [miRNet](https://www.mirnet.ca/miRNet/)

In [None]:
#https://www.dropbox.com/s/cu4hv35ulu3a8d6/miRNet-snp-mir-hsa.csv?dl=0
# Keep only rows flagged High_Confidence == 'YES', then drop the columns that
# are not carried forward (including the flag itself).
miRNA_variant = (
    pd.read_csv(unprocessed_data_location + "miRNet-snp-mir-hsa.csv")
    .loc[lambda df: df['High_Confidence'] == 'YES']
    .drop(columns=['mirnet', 'chr_pos', 'gnomAD_MAF', 'Mature_Name', 'Mature_Acc', 'Mature_Pos',
                   'Family_Name', 'Robust_FANTOM5', 'Conserved_ADmiRE', 'MIRNA_Acc',
                   'AF_Percentile_gnomAD', 'Phylop_100way', 'Phastcons_100way', 'High_Confidence'])
)

miRNA_variant

***
* [miRdSNP](http://mirdsnp.ccr.buffalo.edu/)

In [None]:
#http://mirdsnp.ccr.buffalo.edu/downloads/mirdsnp-dsnp-generated-mir-targets-v11.03.csv
# Reload the raw miRdSNP table, then derive the variant view from it: without
# 'refseq_id' and with the SNP / miRNA columns renamed to the shared join names.
miRdSNP = pd.read_csv(unprocessed_data_location+'mirdsnp-dsnp-generated-mir-targets-v11.03.csv')
miRNA_variant2 = (
    miRdSNP
    .drop(columns=['refseq_id'])
    .rename(columns={'SNP': 'rsid', 'miR': 'MIRNA_Name'})
)
miRNA_variant2

In [None]:
# Outer-join the miRNet and miRdSNP variant tables on (rsid, MIRNA_Name).
miRNA_variant = miRNA_variant.merge(miRNA_variant2, on=['rsid', 'MIRNA_Name'], how='outer')
miRNA_variant

In [None]:
# Attach the identifier held in column 0 of mirna_mirbase_map (inner join on
# the miRNA name) and move that column to the front.
miRNA_variant = miRNA_variant.merge(
    mirna_mirbase_map.rename(columns={1: 'MIRNA_Name'}), on='MIRNA_Name')
miRNA_variant.insert(0, 0, miRNA_variant.pop(0))

# Identifiers starting with 'MIMAT' go to the mature-miRNA edge list, all
# others to the pre-miRNA edge list.
mature_mask = miRNA_variant[0].str.startswith('MIMAT')
maturemiRNA_variant = miRNA_variant.loc[mature_mask]
premiRNA_variant = miRNA_variant.loc[~mature_mask]

mature_edges = maturemiRNA_variant[[0, 'rsid']].drop_duplicates()
mature_edges.to_csv(edge_data_location + 'RmiRNA-variant.txt', header=None, sep='\t', index=None)
pre_edges = premiRNA_variant[[0, 'rsid']].drop_duplicates()
pre_edges.to_csv(edge_data_location + 'RpremiRNA-variant.txt', header=None, sep='\t', index=None)

In [None]:
# Display the de-duplicated (column 0, 'rsid') pairs of the pre-miRNA subset.
premiRNA_variant[[0, 'rsid']].drop_duplicates()

***
### gene-SNP

* [miRNet](https://www.mirnet.ca/miRNet/)

In [None]:
#https://www.dropbox.com/s/8aq8k0yoy5ak0d6/miRNet-snpmirbs-hsa.csv?dl=0
# Load the miRNet 'snpmirbs' table as-is and display it.
gene_variant=pd.read_csv(unprocessed_data_location + "miRNet-snpmirbs-hsa.csv")
gene_variant

In [None]:
# Write the de-duplicated ('rsid', 'entrez') pairs as a tab-separated edge list
# without header or index.
gene_variant[['rsid','entrez']].drop_duplicates().to_csv(
    edge_data_location + 'Rvariant-gene.txt', header=None, sep='\t', index=None)

***
### SNP-disease

* [miRdSNP](http://mirdsnp.ccr.buffalo.edu/index.php)

In [None]:
#http://mirdsnp.ccr.buffalo.edu/downloads/mirdsnp-dsnps-v11.03.csv
# Pipeline over the miRdSNP disease-SNP table:
#   - drop the publication columns;
#   - lower-case the disease name and swap it for 'doid' via desc_do_map (inner join);
#   - split the comma-separated 'snps' field into one row per SNP;
#   - swap 'doid' for the identifier in column 1 of doid_mondo_map (inner join).
disease_variant = (
    pd.read_csv(unprocessed_data_location+'mirdsnp-dsnps-v11.03.csv')
    .drop(columns=['pub_year', 'pub_month', 'link', 'journal'])
    .assign(disease=lambda df: df['disease'].str.lower())
    .merge(desc_do_map.rename(columns={'desc': 'disease'}), on='disease')
    .drop(columns=['disease'])
    .assign(snps=lambda df: df['snps'].str.split(','))
    .explode('snps')
    .merge(doid_mondo_map.rename(columns={0: 'doid'}), on='doid')
    .drop(columns=['doid'])
)

disease_variant

In [None]:
# Write the de-duplicated ('snps', column 1) pairs as a tab-separated edge list
# without header or index.
disease_variant[['snps',1]].drop_duplicates().to_csv(
    edge_data_location + 'Rvariant-disease.txt', header=None, sep='\t', index=None)

***
### TF-SNP

* [miRNet](https://www.mirnet.ca/miRNet/)

In [None]:
#https://www.dropbox.com/s/8aq8k0yoy5ak0d6/miRNet-snpmirbs-hsa.csv?dl=0
# NOTE(review): the URL above names 'miRNet-snpmirbs-hsa.csv' (the file used by
# the gene-SNP cell), whereas this cell reads 'miRNet-snptfbs-hsa.csv' - the
# link looks copy-pasted; confirm the correct download location.
TF_variant=pd.read_csv(unprocessed_data_location + "miRNet-snptfbs-hsa.csv")
TF_variant.drop(columns=['mirnet','chr_pos','entrez','name'],inplace=True)

# Inner-join with symbol_to_pro on the gene symbol (its '0_x' column), then
# drop the symbol, keeping the mapped columns.
TF_variant = pd.merge(TF_variant,symbol_to_pro.rename(columns={'0_x':'symbol'}),on=['symbol'])
TF_variant.drop(columns=['symbol'],inplace=True)

TF_variant

In [None]:
# Write the de-duplicated ('rsid', '1_y') pairs as a tab-separated edge list
# without header or index.
TF_variant[['rsid','1_y']].drop_duplicates().to_csv(
    edge_data_location + 'Rvariant-TF.txt', header=None, sep='\t', index=None)

***
### tsRNA-miRNA

* [tsRFun](https://rna.sysu.edu.cn/tsRFun/index.php) <br /> tsRFun is a platform for tsRNA functions by High-throughput Small RNA-Seq and CLIP-Seq Data.

In [None]:
#https://rna.sysu.edu.cn/tsRFun/download/tsRNetwork/all_hypgm_df.txt
# Load the tab-separated tsRFun tsRNA-miRNA table and display it.
tsRNA_miRNA = pd.read_csv(unprocessed_data_location + 'all_hypgm_df.txt', sep="\t")  
tsRNA_miRNA

In [None]:
# We consider pairs with FDR < 0.01 ('adj.p' column); the unadjusted p-value
# column 'p' is removed as well since the FDR is kept.
tsRNA_miRNA = tsRNA_miRNA.loc[tsRNA_miRNA['adj.p'] < 0.01].drop(columns='p')
tsRNA_miRNA

In [None]:
# Two inner joins swap names for mapped identifiers: the miRNA name for
# column 0 of mirna_mirbase_map, then the tsRNA name for column 1 of
# tsRNA_tRF_map. Each name column is dropped once used.
tsRNA_miRNA = (
    tsRNA_miRNA
    .merge(mirna_mirbase_map.rename(columns={1: 'miRNA'}), on=['miRNA'])
    .drop(columns='miRNA')
    .merge(tsRNA_tRF_map.rename(columns={0: 'tsRNA'}), on=['tsRNA'])
    .drop(columns='tsRNA')
)
tsRNA_miRNA

In [None]:
# Write the de-duplicated pairs of columns 1 and 0 (in that order) as a
# tab-separated edge list without header or index.
tsRNA_miRNA[[1,0]].drop_duplicates().to_csv(
    edge_data_location + 'RtsRNA-miRNA.txt', header=None, sep='\t', index=None)

***
### tsRNA-disease

* [tsRFun](https://rna.sysu.edu.cn/tsRFun/index.php)

In [None]:
#https://rna.sysu.edu.cn/tsRFun/download/tsRinCancer/allCancer_0.txt
# Load the tab-separated tsRFun cancer table, using its first column as the
# row index, and display it.
tsRNA_disease = pd.read_csv(unprocessed_data_location + 'allCancer_0.txt', sep="\t", index_col=0)  
tsRNA_disease

In [None]:
# We keep only log2FC columns. `.copy()` gives an independent frame so the
# edits below do not write into a slice of the loaded table.
tsRNA_disease = tsRNA_disease.loc[:, tsRNA_disease.columns.str.endswith('_log2FC')].copy()
# Strip the '_log2FC' suffix so each column is named after its cancer type.
# regex=True is required: the pattern is anchored with '$', and since
# pandas 2.0 str.replace treats patterns literally by default, which left the
# suffix in place and broke the later per-cancer column lookups.
tsRNA_disease.columns = tsRNA_disease.columns.str.replace(r'_log2FC$', '', regex=True)

# tsRNA has a relationship with cancer iff |log2FC| >= 1: every other cell is
# zeroed. Stated as "keep where |x| >= 1", missing values are zeroed too;
# with the previous `abs(x) < 1` test NaN cells stayed NaN and were later
# treated as non-zero, i.e. as associations.
tsRNA_disease = tsRNA_disease.where(tsRNA_disease.abs() >= 1, 0)
tsRNA_disease

In [None]:
# We want a dataframe with 2 columns, tRF and associated cancer;
# this is an example with ACC: every row label whose 'ACC' value is non-zero
# is listed together with that value.
acc_hits = [(name, value) for name, value in tsRNA_disease['ACC'].items() if value != 0]
tRF = [name for name, _ in acc_hits]
log2FC = [value for _, value in acc_hits]

df_acc = pd.DataFrame(tRF, columns=['tRF'])
df_acc['dis'] = 'ACC'
df_acc['log2FC'] = log2FC
df_acc

In [None]:
# Build the long (tRF, dis, log2FC) table: for every cancer column, keep the
# row labels whose value is non-zero together with that value.
#
# The per-cancer pieces are collected in a list and concatenated once.
# DataFrame.append, used previously inside the loop, was removed in pandas 2.0
# and re-copied the accumulated table on every iteration; the row-by-row
# iterrows scan per column is replaced by a boolean mask on the column.
per_cancer = []
for cancer in tsRNA_disease.columns:
    values = tsRNA_disease[cancer]
    hits = values[values != 0]
    per_cancer.append(pd.DataFrame({
        'tRF': list(hits.index),
        'dis': cancer,
        'log2FC': hits.to_numpy(),
    }))

# Rows are ordered cancer by cancer, as before; the index is renumbered.
if per_cancer:
    trRF_disease = pd.concat(per_cancer, ignore_index=True)
else:
    trRF_disease = pd.DataFrame(columns=["tRF", "dis", "log2FC"])
trRF_disease

In [None]:
# Swap the cancer label for the identifier in column 1 of cancer_mondo_map
# (inner join on the label), then drop the label.
trRF_disease = (
    trRF_disease
    .merge(cancer_mondo_map.rename(columns={0: 'dis'}), on='dis')
    .drop(columns=['dis'])
)
trRF_disease

In [None]:
# Write the de-duplicated ('tRF', column 1) pairs as a tab-separated edge list
# without header or index.
trRF_disease[['tRF',1]].drop_duplicates().to_csv(
    edge_data_location + 'RtsRNA-disease.txt', header=None, sep='\t', index=None)

***
### tRF-tRNA

* [tRFdb](http://genome.bioch.virginia.edu/trfdb/index.php) <br /> tRFdb is a comprehensive database of tRFs prepared from publicly available high-throughput sequencing data of >50 short RNA libraries. tRFs originate precisely from the extreme 5' (tRF-5) or 3' ends (tRF-3) of mature tRNAs or from the 3' trailer sequence of precursor tRNA transcripts (tRF-1) and are present in humans, mice, flies, worms and yeasts.

In [None]:
# Source: http://genome.bioch.virginia.edu/trfdb/index.php
# Take the third table parsed from the saved tRF-1 page and discard the
# columns that are not needed downstream.
tRF1_tRNA = (
    pd.read_html(unprocessed_data_location + 'trf1.html')[2]
    .drop(columns=['Organism', 'Type', 'tRNA Gene Co-ordinates', 'Experiment Info', 'Sequence'])
)
tRF1_tRNA.head()

In [None]:
# Same extraction as for tRF-1, applied to the saved tRF-3 page.
tRF3_tRNA = (
    pd.read_html(unprocessed_data_location + 'trf3.html')[2]
    .drop(columns=['Organism', 'Type', 'tRNA Gene Co-ordinates', 'Experiment Info', 'Sequence'])
)
tRF3_tRNA.head()

In [None]:
# Same extraction as for tRF-1, applied to the saved tRF-5 page.
tRF5_tRNA = (
    pd.read_html(unprocessed_data_location + 'trf5.html')[2]
    .drop(columns=['Organism', 'Type', 'tRNA Gene Co-ordinates', 'Experiment Info', 'Sequence'])
)
tRF5_tRNA.head()

In [None]:
# Stack the tRF-1, tRF-3 and tRF-5 tables and write the unique rows as a
# header-less TSV. pd.concat replaces the chained DataFrame.append calls,
# which were removed in pandas 2.0; row order is unchanged.
(
    pd.concat([tRF1_tRNA, tRF3_tRNA, tRF5_tRNA])
    .drop_duplicates()
    .to_csv(edge_data_location + 'RtsRNA-tRNA_tRFdb.txt', header=None, sep='\t', index=None)
)

***
* [MINTbase](https://cm.jefferson.edu/MINTbase/) <br /> The Mitochondrial and Nuclear tRNA fragment database (MINTbase) is a repository of tRNA fragments (tRFs).

In [None]:
# https://cm.jefferson.edu/MINTbase/InputController?g=GRCh37&d=y&v=g&e=1.0&cl=,4,5,11,12,16,18,19,21,22,26,27,#ttop
tRF_tRNA2 = pd.read_csv(unprocessed_data_location+'MINTbasetRF-tRNA.txt',sep='\t')
# .copy() makes the column subset an independent frame, so the column edits
# below do not raise SettingWithCopyWarning.
tRF_tRNA2 = tRF_tRNA2[['License Plate (sequence derived)','MINTbase Alternative IDs (GRCh37 assembly-derived)',
                       'D-loop overlap?','Anticodon-loop overlap?','Anticodon-triplet overlap?','T-loop overlap?',
                       'Maximum RPM']].copy()
# Keep only the text before the first '@'. `n` is passed by keyword because
# pandas >= 2.0 no longer accepts it positionally in Series.str.split.
tRF_tRNA2['MINTbase Alternative IDs (GRCh37 assembly-derived)'] = tRF_tRNA2[
    'MINTbase Alternative IDs (GRCh37 assembly-derived)'].str.split('@', n=1).str[0]
tRF_tRNA2 = tRF_tRNA2.rename(
    columns={'MINTbase Alternative IDs (GRCh37 assembly-derived)':'MINTbase tRNA name'})
# Translate the MINTbase tRNA name into the gtRNAdb name; unmatched rows are
# dropped by the inner join.
tRF_tRNA2 = pd.merge(tRF_tRNA2, tRNA_MINTbase_GtRNAdb_map, on='MINTbase tRNA name')
tRF_tRNA2 = tRF_tRNA2.drop(columns=['MINTbase tRNA name'])
# For post-processing issues
tRF_tRNA2['gtRNAdb name'] = tRF_tRNA2['gtRNAdb name'].astype(str)+'.html'
tRF_tRNA2

In [None]:
# Write the unique (tRF license plate, gtRNAdb name) pairs as a header-less TSV.
(
    tRF_tRNA2[['License Plate (sequence derived)', 'gtRNAdb name']]
    .drop_duplicates()
    .to_csv(edge_data_location + 'RtsRNA-tRNA_MINTbase.txt', sep='\t', header=None, index=None)
)

***
### tRNA-amino acid

* [tRNAdb](http://trna.bioinf.uni-leipzig.de/DataOutput/) <br /> tRNAdb contains more than 12 000 tRNA genes, classified into families according to amino acid specificity. The database provides various services including graphical representations of tRNA secondary structures, a customizable output of aligned or un-aligned sequences with a variety of individual and combinable search criteria, as well as the construction of consensus sequences for any selected set of tRNAs.

In [None]:
#http://trna.bioinf.uni-leipzig.de/DataOutput/Result
# Parse the fourth table found in the saved tRNAdb result page.
tRNA_aa = pd.read_html(unprocessed_data_location+'tRNAdb - Transfer RNA database.html')[3]
# Discard the columns labelled 0, 1, 2, 4, 19 and 20.
tRNA_aa.drop(columns=[0,1,2,4,19,20],inplace=True)
# Promote the values of the first row to column names, then skip the first two rows.
tRNA_aa.rename(columns=tRNA_aa.iloc[0], inplace=True)
tRNA_aa = tRNA_aa.iloc[2:]
# NOTE(review): the following cell rebinds tRNA_aa from tRNA_MINTbase_GtRNAdb_map,
# so this table is only inspected here and does not reach the written edge file.
tRNA_aa.head()

In [None]:
# For the time being (we have no metadata in RNA-KG), it is better to proceed this way:
# derive the amino acid directly from the gtRNAdb name.
# .copy() gives an independent frame, so the assignments below neither raise
# SettingWithCopyWarning nor risk writing into tRNA_MINTbase_GtRNAdb_map.
tRNA_aa = tRNA_MINTbase_GtRNAdb_map[['gtRNAdb name']].copy()
tRNA_aa['gtRNAdb name'] = tRNA_aa['gtRNAdb name'] + '.html'
# Second '-'-separated token of the original name (without the '.html' suffix).
tRNA_aa['new'] = tRNA_MINTbase_GtRNAdb_map['gtRNAdb name'].str.split("-").str[1]
# When that token is the literal 'tRNA', the amino acid is the third token ...
tRNA_aa['Amino Acid'] = tRNA_aa[tRNA_aa['new']=='tRNA']['gtRNAdb name'].str.split("-").str[2]
# ... otherwise the second token itself is used. The result is assigned back:
# fillna(inplace=True) on a column selection may act on a temporary object and
# has no effect on the frame under pandas copy-on-write.
tRNA_aa['Amino Acid'] = tRNA_aa['Amino Acid'].fillna(tRNA_aa['new'])
tRNA_aa = tRNA_aa.drop(columns=['new'])
tRNA_aa

In [None]:
# Join on the amino-acid label (column 0 of aa_chebi_map, renamed for the
# join), then drop the label and keep the mapped columns.
tRNA_aa = tRNA_aa.merge(aa_chebi_map.rename(columns={0: 'Amino Acid'}), on='Amino Acid')
tRNA_aa = tRNA_aa.drop(columns=['Amino Acid'])
tRNA_aa

In [None]:
# Write the unique tRNA / amino-acid rows as a header-less, index-less TSV.
(
    tRNA_aa
    .drop_duplicates()
    .to_csv(edge_data_location + 'RtRNA-aminoacid.txt', sep='\t', header=None, index=None)
)

***
### snoRNA-gene

* [snoDB](https://bioinfo-scottgroup.med.usherbrooke.ca/snoDB/) <br /> snoDB is an interactive database of human small nucleolar RNAs (snoRNAs) that includes up-to-date information on snoRNA features, genomic location, conservation, host gene, snoRNA-RNA targets and snoRNA abundance and provides links to other resources.

In [None]:
# Fetch the complete snoDB export into the unprocessed-data folder using the
# project's data_downloader helper.
# NOTE(review): the next cell reads the file as 'download_all', so the helper
# presumably names the file after the last URL segment — confirm.
data_downloader('https://bioinfo-scottgroup.med.usherbrooke.ca/snoDB/download_all', unprocessed_data_location)

In [None]:
# Load the snoDB export, keep the identifier, host gene, per-class target
# lists and expression flag, drop snoRNAs without an NCBI id, and store the id
# as a nullable integer.
snoDB = (
    pd.read_csv(unprocessed_data_location + 'download_all', sep="\t")
    [['ncbi_id', 'host_gene_name', 'target_count', 'rrna_targets', 'snrna_targets',
      'lncrna_targets', 'protein_coding_targets', 'snorna_targets', 'mirna_targets',
      'trna_targets', 'ncrna_targets', 'pseudogene_targets', 'other_targets',
      'is_expressed']]
    .dropna(subset=['ncbi_id'])
    .assign(ncbi_id=lambda frame: frame['ncbi_id'].astype('Int64'))
)
snoDB

In [None]:
# Pair every snoRNA with its host gene: the host gene symbol is resolved
# through symbol_entrez_map (joined on its '0_x' column), and the snoRNA id is
# tagged with '#snoRNA'.
snoRNA_gene = (
    symbol_entrez_map
    .rename(columns={'0_x': 'host_gene_name'})
    .merge(snoDB[['ncbi_id', 'host_gene_name', 'target_count', 'is_expressed']],
           on='host_gene_name')
)
snoRNA_gene['ncbi_id'] = snoRNA_gene['ncbi_id'].astype(str) + '#snoRNA'
snoRNA_gene = snoRNA_gene.drop(columns=['host_gene_name'])
snoRNA_gene

In [None]:
# Write the unique (snoRNA id, host gene id) pairs as a header-less TSV.
(
    snoRNA_gene[['ncbi_id', '0_y']]
    .drop_duplicates()
    .to_csv(edge_data_location + 'RsnoRNA-gene.txt', sep='\t', header=None, index=None)
)

***
### snoRNA-miRNA

* [snoDB](https://bioinfo-scottgroup.med.usherbrooke.ca/snoDB/)

In [None]:
# snoRNAs that list at least one miRNA target; rows lacking either the target
# list or the NCBI id are removed, and the id is tagged with '#snoRNA'.
snoRNA_miRNA = (
    snoDB[['ncbi_id', 'mirna_targets']]
    .dropna(subset=['mirna_targets'])
    .dropna(subset=['ncbi_id'])
    .assign(ncbi_id=lambda frame: pd.to_numeric(frame['ncbi_id'], downcast='integer'))
    .assign(ncbi_id=lambda frame: frame['ncbi_id'].astype(str) + '#snoRNA')
)
snoRNA_miRNA

In [None]:
# Hand-curated miRBase accessions, assigned positionally (one per row, in row order).
# NOTE(review): this only works while exactly two rows survive the filters in
# the previous cell; any other row count makes the assignment raise. Confirm
# against the snoDB release in use.
snoRNA_miRNA['miRBase_id'] = ['', 'MI0000075']
snoRNA_miRNA.drop(columns='mirna_targets',inplace=True)
# Up to now, no miRBase ID is associated with AC008521
# (presumably the row given the empty string above).

* [miRNet](https://www.mirnet.ca/miRNet/)

In [None]:
# https://www.dropbox.com/s/gpt1yrwoe1h2gx7/miRNet-mir-sncRNA.csv?dl=0
snoRNA_miRNA2 = pd.read_csv(unprocessed_data_location + 'miRNet-mir-sncRNA.csv')
# Keep the small nucleolar RNA entries. na=False treats a missing gene name as
# "no match"; without it a single NaN yields a non-boolean mask and the
# selection raises.
snoRNA_miRNA2 = snoRNA_miRNA2[snoRNA_miRNA2.gene_name.str.contains('small nucleolar', na=False)]
# drop()/rename() return new frames, so the edits below never write into a
# filtered slice of the file that was read (no SettingWithCopyWarning).
snoRNA_miRNA2 = snoRNA_miRNA2.drop(columns=['mirnet','mir_id','symbol','embl','gene_name','mbv'])
snoRNA_miRNA2['entrez'] = snoRNA_miRNA2['entrez'].astype(str)+'#snoRNA'
# Align the column names with snoRNA_miRNA so both sources can be stacked.
snoRNA_miRNA2 = snoRNA_miRNA2.rename(columns={'mir_acc':'miRBase_id', 'entrez':'ncbi_id'})
snoRNA_miRNA2

In [None]:
# Stack the snoDB and miRNet pairs (pd.concat replaces DataFrame.append, which
# was removed in pandas 2.0).
snoRNA_miRNA = pd.concat([snoRNA_miRNA, snoRNA_miRNA2])

# Accessions starting with 'MIMAT' go to the miRNA file, every other accession
# to the pre-miRNA file. Rows with an empty or missing accession are written to
# neither: previously an empty string counted as "not MIMAT" and produced a
# pre-miRNA edge with a blank target.
has_accession = snoRNA_miRNA['miRBase_id'].fillna('').ne('')
is_mature = snoRNA_miRNA['miRBase_id'].str.startswith('MIMAT', na=False)
maturesnoRNA_miRNA = snoRNA_miRNA[is_mature]
presnoRNA_miRNA = snoRNA_miRNA[has_accession & ~is_mature]

maturesnoRNA_miRNA[['ncbi_id','miRBase_id']].drop_duplicates().to_csv(
    edge_data_location + 'RsnoRNA-miRNA.txt', header=None, sep='\t', index=None)
presnoRNA_miRNA[['ncbi_id','miRBase_id']].drop_duplicates().to_csv(
    edge_data_location + 'RsnoRNA-premiRNA.txt', header=None, sep='\t', index=None)

***
### snoRNA-snoRNA

* [snoDB](https://bioinfo-scottgroup.med.usherbrooke.ca/snoDB/)

In [None]:
# snoRNAs that list at least one snoRNA target.
snoRNA_snoRNA = snoDB[['ncbi_id', 'snorna_targets']].dropna(subset=['snorna_targets'])
snoRNA_snoRNA

In [None]:
# One row per (snoRNA, target symbol): the targets arrive as a ';'-separated
# string and are split and exploded.
snoRNA_snoRNA = (
    snoRNA_snoRNA
    .assign(processed_targets=snoRNA_snoRNA['snorna_targets'].str.split(';'))
    .explode('processed_targets')
    .drop('snorna_targets', axis=1)
)
# Resolve the target symbol through symbol_entrez_map (joined on '0_x').
snoRNA_snoRNA = (
    symbol_entrez_map
    .rename(columns={'0_x': 'processed_targets'})
    .merge(snoRNA_snoRNA, on='processed_targets')
)
snoRNA_snoRNA['ncbi_id'] = snoRNA_snoRNA['ncbi_id'].astype('Int64').astype(str) + '#snoRNA'
snoRNA_snoRNA = snoRNA_snoRNA.drop(columns=['processed_targets'])
# A missing id renders as '<NA>' after the string cast; such rows are removed.
snoRNA_snoRNA = snoRNA_snoRNA.loc[lambda frame: ~frame['ncbi_id'].str.startswith('<NA>')]
# NOTE(review): unlike the sibling cells, '0_y' is cast with astype(str) only
# (no Int64 step) — confirm its dtype cannot yield values such as '123.0'.
snoRNA_snoRNA['0_y'] = snoRNA_snoRNA['0_y'].astype(str) + '#snoRNA'
snoRNA_snoRNA

In [None]:
# Write the unique pairs as a header-less TSV.
# NOTE(review): the mapped target ('0_y') is written first here, whereas the
# other snoRNA edge files put 'ncbi_id' first — confirm the order is intended.
(
    snoRNA_snoRNA[['0_y', 'ncbi_id']]
    .drop_duplicates()
    .to_csv(edge_data_location + 'RsnoRNA-snoRNA.txt', sep='\t', header=None, index=None)
)

***
### snoRNA-lncRNA

* [snoDB](https://bioinfo-scottgroup.med.usherbrooke.ca/snoDB/)

In [None]:
# snoRNAs with lncRNA targets, one row per (snoRNA, target symbol): the
# ';'-separated target string is split and exploded.
snoRNA_lncRNA = (
    snoDB[['ncbi_id', 'lncrna_targets']]
    .dropna(subset=['lncrna_targets'])
    .assign(lncrna_targets=lambda frame: frame['lncrna_targets'].str.split(';'))
    .explode('lncrna_targets')
)

snoRNA_lncRNA.head()

In [None]:
# Resolve each target symbol through symbol_entrez_map (joined on '0_x'), then
# tag both ends of the edge with their RNA class.
snoRNA_lncRNA = (
    symbol_entrez_map
    .rename(columns={'0_x': 'lncrna_targets'})
    .merge(snoRNA_lncRNA, on='lncrna_targets')
)
snoRNA_lncRNA['ncbi_id'] = snoRNA_lncRNA['ncbi_id'].astype('Int64').astype(str) + '#snoRNA'
snoRNA_lncRNA['0_y'] = snoRNA_lncRNA['0_y'].astype('Int64').astype(str) + '#lncRNA'
snoRNA_lncRNA = snoRNA_lncRNA.drop(columns=['lncrna_targets'])
# A missing snoRNA id renders as '<NA>' after the string cast; drop those rows.
snoRNA_lncRNA = snoRNA_lncRNA.loc[lambda frame: ~frame['ncbi_id'].str.startswith('<NA>')]
snoRNA_lncRNA

In [None]:
# Write the unique (snoRNA, lncRNA) pairs as a header-less TSV.
# NOTE: the same file is written again later in the notebook, once the lncRNA
# targets found among snoDB's ncRNA/other targets have been added.
(
    snoRNA_lncRNA[['ncbi_id', '0_y']]
    .drop_duplicates()
    .to_csv(edge_data_location + 'RsnoRNA-lncRNA.txt', sep='\t', header=None, index=None)
)

***
### snoRNA-snRNA

* [snoDB](https://bioinfo-scottgroup.med.usherbrooke.ca/snoDB/)

In [None]:
# snoRNAs with snRNA targets, one row per (snoRNA, target symbol): the
# ';'-separated target string is split and exploded.
snoRNA_snRNA = (
    snoDB[['ncbi_id', 'snrna_targets']]
    .dropna(subset=['snrna_targets'])
    .assign(snrna_targets=lambda frame: frame['snrna_targets'].str.split(';'))
    .explode('snrna_targets')
)

snoRNA_snRNA

In [None]:
# Resolve each target symbol through symbol_entrez_map (joined on '0_x'), then
# tag both ends of the edge with their RNA class.
snoRNA_snRNA = (
    symbol_entrez_map
    .rename(columns={'0_x': 'snrna_targets'})
    .merge(snoRNA_snRNA, on='snrna_targets')
)
snoRNA_snRNA['ncbi_id'] = snoRNA_snRNA['ncbi_id'].astype('Int64').astype(str) + '#snoRNA'
snoRNA_snRNA['0_y'] = snoRNA_snRNA['0_y'].astype('Int64').astype(str) + '#snRNA'
snoRNA_snRNA = snoRNA_snRNA.drop(columns=['snrna_targets'])
# A missing snoRNA id renders as '<NA>' after the string cast; drop those rows.
snoRNA_snRNA = snoRNA_snRNA.loc[lambda frame: ~frame['ncbi_id'].str.startswith('<NA>')]
snoRNA_snRNA.head()

In [None]:
# Write the unique (snoRNA, snRNA) pairs as a header-less TSV.
(
    snoRNA_snRNA[['ncbi_id', '0_y']]
    .drop_duplicates()
    .to_csv(edge_data_location + 'RsnoRNA-snRNA.txt', sep='\t', header=None, index=None)
)

***
### snoRNA-rRNA

* [snoDB](https://bioinfo-scottgroup.med.usherbrooke.ca/snoDB/)

In [None]:
# snoRNAs with rRNA targets, one row per (snoRNA, target): the ';'-separated
# target string is split and exploded.
snoRNA_rRNA = (
    snoDB[['ncbi_id', 'rrna_targets']]
    .dropna(subset=['rrna_targets'])
    .assign(rrna_targets=lambda frame: frame['rrna_targets'].str.split(';'))
    .explode('rrna_targets')
)

snoRNA_rRNA

In [None]:
# rRNA targets are not mapped to another identifier: the snoDB label is kept
# and tagged '#snoDBrRNA', while the snoRNA id is tagged '#snoRNA'.
snoRNA_rRNA = snoRNA_rRNA.assign(
    ncbi_id=snoRNA_rRNA['ncbi_id'].astype('Int64').astype(str) + '#snoRNA',
    rrna_targets=snoRNA_rRNA['rrna_targets'].astype(str) + '#snoDBrRNA',
)
# A missing snoRNA id renders as '<NA>' after the string cast; drop those rows.
snoRNA_rRNA = snoRNA_rRNA.loc[lambda frame: ~frame['ncbi_id'].str.startswith('<NA>')]
snoRNA_rRNA

In [None]:
# Write the unique (snoRNA, rRNA) pairs as a header-less TSV.
(
    snoRNA_rRNA[['ncbi_id', 'rrna_targets']]
    .drop_duplicates()
    .to_csv(edge_data_location + 'RsnoRNA-rRNA.txt', sep='\t', header=None, index=None)
)

***
### snoRNA-mRNA

* [snoDB](https://bioinfo-scottgroup.med.usherbrooke.ca/snoDB/)

In [None]:
# snoRNAs with protein-coding targets, one row per (snoRNA, target symbol):
# the ';'-separated target string is split and exploded.
snoRNA_mRNA = (
    snoDB[['ncbi_id', 'protein_coding_targets']]
    .dropna(subset=['protein_coding_targets'])
    .assign(protein_coding_targets=lambda frame: frame['protein_coding_targets'].str.split(';'))
    .explode('protein_coding_targets')
)

snoRNA_mRNA

In [None]:
# Resolve each target symbol through symbol_entrez_map (joined on '0_x'), then
# tag both ends of the edge with their RNA class.
snoRNA_mRNA = (
    symbol_entrez_map
    .rename(columns={'0_x': 'protein_coding_targets'})
    .merge(snoRNA_mRNA, on='protein_coding_targets')
)
snoRNA_mRNA['ncbi_id'] = snoRNA_mRNA['ncbi_id'].astype('Int64').astype(str) + '#snoRNA'
snoRNA_mRNA['0_y'] = snoRNA_mRNA['0_y'].astype('Int64').astype(str) + '#mRNA'
snoRNA_mRNA = snoRNA_mRNA.drop(columns=['protein_coding_targets'])
# A missing snoRNA id renders as '<NA>' after the string cast; drop those rows.
snoRNA_mRNA = snoRNA_mRNA.loc[lambda frame: ~frame['ncbi_id'].str.startswith('<NA>')]
snoRNA_mRNA

In [None]:
# Write the unique (snoRNA, mRNA) pairs as a header-less TSV.
(
    snoRNA_mRNA[['ncbi_id', '0_y']]
    .drop_duplicates()
    .to_csv(edge_data_location + 'RsnoRNA-mRNA.txt', sep='\t', header=None, index=None)
)

***
### snoRNA-tRNA

* [snoDB](https://bioinfo-scottgroup.med.usherbrooke.ca/snoDB/)

In [None]:
# snoRNAs with tRNA targets, one row per (snoRNA, target): the ';'-separated
# target string is split and exploded. assign() works on a new frame, so no
# write ever lands on a filtered slice of snoDB.
snoRNA_tRNA = (
    snoDB[['ncbi_id', 'trna_targets']]
    .dropna(subset=['trna_targets'])
    .assign(trna_targets=lambda frame: frame['trna_targets'].str.split(';'))
    .explode('trna_targets')
)
# Strip the '_TRNA' marker and any remaining underscores from the target
# label. regex=False pins literal replacement: the default of str.replace
# differs between pandas 1.x and 2.x.
snoRNA_tRNA['trna_targets'] = (
    snoRNA_tRNA['trna_targets']
    .astype(str)
    .str.replace('_TRNA', '', regex=False)
    .str.replace('_', '', regex=False)
)

snoRNA_tRNA['ncbi_id'] = snoRNA_tRNA['ncbi_id'].astype('Int64').astype(str) + '#snoRNA'

snoRNA_tRNA

In [None]:
# Write the unique (snoRNA, tRNA) pairs as a header-less TSV.
# NOTE: the same file is written again later in the notebook, once the tRNA
# entries found among snoDB's "other" targets have been added.
(
    snoRNA_tRNA[['ncbi_id', 'trna_targets']]
    .drop_duplicates()
    .to_csv(edge_data_location + 'RsnoRNA-tRNA.txt', sep='\t', header=None, index=None)
)

***
### snoRNA-(non-specifically-classified) ncRNA

* [snoDB](https://bioinfo-scottgroup.med.usherbrooke.ca/snoDB/)

In [None]:
# snoRNAs with generic ncRNA targets, one row per (snoRNA, target symbol): the
# ';'-separated target string is split and exploded.
snoRNA_ncRNA = (
    snoDB[['ncbi_id', 'ncrna_targets']]
    .dropna(subset=['ncrna_targets'])
    .assign(ncrna_targets=lambda frame: frame['ncrna_targets'].str.split(';'))
    .explode('ncrna_targets')
)

snoRNA_ncRNA

In [None]:
# Rebuild symbol_entrez_map with one extra column: join symbol_ensembl_map
# with the Entrez-gene / Ensembl-transcript map on their shared column 1 and
# keep '0_x', '0_y' and '3_x' ('3_x' is what the following cells use to split
# the targets by type).
entrez_enst_map = pd.read_csv(
    processed_data_location + 'ENTREZ_GENE_ENSEMBL_TRANSCRIPT_MAP.txt', sep="\t", header=None)
symbol_entrez_map = symbol_ensembl_map.merge(entrez_enst_map, on=[1])[['0_x', '0_y', '3_x']]
symbol_entrez_map

In [None]:
# Resolve each ncRNA target symbol through symbol_entrez_map (joined on '0_x')
# and tag the snoRNA id.
snoRNA_ncRNA = (
    symbol_entrez_map
    .rename(columns={'0_x': 'ncrna_targets'})
    .merge(snoRNA_ncRNA, on='ncrna_targets')
)
snoRNA_ncRNA['ncbi_id'] = snoRNA_ncRNA['ncbi_id'].astype('Int64').astype(str) + '#snoRNA'
snoRNA_ncRNA = snoRNA_ncRNA.drop(columns=['ncrna_targets'])
# A missing snoRNA id renders as '<NA>' after the string cast; drop those rows.
snoRNA_ncRNA = snoRNA_ncRNA.loc[lambda frame: ~frame['ncbi_id'].str.startswith('<NA>')]
snoRNA_ncRNA

In [None]:
# Tag every target id with its own type taken from '3_x' ('<id>#<type>').
# NOTE: re-running this cell appends the suffix a second time.
snoRNA_ncRNA = snoRNA_ncRNA.assign(
    **{'0_y': snoRNA_ncRNA['0_y'].astype(str) + '#' + snoRNA_ncRNA['3_x'].astype(str)}
)
snoRNA_ncRNA

In [None]:
# List the distinct target types present; the cells below split on these values.
snoRNA_ncRNA['3_x'].unique()

In [None]:
# ncRNA targets whose type is 'lncRNA'.
snoRNA_lncRNA2 = snoRNA_ncRNA.loc[snoRNA_ncRNA['3_x'] == 'lncRNA', ['ncbi_id', '0_y']]
snoRNA_lncRNA2

In [None]:
# Add the lncRNA targets found among the generic ncRNA targets to the
# dedicated lncRNA pairs. pd.concat replaces DataFrame.append, which was
# removed in pandas 2.0.
# NOTE: re-running this cell stacks the rows again; duplicates are removed
# only when the edge file is written.
snoRNA_lncRNA = pd.concat([snoRNA_lncRNA[['ncbi_id','0_y']], snoRNA_lncRNA2])

In [None]:
# ncRNA targets whose type is 'retained_intron'.
snoRNA_ri = snoRNA_ncRNA.loc[snoRNA_ncRNA['3_x'] == 'retained_intron', ['ncbi_id', '0_y']]
snoRNA_ri

In [None]:
# Write the unique (snoRNA, retained-intron) pairs as a header-less TSV.
(
    snoRNA_ri[['ncbi_id', '0_y']]
    .drop_duplicates()
    .to_csv(edge_data_location + 'RsnoRNA-retainedIntron.txt', sep='\t', header=None, index=None)
)

In [None]:
# ncRNA targets whose type is 'misc_RNA'.
snoRNA_miscRNA = snoRNA_ncRNA.loc[snoRNA_ncRNA['3_x'] == 'misc_RNA', ['ncbi_id', '0_y']]
snoRNA_miscRNA

In [None]:
# Write the unique (snoRNA, misc_RNA) pairs as a header-less TSV.
(
    snoRNA_miscRNA[['ncbi_id', '0_y']]
    .drop_duplicates()
    .to_csv(edge_data_location + 'RsnoRNA-miscRNA.txt', sep='\t', header=None, index=None)
)

In [None]:
# ncRNA targets whose type is 'scaRNA'.
snoRNA_scaRNA = snoRNA_ncRNA.loc[snoRNA_ncRNA['3_x'] == 'scaRNA', ['ncbi_id', '0_y']]
snoRNA_scaRNA.head()

In [None]:
# Write the unique (snoRNA, scaRNA) pairs as a header-less TSV.
# NOTE: the same file is written again later in the notebook, once the scaRNA
# entries found among snoDB's "other" targets have been added.
(
    snoRNA_scaRNA[['ncbi_id', '0_y']]
    .drop_duplicates()
    .to_csv(edge_data_location + 'RsnoRNA-scaRNA.txt', sep='\t', header=None, index=None)
)

In [None]:
# The type column was only needed to split the ncRNA targets; return the map
# to its two-column ('0_x', '0_y') layout for the sections that follow.
symbol_entrez_map = symbol_entrez_map.drop(columns=['3_x'])

***
### snoRNA-pseudogene

* [snoDB](https://bioinfo-scottgroup.med.usherbrooke.ca/snoDB/)

In [None]:
# snoRNAs with pseudogene targets: tag the snoRNA id, then produce one row per
# (snoRNA, target) by splitting the ';'-separated target string.
snoRNA_pseudogene = (
    snoDB[['ncbi_id', 'pseudogene_targets']]
    .dropna(subset=['pseudogene_targets'])
    .assign(
        ncbi_id=lambda frame: frame['ncbi_id'].astype('Int64').astype(str) + '#snoRNA',
        pseudogene_targets=lambda frame: frame['pseudogene_targets'].str.split(';'),
    )
    .explode('pseudogene_targets')
)

snoRNA_pseudogene

In [None]:
# Hand-curated target identifiers, assigned positionally: the list replaces
# the exploded target labels row by row.
# NOTE(review): this only works while the frame has exactly ten rows in this
# order; a different snoDB release would either raise or silently mis-assign
# ids. Confirm against the file in use.
snoRNA_pseudogene['pseudogene_targets'] = ['107075265','100287215','106481730','26121',
                                           '100420364','401914','100420656','26121',
                                           '26121','100873211']
# Tag each target id as a pseudogene.
snoRNA_pseudogene['pseudogene_targets'] = snoRNA_pseudogene['pseudogene_targets']+'#pseudo'
snoRNA_pseudogene

In [None]:
# Write the unique (snoRNA, pseudogene) pairs as a header-less TSV.
(
    snoRNA_pseudogene[['ncbi_id', 'pseudogene_targets']]
    .drop_duplicates()
    .to_csv(edge_data_location + 'RsnoRNA-pseudogene.txt', sep='\t', header=None, index=None)
)

***
### snoRNA-(miscellaneous of) other targets

* [snoDB](https://bioinfo-scottgroup.med.usherbrooke.ca/snoDB/)

In [None]:
# snoRNAs with targets listed under "other": tag the snoRNA id, then produce
# one row per (snoRNA, target) by splitting the ';'-separated target string.
snoRNA_other = (
    snoDB[['ncbi_id', 'other_targets']]
    .dropna(subset=['other_targets'])
    .assign(
        ncbi_id=lambda frame: frame['ncbi_id'].astype('Int64').astype(str) + '#snoRNA',
        other_targets=lambda frame: frame['other_targets'].str.split(';'),
    )
    .explode('other_targets')
)

snoRNA_other

In [None]:
# Hand-curated replacement of the exploded "other" targets, assigned
# positionally: each entry is the identifier to use for the row in that position.
# NOTE(review): both lists assume exactly nine rows in this order; a different
# snoDB release would either raise or silently mis-assign. Confirm against the
# file in use.
snoRNA_other['other_targets'] = ['tRNA-SeC-TCA-1-1','3653#lncRNA','tRNA-SeC-TCA-1-1','3653#lncRNA',
                                 '3653#lncRNA','3653#lncRNA','3653#lncRNA','106633801#scaRNA','106633801#scaRNA']

# Target type of each row, used below to route the rows to the matching edge file.
snoRNA_other['type'] = ['tRNA','lncRNA','tRNA','lncRNA','lncRNA','lncRNA','lncRNA','scaRNA','scaRNA']
snoRNA_other

In [None]:
# Re-write the snoRNA-tRNA edge file with the tRNA entries found among the
# "other" targets added.
# The extra rows name their target column 'other_targets'; it is renamed to
# 'trna_targets' first. Stacking frames whose columns differ aligns by name,
# which used to yield a third, half-empty column and a ragged TSV.
# pd.concat also replaces DataFrame.append (removed in pandas 2.0).
(
    pd.concat([
        snoRNA_tRNA[['ncbi_id', 'trna_targets']],
        snoRNA_other.loc[snoRNA_other['type'] == 'tRNA', ['ncbi_id', 'other_targets']]
        .rename(columns={'other_targets': 'trna_targets'}),
    ])
    .drop_duplicates()
    .to_csv(edge_data_location + 'RsnoRNA-tRNA.txt', header=None, sep='\t', index=None)
)

In [None]:
# Re-write the snoRNA-scaRNA edge file with the scaRNA entries found among the
# "other" targets added.
# The extra rows name their target column 'other_targets'; it is renamed to
# '0_y' first. Stacking frames whose columns differ aligns by name, which used
# to yield a third, half-empty column and a ragged TSV.
# pd.concat also replaces DataFrame.append (removed in pandas 2.0).
(
    pd.concat([
        snoRNA_scaRNA[['ncbi_id', '0_y']],
        snoRNA_other.loc[snoRNA_other['type'] == 'scaRNA', ['ncbi_id', 'other_targets']]
        .rename(columns={'other_targets': '0_y'}),
    ])
    .drop_duplicates()
    .to_csv(edge_data_location + 'RsnoRNA-scaRNA.txt', header=None, sep='\t', index=None)
)

In [None]:
# Re-write the snoRNA-lncRNA edge file with the lncRNA entries found among the
# "other" targets added.
# Only the two edge columns are kept, and the extra rows' 'other_targets'
# column is renamed to '0_y' first. Stacking frames whose columns differ
# aligns by name, which used to yield an extra, half-empty column and a
# ragged TSV.
# pd.concat also replaces DataFrame.append (removed in pandas 2.0).
(
    pd.concat([
        snoRNA_lncRNA[['ncbi_id', '0_y']],
        snoRNA_other.loc[snoRNA_other['type'] == 'lncRNA', ['ncbi_id', 'other_targets']]
        .rename(columns={'other_targets': '0_y'}),
    ])
    .drop_duplicates()
    .to_csv(edge_data_location + 'RsnoRNA-lncRNA.txt', header=None, sep='\t', index=None)
)

***
### lncRNA-gene

* [LncRNAWiki](https://ngdc.cncb.ac.cn/lncrnawiki/)

In [None]:
# Load LncRNAWiki and keep the records whose target type mentions 'PCG', one
# row per (lncRNA symbol, target symbol).
LncRNAWiki = pd.read_csv(unprocessed_data_location + 'LncRNAWiki_BrowseDownload.csv')
lncRNA_gene = (
    LncRNAWiki
    .dropna(subset=['target_type'])
    .loc[lambda frame: frame['target_type'].str.contains('PCG'),
         ['symbol', 'target', 'target_effect', 'pmid']]
    .assign(target=lambda frame: frame['target'].str.split(';'))
    .explode('target')
)
# Resolve the target first and the lncRNA second. Both joins bring in a '0_y'
# column, so pandas suffixes them: '0_y_x' is the target, '0_y_y' the lncRNA.
lncRNA_gene = (
    lncRNA_gene
    .merge(symbol_entrez_map.rename(columns={'0_x': 'target'}), on='target')
    .drop(columns=['target'])
    .merge(symbol_entrez_map.rename(columns={'0_x': 'symbol'}), on='symbol')
    .drop(columns=['symbol'])
)
lncRNA_gene['0_y_y'] = lncRNA_gene['0_y_y'].astype(str) + '#lncRNA'
lncRNA_gene

In [None]:
# Write the unique (lncRNA, target gene) pairs as a header-less TSV.
(
    lncRNA_gene[['0_y_y', '0_y_x']]
    .drop_duplicates()
    .to_csv(edge_data_location + 'RlncRNA-gene.txt', sep='\t', header=None, index=None)
)

***
### lncRNA-disease

* [LncRNADisease](http://www.rnanut.net/lncrnadisease/) <br /> LncRNADisease integrates comprehensive experimentally supported and predicted ncRNA-disease associations, manually curated from the literature and from other resources.

In [None]:
#data_downloader('http://www.rnanut.net/lncrnadisease/static/download/experimental%20lncRNA-disease%20information.xlsx', unprocessed_data_location)

In [None]:
lncRNA_disease = pd.read_excel(unprocessed_data_location + 'experimental%20lncRNA-disease%20information.xlsx')
# We keep only rows dealing with HS
# na=False: a record without a species is treated as non-human; without it a
# single missing value yields a non-boolean mask and the selection raises.
lncRNA_disease = lncRNA_disease[lncRNA_disease['Species'].str.contains("sapiens", na=False)]
# drop() returns a new frame, so nothing is modified in place on the filtered
# slice (no SettingWithCopyWarning).
lncRNA_disease = lncRNA_disease.drop(columns=['ncRNA Category','Species'])
lncRNA_disease

In [None]:
# Normalise the disease name to lower case and use it as the join key 'desc'
# against desc_do_map; the key is dropped once the mapping is attached.
lncRNA_disease = (
    lncRNA_disease
    .rename(columns={"Disease Name": "desc"})
    .assign(desc=lambda frame: frame['desc'].str.lower())
)
lncRNA_disease = desc_do_map.merge(lncRNA_disease, on=['desc']).drop(columns=['desc'])
lncRNA_disease

***
* [Lnc2Cancer](http://bio-bigdata.hrbmu.edu.cn/lnc2cancer/index.html) <br /> Lnc2Cancer is a manually curated database that provides comprehensive experimentally supported associations between lncRNA or circRNA and human cancer.

In [None]:
#http://bio-bigdata.hrbmu.edu.cn/lnc2cancer/download/lncRNA.xlsx
lncRNA_disease2 = pd.read_excel(unprocessed_data_location+'lncRNA.xlsx')  
lncRNA_disease2 = pd.merge(desc_do_map, lncRNA_disease2, left_on=['desc'], right_on=['cancer type'])
lncRNA_disease2.drop(columns=['desc','cancer type'],inplace=True)
lncRNA_disease2.rename(columns={'name':'ncRNA Symbol'},inplace=True)
lncRNA_disease2

In [None]:
# Add the Lnc2Cancer records; (disease, lncRNA) pairs present in only one of
# the two sources are kept (outer join).
lncRNA_disease = pd.merge(lncRNA_disease, lncRNA_disease2, how='outer', on=['doid','ncRNA Symbol'])

# Fold every Lnc2Cancer column into its LncRNADisease counterpart:
#   * where LncRNADisease has no value, take the Lnc2Cancer one;
#   * where both have a value and they differ, join them with ' + '.
# The fill is assigned back to the frame: fillna(inplace=True) on a column
# selection may act on a temporary object and is a no-op under pandas
# copy-on-write.
for kept, folded in [('Sample', 'sample'),
                     ('Dysfunction Pattern', 'regulated'),
                     ('Validated Method', 'methods')]:
    lncRNA_disease[kept] = lncRNA_disease[kept].fillna(lncRNA_disease[folded])
    differs = (lncRNA_disease[kept].notna() & lncRNA_disease[folded].notna()
               & (lncRNA_disease[kept] != lncRNA_disease[folded]))
    # Concatenate on the affected rows only.
    lncRNA_disease.loc[differs, kept] = (
        lncRNA_disease.loc[differs, kept] + ' + ' + lncRNA_disease.loc[differs, folded])
    lncRNA_disease = lncRNA_disease.drop(columns=[folded])

lncRNA_disease

***
* [LncRNAWiki](https://ngdc.cncb.ac.cn/lncrnawiki/)

In [None]:
# LncRNAWiki was loaded for the lncRNA-gene section; reload it with
#   LncRNAWiki = pd.read_csv(unprocessed_data_location+'LncRNAWiki_BrowseDownload.csv')
# when running this section on its own.
# Keep the records whose biological context mentions a disease ('isease'
# matches both 'Disease' and 'disease') and rename the join columns.
lncRNA_disease2 = (
    LncRNAWiki
    .dropna(subset=['biological_context'])
    .loc[lambda frame: frame['biological_context'].str.contains('isease'),
         ['symbol', 'context_detail', 'clinical_detail', 'tissue/cell line', 'description', 'pmid']]
    .rename(columns={'context_detail': 'desc', 'symbol': 'ncRNA Symbol'})
)
lncRNA_disease2

In [None]:
# Lower-case the disease description before joining, as is done for the other
# sources matched against desc_do_map.
# NOTE(review): assumes desc_do_map['desc'] is lower-case — confirm.
lncRNA_disease2 = lncRNA_disease2.assign(desc=lncRNA_disease2['desc'].str.lower())
lncRNA_disease2 = pd.merge(lncRNA_disease2, desc_do_map, on='desc')
lncRNA_disease2 = lncRNA_disease2.drop(columns='desc')
# Add the LncRNAWiki records; pairs present in only one side are kept.
lncRNA_disease = pd.merge(lncRNA_disease, lncRNA_disease2, how='outer', on=['doid','ncRNA Symbol'])

# PubMed ids are compared and concatenated as text. Only values that are
# present are converted: a blanket astype(str) turns a missing id into the
# text 'nan', which blocks the fill below and yields entries like 'nan + 123'.
for id_column in ('PubMed ID', 'pmid'):
    lncRNA_disease[id_column] = lncRNA_disease[id_column].where(
        lncRNA_disease[id_column].isna(), lncRNA_disease[id_column].astype(str))

# Fold every LncRNAWiki column into its counterpart:
#   * where the kept column has no value, take the LncRNAWiki one;
#   * where both have a value and they differ, join them with ' + '.
# The fill is assigned back to the frame: fillna(inplace=True) on a column
# selection is a no-op under pandas copy-on-write.
for kept, folded in [('Sample', 'tissue/cell line'),
                     ('PubMed ID', 'pmid')]:
    lncRNA_disease[kept] = lncRNA_disease[kept].fillna(lncRNA_disease[folded])
    differs = (lncRNA_disease[kept].notna() & lncRNA_disease[folded].notna()
               & (lncRNA_disease[kept] != lncRNA_disease[folded]))
    lncRNA_disease.loc[differs, kept] = (
        lncRNA_disease.loc[differs, kept] + ' + ' + lncRNA_disease.loc[differs, folded])
    lncRNA_disease = lncRNA_disease.drop(columns=[folded])

lncRNA_disease

***
* [LncBook](https://ngdc.cncb.ac.cn/lncbook/)

In [None]:
# https://ngdc.cncb.ac.cn/lncbook/files/variation_LncBook2.0.csv.gz
lncRNA_disease2 = pd.read_csv(unprocessed_data_location+'variation_LncBook2.0.csv')
# Keep the variants that have both a COSMIC tumour name and a gene symbol
# ('-' marks a missing value in this file). .copy() detaches the selection so
# the column edits below do not raise SettingWithCopyWarning.
has_tumor_and_symbol = (lncRNA_disease2['COSMIC Tumor Name'] != '-') & (lncRNA_disease2['Symbol'] != '-')
lncRNA_disease2 = lncRNA_disease2.loc[
    has_tumor_and_symbol, ['Symbol','dbSNP ID','COSMIC Tumor Name']].copy()
# One row per tumour name.
lncRNA_disease2['COSMIC Tumor Name'] = lncRNA_disease2['COSMIC Tumor Name'].str.split(';')
lncRNA_disease2 = lncRNA_disease2.explode('COSMIC Tumor Name')
# Remove parenthesised qualifiers, then trim: deleting "(...)" from
# "name (qualifier)" leaves a trailing blank that would stop the name from
# matching desc_do_map. Lower-case for the join, as for the other sources.
lncRNA_disease2['COSMIC Tumor Name'] = (
    lncRNA_disease2['COSMIC Tumor Name']
    .str.replace(r"\(.*?\)", "", regex=True)
    .str.strip()
    .str.lower()
)
lncRNA_disease2 = lncRNA_disease2.rename(columns={'COSMIC Tumor Name':'desc','Symbol':'ncRNA Symbol'})
lncRNA_disease2 = pd.merge(lncRNA_disease2, desc_do_map, on='desc')
lncRNA_disease2 = lncRNA_disease2.drop(columns='desc')
lncRNA_disease2

In [None]:
# Add the LncBook records (outer join), then resolve the lncRNA symbol through
# symbol_entrez_map (joined on '0_x').
lncRNA_disease = (
    lncRNA_disease
    .merge(lncRNA_disease2, how='outer', on=['doid', 'ncRNA Symbol'])
    .merge(symbol_entrez_map.rename(columns={'0_x': 'ncRNA Symbol'}), on='ncRNA Symbol')
)
# Move the resolved id to the first column and tag it as a lncRNA.
lncRNA_disease.insert(0, '0_y', lncRNA_disease.pop('0_y'))
lncRNA_disease['0_y'] = lncRNA_disease['0_y'].astype(str) + '#lncRNA'

# Translate the DOID through doid_mondo_map (its column 0 is the DOID) and
# drop the DOID afterwards.
lncRNA_disease = (
    lncRNA_disease
    .merge(doid_mondo_map.rename(columns={0: 'doid'}), on=['doid'])
    .drop(columns=['doid'])
)

lncRNA_disease

In [None]:
# Write the unique (lncRNA, mapped disease id) pairs as a header-less TSV.
(
    lncRNA_disease[['0_y', 1]]
    .drop_duplicates()
    .to_csv(edge_data_location + 'RlncRNA-disease.txt', sep='\t', header=None, index=None)
)

***
### circRNA-disease

* [LncRNADisease](http://www.rnanut.net/lncrnadisease/)

In [None]:
#http://www.rnanut.net/lncrnadisease/static/download/experimental%20circRNA-disease%20information.xlsx

circRNA_disease = pd.read_excel(unprocessed_data_location + 'experimental circRNA-disease information.xlsx')
# Same reasoning of lncRNA-disease: keep human records only.
# na=False: a record without a species is treated as non-human; without it a
# single missing value yields a non-boolean mask and the selection raises.
circRNA_disease = circRNA_disease[circRNA_disease['Species'].str.contains("sapiens", na=False)]
circRNA_disease = circRNA_disease[circRNA_disease['ncRNA Category'] == 'circRNA']
# drop() returns a new frame, so nothing is modified in place on the filtered
# slice (no SettingWithCopyWarning).
circRNA_disease = circRNA_disease.drop(columns=['ncRNA Category','Species'])
circRNA_disease

In [None]:
# Normalise the disease name to lower case and use it as the join key 'desc'
# against desc_do_map; the key is dropped once the mapping is attached.
circRNA_disease = (
    circRNA_disease
    .rename(columns={"Disease Name": "desc"})
    .assign(desc=lambda frame: frame['desc'].str.lower())
)
circRNA_disease = desc_do_map.merge(circRNA_disease, on=['desc']).drop(columns=['desc'])
circRNA_disease

***
* [Lnc2Cancer](http://bio-bigdata.hrbmu.edu.cn/lnc2cancer/index.html)

In [None]:
# Lnc2Cancer circRNA table, read straight from the website.
circRNA_disease2 = pd.read_excel('http://bio-bigdata.hrbmu.edu.cn/lnc2cancer/download/circRNA.xlsx')
# Lower-case the cancer type before joining, as is done for the LncRNADisease
# names that are matched against the same desc_do_map; without it any
# capitalised name would silently fail to match.
# NOTE(review): assumes desc_do_map['desc'] is lower-case — confirm.
circRNA_disease2['cancer type'] = circRNA_disease2['cancer type'].str.lower()
circRNA_disease2 = pd.merge(desc_do_map, circRNA_disease2, left_on=['desc'], right_on=['cancer type'])
circRNA_disease2 = circRNA_disease2.drop(columns=['desc','cancer type'])
# Align the symbol column with LncRNADisease so the two sources can be merged.
circRNA_disease2 = circRNA_disease2.rename(columns={'name':'ncRNA Symbol'})
circRNA_disease2

In [None]:
# Add the Lnc2Cancer records; (disease, circRNA) pairs present in only one of
# the two sources are kept (outer join).
circRNA_disease = pd.merge(circRNA_disease, circRNA_disease2, how='outer', on=['doid','ncRNA Symbol'])

# Fold every Lnc2Cancer column into its LncRNADisease counterpart:
#   * where LncRNADisease has no value, take the Lnc2Cancer one;
#   * where both have a value and they differ, join them with ' + '.
# The fill is assigned back to the frame: fillna(inplace=True) on a column
# selection may act on a temporary object and is a no-op under pandas
# copy-on-write.
for kept, folded in [('Sample', 'sample'),
                     ('Dysfunction Pattern', 'regulated'),
                     ('Validated Method', 'methods')]:
    circRNA_disease[kept] = circRNA_disease[kept].fillna(circRNA_disease[folded])
    differs = (circRNA_disease[kept].notna() & circRNA_disease[folded].notna()
               & (circRNA_disease[kept] != circRNA_disease[folded]))
    # Concatenate on the affected rows only.
    circRNA_disease.loc[differs, kept] = (
        circRNA_disease.loc[differs, kept] + ' + ' + circRNA_disease.loc[differs, folded])
    circRNA_disease = circRNA_disease.drop(columns=[folded])

circRNA_disease

In [None]:
# Resolve the circRNA symbol through symbol_entrez_map (joined on '0_x').
# circRNA_disease2 (Lnc2Cancer) is deliberately NOT merged here again: it was
# already merged and folded in the previous cell. A second outer join on the
# same keys adds no new (disease, circRNA) pair; it only re-adds the columns
# that were just folded and dropped, and multiplies rows when the keys repeat.
circRNA_disease = pd.merge(circRNA_disease, symbol_entrez_map.rename(columns={'0_x':'ncRNA Symbol'}), on='ncRNA Symbol')
# Move the resolved id to the first column and tag it as a circRNA.
circRNA_disease.insert(0,'0_y',circRNA_disease.pop('0_y'))
circRNA_disease['0_y'] = circRNA_disease['0_y'].astype(str)+'#circRNA'
circRNA_disease

In [None]:
# Translate the DOID through doid_mondo_map (its column 0 is the DOID) and
# drop the DOID afterwards.
circRNA_disease = (
    circRNA_disease
    .merge(doid_mondo_map.rename(columns={0: 'doid'}), on=['doid'])
    .drop(columns=['doid'])
)

# Write the unique (circRNA, mapped disease id) pairs as a header-less TSV.
(
    circRNA_disease[['0_y', 1]]
    .drop_duplicates()
    .to_csv(edge_data_location + 'RcircRNA-disease.txt', sep='\t', header=None, index=None)
)

***
### lncRNA-chemical

* [LncRNAWiki](https://ngdc.cncb.ac.cn/lncrnawiki/)

In [None]:
# LncRNAWiki was loaded for the lncRNA-gene section; reload it with
#   LncRNAWiki = pd.read_csv(unprocessed_data_location+'LncRNAWiki_BrowseDownload.csv')
# when running this section on its own.
# Records that name at least one drug, one row per (lncRNA symbol, drug).
lncRNA_chemical = (
    LncRNAWiki
    .dropna(subset=['drug'])
    [['symbol', 'drug', 'pmid']]
    .assign(drug=lambda frame: frame['drug'].str.split(';'))
    .explode('drug')
)
# Map the drug name through desc_chebi_map (its column 0 is the name) and the
# lncRNA symbol through symbol_entrez_map, then keep the mapped ids only.
lncRNA_chemical = (
    desc_chebi_map
    .rename(columns={0: 'drug'})
    .merge(lncRNA_chemical, on=['drug'])
    .merge(symbol_entrez_map.rename(columns={'0_x': 'symbol'}), on='symbol')
    .drop(columns=['drug', 'symbol'])
)
lncRNA_chemical['0_y'] = lncRNA_chemical['0_y'].astype(str) + '#lncRNA'
lncRNA_chemical

In [None]:
# Write the unique (lncRNA, mapped chemical id) pairs as a header-less TSV.
(
    lncRNA_chemical[['0_y', 1]]
    .drop_duplicates()
    .to_csv(edge_data_location + 'RlncRNA-chemical.txt', sep='\t', header=None, index=None)
)

***
### lncRNA-protein

* [LncBook](https://ngdc.cncb.ac.cn/lncbook/) <br /> LncBook accommodates a high-quality collection of human lncRNA genes and transcripts, and incorporates their abundant annotations at different omics levels, thereby enabling users to decipher functional signatures of lncRNAs in human diseases and different biological contexts. 

#### gene product of 

In [None]:
# Fetch the LncBook small-protein table into the unprocessed-data folder.
# NOTE(review): the next cell reads 'sprotein_LncBook2.0.csv' (no '.gz'), so
# data_downloader presumably decompresses the archive — confirm.
data_downloader('https://ngdc.cncb.ac.cn/lncbook/files/sprotein_LncBook2.0.csv.gz', unprocessed_data_location)

In [None]:
# Load the LncBook small-protein table, keep the rows that carry a gene
# symbol ('-' marks a missing one) and discard the columns not used here.
lncRNA_protein = (
    pd.read_csv(unprocessed_data_location + 'sprotein_LncBook2.0.csv')
    .loc[lambda frame: frame['Symbol'] != '-']
    .drop(columns=['Gene ID', 'Transcript ID', 'SmProt Loci', 'SmProt Protein Sequence'])
)
lncRNA_protein

In [None]:
# Resolve the lncRNA symbol through symbol_entrez_map (joined on '0_x'), tag
# the resolved id as a lncRNA and drop the symbol.
lncRNA_protein = lncRNA_protein.merge(
    symbol_entrez_map.rename(columns={'0_x': 'Symbol'}), on='Symbol')
lncRNA_protein['0_y'] = lncRNA_protein['0_y'].astype(str) + '#lncRNA'
lncRNA_protein = lncRNA_protein.drop(columns='Symbol')
lncRNA_protein

In [None]:
# Write the unique (small protein, lncRNA) pairs as a header-less TSV.
(
    lncRNA_protein[['SmProt ID', '0_y']]
    .drop_duplicates()
    .to_csv(edge_data_location + 'RsmallProtein-lncRNA.txt', sep='\t', header=None, index=None)
)

#### interacts with

In [None]:
# Fetch the LncBook lncRNA / RNA-binding-protein table into the unprocessed-data folder.
# NOTE(review): the next cell reads 'lncrna_rbp_LncBook2.0.csv' (no '.gz'), so
# data_downloader presumably decompresses the archive — confirm.
data_downloader('https://ngdc.cncb.ac.cn/lncbook/files/lncrna_rbp_LncBook2.0.csv.gz', unprocessed_data_location)

In [None]:
# Load the LncBook lncRNA-RBP table, keep the rows that carry a gene symbol
# ('-' marks a missing one) and discard the columns not used here.
lncRNA_protein = (
    pd.read_csv(unprocessed_data_location + 'lncrna_rbp_LncBook2.0.csv')
    .loc[lambda frame: frame['Symbol'] != '-']
    .drop(columns=['Gene ID', 'Transcript ID'])
)
lncRNA_protein

***
* [LncRNAWiki](https://ngdc.cncb.ac.cn/lncrnawiki/)

In [None]:
#LncRNAWiki = pd.read_csv(unprocessed_data_location+'LncRNAWiki_BrowseDownload.csv')
# Keep the LncRNAWiki records whose target_type mentions 'TF' or 'protein'.
lncRNA_protein2 = LncRNAWiki[LncRNAWiki['target_type'].notna()]
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the same two
# selections in the same order (a row matching both patterns appears twice, as
# before -- duplicates are removed when the edge file is written).
lncRNA_protein2 = pd.concat([
    lncRNA_protein2[lncRNA_protein2.target_type.str.contains('TF')],
    lncRNA_protein2[lncRNA_protein2.target_type.str.contains('protein')],
])
# .copy() so the column assignment below acts on an independent frame, not a slice.
lncRNA_protein2 = lncRNA_protein2[
    ['symbol','target','experimental_method','clinical_detail','description','pmid']].copy()
# A record may list several ';'-separated targets: emit one row per target.
lncRNA_protein2['target'] = lncRNA_protein2.target.str.split(';')
lncRNA_protein2 = lncRNA_protein2.explode('target')
# Align the key names with the LncBook table they are merged with next.
lncRNA_protein2 = lncRNA_protein2.rename(columns={'symbol':'Symbol', 'target':'Protein'})
lncRNA_protein2
# We don't care about miRNA wrongly labeled as TF as they will be discarded when terms will be mapped on PRO

In [None]:
# Union the LncBook and LncRNAWiki interactions, then resolve both endpoints:
# gene symbol -> '0_y' (tagged as lncRNA) and protein symbol -> symbol_to_pro columns.
lncRNA_protein = lncRNA_protein.merge(lncRNA_protein2, how='outer', on=['Symbol', 'Protein'])

gene_lookup = symbol_entrez_map.rename(columns={'0_x': 'Symbol'})
lncRNA_protein = lncRNA_protein.merge(gene_lookup, on='Symbol')
lncRNA_protein['0_y'] = lncRNA_protein['0_y'].astype(str) + '#lncRNA'

protein_lookup = symbol_to_pro.rename(columns={'0_x': 'Protein'})
lncRNA_protein = lncRNA_protein.merge(protein_lookup, on='Protein')
lncRNA_protein = lncRNA_protein.drop(columns=['Symbol', 'Protein'])
lncRNA_protein

In [None]:
# Write the unique ('0_y', '1_y') pairs as a headerless TSV edge file.
lncRNA_protein[['0_y','1_y']].drop_duplicates().to_csv(
    edge_data_location + 'RlncRNA-protein.txt', header=None, sep='\t', index=None)

***
### lncRNA-expression

* [LncBook](https://ngdc.cncb.ac.cn/lncbook/)

In [None]:
# Fetch the LncBook expression archive into the unprocessed-data folder.
# NOTE(review): presumably data_downloader also decompresses the .gz, since the
# next cell reads the plain .csv; confirm.
data_downloader('https://ngdc.cncb.ac.cn/lncbook/files/expression_LncBook2.0.csv.gz', unprocessed_data_location)

In [None]:
# Load the LncBook expression table, keep rows with a real symbol, emit one row
# per comma-separated symbol, and relabel each expression-context column with
# the ontology term used for it in the graph.
expression_table = pd.read_csv(unprocessed_data_location + 'expression_LncBook2.0.csv')
lncRNA_expression = expression_table.loc[expression_table['Symbol'].ne('-')]
lncRNA_expression = lncRNA_expression.drop(
    columns=['Gene ID', 'Featured Expression', 'Expression Capacity'])
lncRNA_expression = lncRNA_expression.assign(
    Symbol=lncRNA_expression['Symbol'].str.split(',')).explode('Symbol')

context_to_term = {
    'Normal Tissue/Cell Line': 'UBERON_0000479',
    'Organ Development': 'GO_0048513',
    'Preimplantation Embryo': 'BTO_0000379',
    'Cell Differentiation': 'GO_0030154',
    'Subcellular Localization': 'COB_0000020',
    'Exosome': 'GO_0070062',
    'Cancer Cell Line': 'OBI_0001906',
    'Virus Infection': 'ERO_0000729',
    'Circadian Rhythm': 'GO_0007623',
}
lncRNA_expression = lncRNA_expression.rename(columns=context_to_term)
lncRNA_expression

In [None]:
# Build one lncRNA -> expression-context table per LncBook expression label
# (HC, NE, MC, LC).
# NOTE(review): NE presumably means "not expressed" (the NE export further down
# is labelled 'NOT EXPRESSED IN'); the meaning of HC/MC/LC is not visible here.
expression_terms = ['UBERON_0000479','GO_0048513','BTO_0000379','GO_0030154','COB_0000020',
                    'GO_0070062','OBI_0001906','ERO_0000729','GO_0007623']


def _expression_edges(label):
    """Return the table of (lncRNA id, ontology term) rows for one label.

    For every term column of ``lncRNA_expression`` the rows whose value equals
    ``label`` are selected and the term itself is stored in a column named
    ``label``. The pieces are stacked, the gene symbol is replaced by the
    identifier columns of ``symbol_entrez_map``, and '0_y' (suffixed with
    '#lncRNA') is moved to the first position.
    """
    per_term = [
        lncRNA_expression.loc[lncRNA_expression[term] == label, ['Symbol']].assign(**{label: term})
        for term in expression_terms
    ]
    # pd.concat replaces the repeated DataFrame.append calls (removed in pandas 2.0).
    edges = pd.merge(pd.concat(per_term),
                     symbol_entrez_map.rename(columns={'0_x': 'Symbol'}), on='Symbol')
    edges = edges.drop(columns='Symbol')
    edges.insert(0, '0_y', edges.pop('0_y'))
    edges['0_y'] = edges['0_y'].astype(str) + '#lncRNA'
    return edges


HCfinal = _expression_edges('HC')
NEfinal = _expression_edges('NE')
MCfinal = _expression_edges('MC')
LCfinal = _expression_edges('LC')
HCfinal

In [None]:
# Write the de-duplicated HC, LC and MC tables to separate headerless TSV edge
# files (HC -> ...2245, LC -> ...2246, MC -> ...2291). The NE table is built but
# its export is left commented out.
HCfinal.drop_duplicates().to_csv(
    edge_data_location + 'RlncRNA-expression2245.txt', header=None, sep='\t', index=None)
LCfinal.drop_duplicates().to_csv(
    edge_data_location + 'RlncRNA-expression2246.txt', header=None, sep='\t', index=None)
MCfinal.drop_duplicates().to_csv(
    edge_data_location + 'RlncRNA-expression2291.txt', header=None, sep='\t', index=None)
# RO extension: NOT EXPRESSED IN
#NEfinal.drop_duplicates().to_csv(
    #edge_data_location + 'lncRNA-expressionNE.txt', header=None, sep='\t', index=None)

***
### lncRNA-biological role

* [dbEssLnc](https://esslnc.pufengdu.org/home) <br /> dbEssLnc contains lncRNA annotations; data are constantly added by manual screening. 

In [None]:
#https://esslnc.pufengdu.org/data/essential%20lncRNA.json
# Load the dbEssLnc records and keep the human entries.
dbEssLnc = pd.read_json(unprocessed_data_location + 'essential%20lncRNA.json')
# Drop without inplace=True: the filtered frame is a slice of dbEssLnc, and
# mutating it in place triggers SettingWithCopyWarning here and in later cells.
lncRNA_role = dbEssLnc[dbEssLnc['Organism']=='Human'].drop(
    columns=['ID','Name','Aliases','fId','NONCODEId','Organism'])
lncRNA_role

In [None]:
# Inspect the distinct values found in the Role column.
lncRNA_role.Role.unique()

In [None]:
# For grounding purposes
# Rebind instead of mutating in place: lncRNA_role may still be a slice of
# dbEssLnc, where in-place replace / attribute-style column assignment is unreliable.
lncRNA_role = lncRNA_role.replace('Tumor suppressor gene', 'Tumor-Suppressor-Gene')
# NOTE: re-running this cell appends the '#lncRNA' suffix a second time.
lncRNA_role['NCBI_gene_Id'] = lncRNA_role['NCBI_gene_Id'].astype(str) + '#lncRNA'

# Write the unique (lncRNA id, role) pairs as a headerless TSV edge file.
lncRNA_role[['NCBI_gene_Id','Role']].drop_duplicates().to_csv(
    edge_data_location + 'RlncRNA-role.txt', header=None, sep='\t', index=None)

***
### lncRNA-cellular compartment

* [lncATLAS](https://lncatlas.crg.eu/) <br /> LncATLAS displays the subcellular localisation for GENCODE-annotated lncRNAs. This localisation is expressed in units of Relative Concentration Index (RCI) - a comparison of the concentration of a gene, per unit mass of RNA, between two cellular compartments.

In [None]:
#https://lncatlas.crg.eu/session/014e12df4b0975891edb6d8ba3a33b0e/download/retrieveall?w=
# Load the lncATLAS dump, keep the four columns of interest and swap the gene
# name for the identifier columns of symbol_entrez_map ('0_y' tagged as lncRNA).
lncatlas = pd.read_csv(unprocessed_data_location + '2023-05-09_lncATLAS_all_data.csv')
gene_lookup = symbol_entrez_map.rename(columns={'0_x': 'Gene Name'})
lncRNA_comp = (
    lncatlas[['Data Source', 'Data Type', 'Value', 'Gene Name']]
    .merge(gene_lookup, on='Gene Name')
    .drop(columns=['Gene Name'])
)
lncRNA_comp['0_y'] = lncRNA_comp['0_y'].astype(str) + '#lncRNA'
lncRNA_comp

In [None]:
# Data cleaning rule to establish relations: discard RCI below the mean
lncRNA_comp.Value.mean()

In [None]:
# Keep only the rows whose Value is at least the mean of the Value column.
lncRNA_comp = lncRNA_comp[lncRNA_comp.Value >= lncRNA_comp.Value.mean()]

In [None]:
# Mapping to GO CC: list the distinct 'Data Type' values that need a GO term.
lncRNA_comp['Data Type'].unique()

In [None]:
# Translate each compartment name into its GO term; values without an entry in
# the dictionary are left unchanged by replace() and are filtered out right
# after because they do not start with 'GO_'.
compartment_to_go = {
    'nucleus': 'GO_0005634', 'cytosol': 'GO_0005829',
    'chromatin': 'GO_0000785', 'membrane': 'GO_0016020',
    'nucleolus': 'GO_0005730', 'nucleoplasm': 'GO_0005654',
}
lncRNA_comp = lncRNA_comp.assign(gocc=lncRNA_comp['Data Type'].replace(compartment_to_go))

is_go_term = lncRNA_comp['gocc'].astype(str).str.startswith('GO_')
lncRNA_comp = lncRNA_comp.loc[is_go_term].drop(columns=['Data Type'])
lncRNA_comp

In [None]:
# Write the unique (lncRNA id, GO term) pairs as a headerless TSV edge file.
lncRNA_comp[['0_y','gocc']].drop_duplicates().to_csv(
    edge_data_location + 'RlncRNA-gocc.txt', header=None, sep='\t', index=None)

***
### lncRNA-pathway

* [LncRNAWiki](https://ngdc.cncb.ac.cn/lncrnawiki/)

In [None]:
#LncRNAWiki = pd.read_csv(unprocessed_data_location+'LncRNAWiki_BrowseDownload.csv')
# Keep the LncRNAWiki records that carry a pathway annotation and lower-case
# the pathway name so it can be joined on later.
has_pathway = LncRNAWiki['pathway'].notna()
lncRNA_pw = LncRNAWiki.loc[has_pathway, ['symbol', 'pathway', 'pmid']]
lncRNA_pw = lncRNA_pw.assign(pathway=lncRNA_pw['pathway'].str.lower())
lncRNA_pw

In [None]:
# Resolve both endpoints: gene symbol -> '0_y' via symbol_entrez_map and
# pathway name -> column 1 of desc_pw_map; keep the supporting pmid alongside.
gene_lookup = symbol_entrez_map.rename(columns={'0_x': 'symbol'})
pathway_lookup = desc_pw_map.rename(columns={0: 'pathway'})
lncRNA_pw = (
    lncRNA_pw
    .merge(gene_lookup, on='symbol')
    .merge(pathway_lookup, on='pathway')
    [['0_y', 1, 'pmid']]
    .copy()
)
lncRNA_pw['0_y'] = lncRNA_pw['0_y'].astype(str) + '#lncRNA'
lncRNA_pw.head()

In [None]:
# Write the unique ('0_y', 1) pairs as a headerless TSV edge file.
lncRNA_pw[['0_y',1]].drop_duplicates().to_csv(
    edge_data_location + 'RlncRNA-pw.txt', header=None, sep='\t', index=None)

***
### lncRNA-biological process

***
* [LncRNAWiki](https://ngdc.cncb.ac.cn/lncrnawiki/)

In [None]:
#LncRNAWiki = pd.read_csv(unprocessed_data_location+'LncRNAWiki_BrowseDownload.csv')
# Take the LncRNAWiki records with a biological context, blank out the generic
# 'disease' context, and drop every row that is left with a missing value in
# any of the three columns.
has_context = LncRNAWiki['biological_context'].notna()
lncRNA_gobp2 = LncRNAWiki.loc[has_context, ['symbol', 'biological_context', 'pmid']]
context = lncRNA_gobp2['biological_context'].str.lower()
lncRNA_gobp2 = lncRNA_gobp2.assign(
    biological_context=context.where(context != 'disease')).dropna()

# Context name -> column 1 of desc_go_map, then gene symbol -> '0_y'.
go_lookup = desc_go_map.rename(columns={0: 'biological_context'})
lncRNA_gobp2 = lncRNA_gobp2.merge(go_lookup, on=['biological_context'])
lncRNA_gobp2 = lncRNA_gobp2.drop(columns='biological_context')

gene_lookup = symbol_entrez_map.rename(columns={'0_x': 'symbol'})
lncRNA_gobp2 = lncRNA_gobp2.merge(gene_lookup, on=['symbol'])
lncRNA_gobp2 = lncRNA_gobp2.drop(columns='symbol')

lncRNA_gobp2['0_y'] = lncRNA_gobp2['0_y'].astype(str) + '#lncRNA'
lncRNA_gobp2 = lncRNA_gobp2.rename(columns={'0_y': 'geneid', 1: 'gobp', 'pmid': 'PMID'})
lncRNA_gobp2.head()

In [None]:
# Write the unique (geneid, gobp) pairs as a headerless TSV edge file.
lncRNA_gobp2[['geneid', 'gobp']].drop_duplicates().to_csv(
    edge_data_location + 'RlncRNA-gobp.txt', header=None, sep='\t', index=None)

***
### miRNA-TF(protein)

* [PuTmiR 1.1](https://www.isical.ac.in/~bioinfo_miu/TF-miRNA1.php) <br/>
PuTmiR is a web server designed for extracting the putative TFs for human miRNAs, as per the requirement of a user, based on genomic locality, i.e., any upstream or downstream region of interest less than 10 kb.

#### is upstream of sequence of

In [None]:
#https://www.isical.ac.in/~bioinfo_miu/UpstreamRegionTF-miRNA1.txt
# Load the PuTmiR upstream-region table and resolve both endpoints: TF symbol
# via symbol_to_pro and miRNA name via mirna_mirbase_map; then drop the
# genomic-coordinate and name columns.
tf_lookup = symbol_to_pro.rename(columns={'0_x': 'TF'})
mirna_lookup = mirna_mirbase_map.rename(columns={1: 'name'})
miRNA_TF_up = (
    pd.read_csv(unprocessed_data_location + 'UpstreamRegionTF-miRNA1.txt', sep='\t')
    .merge(tf_lookup, on='TF')
    .merge(mirna_lookup, on='name')
    .drop(columns=['chrom', 'chromStart', 'chromEnd', 'Refseq', 'TF', 'name'])
)
miRNA_TF_up

In [None]:
# Split the upstream table by accession prefix in column 0: 'MIMAT...' rows go
# to the mature-miRNA table, the remaining 'MI...' rows to the pre-miRNA table,
# and each table is written as unique (0, '1_y') pairs.
upstream_ids = miRNA_TF_up[0]
is_mature_up = upstream_ids.str.startswith('MIMAT')
is_precursor_up = upstream_ids.str.startswith('MI') & ~is_mature_up
maturemiRNA_TF_up = miRNA_TF_up[is_mature_up]
premiRNA_TF_up = miRNA_TF_up[is_precursor_up]

for tf_edges, file_name in ((maturemiRNA_TF_up, 'RmiRNA-TFup.txt'),
                            (premiRNA_TF_up, 'RpremiRNA-TFup.txt')):
    tf_edges[[0, '1_y']].drop_duplicates().to_csv(
        edge_data_location + file_name, header=None, sep='\t', index=None)

#### is downstream of sequence of

In [None]:
#https://www.isical.ac.in/~bioinfo_miu/DownstreamRegionTF-miRNA1.txt
# Load the PuTmiR downstream-region table and resolve both endpoints: TF symbol
# via symbol_to_pro and miRNA name via mirna_mirbase_map; then drop the
# genomic-coordinate and name columns.
tf_lookup = symbol_to_pro.rename(columns={'0_x': 'TF'})
mirna_lookup = mirna_mirbase_map.rename(columns={1: 'name'})
miRNA_TF_down = (
    pd.read_csv(unprocessed_data_location + 'DownstreamRegionTF-miRNA1.txt', sep='\t')
    .merge(tf_lookup, on='TF')
    .merge(mirna_lookup, on='name')
    .drop(columns=['chrom', 'chromStart', 'chromEnd', 'Refseq', 'TF', 'name'])
)
miRNA_TF_down

In [None]:
# Split the downstream table by accession prefix in column 0: 'MIMAT...' rows go
# to the mature-miRNA table, the remaining 'MI...' rows to the pre-miRNA table,
# and each table is written as unique (0, '1_y') pairs.
downstream_ids = miRNA_TF_down[0]
is_mature_down = downstream_ids.str.startswith('MIMAT')
is_precursor_down = downstream_ids.str.startswith('MI') & ~is_mature_down
maturemiRNA_TF_down = miRNA_TF_down[is_mature_down]
premiRNA_TF_down = miRNA_TF_down[is_precursor_down]

for tf_edges, file_name in ((maturemiRNA_TF_down, 'RmiRNA-TFdown.txt'),
                            (premiRNA_TF_down, 'RpremiRNA-TFdown.txt')):
    tf_edges[[0, '1_y']].drop_duplicates().to_csv(
        edge_data_location + file_name, header=None, sep='\t', index=None)

***
### Stem-loop miRNA-TF(protein)

* [miRNet](https://www.mirnet.ca/) <br />
miRNet is an easy-to-use web-based tool that offers statistical, visual and network-based approaches to help researchers understand miRNAs functions and regulatory mechanisms. miRNet offers a comprehensive tool suite to enable statistical analysis and functional interpretation of various data generated from current miRNA studies.

In [None]:
#data_downloader('https://www.dropbox.com/s/78r0tazedtkhi5g/miRNet-mir-tf-hsa.csv', unprocessed_data_location)

# Load the miRNet miRNA-TF table from the local copy (the download call above
# is disabled, so the file must already be in the unprocessed-data folder).
miRNA_TF = pd.read_csv(unprocessed_data_location + 'miRNet-mir-tf-hsa.csv')  
miRNA_TF

***
* [TransmiR](https://www.cuilab.cn/transmir) <br /> TransmiR is a database for transcription factor (TF)-microRNA (miRNA) regulations, through which one can find regulatory relations between TFs and miRNAs.

In [None]:
#https://www.cuilab.cn/files/images/transmir2/download/literature/hsa.xlsx
# Load the TransmiR sheet without a header row, so columns are labelled 0, 1, ...
miRNA_TF2 = pd.read_excel(unprocessed_data_location+"hsa.xlsx", header=None)
miRNA_TF2

In [None]:
# Outer-join miRNet with TransmiR (mir_id <-> column 1, symbol <-> column 0) and
# back-fill each key column from its counterpart so both sides are populated
# whichever source a row came from.
miRNA_TF = pd.merge(miRNA_TF, miRNA_TF2, how='outer', left_on=['mir_id','symbol'], right_on = [1,0])
# Assign the result instead of calling fillna(inplace=True) on a selected
# column: the chained in-place form is deprecated and has no effect under
# pandas Copy-on-Write.
miRNA_TF[0] = miRNA_TF[0].fillna(miRNA_TF['symbol'])
miRNA_TF['symbol'] = miRNA_TF['symbol'].fillna(miRNA_TF[0])
miRNA_TF['mir_id'] = miRNA_TF['mir_id'].fillna(miRNA_TF[1])
miRNA_TF[1] = miRNA_TF[1].fillna(miRNA_TF['mir_id'])
miRNA_TF

***
* [TAM](http://www.lirmed.com/tam2/)

In [None]:
#TAM = pd.read_csv(unprocessed_data_location+'mirset_v9.txt', sep='\t',names=range(500))
#TAM=TAM.dropna(axis=1, how='all')

# From the TAM sets whose category (column 0) contains "TF", build a long table
# with the set name (column 1) and one member per row ('merged'); members are
# taken from every column after column 1.
tf_sets = TAM[TAM[0].str.contains("TF")].dropna(axis=1, how='all').drop(columns=[0])
member_columns = tf_sets.columns[1:]
joined_members = tf_sets[member_columns].apply(
    lambda row: ','.join(row.dropna().astype(str)), axis=1)

miRNA_TF2 = tf_sets[[1]].assign(merged=joined_members.str.split(','))
miRNA_TF2 = miRNA_TF2.explode('merged')
miRNA_TF2

In [None]:
# Outer-join the accumulated miRNA-TF table with the TAM TF sets
# (mir_id <-> 'merged', symbol <-> TAM column 1). Both inputs have a column
# labelled 1, so after the merge they appear as '1_x' (left) and '1_y' (TAM).
miRNA_TF = pd.merge(miRNA_TF, miRNA_TF2, how='outer', left_on=['mir_id','symbol'], right_on = ['merged',1])
# Assign the result instead of calling fillna(inplace=True) on a selected
# column: the chained in-place form has no effect under pandas Copy-on-Write.
# NOTE(review): rows that exist only in TAM carry their TF name in '1_y' and
# their miRNA in 'merged'; neither is copied into 'symbol'/'mir_id' here, so
# such rows look like they are dropped by the inner merge below -- confirm
# whether '1_y' was intended instead of '1_x'.
miRNA_TF['1_x'] = miRNA_TF['1_x'].fillna(miRNA_TF['symbol'])
miRNA_TF['symbol'] = miRNA_TF['symbol'].fillna(miRNA_TF['1_x'])

# Resolve the TF symbol through symbol_to_pro (inner join).
miRNA_TF = pd.merge(miRNA_TF, symbol_to_pro.rename(columns={'0_x':'symbol'}), on=['symbol'])


In [None]:
# Write the unique ('mir_acc', '1_y_y') pairs as a headerless TSV edge file.
miRNA_TF[['mir_acc','1_y_y']].drop_duplicates().to_csv(
    edge_data_location + 'RpremiRNA-TF.txt', header=None, sep='\t', index=None)

***
### premiRNA-GO

* [TAM](http://www.lirmed.com/tam2/) <br /> TAM groups miRNAs into six categories of miRNA sets: miRNA-family sets, miRNA cluster sets, miRNA-disease, miRNA-function sets, miRNA-TF sets and tissue specificity sets.

In [None]:
#TAM = pd.read_csv('http://www.lirmed.com/tam2/Public/static/data/mirset_v9.txt', sep='\t',names=range(500))
#TAM=TAM.dropna(axis=1, how='all')

# Select the TAM sets whose category (column 0) contains "unction", lower-case
# the set name in column 1, and discard the category column together with any
# column that is empty for all selected rows.
miRNA_GO = TAM.loc[TAM[0].str.contains("unction")].copy()
miRNA_GO[1] = miRNA_GO[1].str.lower()
miRNA_GO = miRNA_GO.drop(columns=[0]).dropna(axis=1, how='all')
miRNA_GO

In [None]:
# Inner-join desc_go_map (column 0) with the TAM set name (column 1 of
# miRNA_GO). Both frames have a column labelled 1, so the merge yields '1_x'
# (from desc_go_map) and '1_y' (the TAM set name, dropped here).
miRNA_GO = pd.merge(desc_go_map, miRNA_GO, left_on=[0], right_on=[1]).drop(columns=['1_y'])
# Remove member columns that are empty for every remaining row.
miRNA_GO=miRNA_GO.dropna(axis=1, how='all')
miRNA_GO

In [None]:
# Reshape the wide table (column 0 and '1_x' from desc_go_map, followed by one
# column per set member) into long form: one row per (member, column 0 value,
# '1_x' value), row by row and in column order.
# A single pass over the rows replaces the previous loop, which transposed the
# whole frame three times per row and relied on DataFrame.append (removed in
# pandas 2.0).
member_columns = [col for col in miRNA_GO.columns if col not in (0, '1_x')]
long_rows = [
    (member, description, go_id)
    for description, go_id, *members in miRNA_GO[[0, '1_x'] + member_columns].itertuples(
        index=False, name=None)
    for member in members
]
# dropna() removes the empty member slots (and any row lacking a description or id).
miRNA_GO = pd.DataFrame(long_rows, columns=[0, 1, 2]).dropna()

# Map each member name to column 0 of mirna_mirbase_map; only that column and
# column 2 (the '1_x' value) are kept.
miRNA_GO = pd.merge(miRNA_GO.rename(columns={0:'a'}), mirna_mirbase_map.rename(columns={1:'a'}), on='a')
miRNA_GO = miRNA_GO.drop(columns=['a',1])
miRNA_GO

In [None]:
# Write the unique (0, 2) pairs as a headerless TSV edge file.
miRNA_GO[[0,2]].drop_duplicates().to_csv(edge_data_location + 'RpremiRNA-go.txt', header=None, sep='\t', index=None)

***
### premiRNA-premiRNA

* [TAM](http://www.lirmed.com/tam2/)

In [None]:
#TAM = pd.read_csv('http://www.lirmed.com/tam2/Public/static/data/mirset_v9.txt', sep='\t',names=range(500))
#TAM=TAM.dropna(axis=1, how='all')
# Stack the TAM sets whose category (column 0) contains "luster" and those
# containing "amily". pd.concat replaces DataFrame.append (removed in pandas 2.0).
miRNA_miRNA = pd.concat([TAM[TAM[0].str.contains("luster")],
                         TAM[TAM[0].str.contains("amily")]])
# The category and set-name columns are not needed (the former lower-casing of
# column 1 was dead code, since the column is dropped here).
miRNA_miRNA=miRNA_miRNA.dropna(axis=1, how='all')
miRNA_miRNA=miRNA_miRNA.drop(columns=[0,1])
# Pair the first member of each set (column 2) with every other member of the
# same set: join the remaining columns, then split/explode to one per row.
miRNA_miRNA['merged'] = miRNA_miRNA[miRNA_miRNA.columns[1:]].apply(
    lambda x: ','.join(x.dropna().astype(str)),
    axis=1
)
# .copy() so the assignment below acts on an independent frame, not a slice.
miRNA_miRNA=miRNA_miRNA[[2,'merged']].copy()

miRNA_miRNA['merged'] = miRNA_miRNA.merged.str.split(',')
miRNA_miRNA = miRNA_miRNA.explode('merged')
miRNA_miRNA

In [None]:
# NOTE(review): this repeats the split/explode that ends the previous cell; on
# values that are already single names it should leave the table unchanged --
# confirm whether this cell is still needed.
miRNA_miRNA['merged'] = miRNA_miRNA.merged.str.split(',')
miRNA_miRNA = miRNA_miRNA.explode('merged')
miRNA_miRNA

***
**Note:** the resource below was down (unreachable) when this notebook was written.
* [miRPathDB](https://mpd.bioinf.uni-sb.de/overview.html) <br /> miRPathDB includes miRNA candidates, experimentally validated target genes, extended analysis functionality, and intuitive visualizations of query results. 

In [None]:
# Map both members of each pair through mirna_mirbase_map (name in column 1 ->
# value in column 0). After the second merge '0_x' holds the mapped value for
# column 2 and '0_y' the mapped value for 'merged'.
miRNA_miRNA = pd.merge(mirna_mirbase_map.rename(columns={1: 'merged'}), miRNA_miRNA, on='merged')
miRNA_miRNA = pd.merge(mirna_mirbase_map.rename(columns={1: 2}), miRNA_miRNA, on=2)
miRNA_miRNA.drop(columns=[2,'merged'], inplace=True)
# Discard pairs whose '0_y' side starts with 'MIMAT'.
# NOTE(review): '0_x' is not filtered the same way -- confirm that it can never
# hold a 'MIMAT' accession.
miRNA_miRNA = miRNA_miRNA[~miRNA_miRNA['0_y'].str.startswith('MIMAT')]
miRNA_miRNA

In [None]:
# Write the unique ('0_x', '0_y') pairs as a headerless TSV edge file.
miRNA_miRNA[['0_x','0_y']].drop_duplicates().to_csv(
    edge_data_location + 'RpremiRNA-premiRNA.txt', header=None, sep='\t', index=None)

***
### miRNA-anatomy

* [TAM](http://www.lirmed.com/tam2/)

In [None]:
# Select the TAM tissue-specificity sets (category in column 0 contains
# "TissueSpecific") and drop the category column and all-empty columns.
miRNA_anatomy = TAM[(TAM[0].str.contains("TissueSpecific"))]
miRNA_anatomy=miRNA_anatomy.drop(columns=[0])
miRNA_anatomy=miRNA_anatomy.dropna(axis=1, how='all')
# Duplicate the row labelled 1236 so it can be listed under two tissue names.
# pd.concat replaces DataFrame.append (removed in pandas 2.0).
# NOTE: label 1236 and positions 3/6 below are tied to the current TAM file.
miRNA_anatomy = pd.concat([miRNA_anatomy, miRNA_anatomy.loc[[1236]]])
miRNA_anatomy=miRNA_anatomy.reset_index(drop=True)
# Single .loc assignment instead of chained `iloc[i][1] = ...`, which may write
# to a temporary copy and never writes through under pandas Copy-on-Write.
miRNA_anatomy.loc[3, 1] = "Heart"
miRNA_anatomy.loc[6, 1] = "Muscle"
miRNA_anatomy

In [None]:
# Collapse the member columns of every tissue set into one comma-joined string,
# attach the hard-coded UBERON id for each of the seven rows (in row order), and
# expand to one (tissue name, UBERON id, member) row per member.
member_columns = miRNA_anatomy.columns[1:]
tissue_members = miRNA_anatomy[member_columns].apply(
    lambda row: ','.join(row.dropna().astype(str)), axis=1)
uberon_ids = ['UBERON_0002369', 'UBERON_0000955', 'UBERON_0001155',
              'UBERON_0002349', 'UBERON_0001150', 'UBERON_0001987', 'UBERON_0001630']

miRNA_anatomy = miRNA_anatomy[[1]].assign(
    Uberon=uberon_ids, merged=tissue_members.str.split(','))
miRNA_anatomy = miRNA_anatomy.explode('merged')
miRNA_anatomy

In [None]:
# Replace every member name with column 0 of mirna_mirbase_map (inner join on
# the name) and keep only that column next to the UBERON id.
name_lookup = mirna_mirbase_map.rename(columns={1: 'merged'})
miRNA_anatomy = name_lookup.merge(miRNA_anatomy, on='merged').drop(columns=[1, 'merged'])
miRNA_anatomy

In [None]:
# Write the unique rows (all remaining columns) as a headerless TSV edge file.
miRNA_anatomy.drop_duplicates().to_csv(
    edge_data_location + 'RpremiRNA-anatomy.txt', header=None, sep='\t', index=None)

***
### miRNA-chemical

* [SM2miR](http://www.jianglab.cn/SM2miR/) <br /> SM2miR is a manually curated database that collects and incorporates the experimentally validated effects of small molecules on miRNA expression reported in published papers. Each entry contains detailed information about the small molecules, the miRNAs, and their relationships.

In [None]:
# Fetch the SM2miR spreadsheet into the unprocessed-data folder.
data_downloader('http://www.jianglab.cn/SM2miR/files/SM2miR3.xls', unprocessed_data_location)

In [None]:
# Load the SM2miR spreadsheet and keep the rows whose Species contains 'sapiens'.
miRNA_chemical = pd.read_excel(unprocessed_data_location + 'SM2miR3.xls')  
miRNA_chemical = miRNA_chemical[miRNA_chemical['Species'].str.contains('sapiens')]
miRNA_chemical

In [None]:
# Normalise the small-molecule names before matching them against ChEBI labels.
# ('small melocule' is the column name as spelled in the source file.)
miRNA_chemical = miRNA_chemical.copy()  # work on an independent frame, not a filtered slice
miRNA_chemical['small melocule'] = (
    miRNA_chemical['small melocule']
    .str.lower()
    # Remove parenthesised text. regex=True must be explicit: since pandas 2.0
    # str.replace treats the pattern as a literal string by default, which
    # silently left the parentheses in place. Raw string avoids invalid escapes.
    .str.replace(r"\(.*?\)| \(.*?\)", '', regex=True)
    .str.rstrip()
)

# Entries joined with '+' become one row per molecule.
miRNA_chemical['small melocule'] = miRNA_chemical['small melocule'].str.split('+')
miRNA_chemical = miRNA_chemical.explode('small melocule')
miRNA_chemical['small melocule'] = miRNA_chemical['small melocule'].str.strip()

# Fix join columns
miRNA_chemical = pd.merge(miRNA_chemical, desc_chebi_map, left_on=['small melocule'], right_on=[0])

miRNA_chemical

***
* [miRNet](https://www.mirnet.ca/miRNet/)

In [None]:
# https://www.dropbox.com/s/abaeonmjpftbspx/miRNet-mir-mol-hsa.csv?dl=0

# Load the miRNet miRNA-molecule table and lower-case the molecule names.
miRNA_chemical2 = pd.read_csv(unprocessed_data_location + 'miRNet-mir-mol-hsa.csv')
miRNA_chemical2.molecule=miRNA_chemical2.molecule.str.lower()

In [None]:
# Display the loaded miRNet table.
miRNA_chemical2

In [None]:
# Inner-join the molecule name with column 0 of desc_chebi_map.
miRNA_chemical2 = pd.merge(miRNA_chemical2, desc_chebi_map, left_on=['molecule'], right_on=[0])

miRNA_chemical2

In [None]:
# Outer-join SM2miR with miRNet on (accession, column 1 of desc_chebi_map) so
# rows found in only one source are kept.
miRNA_chemical=pd.merge(miRNA_chemical, miRNA_chemical2, how='outer', left_on=['miRBase',1], right_on = ['mir_acc',1])

In [None]:
# Display the merged SM2miR + miRNet table.
miRNA_chemical

In [None]:
# Back-fill each accession column from the other so both are populated
# whichever source a row came from. Assign the result instead of calling
# fillna(inplace=True) on a selected column: the chained in-place form is
# deprecated and has no effect under pandas Copy-on-Write.
miRNA_chemical['miRBase'] = miRNA_chemical['miRBase'].fillna(miRNA_chemical['mir_acc'])
miRNA_chemical['mir_acc'] = miRNA_chemical['mir_acc'].fillna(miRNA_chemical['miRBase'])

# Keep a snapshot for the later merge with the miRandola tables.
miRNA_chemical_old=miRNA_chemical.copy()

***
* [miRandola](http://mirandola.iit.cnr.it/index.php)

In [None]:
# Drug identifiers, already URL-encoded, as expected by the miRandola
# view_drug.php page.
drug_list=['aspirin','bevacizumab','clopidogrel',
           'conventional%20synthetic%20disease-modifying%20antirheumatic%20drugs%20(cs-dmards)',
           'docetaxel', 'epirubicin%20plus%20paclitaxel','fluorouracil%20(5-fu)','gemcitabine',
           'hypomethylating%20agents%20(hmas)','lapatinib','lithium','mercury','n-acetyl%20cysteine%20(nac)',
           'paracetamol','platinum','praziquantel%20(pzq)','sorafenib','testosterone',
           'transarterial%20chemoembolization%20(tace)','trastuzumab','xuezhikang'
          ]
mirandola_drug_url = 'http://mirandola.iit.cnr.it/view_drug.php?LV='
miRNA_chemical_mirandola=[]
for drug in drug_list:
    # Download and parse each drug page once; previously the page was fetched
    # again for every table it contains (plus once to count the tables).
    # The tables are held in a local name so `miRNA_chemical` is not clobbered.
    for drug_table in pd.read_html(mirandola_drug_url + drug, header=0):
        # Tables are laid out vertically: transpose and promote the first
        # transposed row to column names.
        drug_table = drug_table.T
        drug_table.columns = drug_table.iloc[0]
        drug_table = drug_table.drop(index=drug_table.iloc[0].name)
        # Then drop the second of the remaining rows.
        drug_table = drug_table.drop(index=drug_table.iloc[1].name)
        # Discard the columns carrying the labels of the first 16 columns.
        drug_table = drug_table.drop(columns=drug_table.columns[:16])
        miRNA_chemical_mirandola.append(drug_table)

miRNA_chemical_mirandola = pd.concat(miRNA_chemical_mirandola)
miRNA_chemical_mirandola

In [None]:
# Inner-join the miRandola 'Drug' column with column 0 of desc_chebi_map.
miRNA_chemical_mirandola = pd.merge(miRNA_chemical_mirandola, desc_chebi_map, left_on=['Drug'], right_on=[0])
miRNA_chemical_mirandola

In [None]:
# Outer-join the SM2miR + miRNet snapshot with the miRandola rows on
# (accession, column 1 of desc_chebi_map).
miRNA_chemical=pd.merge(miRNA_chemical_old, miRNA_chemical_mirandola, how='outer', left_on=['mir_acc',1], right_on = ['miRBase Accession',1])
# Consolidate the accession across the three sources. Results are assigned back
# because chained `col.fillna(..., inplace=True)` has no effect under pandas
# Copy-on-Write.
miRNA_chemical['mir_acc'] = (
    miRNA_chemical['mir_acc']
    .fillna(miRNA_chemical['miRBase'])
    .fillna(miRNA_chemical['miRBase Accession'])
)
miRNA_chemical['miRBase Accession'] = miRNA_chemical['miRBase Accession'].fillna(miRNA_chemical['mir_acc'])
# The exported column is 'miRBase'; it was never filled for rows that come only
# from miRandola, so those edges were written with an empty miRNA identifier.
miRNA_chemical['miRBase'] = miRNA_chemical['miRBase'].fillna(miRNA_chemical['mir_acc'])
miRNA_chemical[[1,'miRBase']].drop_duplicates().to_csv(
    edge_data_location + 'Rchemical-miRNA.txt', header=None, sep='\t', index=None)

***
### gRNA-gene

* [Addgene](https://www.addgene.org/)

In [None]:
# copy-paste from https://www.addgene.org/crispr/reference/grna-sequence/#datatable
gRNA_gene = pd.read_csv(unprocessed_data_location + 'grna_addgene.txt', sep='\t', dtype = {"Plasmid ID":str})  
gRNA_gene.columns=gRNA_gene.columns.str.rstrip()
gRNA_gene = gRNA_gene[gRNA_gene['Target Species'].notna()]
gRNA_gene = gRNA_gene[gRNA_gene['Target Species'].str.contains('apiens')]
gRNA_gene['Plasmid ID'] = 'www.addgene.org/'+gRNA_gene['Plasmid ID'].str.rstrip()
gRNA_gene['Target Gene'] = gRNA_gene['Target Gene'].str.upper().str.rstrip()

gRNA_gene.drop(columns=['Target Species','Cas9 Species','Depositor'],inplace=True)
gRNA_gene

In [None]:
# Swap the target gene symbol for its identifier ('0_y') and move
# 'Plasmid ID' and '0_y' to the front, in that order.
gene_lookup = symbol_entrez_map[['0_x', '0_y']].rename(columns={"0_x": "Target Gene"})
gRNA_gene = gRNA_gene.merge(gene_lookup, on='Target Gene').drop(columns=['Target Gene'])

leading_columns = ['Plasmid ID', '0_y']
trailing_columns = [col for col in gRNA_gene.columns if col not in leading_columns]
gRNA_gene = gRNA_gene[leading_columns + trailing_columns]
gRNA_gene

In [None]:
# Write the unique (Plasmid ID, gene id) pairs as a headerless TSV edge file.
gRNA_gene[['Plasmid ID', '0_y']].drop_duplicates().to_csv(
    edge_data_location + 'RgRNA-gene.txt', header=None, sep='\t', index=None)

***
### ASO-mRNA

* [eSkip-Finder](https://eskip-finder.org/cgi-bin/input.cgi) <br /> eSkip-Finder is the first machine learning-based design tool and database of antisense oligonucleotides (ASOs) for exon skipping. It addresses a significant challenge in the field: the difficulty of selecting an optimal target sequence for exon skipping.

In [None]:
# https://eskip-finder.org/cgi-bin/search.cgi
ASO_mRNA = pd.read_html(unprocessed_data_location + 'eSkip-Finder.html')[2]
ASO_mRNA = ASO_mRNA[ASO_mRNA['Species'] == 'human']
ASO_mRNA = ASO_mRNA[ASO_mRNA['Oligo name in literature'] != 'Null']
ASO_mRNA = ASO_mRNA[ASO_mRNA['confidence level (1:describe to explicitly / 0:speculated from context)']=='1']
ASO_mRNA.drop(columns=['Oligo index in literature',
                       'Oligo sequence /: Cocktail. -: weasel (connected).',
                       'Species','Oligo chemistry','Literature info (Patent ID) (original)',
                       'Alternative/translated literature','Title','Date','Inventor','Assignee/Applicants',
                       'Figure/Table in literature','Unnamed: 31'],inplace=True)
ASO_mRNA = pd.merge(ASO_mRNA,symbol_entrez_map.rename(columns={'0_x':'Target gene (RNA)'}), on='Target gene (RNA)')
ASO_mRNA.drop(columns=['Target gene (RNA)'],inplace=True)
ASO_mRNA['0_y'] = ASO_mRNA['0_y'].astype(str) + '#mRNA'
ASO_mRNA

In [None]:
# Write the unique (oligo name, mRNA id) pairs as a headerless TSV edge file.
ASO_mRNA[['Oligo name in literature','0_y']].drop_duplicates().to_csv(
    edge_data_location + 'RASO-mRNA.txt', header=None, sep='\t', index=None)

***
### ASO drug-mRNA

* [DrugBank](https://go.drugbank.com/categories/DBCAT001709) <br /> DrugBank is a comprehensive, free-to-access, online database containing information on drugs and drug targets. As both a bioinformatics and a cheminformatics resource, it combines detailed drug (i.e. chemical, pharmacological and pharmaceutical) data with comprehensive drug target (i.e. sequence, structure, and pathway) information.

In [None]:
# copy-paste from https://go.drugbank.com/categories/DBCAT001709
ASO_mRNA = pd.read_csv(unprocessed_data_location + 'ASO-gene_DrugBank.txt', sep='\t') 
ASO_mRNA

In [None]:
# Hard-coded per-row annotations: both lists are assigned positionally, so
# their length (19) must equal the number of rows in the loaded file.
# 'NCBI' holds a gene id (np.nan where none is given) for each row.
ASO_mRNA['NCBI']=[338, np.nan, np.nan, np.nan, np.nan, np.nan, 1756, np.nan,
                  338, 211, np.nan, 1756, 1756, np.nan, np.nan, np.nan, np.nan, 7276, np.nan]
# Nullable Int64 keeps the ids free of a trailing '.0' when converted to text.
ASO_mRNA['NCBI'] = ASO_mRNA['NCBI'].astype('Int64').astype(str) + '#mRNA'
# Missing ids became '<NA>#mRNA' above; reduce them to the bare '<NA>' marker.
ASO_mRNA['NCBI'] = ASO_mRNA['NCBI'].replace('<NA>#mRNA','<NA>')
# DrugBank accession for each row (repeated when a drug spans several rows).
ASO_mRNA['DB ID']=['DB05528',
                   'DB05487', 'DB05487', 'DB05487',
                   'DB06759', 'DB06759',
                   'DB06014',
                   'DB13161',
                   'DB14713',
                   'DB15066',
                   'DB15593',
                   'DB15005',
                   'DB14984', 'DB14984', 'DB14984', 'DB14984', 'DB14984',
                   'DB16699', 'DB16699']
ASO_mRNA.drop(columns=['Drug','Target'], inplace=True)
# Move 'Type' to the third column position.
ASO_mRNA.insert(2,'Type',ASO_mRNA.pop('Type'))
ASO_mRNA

In [None]:
# Write the unique (NCBI, DB ID) pairs as a headerless TSV edge file.
# NOTE(review): rows whose NCBI value is the '<NA>' marker are written as well
# -- presumably filtered out downstream; confirm.
ASO_mRNA[['NCBI','DB ID']].drop_duplicates().to_csv(
    edge_data_location + 'RASOd-mRNA.txt', header=None, sep='\t', index=None)

***
### ASO drug-disease

* [DrugBank](https://go.drugbank.com/categories/DBCAT001709)

In [None]:
# copy-paste from https://go.drugbank.com/categories/DBCAT001709
ASO_disease = pd.read_csv(unprocessed_data_location + 'ASO-disease_DrugBank.txt', sep='\t') 
pd.set_option('display.max_colwidth', None)
ASO_disease

In [None]:
# Hard-coded per-row annotations: both lists are assigned positionally, so
# their length (12) must equal the number of rows in the loaded file.
ASO_disease['DB ID']=['DB05528',
                   'DB05487',
                   'DB06759',
                    'DB13811',
                   'DB06014',
                   'DB13161',
                   'DB14713',
                   'DB15066',
                   'DB15593',
                   'DB15005',
                   'DB14984',
                   'DB16699']
# MONDO term(s) per row; several terms are comma-separated and '<NA>' is used
# where no term is given.
ASO_disease['MONDO']=['MONDO_0018328',
                      'MONDO_0001657,MONDO_0007254',
                      'MONDO_0000878',
                      '<NA>',
                      'MONDO_0010679',
                      'MONDO_0001516',
                      'MONDO_0017132,MONDO_0001824',
                      'MONDO_0002520',
                      'MONDO_0010679',
                      'MONDO_0010679',
                      'MONDO_0010679',
                      'MONDO_0017132'
                     ]
# One row per (drug, MONDO term).
ASO_disease['MONDO'] = ASO_disease.MONDO.str.split(',')
ASO_disease = ASO_disease.explode('MONDO')
ASO_disease.drop(columns=['Drug','Drug Description'],inplace=True)
ASO_disease

In [None]:
# Write the unique rows (all remaining columns) as a headerless TSV edge file.
ASO_disease.drop_duplicates().to_csv(edge_data_location + 'RASOd-disease.txt', header=None, sep='\t', index=None)

***
### ASO drug-protein

* [DrugBank](https://go.drugbank.com/categories/DBCAT001709)

In [None]:
# copy-paste from https://go.drugbank.com/categories/DBCAT001709
ASO_protein = pd.read_csv(unprocessed_data_location + 'ASO-gene_DrugBank.txt', sep='\t') 

ASO_protein['DB ID']=['DB05528',
                   'DB05487', 'DB05487', 'DB05487',
                   'DB06759', 'DB06759',
                   'DB06014',
                   'DB13161',
                   'DB14713',
                   'DB15066',
                   'DB15593',
                   'DB15005',
                   'DB14984', 'DB14984', 'DB14984', 'DB14984', 'DB14984',
                   'DB16699', 'DB16699']
ASO_protein['PRO']=[np.nan, 'PR_000007204', 'PR_000011178', 'PR_000001754', 'PR_Q9BTL4', 'PR_Q16621',
                    np.nan, 'PR_Q16637', np.nan, np.nan, 'PR_P11532', np.nan, np.nan, 'PR_P08684',
                    'PR_P20815', 'PR_P11712', 'PR_P33261', np.nan, 'PR_P02768']
ASO_protein
ASO_protein.drop(columns=['Drug','Target'], inplace=True)
ASO_protein.insert(2,'Type',ASO_protein.pop('Type'))
ASO_protein

In [None]:
# Export the rows at positions 0-17, skipping any row with a missing value.
ASO_protein.iloc[0:18].dropna().to_csv(
    edge_data_location + 'RASOd-protein11007.txt', header=None, sep='\t', index=None)

In [None]:
# Export only the row at position 18 (the last hard-coded entry) to its own file.
ASO_protein.iloc[[18]].to_csv(edge_data_location + 'RASOd-protein10002.txt', header=None, sep='\t', index=None)

***
### siRNA drug-mRNA

* [DrugBank](https://go.drugbank.com/categories/DBCAT005484) 

In [None]:
# copy-paste from https://go.drugbank.com/categories/DBCAT005484
siRNA_mRNA = pd.read_csv(unprocessed_data_location + 'siRNA-gene_DrugBank.txt', sep='\t') 
siRNA_mRNA

In [None]:
# Hard-coded per-row annotations: both lists are assigned positionally, so
# their length (9) must equal the number of rows in the loaded file.
siRNA_mRNA['NCBI']=[7276, np.nan, np.nan, 338, 54363, np.nan, np.nan, 7276, np.nan]
# Nullable Int64 keeps the ids free of a trailing '.0' when converted to text.
siRNA_mRNA['NCBI'] = siRNA_mRNA['NCBI'].astype('Int64').astype(str) + '#mRNA'
# Missing ids became '<NA>#mRNA' above; reduce them to the bare '<NA>' marker.
siRNA_mRNA['NCBI'] = siRNA_mRNA['NCBI'].replace('<NA>#mRNA','<NA>')
siRNA_mRNA['DB ID']=['DB14582', 'DB14582', 'DB14582',
                     'DB15066',
                     'DB15935', 'DB15935', 'DB15935',
                     'DB16699', 'DB16699']
siRNA_mRNA.drop(columns=['Drug','Target'], inplace=True)
# Move 'Type' to the third column position.
siRNA_mRNA.insert(2,'Type',siRNA_mRNA.pop('Type'))
siRNA_mRNA

In [None]:
# Write the unique (DB ID, NCBI) pairs as a headerless TSV edge file.
siRNA_mRNA[['DB ID', 'NCBI']].drop_duplicates().to_csv(
    edge_data_location + 'RsiRNAd-mRNA.txt', header=None, sep='\t', index=None)

***
### siRNA-mRNA

* [The MIT/ICBP siRNA Database](http://web.mit.edu/sirna/index.html) <br /> The MIT/ICBP siRNA Database has validated siRNA and shRNA sequences against over 100 genes.

In [None]:
# Scrape the MIT/ICBP gene page, take its second table, and promote the first
# row of that table to column names.
icbp_tables = pd.read_html('http://web.mit.edu/sirna/sirnas-gene.html')
ICBP = icbp_tables[1]
header_row = ICBP.iloc[[0]].squeeze()
ICBP.columns = header_row
ICBP = ICBP.drop(index=0)
ICBP

In [None]:
# For post-processing purposes
ICBP[['ID#']] = ICBP[['ID#']] + '.html'

ICBPsiRNA = ICBP.loc[(ICBP['siRNA'] == 'x') & (ICBP['Human'] == 'x')]
ICBPsiRNA.drop(columns=['siRNA','shRNA','Mouse','Human','Protein knockdown'],inplace=True)
ICBPsiRNA

In [None]:
# Replace the target gene symbol with its identifier, tag it as mRNA, and write
# the unique (ID#, mRNA id) pairs as a headerless TSV edge file.
gene_lookup = symbol_entrez_map[['0_x', '0_y']].rename(columns={"0_x": "Target Gene"})
ICBPsiRNA = gene_lookup.merge(ICBPsiRNA, on="Target Gene").drop(columns='Target Gene')
sirna_gene_ids = ICBPsiRNA['0_y'].astype('Int64').astype(str)
ICBPsiRNA['0_y'] = sirna_gene_ids + '#mRNA'

sirna_edges = ICBPsiRNA[['ID#', '0_y']].drop_duplicates()
sirna_edges.to_csv(edge_data_location + 'RsiRNA-mRNA.txt', header=None, sep='\t', index=None)

***
### shRNA-mRNA

* [The MIT/ICBP siRNA Database](http://web.mit.edu/sirna/index.html)

In [None]:
# shRNA entries flagged for human, without the flag/knockdown columns.
is_human_shrna = (ICBP['shRNA'] == 'x') & (ICBP['Human'] == 'x')
ICBPshRNA = ICBP.loc[is_human_shrna].drop(
    columns=['siRNA', 'shRNA', 'Mouse', 'Human', 'Protein knockdown'])
ICBPshRNA

In [None]:
# Replace the target gene symbol with its identifier, tag it as mRNA, and write
# the unique (ID#, mRNA id) pairs as a headerless TSV edge file.
gene_lookup = symbol_entrez_map[['0_x', '0_y']].rename(columns={"0_x": "Target Gene"})
ICBPshRNA = gene_lookup.merge(ICBPshRNA, on="Target Gene").drop(columns='Target Gene')
shrna_gene_ids = ICBPshRNA['0_y'].astype('Int64').astype(str)
ICBPshRNA['0_y'] = shrna_gene_ids + '#mRNA'

shrna_edges = ICBPshRNA[['ID#', '0_y']].drop_duplicates()
shrna_edges.to_csv(edge_data_location + 'RshRNA-mRNA.txt', header=None, sep='\t', index=None)

***
### siRNA drug-disease

* [DrugBank](https://go.drugbank.com/categories/DBCAT005484) 

In [None]:
# copy-paste from https://go.drugbank.com/categories/DBCAT005484
# Tab-separated text saved by hand; the last line displays the table in the notebook
siRNA_disease = pd.read_csv(
    unprocessed_data_location + 'siRNA-disease_DrugBank.txt',
    sep='\t',
)
siRNA_disease

In [None]:
# Hand-curated annotations, one entry per row of the loaded table (row order must match the file)
siRNA_disease['DB ID'] = ['DB14582', 'DB15066', 'DB15935', 'DB16699']
siRNA_disease['MONDO'] = [
    'MONDO_0017132,MONDO_0001824',
    'MONDO_0002520',
    'MONDO_0009823',
    'MONDO_0017132,MONDO_0001824',
]

# One row per (drug, MONDO term): split the comma-separated terms, explode, drop the text columns
siRNA_disease['MONDO'] = siRNA_disease['MONDO'].str.split(',')
siRNA_disease = siRNA_disease.explode('MONDO').drop(columns=['Drug', 'Drug Description'])
siRNA_disease

In [None]:
# Write the unique rows as a headerless, tab-separated edge list
(
    siRNA_disease
    .drop_duplicates()
    .to_csv(edge_data_location + 'RsiRNAd-disease.txt', sep='\t', header=None, index=None)
)

***
### aptamer-protein

* [Apta-Index](https://www.aptagen.com/apta-index/) <br/>
Apta-index is the most advanced user-friendly database on aptamers. Aptagen does not list this information contained herein as products but as a database of information obtained from the published literature. 

In [None]:
# Load the Apta-Index export (no header row in the file; column names supplied here)
aptamer_protein = pd.read_csv(
    unprocessed_data_location + 'aptaindex.csv',
    names=['Name', 'ID', 'Target', 'Sequence'],
)
# Lower-case the target names before joining them against column 0 of desc_pro_map
aptamer_protein['Target'] = aptamer_protein['Target'].str.lower()
aptamer_protein = aptamer_protein[['ID', 'Target']].merge(
    desc_pro_map.rename(columns={0: 'Target'}), on='Target')
# Prefix each aptamer ID with the details-page path fragment
aptamer_protein['ID'] = 'aptamer-details/?id=' + aptamer_protein['ID'].astype(str)

In [None]:
# Write the unique (aptamer ID, column 1 of desc_pro_map) pairs as a headerless TSV
(
    aptamer_protein[['ID', 1]]
    .drop_duplicates()
    .to_csv(edge_data_location + 'Raptamer-protein.txt', sep='\t', header=None, index=None)
)

***
### aptamer-chemical

* [Apta-Index](https://www.aptagen.com/apta-index/)

In [None]:
# Load the Apta-Index export (no header row in the file; column names supplied here)
aptamer_chemical = pd.read_csv(
    unprocessed_data_location + 'aptaindex.csv',
    names=['Name', 'ID', 'Target', 'Sequence'],
)
# Lower-case the target names before joining them against column 0 of desc_chebi_map
aptamer_chemical['Target'] = aptamer_chemical['Target'].str.lower()
aptamer_chemical = aptamer_chemical[['ID', 'Target']].merge(
    desc_chebi_map.rename(columns={0: 'Target'}), on='Target')
# Prefix each aptamer ID with the details-page path fragment
aptamer_chemical['ID'] = 'aptamer-details/?id=' + aptamer_chemical['ID'].astype(str)
aptamer_chemical

In [None]:
# Write the unique (aptamer ID, column 1 of desc_chebi_map) pairs as a headerless TSV
(
    aptamer_chemical[['ID', 1]]
    .drop_duplicates()
    .to_csv(edge_data_location + 'Raptamer-chemical.txt', sep='\t', header=None, index=None)
)

***
### aptamer drug-protein

* [DrugBank](https://go.drugbank.com/categories/DBCAT001641) 

In [None]:
# copy-paste from https://go.drugbank.com/categories/DBCAT001641
# NOTE: this rebinds aptamer_protein, replacing the Apta-Index frame built earlier in the notebook
aptamer_protein = pd.read_csv(
    unprocessed_data_location + 'aptamer-protein_DrugBank.txt',
    sep='\t',
)
aptamer_protein

In [None]:
# Hand-curated annotations, one entry per row of the loaded table (row order must match the file)
aptamer_protein['DB ID'] = ['DB04932', 'DB04932', 'DB04932', 'DB04998']
aptamer_protein['PRO'] = ['PR_000001575', 'PR_000001576', 'PR_000001577', 'PR_000001752']
aptamer_protein = aptamer_protein.drop(columns=['Drug', 'Target'])
# Move 'Type' to position 2
aptamer_protein.insert(2, 'Type', aptamer_protein.pop('Type'))
aptamer_protein

In [None]:
# Write the unique rows as a headerless, tab-separated edge list
(
    aptamer_protein
    .drop_duplicates()
    .to_csv(edge_data_location + 'Raptamerd-protein.txt', sep='\t', header=None, index=None)
)

***
### aptamer drug-disease

* [DrugBank](https://go.drugbank.com/categories/DBCAT001641) 

In [None]:
# copy-paste from https://go.drugbank.com/categories/DBCAT001641
# Tab-separated text saved by hand; the last line displays the table in the notebook
aptamer_disease = pd.read_csv(
    unprocessed_data_location + 'aptamer-disease_DrugBank.txt',
    sep='\t',
)
aptamer_disease

In [None]:
# Hand-curated annotations, one entry per row of the loaded table (row order must match the file)
aptamer_disease['DB ID'] = ['DB04932', 'DB04998']
aptamer_disease['MONDO'] = [
    'MONDO_0019514',
    'MONDO_0004992,MONDO_0002367,MONDO_0004643,MONDO_0009831',
]
# One row per (drug, MONDO term): split the comma-separated terms, explode, drop the text columns
aptamer_disease['MONDO'] = aptamer_disease['MONDO'].str.split(',')
aptamer_disease = aptamer_disease.explode('MONDO').drop(columns=['Drug', 'Drug Description'])
aptamer_disease

In [None]:
# Write the unique rows as a headerless, tab-separated edge list
(
    aptamer_disease
    .drop_duplicates()
    .to_csv(edge_data_location + 'Raptamerd-disease.txt', sep='\t', header=None, index=None)
)

***
### mRNA vaccines-disease

* [DrugBank](https://go.drugbank.com/categories/DBCAT005631) 

In [None]:
# copy-paste from https://go.drugbank.com/categories/DBCAT005631
# Tab-separated text saved by hand; the last line displays the table in the notebook
mRNAv_disease = pd.read_csv(
    unprocessed_data_location + 'mRNAv-disease_DrugBank.txt',
    sep='\t',
)
mRNAv_disease

In [None]:
# Hand-curated DrugBank IDs, one per row of the loaded table (row order must match the file)
mRNAv_disease['DB ID'] = [
    'DB15654', 'DB15695', 'DB15696', 'DB16401',
    'DB16402', 'DB17088', 'DB17090', 'DB17095',
]
# Every row is linked to the same MONDO term
mRNAv_disease['MONDO'] = 'MONDO_0100096'
mRNAv_disease = mRNAv_disease.drop(columns=['Drug Description', 'Drug'])
mRNAv_disease

In [None]:
# Write the unique rows as a headerless, tab-separated edge list
(
    mRNAv_disease
    .drop_duplicates()
    .to_csv(edge_data_location + 'RmRNAv-disease.txt', sep='\t', header=None, index=None)
)

***
### lncRNA-mRNA

* [LncExpDB](https://ngdc.cncb.ac.cn/lncexpdb/) <br /> LncExpDB is a comprehensive database for lncRNA expression. It covers expression profiles of lncRNA genes across various biological contexts, predicts potential functional lncRNAs and their interacting partners, and thus provides essential guidance on experimental design.

In [None]:
# Download from https://ngdc.cncb.ac.cn/lncexpdb/interactions --> Download button
lncRNA_mRNA = pd.read_csv(unprocessed_data_location + 'interaction.txt', sep='\t')
# Keep rows with a lncRNA name. The explicit copy makes the column assignment below
# target an independent frame rather than a slice (avoids SettingWithCopyWarning).
lncRNA_mRNA = lncRNA_mRNA[lncRNA_mRNA['lncname'].notna()].copy()
# 'lncname' may list several comma-separated names: one row per name
lncRNA_mRNA['lncname'] = lncRNA_mRNA['lncname'].str.split(',')
lncRNA_mRNA = lncRNA_mRNA.explode('lncname')
lncRNA_mRNA = lncRNA_mRNA.drop(columns=['geneid','pcg','lnclocation','pcglocation','ID'])
lncRNA_mRNA

In [None]:
# First join: resolve the lncRNA symbol; the mapped identifier arrives as '0_y'
lncRNA_mRNA = symbol_entrez_map.rename(columns={'0_x': 'lncname'}).merge(
    lncRNA_mRNA, on='lncname')
# Second join: resolve the partner symbol. Both sides now carry '0_y', so pandas suffixes them:
# '0_y_x' (left, from 'pcgname') and '0_y_y' (right, the lncRNA identifier from the first join)
lncRNA_mRNA = symbol_entrez_map.rename(columns={'0_x': 'pcgname'}).merge(
    lncRNA_mRNA, on='pcgname')
lncRNA_mRNA = lncRNA_mRNA.drop(columns=['lncname', 'pcgname'])
# Tag each identifier with its node type
lncRNA_mRNA['0_y_y'] = lncRNA_mRNA['0_y_y'].astype(str) + '#lncRNA'
lncRNA_mRNA['0_y_x'] = lncRNA_mRNA['0_y_x'].astype(str) + '#mRNA'
lncRNA_mRNA

In [None]:
# Write the unique (lncRNA, mRNA) pairs as a headerless, tab-separated edge list
(
    lncRNA_mRNA[['0_y_y', '0_y_x']]
    .drop_duplicates()
    .to_csv(edge_data_location + 'RlncRNA-mRNA.txt', sep='\t', header=None, index=None)
)

***
### riboswitch-protein

* [TBDB](https://tbdb.io/) <br /> 

TBDB contains T-box riboswitch fold prediction, tRNA pairs from host organisms, information regarding T-box riboswitch genetic context, and thermodynamic calculations of putative T-box riboswitch sequences found in nature.

In [None]:
#https://tbdb.io/database/tbdb.csv
# Comma-separated TBDB export; the last line displays the table in the notebook
riboswitch_protein = pd.read_csv(
    unprocessed_data_location+'tbdb.csv',
    sep=',',
)
riboswitch_protein

In [None]:
# For post-processing purposes: every unique_name gets a '.html' suffix
riboswitch_protein['unique_name'] = riboswitch_protein['unique_name'] + '.html'

# Lower-case the downstream protein name (used as the join key in the next cell)
riboswitch_protein['downstream_protein'] = riboswitch_protein['downstream_protein'].str.lower()
# Keep a fixed subset of columns, in this order
riboswitch_protein = riboswitch_protein[[
    'Rank', 'E_value', 'Score', 'Bias', 'Tbox_start', 'Tbox_end', 'CM_accuracy', 'GC',
    'unique_name', 'locus_start', 'tbox_length', 'locus_end', 'locus_view_start',
    'locus_view_end', 'deltadelta_g', 'downstream_protein', 'downstream_protein_id',
    'downstream_protein_EC', 'protein_desc', 'protein_url', 'protein_id_short',
]]

In [None]:
# Join the lower-cased protein name against column 0 of desc_pro_map, then drop that key column
riboswitch_protein = riboswitch_protein.merge(
    desc_pro_map, left_on=['downstream_protein'], right_on=[0]
).drop(columns=[0])

riboswitch_protein

In [None]:
# Write the unique (riboswitch page, column 1 of desc_pro_map) pairs as a headerless TSV
(
    riboswitch_protein[['unique_name', 1]]
    .drop_duplicates()
    .to_csv(edge_data_location + 'Rriboswitch-protein.txt', sep='\t', header=None, index=None)
)

***
### riboswitch-bacterial strain

* [RSwitch database](https://penchovsky.atwebpages.com/applications.php?page=58) <br /> 
The RSwitch database contains information on using riboswitches as antibacterial drug targets. Each riboswitch is represented by its ID, name, aptamer sequences, secondary structures, multiple alignments, consensus motifs, and biochemical pathways.

In [None]:
# The file has no header row, so columns are addressed by position (0, 1, 2, ...)
riboswitch_bactStrain = pd.read_csv(
    unprocessed_data_location + 'rswitch.csv',
    header=None,
)
riboswitch_bactStrain

In [None]:
# Export the distinct values of column 2, one per line, as input for the NCBI Taxonomy name lookup
(
    riboswitch_bactStrain[2]
    .drop_duplicates()
    .to_csv(unprocessed_data_location + 'bacteria.txt', sep='\n', header=None, index=None)
)
# --> https://www.ncbi.nlm.nih.gov/Taxonomy/TaxIdentifier/tax_identifier.cgi (Note that some manual work is needed)

In [None]:
# Fields in the tax report are separated by "<tab>|<tab>". The separator is a regular
# expression, so it is written as a raw string: in a normal literal '\|' is an invalid
# escape sequence (SyntaxWarning on Python 3.12+). The python engine is needed for regex separators.
bacteria = pd.read_csv(unprocessed_data_location + 'tax_report.txt', sep=r'\t\|\t', engine='python')
# Nullable integer dtype keeps missing taxids as <NA> instead of forcing floats
bacteria.taxid = bacteria.taxid.astype('Int64')
bacteria

In [None]:
# Join column 2 (renamed 'name') with the taxonomy report to attach a taxid to each row
riboswitch_bactStrain = riboswitch_bactStrain.rename(columns={2: 'name'}).merge(
    bacteria[['name', 'taxid']], on=['name'])
# Prefix each taxid with the taxonomy-browser query fragment
riboswitch_bactStrain['taxid'] = 'wwwtax.cgi?id=' + riboswitch_bactStrain['taxid'].astype(str)
riboswitch_bactStrain

In [None]:
# Write (column 0, taxid) pairs as a headerless TSV; duplicates are not removed here
(
    riboswitch_bactStrain[[0, 'taxid']]
    .to_csv(edge_data_location + 'Rriboswitch-bactStrain.txt', sep='\t', header=None, index=None)
)

***
### riboswitch-gobp

* [TBDB](https://tbdb.io/) <br /> 

In [None]:
#https://tbdb.io/database/
riboswitch_gobp = pd.read_csv(unprocessed_data_location+'tbdb.csv', sep=',')
# For post-processing purposes: every unique_name gets a '.html' suffix
riboswitch_gobp['unique_name'] = riboswitch_gobp['unique_name'] + '.html'

# Extract only GO terms: take the text after the last '[' of protein_desc, cut it at its
# last ']', and replace ':' with '_'
gobp = (
    riboswitch_gobp['protein_desc']
    .str.rpartition('[')[2]
    .str.rpartition(']')[0]
    .str.replace(":", "_")
)
riboswitch_gobp = pd.concat([riboswitch_gobp, gobp.rename('gobp')], axis=1)
# Keep rows whose extracted text contains "GO" (missing values count as no match)
riboswitch_gobp = riboswitch_gobp[riboswitch_gobp['gobp'].str.contains("GO", na=False)]
riboswitch_gobp[['unique_name', 'gobp']]

In [None]:
# Write the unique (riboswitch page, GO term) pairs as a headerless, tab-separated edge list
(
    riboswitch_gobp[['unique_name', 'gobp']]
    .drop_duplicates()
    .to_csv(edge_data_location + 'Rriboswitch-gobp.txt', sep='\t', header=None, index=None)
)

***
### ribozyme-GO

* [Ribocentre](https://www.ribocentre.org/) <br />
Ribocentre is designed to contain comprehensive information on all natural ribozymes.

In [None]:
# Fetch the Ribocentre file into the unprocessed-data folder.
# NOTE(review): the next cell reads 'Ribocentre - Application.xlsx'; presumably this download
# produces that file name — confirm.
data_downloader('https://www.ribocentre.org/38dffd70-0f9f-499b-b442-be2f6e91a156', unprocessed_data_location)

In [None]:
# header=1: column names are taken from the second row of the sheet
ribozyme_go = pd.read_excel(
    unprocessed_data_location + 'Ribocentre - Application.xlsx',
    header=1,
)
ribozyme_go

In [None]:
# Hand-curated GO term per spreadsheet row (34 entries, positional: the order must match the
# rows of the loaded sheet). An empty string means no GO term was assigned to that row.
# NOTE(review): rows with an empty 'go' are not filtered out before the merge below — confirm
# that empty terms are acceptable downstream.
ribozyme_go['go'] = ['','','GO_0015867', 'GO_0032363', 'GO_0010468', 'GO_0010468', 'GO_0010468', 'GO_2000232',
                         'GO_0010468', 'GO_0010468', 'GO_0003743', '', '', '', '', '', '', '', 'GO_0010468',
                         '', '', '', 'GO_0050790', '', '', '', '', '', 'GO_0050790', '', '', '', '', '']
# Move the new 'go' column to position 1
ribozyme_go.insert(1,'go',ribozyme_go.pop('go'))
# Join on the ribozyme name (column 0 of ribozyme_rfam_map), then drop the name column
ribozyme_go = pd.merge(ribozyme_rfam_map.rename(columns={0:'ribozyme name'}),
                       ribozyme_go, on='ribozyme name').drop(columns='ribozyme name')
ribozyme_go

In [None]:
# Write (column 1 of ribozyme_rfam_map, GO term) pairs as a headerless TSV; duplicates are kept
(
    ribozyme_go[[1, 'go']]
    .to_csv(edge_data_location + 'Rribozyme-GO.txt', sep='\t', header=None, index=None)
)

***
### viral RNA-ribozyme

* [ViroidDB](https://viroids.org/) <br />
ViroidDB is the most comprehensive collection of viroid, satellite RNA, retrozyme, and deltavirus genome sequences available on the internet. 

In [None]:
# Fetch the ViroidDB JSON dump into the unprocessed-data folder (read as 'all.json' in the next cell)
data_downloader('https://viroids.org/db/latest/all.json', unprocessed_data_location)

In [None]:
# Transpose after loading so that each JSON top-level key becomes a row
vRNA_ribozyme = pd.read_json(unprocessed_data_location + 'all.json').T

# Extract ribozymes: every line of the 'ribozymes' text that starts with ">> ",
# stripped of its newlines and of the ">> " marker
myre = re.compile(r"\n>> .*?\n")
ribozyme = [
    [hit.replace("\n", '').replace(">> ", '') for hit in myre.findall(record)]
    for record in vRNA_ribozyme.ribozymes
]

# List of all possible ribozymes (useful for mapping)
a = [name for names in ribozyme for name in names]
set(a)

In [None]:
# Append the per-record ribozyme lists as a new column (labelled 0) after resetting the index
# so that rows line up by position
vRNA_ribozyme = pd.concat(
    [vRNA_ribozyme.reset_index().drop(columns=['index']), pd.Series(ribozyme)],
    axis=1,
)
# One row per (record, ribozyme hit); keep only the first whitespace-separated token of each hit
vRNA_ribozyme = vRNA_ribozyme.explode(0)
vRNA_ribozyme[0] = vRNA_ribozyme[0].str.split().str[0]
# Drop the columns that are not needed for the edge list
vRNA_ribozyme.drop(
    columns=[
        'bioSample', 'genus', 'family', 'identicalSeqs', 'genBankTitle', 'displayTitle',
        'length', 'sequenceType', 'nucCompleteness', 'genotype', 'segment', 'publications',
        'geoLocation', 'country', 'usa', 'submitters', 'releaseDate', 'isolate',
        'sequence', 'structure', 'type', 'ribozymes', 'Cls_ID80', 'Cls_ID70', 'Cls_ID85',
        'Cls_ID75', 'Cls_ID95', 'Cls_ID90', 'sraAccession', 'species', 'host',
    ],
    inplace=True,
)
# Move the ribozyme column to the front and relabel it 1
vRNA_ribozyme.insert(0, 1, vRNA_ribozyme.pop(0))
vRNA_ribozyme

In [None]:
# Join ribozyme names (column 0 of the map) with the extracted hits (column 1 of the frame).
# Both sides have a column labelled 1, so pandas suffixes them: '1_x' (map) and '1_y' (frame).
vRNA_ribozyme = ribozyme_rfam_map.merge(vRNA_ribozyme, left_on=0, right_on=1)
vRNA_ribozyme = vRNA_ribozyme.drop(columns=[0])
# Move 'accession' to position 1
vRNA_ribozyme.insert(1, 'accession', vRNA_ribozyme.pop('accession'))
vRNA_ribozyme

In [None]:
# Write the unique (accession, mapped ribozyme) pairs as a headerless, tab-separated edge list
(
    vRNA_ribozyme[['accession', '1_x']]
    .drop_duplicates()
    .to_csv(edge_data_location + 'RviralRNA-ribozyme.txt', sep='\t', header=None, index=None)
)

***
### circRNA-extracellular form

* [miRandola](http://mirandola.iit.cnr.it/) <br /> miRandola is a comprehensive manually curated classification of different extracellular circulating non-coding RNA types.

In [None]:
#http://mirandola.iit.cnr.it/download/miRandola_version_02_2017.txt

In [None]:
circRNA_ev = pd.read_csv(unprocessed_data_location+'miRandola_version_02_2017.txt', sep='\t')
# Keep circRNA rows whose organism contains 'apiens'. The explicit copy makes the column
# assignment below target an independent frame rather than a slice (avoids SettingWithCopyWarning).
circRNA_ev = circRNA_ev[(circRNA_ev['RNA_class'] == 'circRNA') &
                        (circRNA_ev['organism'].str.contains('apiens'))].copy()

# circRNA in miRandola only circulates in blood
circRNA_ev['gocc'] = 'GO_0072562'
# Resolve the RNA symbol ('0_x' in symbol_entrez_map) to its identifier ('0_y')
circRNA_ev = pd.merge(circRNA_ev, symbol_entrez_map.rename(columns={'0_x':'RNA'}), on='RNA')

# Tag each identifier with its RNA class, e.g. '<id>#circRNA'
circRNA_ev['0_y'] = circRNA_ev['0_y'].astype(str) + '#' + circRNA_ev['RNA_class'].astype(str)
circRNA_ev

In [None]:
# Write (circRNA, GO term) pairs as a headerless TSV; duplicates are kept
(
    circRNA_ev[['0_y', 'gocc']]
    .to_csv(edge_data_location + 'RcircRNA-gocc.txt', sep='\t', header=None, index=None)
)

***
### circRNA-miRNA

* [SomamiR DB](https://compbio.uthsc.edu/SomamiR/) <br /> SomamiR is a database of cancer somatic mutations in microRNAs (miRNA) and their target sites that potentially alter the interactions between miRNAs and competing endogenous RNAs (ceRNA) including mRNAs, circular RNAs (circRNA) and long noncoding RNAs (lncRNA). It also provides an integrated platform for the functional analysis of these somatic mutations.

In [None]:
# Fetch the SomamiR circRNA archive into the unprocessed-data folder.
# NOTE(review): the next cell reads the extracted 'circRNA_somatic_v2.0.txt'; presumably
# data_downloader unpacks the .tar.gz — confirm.
data_downloader('https://compbio.uthsc.edu/SomamiR/download/circRNA_somatic_v2.0.txt.tar.gz',
                unprocessed_data_location)

In [None]:
# Skip the file's first line and address columns by position (0, 1, 2, ...)
circRNA_miRNA = pd.read_csv(
    unprocessed_data_location + 'circRNA_somatic_v2.0.txt',
    sep="\t",
    header=None,
    skiprows=[0],
)
circRNA_miRNA

***
* [miRNet](https://www.mirnet.ca/)

In [None]:
# miRNet export; the first row of the CSV is used as the header
circRNA_miRNA2 = pd.read_csv(
    unprocessed_data_location + 'miRNet-mir-circRNA.csv',
)
circRNA_miRNA2

In [None]:
# Outer-join the two sources on (circRNA symbol, miRNA id): columns 0 and 4 come from the
# first source, 'symbol' and 'mir_id' from miRNet
circRNA_miRNA = pd.merge(circRNA_miRNA, circRNA_miRNA2, how='outer', left_on=[0, 4], right_on=['symbol', 'mir_id'])

# Rows present in only one source have gaps in the other source's key columns; fill each pair
# from its counterpart. Results are assigned back explicitly: `df[col].fillna(..., inplace=True)`
# is chained assignment, which does not update the frame under pandas Copy-on-Write.
circRNA_miRNA[0] = circRNA_miRNA[0].fillna(circRNA_miRNA['symbol'])
circRNA_miRNA[4] = circRNA_miRNA[4].fillna(circRNA_miRNA['mir_id'])
circRNA_miRNA['mir_id'] = circRNA_miRNA['mir_id'].fillna(circRNA_miRNA[4])
circRNA_miRNA['symbol'] = circRNA_miRNA['symbol'].fillna(circRNA_miRNA[0])

# Resolve circRNA symbols to identifiers ('0_y') and miRNA names to the map's column 0 (as 'a')
circRNA_miRNA = pd.merge(circRNA_miRNA, symbol_entrez_map.rename(columns={'0_x':0}), on=0)
circRNA_miRNA = pd.merge(circRNA_miRNA, mirna_mirbase_map.rename(columns={1:4, 0:'a'}), on=4)
circRNA_miRNA['0_y'] = circRNA_miRNA['0_y'].astype(str) + '#circRNA'

circRNA_miRNA

In [None]:
# Split on the accession prefix: 'MIMAT...' goes to the miRNA file, any other 'MI...' to the premiRNA file
circRNA_maturemiRNA = circRNA_miRNA.loc[circRNA_miRNA['a'].str.startswith('MIMAT')]
circRNA_premiRNA = circRNA_miRNA.loc[
    circRNA_miRNA['a'].str.startswith('MI') & ~circRNA_miRNA['a'].str.startswith('MIMAT')
]

# Write each set of unique (circRNA, accession) pairs as a headerless, tab-separated edge list
(
    circRNA_maturemiRNA[['0_y', 'a']]
    .drop_duplicates()
    .to_csv(edge_data_location + 'RcircRNA-miRNA.txt', sep='\t', header=None, index=None)
)
(
    circRNA_premiRNA[['0_y', 'a']]
    .drop_duplicates()
    .to_csv(edge_data_location + 'RcircRNA-premiRNA.txt', sep='\t', header=None, index=None)
)

***
### Remove unprocessed raw data

In [None]:
#shutil.rmtree(unprocessed_data_location)

***
#### PheKnowLator requires at least 2 rows per edge file (headers were removed), so single-row files are written back with their row duplicated

In [None]:
# Column 1 of the edge source list holds each edge file's path; prefix it so that it
# resolves from this notebook's folder
nodes = pd.read_csv('../resources/edge_source_list.txt',sep=', ',header=None, engine='python')
nodes[1] = '../'+nodes[1].astype(str)

for i in set(nodes[1]):
    # Read every edge file
    df = pd.read_csv(i,sep='\t',header=None)
    # If the file has one single row, write it back with that row doubled.
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    if len(df) == 1:
        pd.concat([df, df]).to_csv(i, header=None, sep='\t', index=None)

## Non-ontology data

In [None]:
# Only the first 115 entries of the edge source list are considered here
nodes = pd.read_csv('../resources/edge_source_list.txt',sep=', ',header=None, engine='python')[:115]
# Split each edge label at its first '-' into the two node types.
# `n` is passed by keyword: pandas 2.0 no longer accepts it positionally.
nodes[['A', 'B']] = nodes[0].str.split('-', n=1, expand=True)
a = set(nodes['A'])
b = set(nodes['B'])
# Print every distinct node type appearing on either side of an edge
print(a.union(b))

In [None]:
# Provided by PKL ecosystem
data_downloader(processed_url+'subclass_construction_map.pkl', '../resources/construction_approach/')

# Load the downloaded map.
# NOTE(review): read_pickle executes whatever the pickle contains; only use it on a trusted download source.
nonO_data = pd.read_pickle('../resources/construction_approach/subclass_construction_map.pkl')

# For instance, ncbi IDs are mapped to appropriate SO Ontology entries
list(nonO_data.items())[:5]

***
### miRNA sequences

In [None]:
# Split the map on the accession prefix in column 0 and give each part its class list.
# The explicit copies make the 'SO' assignments target independent frames rather than
# slices of mirna_mirbase_map (avoids SettingWithCopyWarning).
mature_mirna = mirna_mirbase_map[mirna_mirbase_map[0].str.startswith('MIMAT')].copy()
mature_mirna['SO'] = [['SO_0000276']] * len(mature_mirna)

pre_mirna = mirna_mirbase_map[~mirna_mirbase_map[0].str.startswith('MIMAT')].copy()
pre_mirna['SO'] = [['SO_0000647']] * len(pre_mirna)

# NOTE: mirna_mirbase_map is rebound and carries the extra 'SO' column from here on
mirna_mirbase_map = pd.concat([mature_mirna, pre_mirna])

# {accession: class list}, merged into the shared non-ontology map
mirna_nonO = mirna_mirbase_map.drop(1, axis=1).set_index(0).to_dict()
nonO_data = {**nonO_data, **mirna_nonO['SO']}

***
### ASO sequences

In [None]:
# Unique identifiers from column 0 of the ASO-mRNA edge list
ASOnonO_data = pd.read_csv('../resources/edge_data/RASO-mRNA.txt', sep='\t', header=None)[0].drop_duplicates()

# Attach the class list to every identifier and fold {identifier: classes} into nonO_data
ASOnonO_data = (
    pd.DataFrame(ASOnonO_data)
    .assign(SO=lambda ids: [['SO_0000644']] * len(ids))
    .set_index(0)
    .to_dict()
)
nonO_data = {**nonO_data, **ASOnonO_data['SO']}

***
### ASO drugs

In [None]:
# Gather the ASO-drug identifiers from every edge file that mentions them (file, column).
# pd.concat replaces Series.append, which was removed in pandas 2.0.
ASOdnonO_data = pd.concat([
    pd.read_csv(path, sep='\t', header=None)[col]
    for path, col in [
        ('../resources/edge_data/RASOd-mRNA.txt', 1),
        ('../resources/edge_data/RASOd-disease.txt', 0),
        ('../resources/edge_data/RASOd-protein11007.txt', 0),
        ('../resources/edge_data/RASOd-protein10002.txt', 0),
    ]
]).drop_duplicates()

# Source columns have different labels, so the combined Series is unnamed and becomes column 0
ASOdnonO_data = pd.DataFrame(ASOdnonO_data)
ASOdnonO_data['SO'] = [['CHEBI_76720']] * len(ASOdnonO_data)
ASOdnonO_data = ASOdnonO_data.set_index(0).to_dict()
nonO_data = {**nonO_data, **ASOdnonO_data['SO']}

***
### Aptamer drugs

In [None]:
# Gather the aptamer-drug identifiers from every edge file that mentions them (file, column).
# pd.concat replaces Series.append, which was removed in pandas 2.0.
aptamerdnonO_data = pd.concat([
    pd.read_csv(path, sep='\t', header=None)[col]
    for path, col in [
        ('../resources/edge_data/Raptamerd-protein.txt', 0),
        ('../resources/edge_data/Raptamerd-disease.txt', 0),
    ]
]).drop_duplicates()

# Both sources are column 0, so the combined Series keeps the label 0
aptamerdnonO_data = pd.DataFrame(aptamerdnonO_data)
aptamerdnonO_data['SO'] = [['CHEBI_140490']] * len(aptamerdnonO_data)
aptamerdnonO_data = aptamerdnonO_data.set_index(0).to_dict()
nonO_data = {**nonO_data, **aptamerdnonO_data['SO']}

***
### Aptamer sequences

In [None]:
# Gather the aptamer identifiers from every edge file that mentions them (file, column).
# pd.concat replaces Series.append, which was removed in pandas 2.0.
aptamernonO_data = pd.concat([
    pd.read_csv(path, sep='\t', header=None)[col]
    for path, col in [
        ('../resources/edge_data/Raptamer-protein.txt', 0),
        ('../resources/edge_data/Raptamer-chemical.txt', 0),
    ]
]).drop_duplicates()

# Both sources are column 0, so the combined Series keeps the label 0
aptamernonO_data = pd.DataFrame(aptamernonO_data)
aptamernonO_data['SO'] = [['CHEBI_140490']] * len(aptamernonO_data)
aptamernonO_data = aptamernonO_data.set_index(0).to_dict()
nonO_data = {**nonO_data, **aptamernonO_data['SO']}

***
### circRNA sequences

In [None]:
# Gather the circRNA identifiers from every edge file that mentions them (file, column).
# pd.concat replaces Series.append, which was removed in pandas 2.0.
circRNAnonO_data = pd.concat([
    pd.read_csv(path, sep='\t', header=None)[col]
    for path, col in [
        ('../resources/edge_data/RcircRNA-disease.txt', 0),
        ('../resources/edge_data/RcircRNA-gocc.txt', 0),
        ('../resources/edge_data/RcircRNA-miRNA.txt', 0),
        ('../resources/edge_data/RcircRNA-premiRNA.txt', 0),
    ]
]).drop_duplicates()

# All sources are column 0, so the combined Series keeps the label 0
circRNAnonO_data = pd.DataFrame(circRNAnonO_data)
circRNAnonO_data['SO'] = [['SO_0002291']] * len(circRNAnonO_data)
circRNAnonO_data = circRNAnonO_data.set_index(0).to_dict()
nonO_data = {**nonO_data, **circRNAnonO_data['SO']}

***
### gRNA sequences

In [None]:
# Unique, non-missing identifiers from column 0 of the gRNA-gene edge list
gRNAnonO_data = (
    pd.read_csv('../resources/edge_data/RgRNA-gene.txt', sep='\t', header=None)[0]
    .drop_duplicates()
    .dropna()
)

# Attach the class list to every identifier and fold {identifier: classes} into nonO_data
gRNAnonO_data = (
    pd.DataFrame(gRNAnonO_data)
    .assign(SO=lambda ids: [['SO_0000602']] * len(ids))
    .set_index(0)
    .to_dict()
)
nonO_data = {**nonO_data, **gRNAnonO_data['SO']}

***
### lncRNA sequences

In [None]:
# Gather the lncRNA identifiers from every edge file that mentions them (file, column).
# pd.concat replaces Series.append, which was removed in pandas 2.0.
lncRNAnonO_data = pd.concat([
    pd.read_csv(path, sep='\t', header=None)[col]
    for path, col in [
        ('../resources/edge_data/RmiRNA-lncRNA.txt', 1),
        ('../resources/edge_data/RpremiRNA-lncRNA.txt', 1),
        ('../resources/edge_data/RsnoRNA-lncRNA.txt', 1),
        ('../resources/edge_data/RlncRNA-gene.txt', 0),
        ('../resources/edge_data/RlncRNA-disease.txt', 0),
        ('../resources/edge_data/RlncRNA-chemical.txt', 0),
        ('../resources/edge_data/RsmallProtein-lncRNA.txt', 1),
        ('../resources/edge_data/RlncRNA-protein.txt', 0),
        ('../resources/edge_data/RlncRNA-expression2245.txt', 0),
        ('../resources/edge_data/RlncRNA-expression2246.txt', 0),
        ('../resources/edge_data/RlncRNA-expression2291.txt', 0),
        ('../resources/edge_data/RlncRNA-role.txt', 1),
        ('../resources/edge_data/RlncRNA-gocc.txt', 0),
        ('../resources/edge_data/RlncRNA-pw.txt', 0),
        ('../resources/edge_data/RlncRNA-gobp.txt', 0),
        ('../resources/edge_data/RlncRNA-mRNA.txt', 0),
        ('../resources/edge_data/Hgene-lncRNA.txt', 1),
        ('../resources/edge_data/HlncRNA-anatomy.txt', 0),
        ('../resources/edge_data/HlncRNA-cell.txt', 0),
    ]
]).drop_duplicates()

# Source columns have different labels, so the combined Series is unnamed and becomes column 0
lncRNAnonO_data = pd.DataFrame(lncRNAnonO_data)
lncRNAnonO_data['SO'] = [['SO_0001877']] * len(lncRNAnonO_data)
lncRNAnonO_data = lncRNAnonO_data.set_index(0).to_dict()
nonO_data = {**nonO_data, **lncRNAnonO_data['SO']}

***
### mRNA vaccines sequences

In [None]:
# Unique identifiers from column 0 of the mRNA vaccine-disease edge list
mRNAvnonO_data = pd.read_csv('../resources/edge_data/RmRNAv-disease.txt', sep='\t', header=None)[0].drop_duplicates()

# Attach the class list to every identifier and fold {identifier: classes} into nonO_data
mRNAvnonO_data = (
    pd.DataFrame(mRNAvnonO_data)
    .assign(SO=lambda ids: [['VO_0000186']] * len(ids))
    .set_index(0)
    .to_dict()
)
nonO_data = {**nonO_data, **mRNAvnonO_data['SO']}

***
### scaRNA sequences

In [None]:
# Gather the scaRNA identifiers from every edge file that mentions them (file, column).
# pd.concat replaces Series.append, which was removed in pandas 2.0.
scaRNAnonO_data = pd.concat([
    pd.read_csv(path, sep='\t', header=None)[col]
    for path, col in [
        ('../resources/edge_data/RsnoRNA-scaRNA.txt', 1),
        ('../resources/edge_data/Hgene-scaRNA.txt', 1),
        ('../resources/edge_data/HscaRNA-anatomy.txt', 0),
        ('../resources/edge_data/HscaRNA-cell.txt', 0),
    ]
]).drop_duplicates().dropna()

# Source columns have different labels, so the combined Series is unnamed and becomes column 0
scaRNAnonO_data = pd.DataFrame(scaRNAnonO_data)
scaRNAnonO_data['SO'] = [['SO_0002095']] * len(scaRNAnonO_data)
scaRNAnonO_data = scaRNAnonO_data.set_index(0).to_dict()
nonO_data = {**nonO_data, **scaRNAnonO_data['SO']}

***
### scRNA sequences

In [None]:
# Gather the scRNA identifiers from every edge file that mentions them (file, column).
# pd.concat replaces Series.append, which was removed in pandas 2.0.
scRNAnonO_data = pd.concat([
    pd.read_csv(path, sep='\t', header=None)[col]
    for path, col in [
        ('../resources/edge_data/Hgene-scRNA.txt', 1),
        ('../resources/edge_data/HscRNA-anatomy.txt', 0),
        ('../resources/edge_data/HscRNA-cell.txt', 0),
    ]
]).drop_duplicates()

# Source columns have different labels, so the combined Series is unnamed and becomes column 0
scRNAnonO_data = pd.DataFrame(scRNAnonO_data)
scRNAnonO_data['SO'] = [['SO_0000013']] * len(scRNAnonO_data)
scRNAnonO_data = scRNAnonO_data.set_index(0).to_dict()
nonO_data = {**nonO_data, **scRNAnonO_data['SO']}

***
### snRNA sequences

In [None]:
# Gather the snRNA identifiers from every edge file that mentions them (file, column).
# pd.concat replaces Series.append, which was removed in pandas 2.0.
snRNAnonO_data = pd.concat([
    pd.read_csv(path, sep='\t', header=None)[col]
    for path, col in [
        ('../resources/edge_data/RsnoRNA-snRNA.txt', 1),
        ('../resources/edge_data/Hgene-snRNA.txt', 1),
        ('../resources/edge_data/HsnRNA-anatomy.txt', 0),
        ('../resources/edge_data/HsnRNA-cell.txt', 0),
    ]
]).drop_duplicates()

# Source columns have different labels, so the combined Series is unnamed and becomes column 0
snRNAnonO_data = pd.DataFrame(snRNAnonO_data)
snRNAnonO_data['SO'] = [['SO_0000274']] * len(snRNAnonO_data)
snRNAnonO_data = snRNAnonO_data.set_index(0).to_dict()
nonO_data = {**nonO_data, **snRNAnonO_data['SO']}

***
### tRNA sequences

In [None]:
# Gather the tRNA identifiers from every edge file that mentions them (file, column).
# pd.concat replaces Series.append, which was removed in pandas 2.0.
tRNAnonO_data = pd.concat([
    pd.read_csv(path, sep='\t', header=None)[col]
    for path, col in [
        ('../resources/edge_data/RtsRNA-tRNA_tRFdb.txt', 1),
        ('../resources/edge_data/RtsRNA-tRNA_MINTbase.txt', 1),
        ('../resources/edge_data/RtRNA-aminoacid.txt', 0),
        ('../resources/edge_data/RsnoRNA-tRNA.txt', 1),
    ]
]).drop_duplicates().dropna()

# Source columns have different labels, so the combined Series is unnamed and becomes column 0
tRNAnonO_data = pd.DataFrame(tRNAnonO_data)
tRNAnonO_data['SO'] = [['SO_0000253']] * len(tRNAnonO_data)
tRNAnonO_data = tRNAnonO_data.set_index(0).to_dict()
nonO_data = {**nonO_data, **tRNAnonO_data['SO']}

***
### Retained intron sequences

In [None]:
# Gather the retained-intron identifiers from every edge file that mentions them (file, column).
# pd.concat replaces Series.append, which was removed in pandas 2.0.
rinonO_data = pd.concat([
    pd.read_csv(path, sep='\t', header=None)[col]
    for path, col in [
        ('../resources/edge_data/RsnoRNA-retainedIntron.txt', 1),
        ('../resources/edge_data/Hgene-retained_intron.txt', 1),
        ('../resources/edge_data/Hretained_intron-anatomy.txt', 0),
        ('../resources/edge_data/Hretained_intron-cell.txt', 0),
    ]
]).drop_duplicates()

# Source columns have different labels, so the combined Series is unnamed and becomes column 0
rinonO_data = pd.DataFrame(rinonO_data)
rinonO_data['SO'] = [['SO_0000188']] * len(rinonO_data)
rinonO_data = rinonO_data.set_index(0).to_dict()
nonO_data = {**nonO_data, **rinonO_data['SO']}

***
### rRNA sequences

In [None]:
# Gather the rRNA identifiers from every edge file that mentions them (file, column).
# pd.concat replaces Series.append, which was removed in pandas 2.0.
rRNAnonO_data = pd.concat([
    pd.read_csv(path, sep='\t', header=None)[col]
    for path, col in [
        ('../resources/edge_data/RsnoRNA-rRNA.txt', 1),
        ('../resources/edge_data/Hgene-rRNA.txt', 1),
    ]
]).drop_duplicates()

# Both sources are column 1, so the combined Series keeps the label 1 (hence set_index(1))
rRNAnonO_data = pd.DataFrame(rRNAnonO_data)
rRNAnonO_data['SO'] = [['SO_0000252']] * len(rRNAnonO_data)
rRNAnonO_data = rRNAnonO_data.set_index(1).to_dict()
nonO_data = {**nonO_data, **rRNAnonO_data['SO']}

***
### Pseudogene sequences

In [None]:
# Gather the pseudogene identifiers from every edge file that mentions them (file, column).
# pd.concat replaces Series.append, which was removed in pandas 2.0.
pseudononO_data = pd.concat([
    pd.read_csv(path, sep='\t', header=None)[col]
    for path, col in [
        ('../resources/edge_data/RmiRNA-pseudogene.txt', 1),
        ('../resources/edge_data/RsnoRNA-pseudogene.txt', 1),
        ('../resources/edge_data/Hgene-pseudo.txt', 1),
        ('../resources/edge_data/Hpseudo-anatomy.txt', 0),
        ('../resources/edge_data/Hpseudo-cell.txt', 0),
    ]
]).drop_duplicates()

# Source columns have different labels, so the combined Series is unnamed and becomes column 0
pseudononO_data = pd.DataFrame(pseudononO_data)
pseudononO_data['SO'] = [['SO_0000336']] * len(pseudononO_data)
pseudononO_data = pseudononO_data.set_index(0).to_dict()
nonO_data = {**nonO_data, **pseudononO_data['SO']}

***
### Mitochondrial tRNA sequences

In [None]:
# Unique identifiers from column 1 of the gene-mt_tRNA edge list
mttRNAnonO_data = pd.read_csv('../resources/edge_data/Hgene-mt_tRNA.txt', sep='\t', header=None)[1].drop_duplicates()

# Each identifier gets a two-class list; the Series keeps its label 1, hence set_index(1)
mttRNAnonO_data = (
    pd.DataFrame(mttRNAnonO_data)
    .assign(SO=lambda ids: [['SO_0000253','SO_0001272']] * len(ids))
    .set_index(1)
    .to_dict()
)
nonO_data = {**nonO_data, **mttRNAnonO_data['SO']}

***
### miscRNA sequences

In [None]:
# Gather the miscRNA identifiers from every edge file that mentions them (file, column).
# pd.concat replaces Series.append, which was removed in pandas 2.0.
unknownRNAnonO_data = pd.concat([
    pd.read_csv(path, sep='\t', header=None)[col]
    for path, col in [
        ('../resources/edge_data/RsnoRNA-miscRNA.txt', 1),
        ('../resources/edge_data/Hgene-misc_RNA.txt', 1),
        ('../resources/edge_data/Hmisc_RNA-anatomy.txt', 0),
        ('../resources/edge_data/Hmisc_RNA-cell.txt', 0),
    ]
]).drop_duplicates()

# Source columns have different labels, so the combined Series is unnamed and becomes column 0
unknownRNAnonO_data = pd.DataFrame(unknownRNAnonO_data)
unknownRNAnonO_data['SO'] = [['SO_0000356']] * len(unknownRNAnonO_data)
unknownRNAnonO_data = unknownRNAnonO_data.set_index(0).to_dict()
nonO_data = {**nonO_data, **unknownRNAnonO_data['SO']}

***
### mRNA sequences

In [None]:
# Gather the mRNA identifiers from every edge file that mentions them (file, column).
# pd.concat replaces Series.append, which was removed in pandas 2.0.
mRNAnonO_data = pd.concat([
    pd.read_csv(path, sep='\t', header=None)[col]
    for path, col in [
        ('../resources/edge_data/RpremiRNA-mRNA.txt', 1),
        ('../resources/edge_data/RmiRNA-mRNA.txt', 1),
        ('../resources/edge_data/RsnoRNA-mRNA.txt', 1),
        ('../resources/edge_data/RASO-mRNA.txt', 1),
        ('../resources/edge_data/RASOd-mRNA.txt', 0),
        ('../resources/edge_data/RsiRNAd-mRNA.txt', 1),
        ('../resources/edge_data/RsiRNA-mRNA.txt', 1),
        ('../resources/edge_data/RshRNA-mRNA.txt', 1),
        ('../resources/edge_data/RlncRNA-mRNA.txt', 1),
        ('../resources/edge_data/Hgene-mRNA.txt', 1),
        ('../resources/edge_data/HmRNA-protein.txt', 0),
        ('../resources/edge_data/HmRNA-anatomy.txt', 0),
        ('../resources/edge_data/HmRNA-cell.txt', 0),
    ]
]).drop_duplicates()

# Source columns have different labels, so the combined Series is unnamed and becomes column 0
mRNAnonO_data = pd.DataFrame(mRNAnonO_data)
mRNAnonO_data['SO'] = [['SO_0000234']] * len(mRNAnonO_data)
mRNAnonO_data = mRNAnonO_data.set_index(0).to_dict()
nonO_data = {**nonO_data, **mRNAnonO_data['SO']}

***
### Bacterial strain taxids

In [None]:
# Unique taxid entries from column 1 of the riboswitch-bacterial strain edge list
bactSnonO_data = pd.read_csv(
    '../resources/edge_data/Rriboswitch-bactStrain.txt', sep='\t', header=None)[1].drop_duplicates()

# The Series keeps its label 1, hence set_index(1)
bactSnonO_data = (
    pd.DataFrame(bactSnonO_data)
    .assign(SO=lambda ids: [['NCBITaxon_2']] * len(ids))  # NCBITaxon_2 ∈ VO
    .set_index(1)
    .to_dict()
)
nonO_data = {**nonO_data, **bactSnonO_data['SO']}

***
### TEC sequences

In [None]:
# Gather the TEC identifiers from every edge file that mentions them (file, column).
# pd.concat replaces Series.append, which was removed in pandas 2.0.
TECnonO_data = pd.concat([
    pd.read_csv(path, sep='\t', header=None)[col]
    for path, col in [
        ('../resources/edge_data/Hgene-TEC.txt', 1),
        ('../resources/edge_data/HTEC-anatomy.txt', 0),
        ('../resources/edge_data/HTEC-cell.txt', 0),
    ]
]).drop_duplicates()

# Source columns have different labels, so the combined Series is unnamed and becomes column 0
TECnonO_data = pd.DataFrame(TECnonO_data)
TECnonO_data['SO'] = [['SO_0002139']] * len(TECnonO_data)
TECnonO_data = TECnonO_data.set_index(0).to_dict()
nonO_data = {**nonO_data, **TECnonO_data['SO']}

***
### tsRNA sequences

In [None]:
# Gather the tsRNA identifiers from every edge file that mentions them (file, column).
# pd.concat replaces Series.append, which was removed in pandas 2.0.
tsRNAnonO_data = pd.concat([
    pd.read_csv(path, sep='\t', header=None)[col]
    for path, col in [
        ('../resources/edge_data/RtsRNA-miRNA.txt', 0),
        ('../resources/edge_data/RtsRNA-disease.txt', 0),
        ('../resources/edge_data/RtsRNA-tRNA_tRFdb.txt', 0),
        ('../resources/edge_data/RtsRNA-tRNA_MINTbase.txt', 0),
    ]
]).drop_duplicates()

# All sources are column 0, so the combined Series keeps the label 0
tsRNAnonO_data = pd.DataFrame(tsRNAnonO_data)
tsRNAnonO_data['SO'] = [['SO_0000253']] * len(tsRNAnonO_data)
tsRNAnonO_data = tsRNAnonO_data.set_index(0).to_dict()
nonO_data = {**nonO_data, **tsRNAnonO_data['SO']}

***
### Riboswitch sequences

In [None]:
# Gather the riboswitch identifiers from every edge file that mentions them (file, column).
# pd.concat replaces Series.append, which was removed in pandas 2.0.
riboswitchnonO_data = pd.concat([
    pd.read_csv(path, sep='\t', header=None)[col]
    for path, col in [
        ('../resources/edge_data/Rriboswitch-protein.txt', 0),
        ('../resources/edge_data/Rriboswitch-bactStrain.txt', 0),
        ('../resources/edge_data/Rriboswitch-gobp.txt', 0),
    ]
]).drop_duplicates()

# All sources are column 0, so the combined Series keeps the label 0
riboswitchnonO_data = pd.DataFrame(riboswitchnonO_data)
riboswitchnonO_data['SO'] = [['SO_0000035']] * len(riboswitchnonO_data)
riboswitchnonO_data = riboswitchnonO_data.set_index(0).to_dict()
nonO_data = {**nonO_data, **riboswitchnonO_data['SO']}

***
### Ribozyme sequences

In [None]:
# Gather every ribozyme identifier referenced by the ribozyme edge files and map
# each one to the ontology class list ['SO_0000374'].
# Each entry is (edge file, position of the column that holds the ribozyme identifier).
ribozyme_sources = [
    ('../resources/edge_data/Rribozyme-GO.txt', 0),
    ('../resources/edge_data/RviralRNA-ribozyme.txt', 1),
]

# `Series.append` was deprecated in pandas 1.4 and removed in pandas 2.0;
# `pd.concat` stacks the columns in the same order on every pandas version.
ribozymenonO_data = pd.concat(
    [pd.read_csv(path, sep='\t', header=None)[col] for path, col in ribozyme_sources]
).drop_duplicates()

# Name the identifier column 0 explicitly so that `set_index(0)` does not rely
# on the label pandas infers for the concatenated Series.
ribozymenonO_data = ribozymenonO_data.to_frame(name=0)
ribozymenonO_data['SO'] = [['SO_0000374']] * len(ribozymenonO_data)
ribozymenonO_data = ribozymenonO_data.set_index(0).to_dict()  # {'SO': {identifier: [...]}}

# Fold the new identifiers into the running map; on a key clash the new entry wins.
nonO_data = {**nonO_data, **ribozymenonO_data['SO']}

***
### Viral RNA sequences

In [None]:
# Inspect the distinct `moleculeType` values present in `vRNA_ribozyme`; the
# cells below select rows by comparing against these values.
vRNA_ribozyme['moleculeType'].unique()

In [None]:
# Rows of `vRNA_ribozyme` whose moleculeType is exactly 'ssRNA', typed with
# ['SO_0001199'].
# The selection is done in one `.loc` call and copied, so adding the 'SO' column
# below acts on an independent frame instead of a slice of `vRNA_ribozyme`
# (the original chained selection triggers pandas' SettingWithCopyWarning).
ssRNA = vRNA_ribozyme.loc[
    vRNA_ribozyme['moleculeType'] == 'ssRNA', ['accession', 'moleculeType']
].copy()
ssRNA['SO'] = [['SO_0001199']] * len(ssRNA)
ssRNA

In [None]:
# Index the ssRNA table by accession, convert it to nested dicts
# ({column: {accession: value}}), and fold the accession -> SO mapping into the
# running map. On a key clash the ssRNA entry replaces the existing one.
ssRNA_by_accession = ssRNA.set_index('accession')
ssRNAnonO_data = ssRNA_by_accession.to_dict(orient='dict')

nonO_data = dict(nonO_data)
nonO_data.update(ssRNAnonO_data['SO'])

In [None]:
# Type the remaining viral RNA accessions according to their `moleculeType`.
# Every subset is selected with a single `.loc` call and copied before the 'SO'
# column is added, so the assignment acts on an independent frame instead of a
# slice of `vRNA_ribozyme` (the original chained selection triggers pandas'
# SettingWithCopyWarning).
# NOTE(review): rows whose moleculeType is neither 'ssRNA', 'ssRNA(-)', 'RNA'
# nor missing get no entry from the viral RNA cells — confirm against the
# values listed by `vRNA_ribozyme.moleculeType.unique()`.
viral_columns = ['accession', 'moleculeType']

# moleculeType == 'ssRNA(-)' -> ['SO_0001200']
ssRNAm = vRNA_ribozyme.loc[vRNA_ribozyme['moleculeType'] == 'ssRNA(-)', viral_columns].copy()
ssRNAm['SO'] = [['SO_0001200']] * len(ssRNAm)
ssRNAmnonO_data = ssRNAm.set_index('accession').to_dict()
nonO_data = {**nonO_data, **ssRNAmnonO_data['SO']}

# moleculeType == 'RNA' -> ['SO_0001169']
# NOTE(review): the variable is called `dsRNA` but the filter matches the plain
# label 'RNA' — confirm that this label really denotes double-stranded RNA.
dsRNA = vRNA_ribozyme.loc[vRNA_ribozyme['moleculeType'] == 'RNA', viral_columns].copy()
dsRNA['SO'] = [['SO_0001169']] * len(dsRNA)
dsRNAnonO_data = dsRNA.set_index('accession').to_dict()
nonO_data = {**nonO_data, **dsRNAnonO_data['SO']}

# moleculeType missing (NaN) -> ['SO_0001041']
viralRNA = vRNA_ribozyme.loc[vRNA_ribozyme['moleculeType'].isna(), viral_columns].copy()
viralRNA['SO'] = [['SO_0001041']] * len(viralRNA)
viralRNAnonO_data = viralRNA.set_index('accession').to_dict()
nonO_data = {**nonO_data, **viralRNAnonO_data['SO']}

***
### siRNA sequences

In [None]:
# Unique siRNA identifiers (first column of the siRNA-mRNA edge file), each
# mapped to ['SO_0000646'] and merged into the running map.
siRNA_edges = pd.read_csv('../resources/edge_data/RsiRNA-mRNA.txt', sep='\t', header=None)
siRNA_ids = siRNA_edges[0].drop_duplicates()

siRNA_table = siRNA_ids.to_frame()
siRNA_table['SO'] = [['SO_0000646']] * len(siRNA_table)
siRNAnonO_data = siRNA_table.set_index(0).to_dict(orient='dict')

# A fresh dict is built so the previous map object is left untouched; entries
# from this cell win on a key clash.
nonO_data = dict(nonO_data)
nonO_data.update(siRNAnonO_data['SO'])

***
### shRNA sequences

In [None]:
# Unique shRNA identifiers (first column of the shRNA-mRNA edge file), each
# mapped to ['SO_0002031'] and merged into the running map.
shRNA_edges = pd.read_csv('../resources/edge_data/RshRNA-mRNA.txt', sep='\t', header=None)
shRNA_ids = shRNA_edges[0].drop_duplicates()

shRNA_table = shRNA_ids.to_frame()
shRNA_table['SO'] = [['SO_0002031']] * len(shRNA_table)
shRNAnonO_data = shRNA_table.set_index(0).to_dict(orient='dict')

# A fresh dict is built so the previous map object is left untouched; entries
# from this cell win on a key clash.
nonO_data = dict(nonO_data)
nonO_data.update(shRNAnonO_data['SO'])

***
### snoRNA sequences

In [None]:
# Gather every snoRNA identifier referenced by the snoRNA edge files and map
# each one to the ontology class list ['SO_0000275'].
# Each entry is (edge file, positions of the columns that hold snoRNA identifiers).
# RsnoRNA-snoRNA.txt contributes both of its first two columns, so it is listed
# once with two positions and parsed a single time.
snoRNA_sources = [
    ('../resources/edge_data/RsnoRNA-gene.txt', [0]),
    ('../resources/edge_data/RsnoRNA-premiRNA.txt', [0]),
    ('../resources/edge_data/RsnoRNA-miRNA.txt', [0]),
    ('../resources/edge_data/RsnoRNA-snoRNA.txt', [0, 1]),
    ('../resources/edge_data/RsnoRNA-lncRNA.txt', [0]),
    ('../resources/edge_data/RsnoRNA-snRNA.txt', [0]),
    ('../resources/edge_data/RsnoRNA-rRNA.txt', [0]),
    ('../resources/edge_data/RsnoRNA-mRNA.txt', [0]),
    ('../resources/edge_data/RsnoRNA-tRNA.txt', [0]),
    ('../resources/edge_data/RsnoRNA-retainedIntron.txt', [0]),
    ('../resources/edge_data/RsnoRNA-miscRNA.txt', [0]),
    ('../resources/edge_data/RsnoRNA-scaRNA.txt', [0]),
    ('../resources/edge_data/RsnoRNA-pseudogene.txt', [0]),
    ('../resources/edge_data/Hgene-snoRNA.txt', [1]),
    ('../resources/edge_data/HsnoRNA-anatomy.txt', [0]),
    ('../resources/edge_data/HsnoRNA-cell.txt', [0]),
]

snoRNA_columns = []
for snoRNA_path, snoRNA_positions in snoRNA_sources:
    snoRNA_edges = pd.read_csv(snoRNA_path, sep='\t', header=None)
    snoRNA_columns.extend(snoRNA_edges[position] for position in snoRNA_positions)

# `Series.append` was deprecated in pandas 1.4 and removed in pandas 2.0;
# `pd.concat` stacks the columns in the same order on every pandas version.
snoRNAnonO_data = pd.concat(snoRNA_columns).drop_duplicates()

# Name the identifier column 0 explicitly so that `set_index(0)` does not rely
# on the label pandas infers for the concatenated Series.
snoRNAnonO_data = snoRNAnonO_data.to_frame(name=0)
snoRNAnonO_data['SO'] = [['SO_0000275']] * len(snoRNAnonO_data)
snoRNAnonO_data = snoRNAnonO_data.set_index(0).to_dict()  # {'SO': {identifier: ['SO_0000275']}}

# Fold the new identifiers into the running map; on a key clash the new entry wins.
nonO_data = {**nonO_data, **snoRNAnonO_data['SO']}

***
### Small proteins

In [None]:
# Unique small-protein identifiers (first column of the smallProtein-lncRNA edge
# file), each mapped to ['SO_0000104'] and merged into the running map.
sp_edges = pd.read_csv('../resources/edge_data/RsmallProtein-lncRNA.txt', sep='\t', header=None)
sp_ids = sp_edges[0].drop_duplicates()

sp_table = sp_ids.to_frame()
sp_table['SO'] = [['SO_0000104']] * len(sp_table)
spnonO_data = sp_table.set_index(0).to_dict(orient='dict')

# A fresh dict is built so the previous map object is left untouched; entries
# from this cell win on a key clash.
nonO_data = dict(nonO_data)
nonO_data.update(spnonO_data['SO'])

***
### siRNA drugs

In [None]:
# Gather every siRNA-drug identifier (first column of each siRNA-drug edge file)
# and map each one to the two-class list ['SO_0002031', 'CHEBI_23888'].
siRNAd_files = [
    '../resources/edge_data/RsiRNAd-mRNA.txt',
    '../resources/edge_data/RsiRNAd-disease.txt',
]

# `Series.append` was deprecated in pandas 1.4 and removed in pandas 2.0;
# `pd.concat` stacks the columns in the same order on every pandas version.
siRNAdnonO_data = pd.concat(
    [pd.read_csv(path, sep='\t', header=None)[0] for path in siRNAd_files]
).drop_duplicates()

# Name the identifier column 0 explicitly so that `set_index(0)` does not rely
# on the label pandas infers for the concatenated Series.
siRNAdnonO_data = siRNAdnonO_data.to_frame(name=0)
# NOTE(review): 'SO_0002031' is the identifier the shRNA cell assigns, whereas
# the siRNA cell uses 'SO_0000646' — confirm which one siRNA drugs should carry.
# The value is kept unchanged here. The column is still named 'SO' although the
# list also holds a ChEBI identifier.
siRNAdnonO_data['SO'] = [['SO_0002031', 'CHEBI_23888']] * len(siRNAdnonO_data)
siRNAdnonO_data = siRNAdnonO_data.set_index(0).to_dict()  # {'SO': {identifier: [...]}}

# Fold the new identifiers into the running map; on a key clash the new entry wins.
nonO_data = {**nonO_data, **siRNAdnonO_data['SO']}

***
### Biological roles in ChEBI

In [None]:
# Lookup table of role labels; every label is mapped to the same single-element
# class list ['CHEBI_24432'].
role_labels = ['General', 'Tumor-Suppressor-Gene', 'Oncogene']
bio_role = pd.DataFrame({
    "role": role_labels,
    "ChEBI": [['CHEBI_24432']] * 3,
})
bio_role

In [None]:
# Convert the role table to nested dicts keyed by role label and fold the
# role -> ChEBI mapping into the running map (new entries win on a key clash).
bio_role_by_label = bio_role.set_index('role')
role_nonO_data = bio_role_by_label.to_dict(orient='dict')

nonO_data = dict(nonO_data)
nonO_data.update(role_nonO_data['ChEBI'])

***
### Epigenetic modifications in GO

In [None]:
# Disabled inspection helper: when uncommented, it displays the distinct values
# of `miRNA_epiMod.epi_modification`. Presumably these are the modification
# labels mapped to GO classes in the following cells — verify before relying on it.
#miRNA_epiMod.epi_modification.unique()

In [None]:
# Modification labels ending in 'me', 'me2' or 'me3'; every label is mapped to
# the same single-element class list ['GO_0016571'].
methylation_marks = [
    'H3K4me3', 'H3K9me2', 'H3K9me3', 'H3K27me3', 'H3K4me', 'H3K79me2', 'H3K4me2',
    'H3K9me', 'H3K27me', 'H3K36me2', 'H3R17me2',
]
epiMod = pd.DataFrame({
    "mod": methylation_marks,
    "GO": [['GO_0016571']] * len(methylation_marks),
})
epiMod

In [None]:
# Convert the current `epiMod` table to nested dicts keyed by modification label
# and fold the label -> GO mapping into the running map (new entries win on a
# key clash).
epiMod_by_label = epiMod.set_index('mod')
go_nonO_data = epiMod_by_label.to_dict(orient='dict')

nonO_data = dict(nonO_data)
nonO_data.update(go_nonO_data['GO'])

In [None]:
# Single-row table: the label 'H3S10P' is mapped to the class list ['GO_0006468'].
# This rebinds `epiMod`, replacing the table built in the earlier cell.
epiMod = pd.DataFrame({
    "mod": ['H3S10P'],
    "GO": [['GO_0006468']],
})
epiMod

In [None]:
# Convert the current `epiMod` table to nested dicts keyed by modification label
# and fold the label -> GO mapping into the running map (new entries win on a
# key clash).
epiMod_by_label = epiMod.set_index('mod')
go_nonO_data = epiMod_by_label.to_dict(orient='dict')

nonO_data = dict(nonO_data)
nonO_data.update(go_nonO_data['GO'])

In [None]:
# Modification labels ending in 'ac'; every label is mapped to the same
# single-element class list ['GO_0016573'].
# This rebinds `epiMod`, replacing the table built in the earlier cell.
acetylation_marks = ['H3ac', 'H4ac', 'H3K9ac', 'H5ac', 'H3K4ac', 'H3K14ac']
epiMod = pd.DataFrame({
    "mod": acetylation_marks,
    "GO": [['GO_0016573']] * len(acetylation_marks),
})
epiMod

In [None]:
# Convert the current `epiMod` table to nested dicts keyed by modification label
# and fold the label -> GO mapping into the running map (new entries win on a
# key clash).
epiMod_by_label = epiMod.set_index('mod')
go_nonO_data = epiMod_by_label.to_dict(orient='dict')

nonO_data = dict(nonO_data)
nonO_data.update(go_nonO_data['GO'])

In [None]:
# Serialize the assembled `nonO_data` map to subclass_construction_map.pkl using
# the highest pickle protocol available to this interpreter.
subclass_map_path = '../resources/construction_approach/'+'subclass_construction_map.pkl'
with open(subclass_map_path, 'wb') as pkl_out:
    pickle.dump(nonO_data, pkl_out, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# DO NOT RUN, this cell is only intended to CHECK everything's OK
# It reloads the pickled map from disk, rebinding `nonO_data` to the loaded
# object, and displays its (key, value) pairs.
saved_map_path = r'../resources/construction_approach/'+'subclass_construction_map.pkl'
nonO_data = pd.read_pickle(saved_map_path)

nonO_data.items()